1: /*
2: * regexp.c: generic and extensible Regular Expression engine
3: *
4: * Basically designed with the purpose of compiling regexps for
5: * the variety of validation/shemas mechanisms now available in
6: * XML related specifications these include:
7: * - XML-1.0 DTD validation
8: * - XML Schemas structure part 1
9: * - XML Schemas Datatypes part 2 especially Appendix F
10: * - RELAX-NG/TREX i.e. the counter proposal
11: *
12: * See Copyright for the status of this software.
13: *
14: * Daniel Veillard <veillard@redhat.com>
15: */
16:
17: #define IN_LIBXML
18: #include "libxml.h"
19:
20: #ifdef LIBXML_REGEXP_ENABLED
21:
22: /* #define DEBUG_ERR */
23:
24: #include <stdio.h>
25: #include <string.h>
26: #ifdef HAVE_LIMITS_H
27: #include <limits.h>
28: #endif
29:
30: #include <libxml/tree.h>
31: #include <libxml/parserInternals.h>
32: #include <libxml/xmlregexp.h>
33: #include <libxml/xmlautomata.h>
34: #include <libxml/xmlunicode.h>
35:
36: #ifndef INT_MAX
37: #define INT_MAX 123456789 /* easy to flag and big enough for our needs */
38: #endif
39:
40: /* #define DEBUG_REGEXP_GRAPH */
41: /* #define DEBUG_REGEXP_EXEC */
42: /* #define DEBUG_PUSH */
43: /* #define DEBUG_COMPACTION */
44:
45: #define MAX_PUSH 10000000
46:
47: #define ERROR(str) \
48: ctxt->error = XML_REGEXP_COMPILE_ERROR; \
49: xmlRegexpErrCompile(ctxt, str);
50: #define NEXT ctxt->cur++
51: #define CUR (*(ctxt->cur))
52: #define NXT(index) (ctxt->cur[index])
53:
54: #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
55: #define NEXTL(l) ctxt->cur += l;
56: #define XML_REG_STRING_SEPARATOR '|'
57: /*
58: * Need PREV to check on a '-' within a Character Group. May only be used
59: * when it's guaranteed that cur is not at the beginning of ctxt->string!
60: */
61: #define PREV (ctxt->cur[-1])
62:
63: /**
64: * TODO:
65: *
66: * macro to flag unimplemented blocks
67: */
68: #define TODO \
69: xmlGenericError(xmlGenericErrorContext, \
70: "Unimplemented block at %s:%d\n", \
71: __FILE__, __LINE__);
72:
73: /************************************************************************
74: * *
75: * Datatypes and structures *
76: * *
77: ************************************************************************/
78:
79: /*
80: * Note: the order of the enums below is significant, do not shuffle
81: */
82: typedef enum {
83: XML_REGEXP_EPSILON = 1,
84: XML_REGEXP_CHARVAL,
85: XML_REGEXP_RANGES,
86: XML_REGEXP_SUBREG, /* used for () sub regexps */
87: XML_REGEXP_STRING,
88: XML_REGEXP_ANYCHAR, /* . */
89: XML_REGEXP_ANYSPACE, /* \s */
90: XML_REGEXP_NOTSPACE, /* \S */
91: XML_REGEXP_INITNAME, /* \l */
92: XML_REGEXP_NOTINITNAME, /* \L */
93: XML_REGEXP_NAMECHAR, /* \c */
94: XML_REGEXP_NOTNAMECHAR, /* \C */
95: XML_REGEXP_DECIMAL, /* \d */
96: XML_REGEXP_NOTDECIMAL, /* \D */
97: XML_REGEXP_REALCHAR, /* \w */
98: XML_REGEXP_NOTREALCHAR, /* \W */
99: XML_REGEXP_LETTER = 100,
100: XML_REGEXP_LETTER_UPPERCASE,
101: XML_REGEXP_LETTER_LOWERCASE,
102: XML_REGEXP_LETTER_TITLECASE,
103: XML_REGEXP_LETTER_MODIFIER,
104: XML_REGEXP_LETTER_OTHERS,
105: XML_REGEXP_MARK,
106: XML_REGEXP_MARK_NONSPACING,
107: XML_REGEXP_MARK_SPACECOMBINING,
108: XML_REGEXP_MARK_ENCLOSING,
109: XML_REGEXP_NUMBER,
110: XML_REGEXP_NUMBER_DECIMAL,
111: XML_REGEXP_NUMBER_LETTER,
112: XML_REGEXP_NUMBER_OTHERS,
113: XML_REGEXP_PUNCT,
114: XML_REGEXP_PUNCT_CONNECTOR,
115: XML_REGEXP_PUNCT_DASH,
116: XML_REGEXP_PUNCT_OPEN,
117: XML_REGEXP_PUNCT_CLOSE,
118: XML_REGEXP_PUNCT_INITQUOTE,
119: XML_REGEXP_PUNCT_FINQUOTE,
120: XML_REGEXP_PUNCT_OTHERS,
121: XML_REGEXP_SEPAR,
122: XML_REGEXP_SEPAR_SPACE,
123: XML_REGEXP_SEPAR_LINE,
124: XML_REGEXP_SEPAR_PARA,
125: XML_REGEXP_SYMBOL,
126: XML_REGEXP_SYMBOL_MATH,
127: XML_REGEXP_SYMBOL_CURRENCY,
128: XML_REGEXP_SYMBOL_MODIFIER,
129: XML_REGEXP_SYMBOL_OTHERS,
130: XML_REGEXP_OTHER,
131: XML_REGEXP_OTHER_CONTROL,
132: XML_REGEXP_OTHER_FORMAT,
133: XML_REGEXP_OTHER_PRIVATE,
134: XML_REGEXP_OTHER_NA,
135: XML_REGEXP_BLOCK_NAME
136: } xmlRegAtomType;
137:
138: typedef enum {
139: XML_REGEXP_QUANT_EPSILON = 1,
140: XML_REGEXP_QUANT_ONCE,
141: XML_REGEXP_QUANT_OPT,
142: XML_REGEXP_QUANT_MULT,
143: XML_REGEXP_QUANT_PLUS,
144: XML_REGEXP_QUANT_ONCEONLY,
145: XML_REGEXP_QUANT_ALL,
146: XML_REGEXP_QUANT_RANGE
147: } xmlRegQuantType;
148:
149: typedef enum {
150: XML_REGEXP_START_STATE = 1,
151: XML_REGEXP_FINAL_STATE,
152: XML_REGEXP_TRANS_STATE,
153: XML_REGEXP_SINK_STATE,
154: XML_REGEXP_UNREACH_STATE
155: } xmlRegStateType;
156:
157: typedef enum {
158: XML_REGEXP_MARK_NORMAL = 0,
159: XML_REGEXP_MARK_START,
160: XML_REGEXP_MARK_VISITED
161: } xmlRegMarkedType;
162:
163: typedef struct _xmlRegRange xmlRegRange;
164: typedef xmlRegRange *xmlRegRangePtr;
165:
166: struct _xmlRegRange {
167: int neg; /* 0 normal, 1 not, 2 exclude */
168: xmlRegAtomType type;
169: int start;
170: int end;
171: xmlChar *blockName;
172: };
173:
174: typedef struct _xmlRegAtom xmlRegAtom;
175: typedef xmlRegAtom *xmlRegAtomPtr;
176:
177: typedef struct _xmlAutomataState xmlRegState;
178: typedef xmlRegState *xmlRegStatePtr;
179:
180: struct _xmlRegAtom {
181: int no;
182: xmlRegAtomType type;
183: xmlRegQuantType quant;
184: int min;
185: int max;
186:
187: void *valuep;
188: void *valuep2;
189: int neg;
190: int codepoint;
191: xmlRegStatePtr start;
192: xmlRegStatePtr start0;
193: xmlRegStatePtr stop;
194: int maxRanges;
195: int nbRanges;
196: xmlRegRangePtr *ranges;
197: void *data;
198: };
199:
200: typedef struct _xmlRegCounter xmlRegCounter;
201: typedef xmlRegCounter *xmlRegCounterPtr;
202:
203: struct _xmlRegCounter {
204: int min;
205: int max;
206: };
207:
208: typedef struct _xmlRegTrans xmlRegTrans;
209: typedef xmlRegTrans *xmlRegTransPtr;
210:
211: struct _xmlRegTrans {
212: xmlRegAtomPtr atom;
213: int to;
214: int counter;
215: int count;
216: int nd;
217: };
218:
219: struct _xmlAutomataState {
220: xmlRegStateType type;
221: xmlRegMarkedType mark;
222: xmlRegMarkedType reached;
223: int no;
224: int maxTrans;
225: int nbTrans;
226: xmlRegTrans *trans;
227: /* knowing states ponting to us can speed things up */
228: int maxTransTo;
229: int nbTransTo;
230: int *transTo;
231: };
232:
233: typedef struct _xmlAutomata xmlRegParserCtxt;
234: typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
235:
236: #define AM_AUTOMATA_RNG 1
237:
238: struct _xmlAutomata {
239: xmlChar *string;
240: xmlChar *cur;
241:
242: int error;
243: int neg;
244:
245: xmlRegStatePtr start;
246: xmlRegStatePtr end;
247: xmlRegStatePtr state;
248:
249: xmlRegAtomPtr atom;
250:
251: int maxAtoms;
252: int nbAtoms;
253: xmlRegAtomPtr *atoms;
254:
255: int maxStates;
256: int nbStates;
257: xmlRegStatePtr *states;
258:
259: int maxCounters;
260: int nbCounters;
261: xmlRegCounter *counters;
262:
263: int determinist;
264: int negs;
265: int flags;
266: };
267:
268: struct _xmlRegexp {
269: xmlChar *string;
270: int nbStates;
271: xmlRegStatePtr *states;
272: int nbAtoms;
273: xmlRegAtomPtr *atoms;
274: int nbCounters;
275: xmlRegCounter *counters;
276: int determinist;
277: int flags;
278: /*
279: * That's the compact form for determinists automatas
280: */
281: int nbstates;
282: int *compact;
283: void **transdata;
284: int nbstrings;
285: xmlChar **stringMap;
286: };
287:
288: typedef struct _xmlRegExecRollback xmlRegExecRollback;
289: typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
290:
291: struct _xmlRegExecRollback {
292: xmlRegStatePtr state;/* the current state */
293: int index; /* the index in the input stack */
294: int nextbranch; /* the next transition to explore in that state */
295: int *counts; /* save the automata state if it has some */
296: };
297:
298: typedef struct _xmlRegInputToken xmlRegInputToken;
299: typedef xmlRegInputToken *xmlRegInputTokenPtr;
300:
301: struct _xmlRegInputToken {
302: xmlChar *value;
303: void *data;
304: };
305:
306: struct _xmlRegExecCtxt {
307: int status; /* execution status != 0 indicate an error */
308: int determinist; /* did we find an indeterministic behaviour */
309: xmlRegexpPtr comp; /* the compiled regexp */
310: xmlRegExecCallbacks callback;
311: void *data;
312:
313: xmlRegStatePtr state;/* the current state */
314: int transno; /* the current transition on that state */
315: int transcount; /* the number of chars in char counted transitions */
316:
317: /*
318: * A stack of rollback states
319: */
320: int maxRollbacks;
321: int nbRollbacks;
322: xmlRegExecRollback *rollbacks;
323:
324: /*
325: * The state of the automata if any
326: */
327: int *counts;
328:
329: /*
330: * The input stack
331: */
332: int inputStackMax;
333: int inputStackNr;
334: int index;
335: int *charStack;
336: const xmlChar *inputString; /* when operating on characters */
337: xmlRegInputTokenPtr inputStack;/* when operating on strings */
338:
339: /*
340: * error handling
341: */
342: int errStateNo; /* the error state number */
343: xmlRegStatePtr errState; /* the error state */
344: xmlChar *errString; /* the string raising the error */
345: int *errCounts; /* counters at the error state */
346: int nbPush;
347: };
348:
349: #define REGEXP_ALL_COUNTER 0x123456
350: #define REGEXP_ALL_LAX_COUNTER 0x123457
351:
352: static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
353: static void xmlRegFreeState(xmlRegStatePtr state);
354: static void xmlRegFreeAtom(xmlRegAtomPtr atom);
355: static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
356: static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
357: static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
358: int neg, int start, int end, const xmlChar *blockName);
359:
360: void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
361:
362: /************************************************************************
363: * *
364: * Regexp memory error handler *
365: * *
366: ************************************************************************/
367: /**
368: * xmlRegexpErrMemory:
369: * @extra: extra information
370: *
371: * Handle an out of memory condition
372: */
373: static void
374: xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
375: {
376: const char *regexp = NULL;
377: if (ctxt != NULL) {
378: regexp = (const char *) ctxt->string;
379: ctxt->error = XML_ERR_NO_MEMORY;
380: }
381: __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
382: XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
383: regexp, NULL, 0, 0,
384: "Memory allocation failed : %s\n", extra);
385: }
386:
387: /**
388: * xmlRegexpErrCompile:
389: * @extra: extra information
390: *
391: * Handle a compilation failure
392: */
393: static void
394: xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
395: {
396: const char *regexp = NULL;
397: int idx = 0;
398:
399: if (ctxt != NULL) {
400: regexp = (const char *) ctxt->string;
401: idx = ctxt->cur - ctxt->string;
402: ctxt->error = XML_REGEXP_COMPILE_ERROR;
403: }
404: __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
405: XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
406: regexp, NULL, idx, 0,
407: "failed to compile: %s\n", extra);
408: }
409:
410: /************************************************************************
411: * *
412: * Allocation/Deallocation *
413: * *
414: ************************************************************************/
415:
416: static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
417: /**
418: * xmlRegEpxFromParse:
419: * @ctxt: the parser context used to build it
420: *
421: * Allocate a new regexp and fill it with the result from the parser
422: *
423: * Returns the new regexp or NULL in case of error
424: */
425: static xmlRegexpPtr
426: xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
427: xmlRegexpPtr ret;
428:
429: ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
430: if (ret == NULL) {
431: xmlRegexpErrMemory(ctxt, "compiling regexp");
432: return(NULL);
433: }
434: memset(ret, 0, sizeof(xmlRegexp));
435: ret->string = ctxt->string;
436: ret->nbStates = ctxt->nbStates;
437: ret->states = ctxt->states;
438: ret->nbAtoms = ctxt->nbAtoms;
439: ret->atoms = ctxt->atoms;
440: ret->nbCounters = ctxt->nbCounters;
441: ret->counters = ctxt->counters;
442: ret->determinist = ctxt->determinist;
443: ret->flags = ctxt->flags;
444: if (ret->determinist == -1) {
445: xmlRegexpIsDeterminist(ret);
446: }
447:
448: if ((ret->determinist != 0) &&
449: (ret->nbCounters == 0) &&
450: (ctxt->negs == 0) &&
451: (ret->atoms != NULL) &&
452: (ret->atoms[0] != NULL) &&
453: (ret->atoms[0]->type == XML_REGEXP_STRING)) {
454: int i, j, nbstates = 0, nbatoms = 0;
455: int *stateRemap;
456: int *stringRemap;
457: int *transitions;
458: void **transdata;
459: xmlChar **stringMap;
460: xmlChar *value;
461:
462: /*
463: * Switch to a compact representation
464: * 1/ counting the effective number of states left
465: * 2/ counting the unique number of atoms, and check that
466: * they are all of the string type
467: * 3/ build a table state x atom for the transitions
468: */
469:
470: stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
471: if (stateRemap == NULL) {
472: xmlRegexpErrMemory(ctxt, "compiling regexp");
473: xmlFree(ret);
474: return(NULL);
475: }
476: for (i = 0;i < ret->nbStates;i++) {
477: if (ret->states[i] != NULL) {
478: stateRemap[i] = nbstates;
479: nbstates++;
480: } else {
481: stateRemap[i] = -1;
482: }
483: }
484: #ifdef DEBUG_COMPACTION
485: printf("Final: %d states\n", nbstates);
486: #endif
487: stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
488: if (stringMap == NULL) {
489: xmlRegexpErrMemory(ctxt, "compiling regexp");
490: xmlFree(stateRemap);
491: xmlFree(ret);
492: return(NULL);
493: }
494: stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
495: if (stringRemap == NULL) {
496: xmlRegexpErrMemory(ctxt, "compiling regexp");
497: xmlFree(stringMap);
498: xmlFree(stateRemap);
499: xmlFree(ret);
500: return(NULL);
501: }
502: for (i = 0;i < ret->nbAtoms;i++) {
503: if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
504: (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
505: value = ret->atoms[i]->valuep;
506: for (j = 0;j < nbatoms;j++) {
507: if (xmlStrEqual(stringMap[j], value)) {
508: stringRemap[i] = j;
509: break;
510: }
511: }
512: if (j >= nbatoms) {
513: stringRemap[i] = nbatoms;
514: stringMap[nbatoms] = xmlStrdup(value);
515: if (stringMap[nbatoms] == NULL) {
516: for (i = 0;i < nbatoms;i++)
517: xmlFree(stringMap[i]);
518: xmlFree(stringRemap);
519: xmlFree(stringMap);
520: xmlFree(stateRemap);
521: xmlFree(ret);
522: return(NULL);
523: }
524: nbatoms++;
525: }
526: } else {
527: xmlFree(stateRemap);
528: xmlFree(stringRemap);
529: for (i = 0;i < nbatoms;i++)
530: xmlFree(stringMap[i]);
531: xmlFree(stringMap);
532: xmlFree(ret);
533: return(NULL);
534: }
535: }
536: #ifdef DEBUG_COMPACTION
537: printf("Final: %d atoms\n", nbatoms);
538: #endif
539: transitions = (int *) xmlMalloc((nbstates + 1) *
540: (nbatoms + 1) * sizeof(int));
541: if (transitions == NULL) {
542: xmlFree(stateRemap);
543: xmlFree(stringRemap);
544: xmlFree(stringMap);
545: xmlFree(ret);
546: return(NULL);
547: }
548: memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
549:
550: /*
551: * Allocate the transition table. The first entry for each
552: * state corresponds to the state type.
553: */
554: transdata = NULL;
555:
556: for (i = 0;i < ret->nbStates;i++) {
557: int stateno, atomno, targetno, prev;
558: xmlRegStatePtr state;
559: xmlRegTransPtr trans;
560:
561: stateno = stateRemap[i];
562: if (stateno == -1)
563: continue;
564: state = ret->states[i];
565:
566: transitions[stateno * (nbatoms + 1)] = state->type;
567:
568: for (j = 0;j < state->nbTrans;j++) {
569: trans = &(state->trans[j]);
570: if ((trans->to == -1) || (trans->atom == NULL))
571: continue;
572: atomno = stringRemap[trans->atom->no];
573: if ((trans->atom->data != NULL) && (transdata == NULL)) {
574: transdata = (void **) xmlMalloc(nbstates * nbatoms *
575: sizeof(void *));
576: if (transdata != NULL)
577: memset(transdata, 0,
578: nbstates * nbatoms * sizeof(void *));
579: else {
580: xmlRegexpErrMemory(ctxt, "compiling regexp");
581: break;
582: }
583: }
584: targetno = stateRemap[trans->to];
585: /*
586: * if the same atom can generate transitions to 2 different
587: * states then it means the automata is not determinist and
588: * the compact form can't be used !
589: */
590: prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
591: if (prev != 0) {
592: if (prev != targetno + 1) {
593: ret->determinist = 0;
594: #ifdef DEBUG_COMPACTION
595: printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
596: i, j, trans->atom->no, trans->to, atomno, targetno);
597: printf(" previous to is %d\n", prev);
598: #endif
599: if (transdata != NULL)
600: xmlFree(transdata);
601: xmlFree(transitions);
602: xmlFree(stateRemap);
603: xmlFree(stringRemap);
604: for (i = 0;i < nbatoms;i++)
605: xmlFree(stringMap[i]);
606: xmlFree(stringMap);
607: goto not_determ;
608: }
609: } else {
610: #if 0
611: printf("State %d trans %d: atom %d to %d : %d to %d\n",
612: i, j, trans->atom->no, trans->to, atomno, targetno);
613: #endif
614: transitions[stateno * (nbatoms + 1) + atomno + 1] =
615: targetno + 1; /* to avoid 0 */
616: if (transdata != NULL)
617: transdata[stateno * nbatoms + atomno] =
618: trans->atom->data;
619: }
620: }
621: }
622: ret->determinist = 1;
623: #ifdef DEBUG_COMPACTION
624: /*
625: * Debug
626: */
627: for (i = 0;i < nbstates;i++) {
628: for (j = 0;j < nbatoms + 1;j++) {
629: printf("%02d ", transitions[i * (nbatoms + 1) + j]);
630: }
631: printf("\n");
632: }
633: printf("\n");
634: #endif
635: /*
636: * Cleanup of the old data
637: */
638: if (ret->states != NULL) {
639: for (i = 0;i < ret->nbStates;i++)
640: xmlRegFreeState(ret->states[i]);
641: xmlFree(ret->states);
642: }
643: ret->states = NULL;
644: ret->nbStates = 0;
645: if (ret->atoms != NULL) {
646: for (i = 0;i < ret->nbAtoms;i++)
647: xmlRegFreeAtom(ret->atoms[i]);
648: xmlFree(ret->atoms);
649: }
650: ret->atoms = NULL;
651: ret->nbAtoms = 0;
652:
653: ret->compact = transitions;
654: ret->transdata = transdata;
655: ret->stringMap = stringMap;
656: ret->nbstrings = nbatoms;
657: ret->nbstates = nbstates;
658: xmlFree(stateRemap);
659: xmlFree(stringRemap);
660: }
661: not_determ:
662: ctxt->string = NULL;
663: ctxt->nbStates = 0;
664: ctxt->states = NULL;
665: ctxt->nbAtoms = 0;
666: ctxt->atoms = NULL;
667: ctxt->nbCounters = 0;
668: ctxt->counters = NULL;
669: return(ret);
670: }
671:
672: /**
673: * xmlRegNewParserCtxt:
674: * @string: the string to parse
675: *
676: * Allocate a new regexp parser context
677: *
678: * Returns the new context or NULL in case of error
679: */
680: static xmlRegParserCtxtPtr
681: xmlRegNewParserCtxt(const xmlChar *string) {
682: xmlRegParserCtxtPtr ret;
683:
684: ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
685: if (ret == NULL)
686: return(NULL);
687: memset(ret, 0, sizeof(xmlRegParserCtxt));
688: if (string != NULL)
689: ret->string = xmlStrdup(string);
690: ret->cur = ret->string;
691: ret->neg = 0;
692: ret->negs = 0;
693: ret->error = 0;
694: ret->determinist = -1;
695: return(ret);
696: }
697:
698: /**
699: * xmlRegNewRange:
700: * @ctxt: the regexp parser context
701: * @neg: is that negative
702: * @type: the type of range
703: * @start: the start codepoint
704: * @end: the end codepoint
705: *
706: * Allocate a new regexp range
707: *
708: * Returns the new range or NULL in case of error
709: */
710: static xmlRegRangePtr
711: xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
712: int neg, xmlRegAtomType type, int start, int end) {
713: xmlRegRangePtr ret;
714:
715: ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
716: if (ret == NULL) {
717: xmlRegexpErrMemory(ctxt, "allocating range");
718: return(NULL);
719: }
720: ret->neg = neg;
721: ret->type = type;
722: ret->start = start;
723: ret->end = end;
724: return(ret);
725: }
726:
727: /**
728: * xmlRegFreeRange:
729: * @range: the regexp range
730: *
731: * Free a regexp range
732: */
733: static void
734: xmlRegFreeRange(xmlRegRangePtr range) {
735: if (range == NULL)
736: return;
737:
738: if (range->blockName != NULL)
739: xmlFree(range->blockName);
740: xmlFree(range);
741: }
742:
743: /**
744: * xmlRegCopyRange:
745: * @range: the regexp range
746: *
747: * Copy a regexp range
748: *
749: * Returns the new copy or NULL in case of error.
750: */
751: static xmlRegRangePtr
752: xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
753: xmlRegRangePtr ret;
754:
755: if (range == NULL)
756: return(NULL);
757:
758: ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
759: range->end);
760: if (ret == NULL)
761: return(NULL);
762: if (range->blockName != NULL) {
763: ret->blockName = xmlStrdup(range->blockName);
764: if (ret->blockName == NULL) {
765: xmlRegexpErrMemory(ctxt, "allocating range");
766: xmlRegFreeRange(ret);
767: return(NULL);
768: }
769: }
770: return(ret);
771: }
772:
773: /**
774: * xmlRegNewAtom:
775: * @ctxt: the regexp parser context
776: * @type: the type of atom
777: *
778: * Allocate a new atom
779: *
780: * Returns the new atom or NULL in case of error
781: */
782: static xmlRegAtomPtr
783: xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
784: xmlRegAtomPtr ret;
785:
786: ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
787: if (ret == NULL) {
788: xmlRegexpErrMemory(ctxt, "allocating atom");
789: return(NULL);
790: }
791: memset(ret, 0, sizeof(xmlRegAtom));
792: ret->type = type;
793: ret->quant = XML_REGEXP_QUANT_ONCE;
794: ret->min = 0;
795: ret->max = 0;
796: return(ret);
797: }
798:
799: /**
800: * xmlRegFreeAtom:
801: * @atom: the regexp atom
802: *
803: * Free a regexp atom
804: */
805: static void
806: xmlRegFreeAtom(xmlRegAtomPtr atom) {
807: int i;
808:
809: if (atom == NULL)
810: return;
811:
812: for (i = 0;i < atom->nbRanges;i++)
813: xmlRegFreeRange(atom->ranges[i]);
814: if (atom->ranges != NULL)
815: xmlFree(atom->ranges);
816: if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
817: xmlFree(atom->valuep);
818: if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
819: xmlFree(atom->valuep2);
820: if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
821: xmlFree(atom->valuep);
822: xmlFree(atom);
823: }
824:
825: /**
826: * xmlRegCopyAtom:
827: * @ctxt: the regexp parser context
828: * @atom: the oiginal atom
829: *
830: * Allocate a new regexp range
831: *
832: * Returns the new atom or NULL in case of error
833: */
834: static xmlRegAtomPtr
835: xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
836: xmlRegAtomPtr ret;
837:
838: ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
839: if (ret == NULL) {
840: xmlRegexpErrMemory(ctxt, "copying atom");
841: return(NULL);
842: }
843: memset(ret, 0, sizeof(xmlRegAtom));
844: ret->type = atom->type;
845: ret->quant = atom->quant;
846: ret->min = atom->min;
847: ret->max = atom->max;
848: if (atom->nbRanges > 0) {
849: int i;
850:
851: ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
852: atom->nbRanges);
853: if (ret->ranges == NULL) {
854: xmlRegexpErrMemory(ctxt, "copying atom");
855: goto error;
856: }
857: for (i = 0;i < atom->nbRanges;i++) {
858: ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
859: if (ret->ranges[i] == NULL)
860: goto error;
861: ret->nbRanges = i + 1;
862: }
863: }
864: return(ret);
865:
866: error:
867: xmlRegFreeAtom(ret);
868: return(NULL);
869: }
870:
871: static xmlRegStatePtr
872: xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
873: xmlRegStatePtr ret;
874:
875: ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
876: if (ret == NULL) {
877: xmlRegexpErrMemory(ctxt, "allocating state");
878: return(NULL);
879: }
880: memset(ret, 0, sizeof(xmlRegState));
881: ret->type = XML_REGEXP_TRANS_STATE;
882: ret->mark = XML_REGEXP_MARK_NORMAL;
883: return(ret);
884: }
885:
886: /**
887: * xmlRegFreeState:
888: * @state: the regexp state
889: *
890: * Free a regexp state
891: */
892: static void
893: xmlRegFreeState(xmlRegStatePtr state) {
894: if (state == NULL)
895: return;
896:
897: if (state->trans != NULL)
898: xmlFree(state->trans);
899: if (state->transTo != NULL)
900: xmlFree(state->transTo);
901: xmlFree(state);
902: }
903:
904: /**
905: * xmlRegFreeParserCtxt:
906: * @ctxt: the regexp parser context
907: *
908: * Free a regexp parser context
909: */
910: static void
911: xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
912: int i;
913: if (ctxt == NULL)
914: return;
915:
916: if (ctxt->string != NULL)
917: xmlFree(ctxt->string);
918: if (ctxt->states != NULL) {
919: for (i = 0;i < ctxt->nbStates;i++)
920: xmlRegFreeState(ctxt->states[i]);
921: xmlFree(ctxt->states);
922: }
923: if (ctxt->atoms != NULL) {
924: for (i = 0;i < ctxt->nbAtoms;i++)
925: xmlRegFreeAtom(ctxt->atoms[i]);
926: xmlFree(ctxt->atoms);
927: }
928: if (ctxt->counters != NULL)
929: xmlFree(ctxt->counters);
930: xmlFree(ctxt);
931: }
932:
933: /************************************************************************
934: * *
935: * Display of Data structures *
936: * *
937: ************************************************************************/
938:
939: static void
940: xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
941: switch (type) {
942: case XML_REGEXP_EPSILON:
943: fprintf(output, "epsilon "); break;
944: case XML_REGEXP_CHARVAL:
945: fprintf(output, "charval "); break;
946: case XML_REGEXP_RANGES:
947: fprintf(output, "ranges "); break;
948: case XML_REGEXP_SUBREG:
949: fprintf(output, "subexpr "); break;
950: case XML_REGEXP_STRING:
951: fprintf(output, "string "); break;
952: case XML_REGEXP_ANYCHAR:
953: fprintf(output, "anychar "); break;
954: case XML_REGEXP_ANYSPACE:
955: fprintf(output, "anyspace "); break;
956: case XML_REGEXP_NOTSPACE:
957: fprintf(output, "notspace "); break;
958: case XML_REGEXP_INITNAME:
959: fprintf(output, "initname "); break;
960: case XML_REGEXP_NOTINITNAME:
961: fprintf(output, "notinitname "); break;
962: case XML_REGEXP_NAMECHAR:
963: fprintf(output, "namechar "); break;
964: case XML_REGEXP_NOTNAMECHAR:
965: fprintf(output, "notnamechar "); break;
966: case XML_REGEXP_DECIMAL:
967: fprintf(output, "decimal "); break;
968: case XML_REGEXP_NOTDECIMAL:
969: fprintf(output, "notdecimal "); break;
970: case XML_REGEXP_REALCHAR:
971: fprintf(output, "realchar "); break;
972: case XML_REGEXP_NOTREALCHAR:
973: fprintf(output, "notrealchar "); break;
974: case XML_REGEXP_LETTER:
975: fprintf(output, "LETTER "); break;
976: case XML_REGEXP_LETTER_UPPERCASE:
977: fprintf(output, "LETTER_UPPERCASE "); break;
978: case XML_REGEXP_LETTER_LOWERCASE:
979: fprintf(output, "LETTER_LOWERCASE "); break;
980: case XML_REGEXP_LETTER_TITLECASE:
981: fprintf(output, "LETTER_TITLECASE "); break;
982: case XML_REGEXP_LETTER_MODIFIER:
983: fprintf(output, "LETTER_MODIFIER "); break;
984: case XML_REGEXP_LETTER_OTHERS:
985: fprintf(output, "LETTER_OTHERS "); break;
986: case XML_REGEXP_MARK:
987: fprintf(output, "MARK "); break;
988: case XML_REGEXP_MARK_NONSPACING:
989: fprintf(output, "MARK_NONSPACING "); break;
990: case XML_REGEXP_MARK_SPACECOMBINING:
991: fprintf(output, "MARK_SPACECOMBINING "); break;
992: case XML_REGEXP_MARK_ENCLOSING:
993: fprintf(output, "MARK_ENCLOSING "); break;
994: case XML_REGEXP_NUMBER:
995: fprintf(output, "NUMBER "); break;
996: case XML_REGEXP_NUMBER_DECIMAL:
997: fprintf(output, "NUMBER_DECIMAL "); break;
998: case XML_REGEXP_NUMBER_LETTER:
999: fprintf(output, "NUMBER_LETTER "); break;
1000: case XML_REGEXP_NUMBER_OTHERS:
1001: fprintf(output, "NUMBER_OTHERS "); break;
1002: case XML_REGEXP_PUNCT:
1003: fprintf(output, "PUNCT "); break;
1004: case XML_REGEXP_PUNCT_CONNECTOR:
1005: fprintf(output, "PUNCT_CONNECTOR "); break;
1006: case XML_REGEXP_PUNCT_DASH:
1007: fprintf(output, "PUNCT_DASH "); break;
1008: case XML_REGEXP_PUNCT_OPEN:
1009: fprintf(output, "PUNCT_OPEN "); break;
1010: case XML_REGEXP_PUNCT_CLOSE:
1011: fprintf(output, "PUNCT_CLOSE "); break;
1012: case XML_REGEXP_PUNCT_INITQUOTE:
1013: fprintf(output, "PUNCT_INITQUOTE "); break;
1014: case XML_REGEXP_PUNCT_FINQUOTE:
1015: fprintf(output, "PUNCT_FINQUOTE "); break;
1016: case XML_REGEXP_PUNCT_OTHERS:
1017: fprintf(output, "PUNCT_OTHERS "); break;
1018: case XML_REGEXP_SEPAR:
1019: fprintf(output, "SEPAR "); break;
1020: case XML_REGEXP_SEPAR_SPACE:
1021: fprintf(output, "SEPAR_SPACE "); break;
1022: case XML_REGEXP_SEPAR_LINE:
1023: fprintf(output, "SEPAR_LINE "); break;
1024: case XML_REGEXP_SEPAR_PARA:
1025: fprintf(output, "SEPAR_PARA "); break;
1026: case XML_REGEXP_SYMBOL:
1027: fprintf(output, "SYMBOL "); break;
1028: case XML_REGEXP_SYMBOL_MATH:
1029: fprintf(output, "SYMBOL_MATH "); break;
1030: case XML_REGEXP_SYMBOL_CURRENCY:
1031: fprintf(output, "SYMBOL_CURRENCY "); break;
1032: case XML_REGEXP_SYMBOL_MODIFIER:
1033: fprintf(output, "SYMBOL_MODIFIER "); break;
1034: case XML_REGEXP_SYMBOL_OTHERS:
1035: fprintf(output, "SYMBOL_OTHERS "); break;
1036: case XML_REGEXP_OTHER:
1037: fprintf(output, "OTHER "); break;
1038: case XML_REGEXP_OTHER_CONTROL:
1039: fprintf(output, "OTHER_CONTROL "); break;
1040: case XML_REGEXP_OTHER_FORMAT:
1041: fprintf(output, "OTHER_FORMAT "); break;
1042: case XML_REGEXP_OTHER_PRIVATE:
1043: fprintf(output, "OTHER_PRIVATE "); break;
1044: case XML_REGEXP_OTHER_NA:
1045: fprintf(output, "OTHER_NA "); break;
1046: case XML_REGEXP_BLOCK_NAME:
1047: fprintf(output, "BLOCK "); break;
1048: }
1049: }
1050:
1051: static void
1052: xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1053: switch (type) {
1054: case XML_REGEXP_QUANT_EPSILON:
1055: fprintf(output, "epsilon "); break;
1056: case XML_REGEXP_QUANT_ONCE:
1057: fprintf(output, "once "); break;
1058: case XML_REGEXP_QUANT_OPT:
1059: fprintf(output, "? "); break;
1060: case XML_REGEXP_QUANT_MULT:
1061: fprintf(output, "* "); break;
1062: case XML_REGEXP_QUANT_PLUS:
1063: fprintf(output, "+ "); break;
1064: case XML_REGEXP_QUANT_RANGE:
1065: fprintf(output, "range "); break;
1066: case XML_REGEXP_QUANT_ONCEONLY:
1067: fprintf(output, "onceonly "); break;
1068: case XML_REGEXP_QUANT_ALL:
1069: fprintf(output, "all "); break;
1070: }
1071: }
1072: static void
1073: xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1074: fprintf(output, " range: ");
1075: if (range->neg)
1076: fprintf(output, "negative ");
1077: xmlRegPrintAtomType(output, range->type);
1078: fprintf(output, "%c - %c\n", range->start, range->end);
1079: }
1080:
1081: static void
1082: xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1083: fprintf(output, " atom: ");
1084: if (atom == NULL) {
1085: fprintf(output, "NULL\n");
1086: return;
1087: }
1088: if (atom->neg)
1089: fprintf(output, "not ");
1090: xmlRegPrintAtomType(output, atom->type);
1091: xmlRegPrintQuantType(output, atom->quant);
1092: if (atom->quant == XML_REGEXP_QUANT_RANGE)
1093: fprintf(output, "%d-%d ", atom->min, atom->max);
1094: if (atom->type == XML_REGEXP_STRING)
1095: fprintf(output, "'%s' ", (char *) atom->valuep);
1096: if (atom->type == XML_REGEXP_CHARVAL)
1097: fprintf(output, "char %c\n", atom->codepoint);
1098: else if (atom->type == XML_REGEXP_RANGES) {
1099: int i;
1100: fprintf(output, "%d entries\n", atom->nbRanges);
1101: for (i = 0; i < atom->nbRanges;i++)
1102: xmlRegPrintRange(output, atom->ranges[i]);
1103: } else if (atom->type == XML_REGEXP_SUBREG) {
1104: fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1105: } else {
1106: fprintf(output, "\n");
1107: }
1108: }
1109:
1110: static void
1111: xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1112: fprintf(output, " trans: ");
1113: if (trans == NULL) {
1114: fprintf(output, "NULL\n");
1115: return;
1116: }
1117: if (trans->to < 0) {
1118: fprintf(output, "removed\n");
1119: return;
1120: }
1121: if (trans->nd != 0) {
1122: if (trans->nd == 2)
1123: fprintf(output, "last not determinist, ");
1124: else
1125: fprintf(output, "not determinist, ");
1126: }
1127: if (trans->counter >= 0) {
1128: fprintf(output, "counted %d, ", trans->counter);
1129: }
1130: if (trans->count == REGEXP_ALL_COUNTER) {
1131: fprintf(output, "all transition, ");
1132: } else if (trans->count >= 0) {
1133: fprintf(output, "count based %d, ", trans->count);
1134: }
1135: if (trans->atom == NULL) {
1136: fprintf(output, "epsilon to %d\n", trans->to);
1137: return;
1138: }
1139: if (trans->atom->type == XML_REGEXP_CHARVAL)
1140: fprintf(output, "char %c ", trans->atom->codepoint);
1141: fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1142: }
1143:
1144: static void
1145: xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1146: int i;
1147:
1148: fprintf(output, " state: ");
1149: if (state == NULL) {
1150: fprintf(output, "NULL\n");
1151: return;
1152: }
1153: if (state->type == XML_REGEXP_START_STATE)
1154: fprintf(output, "START ");
1155: if (state->type == XML_REGEXP_FINAL_STATE)
1156: fprintf(output, "FINAL ");
1157:
1158: fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1159: for (i = 0;i < state->nbTrans; i++) {
1160: xmlRegPrintTrans(output, &(state->trans[i]));
1161: }
1162: }
1163:
1164: #ifdef DEBUG_REGEXP_GRAPH
1165: static void
1166: xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1167: int i;
1168:
1169: fprintf(output, " ctxt: ");
1170: if (ctxt == NULL) {
1171: fprintf(output, "NULL\n");
1172: return;
1173: }
1174: fprintf(output, "'%s' ", ctxt->string);
1175: if (ctxt->error)
1176: fprintf(output, "error ");
1177: if (ctxt->neg)
1178: fprintf(output, "neg ");
1179: fprintf(output, "\n");
1180: fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1181: for (i = 0;i < ctxt->nbAtoms; i++) {
1182: fprintf(output, " %02d ", i);
1183: xmlRegPrintAtom(output, ctxt->atoms[i]);
1184: }
1185: if (ctxt->atom != NULL) {
1186: fprintf(output, "current atom:\n");
1187: xmlRegPrintAtom(output, ctxt->atom);
1188: }
1189: fprintf(output, "%d states:", ctxt->nbStates);
1190: if (ctxt->start != NULL)
1191: fprintf(output, " start: %d", ctxt->start->no);
1192: if (ctxt->end != NULL)
1193: fprintf(output, " end: %d", ctxt->end->no);
1194: fprintf(output, "\n");
1195: for (i = 0;i < ctxt->nbStates; i++) {
1196: xmlRegPrintState(output, ctxt->states[i]);
1197: }
1198: fprintf(output, "%d counters:\n", ctxt->nbCounters);
1199: for (i = 0;i < ctxt->nbCounters; i++) {
1200: fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1201: ctxt->counters[i].max);
1202: }
1203: }
1204: #endif
1205:
1206: /************************************************************************
1207: * *
1208: * Finite Automata structures manipulations *
1209: * *
1210: ************************************************************************/
1211:
1212: static void
1213: xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1214: int neg, xmlRegAtomType type, int start, int end,
1215: xmlChar *blockName) {
1216: xmlRegRangePtr range;
1217:
1218: if (atom == NULL) {
1219: ERROR("add range: atom is NULL");
1220: return;
1221: }
1222: if (atom->type != XML_REGEXP_RANGES) {
1223: ERROR("add range: atom is not ranges");
1224: return;
1225: }
1226: if (atom->maxRanges == 0) {
1227: atom->maxRanges = 4;
1228: atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1229: sizeof(xmlRegRangePtr));
1230: if (atom->ranges == NULL) {
1231: xmlRegexpErrMemory(ctxt, "adding ranges");
1232: atom->maxRanges = 0;
1233: return;
1234: }
1235: } else if (atom->nbRanges >= atom->maxRanges) {
1236: xmlRegRangePtr *tmp;
1237: atom->maxRanges *= 2;
1238: tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1239: sizeof(xmlRegRangePtr));
1240: if (tmp == NULL) {
1241: xmlRegexpErrMemory(ctxt, "adding ranges");
1242: atom->maxRanges /= 2;
1243: return;
1244: }
1245: atom->ranges = tmp;
1246: }
1247: range = xmlRegNewRange(ctxt, neg, type, start, end);
1248: if (range == NULL)
1249: return;
1250: range->blockName = blockName;
1251: atom->ranges[atom->nbRanges++] = range;
1252:
1253: }
1254:
1255: static int
1256: xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1257: if (ctxt->maxCounters == 0) {
1258: ctxt->maxCounters = 4;
1259: ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1260: sizeof(xmlRegCounter));
1261: if (ctxt->counters == NULL) {
1262: xmlRegexpErrMemory(ctxt, "allocating counter");
1263: ctxt->maxCounters = 0;
1264: return(-1);
1265: }
1266: } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1267: xmlRegCounter *tmp;
1268: ctxt->maxCounters *= 2;
1269: tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1270: sizeof(xmlRegCounter));
1271: if (tmp == NULL) {
1272: xmlRegexpErrMemory(ctxt, "allocating counter");
1273: ctxt->maxCounters /= 2;
1274: return(-1);
1275: }
1276: ctxt->counters = tmp;
1277: }
1278: ctxt->counters[ctxt->nbCounters].min = -1;
1279: ctxt->counters[ctxt->nbCounters].max = -1;
1280: return(ctxt->nbCounters++);
1281: }
1282:
1283: static int
1284: xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1285: if (atom == NULL) {
1286: ERROR("atom push: atom is NULL");
1287: return(-1);
1288: }
1289: if (ctxt->maxAtoms == 0) {
1290: ctxt->maxAtoms = 4;
1291: ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1292: sizeof(xmlRegAtomPtr));
1293: if (ctxt->atoms == NULL) {
1294: xmlRegexpErrMemory(ctxt, "pushing atom");
1295: ctxt->maxAtoms = 0;
1296: return(-1);
1297: }
1298: } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1299: xmlRegAtomPtr *tmp;
1300: ctxt->maxAtoms *= 2;
1301: tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1302: sizeof(xmlRegAtomPtr));
1303: if (tmp == NULL) {
1304: xmlRegexpErrMemory(ctxt, "allocating counter");
1305: ctxt->maxAtoms /= 2;
1306: return(-1);
1307: }
1308: ctxt->atoms = tmp;
1309: }
1310: atom->no = ctxt->nbAtoms;
1311: ctxt->atoms[ctxt->nbAtoms++] = atom;
1312: return(0);
1313: }
1314:
1315: static void
1316: xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1317: int from) {
1318: if (target->maxTransTo == 0) {
1319: target->maxTransTo = 8;
1320: target->transTo = (int *) xmlMalloc(target->maxTransTo *
1321: sizeof(int));
1322: if (target->transTo == NULL) {
1323: xmlRegexpErrMemory(ctxt, "adding transition");
1324: target->maxTransTo = 0;
1325: return;
1326: }
1327: } else if (target->nbTransTo >= target->maxTransTo) {
1328: int *tmp;
1329: target->maxTransTo *= 2;
1330: tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1331: sizeof(int));
1332: if (tmp == NULL) {
1333: xmlRegexpErrMemory(ctxt, "adding transition");
1334: target->maxTransTo /= 2;
1335: return;
1336: }
1337: target->transTo = tmp;
1338: }
1339: target->transTo[target->nbTransTo] = from;
1340: target->nbTransTo++;
1341: }
1342:
1343: static void
1344: xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1345: xmlRegAtomPtr atom, xmlRegStatePtr target,
1346: int counter, int count) {
1347:
1348: int nrtrans;
1349:
1350: if (state == NULL) {
1351: ERROR("add state: state is NULL");
1352: return;
1353: }
1354: if (target == NULL) {
1355: ERROR("add state: target is NULL");
1356: return;
1357: }
1358: /*
1359: * Other routines follow the philosophy 'When in doubt, add a transition'
1360: * so we check here whether such a transition is already present and, if
1361: * so, silently ignore this request.
1362: */
1363:
1364: for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1365: xmlRegTransPtr trans = &(state->trans[nrtrans]);
1366: if ((trans->atom == atom) &&
1367: (trans->to == target->no) &&
1368: (trans->counter == counter) &&
1369: (trans->count == count)) {
1370: #ifdef DEBUG_REGEXP_GRAPH
1371: printf("Ignoring duplicate transition from %d to %d\n",
1372: state->no, target->no);
1373: #endif
1374: return;
1375: }
1376: }
1377:
1378: if (state->maxTrans == 0) {
1379: state->maxTrans = 8;
1380: state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1381: sizeof(xmlRegTrans));
1382: if (state->trans == NULL) {
1383: xmlRegexpErrMemory(ctxt, "adding transition");
1384: state->maxTrans = 0;
1385: return;
1386: }
1387: } else if (state->nbTrans >= state->maxTrans) {
1388: xmlRegTrans *tmp;
1389: state->maxTrans *= 2;
1390: tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1391: sizeof(xmlRegTrans));
1392: if (tmp == NULL) {
1393: xmlRegexpErrMemory(ctxt, "adding transition");
1394: state->maxTrans /= 2;
1395: return;
1396: }
1397: state->trans = tmp;
1398: }
1399: #ifdef DEBUG_REGEXP_GRAPH
1400: printf("Add trans from %d to %d ", state->no, target->no);
1401: if (count == REGEXP_ALL_COUNTER)
1402: printf("all transition\n");
1403: else if (count >= 0)
1404: printf("count based %d\n", count);
1405: else if (counter >= 0)
1406: printf("counted %d\n", counter);
1407: else if (atom == NULL)
1408: printf("epsilon transition\n");
1409: else if (atom != NULL)
1410: xmlRegPrintAtom(stdout, atom);
1411: #endif
1412:
1413: state->trans[state->nbTrans].atom = atom;
1414: state->trans[state->nbTrans].to = target->no;
1415: state->trans[state->nbTrans].counter = counter;
1416: state->trans[state->nbTrans].count = count;
1417: state->trans[state->nbTrans].nd = 0;
1418: state->nbTrans++;
1419: xmlRegStateAddTransTo(ctxt, target, state->no);
1420: }
1421:
1422: static int
1423: xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1424: if (state == NULL) return(-1);
1425: if (ctxt->maxStates == 0) {
1426: ctxt->maxStates = 4;
1427: ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1428: sizeof(xmlRegStatePtr));
1429: if (ctxt->states == NULL) {
1430: xmlRegexpErrMemory(ctxt, "adding state");
1431: ctxt->maxStates = 0;
1432: return(-1);
1433: }
1434: } else if (ctxt->nbStates >= ctxt->maxStates) {
1435: xmlRegStatePtr *tmp;
1436: ctxt->maxStates *= 2;
1437: tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1438: sizeof(xmlRegStatePtr));
1439: if (tmp == NULL) {
1440: xmlRegexpErrMemory(ctxt, "adding state");
1441: ctxt->maxStates /= 2;
1442: return(-1);
1443: }
1444: ctxt->states = tmp;
1445: }
1446: state->no = ctxt->nbStates;
1447: ctxt->states[ctxt->nbStates++] = state;
1448: return(0);
1449: }
1450:
1451: /**
1452: * xmlFAGenerateAllTransition:
1453: * @ctxt: a regexp parser context
1454: * @from: the from state
1455: * @to: the target state or NULL for building a new one
1456: * @lax:
1457: *
1458: */
1459: static void
1460: xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1461: xmlRegStatePtr from, xmlRegStatePtr to,
1462: int lax) {
1463: if (to == NULL) {
1464: to = xmlRegNewState(ctxt);
1465: xmlRegStatePush(ctxt, to);
1466: ctxt->state = to;
1467: }
1468: if (lax)
1469: xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1470: else
1471: xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1472: }
1473:
1474: /**
1475: * xmlFAGenerateEpsilonTransition:
1476: * @ctxt: a regexp parser context
1477: * @from: the from state
1478: * @to: the target state or NULL for building a new one
1479: *
1480: */
1481: static void
1482: xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1483: xmlRegStatePtr from, xmlRegStatePtr to) {
1484: if (to == NULL) {
1485: to = xmlRegNewState(ctxt);
1486: xmlRegStatePush(ctxt, to);
1487: ctxt->state = to;
1488: }
1489: xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1490: }
1491:
1492: /**
1493: * xmlFAGenerateCountedEpsilonTransition:
1494: * @ctxt: a regexp parser context
1495: * @from: the from state
1496: * @to: the target state or NULL for building a new one
1497: * counter: the counter for that transition
1498: *
1499: */
1500: static void
1501: xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1502: xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1503: if (to == NULL) {
1504: to = xmlRegNewState(ctxt);
1505: xmlRegStatePush(ctxt, to);
1506: ctxt->state = to;
1507: }
1508: xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1509: }
1510:
1511: /**
1512: * xmlFAGenerateCountedTransition:
1513: * @ctxt: a regexp parser context
1514: * @from: the from state
1515: * @to: the target state or NULL for building a new one
1516: * counter: the counter for that transition
1517: *
1518: */
1519: static void
1520: xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1521: xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1522: if (to == NULL) {
1523: to = xmlRegNewState(ctxt);
1524: xmlRegStatePush(ctxt, to);
1525: ctxt->state = to;
1526: }
1527: xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1528: }
1529:
1530: /**
1531: * xmlFAGenerateTransitions:
1532: * @ctxt: a regexp parser context
1533: * @from: the from state
1534: * @to: the target state or NULL for building a new one
1535: * @atom: the atom generating the transition
1536: *
1537: * Returns 0 if success and -1 in case of error.
1538: */
1539: static int
1540: xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1541: xmlRegStatePtr to, xmlRegAtomPtr atom) {
1542: xmlRegStatePtr end;
1543:
1544: if (atom == NULL) {
1545: ERROR("genrate transition: atom == NULL");
1546: return(-1);
1547: }
1548: if (atom->type == XML_REGEXP_SUBREG) {
1549: /*
1550: * this is a subexpression handling one should not need to
1551: * create a new node except for XML_REGEXP_QUANT_RANGE.
1552: */
1553: if (xmlRegAtomPush(ctxt, atom) < 0) {
1554: return(-1);
1555: }
1556: if ((to != NULL) && (atom->stop != to) &&
1557: (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1558: /*
1559: * Generate an epsilon transition to link to the target
1560: */
1561: xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1562: #ifdef DV
1563: } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1564: (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1565: to = xmlRegNewState(ctxt);
1566: xmlRegStatePush(ctxt, to);
1567: ctxt->state = to;
1568: xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1569: #endif
1570: }
1571: switch (atom->quant) {
1572: case XML_REGEXP_QUANT_OPT:
1573: atom->quant = XML_REGEXP_QUANT_ONCE;
1574: /*
1575: * transition done to the state after end of atom.
1576: * 1. set transition from atom start to new state
1577: * 2. set transition from atom end to this state.
1578: */
1579: if (to == NULL) {
1580: xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1581: xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1582: ctxt->state);
1583: } else {
1584: xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1585: }
1586: break;
1587: case XML_REGEXP_QUANT_MULT:
1588: atom->quant = XML_REGEXP_QUANT_ONCE;
1589: xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1590: xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1591: break;
1592: case XML_REGEXP_QUANT_PLUS:
1593: atom->quant = XML_REGEXP_QUANT_ONCE;
1594: xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1595: break;
1596: case XML_REGEXP_QUANT_RANGE: {
1597: int counter;
1598: xmlRegStatePtr inter, newstate;
1599:
1600: /*
1601: * create the final state now if needed
1602: */
1603: if (to != NULL) {
1604: newstate = to;
1605: } else {
1606: newstate = xmlRegNewState(ctxt);
1607: xmlRegStatePush(ctxt, newstate);
1608: }
1609:
1610: /*
1611: * The principle here is to use counted transition
1612: * to avoid explosion in the number of states in the
1613: * graph. This is clearly more complex but should not
1614: * be exploitable at runtime.
1615: */
1616: if ((atom->min == 0) && (atom->start0 == NULL)) {
1617: xmlRegAtomPtr copy;
1618: /*
1619: * duplicate a transition based on atom to count next
1620: * occurences after 1. We cannot loop to atom->start
1621: * directly because we need an epsilon transition to
1622: * newstate.
1623: */
1624: /* ???? For some reason it seems we never reach that
1625: case, I suppose this got optimized out before when
1626: building the automata */
1627: copy = xmlRegCopyAtom(ctxt, atom);
1628: if (copy == NULL)
1629: return(-1);
1630: copy->quant = XML_REGEXP_QUANT_ONCE;
1631: copy->min = 0;
1632: copy->max = 0;
1633:
1634: if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1635: < 0)
1636: return(-1);
1637: inter = ctxt->state;
1638: counter = xmlRegGetCounter(ctxt);
1639: ctxt->counters[counter].min = atom->min - 1;
1640: ctxt->counters[counter].max = atom->max - 1;
1641: /* count the number of times we see it again */
1642: xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1643: atom->stop, counter);
1644: /* allow a way out based on the count */
1645: xmlFAGenerateCountedTransition(ctxt, inter,
1646: newstate, counter);
1647: /* and also allow a direct exit for 0 */
1648: xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1649: newstate);
1650: } else {
1651: /*
1652: * either we need the atom at least once or there
1653: * is an atom->start0 allowing to easilly plug the
1654: * epsilon transition.
1655: */
1656: counter = xmlRegGetCounter(ctxt);
1657: ctxt->counters[counter].min = atom->min - 1;
1658: ctxt->counters[counter].max = atom->max - 1;
1659: /* count the number of times we see it again */
1660: xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1661: atom->start, counter);
1662: /* allow a way out based on the count */
1663: xmlFAGenerateCountedTransition(ctxt, atom->stop,
1664: newstate, counter);
1665: /* and if needed allow a direct exit for 0 */
1666: if (atom->min == 0)
1667: xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1668: newstate);
1669:
1670: }
1671: atom->min = 0;
1672: atom->max = 0;
1673: atom->quant = XML_REGEXP_QUANT_ONCE;
1674: ctxt->state = newstate;
1675: }
1676: default:
1677: break;
1678: }
1679: return(0);
1680: }
1681: if ((atom->min == 0) && (atom->max == 0) &&
1682: (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1683: /*
1684: * we can discard the atom and generate an epsilon transition instead
1685: */
1686: if (to == NULL) {
1687: to = xmlRegNewState(ctxt);
1688: if (to != NULL)
1689: xmlRegStatePush(ctxt, to);
1690: else {
1691: return(-1);
1692: }
1693: }
1694: xmlFAGenerateEpsilonTransition(ctxt, from, to);
1695: ctxt->state = to;
1696: xmlRegFreeAtom(atom);
1697: return(0);
1698: }
1699: if (to == NULL) {
1700: to = xmlRegNewState(ctxt);
1701: if (to != NULL)
1702: xmlRegStatePush(ctxt, to);
1703: else {
1704: return(-1);
1705: }
1706: }
1707: end = to;
1708: if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1709: (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1710: /*
1711: * Do not pollute the target state by adding transitions from
1712: * it as it is likely to be the shared target of multiple branches.
1713: * So isolate with an epsilon transition.
1714: */
1715: xmlRegStatePtr tmp;
1716:
1717: tmp = xmlRegNewState(ctxt);
1718: if (tmp != NULL)
1719: xmlRegStatePush(ctxt, tmp);
1720: else {
1721: return(-1);
1722: }
1723: xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1724: to = tmp;
1725: }
1726: if (xmlRegAtomPush(ctxt, atom) < 0) {
1727: return(-1);
1728: }
1729: xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1730: ctxt->state = end;
1731: switch (atom->quant) {
1732: case XML_REGEXP_QUANT_OPT:
1733: atom->quant = XML_REGEXP_QUANT_ONCE;
1734: xmlFAGenerateEpsilonTransition(ctxt, from, to);
1735: break;
1736: case XML_REGEXP_QUANT_MULT:
1737: atom->quant = XML_REGEXP_QUANT_ONCE;
1738: xmlFAGenerateEpsilonTransition(ctxt, from, to);
1739: xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1740: break;
1741: case XML_REGEXP_QUANT_PLUS:
1742: atom->quant = XML_REGEXP_QUANT_ONCE;
1743: xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1744: break;
1745: case XML_REGEXP_QUANT_RANGE:
1746: #if DV_test
1747: if (atom->min == 0) {
1748: xmlFAGenerateEpsilonTransition(ctxt, from, to);
1749: }
1750: #endif
1751: break;
1752: default:
1753: break;
1754: }
1755: return(0);
1756: }
1757:
1758: /**
1759: * xmlFAReduceEpsilonTransitions:
1760: * @ctxt: a regexp parser context
1761: * @fromnr: the from state
1762: * @tonr: the to state
1763: * @counter: should that transition be associated to a counted
1764: *
1765: */
1766: static void
1767: xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1768: int tonr, int counter) {
1769: int transnr;
1770: xmlRegStatePtr from;
1771: xmlRegStatePtr to;
1772:
1773: #ifdef DEBUG_REGEXP_GRAPH
1774: printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1775: #endif
1776: from = ctxt->states[fromnr];
1777: if (from == NULL)
1778: return;
1779: to = ctxt->states[tonr];
1780: if (to == NULL)
1781: return;
1782: if ((to->mark == XML_REGEXP_MARK_START) ||
1783: (to->mark == XML_REGEXP_MARK_VISITED))
1784: return;
1785:
1786: to->mark = XML_REGEXP_MARK_VISITED;
1787: if (to->type == XML_REGEXP_FINAL_STATE) {
1788: #ifdef DEBUG_REGEXP_GRAPH
1789: printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1790: #endif
1791: from->type = XML_REGEXP_FINAL_STATE;
1792: }
1793: for (transnr = 0;transnr < to->nbTrans;transnr++) {
1794: if (to->trans[transnr].to < 0)
1795: continue;
1796: if (to->trans[transnr].atom == NULL) {
1797: /*
1798: * Don't remove counted transitions
1799: * Don't loop either
1800: */
1801: if (to->trans[transnr].to != fromnr) {
1802: if (to->trans[transnr].count >= 0) {
1803: int newto = to->trans[transnr].to;
1804:
1805: xmlRegStateAddTrans(ctxt, from, NULL,
1806: ctxt->states[newto],
1807: -1, to->trans[transnr].count);
1808: } else {
1809: #ifdef DEBUG_REGEXP_GRAPH
1810: printf("Found epsilon trans %d from %d to %d\n",
1811: transnr, tonr, to->trans[transnr].to);
1812: #endif
1813: if (to->trans[transnr].counter >= 0) {
1814: xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1815: to->trans[transnr].to,
1816: to->trans[transnr].counter);
1817: } else {
1818: xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1819: to->trans[transnr].to,
1820: counter);
1821: }
1822: }
1823: }
1824: } else {
1825: int newto = to->trans[transnr].to;
1826:
1827: if (to->trans[transnr].counter >= 0) {
1828: xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1829: ctxt->states[newto],
1830: to->trans[transnr].counter, -1);
1831: } else {
1832: xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1833: ctxt->states[newto], counter, -1);
1834: }
1835: }
1836: }
1837: to->mark = XML_REGEXP_MARK_NORMAL;
1838: }
1839:
1840: /**
1841: * xmlFAEliminateSimpleEpsilonTransitions:
1842: * @ctxt: a regexp parser context
1843: *
1844: * Eliminating general epsilon transitions can get costly in the general
1845: * algorithm due to the large amount of generated new transitions and
1846: * associated comparisons. However for simple epsilon transition used just
1847: * to separate building blocks when generating the automata this can be
1848: * reduced to state elimination:
1849: * - if there exists an epsilon from X to Y
1850: * - if there is no other transition from X
1851: * then X and Y are semantically equivalent and X can be eliminated
1852: * If X is the start state then make Y the start state, else replace the
1853: * target of all transitions to X by transitions to Y.
1854: */
1855: static void
1856: xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1857: int statenr, i, j, newto;
1858: xmlRegStatePtr state, tmp;
1859:
1860: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1861: state = ctxt->states[statenr];
1862: if (state == NULL)
1863: continue;
1864: if (state->nbTrans != 1)
1865: continue;
1866: if (state->type == XML_REGEXP_UNREACH_STATE)
1867: continue;
1868: /* is the only transition out a basic transition */
1869: if ((state->trans[0].atom == NULL) &&
1870: (state->trans[0].to >= 0) &&
1871: (state->trans[0].to != statenr) &&
1872: (state->trans[0].counter < 0) &&
1873: (state->trans[0].count < 0)) {
1874: newto = state->trans[0].to;
1875:
1876: if (state->type == XML_REGEXP_START_STATE) {
1877: #ifdef DEBUG_REGEXP_GRAPH
1878: printf("Found simple epsilon trans from start %d to %d\n",
1879: statenr, newto);
1880: #endif
1881: } else {
1882: #ifdef DEBUG_REGEXP_GRAPH
1883: printf("Found simple epsilon trans from %d to %d\n",
1884: statenr, newto);
1885: #endif
1886: for (i = 0;i < state->nbTransTo;i++) {
1887: tmp = ctxt->states[state->transTo[i]];
1888: for (j = 0;j < tmp->nbTrans;j++) {
1889: if (tmp->trans[j].to == statenr) {
1890: #ifdef DEBUG_REGEXP_GRAPH
1891: printf("Changed transition %d on %d to go to %d\n",
1892: j, tmp->no, newto);
1893: #endif
1894: tmp->trans[j].to = -1;
1895: xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1896: ctxt->states[newto],
1897: tmp->trans[j].counter,
1898: tmp->trans[j].count);
1899: }
1900: }
1901: }
1902: if (state->type == XML_REGEXP_FINAL_STATE)
1903: ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1904: /* eliminate the transition completely */
1905: state->nbTrans = 0;
1906:
1907: state->type = XML_REGEXP_UNREACH_STATE;
1908:
1909: }
1910:
1911: }
1912: }
1913: }
1914: /**
1915: * xmlFAEliminateEpsilonTransitions:
1916: * @ctxt: a regexp parser context
1917: *
1918: */
1919: static void
1920: xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1921: int statenr, transnr;
1922: xmlRegStatePtr state;
1923: int has_epsilon;
1924:
1925: if (ctxt->states == NULL) return;
1926:
1927: /*
1928: * Eliminate simple epsilon transition and the associated unreachable
1929: * states.
1930: */
1931: xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1932: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1933: state = ctxt->states[statenr];
1934: if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1935: #ifdef DEBUG_REGEXP_GRAPH
1936: printf("Removed unreachable state %d\n", statenr);
1937: #endif
1938: xmlRegFreeState(state);
1939: ctxt->states[statenr] = NULL;
1940: }
1941: }
1942:
1943: has_epsilon = 0;
1944:
1945: /*
1946: * Build the completed transitions bypassing the epsilons
1947: * Use a marking algorithm to avoid loops
1948: * Mark sink states too.
1949: * Process from the latests states backward to the start when
1950: * there is long cascading epsilon chains this minimize the
1951: * recursions and transition compares when adding the new ones
1952: */
1953: for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1954: state = ctxt->states[statenr];
1955: if (state == NULL)
1956: continue;
1957: if ((state->nbTrans == 0) &&
1958: (state->type != XML_REGEXP_FINAL_STATE)) {
1959: state->type = XML_REGEXP_SINK_STATE;
1960: }
1961: for (transnr = 0;transnr < state->nbTrans;transnr++) {
1962: if ((state->trans[transnr].atom == NULL) &&
1963: (state->trans[transnr].to >= 0)) {
1964: if (state->trans[transnr].to == statenr) {
1965: state->trans[transnr].to = -1;
1966: #ifdef DEBUG_REGEXP_GRAPH
1967: printf("Removed loopback epsilon trans %d on %d\n",
1968: transnr, statenr);
1969: #endif
1970: } else if (state->trans[transnr].count < 0) {
1971: int newto = state->trans[transnr].to;
1972:
1973: #ifdef DEBUG_REGEXP_GRAPH
1974: printf("Found epsilon trans %d from %d to %d\n",
1975: transnr, statenr, newto);
1976: #endif
1977: has_epsilon = 1;
1978: state->trans[transnr].to = -2;
1979: state->mark = XML_REGEXP_MARK_START;
1980: xmlFAReduceEpsilonTransitions(ctxt, statenr,
1981: newto, state->trans[transnr].counter);
1982: state->mark = XML_REGEXP_MARK_NORMAL;
1983: #ifdef DEBUG_REGEXP_GRAPH
1984: } else {
1985: printf("Found counted transition %d on %d\n",
1986: transnr, statenr);
1987: #endif
1988: }
1989: }
1990: }
1991: }
1992: /*
1993: * Eliminate the epsilon transitions
1994: */
1995: if (has_epsilon) {
1996: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1997: state = ctxt->states[statenr];
1998: if (state == NULL)
1999: continue;
2000: for (transnr = 0;transnr < state->nbTrans;transnr++) {
2001: xmlRegTransPtr trans = &(state->trans[transnr]);
2002: if ((trans->atom == NULL) &&
2003: (trans->count < 0) &&
2004: (trans->to >= 0)) {
2005: trans->to = -1;
2006: }
2007: }
2008: }
2009: }
2010:
2011: /*
2012: * Use this pass to detect unreachable states too
2013: */
2014: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2015: state = ctxt->states[statenr];
2016: if (state != NULL)
2017: state->reached = XML_REGEXP_MARK_NORMAL;
2018: }
2019: state = ctxt->states[0];
2020: if (state != NULL)
2021: state->reached = XML_REGEXP_MARK_START;
2022: while (state != NULL) {
2023: xmlRegStatePtr target = NULL;
2024: state->reached = XML_REGEXP_MARK_VISITED;
2025: /*
2026: * Mark all states reachable from the current reachable state
2027: */
2028: for (transnr = 0;transnr < state->nbTrans;transnr++) {
2029: if ((state->trans[transnr].to >= 0) &&
2030: ((state->trans[transnr].atom != NULL) ||
2031: (state->trans[transnr].count >= 0))) {
2032: int newto = state->trans[transnr].to;
2033:
2034: if (ctxt->states[newto] == NULL)
2035: continue;
2036: if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2037: ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
2038: target = ctxt->states[newto];
2039: }
2040: }
2041: }
2042:
2043: /*
2044: * find the next accessible state not explored
2045: */
2046: if (target == NULL) {
2047: for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2048: state = ctxt->states[statenr];
2049: if ((state != NULL) && (state->reached ==
2050: XML_REGEXP_MARK_START)) {
2051: target = state;
2052: break;
2053: }
2054: }
2055: }
2056: state = target;
2057: }
2058: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2059: state = ctxt->states[statenr];
2060: if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2061: #ifdef DEBUG_REGEXP_GRAPH
2062: printf("Removed unreachable state %d\n", statenr);
2063: #endif
2064: xmlRegFreeState(state);
2065: ctxt->states[statenr] = NULL;
2066: }
2067: }
2068:
2069: }
2070:
2071: static int
2072: xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2073: int ret = 0;
2074:
2075: if ((range1->type == XML_REGEXP_RANGES) ||
2076: (range2->type == XML_REGEXP_RANGES) ||
2077: (range2->type == XML_REGEXP_SUBREG) ||
2078: (range1->type == XML_REGEXP_SUBREG) ||
2079: (range1->type == XML_REGEXP_STRING) ||
2080: (range2->type == XML_REGEXP_STRING))
2081: return(-1);
2082:
2083: /* put them in order */
2084: if (range1->type > range2->type) {
2085: xmlRegRangePtr tmp;
2086:
2087: tmp = range1;
2088: range1 = range2;
2089: range2 = tmp;
2090: }
2091: if ((range1->type == XML_REGEXP_ANYCHAR) ||
2092: (range2->type == XML_REGEXP_ANYCHAR)) {
2093: ret = 1;
2094: } else if ((range1->type == XML_REGEXP_EPSILON) ||
2095: (range2->type == XML_REGEXP_EPSILON)) {
2096: return(0);
2097: } else if (range1->type == range2->type) {
2098: if (range1->type != XML_REGEXP_CHARVAL)
2099: ret = 1;
2100: else if ((range1->end < range2->start) ||
2101: (range2->end < range1->start))
2102: ret = 0;
2103: else
2104: ret = 1;
2105: } else if (range1->type == XML_REGEXP_CHARVAL) {
2106: int codepoint;
2107: int neg = 0;
2108:
2109: /*
2110: * just check all codepoints in the range for acceptance,
2111: * this is usually way cheaper since done only once at
2112: * compilation than testing over and over at runtime or
2113: * pushing too many states when evaluating.
2114: */
2115: if (((range1->neg == 0) && (range2->neg != 0)) ||
2116: ((range1->neg != 0) && (range2->neg == 0)))
2117: neg = 1;
2118:
2119: for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2120: ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2121: 0, range2->start, range2->end,
2122: range2->blockName);
2123: if (ret < 0)
2124: return(-1);
2125: if (((neg == 1) && (ret == 0)) ||
2126: ((neg == 0) && (ret == 1)))
2127: return(1);
2128: }
2129: return(0);
2130: } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2131: (range2->type == XML_REGEXP_BLOCK_NAME)) {
2132: if (range1->type == range2->type) {
2133: ret = xmlStrEqual(range1->blockName, range2->blockName);
2134: } else {
2135: /*
2136: * comparing a block range with anything else is way
2137: * too costly, and maintining the table is like too much
2138: * memory too, so let's force the automata to save state
2139: * here.
2140: */
2141: return(1);
2142: }
2143: } else if ((range1->type < XML_REGEXP_LETTER) ||
2144: (range2->type < XML_REGEXP_LETTER)) {
2145: if ((range1->type == XML_REGEXP_ANYSPACE) &&
2146: (range2->type == XML_REGEXP_NOTSPACE))
2147: ret = 0;
2148: else if ((range1->type == XML_REGEXP_INITNAME) &&
2149: (range2->type == XML_REGEXP_NOTINITNAME))
2150: ret = 0;
2151: else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2152: (range2->type == XML_REGEXP_NOTNAMECHAR))
2153: ret = 0;
2154: else if ((range1->type == XML_REGEXP_DECIMAL) &&
2155: (range2->type == XML_REGEXP_NOTDECIMAL))
2156: ret = 0;
2157: else if ((range1->type == XML_REGEXP_REALCHAR) &&
2158: (range2->type == XML_REGEXP_NOTREALCHAR))
2159: ret = 0;
2160: else {
2161: /* same thing to limit complexity */
2162: return(1);
2163: }
2164: } else {
2165: ret = 0;
2166: /* range1->type < range2->type here */
2167: switch (range1->type) {
2168: case XML_REGEXP_LETTER:
2169: /* all disjoint except in the subgroups */
2170: if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2171: (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2172: (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2173: (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2174: (range2->type == XML_REGEXP_LETTER_OTHERS))
2175: ret = 1;
2176: break;
2177: case XML_REGEXP_MARK:
2178: if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2179: (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2180: (range2->type == XML_REGEXP_MARK_ENCLOSING))
2181: ret = 1;
2182: break;
2183: case XML_REGEXP_NUMBER:
2184: if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2185: (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2186: (range2->type == XML_REGEXP_NUMBER_OTHERS))
2187: ret = 1;
2188: break;
2189: case XML_REGEXP_PUNCT:
2190: if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2191: (range2->type == XML_REGEXP_PUNCT_DASH) ||
2192: (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2193: (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2194: (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2195: (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2196: (range2->type == XML_REGEXP_PUNCT_OTHERS))
2197: ret = 1;
2198: break;
2199: case XML_REGEXP_SEPAR:
2200: if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2201: (range2->type == XML_REGEXP_SEPAR_LINE) ||
2202: (range2->type == XML_REGEXP_SEPAR_PARA))
2203: ret = 1;
2204: break;
2205: case XML_REGEXP_SYMBOL:
2206: if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2207: (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2208: (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2209: (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2210: ret = 1;
2211: break;
2212: case XML_REGEXP_OTHER:
2213: if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2214: (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2215: (range2->type == XML_REGEXP_OTHER_PRIVATE))
2216: ret = 1;
2217: break;
2218: default:
2219: if ((range2->type >= XML_REGEXP_LETTER) &&
2220: (range2->type < XML_REGEXP_BLOCK_NAME))
2221: ret = 0;
2222: else {
2223: /* safety net ! */
2224: return(1);
2225: }
2226: }
2227: }
2228: if (((range1->neg == 0) && (range2->neg != 0)) ||
2229: ((range1->neg != 0) && (range2->neg == 0)))
2230: ret = !ret;
2231: return(ret);
2232: }
2233:
2234: /**
2235: * xmlFACompareAtomTypes:
2236: * @type1: an atom type
2237: * @type2: an atom type
2238: *
2239: * Compares two atoms type to check whether they intersect in some ways,
2240: * this is used by xmlFACompareAtoms only
2241: *
2242: * Returns 1 if they may intersect and 0 otherwise
2243: */
2244: static int
2245: xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2246: if ((type1 == XML_REGEXP_EPSILON) ||
2247: (type1 == XML_REGEXP_CHARVAL) ||
2248: (type1 == XML_REGEXP_RANGES) ||
2249: (type1 == XML_REGEXP_SUBREG) ||
2250: (type1 == XML_REGEXP_STRING) ||
2251: (type1 == XML_REGEXP_ANYCHAR))
2252: return(1);
2253: if ((type2 == XML_REGEXP_EPSILON) ||
2254: (type2 == XML_REGEXP_CHARVAL) ||
2255: (type2 == XML_REGEXP_RANGES) ||
2256: (type2 == XML_REGEXP_SUBREG) ||
2257: (type2 == XML_REGEXP_STRING) ||
2258: (type2 == XML_REGEXP_ANYCHAR))
2259: return(1);
2260:
2261: if (type1 == type2) return(1);
2262:
2263: /* simplify subsequent compares by making sure type1 < type2 */
2264: if (type1 > type2) {
2265: xmlRegAtomType tmp = type1;
2266: type1 = type2;
2267: type2 = tmp;
2268: }
2269: switch (type1) {
2270: case XML_REGEXP_ANYSPACE: /* \s */
2271: /* can't be a letter, number, mark, pontuation, symbol */
2272: if ((type2 == XML_REGEXP_NOTSPACE) ||
2273: ((type2 >= XML_REGEXP_LETTER) &&
2274: (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2275: ((type2 >= XML_REGEXP_NUMBER) &&
2276: (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2277: ((type2 >= XML_REGEXP_MARK) &&
2278: (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2279: ((type2 >= XML_REGEXP_PUNCT) &&
2280: (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2281: ((type2 >= XML_REGEXP_SYMBOL) &&
2282: (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2283: ) return(0);
2284: break;
2285: case XML_REGEXP_NOTSPACE: /* \S */
2286: break;
2287: case XML_REGEXP_INITNAME: /* \l */
2288: /* can't be a number, mark, separator, pontuation, symbol or other */
2289: if ((type2 == XML_REGEXP_NOTINITNAME) ||
2290: ((type2 >= XML_REGEXP_NUMBER) &&
2291: (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2292: ((type2 >= XML_REGEXP_MARK) &&
2293: (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2294: ((type2 >= XML_REGEXP_SEPAR) &&
2295: (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2296: ((type2 >= XML_REGEXP_PUNCT) &&
2297: (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2298: ((type2 >= XML_REGEXP_SYMBOL) &&
2299: (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2300: ((type2 >= XML_REGEXP_OTHER) &&
2301: (type2 <= XML_REGEXP_OTHER_NA))
2302: ) return(0);
2303: break;
2304: case XML_REGEXP_NOTINITNAME: /* \L */
2305: break;
2306: case XML_REGEXP_NAMECHAR: /* \c */
2307: /* can't be a mark, separator, pontuation, symbol or other */
2308: if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2309: ((type2 >= XML_REGEXP_MARK) &&
2310: (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2311: ((type2 >= XML_REGEXP_PUNCT) &&
2312: (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2313: ((type2 >= XML_REGEXP_SEPAR) &&
2314: (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2315: ((type2 >= XML_REGEXP_SYMBOL) &&
2316: (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2317: ((type2 >= XML_REGEXP_OTHER) &&
2318: (type2 <= XML_REGEXP_OTHER_NA))
2319: ) return(0);
2320: break;
2321: case XML_REGEXP_NOTNAMECHAR: /* \C */
2322: break;
2323: case XML_REGEXP_DECIMAL: /* \d */
2324: /* can't be a letter, mark, separator, pontuation, symbol or other */
2325: if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2326: (type2 == XML_REGEXP_REALCHAR) ||
2327: ((type2 >= XML_REGEXP_LETTER) &&
2328: (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2329: ((type2 >= XML_REGEXP_MARK) &&
2330: (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2331: ((type2 >= XML_REGEXP_PUNCT) &&
2332: (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2333: ((type2 >= XML_REGEXP_SEPAR) &&
2334: (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2335: ((type2 >= XML_REGEXP_SYMBOL) &&
2336: (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2337: ((type2 >= XML_REGEXP_OTHER) &&
2338: (type2 <= XML_REGEXP_OTHER_NA))
2339: )return(0);
2340: break;
2341: case XML_REGEXP_NOTDECIMAL: /* \D */
2342: break;
2343: case XML_REGEXP_REALCHAR: /* \w */
2344: /* can't be a mark, separator, pontuation, symbol or other */
2345: if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2346: ((type2 >= XML_REGEXP_MARK) &&
2347: (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2348: ((type2 >= XML_REGEXP_PUNCT) &&
2349: (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2350: ((type2 >= XML_REGEXP_SEPAR) &&
2351: (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2352: ((type2 >= XML_REGEXP_SYMBOL) &&
2353: (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2354: ((type2 >= XML_REGEXP_OTHER) &&
2355: (type2 <= XML_REGEXP_OTHER_NA))
2356: )return(0);
2357: break;
2358: case XML_REGEXP_NOTREALCHAR: /* \W */
2359: break;
2360: /*
2361: * at that point we know both type 1 and type2 are from
2362: * character categories are ordered and are different,
2363: * it becomes simple because this is a partition
2364: */
2365: case XML_REGEXP_LETTER:
2366: if (type2 <= XML_REGEXP_LETTER_OTHERS)
2367: return(1);
2368: return(0);
2369: case XML_REGEXP_LETTER_UPPERCASE:
2370: case XML_REGEXP_LETTER_LOWERCASE:
2371: case XML_REGEXP_LETTER_TITLECASE:
2372: case XML_REGEXP_LETTER_MODIFIER:
2373: case XML_REGEXP_LETTER_OTHERS:
2374: return(0);
2375: case XML_REGEXP_MARK:
2376: if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2377: return(1);
2378: return(0);
2379: case XML_REGEXP_MARK_NONSPACING:
2380: case XML_REGEXP_MARK_SPACECOMBINING:
2381: case XML_REGEXP_MARK_ENCLOSING:
2382: return(0);
2383: case XML_REGEXP_NUMBER:
2384: if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2385: return(1);
2386: return(0);
2387: case XML_REGEXP_NUMBER_DECIMAL:
2388: case XML_REGEXP_NUMBER_LETTER:
2389: case XML_REGEXP_NUMBER_OTHERS:
2390: return(0);
2391: case XML_REGEXP_PUNCT:
2392: if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2393: return(1);
2394: return(0);
2395: case XML_REGEXP_PUNCT_CONNECTOR:
2396: case XML_REGEXP_PUNCT_DASH:
2397: case XML_REGEXP_PUNCT_OPEN:
2398: case XML_REGEXP_PUNCT_CLOSE:
2399: case XML_REGEXP_PUNCT_INITQUOTE:
2400: case XML_REGEXP_PUNCT_FINQUOTE:
2401: case XML_REGEXP_PUNCT_OTHERS:
2402: return(0);
2403: case XML_REGEXP_SEPAR:
2404: if (type2 <= XML_REGEXP_SEPAR_PARA)
2405: return(1);
2406: return(0);
2407: case XML_REGEXP_SEPAR_SPACE:
2408: case XML_REGEXP_SEPAR_LINE:
2409: case XML_REGEXP_SEPAR_PARA:
2410: return(0);
2411: case XML_REGEXP_SYMBOL:
2412: if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2413: return(1);
2414: return(0);
2415: case XML_REGEXP_SYMBOL_MATH:
2416: case XML_REGEXP_SYMBOL_CURRENCY:
2417: case XML_REGEXP_SYMBOL_MODIFIER:
2418: case XML_REGEXP_SYMBOL_OTHERS:
2419: return(0);
2420: case XML_REGEXP_OTHER:
2421: if (type2 <= XML_REGEXP_OTHER_NA)
2422: return(1);
2423: return(0);
2424: case XML_REGEXP_OTHER_CONTROL:
2425: case XML_REGEXP_OTHER_FORMAT:
2426: case XML_REGEXP_OTHER_PRIVATE:
2427: case XML_REGEXP_OTHER_NA:
2428: return(0);
2429: default:
2430: break;
2431: }
2432: return(1);
2433: }
2434:
2435: /**
2436: * xmlFAEqualAtoms:
2437: * @atom1: an atom
2438: * @atom2: an atom
2439: * @deep: if not set only compare string pointers
2440: *
2441: * Compares two atoms to check whether they are the same exactly
2442: * this is used to remove equivalent transitions
2443: *
2444: * Returns 1 if same and 0 otherwise
2445: */
2446: static int
2447: xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2448: int ret = 0;
2449:
2450: if (atom1 == atom2)
2451: return(1);
2452: if ((atom1 == NULL) || (atom2 == NULL))
2453: return(0);
2454:
2455: if (atom1->type != atom2->type)
2456: return(0);
2457: switch (atom1->type) {
2458: case XML_REGEXP_EPSILON:
2459: ret = 0;
2460: break;
2461: case XML_REGEXP_STRING:
2462: if (!deep)
2463: ret = (atom1->valuep == atom2->valuep);
2464: else
2465: ret = xmlStrEqual((xmlChar *)atom1->valuep,
2466: (xmlChar *)atom2->valuep);
2467: break;
2468: case XML_REGEXP_CHARVAL:
2469: ret = (atom1->codepoint == atom2->codepoint);
2470: break;
2471: case XML_REGEXP_RANGES:
2472: /* too hard to do in the general case */
2473: ret = 0;
2474: default:
2475: break;
2476: }
2477: return(ret);
2478: }
2479:
2480: /**
2481: * xmlFACompareAtoms:
2482: * @atom1: an atom
2483: * @atom2: an atom
2484: * @deep: if not set only compare string pointers
2485: *
2486: * Compares two atoms to check whether they intersect in some ways,
2487: * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2488: *
2489: * Returns 1 if yes and 0 otherwise
2490: */
2491: static int
2492: xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2493: int ret = 1;
2494:
2495: if (atom1 == atom2)
2496: return(1);
2497: if ((atom1 == NULL) || (atom2 == NULL))
2498: return(0);
2499:
2500: if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2501: (atom2->type == XML_REGEXP_ANYCHAR))
2502: return(1);
2503:
2504: if (atom1->type > atom2->type) {
2505: xmlRegAtomPtr tmp;
2506: tmp = atom1;
2507: atom1 = atom2;
2508: atom2 = tmp;
2509: }
2510: if (atom1->type != atom2->type) {
2511: ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2512: /* if they can't intersect at the type level break now */
2513: if (ret == 0)
2514: return(0);
2515: }
2516: switch (atom1->type) {
2517: case XML_REGEXP_STRING:
2518: if (!deep)
2519: ret = (atom1->valuep != atom2->valuep);
2520: else
2521: ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep,
2522: (xmlChar *)atom2->valuep);
2523: break;
2524: case XML_REGEXP_EPSILON:
2525: goto not_determinist;
2526: case XML_REGEXP_CHARVAL:
2527: if (atom2->type == XML_REGEXP_CHARVAL) {
2528: ret = (atom1->codepoint == atom2->codepoint);
2529: } else {
2530: ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2531: if (ret < 0)
2532: ret = 1;
2533: }
2534: break;
2535: case XML_REGEXP_RANGES:
2536: if (atom2->type == XML_REGEXP_RANGES) {
2537: int i, j, res;
2538: xmlRegRangePtr r1, r2;
2539:
2540: /*
2541: * need to check that none of the ranges eventually matches
2542: */
2543: for (i = 0;i < atom1->nbRanges;i++) {
2544: for (j = 0;j < atom2->nbRanges;j++) {
2545: r1 = atom1->ranges[i];
2546: r2 = atom2->ranges[j];
2547: res = xmlFACompareRanges(r1, r2);
2548: if (res == 1) {
2549: ret = 1;
2550: goto done;
2551: }
2552: }
2553: }
2554: ret = 0;
2555: }
2556: break;
2557: default:
2558: goto not_determinist;
2559: }
2560: done:
2561: if (atom1->neg != atom2->neg) {
2562: ret = !ret;
2563: }
2564: if (ret == 0)
2565: return(0);
2566: not_determinist:
2567: return(1);
2568: }
2569:
2570: /**
2571: * xmlFARecurseDeterminism:
2572: * @ctxt: a regexp parser context
2573: *
2574: * Check whether the associated regexp is determinist,
2575: * should be called after xmlFAEliminateEpsilonTransitions()
2576: *
2577: */
2578: static int
2579: xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2580: int to, xmlRegAtomPtr atom) {
2581: int ret = 1;
2582: int res;
2583: int transnr, nbTrans;
2584: xmlRegTransPtr t1;
2585: int deep = 1;
2586:
2587: if (state == NULL)
2588: return(ret);
2589:
2590: if (ctxt->flags & AM_AUTOMATA_RNG)
2591: deep = 0;
2592:
2593: /*
2594: * don't recurse on transitions potentially added in the course of
2595: * the elimination.
2596: */
2597: nbTrans = state->nbTrans;
2598: for (transnr = 0;transnr < nbTrans;transnr++) {
2599: t1 = &(state->trans[transnr]);
2600: /*
2601: * check transitions conflicting with the one looked at
2602: */
2603: if (t1->atom == NULL) {
2604: if (t1->to < 0)
2605: continue;
2606: res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2607: to, atom);
2608: if (res == 0) {
2609: ret = 0;
2610: /* t1->nd = 1; */
2611: }
2612: continue;
2613: }
2614: if (t1->to != to)
2615: continue;
2616: if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2617: ret = 0;
2618: /* mark the transition as non-deterministic */
2619: t1->nd = 1;
2620: }
2621: }
2622: return(ret);
2623: }
2624:
2625: /**
2626: * xmlFAComputesDeterminism:
2627: * @ctxt: a regexp parser context
2628: *
2629: * Check whether the associated regexp is determinist,
2630: * should be called after xmlFAEliminateEpsilonTransitions()
2631: *
2632: */
2633: static int
2634: xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2635: int statenr, transnr;
2636: xmlRegStatePtr state;
2637: xmlRegTransPtr t1, t2, last;
2638: int i;
2639: int ret = 1;
2640: int deep = 1;
2641:
2642: #ifdef DEBUG_REGEXP_GRAPH
2643: printf("xmlFAComputesDeterminism\n");
2644: xmlRegPrintCtxt(stdout, ctxt);
2645: #endif
2646: if (ctxt->determinist != -1)
2647: return(ctxt->determinist);
2648:
2649: if (ctxt->flags & AM_AUTOMATA_RNG)
2650: deep = 0;
2651:
2652: /*
2653: * First cleanup the automata removing cancelled transitions
2654: */
2655: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2656: state = ctxt->states[statenr];
2657: if (state == NULL)
2658: continue;
2659: if (state->nbTrans < 2)
2660: continue;
2661: for (transnr = 0;transnr < state->nbTrans;transnr++) {
2662: t1 = &(state->trans[transnr]);
2663: /*
2664: * Determinism checks in case of counted or all transitions
2665: * will have to be handled separately
2666: */
2667: if (t1->atom == NULL) {
2668: /* t1->nd = 1; */
2669: continue;
2670: }
2671: if (t1->to == -1) /* eliminated */
2672: continue;
2673: for (i = 0;i < transnr;i++) {
2674: t2 = &(state->trans[i]);
2675: if (t2->to == -1) /* eliminated */
2676: continue;
2677: if (t2->atom != NULL) {
2678: if (t1->to == t2->to) {
2679: /*
2680: * Here we use deep because we want to keep the
2681: * transitions which indicate a conflict
2682: */
2683: if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2684: (t1->counter == t2->counter) &&
2685: (t1->count == t2->count))
2686: t2->to = -1; /* eliminated */
2687: }
2688: }
2689: }
2690: }
2691: }
2692:
2693: /*
2694: * Check for all states that there aren't 2 transitions
2695: * with the same atom and a different target.
2696: */
2697: for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2698: state = ctxt->states[statenr];
2699: if (state == NULL)
2700: continue;
2701: if (state->nbTrans < 2)
2702: continue;
2703: last = NULL;
2704: for (transnr = 0;transnr < state->nbTrans;transnr++) {
2705: t1 = &(state->trans[transnr]);
2706: /*
2707: * Determinism checks in case of counted or all transitions
2708: * will have to be handled separately
2709: */
2710: if (t1->atom == NULL) {
2711: continue;
2712: }
2713: if (t1->to == -1) /* eliminated */
2714: continue;
2715: for (i = 0;i < transnr;i++) {
2716: t2 = &(state->trans[i]);
2717: if (t2->to == -1) /* eliminated */
2718: continue;
2719: if (t2->atom != NULL) {
2720: /*
2721: * But here we don't use deep because we want to
2722: * find transitions which indicate a conflict
2723: */
2724: if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2725: ret = 0;
2726: /* mark the transitions as non-deterministic ones */
2727: t1->nd = 1;
2728: t2->nd = 1;
2729: last = t1;
2730: }
2731: } else if (t1->to != -1) {
2732: /*
2733: * do the closure in case of remaining specific
2734: * epsilon transitions like choices or all
2735: */
2736: ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2737: t2->to, t2->atom);
2738: /* don't shortcut the computation so all non deterministic
2739: transition get marked down
2740: if (ret == 0)
2741: return(0);
2742: */
2743: if (ret == 0) {
2744: t1->nd = 1;
2745: /* t2->nd = 1; */
2746: last = t1;
2747: }
2748: }
2749: }
2750: /* don't shortcut the computation so all non deterministic
2751: transition get marked down
2752: if (ret == 0)
2753: break; */
2754: }
2755:
2756: /*
2757: * mark specifically the last non-deterministic transition
2758: * from a state since there is no need to set-up rollback
2759: * from it
2760: */
2761: if (last != NULL) {
2762: last->nd = 2;
2763: }
2764:
2765: /* don't shortcut the computation so all non deterministic
2766: transition get marked down
2767: if (ret == 0)
2768: break; */
2769: }
2770:
2771: ctxt->determinist = ret;
2772: return(ret);
2773: }
2774:
2775: /************************************************************************
2776: * *
2777: * Routines to check input against transition atoms *
2778: * *
2779: ************************************************************************/
2780:
2781: static int
2782: xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2783: int start, int end, const xmlChar *blockName) {
2784: int ret = 0;
2785:
2786: switch (type) {
2787: case XML_REGEXP_STRING:
2788: case XML_REGEXP_SUBREG:
2789: case XML_REGEXP_RANGES:
2790: case XML_REGEXP_EPSILON:
2791: return(-1);
2792: case XML_REGEXP_ANYCHAR:
2793: ret = ((codepoint != '\n') && (codepoint != '\r'));
2794: break;
2795: case XML_REGEXP_CHARVAL:
2796: ret = ((codepoint >= start) && (codepoint <= end));
2797: break;
2798: case XML_REGEXP_NOTSPACE:
2799: neg = !neg;
2800: case XML_REGEXP_ANYSPACE:
2801: ret = ((codepoint == '\n') || (codepoint == '\r') ||
2802: (codepoint == '\t') || (codepoint == ' '));
2803: break;
2804: case XML_REGEXP_NOTINITNAME:
2805: neg = !neg;
2806: case XML_REGEXP_INITNAME:
2807: ret = (IS_LETTER(codepoint) ||
2808: (codepoint == '_') || (codepoint == ':'));
2809: break;
2810: case XML_REGEXP_NOTNAMECHAR:
2811: neg = !neg;
2812: case XML_REGEXP_NAMECHAR:
2813: ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2814: (codepoint == '.') || (codepoint == '-') ||
2815: (codepoint == '_') || (codepoint == ':') ||
2816: IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2817: break;
2818: case XML_REGEXP_NOTDECIMAL:
2819: neg = !neg;
2820: case XML_REGEXP_DECIMAL:
2821: ret = xmlUCSIsCatNd(codepoint);
2822: break;
2823: case XML_REGEXP_REALCHAR:
2824: neg = !neg;
2825: case XML_REGEXP_NOTREALCHAR:
2826: ret = xmlUCSIsCatP(codepoint);
2827: if (ret == 0)
2828: ret = xmlUCSIsCatZ(codepoint);
2829: if (ret == 0)
2830: ret = xmlUCSIsCatC(codepoint);
2831: break;
2832: case XML_REGEXP_LETTER:
2833: ret = xmlUCSIsCatL(codepoint);
2834: break;
2835: case XML_REGEXP_LETTER_UPPERCASE:
2836: ret = xmlUCSIsCatLu(codepoint);
2837: break;
2838: case XML_REGEXP_LETTER_LOWERCASE:
2839: ret = xmlUCSIsCatLl(codepoint);
2840: break;
2841: case XML_REGEXP_LETTER_TITLECASE:
2842: ret = xmlUCSIsCatLt(codepoint);
2843: break;
2844: case XML_REGEXP_LETTER_MODIFIER:
2845: ret = xmlUCSIsCatLm(codepoint);
2846: break;
2847: case XML_REGEXP_LETTER_OTHERS:
2848: ret = xmlUCSIsCatLo(codepoint);
2849: break;
2850: case XML_REGEXP_MARK:
2851: ret = xmlUCSIsCatM(codepoint);
2852: break;
2853: case XML_REGEXP_MARK_NONSPACING:
2854: ret = xmlUCSIsCatMn(codepoint);
2855: break;
2856: case XML_REGEXP_MARK_SPACECOMBINING:
2857: ret = xmlUCSIsCatMc(codepoint);
2858: break;
2859: case XML_REGEXP_MARK_ENCLOSING:
2860: ret = xmlUCSIsCatMe(codepoint);
2861: break;
2862: case XML_REGEXP_NUMBER:
2863: ret = xmlUCSIsCatN(codepoint);
2864: break;
2865: case XML_REGEXP_NUMBER_DECIMAL:
2866: ret = xmlUCSIsCatNd(codepoint);
2867: break;
2868: case XML_REGEXP_NUMBER_LETTER:
2869: ret = xmlUCSIsCatNl(codepoint);
2870: break;
2871: case XML_REGEXP_NUMBER_OTHERS:
2872: ret = xmlUCSIsCatNo(codepoint);
2873: break;
2874: case XML_REGEXP_PUNCT:
2875: ret = xmlUCSIsCatP(codepoint);
2876: break;
2877: case XML_REGEXP_PUNCT_CONNECTOR:
2878: ret = xmlUCSIsCatPc(codepoint);
2879: break;
2880: case XML_REGEXP_PUNCT_DASH:
2881: ret = xmlUCSIsCatPd(codepoint);
2882: break;
2883: case XML_REGEXP_PUNCT_OPEN:
2884: ret = xmlUCSIsCatPs(codepoint);
2885: break;
2886: case XML_REGEXP_PUNCT_CLOSE:
2887: ret = xmlUCSIsCatPe(codepoint);
2888: break;
2889: case XML_REGEXP_PUNCT_INITQUOTE:
2890: ret = xmlUCSIsCatPi(codepoint);
2891: break;
2892: case XML_REGEXP_PUNCT_FINQUOTE:
2893: ret = xmlUCSIsCatPf(codepoint);
2894: break;
2895: case XML_REGEXP_PUNCT_OTHERS:
2896: ret = xmlUCSIsCatPo(codepoint);
2897: break;
2898: case XML_REGEXP_SEPAR:
2899: ret = xmlUCSIsCatZ(codepoint);
2900: break;
2901: case XML_REGEXP_SEPAR_SPACE:
2902: ret = xmlUCSIsCatZs(codepoint);
2903: break;
2904: case XML_REGEXP_SEPAR_LINE:
2905: ret = xmlUCSIsCatZl(codepoint);
2906: break;
2907: case XML_REGEXP_SEPAR_PARA:
2908: ret = xmlUCSIsCatZp(codepoint);
2909: break;
2910: case XML_REGEXP_SYMBOL:
2911: ret = xmlUCSIsCatS(codepoint);
2912: break;
2913: case XML_REGEXP_SYMBOL_MATH:
2914: ret = xmlUCSIsCatSm(codepoint);
2915: break;
2916: case XML_REGEXP_SYMBOL_CURRENCY:
2917: ret = xmlUCSIsCatSc(codepoint);
2918: break;
2919: case XML_REGEXP_SYMBOL_MODIFIER:
2920: ret = xmlUCSIsCatSk(codepoint);
2921: break;
2922: case XML_REGEXP_SYMBOL_OTHERS:
2923: ret = xmlUCSIsCatSo(codepoint);
2924: break;
2925: case XML_REGEXP_OTHER:
2926: ret = xmlUCSIsCatC(codepoint);
2927: break;
2928: case XML_REGEXP_OTHER_CONTROL:
2929: ret = xmlUCSIsCatCc(codepoint);
2930: break;
2931: case XML_REGEXP_OTHER_FORMAT:
2932: ret = xmlUCSIsCatCf(codepoint);
2933: break;
2934: case XML_REGEXP_OTHER_PRIVATE:
2935: ret = xmlUCSIsCatCo(codepoint);
2936: break;
2937: case XML_REGEXP_OTHER_NA:
2938: /* ret = xmlUCSIsCatCn(codepoint); */
2939: /* Seems it doesn't exist anymore in recent Unicode releases */
2940: ret = 0;
2941: break;
2942: case XML_REGEXP_BLOCK_NAME:
2943: ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2944: break;
2945: }
2946: if (neg)
2947: return(!ret);
2948: return(ret);
2949: }
2950:
2951: static int
2952: xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2953: int i, ret = 0;
2954: xmlRegRangePtr range;
2955:
2956: if ((atom == NULL) || (!IS_CHAR(codepoint)))
2957: return(-1);
2958:
2959: switch (atom->type) {
2960: case XML_REGEXP_SUBREG:
2961: case XML_REGEXP_EPSILON:
2962: return(-1);
2963: case XML_REGEXP_CHARVAL:
2964: return(codepoint == atom->codepoint);
2965: case XML_REGEXP_RANGES: {
2966: int accept = 0;
2967:
2968: for (i = 0;i < atom->nbRanges;i++) {
2969: range = atom->ranges[i];
2970: if (range->neg == 2) {
2971: ret = xmlRegCheckCharacterRange(range->type, codepoint,
2972: 0, range->start, range->end,
2973: range->blockName);
2974: if (ret != 0)
2975: return(0); /* excluded char */
2976: } else if (range->neg) {
2977: ret = xmlRegCheckCharacterRange(range->type, codepoint,
2978: 0, range->start, range->end,
2979: range->blockName);
2980: if (ret == 0)
2981: accept = 1;
2982: else
2983: return(0);
2984: } else {
2985: ret = xmlRegCheckCharacterRange(range->type, codepoint,
2986: 0, range->start, range->end,
2987: range->blockName);
2988: if (ret != 0)
2989: accept = 1; /* might still be excluded */
2990: }
2991: }
2992: return(accept);
2993: }
2994: case XML_REGEXP_STRING:
2995: printf("TODO: XML_REGEXP_STRING\n");
2996: return(-1);
2997: case XML_REGEXP_ANYCHAR:
2998: case XML_REGEXP_ANYSPACE:
2999: case XML_REGEXP_NOTSPACE:
3000: case XML_REGEXP_INITNAME:
3001: case XML_REGEXP_NOTINITNAME:
3002: case XML_REGEXP_NAMECHAR:
3003: case XML_REGEXP_NOTNAMECHAR:
3004: case XML_REGEXP_DECIMAL:
3005: case XML_REGEXP_NOTDECIMAL:
3006: case XML_REGEXP_REALCHAR:
3007: case XML_REGEXP_NOTREALCHAR:
3008: case XML_REGEXP_LETTER:
3009: case XML_REGEXP_LETTER_UPPERCASE:
3010: case XML_REGEXP_LETTER_LOWERCASE:
3011: case XML_REGEXP_LETTER_TITLECASE:
3012: case XML_REGEXP_LETTER_MODIFIER:
3013: case XML_REGEXP_LETTER_OTHERS:
3014: case XML_REGEXP_MARK:
3015: case XML_REGEXP_MARK_NONSPACING:
3016: case XML_REGEXP_MARK_SPACECOMBINING:
3017: case XML_REGEXP_MARK_ENCLOSING:
3018: case XML_REGEXP_NUMBER:
3019: case XML_REGEXP_NUMBER_DECIMAL:
3020: case XML_REGEXP_NUMBER_LETTER:
3021: case XML_REGEXP_NUMBER_OTHERS:
3022: case XML_REGEXP_PUNCT:
3023: case XML_REGEXP_PUNCT_CONNECTOR:
3024: case XML_REGEXP_PUNCT_DASH:
3025: case XML_REGEXP_PUNCT_OPEN:
3026: case XML_REGEXP_PUNCT_CLOSE:
3027: case XML_REGEXP_PUNCT_INITQUOTE:
3028: case XML_REGEXP_PUNCT_FINQUOTE:
3029: case XML_REGEXP_PUNCT_OTHERS:
3030: case XML_REGEXP_SEPAR:
3031: case XML_REGEXP_SEPAR_SPACE:
3032: case XML_REGEXP_SEPAR_LINE:
3033: case XML_REGEXP_SEPAR_PARA:
3034: case XML_REGEXP_SYMBOL:
3035: case XML_REGEXP_SYMBOL_MATH:
3036: case XML_REGEXP_SYMBOL_CURRENCY:
3037: case XML_REGEXP_SYMBOL_MODIFIER:
3038: case XML_REGEXP_SYMBOL_OTHERS:
3039: case XML_REGEXP_OTHER:
3040: case XML_REGEXP_OTHER_CONTROL:
3041: case XML_REGEXP_OTHER_FORMAT:
3042: case XML_REGEXP_OTHER_PRIVATE:
3043: case XML_REGEXP_OTHER_NA:
3044: case XML_REGEXP_BLOCK_NAME:
3045: ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3046: (const xmlChar *)atom->valuep);
3047: if (atom->neg)
3048: ret = !ret;
3049: break;
3050: }
3051: return(ret);
3052: }
3053:
3054: /************************************************************************
3055: * *
3056: * Saving and restoring state of an execution context *
3057: * *
3058: ************************************************************************/
3059:
3060: #ifdef DEBUG_REGEXP_EXEC
3061: static void
3062: xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
3063: printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
3064: if (exec->inputStack != NULL) {
3065: int i;
3066: printf(": ");
3067: for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
3068: printf("%s ", (const char *)
3069: exec->inputStack[exec->inputStackNr - (i + 1)].value);
3070: } else {
3071: printf(": %s", &(exec->inputString[exec->index]));
3072: }
3073: printf("\n");
3074: }
3075: #endif
3076:
3077: static void
3078: xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3079: #ifdef DEBUG_REGEXP_EXEC
3080: printf("saving ");
3081: exec->transno++;
3082: xmlFARegDebugExec(exec);
3083: exec->transno--;
3084: #endif
3085: #ifdef MAX_PUSH
3086: if (exec->nbPush > MAX_PUSH) {
3087: return;
3088: }
3089: exec->nbPush++;
3090: #endif
3091:
3092: if (exec->maxRollbacks == 0) {
3093: exec->maxRollbacks = 4;
3094: exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3095: sizeof(xmlRegExecRollback));
3096: if (exec->rollbacks == NULL) {
3097: xmlRegexpErrMemory(NULL, "saving regexp");
3098: exec->maxRollbacks = 0;
3099: return;
3100: }
3101: memset(exec->rollbacks, 0,
3102: exec->maxRollbacks * sizeof(xmlRegExecRollback));
3103: } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3104: xmlRegExecRollback *tmp;
3105: int len = exec->maxRollbacks;
3106:
3107: exec->maxRollbacks *= 2;
3108: tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3109: exec->maxRollbacks * sizeof(xmlRegExecRollback));
3110: if (tmp == NULL) {
3111: xmlRegexpErrMemory(NULL, "saving regexp");
3112: exec->maxRollbacks /= 2;
3113: return;
3114: }
3115: exec->rollbacks = tmp;
3116: tmp = &exec->rollbacks[len];
3117: memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3118: }
3119: exec->rollbacks[exec->nbRollbacks].state = exec->state;
3120: exec->rollbacks[exec->nbRollbacks].index = exec->index;
3121: exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3122: if (exec->comp->nbCounters > 0) {
3123: if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3124: exec->rollbacks[exec->nbRollbacks].counts = (int *)
3125: xmlMalloc(exec->comp->nbCounters * sizeof(int));
3126: if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3127: xmlRegexpErrMemory(NULL, "saving regexp");
3128: exec->status = -5;
3129: return;
3130: }
3131: }
3132: memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3133: exec->comp->nbCounters * sizeof(int));
3134: }
3135: exec->nbRollbacks++;
3136: }
3137:
3138: static void
3139: xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3140: if (exec->nbRollbacks <= 0) {
3141: exec->status = -1;
3142: #ifdef DEBUG_REGEXP_EXEC
3143: printf("rollback failed on empty stack\n");
3144: #endif
3145: return;
3146: }
3147: exec->nbRollbacks--;
3148: exec->state = exec->rollbacks[exec->nbRollbacks].state;
3149: exec->index = exec->rollbacks[exec->nbRollbacks].index;
3150: exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3151: if (exec->comp->nbCounters > 0) {
3152: if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3153: fprintf(stderr, "exec save: allocation failed");
3154: exec->status = -6;
3155: return;
3156: }
3157: memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3158: exec->comp->nbCounters * sizeof(int));
3159: }
3160:
3161: #ifdef DEBUG_REGEXP_EXEC
3162: printf("restored ");
3163: xmlFARegDebugExec(exec);
3164: #endif
3165: }
3166:
3167: /************************************************************************
3168: * *
3169: * Verifier, running an input against a compiled regexp *
3170: * *
3171: ************************************************************************/
3172:
3173: static int
3174: xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3175: xmlRegExecCtxt execval;
3176: xmlRegExecCtxtPtr exec = &execval;
3177: int ret, codepoint = 0, len, deter;
3178:
3179: exec->inputString = content;
3180: exec->index = 0;
3181: exec->nbPush = 0;
3182: exec->determinist = 1;
3183: exec->maxRollbacks = 0;
3184: exec->nbRollbacks = 0;
3185: exec->rollbacks = NULL;
3186: exec->status = 0;
3187: exec->comp = comp;
3188: exec->state = comp->states[0];
3189: exec->transno = 0;
3190: exec->transcount = 0;
3191: exec->inputStack = NULL;
3192: exec->inputStackMax = 0;
3193: if (comp->nbCounters > 0) {
3194: exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3195: if (exec->counts == NULL) {
3196: xmlRegexpErrMemory(NULL, "running regexp");
3197: return(-1);
3198: }
3199: memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3200: } else
3201: exec->counts = NULL;
3202: while ((exec->status == 0) &&
3203: ((exec->inputString[exec->index] != 0) ||
3204: ((exec->state != NULL) &&
3205: (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3206: xmlRegTransPtr trans;
3207: xmlRegAtomPtr atom;
3208:
3209: /*
3210: * If end of input on non-terminal state, rollback, however we may
3211: * still have epsilon like transition for counted transitions
3212: * on counters, in that case don't break too early. Additionally,
3213: * if we are working on a range like "AB{0,2}", where B is not present,
3214: * we don't want to break.
3215: */
3216: len = 1;
3217: if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3218: /*
3219: * if there is a transition, we must check if
3220: * atom allows minOccurs of 0
3221: */
3222: if (exec->transno < exec->state->nbTrans) {
3223: trans = &exec->state->trans[exec->transno];
3224: if (trans->to >=0) {
3225: atom = trans->atom;
3226: if (!((atom->min == 0) && (atom->max > 0)))
3227: goto rollback;
3228: }
3229: } else
3230: goto rollback;
3231: }
3232:
3233: exec->transcount = 0;
3234: for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3235: trans = &exec->state->trans[exec->transno];
3236: if (trans->to < 0)
3237: continue;
3238: atom = trans->atom;
3239: ret = 0;
3240: deter = 1;
3241: if (trans->count >= 0) {
3242: int count;
3243: xmlRegCounterPtr counter;
3244:
3245: if (exec->counts == NULL) {
3246: exec->status = -1;
3247: goto error;
3248: }
3249: /*
3250: * A counted transition.
3251: */
3252:
3253: count = exec->counts[trans->count];
3254: counter = &exec->comp->counters[trans->count];
3255: #ifdef DEBUG_REGEXP_EXEC
3256: printf("testing count %d: val %d, min %d, max %d\n",
3257: trans->count, count, counter->min, counter->max);
3258: #endif
3259: ret = ((count >= counter->min) && (count <= counter->max));
3260: if ((ret) && (counter->min != counter->max))
3261: deter = 0;
3262: } else if (atom == NULL) {
3263: fprintf(stderr, "epsilon transition left at runtime\n");
3264: exec->status = -2;
3265: break;
3266: } else if (exec->inputString[exec->index] != 0) {
3267: codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3268: ret = xmlRegCheckCharacter(atom, codepoint);
3269: if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3270: xmlRegStatePtr to = comp->states[trans->to];
3271:
3272: /*
3273: * this is a multiple input sequence
3274: * If there is a counter associated increment it now.
3275: * before potentially saving and rollback
3276: * do not increment if the counter is already over the
3277: * maximum limit in which case get to next transition
3278: */
3279: if (trans->counter >= 0) {
3280: xmlRegCounterPtr counter;
3281:
3282: if ((exec->counts == NULL) ||
3283: (exec->comp == NULL) ||
3284: (exec->comp->counters == NULL)) {
3285: exec->status = -1;
3286: goto error;
3287: }
3288: counter = &exec->comp->counters[trans->counter];
3289: if (exec->counts[trans->counter] >= counter->max)
3290: continue; /* for loop on transitions */
3291:
3292: #ifdef DEBUG_REGEXP_EXEC
3293: printf("Increasing count %d\n", trans->counter);
3294: #endif
3295: exec->counts[trans->counter]++;
3296: }
3297: if (exec->state->nbTrans > exec->transno + 1) {
3298: xmlFARegExecSave(exec);
3299: }
3300: exec->transcount = 1;
3301: do {
3302: /*
3303: * Try to progress as much as possible on the input
3304: */
3305: if (exec->transcount == atom->max) {
3306: break;
3307: }
3308: exec->index += len;
3309: /*
3310: * End of input: stop here
3311: */
3312: if (exec->inputString[exec->index] == 0) {
3313: exec->index -= len;
3314: break;
3315: }
3316: if (exec->transcount >= atom->min) {
3317: int transno = exec->transno;
3318: xmlRegStatePtr state = exec->state;
3319:
3320: /*
3321: * The transition is acceptable save it
3322: */
3323: exec->transno = -1; /* trick */
3324: exec->state = to;
3325: xmlFARegExecSave(exec);
3326: exec->transno = transno;
3327: exec->state = state;
3328: }
3329: codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3330: len);
3331: ret = xmlRegCheckCharacter(atom, codepoint);
3332: exec->transcount++;
3333: } while (ret == 1);
3334: if (exec->transcount < atom->min)
3335: ret = 0;
3336:
3337: /*
3338: * If the last check failed but one transition was found
3339: * possible, rollback
3340: */
3341: if (ret < 0)
3342: ret = 0;
3343: if (ret == 0) {
3344: goto rollback;
3345: }
3346: if (trans->counter >= 0) {
3347: if (exec->counts == NULL) {
3348: exec->status = -1;
3349: goto error;
3350: }
3351: #ifdef DEBUG_REGEXP_EXEC
3352: printf("Decreasing count %d\n", trans->counter);
3353: #endif
3354: exec->counts[trans->counter]--;
3355: }
3356: } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3357: /*
3358: * we don't match on the codepoint, but minOccurs of 0
3359: * says that's ok. Setting len to 0 inhibits stepping
3360: * over the codepoint.
3361: */
3362: exec->transcount = 1;
3363: len = 0;
3364: ret = 1;
3365: }
3366: } else if ((atom->min == 0) && (atom->max > 0)) {
3367: /* another spot to match when minOccurs is 0 */
3368: exec->transcount = 1;
3369: len = 0;
3370: ret = 1;
3371: }
3372: if (ret == 1) {
3373: if ((trans->nd == 1) ||
3374: ((trans->count >= 0) && (deter == 0) &&
3375: (exec->state->nbTrans > exec->transno + 1))) {
3376: #ifdef DEBUG_REGEXP_EXEC
3377: if (trans->nd == 1)
3378: printf("Saving on nd transition atom %d for %c at %d\n",
3379: trans->atom->no, codepoint, exec->index);
3380: else
3381: printf("Saving on counted transition count %d for %c at %d\n",
3382: trans->count, codepoint, exec->index);
3383: #endif
3384: xmlFARegExecSave(exec);
3385: }
3386: if (trans->counter >= 0) {
3387: xmlRegCounterPtr counter;
3388:
3389: /* make sure we don't go over the counter maximum value */
3390: if ((exec->counts == NULL) ||
3391: (exec->comp == NULL) ||
3392: (exec->comp->counters == NULL)) {
3393: exec->status = -1;
3394: goto error;
3395: }
3396: counter = &exec->comp->counters[trans->counter];
3397: if (exec->counts[trans->counter] >= counter->max)
3398: continue; /* for loop on transitions */
3399: #ifdef DEBUG_REGEXP_EXEC
3400: printf("Increasing count %d\n", trans->counter);
3401: #endif
3402: exec->counts[trans->counter]++;
3403: }
3404: if ((trans->count >= 0) &&
3405: (trans->count < REGEXP_ALL_COUNTER)) {
3406: if (exec->counts == NULL) {
3407: exec->status = -1;
3408: goto error;
3409: }
3410: #ifdef DEBUG_REGEXP_EXEC
3411: printf("resetting count %d on transition\n",
3412: trans->count);
3413: #endif
3414: exec->counts[trans->count] = 0;
3415: }
3416: #ifdef DEBUG_REGEXP_EXEC
3417: printf("entering state %d\n", trans->to);
3418: #endif
3419: exec->state = comp->states[trans->to];
3420: exec->transno = 0;
3421: if (trans->atom != NULL) {
3422: exec->index += len;
3423: }
3424: goto progress;
3425: } else if (ret < 0) {
3426: exec->status = -4;
3427: break;
3428: }
3429: }
3430: if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3431: rollback:
3432: /*
3433: * Failed to find a way out
3434: */
3435: exec->determinist = 0;
3436: #ifdef DEBUG_REGEXP_EXEC
3437: printf("rollback from state %d on %d:%c\n", exec->state->no,
3438: codepoint,codepoint);
3439: #endif
3440: xmlFARegExecRollBack(exec);
3441: }
3442: progress:
3443: continue;
3444: }
3445: error:
3446: if (exec->rollbacks != NULL) {
3447: if (exec->counts != NULL) {
3448: int i;
3449:
3450: for (i = 0;i < exec->maxRollbacks;i++)
3451: if (exec->rollbacks[i].counts != NULL)
3452: xmlFree(exec->rollbacks[i].counts);
3453: }
3454: xmlFree(exec->rollbacks);
3455: }
3456: if (exec->counts != NULL)
3457: xmlFree(exec->counts);
3458: if (exec->status == 0)
3459: return(1);
3460: if (exec->status == -1) {
3461: if (exec->nbPush > MAX_PUSH)
3462: return(-1);
3463: return(0);
3464: }
3465: return(exec->status);
3466: }
3467:
3468: /************************************************************************
3469: * *
3470: * Progressive interface to the verifier one atom at a time *
3471: * *
3472: ************************************************************************/
3473: #ifdef DEBUG_ERR
3474: static void testerr(xmlRegExecCtxtPtr exec);
3475: #endif
3476:
3477: /**
3478: * xmlRegNewExecCtxt:
3479: * @comp: a precompiled regular expression
3480: * @callback: a callback function used for handling progresses in the
3481: * automata matching phase
3482: * @data: the context data associated to the callback in this context
3483: *
3484: * Build a context used for progressive evaluation of a regexp.
3485: *
3486: * Returns the new context
3487: */
3488: xmlRegExecCtxtPtr
3489: xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3490: xmlRegExecCtxtPtr exec;
3491:
3492: if (comp == NULL)
3493: return(NULL);
3494: if ((comp->compact == NULL) && (comp->states == NULL))
3495: return(NULL);
3496: exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3497: if (exec == NULL) {
3498: xmlRegexpErrMemory(NULL, "creating execution context");
3499: return(NULL);
3500: }
3501: memset(exec, 0, sizeof(xmlRegExecCtxt));
3502: exec->inputString = NULL;
3503: exec->index = 0;
3504: exec->determinist = 1;
3505: exec->maxRollbacks = 0;
3506: exec->nbRollbacks = 0;
3507: exec->rollbacks = NULL;
3508: exec->status = 0;
3509: exec->comp = comp;
3510: if (comp->compact == NULL)
3511: exec->state = comp->states[0];
3512: exec->transno = 0;
3513: exec->transcount = 0;
3514: exec->callback = callback;
3515: exec->data = data;
3516: if (comp->nbCounters > 0) {
3517: /*
3518: * For error handling, exec->counts is allocated twice the size
3519: * the second half is used to store the data in case of rollback
3520: */
3521: exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3522: * 2);
3523: if (exec->counts == NULL) {
3524: xmlRegexpErrMemory(NULL, "creating execution context");
3525: xmlFree(exec);
3526: return(NULL);
3527: }
3528: memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3529: exec->errCounts = &exec->counts[comp->nbCounters];
3530: } else {
3531: exec->counts = NULL;
3532: exec->errCounts = NULL;
3533: }
3534: exec->inputStackMax = 0;
3535: exec->inputStackNr = 0;
3536: exec->inputStack = NULL;
3537: exec->errStateNo = -1;
3538: exec->errString = NULL;
3539: exec->nbPush = 0;
3540: return(exec);
3541: }
3542:
3543: /**
3544: * xmlRegFreeExecCtxt:
3545: * @exec: a regular expression evaulation context
3546: *
3547: * Free the structures associated to a regular expression evaulation context.
3548: */
3549: void
3550: xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3551: if (exec == NULL)
3552: return;
3553:
3554: if (exec->rollbacks != NULL) {
3555: if (exec->counts != NULL) {
3556: int i;
3557:
3558: for (i = 0;i < exec->maxRollbacks;i++)
3559: if (exec->rollbacks[i].counts != NULL)
3560: xmlFree(exec->rollbacks[i].counts);
3561: }
3562: xmlFree(exec->rollbacks);
3563: }
3564: if (exec->counts != NULL)
3565: xmlFree(exec->counts);
3566: if (exec->inputStack != NULL) {
3567: int i;
3568:
3569: for (i = 0;i < exec->inputStackNr;i++) {
3570: if (exec->inputStack[i].value != NULL)
3571: xmlFree(exec->inputStack[i].value);
3572: }
3573: xmlFree(exec->inputStack);
3574: }
3575: if (exec->errString != NULL)
3576: xmlFree(exec->errString);
3577: xmlFree(exec);
3578: }
3579:
3580: static void
3581: xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3582: void *data) {
3583: #ifdef DEBUG_PUSH
3584: printf("saving value: %d:%s\n", exec->inputStackNr, value);
3585: #endif
3586: if (exec->inputStackMax == 0) {
3587: exec->inputStackMax = 4;
3588: exec->inputStack = (xmlRegInputTokenPtr)
3589: xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3590: if (exec->inputStack == NULL) {
3591: xmlRegexpErrMemory(NULL, "pushing input string");
3592: exec->inputStackMax = 0;
3593: return;
3594: }
3595: } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3596: xmlRegInputTokenPtr tmp;
3597:
3598: exec->inputStackMax *= 2;
3599: tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3600: exec->inputStackMax * sizeof(xmlRegInputToken));
3601: if (tmp == NULL) {
3602: xmlRegexpErrMemory(NULL, "pushing input string");
3603: exec->inputStackMax /= 2;
3604: return;
3605: }
3606: exec->inputStack = tmp;
3607: }
3608: exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3609: exec->inputStack[exec->inputStackNr].data = data;
3610: exec->inputStackNr++;
3611: exec->inputStack[exec->inputStackNr].value = NULL;
3612: exec->inputStack[exec->inputStackNr].data = NULL;
3613: }
3614:
3615: /**
3616: * xmlRegStrEqualWildcard:
3617: * @expStr: the string to be evaluated
3618: * @valStr: the validation string
3619: *
3620: * Checks if both strings are equal or have the same content. "*"
3621: * can be used as a wildcard in @valStr; "|" is used as a seperator of
3622: * substrings in both @expStr and @valStr.
3623: *
3624: * Returns 1 if the comparison is satisfied and the number of substrings
3625: * is equal, 0 otherwise.
3626: */
3627:
3628: static int
3629: xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3630: if (expStr == valStr) return(1);
3631: if (expStr == NULL) return(0);
3632: if (valStr == NULL) return(0);
3633: do {
3634: /*
3635: * Eval if we have a wildcard for the current item.
3636: */
3637: if (*expStr != *valStr) {
3638: /* if one of them starts with a wildcard make valStr be it */
3639: if (*valStr == '*') {
3640: const xmlChar *tmp;
3641:
3642: tmp = valStr;
3643: valStr = expStr;
3644: expStr = tmp;
3645: }
3646: if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3647: do {
3648: if (*valStr == XML_REG_STRING_SEPARATOR)
3649: break;
3650: valStr++;
3651: } while (*valStr != 0);
3652: continue;
3653: } else
3654: return(0);
3655: }
3656: expStr++;
3657: valStr++;
3658: } while (*valStr != 0);
3659: if (*expStr != 0)
3660: return (0);
3661: else
3662: return (1);
3663: }
3664:
3665: /**
3666: * xmlRegCompactPushString:
3667: * @exec: a regexp execution context
3668: * @comp: the precompiled exec with a compact table
3669: * @value: a string token input
3670: * @data: data associated to the token to reuse in callbacks
3671: *
3672: * Push one input token in the execution context
3673: *
3674: * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3675: * a negative value in case of error.
3676: */
3677: static int
3678: xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3679: xmlRegexpPtr comp,
3680: const xmlChar *value,
3681: void *data) {
3682: int state = exec->index;
3683: int i, target;
3684:
3685: if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3686: return(-1);
3687:
3688: if (value == NULL) {
3689: /*
3690: * are we at a final state ?
3691: */
3692: if (comp->compact[state * (comp->nbstrings + 1)] ==
3693: XML_REGEXP_FINAL_STATE)
3694: return(1);
3695: return(0);
3696: }
3697:
3698: #ifdef DEBUG_PUSH
3699: printf("value pushed: %s\n", value);
3700: #endif
3701:
3702: /*
3703: * Examine all outside transitions from current state
3704: */
3705: for (i = 0;i < comp->nbstrings;i++) {
3706: target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3707: if ((target > 0) && (target <= comp->nbstates)) {
3708: target--; /* to avoid 0 */
3709: if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3710: exec->index = target;
3711: if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3712: exec->callback(exec->data, value,
3713: comp->transdata[state * comp->nbstrings + i], data);
3714: }
3715: #ifdef DEBUG_PUSH
3716: printf("entering state %d\n", target);
3717: #endif
3718: if (comp->compact[target * (comp->nbstrings + 1)] ==
3719: XML_REGEXP_SINK_STATE)
3720: goto error;
3721:
3722: if (comp->compact[target * (comp->nbstrings + 1)] ==
3723: XML_REGEXP_FINAL_STATE)
3724: return(1);
3725: return(0);
3726: }
3727: }
3728: }
3729: /*
3730: * Failed to find an exit transition out from current state for the
3731: * current token
3732: */
3733: #ifdef DEBUG_PUSH
3734: printf("failed to find a transition for %s on state %d\n", value, state);
3735: #endif
3736: error:
3737: if (exec->errString != NULL)
3738: xmlFree(exec->errString);
3739: exec->errString = xmlStrdup(value);
3740: exec->errStateNo = state;
3741: exec->status = -1;
3742: #ifdef DEBUG_ERR
3743: testerr(exec);
3744: #endif
3745: return(-1);
3746: }
3747:
3748: /**
3749: * xmlRegExecPushStringInternal:
3750: * @exec: a regexp execution context or NULL to indicate the end
3751: * @value: a string token input
3752: * @data: data associated to the token to reuse in callbacks
3753: * @compound: value was assembled from 2 strings
3754: *
3755: * Push one input token in the execution context
3756: *
3757: * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3758: * a negative value in case of error.
3759: */
3760: static int
3761: xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3762: void *data, int compound) {
3763: xmlRegTransPtr trans;
3764: xmlRegAtomPtr atom;
3765: int ret;
3766: int final = 0;
3767: int progress = 1;
3768:
3769: if (exec == NULL)
3770: return(-1);
3771: if (exec->comp == NULL)
3772: return(-1);
3773: if (exec->status != 0)
3774: return(exec->status);
3775:
3776: if (exec->comp->compact != NULL)
3777: return(xmlRegCompactPushString(exec, exec->comp, value, data));
3778:
3779: if (value == NULL) {
3780: if (exec->state->type == XML_REGEXP_FINAL_STATE)
3781: return(1);
3782: final = 1;
3783: }
3784:
3785: #ifdef DEBUG_PUSH
3786: printf("value pushed: %s\n", value);
3787: #endif
3788: /*
3789: * If we have an active rollback stack push the new value there
3790: * and get back to where we were left
3791: */
3792: if ((value != NULL) && (exec->inputStackNr > 0)) {
3793: xmlFARegExecSaveInputString(exec, value, data);
3794: value = exec->inputStack[exec->index].value;
3795: data = exec->inputStack[exec->index].data;
3796: #ifdef DEBUG_PUSH
3797: printf("value loaded: %s\n", value);
3798: #endif
3799: }
3800:
3801: while ((exec->status == 0) &&
3802: ((value != NULL) ||
3803: ((final == 1) &&
3804: (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3805:
3806: /*
3807: * End of input on non-terminal state, rollback, however we may
3808: * still have epsilon like transition for counted transitions
3809: * on counters, in that case don't break too early.
3810: */
3811: if ((value == NULL) && (exec->counts == NULL))
3812: goto rollback;
3813:
3814: exec->transcount = 0;
3815: for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3816: trans = &exec->state->trans[exec->transno];
3817: if (trans->to < 0)
3818: continue;
3819: atom = trans->atom;
3820: ret = 0;
3821: if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3822: int i;
3823: int count;
3824: xmlRegTransPtr t;
3825: xmlRegCounterPtr counter;
3826:
3827: ret = 0;
3828:
3829: #ifdef DEBUG_PUSH
3830: printf("testing all lax %d\n", trans->count);
3831: #endif
3832: /*
3833: * Check all counted transitions from the current state
3834: */
3835: if ((value == NULL) && (final)) {
3836: ret = 1;
3837: } else if (value != NULL) {
3838: for (i = 0;i < exec->state->nbTrans;i++) {
3839: t = &exec->state->trans[i];
3840: if ((t->counter < 0) || (t == trans))
3841: continue;
3842: counter = &exec->comp->counters[t->counter];
3843: count = exec->counts[t->counter];
3844: if ((count < counter->max) &&
3845: (t->atom != NULL) &&
3846: (xmlStrEqual(value, t->atom->valuep))) {
3847: ret = 0;
3848: break;
3849: }
3850: if ((count >= counter->min) &&
3851: (count < counter->max) &&
3852: (t->atom != NULL) &&
3853: (xmlStrEqual(value, t->atom->valuep))) {
3854: ret = 1;
3855: break;
3856: }
3857: }
3858: }
3859: } else if (trans->count == REGEXP_ALL_COUNTER) {
3860: int i;
3861: int count;
3862: xmlRegTransPtr t;
3863: xmlRegCounterPtr counter;
3864:
3865: ret = 1;
3866:
3867: #ifdef DEBUG_PUSH
3868: printf("testing all %d\n", trans->count);
3869: #endif
3870: /*
3871: * Check all counted transitions from the current state
3872: */
3873: for (i = 0;i < exec->state->nbTrans;i++) {
3874: t = &exec->state->trans[i];
3875: if ((t->counter < 0) || (t == trans))
3876: continue;
3877: counter = &exec->comp->counters[t->counter];
3878: count = exec->counts[t->counter];
3879: if ((count < counter->min) || (count > counter->max)) {
3880: ret = 0;
3881: break;
3882: }
3883: }
3884: } else if (trans->count >= 0) {
3885: int count;
3886: xmlRegCounterPtr counter;
3887:
3888: /*
3889: * A counted transition.
3890: */
3891:
3892: count = exec->counts[trans->count];
3893: counter = &exec->comp->counters[trans->count];
3894: #ifdef DEBUG_PUSH
3895: printf("testing count %d: val %d, min %d, max %d\n",
3896: trans->count, count, counter->min, counter->max);
3897: #endif
3898: ret = ((count >= counter->min) && (count <= counter->max));
3899: } else if (atom == NULL) {
3900: fprintf(stderr, "epsilon transition left at runtime\n");
3901: exec->status = -2;
3902: break;
3903: } else if (value != NULL) {
3904: ret = xmlRegStrEqualWildcard(atom->valuep, value);
3905: if (atom->neg) {
3906: ret = !ret;
3907: if (!compound)
3908: ret = 0;
3909: }
3910: if ((ret == 1) && (trans->counter >= 0)) {
3911: xmlRegCounterPtr counter;
3912: int count;
3913:
3914: count = exec->counts[trans->counter];
3915: counter = &exec->comp->counters[trans->counter];
3916: if (count >= counter->max)
3917: ret = 0;
3918: }
3919:
3920: if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3921: xmlRegStatePtr to = exec->comp->states[trans->to];
3922:
3923: /*
3924: * this is a multiple input sequence
3925: */
3926: if (exec->state->nbTrans > exec->transno + 1) {
3927: if (exec->inputStackNr <= 0) {
3928: xmlFARegExecSaveInputString(exec, value, data);
3929: }
3930: xmlFARegExecSave(exec);
3931: }
3932: exec->transcount = 1;
3933: do {
3934: /*
3935: * Try to progress as much as possible on the input
3936: */
3937: if (exec->transcount == atom->max) {
3938: break;
3939: }
3940: exec->index++;
3941: value = exec->inputStack[exec->index].value;
3942: data = exec->inputStack[exec->index].data;
3943: #ifdef DEBUG_PUSH
3944: printf("value loaded: %s\n", value);
3945: #endif
3946:
3947: /*
3948: * End of input: stop here
3949: */
3950: if (value == NULL) {
3951: exec->index --;
3952: break;
3953: }
3954: if (exec->transcount >= atom->min) {
3955: int transno = exec->transno;
3956: xmlRegStatePtr state = exec->state;
3957:
3958: /*
3959: * The transition is acceptable save it
3960: */
3961: exec->transno = -1; /* trick */
3962: exec->state = to;
3963: if (exec->inputStackNr <= 0) {
3964: xmlFARegExecSaveInputString(exec, value, data);
3965: }
3966: xmlFARegExecSave(exec);
3967: exec->transno = transno;
3968: exec->state = state;
3969: }
3970: ret = xmlStrEqual(value, atom->valuep);
3971: exec->transcount++;
3972: } while (ret == 1);
3973: if (exec->transcount < atom->min)
3974: ret = 0;
3975:
3976: /*
3977: * If the last check failed but one transition was found
3978: * possible, rollback
3979: */
3980: if (ret < 0)
3981: ret = 0;
3982: if (ret == 0) {
3983: goto rollback;
3984: }
3985: }
3986: }
3987: if (ret == 1) {
3988: if ((exec->callback != NULL) && (atom != NULL) &&
3989: (data != NULL)) {
3990: exec->callback(exec->data, atom->valuep,
3991: atom->data, data);
3992: }
3993: if (exec->state->nbTrans > exec->transno + 1) {
3994: if (exec->inputStackNr <= 0) {
3995: xmlFARegExecSaveInputString(exec, value, data);
3996: }
3997: xmlFARegExecSave(exec);
3998: }
3999: if (trans->counter >= 0) {
4000: #ifdef DEBUG_PUSH
4001: printf("Increasing count %d\n", trans->counter);
4002: #endif
4003: exec->counts[trans->counter]++;
4004: }
4005: if ((trans->count >= 0) &&
4006: (trans->count < REGEXP_ALL_COUNTER)) {
4007: #ifdef DEBUG_REGEXP_EXEC
4008: printf("resetting count %d on transition\n",
4009: trans->count);
4010: #endif
4011: exec->counts[trans->count] = 0;
4012: }
4013: #ifdef DEBUG_PUSH
4014: printf("entering state %d\n", trans->to);
4015: #endif
4016: if ((exec->comp->states[trans->to] != NULL) &&
4017: (exec->comp->states[trans->to]->type ==
4018: XML_REGEXP_SINK_STATE)) {
4019: /*
4020: * entering a sink state, save the current state as error
4021: * state.
4022: */
4023: if (exec->errString != NULL)
4024: xmlFree(exec->errString);
4025: exec->errString = xmlStrdup(value);
4026: exec->errState = exec->state;
4027: memcpy(exec->errCounts, exec->counts,
4028: exec->comp->nbCounters * sizeof(int));
4029: }
4030: exec->state = exec->comp->states[trans->to];
4031: exec->transno = 0;
4032: if (trans->atom != NULL) {
4033: if (exec->inputStack != NULL) {
4034: exec->index++;
4035: if (exec->index < exec->inputStackNr) {
4036: value = exec->inputStack[exec->index].value;
4037: data = exec->inputStack[exec->index].data;
4038: #ifdef DEBUG_PUSH
4039: printf("value loaded: %s\n", value);
4040: #endif
4041: } else {
4042: value = NULL;
4043: data = NULL;
4044: #ifdef DEBUG_PUSH
4045: printf("end of input\n");
4046: #endif
4047: }
4048: } else {
4049: value = NULL;
4050: data = NULL;
4051: #ifdef DEBUG_PUSH
4052: printf("end of input\n");
4053: #endif
4054: }
4055: }
4056: goto progress;
4057: } else if (ret < 0) {
4058: exec->status = -4;
4059: break;
4060: }
4061: }
4062: if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4063: rollback:
4064: /*
4065: * if we didn't yet rollback on the current input
4066: * store the current state as the error state.
4067: */
4068: if ((progress) && (exec->state != NULL) &&
4069: (exec->state->type != XML_REGEXP_SINK_STATE)) {
4070: progress = 0;
4071: if (exec->errString != NULL)
4072: xmlFree(exec->errString);
4073: exec->errString = xmlStrdup(value);
4074: exec->errState = exec->state;
4075: memcpy(exec->errCounts, exec->counts,
4076: exec->comp->nbCounters * sizeof(int));
4077: }
4078:
4079: /*
4080: * Failed to find a way out
4081: */
4082: exec->determinist = 0;
4083: xmlFARegExecRollBack(exec);
4084: if (exec->status == 0) {
4085: value = exec->inputStack[exec->index].value;
4086: data = exec->inputStack[exec->index].data;
4087: #ifdef DEBUG_PUSH
4088: printf("value loaded: %s\n", value);
4089: #endif
4090: }
4091: }
4092: continue;
4093: progress:
4094: progress = 1;
4095: continue;
4096: }
4097: if (exec->status == 0) {
4098: return(exec->state->type == XML_REGEXP_FINAL_STATE);
4099: }
4100: #ifdef DEBUG_ERR
4101: if (exec->status < 0) {
4102: testerr(exec);
4103: }
4104: #endif
4105: return(exec->status);
4106: }
4107:
4108: /**
4109: * xmlRegExecPushString:
4110: * @exec: a regexp execution context or NULL to indicate the end
4111: * @value: a string token input
4112: * @data: data associated to the token to reuse in callbacks
4113: *
4114: * Push one input token in the execution context
4115: *
4116: * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4117: * a negative value in case of error.
4118: */
4119: int
4120: xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4121: void *data) {
4122: return(xmlRegExecPushStringInternal(exec, value, data, 0));
4123: }
4124:
4125: /**
4126: * xmlRegExecPushString2:
4127: * @exec: a regexp execution context or NULL to indicate the end
4128: * @value: the first string token input
4129: * @value2: the second string token input
4130: * @data: data associated to the token to reuse in callbacks
4131: *
4132: * Push one input token in the execution context
4133: *
4134: * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4135: * a negative value in case of error.
4136: */
4137: int
4138: xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4139: const xmlChar *value2, void *data) {
4140: xmlChar buf[150];
4141: int lenn, lenp, ret;
4142: xmlChar *str;
4143:
4144: if (exec == NULL)
4145: return(-1);
4146: if (exec->comp == NULL)
4147: return(-1);
4148: if (exec->status != 0)
4149: return(exec->status);
4150:
4151: if (value2 == NULL)
4152: return(xmlRegExecPushString(exec, value, data));
4153:
4154: lenn = strlen((char *) value2);
4155: lenp = strlen((char *) value);
4156:
4157: if (150 < lenn + lenp + 2) {
4158: str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4159: if (str == NULL) {
4160: exec->status = -1;
4161: return(-1);
4162: }
4163: } else {
4164: str = buf;
4165: }
4166: memcpy(&str[0], value, lenp);
4167: str[lenp] = XML_REG_STRING_SEPARATOR;
4168: memcpy(&str[lenp + 1], value2, lenn);
4169: str[lenn + lenp + 1] = 0;
4170:
4171: if (exec->comp->compact != NULL)
4172: ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4173: else
4174: ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4175:
4176: if (str != buf)
4177: xmlFree(str);
4178: return(ret);
4179: }
4180:
4181: /**
4182: * xmlRegExecGetValues:
4183: * @exec: a regexp execution context
4184: * @err: error extraction or normal one
4185: * @nbval: pointer to the number of accepted values IN/OUT
4186: * @nbneg: return number of negative transitions
4187: * @values: pointer to the array of acceptable values
4188: * @terminal: return value if this was a terminal state
4189: *
4190: * Extract informations from the regexp execution, internal routine to
4191: * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4192: *
4193: * Returns: 0 in case of success or -1 in case of error.
4194: */
4195: static int
4196: xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4197: int *nbval, int *nbneg,
4198: xmlChar **values, int *terminal) {
4199: int maxval;
4200: int nb = 0;
4201:
4202: if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4203: (values == NULL) || (*nbval <= 0))
4204: return(-1);
4205:
4206: maxval = *nbval;
4207: *nbval = 0;
4208: *nbneg = 0;
4209: if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4210: xmlRegexpPtr comp;
4211: int target, i, state;
4212:
4213: comp = exec->comp;
4214:
4215: if (err) {
4216: if (exec->errStateNo == -1) return(-1);
4217: state = exec->errStateNo;
4218: } else {
4219: state = exec->index;
4220: }
4221: if (terminal != NULL) {
4222: if (comp->compact[state * (comp->nbstrings + 1)] ==
4223: XML_REGEXP_FINAL_STATE)
4224: *terminal = 1;
4225: else
4226: *terminal = 0;
4227: }
4228: for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4229: target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4230: if ((target > 0) && (target <= comp->nbstates) &&
4231: (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4232: XML_REGEXP_SINK_STATE)) {
4233: values[nb++] = comp->stringMap[i];
4234: (*nbval)++;
4235: }
4236: }
4237: for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4238: target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4239: if ((target > 0) && (target <= comp->nbstates) &&
4240: (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4241: XML_REGEXP_SINK_STATE)) {
4242: values[nb++] = comp->stringMap[i];
4243: (*nbneg)++;
4244: }
4245: }
4246: } else {
4247: int transno;
4248: xmlRegTransPtr trans;
4249: xmlRegAtomPtr atom;
4250: xmlRegStatePtr state;
4251:
4252: if (terminal != NULL) {
4253: if (exec->state->type == XML_REGEXP_FINAL_STATE)
4254: *terminal = 1;
4255: else
4256: *terminal = 0;
4257: }
4258:
4259: if (err) {
4260: if (exec->errState == NULL) return(-1);
4261: state = exec->errState;
4262: } else {
4263: if (exec->state == NULL) return(-1);
4264: state = exec->state;
4265: }
4266: for (transno = 0;
4267: (transno < state->nbTrans) && (nb < maxval);
4268: transno++) {
4269: trans = &state->trans[transno];
4270: if (trans->to < 0)
4271: continue;
4272: atom = trans->atom;
4273: if ((atom == NULL) || (atom->valuep == NULL))
4274: continue;
4275: if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4276: /* this should not be reached but ... */
4277: TODO;
4278: } else if (trans->count == REGEXP_ALL_COUNTER) {
4279: /* this should not be reached but ... */
4280: TODO;
4281: } else if (trans->counter >= 0) {
4282: xmlRegCounterPtr counter = NULL;
4283: int count;
4284:
4285: if (err)
4286: count = exec->errCounts[trans->counter];
4287: else
4288: count = exec->counts[trans->counter];
4289: if (exec->comp != NULL)
4290: counter = &exec->comp->counters[trans->counter];
4291: if ((counter == NULL) || (count < counter->max)) {
4292: if (atom->neg)
4293: values[nb++] = (xmlChar *) atom->valuep2;
4294: else
4295: values[nb++] = (xmlChar *) atom->valuep;
4296: (*nbval)++;
4297: }
4298: } else {
4299: if ((exec->comp->states[trans->to] != NULL) &&
4300: (exec->comp->states[trans->to]->type !=
4301: XML_REGEXP_SINK_STATE)) {
4302: if (atom->neg)
4303: values[nb++] = (xmlChar *) atom->valuep2;
4304: else
4305: values[nb++] = (xmlChar *) atom->valuep;
4306: (*nbval)++;
4307: }
4308: }
4309: }
4310: for (transno = 0;
4311: (transno < state->nbTrans) && (nb < maxval);
4312: transno++) {
4313: trans = &state->trans[transno];
4314: if (trans->to < 0)
4315: continue;
4316: atom = trans->atom;
4317: if ((atom == NULL) || (atom->valuep == NULL))
4318: continue;
4319: if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4320: continue;
4321: } else if (trans->count == REGEXP_ALL_COUNTER) {
4322: continue;
4323: } else if (trans->counter >= 0) {
4324: continue;
4325: } else {
4326: if ((exec->comp->states[trans->to] != NULL) &&
4327: (exec->comp->states[trans->to]->type ==
4328: XML_REGEXP_SINK_STATE)) {
4329: if (atom->neg)
4330: values[nb++] = (xmlChar *) atom->valuep2;
4331: else
4332: values[nb++] = (xmlChar *) atom->valuep;
4333: (*nbneg)++;
4334: }
4335: }
4336: }
4337: }
4338: return(0);
4339: }
4340:
4341: /**
4342: * xmlRegExecNextValues:
4343: * @exec: a regexp execution context
4344: * @nbval: pointer to the number of accepted values IN/OUT
4345: * @nbneg: return number of negative transitions
4346: * @values: pointer to the array of acceptable values
4347: * @terminal: return value if this was a terminal state
4348: *
4349: * Extract informations from the regexp execution,
4350: * the parameter @values must point to an array of @nbval string pointers
4351: * on return nbval will contain the number of possible strings in that
4352: * state and the @values array will be updated with them. The string values
4353: * returned will be freed with the @exec context and don't need to be
4354: * deallocated.
4355: *
4356: * Returns: 0 in case of success or -1 in case of error.
4357: */
4358: int
4359: xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4360: xmlChar **values, int *terminal) {
4361: return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4362: }
4363:
4364: /**
4365: * xmlRegExecErrInfo:
4366: * @exec: a regexp execution context generating an error
4367: * @string: return value for the error string
4368: * @nbval: pointer to the number of accepted values IN/OUT
4369: * @nbneg: return number of negative transitions
4370: * @values: pointer to the array of acceptable values
4371: * @terminal: return value if this was a terminal state
4372: *
4373: * Extract error informations from the regexp execution, the parameter
4374: * @string will be updated with the value pushed and not accepted,
4375: * the parameter @values must point to an array of @nbval string pointers
4376: * on return nbval will contain the number of possible strings in that
4377: * state and the @values array will be updated with them. The string values
4378: * returned will be freed with the @exec context and don't need to be
4379: * deallocated.
4380: *
4381: * Returns: 0 in case of success or -1 in case of error.
4382: */
4383: int
4384: xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4385: int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4386: if (exec == NULL)
4387: return(-1);
4388: if (string != NULL) {
4389: if (exec->status != 0)
4390: *string = exec->errString;
4391: else
4392: *string = NULL;
4393: }
4394: return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4395: }
4396:
4397: #ifdef DEBUG_ERR
4398: static void testerr(xmlRegExecCtxtPtr exec) {
4399: const xmlChar *string;
4400: xmlChar *values[5];
4401: int nb = 5;
4402: int nbneg;
4403: int terminal;
4404: xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
4405: }
4406: #endif
4407:
4408: #if 0
4409: static int
4410: xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4411: xmlRegTransPtr trans;
4412: xmlRegAtomPtr atom;
4413: int ret;
4414: int codepoint, len;
4415:
4416: if (exec == NULL)
4417: return(-1);
4418: if (exec->status != 0)
4419: return(exec->status);
4420:
4421: while ((exec->status == 0) &&
4422: ((exec->inputString[exec->index] != 0) ||
4423: (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4424:
4425: /*
4426: * End of input on non-terminal state, rollback, however we may
4427: * still have epsilon like transition for counted transitions
4428: * on counters, in that case don't break too early.
4429: */
4430: if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4431: goto rollback;
4432:
4433: exec->transcount = 0;
4434: for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4435: trans = &exec->state->trans[exec->transno];
4436: if (trans->to < 0)
4437: continue;
4438: atom = trans->atom;
4439: ret = 0;
4440: if (trans->count >= 0) {
4441: int count;
4442: xmlRegCounterPtr counter;
4443:
4444: /*
4445: * A counted transition.
4446: */
4447:
4448: count = exec->counts[trans->count];
4449: counter = &exec->comp->counters[trans->count];
4450: #ifdef DEBUG_REGEXP_EXEC
4451: printf("testing count %d: val %d, min %d, max %d\n",
4452: trans->count, count, counter->min, counter->max);
4453: #endif
4454: ret = ((count >= counter->min) && (count <= counter->max));
4455: } else if (atom == NULL) {
4456: fprintf(stderr, "epsilon transition left at runtime\n");
4457: exec->status = -2;
4458: break;
4459: } else if (exec->inputString[exec->index] != 0) {
4460: codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4461: ret = xmlRegCheckCharacter(atom, codepoint);
4462: if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4463: xmlRegStatePtr to = exec->comp->states[trans->to];
4464:
4465: /*
4466: * this is a multiple input sequence
4467: */
4468: if (exec->state->nbTrans > exec->transno + 1) {
4469: xmlFARegExecSave(exec);
4470: }
4471: exec->transcount = 1;
4472: do {
4473: /*
4474: * Try to progress as much as possible on the input
4475: */
4476: if (exec->transcount == atom->max) {
4477: break;
4478: }
4479: exec->index += len;
4480: /*
4481: * End of input: stop here
4482: */
4483: if (exec->inputString[exec->index] == 0) {
4484: exec->index -= len;
4485: break;
4486: }
4487: if (exec->transcount >= atom->min) {
4488: int transno = exec->transno;
4489: xmlRegStatePtr state = exec->state;
4490:
4491: /*
4492: * The transition is acceptable save it
4493: */
4494: exec->transno = -1; /* trick */
4495: exec->state = to;
4496: xmlFARegExecSave(exec);
4497: exec->transno = transno;
4498: exec->state = state;
4499: }
4500: codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4501: len);
4502: ret = xmlRegCheckCharacter(atom, codepoint);
4503: exec->transcount++;
4504: } while (ret == 1);
4505: if (exec->transcount < atom->min)
4506: ret = 0;
4507:
4508: /*
4509: * If the last check failed but one transition was found
4510: * possible, rollback
4511: */
4512: if (ret < 0)
4513: ret = 0;
4514: if (ret == 0) {
4515: goto rollback;
4516: }
4517: }
4518: }
4519: if (ret == 1) {
4520: if (exec->state->nbTrans > exec->transno + 1) {
4521: xmlFARegExecSave(exec);
4522: }
4523: /*
4524: * restart count for expressions like this ((abc){2})*
4525: */
4526: if (trans->count >= 0) {
4527: #ifdef DEBUG_REGEXP_EXEC
4528: printf("Reset count %d\n", trans->count);
4529: #endif
4530: exec->counts[trans->count] = 0;
4531: }
4532: if (trans->counter >= 0) {
4533: #ifdef DEBUG_REGEXP_EXEC
4534: printf("Increasing count %d\n", trans->counter);
4535: #endif
4536: exec->counts[trans->counter]++;
4537: }
4538: #ifdef DEBUG_REGEXP_EXEC
4539: printf("entering state %d\n", trans->to);
4540: #endif
4541: exec->state = exec->comp->states[trans->to];
4542: exec->transno = 0;
4543: if (trans->atom != NULL) {
4544: exec->index += len;
4545: }
4546: goto progress;
4547: } else if (ret < 0) {
4548: exec->status = -4;
4549: break;
4550: }
4551: }
4552: if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4553: rollback:
4554: /*
4555: * Failed to find a way out
4556: */
4557: exec->determinist = 0;
4558: xmlFARegExecRollBack(exec);
4559: }
4560: progress:
4561: continue;
4562: }
4563: }
4564: #endif
4565: /************************************************************************
4566: * *
4567: * Parser for the Schemas Datatype Regular Expressions *
4568: * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4569: * *
4570: ************************************************************************/
4571:
4572: /**
4573: * xmlFAIsChar:
4574: * @ctxt: a regexp parser context
4575: *
4576: * [10] Char ::= [^.\?*+()|#x5B#x5D]
4577: */
4578: static int
4579: xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4580: int cur;
4581: int len;
4582:
4583: cur = CUR_SCHAR(ctxt->cur, len);
4584: if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4585: (cur == '*') || (cur == '+') || (cur == '(') ||
4586: (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4587: (cur == 0x5D) || (cur == 0))
4588: return(-1);
4589: return(cur);
4590: }
4591:
4592: /**
4593: * xmlFAParseCharProp:
4594: * @ctxt: a regexp parser context
4595: *
4596: * [27] charProp ::= IsCategory | IsBlock
4597: * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4598: * Separators | Symbols | Others
4599: * [29] Letters ::= 'L' [ultmo]?
4600: * [30] Marks ::= 'M' [nce]?
4601: * [31] Numbers ::= 'N' [dlo]?
4602: * [32] Punctuation ::= 'P' [cdseifo]?
4603: * [33] Separators ::= 'Z' [slp]?
4604: * [34] Symbols ::= 'S' [mcko]?
4605: * [35] Others ::= 'C' [cfon]?
4606: * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4607: */
4608: static void
4609: xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4610: int cur;
4611: xmlRegAtomType type = (xmlRegAtomType) 0;
4612: xmlChar *blockName = NULL;
4613:
4614: cur = CUR;
4615: if (cur == 'L') {
4616: NEXT;
4617: cur = CUR;
4618: if (cur == 'u') {
4619: NEXT;
4620: type = XML_REGEXP_LETTER_UPPERCASE;
4621: } else if (cur == 'l') {
4622: NEXT;
4623: type = XML_REGEXP_LETTER_LOWERCASE;
4624: } else if (cur == 't') {
4625: NEXT;
4626: type = XML_REGEXP_LETTER_TITLECASE;
4627: } else if (cur == 'm') {
4628: NEXT;
4629: type = XML_REGEXP_LETTER_MODIFIER;
4630: } else if (cur == 'o') {
4631: NEXT;
4632: type = XML_REGEXP_LETTER_OTHERS;
4633: } else {
4634: type = XML_REGEXP_LETTER;
4635: }
4636: } else if (cur == 'M') {
4637: NEXT;
4638: cur = CUR;
4639: if (cur == 'n') {
4640: NEXT;
4641: /* nonspacing */
4642: type = XML_REGEXP_MARK_NONSPACING;
4643: } else if (cur == 'c') {
4644: NEXT;
4645: /* spacing combining */
4646: type = XML_REGEXP_MARK_SPACECOMBINING;
4647: } else if (cur == 'e') {
4648: NEXT;
4649: /* enclosing */
4650: type = XML_REGEXP_MARK_ENCLOSING;
4651: } else {
4652: /* all marks */
4653: type = XML_REGEXP_MARK;
4654: }
4655: } else if (cur == 'N') {
4656: NEXT;
4657: cur = CUR;
4658: if (cur == 'd') {
4659: NEXT;
4660: /* digital */
4661: type = XML_REGEXP_NUMBER_DECIMAL;
4662: } else if (cur == 'l') {
4663: NEXT;
4664: /* letter */
4665: type = XML_REGEXP_NUMBER_LETTER;
4666: } else if (cur == 'o') {
4667: NEXT;
4668: /* other */
4669: type = XML_REGEXP_NUMBER_OTHERS;
4670: } else {
4671: /* all numbers */
4672: type = XML_REGEXP_NUMBER;
4673: }
4674: } else if (cur == 'P') {
4675: NEXT;
4676: cur = CUR;
4677: if (cur == 'c') {
4678: NEXT;
4679: /* connector */
4680: type = XML_REGEXP_PUNCT_CONNECTOR;
4681: } else if (cur == 'd') {
4682: NEXT;
4683: /* dash */
4684: type = XML_REGEXP_PUNCT_DASH;
4685: } else if (cur == 's') {
4686: NEXT;
4687: /* open */
4688: type = XML_REGEXP_PUNCT_OPEN;
4689: } else if (cur == 'e') {
4690: NEXT;
4691: /* close */
4692: type = XML_REGEXP_PUNCT_CLOSE;
4693: } else if (cur == 'i') {
4694: NEXT;
4695: /* initial quote */
4696: type = XML_REGEXP_PUNCT_INITQUOTE;
4697: } else if (cur == 'f') {
4698: NEXT;
4699: /* final quote */
4700: type = XML_REGEXP_PUNCT_FINQUOTE;
4701: } else if (cur == 'o') {
4702: NEXT;
4703: /* other */
4704: type = XML_REGEXP_PUNCT_OTHERS;
4705: } else {
4706: /* all punctuation */
4707: type = XML_REGEXP_PUNCT;
4708: }
4709: } else if (cur == 'Z') {
4710: NEXT;
4711: cur = CUR;
4712: if (cur == 's') {
4713: NEXT;
4714: /* space */
4715: type = XML_REGEXP_SEPAR_SPACE;
4716: } else if (cur == 'l') {
4717: NEXT;
4718: /* line */
4719: type = XML_REGEXP_SEPAR_LINE;
4720: } else if (cur == 'p') {
4721: NEXT;
4722: /* paragraph */
4723: type = XML_REGEXP_SEPAR_PARA;
4724: } else {
4725: /* all separators */
4726: type = XML_REGEXP_SEPAR;
4727: }
4728: } else if (cur == 'S') {
4729: NEXT;
4730: cur = CUR;
4731: if (cur == 'm') {
4732: NEXT;
4733: type = XML_REGEXP_SYMBOL_MATH;
4734: /* math */
4735: } else if (cur == 'c') {
4736: NEXT;
4737: type = XML_REGEXP_SYMBOL_CURRENCY;
4738: /* currency */
4739: } else if (cur == 'k') {
4740: NEXT;
4741: type = XML_REGEXP_SYMBOL_MODIFIER;
4742: /* modifiers */
4743: } else if (cur == 'o') {
4744: NEXT;
4745: type = XML_REGEXP_SYMBOL_OTHERS;
4746: /* other */
4747: } else {
4748: /* all symbols */
4749: type = XML_REGEXP_SYMBOL;
4750: }
4751: } else if (cur == 'C') {
4752: NEXT;
4753: cur = CUR;
4754: if (cur == 'c') {
4755: NEXT;
4756: /* control */
4757: type = XML_REGEXP_OTHER_CONTROL;
4758: } else if (cur == 'f') {
4759: NEXT;
4760: /* format */
4761: type = XML_REGEXP_OTHER_FORMAT;
4762: } else if (cur == 'o') {
4763: NEXT;
4764: /* private use */
4765: type = XML_REGEXP_OTHER_PRIVATE;
4766: } else if (cur == 'n') {
4767: NEXT;
4768: /* not assigned */
4769: type = XML_REGEXP_OTHER_NA;
4770: } else {
4771: /* all others */
4772: type = XML_REGEXP_OTHER;
4773: }
4774: } else if (cur == 'I') {
4775: const xmlChar *start;
4776: NEXT;
4777: cur = CUR;
4778: if (cur != 's') {
4779: ERROR("IsXXXX expected");
4780: return;
4781: }
4782: NEXT;
4783: start = ctxt->cur;
4784: cur = CUR;
4785: if (((cur >= 'a') && (cur <= 'z')) ||
4786: ((cur >= 'A') && (cur <= 'Z')) ||
4787: ((cur >= '0') && (cur <= '9')) ||
4788: (cur == 0x2D)) {
4789: NEXT;
4790: cur = CUR;
4791: while (((cur >= 'a') && (cur <= 'z')) ||
4792: ((cur >= 'A') && (cur <= 'Z')) ||
4793: ((cur >= '0') && (cur <= '9')) ||
4794: (cur == 0x2D)) {
4795: NEXT;
4796: cur = CUR;
4797: }
4798: }
4799: type = XML_REGEXP_BLOCK_NAME;
4800: blockName = xmlStrndup(start, ctxt->cur - start);
4801: } else {
4802: ERROR("Unknown char property");
4803: return;
4804: }
4805: if (ctxt->atom == NULL) {
4806: ctxt->atom = xmlRegNewAtom(ctxt, type);
4807: if (ctxt->atom != NULL)
4808: ctxt->atom->valuep = blockName;
4809: } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4810: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4811: type, 0, 0, blockName);
4812: }
4813: }
4814:
4815: /**
4816: * xmlFAParseCharClassEsc:
4817: * @ctxt: a regexp parser context
4818: *
4819: * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4820: * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4821: * [25] catEsc ::= '\p{' charProp '}'
4822: * [26] complEsc ::= '\P{' charProp '}'
4823: * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4824: */
4825: static void
4826: xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4827: int cur;
4828:
4829: if (CUR == '.') {
4830: if (ctxt->atom == NULL) {
4831: ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4832: } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4833: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4834: XML_REGEXP_ANYCHAR, 0, 0, NULL);
4835: }
4836: NEXT;
4837: return;
4838: }
4839: if (CUR != '\\') {
4840: ERROR("Escaped sequence: expecting \\");
4841: return;
4842: }
4843: NEXT;
4844: cur = CUR;
4845: if (cur == 'p') {
4846: NEXT;
4847: if (CUR != '{') {
4848: ERROR("Expecting '{'");
4849: return;
4850: }
4851: NEXT;
4852: xmlFAParseCharProp(ctxt);
4853: if (CUR != '}') {
4854: ERROR("Expecting '}'");
4855: return;
4856: }
4857: NEXT;
4858: } else if (cur == 'P') {
4859: NEXT;
4860: if (CUR != '{') {
4861: ERROR("Expecting '{'");
4862: return;
4863: }
4864: NEXT;
4865: xmlFAParseCharProp(ctxt);
4866: ctxt->atom->neg = 1;
4867: if (CUR != '}') {
4868: ERROR("Expecting '}'");
4869: return;
4870: }
4871: NEXT;
4872: } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4873: (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4874: (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4875: (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4876: (cur == 0x5E)) {
4877: if (ctxt->atom == NULL) {
4878: ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4879: if (ctxt->atom != NULL) {
4880: switch (cur) {
4881: case 'n':
4882: ctxt->atom->codepoint = '\n';
4883: break;
4884: case 'r':
4885: ctxt->atom->codepoint = '\r';
4886: break;
4887: case 't':
4888: ctxt->atom->codepoint = '\t';
4889: break;
4890: default:
4891: ctxt->atom->codepoint = cur;
4892: }
4893: }
4894: } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4895: switch (cur) {
4896: case 'n':
4897: cur = '\n';
4898: break;
4899: case 'r':
4900: cur = '\r';
4901: break;
4902: case 't':
4903: cur = '\t';
4904: break;
4905: }
4906: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4907: XML_REGEXP_CHARVAL, cur, cur, NULL);
4908: }
4909: NEXT;
4910: } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4911: (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4912: (cur == 'w') || (cur == 'W')) {
4913: xmlRegAtomType type = XML_REGEXP_ANYSPACE;
4914:
4915: switch (cur) {
4916: case 's':
4917: type = XML_REGEXP_ANYSPACE;
4918: break;
4919: case 'S':
4920: type = XML_REGEXP_NOTSPACE;
4921: break;
4922: case 'i':
4923: type = XML_REGEXP_INITNAME;
4924: break;
4925: case 'I':
4926: type = XML_REGEXP_NOTINITNAME;
4927: break;
4928: case 'c':
4929: type = XML_REGEXP_NAMECHAR;
4930: break;
4931: case 'C':
4932: type = XML_REGEXP_NOTNAMECHAR;
4933: break;
4934: case 'd':
4935: type = XML_REGEXP_DECIMAL;
4936: break;
4937: case 'D':
4938: type = XML_REGEXP_NOTDECIMAL;
4939: break;
4940: case 'w':
4941: type = XML_REGEXP_REALCHAR;
4942: break;
4943: case 'W':
4944: type = XML_REGEXP_NOTREALCHAR;
4945: break;
4946: }
4947: NEXT;
4948: if (ctxt->atom == NULL) {
4949: ctxt->atom = xmlRegNewAtom(ctxt, type);
4950: } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4951: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4952: type, 0, 0, NULL);
4953: }
4954: } else {
4955: ERROR("Wrong escape sequence, misuse of character '\\'");
4956: }
4957: }
4958:
4959: /**
4960: * xmlFAParseCharRange:
4961: * @ctxt: a regexp parser context
4962: *
4963: * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
4964: * [18] seRange ::= charOrEsc '-' charOrEsc
4965: * [20] charOrEsc ::= XmlChar | SingleCharEsc
4966: * [21] XmlChar ::= [^\#x2D#x5B#x5D]
4967: * [22] XmlCharIncDash ::= [^\#x5B#x5D]
4968: */
4969: static void
4970: xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
4971: int cur, len;
4972: int start = -1;
4973: int end = -1;
4974:
4975: if (CUR == '\0') {
4976: ERROR("Expecting ']'");
4977: return;
4978: }
4979:
4980: cur = CUR;
4981: if (cur == '\\') {
4982: NEXT;
4983: cur = CUR;
4984: switch (cur) {
4985: case 'n': start = 0xA; break;
4986: case 'r': start = 0xD; break;
4987: case 't': start = 0x9; break;
4988: case '\\': case '|': case '.': case '-': case '^': case '?':
4989: case '*': case '+': case '{': case '}': case '(': case ')':
4990: case '[': case ']':
4991: start = cur; break;
4992: default:
4993: ERROR("Invalid escape value");
4994: return;
4995: }
4996: end = start;
4997: len = 1;
4998: } else if ((cur != 0x5B) && (cur != 0x5D)) {
4999: end = start = CUR_SCHAR(ctxt->cur, len);
5000: } else {
5001: ERROR("Expecting a char range");
5002: return;
5003: }
5004: /*
5005: * Since we are "inside" a range, we can assume ctxt->cur is past
5006: * the start of ctxt->string, and PREV should be safe
5007: */
5008: if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5009: NEXTL(len);
5010: return;
5011: }
5012: NEXTL(len);
5013: cur = CUR;
5014: if ((cur != '-') || (NXT(1) == ']')) {
5015: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5016: XML_REGEXP_CHARVAL, start, end, NULL);
5017: return;
5018: }
5019: NEXT;
5020: cur = CUR;
5021: if (cur == '\\') {
5022: NEXT;
5023: cur = CUR;
5024: switch (cur) {
5025: case 'n': end = 0xA; break;
5026: case 'r': end = 0xD; break;
5027: case 't': end = 0x9; break;
5028: case '\\': case '|': case '.': case '-': case '^': case '?':
5029: case '*': case '+': case '{': case '}': case '(': case ')':
5030: case '[': case ']':
5031: end = cur; break;
5032: default:
5033: ERROR("Invalid escape value");
5034: return;
5035: }
5036: len = 1;
5037: } else if ((cur != 0x5B) && (cur != 0x5D)) {
5038: end = CUR_SCHAR(ctxt->cur, len);
5039: } else {
5040: ERROR("Expecting the end of a char range");
5041: return;
5042: }
5043: NEXTL(len);
5044: /* TODO check that the values are acceptable character ranges for XML */
5045: if (end < start) {
5046: ERROR("End of range is before start of range");
5047: } else {
5048: xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5049: XML_REGEXP_CHARVAL, start, end, NULL);
5050: }
5051: return;
5052: }
5053:
5054: /**
5055: * xmlFAParsePosCharGroup:
5056: * @ctxt: a regexp parser context
5057: *
5058: * [14] posCharGroup ::= ( charRange | charClassEsc )+
5059: */
5060: static void
5061: xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5062: do {
5063: if (CUR == '\\') {
5064: xmlFAParseCharClassEsc(ctxt);
5065: } else {
5066: xmlFAParseCharRange(ctxt);
5067: }
5068: } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
5069: (CUR != 0) && (ctxt->error == 0));
5070: }
5071:
5072: /**
5073: * xmlFAParseCharGroup:
5074: * @ctxt: a regexp parser context
5075: *
5076: * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5077: * [15] negCharGroup ::= '^' posCharGroup
5078: * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5079: * [12] charClassExpr ::= '[' charGroup ']'
5080: */
5081: static void
5082: xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5083: int n = ctxt->neg;
5084: while ((CUR != ']') && (ctxt->error == 0)) {
5085: if (CUR == '^') {
5086: int neg = ctxt->neg;
5087:
5088: NEXT;
5089: ctxt->neg = !ctxt->neg;
5090: xmlFAParsePosCharGroup(ctxt);
5091: ctxt->neg = neg;
5092: } else if ((CUR == '-') && (NXT(1) == '[')) {
5093: int neg = ctxt->neg;
5094: ctxt->neg = 2;
5095: NEXT; /* eat the '-' */
5096: NEXT; /* eat the '[' */
5097: xmlFAParseCharGroup(ctxt);
5098: if (CUR == ']') {
5099: NEXT;
5100: } else {
5101: ERROR("charClassExpr: ']' expected");
5102: break;
5103: }
5104: ctxt->neg = neg;
5105: break;
5106: } else if (CUR != ']') {
5107: xmlFAParsePosCharGroup(ctxt);
5108: }
5109: }
5110: ctxt->neg = n;
5111: }
5112:
5113: /**
5114: * xmlFAParseCharClass:
5115: * @ctxt: a regexp parser context
5116: *
5117: * [11] charClass ::= charClassEsc | charClassExpr
5118: * [12] charClassExpr ::= '[' charGroup ']'
5119: */
5120: static void
5121: xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5122: if (CUR == '[') {
5123: NEXT;
5124: ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5125: if (ctxt->atom == NULL)
5126: return;
5127: xmlFAParseCharGroup(ctxt);
5128: if (CUR == ']') {
5129: NEXT;
5130: } else {
5131: ERROR("xmlFAParseCharClass: ']' expected");
5132: }
5133: } else {
5134: xmlFAParseCharClassEsc(ctxt);
5135: }
5136: }
5137:
5138: /**
5139: * xmlFAParseQuantExact:
5140: * @ctxt: a regexp parser context
5141: *
5142: * [8] QuantExact ::= [0-9]+
5143: *
5144: * Returns 0 if success or -1 in case of error
5145: */
5146: static int
5147: xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5148: int ret = 0;
5149: int ok = 0;
5150:
5151: while ((CUR >= '0') && (CUR <= '9')) {
5152: ret = ret * 10 + (CUR - '0');
5153: ok = 1;
5154: NEXT;
5155: }
5156: if (ok != 1) {
5157: return(-1);
5158: }
5159: return(ret);
5160: }
5161:
5162: /**
5163: * xmlFAParseQuantifier:
5164: * @ctxt: a regexp parser context
5165: *
5166: * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5167: * [5] quantity ::= quantRange | quantMin | QuantExact
5168: * [6] quantRange ::= QuantExact ',' QuantExact
5169: * [7] quantMin ::= QuantExact ','
5170: * [8] QuantExact ::= [0-9]+
5171: */
5172: static int
5173: xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5174: int cur;
5175:
5176: cur = CUR;
5177: if ((cur == '?') || (cur == '*') || (cur == '+')) {
5178: if (ctxt->atom != NULL) {
5179: if (cur == '?')
5180: ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5181: else if (cur == '*')
5182: ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5183: else if (cur == '+')
5184: ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5185: }
5186: NEXT;
5187: return(1);
5188: }
5189: if (cur == '{') {
5190: int min = 0, max = 0;
5191:
5192: NEXT;
5193: cur = xmlFAParseQuantExact(ctxt);
5194: if (cur >= 0)
5195: min = cur;
5196: if (CUR == ',') {
5197: NEXT;
5198: if (CUR == '}')
5199: max = INT_MAX;
5200: else {
5201: cur = xmlFAParseQuantExact(ctxt);
5202: if (cur >= 0)
5203: max = cur;
5204: else {
5205: ERROR("Improper quantifier");
5206: }
5207: }
5208: }
5209: if (CUR == '}') {
5210: NEXT;
5211: } else {
5212: ERROR("Unterminated quantifier");
5213: }
5214: if (max == 0)
5215: max = min;
5216: if (ctxt->atom != NULL) {
5217: ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5218: ctxt->atom->min = min;
5219: ctxt->atom->max = max;
5220: }
5221: return(1);
5222: }
5223: return(0);
5224: }
5225:
5226: /**
5227: * xmlFAParseAtom:
5228: * @ctxt: a regexp parser context
5229: *
5230: * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5231: */
5232: static int
5233: xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5234: int codepoint, len;
5235:
5236: codepoint = xmlFAIsChar(ctxt);
5237: if (codepoint > 0) {
5238: ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5239: if (ctxt->atom == NULL)
5240: return(-1);
5241: codepoint = CUR_SCHAR(ctxt->cur, len);
5242: ctxt->atom->codepoint = codepoint;
5243: NEXTL(len);
5244: return(1);
5245: } else if (CUR == '|') {
5246: return(0);
5247: } else if (CUR == 0) {
5248: return(0);
5249: } else if (CUR == ')') {
5250: return(0);
5251: } else if (CUR == '(') {
5252: xmlRegStatePtr start, oldend, start0;
5253:
5254: NEXT;
5255: /*
5256: * this extra Epsilon transition is needed if we count with 0 allowed
5257: * unfortunately this can't be known at that point
5258: */
5259: xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5260: start0 = ctxt->state;
5261: xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5262: start = ctxt->state;
5263: oldend = ctxt->end;
5264: ctxt->end = NULL;
5265: ctxt->atom = NULL;
5266: xmlFAParseRegExp(ctxt, 0);
5267: if (CUR == ')') {
5268: NEXT;
5269: } else {
5270: ERROR("xmlFAParseAtom: expecting ')'");
5271: }
5272: ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5273: if (ctxt->atom == NULL)
5274: return(-1);
5275: ctxt->atom->start = start;
5276: ctxt->atom->start0 = start0;
5277: ctxt->atom->stop = ctxt->state;
5278: ctxt->end = oldend;
5279: return(1);
5280: } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5281: xmlFAParseCharClass(ctxt);
5282: return(1);
5283: }
5284: return(0);
5285: }
5286:
5287: /**
5288: * xmlFAParsePiece:
5289: * @ctxt: a regexp parser context
5290: *
5291: * [3] piece ::= atom quantifier?
5292: */
5293: static int
5294: xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5295: int ret;
5296:
5297: ctxt->atom = NULL;
5298: ret = xmlFAParseAtom(ctxt);
5299: if (ret == 0)
5300: return(0);
5301: if (ctxt->atom == NULL) {
5302: ERROR("internal: no atom generated");
5303: }
5304: xmlFAParseQuantifier(ctxt);
5305: return(1);
5306: }
5307:
5308: /**
5309: * xmlFAParseBranch:
5310: * @ctxt: a regexp parser context
5311: * @to: optional target to the end of the branch
5312: *
5313: * @to is used to optimize by removing duplicate path in automata
5314: * in expressions like (a|b)(c|d)
5315: *
5316: * [2] branch ::= piece*
5317: */
5318: static int
5319: xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5320: xmlRegStatePtr previous;
5321: int ret;
5322:
5323: previous = ctxt->state;
5324: ret = xmlFAParsePiece(ctxt);
5325: if (ret != 0) {
5326: if (xmlFAGenerateTransitions(ctxt, previous,
5327: (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
5328: return(-1);
5329: previous = ctxt->state;
5330: ctxt->atom = NULL;
5331: }
5332: while ((ret != 0) && (ctxt->error == 0)) {
5333: ret = xmlFAParsePiece(ctxt);
5334: if (ret != 0) {
5335: if (xmlFAGenerateTransitions(ctxt, previous,
5336: (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
5337: return(-1);
5338: previous = ctxt->state;
5339: ctxt->atom = NULL;
5340: }
5341: }
5342: return(0);
5343: }
5344:
5345: /**
5346: * xmlFAParseRegExp:
5347: * @ctxt: a regexp parser context
5348: * @top: is this the top-level expression ?
5349: *
5350: * [1] regExp ::= branch ( '|' branch )*
5351: */
5352: static void
5353: xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5354: xmlRegStatePtr start, end;
5355:
5356: /* if not top start should have been generated by an epsilon trans */
5357: start = ctxt->state;
5358: ctxt->end = NULL;
5359: xmlFAParseBranch(ctxt, NULL);
5360: if (top) {
5361: #ifdef DEBUG_REGEXP_GRAPH
5362: printf("State %d is final\n", ctxt->state->no);
5363: #endif
5364: ctxt->state->type = XML_REGEXP_FINAL_STATE;
5365: }
5366: if (CUR != '|') {
5367: ctxt->end = ctxt->state;
5368: return;
5369: }
5370: end = ctxt->state;
5371: while ((CUR == '|') && (ctxt->error == 0)) {
5372: NEXT;
5373: ctxt->state = start;
5374: ctxt->end = NULL;
5375: xmlFAParseBranch(ctxt, end);
5376: }
5377: if (!top) {
5378: ctxt->state = end;
5379: ctxt->end = end;
5380: }
5381: }
5382:
5383: /************************************************************************
5384: * *
5385: * The basic API *
5386: * *
5387: ************************************************************************/
5388:
5389: /**
5390: * xmlRegexpPrint:
5391: * @output: the file for the output debug
5392: * @regexp: the compiled regexp
5393: *
5394: * Print the content of the compiled regular expression
5395: */
5396: void
5397: xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5398: int i;
5399:
5400: if (output == NULL)
5401: return;
5402: fprintf(output, " regexp: ");
5403: if (regexp == NULL) {
5404: fprintf(output, "NULL\n");
5405: return;
5406: }
5407: fprintf(output, "'%s' ", regexp->string);
5408: fprintf(output, "\n");
5409: fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5410: for (i = 0;i < regexp->nbAtoms; i++) {
5411: fprintf(output, " %02d ", i);
5412: xmlRegPrintAtom(output, regexp->atoms[i]);
5413: }
5414: fprintf(output, "%d states:", regexp->nbStates);
5415: fprintf(output, "\n");
5416: for (i = 0;i < regexp->nbStates; i++) {
5417: xmlRegPrintState(output, regexp->states[i]);
5418: }
5419: fprintf(output, "%d counters:\n", regexp->nbCounters);
5420: for (i = 0;i < regexp->nbCounters; i++) {
5421: fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5422: regexp->counters[i].max);
5423: }
5424: }
5425:
5426: /**
5427: * xmlRegexpCompile:
5428: * @regexp: a regular expression string
5429: *
5430: * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5431: * Appendix F and builds an automata suitable for testing strings against
5432: * that regular expression
5433: *
5434: * Returns the compiled expression or NULL in case of error
5435: */
5436: xmlRegexpPtr
5437: xmlRegexpCompile(const xmlChar *regexp) {
5438: xmlRegexpPtr ret;
5439: xmlRegParserCtxtPtr ctxt;
5440:
5441: ctxt = xmlRegNewParserCtxt(regexp);
5442: if (ctxt == NULL)
5443: return(NULL);
5444:
5445: /* initialize the parser */
5446: ctxt->end = NULL;
5447: ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5448: xmlRegStatePush(ctxt, ctxt->start);
5449:
5450: /* parse the expression building an automata */
5451: xmlFAParseRegExp(ctxt, 1);
5452: if (CUR != 0) {
5453: ERROR("xmlFAParseRegExp: extra characters");
5454: }
5455: if (ctxt->error != 0) {
5456: xmlRegFreeParserCtxt(ctxt);
5457: return(NULL);
5458: }
5459: ctxt->end = ctxt->state;
5460: ctxt->start->type = XML_REGEXP_START_STATE;
5461: ctxt->end->type = XML_REGEXP_FINAL_STATE;
5462:
5463: /* remove the Epsilon except for counted transitions */
5464: xmlFAEliminateEpsilonTransitions(ctxt);
5465:
5466:
5467: if (ctxt->error != 0) {
5468: xmlRegFreeParserCtxt(ctxt);
5469: return(NULL);
5470: }
5471: ret = xmlRegEpxFromParse(ctxt);
5472: xmlRegFreeParserCtxt(ctxt);
5473: return(ret);
5474: }
5475:
5476: /**
5477: * xmlRegexpExec:
5478: * @comp: the compiled regular expression
5479: * @content: the value to check against the regular expression
5480: *
5481: * Check if the regular expression generates the value
5482: *
5483: * Returns 1 if it matches, 0 if not and a negative value in case of error
5484: */
5485: int
5486: xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5487: if ((comp == NULL) || (content == NULL))
5488: return(-1);
5489: return(xmlFARegExec(comp, content));
5490: }
5491:
5492: /**
5493: * xmlRegexpIsDeterminist:
5494: * @comp: the compiled regular expression
5495: *
5496: * Check if the regular expression is determinist
5497: *
5498: * Returns 1 if it yes, 0 if not and a negative value in case of error
5499: */
5500: int
5501: xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5502: xmlAutomataPtr am;
5503: int ret;
5504:
5505: if (comp == NULL)
5506: return(-1);
5507: if (comp->determinist != -1)
5508: return(comp->determinist);
5509:
5510: am = xmlNewAutomata();
5511: if (am->states != NULL) {
5512: int i;
5513:
5514: for (i = 0;i < am->nbStates;i++)
5515: xmlRegFreeState(am->states[i]);
5516: xmlFree(am->states);
5517: }
5518: am->nbAtoms = comp->nbAtoms;
5519: am->atoms = comp->atoms;
5520: am->nbStates = comp->nbStates;
5521: am->states = comp->states;
5522: am->determinist = -1;
5523: am->flags = comp->flags;
5524: ret = xmlFAComputesDeterminism(am);
5525: am->atoms = NULL;
5526: am->states = NULL;
5527: xmlFreeAutomata(am);
5528: comp->determinist = ret;
5529: return(ret);
5530: }
5531:
5532: /**
5533: * xmlRegFreeRegexp:
5534: * @regexp: the regexp
5535: *
5536: * Free a regexp
5537: */
5538: void
5539: xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5540: int i;
5541: if (regexp == NULL)
5542: return;
5543:
5544: if (regexp->string != NULL)
5545: xmlFree(regexp->string);
5546: if (regexp->states != NULL) {
5547: for (i = 0;i < regexp->nbStates;i++)
5548: xmlRegFreeState(regexp->states[i]);
5549: xmlFree(regexp->states);
5550: }
5551: if (regexp->atoms != NULL) {
5552: for (i = 0;i < regexp->nbAtoms;i++)
5553: xmlRegFreeAtom(regexp->atoms[i]);
5554: xmlFree(regexp->atoms);
5555: }
5556: if (regexp->counters != NULL)
5557: xmlFree(regexp->counters);
5558: if (regexp->compact != NULL)
5559: xmlFree(regexp->compact);
5560: if (regexp->transdata != NULL)
5561: xmlFree(regexp->transdata);
5562: if (regexp->stringMap != NULL) {
5563: for (i = 0; i < regexp->nbstrings;i++)
5564: xmlFree(regexp->stringMap[i]);
5565: xmlFree(regexp->stringMap);
5566: }
5567:
5568: xmlFree(regexp);
5569: }
5570:
5571: #ifdef LIBXML_AUTOMATA_ENABLED
5572: /************************************************************************
5573: * *
5574: * The Automata interface *
5575: * *
5576: ************************************************************************/
5577:
5578: /**
5579: * xmlNewAutomata:
5580: *
5581: * Create a new automata
5582: *
5583: * Returns the new object or NULL in case of failure
5584: */
5585: xmlAutomataPtr
5586: xmlNewAutomata(void) {
5587: xmlAutomataPtr ctxt;
5588:
5589: ctxt = xmlRegNewParserCtxt(NULL);
5590: if (ctxt == NULL)
5591: return(NULL);
5592:
5593: /* initialize the parser */
5594: ctxt->end = NULL;
5595: ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5596: if (ctxt->start == NULL) {
5597: xmlFreeAutomata(ctxt);
5598: return(NULL);
5599: }
5600: ctxt->start->type = XML_REGEXP_START_STATE;
5601: if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5602: xmlRegFreeState(ctxt->start);
5603: xmlFreeAutomata(ctxt);
5604: return(NULL);
5605: }
5606: ctxt->flags = 0;
5607:
5608: return(ctxt);
5609: }
5610:
5611: /**
5612: * xmlFreeAutomata:
5613: * @am: an automata
5614: *
5615: * Free an automata
5616: */
5617: void
5618: xmlFreeAutomata(xmlAutomataPtr am) {
5619: if (am == NULL)
5620: return;
5621: xmlRegFreeParserCtxt(am);
5622: }
5623:
5624: /**
5625: * xmlAutomataSetFlags:
5626: * @am: an automata
5627: * @flags: a set of internal flags
5628: *
5629: * Set some flags on the automata
5630: */
5631: void
5632: xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5633: if (am == NULL)
5634: return;
5635: am->flags |= flags;
5636: }
5637:
5638: /**
5639: * xmlAutomataGetInitState:
5640: * @am: an automata
5641: *
5642: * Initial state lookup
5643: *
5644: * Returns the initial state of the automata
5645: */
5646: xmlAutomataStatePtr
5647: xmlAutomataGetInitState(xmlAutomataPtr am) {
5648: if (am == NULL)
5649: return(NULL);
5650: return(am->start);
5651: }
5652:
5653: /**
5654: * xmlAutomataSetFinalState:
5655: * @am: an automata
5656: * @state: a state in this automata
5657: *
5658: * Makes that state a final state
5659: *
5660: * Returns 0 or -1 in case of error
5661: */
5662: int
5663: xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5664: if ((am == NULL) || (state == NULL))
5665: return(-1);
5666: state->type = XML_REGEXP_FINAL_STATE;
5667: return(0);
5668: }
5669:
5670: /**
5671: * xmlAutomataNewTransition:
5672: * @am: an automata
5673: * @from: the starting point of the transition
5674: * @to: the target point of the transition or NULL
5675: * @token: the input string associated to that transition
5676: * @data: data passed to the callback function if the transition is activated
5677: *
5678: * If @to is NULL, this creates first a new target state in the automata
5679: * and then adds a transition from the @from state to the target state
5680: * activated by the value of @token
5681: *
5682: * Returns the target state or NULL in case of error
5683: */
5684: xmlAutomataStatePtr
5685: xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5686: xmlAutomataStatePtr to, const xmlChar *token,
5687: void *data) {
5688: xmlRegAtomPtr atom;
5689:
5690: if ((am == NULL) || (from == NULL) || (token == NULL))
5691: return(NULL);
5692: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5693: if (atom == NULL)
5694: return(NULL);
5695: atom->data = data;
5696: if (atom == NULL)
5697: return(NULL);
5698: atom->valuep = xmlStrdup(token);
5699:
5700: if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5701: xmlRegFreeAtom(atom);
5702: return(NULL);
5703: }
5704: if (to == NULL)
5705: return(am->state);
5706: return(to);
5707: }
5708:
5709: /**
5710: * xmlAutomataNewTransition2:
5711: * @am: an automata
5712: * @from: the starting point of the transition
5713: * @to: the target point of the transition or NULL
5714: * @token: the first input string associated to that transition
5715: * @token2: the second input string associated to that transition
5716: * @data: data passed to the callback function if the transition is activated
5717: *
5718: * If @to is NULL, this creates first a new target state in the automata
5719: * and then adds a transition from the @from state to the target state
5720: * activated by the value of @token
5721: *
5722: * Returns the target state or NULL in case of error
5723: */
5724: xmlAutomataStatePtr
5725: xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5726: xmlAutomataStatePtr to, const xmlChar *token,
5727: const xmlChar *token2, void *data) {
5728: xmlRegAtomPtr atom;
5729:
5730: if ((am == NULL) || (from == NULL) || (token == NULL))
5731: return(NULL);
5732: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5733: if (atom == NULL)
5734: return(NULL);
5735: atom->data = data;
5736: if ((token2 == NULL) || (*token2 == 0)) {
5737: atom->valuep = xmlStrdup(token);
5738: } else {
5739: int lenn, lenp;
5740: xmlChar *str;
5741:
5742: lenn = strlen((char *) token2);
5743: lenp = strlen((char *) token);
5744:
5745: str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5746: if (str == NULL) {
5747: xmlRegFreeAtom(atom);
5748: return(NULL);
5749: }
5750: memcpy(&str[0], token, lenp);
5751: str[lenp] = '|';
5752: memcpy(&str[lenp + 1], token2, lenn);
5753: str[lenn + lenp + 1] = 0;
5754:
5755: atom->valuep = str;
5756: }
5757:
5758: if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5759: xmlRegFreeAtom(atom);
5760: return(NULL);
5761: }
5762: if (to == NULL)
5763: return(am->state);
5764: return(to);
5765: }
5766:
5767: /**
5768: * xmlAutomataNewNegTrans:
5769: * @am: an automata
5770: * @from: the starting point of the transition
5771: * @to: the target point of the transition or NULL
5772: * @token: the first input string associated to that transition
5773: * @token2: the second input string associated to that transition
5774: * @data: data passed to the callback function if the transition is activated
5775: *
5776: * If @to is NULL, this creates first a new target state in the automata
5777: * and then adds a transition from the @from state to the target state
5778: * activated by any value except (@token,@token2)
5779: * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5780: # the semantic of XSD ##other
5781: *
5782: * Returns the target state or NULL in case of error
5783: */
5784: xmlAutomataStatePtr
5785: xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5786: xmlAutomataStatePtr to, const xmlChar *token,
5787: const xmlChar *token2, void *data) {
5788: xmlRegAtomPtr atom;
5789: xmlChar err_msg[200];
5790:
5791: if ((am == NULL) || (from == NULL) || (token == NULL))
5792: return(NULL);
5793: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5794: if (atom == NULL)
5795: return(NULL);
5796: atom->data = data;
5797: atom->neg = 1;
5798: if ((token2 == NULL) || (*token2 == 0)) {
5799: atom->valuep = xmlStrdup(token);
5800: } else {
5801: int lenn, lenp;
5802: xmlChar *str;
5803:
5804: lenn = strlen((char *) token2);
5805: lenp = strlen((char *) token);
5806:
5807: str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5808: if (str == NULL) {
5809: xmlRegFreeAtom(atom);
5810: return(NULL);
5811: }
5812: memcpy(&str[0], token, lenp);
5813: str[lenp] = '|';
5814: memcpy(&str[lenp + 1], token2, lenn);
5815: str[lenn + lenp + 1] = 0;
5816:
5817: atom->valuep = str;
5818: }
5819: snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5820: err_msg[199] = 0;
5821: atom->valuep2 = xmlStrdup(err_msg);
5822:
5823: if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5824: xmlRegFreeAtom(atom);
5825: return(NULL);
5826: }
5827: am->negs++;
5828: if (to == NULL)
5829: return(am->state);
5830: return(to);
5831: }
5832:
5833: /**
5834: * xmlAutomataNewCountTrans2:
5835: * @am: an automata
5836: * @from: the starting point of the transition
5837: * @to: the target point of the transition or NULL
5838: * @token: the input string associated to that transition
5839: * @token2: the second input string associated to that transition
5840: * @min: the minimum successive occurences of token
5841: * @max: the maximum successive occurences of token
5842: * @data: data associated to the transition
5843: *
5844: * If @to is NULL, this creates first a new target state in the automata
5845: * and then adds a transition from the @from state to the target state
5846: * activated by a succession of input of value @token and @token2 and
5847: * whose number is between @min and @max
5848: *
5849: * Returns the target state or NULL in case of error
5850: */
5851: xmlAutomataStatePtr
5852: xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5853: xmlAutomataStatePtr to, const xmlChar *token,
5854: const xmlChar *token2,
5855: int min, int max, void *data) {
5856: xmlRegAtomPtr atom;
5857: int counter;
5858:
5859: if ((am == NULL) || (from == NULL) || (token == NULL))
5860: return(NULL);
5861: if (min < 0)
5862: return(NULL);
5863: if ((max < min) || (max < 1))
5864: return(NULL);
5865: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5866: if (atom == NULL)
5867: return(NULL);
5868: if ((token2 == NULL) || (*token2 == 0)) {
5869: atom->valuep = xmlStrdup(token);
5870: } else {
5871: int lenn, lenp;
5872: xmlChar *str;
5873:
5874: lenn = strlen((char *) token2);
5875: lenp = strlen((char *) token);
5876:
5877: str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5878: if (str == NULL) {
5879: xmlRegFreeAtom(atom);
5880: return(NULL);
5881: }
5882: memcpy(&str[0], token, lenp);
5883: str[lenp] = '|';
5884: memcpy(&str[lenp + 1], token2, lenn);
5885: str[lenn + lenp + 1] = 0;
5886:
5887: atom->valuep = str;
5888: }
5889: atom->data = data;
5890: if (min == 0)
5891: atom->min = 1;
5892: else
5893: atom->min = min;
5894: atom->max = max;
5895:
5896: /*
5897: * associate a counter to the transition.
5898: */
5899: counter = xmlRegGetCounter(am);
5900: am->counters[counter].min = min;
5901: am->counters[counter].max = max;
5902:
5903: /* xmlFAGenerateTransitions(am, from, to, atom); */
5904: if (to == NULL) {
5905: to = xmlRegNewState(am);
5906: xmlRegStatePush(am, to);
5907: }
5908: xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5909: xmlRegAtomPush(am, atom);
5910: am->state = to;
5911:
5912: if (to == NULL)
5913: to = am->state;
5914: if (to == NULL)
5915: return(NULL);
5916: if (min == 0)
5917: xmlFAGenerateEpsilonTransition(am, from, to);
5918: return(to);
5919: }
5920:
5921: /**
5922: * xmlAutomataNewCountTrans:
5923: * @am: an automata
5924: * @from: the starting point of the transition
5925: * @to: the target point of the transition or NULL
5926: * @token: the input string associated to that transition
5927: * @min: the minimum successive occurences of token
5928: * @max: the maximum successive occurences of token
5929: * @data: data associated to the transition
5930: *
5931: * If @to is NULL, this creates first a new target state in the automata
5932: * and then adds a transition from the @from state to the target state
5933: * activated by a succession of input of value @token and whose number
5934: * is between @min and @max
5935: *
5936: * Returns the target state or NULL in case of error
5937: */
5938: xmlAutomataStatePtr
5939: xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5940: xmlAutomataStatePtr to, const xmlChar *token,
5941: int min, int max, void *data) {
5942: xmlRegAtomPtr atom;
5943: int counter;
5944:
5945: if ((am == NULL) || (from == NULL) || (token == NULL))
5946: return(NULL);
5947: if (min < 0)
5948: return(NULL);
5949: if ((max < min) || (max < 1))
5950: return(NULL);
5951: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5952: if (atom == NULL)
5953: return(NULL);
5954: atom->valuep = xmlStrdup(token);
5955: atom->data = data;
5956: if (min == 0)
5957: atom->min = 1;
5958: else
5959: atom->min = min;
5960: atom->max = max;
5961:
5962: /*
5963: * associate a counter to the transition.
5964: */
5965: counter = xmlRegGetCounter(am);
5966: am->counters[counter].min = min;
5967: am->counters[counter].max = max;
5968:
5969: /* xmlFAGenerateTransitions(am, from, to, atom); */
5970: if (to == NULL) {
5971: to = xmlRegNewState(am);
5972: xmlRegStatePush(am, to);
5973: }
5974: xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5975: xmlRegAtomPush(am, atom);
5976: am->state = to;
5977:
5978: if (to == NULL)
5979: to = am->state;
5980: if (to == NULL)
5981: return(NULL);
5982: if (min == 0)
5983: xmlFAGenerateEpsilonTransition(am, from, to);
5984: return(to);
5985: }
5986:
5987: /**
5988: * xmlAutomataNewOnceTrans2:
5989: * @am: an automata
5990: * @from: the starting point of the transition
5991: * @to: the target point of the transition or NULL
5992: * @token: the input string associated to that transition
5993: * @token2: the second input string associated to that transition
5994: * @min: the minimum successive occurences of token
5995: * @max: the maximum successive occurences of token
5996: * @data: data associated to the transition
5997: *
5998: * If @to is NULL, this creates first a new target state in the automata
5999: * and then adds a transition from the @from state to the target state
6000: * activated by a succession of input of value @token and @token2 and whose
6001: * number is between @min and @max, moreover that transition can only be
6002: * crossed once.
6003: *
6004: * Returns the target state or NULL in case of error
6005: */
6006: xmlAutomataStatePtr
6007: xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6008: xmlAutomataStatePtr to, const xmlChar *token,
6009: const xmlChar *token2,
6010: int min, int max, void *data) {
6011: xmlRegAtomPtr atom;
6012: int counter;
6013:
6014: if ((am == NULL) || (from == NULL) || (token == NULL))
6015: return(NULL);
6016: if (min < 1)
6017: return(NULL);
6018: if ((max < min) || (max < 1))
6019: return(NULL);
6020: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6021: if (atom == NULL)
6022: return(NULL);
6023: if ((token2 == NULL) || (*token2 == 0)) {
6024: atom->valuep = xmlStrdup(token);
6025: } else {
6026: int lenn, lenp;
6027: xmlChar *str;
6028:
6029: lenn = strlen((char *) token2);
6030: lenp = strlen((char *) token);
6031:
6032: str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6033: if (str == NULL) {
6034: xmlRegFreeAtom(atom);
6035: return(NULL);
6036: }
6037: memcpy(&str[0], token, lenp);
6038: str[lenp] = '|';
6039: memcpy(&str[lenp + 1], token2, lenn);
6040: str[lenn + lenp + 1] = 0;
6041:
6042: atom->valuep = str;
6043: }
6044: atom->data = data;
6045: atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6046: atom->min = min;
6047: atom->max = max;
6048: /*
6049: * associate a counter to the transition.
6050: */
6051: counter = xmlRegGetCounter(am);
6052: am->counters[counter].min = 1;
6053: am->counters[counter].max = 1;
6054:
6055: /* xmlFAGenerateTransitions(am, from, to, atom); */
6056: if (to == NULL) {
6057: to = xmlRegNewState(am);
6058: xmlRegStatePush(am, to);
6059: }
6060: xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6061: xmlRegAtomPush(am, atom);
6062: am->state = to;
6063: return(to);
6064: }
6065:
6066:
6067:
6068: /**
6069: * xmlAutomataNewOnceTrans:
6070: * @am: an automata
6071: * @from: the starting point of the transition
6072: * @to: the target point of the transition or NULL
6073: * @token: the input string associated to that transition
6074: * @min: the minimum successive occurences of token
6075: * @max: the maximum successive occurences of token
6076: * @data: data associated to the transition
6077: *
6078: * If @to is NULL, this creates first a new target state in the automata
6079: * and then adds a transition from the @from state to the target state
6080: * activated by a succession of input of value @token and whose number
6081: * is between @min and @max, moreover that transition can only be crossed
6082: * once.
6083: *
6084: * Returns the target state or NULL in case of error
6085: */
6086: xmlAutomataStatePtr
6087: xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6088: xmlAutomataStatePtr to, const xmlChar *token,
6089: int min, int max, void *data) {
6090: xmlRegAtomPtr atom;
6091: int counter;
6092:
6093: if ((am == NULL) || (from == NULL) || (token == NULL))
6094: return(NULL);
6095: if (min < 1)
6096: return(NULL);
6097: if ((max < min) || (max < 1))
6098: return(NULL);
6099: atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6100: if (atom == NULL)
6101: return(NULL);
6102: atom->valuep = xmlStrdup(token);
6103: atom->data = data;
6104: atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6105: atom->min = min;
6106: atom->max = max;
6107: /*
6108: * associate a counter to the transition.
6109: */
6110: counter = xmlRegGetCounter(am);
6111: am->counters[counter].min = 1;
6112: am->counters[counter].max = 1;
6113:
6114: /* xmlFAGenerateTransitions(am, from, to, atom); */
6115: if (to == NULL) {
6116: to = xmlRegNewState(am);
6117: xmlRegStatePush(am, to);
6118: }
6119: xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6120: xmlRegAtomPush(am, atom);
6121: am->state = to;
6122: return(to);
6123: }
6124:
6125: /**
6126: * xmlAutomataNewState:
6127: * @am: an automata
6128: *
6129: * Create a new disconnected state in the automata
6130: *
6131: * Returns the new state or NULL in case of error
6132: */
6133: xmlAutomataStatePtr
6134: xmlAutomataNewState(xmlAutomataPtr am) {
6135: xmlAutomataStatePtr to;
6136:
6137: if (am == NULL)
6138: return(NULL);
6139: to = xmlRegNewState(am);
6140: xmlRegStatePush(am, to);
6141: return(to);
6142: }
6143:
6144: /**
6145: * xmlAutomataNewEpsilon:
6146: * @am: an automata
6147: * @from: the starting point of the transition
6148: * @to: the target point of the transition or NULL
6149: *
6150: * If @to is NULL, this creates first a new target state in the automata
6151: * and then adds an epsilon transition from the @from state to the
6152: * target state
6153: *
6154: * Returns the target state or NULL in case of error
6155: */
6156: xmlAutomataStatePtr
6157: xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6158: xmlAutomataStatePtr to) {
6159: if ((am == NULL) || (from == NULL))
6160: return(NULL);
6161: xmlFAGenerateEpsilonTransition(am, from, to);
6162: if (to == NULL)
6163: return(am->state);
6164: return(to);
6165: }
6166:
6167: /**
6168: * xmlAutomataNewAllTrans:
6169: * @am: an automata
6170: * @from: the starting point of the transition
6171: * @to: the target point of the transition or NULL
6172: * @lax: allow to transition if not all all transitions have been activated
6173: *
6174: * If @to is NULL, this creates first a new target state in the automata
6175: * and then adds a an ALL transition from the @from state to the
6176: * target state. That transition is an epsilon transition allowed only when
6177: * all transitions from the @from node have been activated.
6178: *
6179: * Returns the target state or NULL in case of error
6180: */
6181: xmlAutomataStatePtr
6182: xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6183: xmlAutomataStatePtr to, int lax) {
6184: if ((am == NULL) || (from == NULL))
6185: return(NULL);
6186: xmlFAGenerateAllTransition(am, from, to, lax);
6187: if (to == NULL)
6188: return(am->state);
6189: return(to);
6190: }
6191:
6192: /**
6193: * xmlAutomataNewCounter:
6194: * @am: an automata
6195: * @min: the minimal value on the counter
6196: * @max: the maximal value on the counter
6197: *
6198: * Create a new counter
6199: *
6200: * Returns the counter number or -1 in case of error
6201: */
6202: int
6203: xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6204: int ret;
6205:
6206: if (am == NULL)
6207: return(-1);
6208:
6209: ret = xmlRegGetCounter(am);
6210: if (ret < 0)
6211: return(-1);
6212: am->counters[ret].min = min;
6213: am->counters[ret].max = max;
6214: return(ret);
6215: }
6216:
6217: /**
6218: * xmlAutomataNewCountedTrans:
6219: * @am: an automata
6220: * @from: the starting point of the transition
6221: * @to: the target point of the transition or NULL
6222: * @counter: the counter associated to that transition
6223: *
6224: * If @to is NULL, this creates first a new target state in the automata
6225: * and then adds an epsilon transition from the @from state to the target state
6226: * which will increment the counter provided
6227: *
6228: * Returns the target state or NULL in case of error
6229: */
6230: xmlAutomataStatePtr
6231: xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6232: xmlAutomataStatePtr to, int counter) {
6233: if ((am == NULL) || (from == NULL) || (counter < 0))
6234: return(NULL);
6235: xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6236: if (to == NULL)
6237: return(am->state);
6238: return(to);
6239: }
6240:
6241: /**
6242: * xmlAutomataNewCounterTrans:
6243: * @am: an automata
6244: * @from: the starting point of the transition
6245: * @to: the target point of the transition or NULL
6246: * @counter: the counter associated to that transition
6247: *
6248: * If @to is NULL, this creates first a new target state in the automata
6249: * and then adds an epsilon transition from the @from state to the target state
6250: * which will be allowed only if the counter is within the right range.
6251: *
6252: * Returns the target state or NULL in case of error
6253: */
6254: xmlAutomataStatePtr
6255: xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6256: xmlAutomataStatePtr to, int counter) {
6257: if ((am == NULL) || (from == NULL) || (counter < 0))
6258: return(NULL);
6259: xmlFAGenerateCountedTransition(am, from, to, counter);
6260: if (to == NULL)
6261: return(am->state);
6262: return(to);
6263: }
6264:
6265: /**
6266: * xmlAutomataCompile:
6267: * @am: an automata
6268: *
6269: * Compile the automata into a Reg Exp ready for being executed.
6270: * The automata should be free after this point.
6271: *
6272: * Returns the compiled regexp or NULL in case of error
6273: */
6274: xmlRegexpPtr
6275: xmlAutomataCompile(xmlAutomataPtr am) {
6276: xmlRegexpPtr ret;
6277:
6278: if ((am == NULL) || (am->error != 0)) return(NULL);
6279: xmlFAEliminateEpsilonTransitions(am);
6280: /* xmlFAComputesDeterminism(am); */
6281: ret = xmlRegEpxFromParse(am);
6282:
6283: return(ret);
6284: }
6285:
6286: /**
6287: * xmlAutomataIsDeterminist:
6288: * @am: an automata
6289: *
6290: * Checks if an automata is determinist.
6291: *
6292: * Returns 1 if true, 0 if not, and -1 in case of error
6293: */
6294: int
6295: xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6296: int ret;
6297:
6298: if (am == NULL)
6299: return(-1);
6300:
6301: ret = xmlFAComputesDeterminism(am);
6302: return(ret);
6303: }
6304: #endif /* LIBXML_AUTOMATA_ENABLED */
6305:
6306: #ifdef LIBXML_EXPR_ENABLED
6307: /************************************************************************
6308: * *
6309: * Formal Expression handling code *
6310: * *
6311: ************************************************************************/
6312: /************************************************************************
6313: * *
6314: * Expression handling context *
6315: * *
6316: ************************************************************************/
6317:
6318: struct _xmlExpCtxt {
6319: xmlDictPtr dict;
6320: xmlExpNodePtr *table;
6321: int size;
6322: int nbElems;
6323: int nb_nodes;
6324: int maxNodes;
6325: const char *expr;
6326: const char *cur;
6327: int nb_cons;
6328: int tabSize;
6329: };
6330:
6331: /**
6332: * xmlExpNewCtxt:
6333: * @maxNodes: the maximum number of nodes
6334: * @dict: optional dictionnary to use internally
6335: *
6336: * Creates a new context for manipulating expressions
6337: *
6338: * Returns the context or NULL in case of error
6339: */
6340: xmlExpCtxtPtr
6341: xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6342: xmlExpCtxtPtr ret;
6343: int size = 256;
6344:
6345: if (maxNodes <= 4096)
6346: maxNodes = 4096;
6347:
6348: ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6349: if (ret == NULL)
6350: return(NULL);
6351: memset(ret, 0, sizeof(xmlExpCtxt));
6352: ret->size = size;
6353: ret->nbElems = 0;
6354: ret->maxNodes = maxNodes;
6355: ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6356: if (ret->table == NULL) {
6357: xmlFree(ret);
6358: return(NULL);
6359: }
6360: memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6361: if (dict == NULL) {
6362: ret->dict = xmlDictCreate();
6363: if (ret->dict == NULL) {
6364: xmlFree(ret->table);
6365: xmlFree(ret);
6366: return(NULL);
6367: }
6368: } else {
6369: ret->dict = dict;
6370: xmlDictReference(ret->dict);
6371: }
6372: return(ret);
6373: }
6374:
6375: /**
6376: * xmlExpFreeCtxt:
6377: * @ctxt: an expression context
6378: *
6379: * Free an expression context
6380: */
6381: void
6382: xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6383: if (ctxt == NULL)
6384: return;
6385: xmlDictFree(ctxt->dict);
6386: if (ctxt->table != NULL)
6387: xmlFree(ctxt->table);
6388: xmlFree(ctxt);
6389: }
6390:
6391: /************************************************************************
6392: * *
6393: * Structure associated to an expression node *
6394: * *
6395: ************************************************************************/
6396: #define MAX_NODES 10000
6397:
6398: /* #define DEBUG_DERIV */
6399:
6400: /*
6401: * TODO:
6402: * - Wildcards
6403: * - public API for creation
6404: *
6405: * Started
6406: * - regression testing
6407: *
6408: * Done
6409: * - split into module and test tool
6410: * - memleaks
6411: */
6412:
6413: typedef enum {
6414: XML_EXP_NILABLE = (1 << 0)
6415: } xmlExpNodeInfo;
6416:
6417: #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6418:
6419: struct _xmlExpNode {
6420: unsigned char type;/* xmlExpNodeType */
6421: unsigned char info;/* OR of xmlExpNodeInfo */
6422: unsigned short key; /* the hash key */
6423: unsigned int ref; /* The number of references */
6424: int c_max; /* the maximum length it can consume */
6425: xmlExpNodePtr exp_left;
6426: xmlExpNodePtr next;/* the next node in the hash table or free list */
6427: union {
6428: struct {
6429: int f_min;
6430: int f_max;
6431: } count;
6432: struct {
6433: xmlExpNodePtr f_right;
6434: } children;
6435: const xmlChar *f_str;
6436: } field;
6437: };
6438:
6439: #define exp_min field.count.f_min
6440: #define exp_max field.count.f_max
6441: /* #define exp_left field.children.f_left */
6442: #define exp_right field.children.f_right
6443: #define exp_str field.f_str
6444:
6445: static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6446: static xmlExpNode forbiddenExpNode = {
6447: XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6448: };
6449: xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6450: static xmlExpNode emptyExpNode = {
6451: XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6452: };
6453: xmlExpNodePtr emptyExp = &emptyExpNode;
6454:
6455: /************************************************************************
6456: * *
6457: * The custom hash table for unicity and canonicalization *
6458: * of sub-expressions pointers *
6459: * *
6460: ************************************************************************/
6461: /*
6462: * xmlExpHashNameComputeKey:
6463: * Calculate the hash key for a token
6464: */
6465: static unsigned short
6466: xmlExpHashNameComputeKey(const xmlChar *name) {
6467: unsigned short value = 0L;
6468: char ch;
6469:
6470: if (name != NULL) {
6471: value += 30 * (*name);
6472: while ((ch = *name++) != 0) {
6473: value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6474: }
6475: }
6476: return (value);
6477: }
6478:
6479: /*
6480: * xmlExpHashComputeKey:
6481: * Calculate the hash key for a compound expression
6482: */
6483: static unsigned short
6484: xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6485: xmlExpNodePtr right) {
6486: unsigned long value;
6487: unsigned short ret;
6488:
6489: switch (type) {
6490: case XML_EXP_SEQ:
6491: value = left->key;
6492: value += right->key;
6493: value *= 3;
6494: ret = (unsigned short) value;
6495: break;
6496: case XML_EXP_OR:
6497: value = left->key;
6498: value += right->key;
6499: value *= 7;
6500: ret = (unsigned short) value;
6501: break;
6502: case XML_EXP_COUNT:
6503: value = left->key;
6504: value += right->key;
6505: ret = (unsigned short) value;
6506: break;
6507: default:
6508: ret = 0;
6509: }
6510: return(ret);
6511: }
6512:
6513:
6514: static xmlExpNodePtr
6515: xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6516: xmlExpNodePtr ret;
6517:
6518: if (ctxt->nb_nodes >= MAX_NODES)
6519: return(NULL);
6520: ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6521: if (ret == NULL)
6522: return(NULL);
6523: memset(ret, 0, sizeof(xmlExpNode));
6524: ret->type = type;
6525: ret->next = NULL;
6526: ctxt->nb_nodes++;
6527: ctxt->nb_cons++;
6528: return(ret);
6529: }
6530:
6531: /**
6532: * xmlExpHashGetEntry:
6533: * @table: the hash table
6534: *
6535: * Get the unique entry from the hash table. The entry is created if
6536: * needed. @left and @right are consumed, i.e. their ref count will
6537: * be decremented by the operation.
6538: *
6539: * Returns the pointer or NULL in case of error
6540: */
6541: static xmlExpNodePtr
6542: xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6543: xmlExpNodePtr left, xmlExpNodePtr right,
6544: const xmlChar *name, int min, int max) {
6545: unsigned short kbase, key;
6546: xmlExpNodePtr entry;
6547: xmlExpNodePtr insert;
6548:
6549: if (ctxt == NULL)
6550: return(NULL);
6551:
6552: /*
6553: * Check for duplicate and insertion location.
6554: */
6555: if (type == XML_EXP_ATOM) {
6556: kbase = xmlExpHashNameComputeKey(name);
6557: } else if (type == XML_EXP_COUNT) {
6558: /* COUNT reduction rule 1 */
6559: /* a{1} -> a */
6560: if (min == max) {
6561: if (min == 1) {
6562: return(left);
6563: }
6564: if (min == 0) {
6565: xmlExpFree(ctxt, left);
6566: return(emptyExp);
6567: }
6568: }
6569: if (min < 0) {
6570: xmlExpFree(ctxt, left);
6571: return(forbiddenExp);
6572: }
6573: if (max == -1)
6574: kbase = min + 79;
6575: else
6576: kbase = max - min;
6577: kbase += left->key;
6578: } else if (type == XML_EXP_OR) {
6579: /* Forbid reduction rules */
6580: if (left->type == XML_EXP_FORBID) {
6581: xmlExpFree(ctxt, left);
6582: return(right);
6583: }
6584: if (right->type == XML_EXP_FORBID) {
6585: xmlExpFree(ctxt, right);
6586: return(left);
6587: }
6588:
6589: /* OR reduction rule 1 */
6590: /* a | a reduced to a */
6591: if (left == right) {
6592: left->ref--;
6593: return(left);
6594: }
6595: /* OR canonicalization rule 1 */
6596: /* linearize (a | b) | c into a | (b | c) */
6597: if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6598: xmlExpNodePtr tmp = left;
6599: left = right;
6600: right = tmp;
6601: }
6602: /* OR reduction rule 2 */
6603: /* a | (a | b) and b | (a | b) are reduced to a | b */
6604: if (right->type == XML_EXP_OR) {
6605: if ((left == right->exp_left) ||
6606: (left == right->exp_right)) {
6607: xmlExpFree(ctxt, left);
6608: return(right);
6609: }
6610: }
6611: /* OR canonicalization rule 2 */
6612: /* linearize (a | b) | c into a | (b | c) */
6613: if (left->type == XML_EXP_OR) {
6614: xmlExpNodePtr tmp;
6615:
6616: /* OR canonicalization rule 2 */
6617: if ((left->exp_right->type != XML_EXP_OR) &&
6618: (left->exp_right->key < left->exp_left->key)) {
6619: tmp = left->exp_right;
6620: left->exp_right = left->exp_left;
6621: left->exp_left = tmp;
6622: }
6623: left->exp_right->ref++;
6624: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6625: NULL, 0, 0);
6626: left->exp_left->ref++;
6627: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6628: NULL, 0, 0);
6629:
6630: xmlExpFree(ctxt, left);
6631: return(tmp);
6632: }
6633: if (right->type == XML_EXP_OR) {
6634: /* Ordering in the tree */
6635: /* C | (A | B) -> A | (B | C) */
6636: if (left->key > right->exp_right->key) {
6637: xmlExpNodePtr tmp;
6638: right->exp_right->ref++;
6639: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6640: left, NULL, 0, 0);
6641: right->exp_left->ref++;
6642: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6643: tmp, NULL, 0, 0);
6644: xmlExpFree(ctxt, right);
6645: return(tmp);
6646: }
6647: /* Ordering in the tree */
6648: /* B | (A | C) -> A | (B | C) */
6649: if (left->key > right->exp_left->key) {
6650: xmlExpNodePtr tmp;
6651: right->exp_right->ref++;
6652: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6653: right->exp_right, NULL, 0, 0);
6654: right->exp_left->ref++;
6655: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6656: tmp, NULL, 0, 0);
6657: xmlExpFree(ctxt, right);
6658: return(tmp);
6659: }
6660: }
6661: /* we know both types are != XML_EXP_OR here */
6662: else if (left->key > right->key) {
6663: xmlExpNodePtr tmp = left;
6664: left = right;
6665: right = tmp;
6666: }
6667: kbase = xmlExpHashComputeKey(type, left, right);
6668: } else if (type == XML_EXP_SEQ) {
6669: /* Forbid reduction rules */
6670: if (left->type == XML_EXP_FORBID) {
6671: xmlExpFree(ctxt, right);
6672: return(left);
6673: }
6674: if (right->type == XML_EXP_FORBID) {
6675: xmlExpFree(ctxt, left);
6676: return(right);
6677: }
6678: /* Empty reduction rules */
6679: if (right->type == XML_EXP_EMPTY) {
6680: return(left);
6681: }
6682: if (left->type == XML_EXP_EMPTY) {
6683: return(right);
6684: }
6685: kbase = xmlExpHashComputeKey(type, left, right);
6686: } else
6687: return(NULL);
6688:
6689: key = kbase % ctxt->size;
6690: if (ctxt->table[key] != NULL) {
6691: for (insert = ctxt->table[key]; insert != NULL;
6692: insert = insert->next) {
6693: if ((insert->key == kbase) &&
6694: (insert->type == type)) {
6695: if (type == XML_EXP_ATOM) {
6696: if (name == insert->exp_str) {
6697: insert->ref++;
6698: return(insert);
6699: }
6700: } else if (type == XML_EXP_COUNT) {
6701: if ((insert->exp_min == min) && (insert->exp_max == max) &&
6702: (insert->exp_left == left)) {
6703: insert->ref++;
6704: left->ref--;
6705: return(insert);
6706: }
6707: } else if ((insert->exp_left == left) &&
6708: (insert->exp_right == right)) {
6709: insert->ref++;
6710: left->ref--;
6711: right->ref--;
6712: return(insert);
6713: }
6714: }
6715: }
6716: }
6717:
6718: entry = xmlExpNewNode(ctxt, type);
6719: if (entry == NULL)
6720: return(NULL);
6721: entry->key = kbase;
6722: if (type == XML_EXP_ATOM) {
6723: entry->exp_str = name;
6724: entry->c_max = 1;
6725: } else if (type == XML_EXP_COUNT) {
6726: entry->exp_min = min;
6727: entry->exp_max = max;
6728: entry->exp_left = left;
6729: if ((min == 0) || (IS_NILLABLE(left)))
6730: entry->info |= XML_EXP_NILABLE;
6731: if (max < 0)
6732: entry->c_max = -1;
6733: else
6734: entry->c_max = max * entry->exp_left->c_max;
6735: } else {
6736: entry->exp_left = left;
6737: entry->exp_right = right;
6738: if (type == XML_EXP_OR) {
6739: if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6740: entry->info |= XML_EXP_NILABLE;
6741: if ((entry->exp_left->c_max == -1) ||
6742: (entry->exp_right->c_max == -1))
6743: entry->c_max = -1;
6744: else if (entry->exp_left->c_max > entry->exp_right->c_max)
6745: entry->c_max = entry->exp_left->c_max;
6746: else
6747: entry->c_max = entry->exp_right->c_max;
6748: } else {
6749: if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6750: entry->info |= XML_EXP_NILABLE;
6751: if ((entry->exp_left->c_max == -1) ||
6752: (entry->exp_right->c_max == -1))
6753: entry->c_max = -1;
6754: else
6755: entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6756: }
6757: }
6758: entry->ref = 1;
6759: if (ctxt->table[key] != NULL)
6760: entry->next = ctxt->table[key];
6761:
6762: ctxt->table[key] = entry;
6763: ctxt->nbElems++;
6764:
6765: return(entry);
6766: }
6767:
6768: /**
6769: * xmlExpFree:
6770: * @ctxt: the expression context
6771: * @exp: the expression
6772: *
6773: * Dereference the expression
6774: */
6775: void
6776: xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6777: if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6778: return;
6779: exp->ref--;
6780: if (exp->ref == 0) {
6781: unsigned short key;
6782:
6783: /* Unlink it first from the hash table */
6784: key = exp->key % ctxt->size;
6785: if (ctxt->table[key] == exp) {
6786: ctxt->table[key] = exp->next;
6787: } else {
6788: xmlExpNodePtr tmp;
6789:
6790: tmp = ctxt->table[key];
6791: while (tmp != NULL) {
6792: if (tmp->next == exp) {
6793: tmp->next = exp->next;
6794: break;
6795: }
6796: tmp = tmp->next;
6797: }
6798: }
6799:
6800: if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6801: xmlExpFree(ctxt, exp->exp_left);
6802: xmlExpFree(ctxt, exp->exp_right);
6803: } else if (exp->type == XML_EXP_COUNT) {
6804: xmlExpFree(ctxt, exp->exp_left);
6805: }
6806: xmlFree(exp);
6807: ctxt->nb_nodes--;
6808: }
6809: }
6810:
6811: /**
6812: * xmlExpRef:
6813: * @exp: the expression
6814: *
6815: * Increase the reference count of the expression
6816: */
6817: void
6818: xmlExpRef(xmlExpNodePtr exp) {
6819: if (exp != NULL)
6820: exp->ref++;
6821: }
6822:
6823: /**
6824: * xmlExpNewAtom:
6825: * @ctxt: the expression context
6826: * @name: the atom name
6827: * @len: the atom name lenght in byte (or -1);
6828: *
6829: * Get the atom associated to this name from that context
6830: *
6831: * Returns the node or NULL in case of error
6832: */
6833: xmlExpNodePtr
6834: xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6835: if ((ctxt == NULL) || (name == NULL))
6836: return(NULL);
6837: name = xmlDictLookup(ctxt->dict, name, len);
6838: if (name == NULL)
6839: return(NULL);
6840: return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6841: }
6842:
6843: /**
6844: * xmlExpNewOr:
6845: * @ctxt: the expression context
6846: * @left: left expression
6847: * @right: right expression
6848: *
6849: * Get the atom associated to the choice @left | @right
6850: * Note that @left and @right are consumed in the operation, to keep
6851: * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6852: * this is true even in case of failure (unless ctxt == NULL).
6853: *
6854: * Returns the node or NULL in case of error
6855: */
6856: xmlExpNodePtr
6857: xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6858: if (ctxt == NULL)
6859: return(NULL);
6860: if ((left == NULL) || (right == NULL)) {
6861: xmlExpFree(ctxt, left);
6862: xmlExpFree(ctxt, right);
6863: return(NULL);
6864: }
6865: return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6866: }
6867:
6868: /**
6869: * xmlExpNewSeq:
6870: * @ctxt: the expression context
6871: * @left: left expression
6872: * @right: right expression
6873: *
6874: * Get the atom associated to the sequence @left , @right
6875: * Note that @left and @right are consumed in the operation, to keep
6876: * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6877: * this is true even in case of failure (unless ctxt == NULL).
6878: *
6879: * Returns the node or NULL in case of error
6880: */
6881: xmlExpNodePtr
6882: xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6883: if (ctxt == NULL)
6884: return(NULL);
6885: if ((left == NULL) || (right == NULL)) {
6886: xmlExpFree(ctxt, left);
6887: xmlExpFree(ctxt, right);
6888: return(NULL);
6889: }
6890: return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6891: }
6892:
6893: /**
6894: * xmlExpNewRange:
6895: * @ctxt: the expression context
6896: * @subset: the expression to be repeated
6897: * @min: the lower bound for the repetition
6898: * @max: the upper bound for the repetition, -1 means infinite
6899: *
6900: * Get the atom associated to the range (@subset){@min, @max}
6901: * Note that @subset is consumed in the operation, to keep
6902: * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6903: * this is true even in case of failure (unless ctxt == NULL).
6904: *
6905: * Returns the node or NULL in case of error
6906: */
6907: xmlExpNodePtr
6908: xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
6909: if (ctxt == NULL)
6910: return(NULL);
6911: if ((subset == NULL) || (min < 0) || (max < -1) ||
6912: ((max >= 0) && (min > max))) {
6913: xmlExpFree(ctxt, subset);
6914: return(NULL);
6915: }
6916: return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6917: NULL, NULL, min, max));
6918: }
6919:
6920: /************************************************************************
6921: * *
6922: * Public API for operations on expressions *
6923: * *
6924: ************************************************************************/
6925:
6926: static int
6927: xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6928: const xmlChar**list, int len, int nb) {
6929: int tmp, tmp2;
6930: tail:
6931: switch (exp->type) {
6932: case XML_EXP_EMPTY:
6933: return(0);
6934: case XML_EXP_ATOM:
6935: for (tmp = 0;tmp < nb;tmp++)
6936: if (list[tmp] == exp->exp_str)
6937: return(0);
6938: if (nb >= len)
6939: return(-2);
6940: list[nb] = exp->exp_str;
6941: return(1);
6942: case XML_EXP_COUNT:
6943: exp = exp->exp_left;
6944: goto tail;
6945: case XML_EXP_SEQ:
6946: case XML_EXP_OR:
6947: tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
6948: if (tmp < 0)
6949: return(tmp);
6950: tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
6951: nb + tmp);
6952: if (tmp2 < 0)
6953: return(tmp2);
6954: return(tmp + tmp2);
6955: }
6956: return(-1);
6957: }
6958:
6959: /**
6960: * xmlExpGetLanguage:
6961: * @ctxt: the expression context
6962: * @exp: the expression
6963: * @langList: where to store the tokens
6964: * @len: the allocated lenght of @list
6965: *
6966: * Find all the strings used in @exp and store them in @list
6967: *
6968: * Returns the number of unique strings found, -1 in case of errors and
6969: * -2 if there is more than @len strings
6970: */
6971: int
6972: xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6973: const xmlChar**langList, int len) {
6974: if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
6975: return(-1);
6976: return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
6977: }
6978:
6979: static int
6980: xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6981: const xmlChar**list, int len, int nb) {
6982: int tmp, tmp2;
6983: tail:
6984: switch (exp->type) {
6985: case XML_EXP_FORBID:
6986: return(0);
6987: case XML_EXP_EMPTY:
6988: return(0);
6989: case XML_EXP_ATOM:
6990: for (tmp = 0;tmp < nb;tmp++)
6991: if (list[tmp] == exp->exp_str)
6992: return(0);
6993: if (nb >= len)
6994: return(-2);
6995: list[nb] = exp->exp_str;
6996: return(1);
6997: case XML_EXP_COUNT:
6998: exp = exp->exp_left;
6999: goto tail;
7000: case XML_EXP_SEQ:
7001: tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7002: if (tmp < 0)
7003: return(tmp);
7004: if (IS_NILLABLE(exp->exp_left)) {
7005: tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7006: nb + tmp);
7007: if (tmp2 < 0)
7008: return(tmp2);
7009: tmp += tmp2;
7010: }
7011: return(tmp);
7012: case XML_EXP_OR:
7013: tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7014: if (tmp < 0)
7015: return(tmp);
7016: tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7017: nb + tmp);
7018: if (tmp2 < 0)
7019: return(tmp2);
7020: return(tmp + tmp2);
7021: }
7022: return(-1);
7023: }
7024:
7025: /**
7026: * xmlExpGetStart:
7027: * @ctxt: the expression context
7028: * @exp: the expression
7029: * @tokList: where to store the tokens
7030: * @len: the allocated lenght of @list
7031: *
7032: * Find all the strings that appears at the start of the languages
7033: * accepted by @exp and store them in @list. E.g. for (a, b) | c
7034: * it will return the list [a, c]
7035: *
7036: * Returns the number of unique strings found, -1 in case of errors and
7037: * -2 if there is more than @len strings
7038: */
7039: int
7040: xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7041: const xmlChar**tokList, int len) {
7042: if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7043: return(-1);
7044: return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7045: }
7046:
7047: /**
7048: * xmlExpIsNillable:
7049: * @exp: the expression
7050: *
7051: * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce
7052: *
7053: * Returns 1 if nillable, 0 if not and -1 in case of error
7054: */
7055: int
7056: xmlExpIsNillable(xmlExpNodePtr exp) {
7057: if (exp == NULL)
7058: return(-1);
7059: return(IS_NILLABLE(exp) != 0);
7060: }
7061:
7062: static xmlExpNodePtr
7063: xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7064: {
7065: xmlExpNodePtr ret;
7066:
7067: switch (exp->type) {
7068: case XML_EXP_EMPTY:
7069: return(forbiddenExp);
7070: case XML_EXP_FORBID:
7071: return(forbiddenExp);
7072: case XML_EXP_ATOM:
7073: if (exp->exp_str == str) {
7074: #ifdef DEBUG_DERIV
7075: printf("deriv atom: equal => Empty\n");
7076: #endif
7077: ret = emptyExp;
7078: } else {
7079: #ifdef DEBUG_DERIV
7080: printf("deriv atom: mismatch => forbid\n");
7081: #endif
7082: /* TODO wildcards here */
7083: ret = forbiddenExp;
7084: }
7085: return(ret);
7086: case XML_EXP_OR: {
7087: xmlExpNodePtr tmp;
7088:
7089: #ifdef DEBUG_DERIV
7090: printf("deriv or: => or(derivs)\n");
7091: #endif
7092: tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7093: if (tmp == NULL) {
7094: return(NULL);
7095: }
7096: ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7097: if (ret == NULL) {
7098: xmlExpFree(ctxt, tmp);
7099: return(NULL);
7100: }
7101: ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7102: NULL, 0, 0);
7103: return(ret);
7104: }
7105: case XML_EXP_SEQ:
7106: #ifdef DEBUG_DERIV
7107: printf("deriv seq: starting with left\n");
7108: #endif
7109: ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7110: if (ret == NULL) {
7111: return(NULL);
7112: } else if (ret == forbiddenExp) {
7113: if (IS_NILLABLE(exp->exp_left)) {
7114: #ifdef DEBUG_DERIV
7115: printf("deriv seq: left failed but nillable\n");
7116: #endif
7117: ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7118: }
7119: } else {
7120: #ifdef DEBUG_DERIV
7121: printf("deriv seq: left match => sequence\n");
7122: #endif
7123: exp->exp_right->ref++;
7124: ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7125: NULL, 0, 0);
7126: }
7127: return(ret);
7128: case XML_EXP_COUNT: {
7129: int min, max;
7130: xmlExpNodePtr tmp;
7131:
7132: if (exp->exp_max == 0)
7133: return(forbiddenExp);
7134: ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7135: if (ret == NULL)
7136: return(NULL);
7137: if (ret == forbiddenExp) {
7138: #ifdef DEBUG_DERIV
7139: printf("deriv count: pattern mismatch => forbid\n");
7140: #endif
7141: return(ret);
7142: }
7143: if (exp->exp_max == 1)
7144: return(ret);
7145: if (exp->exp_max < 0) /* unbounded */
7146: max = -1;
7147: else
7148: max = exp->exp_max - 1;
7149: if (exp->exp_min > 0)
7150: min = exp->exp_min - 1;
7151: else
7152: min = 0;
7153: exp->exp_left->ref++;
7154: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7155: NULL, min, max);
7156: if (ret == emptyExp) {
7157: #ifdef DEBUG_DERIV
7158: printf("deriv count: match to empty => new count\n");
7159: #endif
7160: return(tmp);
7161: }
7162: #ifdef DEBUG_DERIV
7163: printf("deriv count: match => sequence with new count\n");
7164: #endif
7165: return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7166: NULL, 0, 0));
7167: }
7168: }
7169: return(NULL);
7170: }
7171:
7172: /**
7173: * xmlExpStringDerive:
7174: * @ctxt: the expression context
7175: * @exp: the expression
7176: * @str: the string
7177: * @len: the string len in bytes if available
7178: *
7179: * Do one step of Brzozowski derivation of the expression @exp with
7180: * respect to the input string
7181: *
7182: * Returns the resulting expression or NULL in case of internal error
7183: */
7184: xmlExpNodePtr
7185: xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7186: const xmlChar *str, int len) {
7187: const xmlChar *input;
7188:
7189: if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7190: return(NULL);
7191: }
7192: /*
7193: * check the string is in the dictionnary, if yes use an interned
7194: * copy, otherwise we know it's not an acceptable input
7195: */
7196: input = xmlDictExists(ctxt->dict, str, len);
7197: if (input == NULL) {
7198: return(forbiddenExp);
7199: }
7200: return(xmlExpStringDeriveInt(ctxt, exp, input));
7201: }
7202:
7203: static int
7204: xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7205: int ret = 1;
7206:
7207: if (sub->c_max == -1) {
7208: if (exp->c_max != -1)
7209: ret = 0;
7210: } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7211: ret = 0;
7212: }
7213: #if 0
7214: if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7215: ret = 0;
7216: #endif
7217: return(ret);
7218: }
7219:
7220: static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7221: xmlExpNodePtr sub);
7222: /**
7223: * xmlExpDivide:
7224: * @ctxt: the expressions context
7225: * @exp: the englobing expression
7226: * @sub: the subexpression
7227: * @mult: the multiple expression
7228: * @remain: the remain from the derivation of the multiple
7229: *
7230: * Check if exp is a multiple of sub, i.e. if there is a finite number n
7231: * so that sub{n} subsume exp
7232: *
7233: * Returns the multiple value if successful, 0 if it is not a multiple
7234: * and -1 in case of internel error.
7235: */
7236:
7237: static int
7238: xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7239: xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7240: int i;
7241: xmlExpNodePtr tmp, tmp2;
7242:
7243: if (mult != NULL) *mult = NULL;
7244: if (remain != NULL) *remain = NULL;
7245: if (exp->c_max == -1) return(0);
7246: if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7247:
7248: for (i = 1;i <= exp->c_max;i++) {
7249: sub->ref++;
7250: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7251: sub, NULL, NULL, i, i);
7252: if (tmp == NULL) {
7253: return(-1);
7254: }
7255: if (!xmlExpCheckCard(tmp, exp)) {
7256: xmlExpFree(ctxt, tmp);
7257: continue;
7258: }
7259: tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7260: if (tmp2 == NULL) {
7261: xmlExpFree(ctxt, tmp);
7262: return(-1);
7263: }
7264: if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7265: if (remain != NULL)
7266: *remain = tmp2;
7267: else
7268: xmlExpFree(ctxt, tmp2);
7269: if (mult != NULL)
7270: *mult = tmp;
7271: else
7272: xmlExpFree(ctxt, tmp);
7273: #ifdef DEBUG_DERIV
7274: printf("Divide succeeded %d\n", i);
7275: #endif
7276: return(i);
7277: }
7278: xmlExpFree(ctxt, tmp);
7279: xmlExpFree(ctxt, tmp2);
7280: }
7281: #ifdef DEBUG_DERIV
7282: printf("Divide failed\n");
7283: #endif
7284: return(0);
7285: }
7286:
7287: /**
7288: * xmlExpExpDeriveInt:
7289: * @ctxt: the expressions context
7290: * @exp: the englobing expression
7291: * @sub: the subexpression
7292: *
7293: * Try to do a step of Brzozowski derivation but at a higher level
7294: * the input being a subexpression.
7295: *
7296: * Returns the resulting expression or NULL in case of internal error
7297: */
7298: static xmlExpNodePtr
7299: xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7300: xmlExpNodePtr ret, tmp, tmp2, tmp3;
7301: const xmlChar **tab;
7302: int len, i;
7303:
7304: /*
7305: * In case of equality and if the expression can only consume a finite
7306: * amount, then the derivation is empty
7307: */
7308: if ((exp == sub) && (exp->c_max >= 0)) {
7309: #ifdef DEBUG_DERIV
7310: printf("Equal(exp, sub) and finite -> Empty\n");
7311: #endif
7312: return(emptyExp);
7313: }
7314: /*
7315: * decompose sub sequence first
7316: */
7317: if (sub->type == XML_EXP_EMPTY) {
7318: #ifdef DEBUG_DERIV
7319: printf("Empty(sub) -> Empty\n");
7320: #endif
7321: exp->ref++;
7322: return(exp);
7323: }
7324: if (sub->type == XML_EXP_SEQ) {
7325: #ifdef DEBUG_DERIV
7326: printf("Seq(sub) -> decompose\n");
7327: #endif
7328: tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7329: if (tmp == NULL)
7330: return(NULL);
7331: if (tmp == forbiddenExp)
7332: return(tmp);
7333: ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7334: xmlExpFree(ctxt, tmp);
7335: return(ret);
7336: }
7337: if (sub->type == XML_EXP_OR) {
7338: #ifdef DEBUG_DERIV
7339: printf("Or(sub) -> decompose\n");
7340: #endif
7341: tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7342: if (tmp == forbiddenExp)
7343: return(tmp);
7344: if (tmp == NULL)
7345: return(NULL);
7346: ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7347: if ((ret == NULL) || (ret == forbiddenExp)) {
7348: xmlExpFree(ctxt, tmp);
7349: return(ret);
7350: }
7351: return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7352: }
7353: if (!xmlExpCheckCard(exp, sub)) {
7354: #ifdef DEBUG_DERIV
7355: printf("CheckCard(exp, sub) failed -> Forbid\n");
7356: #endif
7357: return(forbiddenExp);
7358: }
7359: switch (exp->type) {
7360: case XML_EXP_EMPTY:
7361: if (sub == emptyExp)
7362: return(emptyExp);
7363: #ifdef DEBUG_DERIV
7364: printf("Empty(exp) -> Forbid\n");
7365: #endif
7366: return(forbiddenExp);
7367: case XML_EXP_FORBID:
7368: #ifdef DEBUG_DERIV
7369: printf("Forbid(exp) -> Forbid\n");
7370: #endif
7371: return(forbiddenExp);
7372: case XML_EXP_ATOM:
7373: if (sub->type == XML_EXP_ATOM) {
7374: /* TODO: handle wildcards */
7375: if (exp->exp_str == sub->exp_str) {
7376: #ifdef DEBUG_DERIV
7377: printf("Atom match -> Empty\n");
7378: #endif
7379: return(emptyExp);
7380: }
7381: #ifdef DEBUG_DERIV
7382: printf("Atom mismatch -> Forbid\n");
7383: #endif
7384: return(forbiddenExp);
7385: }
7386: if ((sub->type == XML_EXP_COUNT) &&
7387: (sub->exp_max == 1) &&
7388: (sub->exp_left->type == XML_EXP_ATOM)) {
7389: /* TODO: handle wildcards */
7390: if (exp->exp_str == sub->exp_left->exp_str) {
7391: #ifdef DEBUG_DERIV
7392: printf("Atom match -> Empty\n");
7393: #endif
7394: return(emptyExp);
7395: }
7396: #ifdef DEBUG_DERIV
7397: printf("Atom mismatch -> Forbid\n");
7398: #endif
7399: return(forbiddenExp);
7400: }
7401: #ifdef DEBUG_DERIV
7402: printf("Compex exp vs Atom -> Forbid\n");
7403: #endif
7404: return(forbiddenExp);
7405: case XML_EXP_SEQ:
7406: /* try to get the sequence consumed only if possible */
7407: if (xmlExpCheckCard(exp->exp_left, sub)) {
7408: /* See if the sequence can be consumed directly */
7409: #ifdef DEBUG_DERIV
7410: printf("Seq trying left only\n");
7411: #endif
7412: ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7413: if ((ret != forbiddenExp) && (ret != NULL)) {
7414: #ifdef DEBUG_DERIV
7415: printf("Seq trying left only worked\n");
7416: #endif
7417: /*
7418: * TODO: assumption here that we are determinist
7419: * i.e. we won't get to a nillable exp left
7420: * subset which could be matched by the right
7421: * part too.
7422: * e.g.: (a | b)+,(a | c) and 'a+,a'
7423: */
7424: exp->exp_right->ref++;
7425: return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7426: exp->exp_right, NULL, 0, 0));
7427: }
7428: #ifdef DEBUG_DERIV
7429: } else {
7430: printf("Seq: left too short\n");
7431: #endif
7432: }
7433: /* Try instead to decompose */
7434: if (sub->type == XML_EXP_COUNT) {
7435: int min, max;
7436:
7437: #ifdef DEBUG_DERIV
7438: printf("Seq: sub is a count\n");
7439: #endif
7440: ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7441: if (ret == NULL)
7442: return(NULL);
7443: if (ret != forbiddenExp) {
7444: #ifdef DEBUG_DERIV
7445: printf("Seq , Count match on left\n");
7446: #endif
7447: if (sub->exp_max < 0)
7448: max = -1;
7449: else
7450: max = sub->exp_max -1;
7451: if (sub->exp_min > 0)
7452: min = sub->exp_min -1;
7453: else
7454: min = 0;
7455: exp->exp_right->ref++;
7456: tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7457: exp->exp_right, NULL, 0, 0);
7458: if (tmp == NULL)
7459: return(NULL);
7460:
7461: sub->exp_left->ref++;
7462: tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7463: sub->exp_left, NULL, NULL, min, max);
7464: if (tmp2 == NULL) {
7465: xmlExpFree(ctxt, tmp);
7466: return(NULL);
7467: }
7468: ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7469: xmlExpFree(ctxt, tmp);
7470: xmlExpFree(ctxt, tmp2);
7471: return(ret);
7472: }
7473: }
7474: /* we made no progress on structured operations */
7475: break;
7476: case XML_EXP_OR:
7477: #ifdef DEBUG_DERIV
7478: printf("Or , trying both side\n");
7479: #endif
7480: ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7481: if (ret == NULL)
7482: return(NULL);
7483: tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7484: if (tmp == NULL) {
7485: xmlExpFree(ctxt, ret);
7486: return(NULL);
7487: }
7488: return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7489: case XML_EXP_COUNT: {
7490: int min, max;
7491:
7492: if (sub->type == XML_EXP_COUNT) {
7493: /*
7494: * Try to see if the loop is completely subsumed
7495: */
7496: tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7497: if (tmp == NULL)
7498: return(NULL);
7499: if (tmp == forbiddenExp) {
7500: int mult;
7501:
7502: #ifdef DEBUG_DERIV
7503: printf("Count, Count inner don't subsume\n");
7504: #endif
7505: mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7506: NULL, &tmp);
7507: if (mult <= 0) {
7508: #ifdef DEBUG_DERIV
7509: printf("Count, Count not multiple => forbidden\n");
7510: #endif
7511: return(forbiddenExp);
7512: }
7513: if (sub->exp_max == -1) {
7514: max = -1;
7515: if (exp->exp_max == -1) {
7516: if (exp->exp_min <= sub->exp_min * mult)
7517: min = 0;
7518: else
7519: min = exp->exp_min - sub->exp_min * mult;
7520: } else {
7521: #ifdef DEBUG_DERIV
7522: printf("Count, Count finite can't subsume infinite\n");
7523: #endif
7524: xmlExpFree(ctxt, tmp);
7525: return(forbiddenExp);
7526: }
7527: } else {
7528: if (exp->exp_max == -1) {
7529: #ifdef DEBUG_DERIV
7530: printf("Infinite loop consume mult finite loop\n");
7531: #endif
7532: if (exp->exp_min > sub->exp_min * mult) {
7533: max = -1;
7534: min = exp->exp_min - sub->exp_min * mult;
7535: } else {
7536: max = -1;
7537: min = 0;
7538: }
7539: } else {
7540: if (exp->exp_max < sub->exp_max * mult) {
7541: #ifdef DEBUG_DERIV
7542: printf("loops max mult mismatch => forbidden\n");
7543: #endif
7544: xmlExpFree(ctxt, tmp);
7545: return(forbiddenExp);
7546: }
7547: if (sub->exp_max * mult > exp->exp_min)
7548: min = 0;
7549: else
7550: min = exp->exp_min - sub->exp_max * mult;
7551: max = exp->exp_max - sub->exp_max * mult;
7552: }
7553: }
7554: } else if (!IS_NILLABLE(tmp)) {
7555: /*
7556: * TODO: loop here to try to grow if working on finite
7557: * blocks.
7558: */
7559: #ifdef DEBUG_DERIV
7560: printf("Count, Count remain not nillable => forbidden\n");
7561: #endif
7562: xmlExpFree(ctxt, tmp);
7563: return(forbiddenExp);
7564: } else if (sub->exp_max == -1) {
7565: if (exp->exp_max == -1) {
7566: if (exp->exp_min <= sub->exp_min) {
7567: #ifdef DEBUG_DERIV
7568: printf("Infinite loops Okay => COUNT(0,Inf)\n");
7569: #endif
7570: max = -1;
7571: min = 0;
7572: } else {
7573: #ifdef DEBUG_DERIV
7574: printf("Infinite loops min => Count(X,Inf)\n");
7575: #endif
7576: max = -1;
7577: min = exp->exp_min - sub->exp_min;
7578: }
7579: } else if (exp->exp_min > sub->exp_min) {
7580: #ifdef DEBUG_DERIV
7581: printf("loops min mismatch 1 => forbidden ???\n");
7582: #endif
7583: xmlExpFree(ctxt, tmp);
7584: return(forbiddenExp);
7585: } else {
7586: max = -1;
7587: min = 0;
7588: }
7589: } else {
7590: if (exp->exp_max == -1) {
7591: #ifdef DEBUG_DERIV
7592: printf("Infinite loop consume finite loop\n");
7593: #endif
7594: if (exp->exp_min > sub->exp_min) {
7595: max = -1;
7596: min = exp->exp_min - sub->exp_min;
7597: } else {
7598: max = -1;
7599: min = 0;
7600: }
7601: } else {
7602: if (exp->exp_max < sub->exp_max) {
7603: #ifdef DEBUG_DERIV
7604: printf("loops max mismatch => forbidden\n");
7605: #endif
7606: xmlExpFree(ctxt, tmp);
7607: return(forbiddenExp);
7608: }
7609: if (sub->exp_max > exp->exp_min)
7610: min = 0;
7611: else
7612: min = exp->exp_min - sub->exp_max;
7613: max = exp->exp_max - sub->exp_max;
7614: }
7615: }
7616: #ifdef DEBUG_DERIV
7617: printf("loops match => SEQ(COUNT())\n");
7618: #endif
7619: exp->exp_left->ref++;
7620: tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7621: NULL, NULL, min, max);
7622: if (tmp2 == NULL) {
7623: return(NULL);
7624: }
7625: ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7626: NULL, 0, 0);
7627: return(ret);
7628: }
7629: tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7630: if (tmp == NULL)
7631: return(NULL);
7632: if (tmp == forbiddenExp) {
7633: #ifdef DEBUG_DERIV
7634: printf("loop mismatch => forbidden\n");
7635: #endif
7636: return(forbiddenExp);
7637: }
7638: if (exp->exp_min > 0)
7639: min = exp->exp_min - 1;
7640: else
7641: min = 0;
7642: if (exp->exp_max < 0)
7643: max = -1;
7644: else
7645: max = exp->exp_max - 1;
7646:
7647: #ifdef DEBUG_DERIV
7648: printf("loop match => SEQ(COUNT())\n");
7649: #endif
7650: exp->exp_left->ref++;
7651: tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7652: NULL, NULL, min, max);
7653: if (tmp2 == NULL)
7654: return(NULL);
7655: ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7656: NULL, 0, 0);
7657: return(ret);
7658: }
7659: }
7660:
7661: #ifdef DEBUG_DERIV
7662: printf("Fallback to derivative\n");
7663: #endif
7664: if (IS_NILLABLE(sub)) {
7665: if (!(IS_NILLABLE(exp)))
7666: return(forbiddenExp);
7667: else
7668: ret = emptyExp;
7669: } else
7670: ret = NULL;
7671: /*
7672: * here the structured derivation made no progress so
7673: * we use the default token based derivation to force one more step
7674: */
7675: if (ctxt->tabSize == 0)
7676: ctxt->tabSize = 40;
7677:
7678: tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7679: sizeof(const xmlChar *));
7680: if (tab == NULL) {
7681: return(NULL);
7682: }
7683:
7684: /*
7685: * collect all the strings accepted by the subexpression on input
7686: */
7687: len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7688: while (len < 0) {
7689: const xmlChar **temp;
7690: temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7691: sizeof(const xmlChar *));
7692: if (temp == NULL) {
7693: xmlFree((xmlChar **) tab);
7694: return(NULL);
7695: }
7696: tab = temp;
7697: ctxt->tabSize *= 2;
7698: len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7699: }
7700: for (i = 0;i < len;i++) {
7701: tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7702: if ((tmp == NULL) || (tmp == forbiddenExp)) {
7703: xmlExpFree(ctxt, ret);
7704: xmlFree((xmlChar **) tab);
7705: return(tmp);
7706: }
7707: tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7708: if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7709: xmlExpFree(ctxt, tmp);
7710: xmlExpFree(ctxt, ret);
7711: xmlFree((xmlChar **) tab);
7712: return(tmp);
7713: }
7714: tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7715: xmlExpFree(ctxt, tmp);
7716: xmlExpFree(ctxt, tmp2);
7717:
7718: if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7719: xmlExpFree(ctxt, ret);
7720: xmlFree((xmlChar **) tab);
7721: return(tmp3);
7722: }
7723:
7724: if (ret == NULL)
7725: ret = tmp3;
7726: else {
7727: ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7728: if (ret == NULL) {
7729: xmlFree((xmlChar **) tab);
7730: return(NULL);
7731: }
7732: }
7733: }
7734: xmlFree((xmlChar **) tab);
7735: return(ret);
7736: }
7737:
7738: /**
7739: * xmlExpExpDerive:
7740: * @ctxt: the expressions context
7741: * @exp: the englobing expression
7742: * @sub: the subexpression
7743: *
7744: * Evaluates the expression resulting from @exp consuming a sub expression @sub
7745: * Based on algebraic derivation and sometimes direct Brzozowski derivation
7746: * it usually tatkes less than linear time and can handle expressions generating
7747: * infinite languages.
7748: *
7749: * Returns the resulting expression or NULL in case of internal error, the
7750: * result must be freed
7751: */
7752: xmlExpNodePtr
7753: xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7754: if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7755: return(NULL);
7756:
7757: /*
7758: * O(1) speedups
7759: */
7760: if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7761: #ifdef DEBUG_DERIV
7762: printf("Sub nillable and not exp : can't subsume\n");
7763: #endif
7764: return(forbiddenExp);
7765: }
7766: if (xmlExpCheckCard(exp, sub) == 0) {
7767: #ifdef DEBUG_DERIV
7768: printf("sub generate longuer sequances than exp : can't subsume\n");
7769: #endif
7770: return(forbiddenExp);
7771: }
7772: return(xmlExpExpDeriveInt(ctxt, exp, sub));
7773: }
7774:
7775: /**
7776: * xmlExpSubsume:
7777: * @ctxt: the expressions context
7778: * @exp: the englobing expression
7779: * @sub: the subexpression
7780: *
7781: * Check whether @exp accepts all the languages accexpted by @sub
7782: * the input being a subexpression.
7783: *
7784: * Returns 1 if true 0 if false and -1 in case of failure.
7785: */
7786: int
7787: xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7788: xmlExpNodePtr tmp;
7789:
7790: if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7791: return(-1);
7792:
7793: /*
7794: * TODO: speedup by checking the language of sub is a subset of the
7795: * language of exp
7796: */
7797: /*
7798: * O(1) speedups
7799: */
7800: if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7801: #ifdef DEBUG_DERIV
7802: printf("Sub nillable and not exp : can't subsume\n");
7803: #endif
7804: return(0);
7805: }
7806: if (xmlExpCheckCard(exp, sub) == 0) {
7807: #ifdef DEBUG_DERIV
7808: printf("sub generate longuer sequances than exp : can't subsume\n");
7809: #endif
7810: return(0);
7811: }
7812: tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7813: #ifdef DEBUG_DERIV
7814: printf("Result derivation :\n");
7815: PRINT_EXP(tmp);
7816: #endif
7817: if (tmp == NULL)
7818: return(-1);
7819: if (tmp == forbiddenExp)
7820: return(0);
7821: if (tmp == emptyExp)
7822: return(1);
7823: if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7824: xmlExpFree(ctxt, tmp);
7825: return(1);
7826: }
7827: xmlExpFree(ctxt, tmp);
7828: return(0);
7829: }
7830:
7831: /************************************************************************
7832: * *
7833: * Parsing expression *
7834: * *
7835: ************************************************************************/
7836:
7837: static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7838:
7839: #undef CUR
7840: #define CUR (*ctxt->cur)
7841: #undef NEXT
7842: #define NEXT ctxt->cur++;
7843: #undef IS_BLANK
7844: #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
7845: #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7846:
7847: static int
7848: xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7849: int ret = 0;
7850:
7851: SKIP_BLANKS
7852: if (CUR == '*') {
7853: NEXT
7854: return(-1);
7855: }
7856: if ((CUR < '0') || (CUR > '9'))
7857: return(-1);
7858: while ((CUR >= '0') && (CUR <= '9')) {
7859: ret = ret * 10 + (CUR - '0');
7860: NEXT
7861: }
7862: return(ret);
7863: }
7864:
7865: static xmlExpNodePtr
7866: xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7867: const char *base;
7868: xmlExpNodePtr ret;
7869: const xmlChar *val;
7870:
7871: SKIP_BLANKS
7872: base = ctxt->cur;
7873: if (*ctxt->cur == '(') {
7874: NEXT
7875: ret = xmlExpParseExpr(ctxt);
7876: SKIP_BLANKS
7877: if (*ctxt->cur != ')') {
7878: fprintf(stderr, "unbalanced '(' : %s\n", base);
7879: xmlExpFree(ctxt, ret);
7880: return(NULL);
7881: }
7882: NEXT;
7883: SKIP_BLANKS
7884: goto parse_quantifier;
7885: }
7886: while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7887: (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7888: (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7889: NEXT;
7890: val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7891: if (val == NULL)
7892: return(NULL);
7893: ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7894: if (ret == NULL)
7895: return(NULL);
7896: SKIP_BLANKS
7897: parse_quantifier:
7898: if (CUR == '{') {
7899: int min, max;
7900:
7901: NEXT
7902: min = xmlExpParseNumber(ctxt);
7903: if (min < 0) {
7904: xmlExpFree(ctxt, ret);
7905: return(NULL);
7906: }
7907: SKIP_BLANKS
7908: if (CUR == ',') {
7909: NEXT
7910: max = xmlExpParseNumber(ctxt);
7911: SKIP_BLANKS
7912: } else
7913: max = min;
7914: if (CUR != '}') {
7915: xmlExpFree(ctxt, ret);
7916: return(NULL);
7917: }
7918: NEXT
7919: ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7920: min, max);
7921: SKIP_BLANKS
7922: } else if (CUR == '?') {
7923: NEXT
7924: ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7925: 0, 1);
7926: SKIP_BLANKS
7927: } else if (CUR == '+') {
7928: NEXT
7929: ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7930: 1, -1);
7931: SKIP_BLANKS
7932: } else if (CUR == '*') {
7933: NEXT
7934: ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7935: 0, -1);
7936: SKIP_BLANKS
7937: }
7938: return(ret);
7939: }
7940:
7941:
7942: static xmlExpNodePtr
7943: xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
7944: xmlExpNodePtr ret, right;
7945:
7946: ret = xmlExpParseOr(ctxt);
7947: SKIP_BLANKS
7948: while (CUR == '|') {
7949: NEXT
7950: right = xmlExpParseOr(ctxt);
7951: if (right == NULL) {
7952: xmlExpFree(ctxt, ret);
7953: return(NULL);
7954: }
7955: ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
7956: if (ret == NULL)
7957: return(NULL);
7958: }
7959: return(ret);
7960: }
7961:
7962: static xmlExpNodePtr
7963: xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
7964: xmlExpNodePtr ret, right;
7965:
7966: ret = xmlExpParseSeq(ctxt);
7967: SKIP_BLANKS
7968: while (CUR == ',') {
7969: NEXT
7970: right = xmlExpParseSeq(ctxt);
7971: if (right == NULL) {
7972: xmlExpFree(ctxt, ret);
7973: return(NULL);
7974: }
7975: ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
7976: if (ret == NULL)
7977: return(NULL);
7978: }
7979: return(ret);
7980: }
7981:
7982: /**
7983: * xmlExpParse:
7984: * @ctxt: the expressions context
7985: * @expr: the 0 terminated string
7986: *
7987: * Minimal parser for regexps, it understand the following constructs
7988: * - string terminals
7989: * - choice operator |
7990: * - sequence operator ,
7991: * - subexpressions (...)
7992: * - usual cardinality operators + * and ?
7993: * - finite sequences { min, max }
7994: * - infinite sequences { min, * }
7995: * There is minimal checkings made especially no checking on strings values
7996: *
7997: * Returns a new expression or NULL in case of failure
7998: */
7999: xmlExpNodePtr
8000: xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8001: xmlExpNodePtr ret;
8002:
8003: ctxt->expr = expr;
8004: ctxt->cur = expr;
8005:
8006: ret = xmlExpParseExpr(ctxt);
8007: SKIP_BLANKS
8008: if (*ctxt->cur != 0) {
8009: xmlExpFree(ctxt, ret);
8010: return(NULL);
8011: }
8012: return(ret);
8013: }
8014:
8015: static void
8016: xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8017: xmlExpNodePtr c;
8018:
8019: if (expr == NULL) return;
8020: if (glob) xmlBufferWriteChar(buf, "(");
8021: switch (expr->type) {
8022: case XML_EXP_EMPTY:
8023: xmlBufferWriteChar(buf, "empty");
8024: break;
8025: case XML_EXP_FORBID:
8026: xmlBufferWriteChar(buf, "forbidden");
8027: break;
8028: case XML_EXP_ATOM:
8029: xmlBufferWriteCHAR(buf, expr->exp_str);
8030: break;
8031: case XML_EXP_SEQ:
8032: c = expr->exp_left;
8033: if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8034: xmlExpDumpInt(buf, c, 1);
8035: else
8036: xmlExpDumpInt(buf, c, 0);
8037: xmlBufferWriteChar(buf, " , ");
8038: c = expr->exp_right;
8039: if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8040: xmlExpDumpInt(buf, c, 1);
8041: else
8042: xmlExpDumpInt(buf, c, 0);
8043: break;
8044: case XML_EXP_OR:
8045: c = expr->exp_left;
8046: if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8047: xmlExpDumpInt(buf, c, 1);
8048: else
8049: xmlExpDumpInt(buf, c, 0);
8050: xmlBufferWriteChar(buf, " | ");
8051: c = expr->exp_right;
8052: if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8053: xmlExpDumpInt(buf, c, 1);
8054: else
8055: xmlExpDumpInt(buf, c, 0);
8056: break;
8057: case XML_EXP_COUNT: {
8058: char rep[40];
8059:
8060: c = expr->exp_left;
8061: if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8062: xmlExpDumpInt(buf, c, 1);
8063: else
8064: xmlExpDumpInt(buf, c, 0);
8065: if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8066: rep[0] = '?';
8067: rep[1] = 0;
8068: } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8069: rep[0] = '*';
8070: rep[1] = 0;
8071: } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8072: rep[0] = '+';
8073: rep[1] = 0;
8074: } else if (expr->exp_max == expr->exp_min) {
8075: snprintf(rep, 39, "{%d}", expr->exp_min);
8076: } else if (expr->exp_max < 0) {
8077: snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8078: } else {
8079: snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8080: }
8081: rep[39] = 0;
8082: xmlBufferWriteChar(buf, rep);
8083: break;
8084: }
8085: default:
8086: fprintf(stderr, "Error in tree\n");
8087: }
8088: if (glob)
8089: xmlBufferWriteChar(buf, ")");
8090: }
8091: /**
8092: * xmlExpDump:
8093: * @buf: a buffer to receive the output
8094: * @expr: the compiled expression
8095: *
8096: * Serialize the expression as compiled to the buffer
8097: */
8098: void
8099: xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8100: if ((buf == NULL) || (expr == NULL))
8101: return;
8102: xmlExpDumpInt(buf, expr, 0);
8103: }
8104:
8105: /**
8106: * xmlExpMaxToken:
8107: * @expr: a compiled expression
8108: *
8109: * Indicate the maximum number of input a expression can accept
8110: *
8111: * Returns the maximum length or -1 in case of error
8112: */
8113: int
8114: xmlExpMaxToken(xmlExpNodePtr expr) {
8115: if (expr == NULL)
8116: return(-1);
8117: return(expr->c_max);
8118: }
8119:
8120: /**
8121: * xmlExpCtxtNbNodes:
8122: * @ctxt: an expression context
8123: *
8124: * Debugging facility provides the number of allocated nodes at a that point
8125: *
8126: * Returns the number of nodes in use or -1 in case of error
8127: */
8128: int
8129: xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8130: if (ctxt == NULL)
8131: return(-1);
8132: return(ctxt->nb_nodes);
8133: }
8134:
8135: /**
8136: * xmlExpCtxtNbCons:
8137: * @ctxt: an expression context
8138: *
8139: * Debugging facility provides the number of allocated nodes over lifetime
8140: *
8141: * Returns the number of nodes ever allocated or -1 in case of error
8142: */
8143: int
8144: xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8145: if (ctxt == NULL)
8146: return(-1);
8147: return(ctxt->nb_cons);
8148: }
8149:
8150: #endif /* LIBXML_EXPR_ENABLED */
8151: #define bottom_xmlregexp
8152: #include "elfgcchack.h"
8153: #endif /* LIBXML_REGEXP_ENABLED */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>