Annotation of embedaddon/expat/lib/xmltok_impl.c, revision 1.1.1.1.2.1
1.1 misho 1: /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2: See the file COPYING for copying permission.
3: */
4:
5: /* This file is included! */
6: #ifdef XML_TOK_IMPL_C
7:
#ifndef IS_INVALID_CHAR
/* Fallback used when the including file does not supply its own check:
   no multi-byte sequence is ever reported as invalid. */
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif

/* Expands to the `case BT_LEADn:` arm of a switch over a byte type.
   Returns XML_TOK_PARTIAL_CHAR from the enclosing function when fewer than
   n bytes remain, returns XML_TOK_INVALID (storing ptr through nextTokPtr)
   when IS_INVALID_CHAR rejects the sequence, and otherwise advances ptr by
   n bytes and leaves the switch.
   `enc` and `end` are not parameters; they are picked up from the scope in
   which the macro is expanded.  `n` is pasted onto BT_LEAD, so it has to be
   a literal and is deliberately not parenthesized. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - (ptr) < n) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      (ptr) += n; \
      break;

/* Expands to every switch arm that deals with bytes which can never start
   a well-formed character: the three lead-byte arms above, plus an
   unconditional XML_TOK_INVALID for BT_NONXML, BT_MALFORM and BT_TRAIL. */
#define INVALID_CASES(ptr, nextTokPtr) \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
    case BT_NONXML: \
    case BT_MALFORM: \
    case BT_TRAIL: \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID;
32:
/* Expands to the `case BT_LEADn:` arm for an n-byte character inside a
   name.  Returns XML_TOK_PARTIAL_CHAR from the enclosing function when
   fewer than n bytes remain before `end`, returns XML_TOK_INVALID (storing
   ptr through nextTokPtr) when IS_NAME_CHAR rejects the character, and
   otherwise advances ptr by n bytes and leaves the switch.
   `n` is pasted onto BT_LEAD, so it has to be a literal and is deliberately
   not parenthesized; the other arguments are parenthesized wherever they
   are used directly so that expression arguments expand safely. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
    case BT_LEAD ## n: \
      if ((end) - (ptr) < n) \
        return XML_TOK_PARTIAL_CHAR; \
      if (!IS_NAME_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      (ptr) += n; \
      break;

/* Expands to every switch arm that accepts one more name character:
   BT_NONASCII is accepted only if IS_NAME_CHAR_MINBPC agrees (otherwise
   XML_TOK_INVALID is returned), the single-unit name byte types advance
   ptr by MINBPC(enc), and the multi-byte lead bytes are delegated to
   CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
    case BT_NONASCII: \
      if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      /* fall through */ \
    case BT_NMSTRT: \
    case BT_HEX: \
    case BT_DIGIT: \
    case BT_NAME: \
    case BT_MINUS: \
      (ptr) += MINBPC(enc); \
      break; \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
60:
/* Expands to the `case BT_LEADn:` arm for an n-byte character at the start
   of a name.  Returns XML_TOK_PARTIAL_CHAR from the enclosing function
   when fewer than n bytes remain before `end`, returns XML_TOK_INVALID
   (storing ptr through nextTokPtr) when IS_NMSTRT_CHAR rejects the
   character, and otherwise advances ptr by n bytes and leaves the switch.
   `n` is pasted onto BT_LEAD, so it has to be a literal and is deliberately
   not parenthesized; the other arguments are parenthesized wherever they
   are used directly so that expression arguments expand safely. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
    case BT_LEAD ## n: \
      if ((end) - (ptr) < n) \
        return XML_TOK_PARTIAL_CHAR; \
      if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      (ptr) += n; \
      break;

/* Expands to every switch arm that accepts a name-start character:
   BT_NONASCII is accepted only if IS_NMSTRT_CHAR_MINBPC agrees (otherwise
   XML_TOK_INVALID is returned), BT_NMSTRT and BT_HEX advance ptr by
   MINBPC(enc), and the multi-byte lead bytes are delegated to
   CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
    case BT_NONASCII: \
      if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      /* fall through */ \
    case BT_NMSTRT: \
    case BT_HEX: \
      (ptr) += MINBPC(enc); \
      break; \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
85:
/* PREFIX decorates every function name defined below.  Unless the
   including file has already defined it, names are used unchanged. */
#if !defined(PREFIX)
#define PREFIX(ident) ident
#endif
89:
90: /* ptr points to character following "<!-" */
91:
92: static int PTRCALL
93: PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
94: const char *end, const char **nextTokPtr)
95: {
96: if (ptr != end) {
97: if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
98: *nextTokPtr = ptr;
99: return XML_TOK_INVALID;
100: }
101: ptr += MINBPC(enc);
102: while (ptr != end) {
103: switch (BYTE_TYPE(enc, ptr)) {
104: INVALID_CASES(ptr, nextTokPtr)
105: case BT_MINUS:
106: if ((ptr += MINBPC(enc)) == end)
107: return XML_TOK_PARTIAL;
108: if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
109: if ((ptr += MINBPC(enc)) == end)
110: return XML_TOK_PARTIAL;
111: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
112: *nextTokPtr = ptr;
113: return XML_TOK_INVALID;
114: }
115: *nextTokPtr = ptr + MINBPC(enc);
116: return XML_TOK_COMMENT;
117: }
118: break;
119: default:
120: ptr += MINBPC(enc);
121: break;
122: }
123: }
124: }
125: return XML_TOK_PARTIAL;
126: }
127:
128: /* ptr points to character following "<!" */
129:
130: static int PTRCALL
131: PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
132: const char *end, const char **nextTokPtr)
133: {
134: if (ptr == end)
135: return XML_TOK_PARTIAL;
136: switch (BYTE_TYPE(enc, ptr)) {
137: case BT_MINUS:
138: return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
139: case BT_LSQB:
140: *nextTokPtr = ptr + MINBPC(enc);
141: return XML_TOK_COND_SECT_OPEN;
142: case BT_NMSTRT:
143: case BT_HEX:
144: ptr += MINBPC(enc);
145: break;
146: default:
147: *nextTokPtr = ptr;
148: return XML_TOK_INVALID;
149: }
150: while (ptr != end) {
151: switch (BYTE_TYPE(enc, ptr)) {
152: case BT_PERCNT:
153: if (ptr + MINBPC(enc) == end)
154: return XML_TOK_PARTIAL;
155: /* don't allow <!ENTITY% foo "whatever"> */
156: switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
157: case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
158: *nextTokPtr = ptr;
159: return XML_TOK_INVALID;
160: }
161: /* fall through */
162: case BT_S: case BT_CR: case BT_LF:
163: *nextTokPtr = ptr;
164: return XML_TOK_DECL_OPEN;
165: case BT_NMSTRT:
166: case BT_HEX:
167: ptr += MINBPC(enc);
168: break;
169: default:
170: *nextTokPtr = ptr;
171: return XML_TOK_INVALID;
172: }
173: }
174: return XML_TOK_PARTIAL;
175: }
176:
177: static int PTRCALL
178: PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
179: const char *end, int *tokPtr)
180: {
181: int upper = 0;
182: *tokPtr = XML_TOK_PI;
183: if (end - ptr != MINBPC(enc)*3)
184: return 1;
185: switch (BYTE_TO_ASCII(enc, ptr)) {
186: case ASCII_x:
187: break;
188: case ASCII_X:
189: upper = 1;
190: break;
191: default:
192: return 1;
193: }
194: ptr += MINBPC(enc);
195: switch (BYTE_TO_ASCII(enc, ptr)) {
196: case ASCII_m:
197: break;
198: case ASCII_M:
199: upper = 1;
200: break;
201: default:
202: return 1;
203: }
204: ptr += MINBPC(enc);
205: switch (BYTE_TO_ASCII(enc, ptr)) {
206: case ASCII_l:
207: break;
208: case ASCII_L:
209: upper = 1;
210: break;
211: default:
212: return 1;
213: }
214: if (upper)
215: return 0;
216: *tokPtr = XML_TOK_XML_DECL;
217: return 1;
218: }
219:
220: /* ptr points to character following "<?" */
221:
222: static int PTRCALL
223: PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
224: const char *end, const char **nextTokPtr)
225: {
226: int tok;
227: const char *target = ptr;
228: if (ptr == end)
229: return XML_TOK_PARTIAL;
230: switch (BYTE_TYPE(enc, ptr)) {
231: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
232: default:
233: *nextTokPtr = ptr;
234: return XML_TOK_INVALID;
235: }
236: while (ptr != end) {
237: switch (BYTE_TYPE(enc, ptr)) {
238: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
239: case BT_S: case BT_CR: case BT_LF:
240: if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
241: *nextTokPtr = ptr;
242: return XML_TOK_INVALID;
243: }
244: ptr += MINBPC(enc);
245: while (ptr != end) {
246: switch (BYTE_TYPE(enc, ptr)) {
247: INVALID_CASES(ptr, nextTokPtr)
248: case BT_QUEST:
249: ptr += MINBPC(enc);
250: if (ptr == end)
251: return XML_TOK_PARTIAL;
252: if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
253: *nextTokPtr = ptr + MINBPC(enc);
254: return tok;
255: }
256: break;
257: default:
258: ptr += MINBPC(enc);
259: break;
260: }
261: }
262: return XML_TOK_PARTIAL;
263: case BT_QUEST:
264: if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
265: *nextTokPtr = ptr;
266: return XML_TOK_INVALID;
267: }
268: ptr += MINBPC(enc);
269: if (ptr == end)
270: return XML_TOK_PARTIAL;
271: if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
272: *nextTokPtr = ptr + MINBPC(enc);
273: return tok;
274: }
275: /* fall through */
276: default:
277: *nextTokPtr = ptr;
278: return XML_TOK_INVALID;
279: }
280: }
281: return XML_TOK_PARTIAL;
282: }
283:
284: static int PTRCALL
285: PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
286: const char *end, const char **nextTokPtr)
287: {
288: static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
289: ASCII_T, ASCII_A, ASCII_LSQB };
290: int i;
291: /* CDATA[ */
292: if (end - ptr < 6 * MINBPC(enc))
293: return XML_TOK_PARTIAL;
294: for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
295: if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
296: *nextTokPtr = ptr;
297: return XML_TOK_INVALID;
298: }
299: }
300: *nextTokPtr = ptr;
301: return XML_TOK_CDATA_SECT_OPEN;
302: }
303:
304: static int PTRCALL
305: PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
306: const char *end, const char **nextTokPtr)
307: {
308: if (ptr == end)
309: return XML_TOK_NONE;
310: if (MINBPC(enc) > 1) {
311: size_t n = end - ptr;
312: if (n & (MINBPC(enc) - 1)) {
313: n &= ~(MINBPC(enc) - 1);
314: if (n == 0)
315: return XML_TOK_PARTIAL;
316: end = ptr + n;
317: }
318: }
319: switch (BYTE_TYPE(enc, ptr)) {
320: case BT_RSQB:
321: ptr += MINBPC(enc);
322: if (ptr == end)
323: return XML_TOK_PARTIAL;
324: if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
325: break;
326: ptr += MINBPC(enc);
327: if (ptr == end)
328: return XML_TOK_PARTIAL;
329: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
330: ptr -= MINBPC(enc);
331: break;
332: }
333: *nextTokPtr = ptr + MINBPC(enc);
334: return XML_TOK_CDATA_SECT_CLOSE;
335: case BT_CR:
336: ptr += MINBPC(enc);
337: if (ptr == end)
338: return XML_TOK_PARTIAL;
339: if (BYTE_TYPE(enc, ptr) == BT_LF)
340: ptr += MINBPC(enc);
341: *nextTokPtr = ptr;
342: return XML_TOK_DATA_NEWLINE;
343: case BT_LF:
344: *nextTokPtr = ptr + MINBPC(enc);
345: return XML_TOK_DATA_NEWLINE;
346: INVALID_CASES(ptr, nextTokPtr)
347: default:
348: ptr += MINBPC(enc);
349: break;
350: }
351: while (ptr != end) {
352: switch (BYTE_TYPE(enc, ptr)) {
353: #define LEAD_CASE(n) \
354: case BT_LEAD ## n: \
355: if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
356: *nextTokPtr = ptr; \
357: return XML_TOK_DATA_CHARS; \
358: } \
359: ptr += n; \
360: break;
361: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
362: #undef LEAD_CASE
363: case BT_NONXML:
364: case BT_MALFORM:
365: case BT_TRAIL:
366: case BT_CR:
367: case BT_LF:
368: case BT_RSQB:
369: *nextTokPtr = ptr;
370: return XML_TOK_DATA_CHARS;
371: default:
372: ptr += MINBPC(enc);
373: break;
374: }
375: }
376: *nextTokPtr = ptr;
377: return XML_TOK_DATA_CHARS;
378: }
379:
380: /* ptr points to character following "</" */
381:
382: static int PTRCALL
383: PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
384: const char *end, const char **nextTokPtr)
385: {
386: if (ptr == end)
387: return XML_TOK_PARTIAL;
388: switch (BYTE_TYPE(enc, ptr)) {
389: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
390: default:
391: *nextTokPtr = ptr;
392: return XML_TOK_INVALID;
393: }
394: while (ptr != end) {
395: switch (BYTE_TYPE(enc, ptr)) {
396: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
397: case BT_S: case BT_CR: case BT_LF:
398: for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
399: switch (BYTE_TYPE(enc, ptr)) {
400: case BT_S: case BT_CR: case BT_LF:
401: break;
402: case BT_GT:
403: *nextTokPtr = ptr + MINBPC(enc);
404: return XML_TOK_END_TAG;
405: default:
406: *nextTokPtr = ptr;
407: return XML_TOK_INVALID;
408: }
409: }
410: return XML_TOK_PARTIAL;
411: #ifdef XML_NS
412: case BT_COLON:
413: /* no need to check qname syntax here,
414: since end-tag must match exactly */
415: ptr += MINBPC(enc);
416: break;
417: #endif
418: case BT_GT:
419: *nextTokPtr = ptr + MINBPC(enc);
420: return XML_TOK_END_TAG;
421: default:
422: *nextTokPtr = ptr;
423: return XML_TOK_INVALID;
424: }
425: }
426: return XML_TOK_PARTIAL;
427: }
428:
429: /* ptr points to character following "&#X" */
430:
431: static int PTRCALL
432: PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
433: const char *end, const char **nextTokPtr)
434: {
435: if (ptr != end) {
436: switch (BYTE_TYPE(enc, ptr)) {
437: case BT_DIGIT:
438: case BT_HEX:
439: break;
440: default:
441: *nextTokPtr = ptr;
442: return XML_TOK_INVALID;
443: }
444: for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
445: switch (BYTE_TYPE(enc, ptr)) {
446: case BT_DIGIT:
447: case BT_HEX:
448: break;
449: case BT_SEMI:
450: *nextTokPtr = ptr + MINBPC(enc);
451: return XML_TOK_CHAR_REF;
452: default:
453: *nextTokPtr = ptr;
454: return XML_TOK_INVALID;
455: }
456: }
457: }
458: return XML_TOK_PARTIAL;
459: }
460:
461: /* ptr points to character following "&#" */
462:
463: static int PTRCALL
464: PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
465: const char *end, const char **nextTokPtr)
466: {
467: if (ptr != end) {
468: if (CHAR_MATCHES(enc, ptr, ASCII_x))
469: return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
470: switch (BYTE_TYPE(enc, ptr)) {
471: case BT_DIGIT:
472: break;
473: default:
474: *nextTokPtr = ptr;
475: return XML_TOK_INVALID;
476: }
477: for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
478: switch (BYTE_TYPE(enc, ptr)) {
479: case BT_DIGIT:
480: break;
481: case BT_SEMI:
482: *nextTokPtr = ptr + MINBPC(enc);
483: return XML_TOK_CHAR_REF;
484: default:
485: *nextTokPtr = ptr;
486: return XML_TOK_INVALID;
487: }
488: }
489: }
490: return XML_TOK_PARTIAL;
491: }
492:
493: /* ptr points to character following "&" */
494:
495: static int PTRCALL
496: PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
497: const char **nextTokPtr)
498: {
499: if (ptr == end)
500: return XML_TOK_PARTIAL;
501: switch (BYTE_TYPE(enc, ptr)) {
502: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
503: case BT_NUM:
504: return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
505: default:
506: *nextTokPtr = ptr;
507: return XML_TOK_INVALID;
508: }
509: while (ptr != end) {
510: switch (BYTE_TYPE(enc, ptr)) {
511: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
512: case BT_SEMI:
513: *nextTokPtr = ptr + MINBPC(enc);
514: return XML_TOK_ENTITY_REF;
515: default:
516: *nextTokPtr = ptr;
517: return XML_TOK_INVALID;
518: }
519: }
520: return XML_TOK_PARTIAL;
521: }
522:
523: /* ptr points to character following first character of attribute name */
524:
525: static int PTRCALL
526: PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
527: const char **nextTokPtr)
528: {
529: #ifdef XML_NS
530: int hadColon = 0;
531: #endif
532: while (ptr != end) {
533: switch (BYTE_TYPE(enc, ptr)) {
534: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
535: #ifdef XML_NS
536: case BT_COLON:
537: if (hadColon) {
538: *nextTokPtr = ptr;
539: return XML_TOK_INVALID;
540: }
541: hadColon = 1;
542: ptr += MINBPC(enc);
543: if (ptr == end)
544: return XML_TOK_PARTIAL;
545: switch (BYTE_TYPE(enc, ptr)) {
546: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
547: default:
548: *nextTokPtr = ptr;
549: return XML_TOK_INVALID;
550: }
551: break;
552: #endif
553: case BT_S: case BT_CR: case BT_LF:
554: for (;;) {
555: int t;
556:
557: ptr += MINBPC(enc);
558: if (ptr == end)
559: return XML_TOK_PARTIAL;
560: t = BYTE_TYPE(enc, ptr);
561: if (t == BT_EQUALS)
562: break;
563: switch (t) {
564: case BT_S:
565: case BT_LF:
566: case BT_CR:
567: break;
568: default:
569: *nextTokPtr = ptr;
570: return XML_TOK_INVALID;
571: }
572: }
573: /* fall through */
574: case BT_EQUALS:
575: {
576: int open;
577: #ifdef XML_NS
578: hadColon = 0;
579: #endif
580: for (;;) {
581: ptr += MINBPC(enc);
582: if (ptr == end)
583: return XML_TOK_PARTIAL;
584: open = BYTE_TYPE(enc, ptr);
585: if (open == BT_QUOT || open == BT_APOS)
586: break;
587: switch (open) {
588: case BT_S:
589: case BT_LF:
590: case BT_CR:
591: break;
592: default:
593: *nextTokPtr = ptr;
594: return XML_TOK_INVALID;
595: }
596: }
597: ptr += MINBPC(enc);
598: /* in attribute value */
599: for (;;) {
600: int t;
601: if (ptr == end)
602: return XML_TOK_PARTIAL;
603: t = BYTE_TYPE(enc, ptr);
604: if (t == open)
605: break;
606: switch (t) {
607: INVALID_CASES(ptr, nextTokPtr)
608: case BT_AMP:
609: {
610: int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
611: if (tok <= 0) {
612: if (tok == XML_TOK_INVALID)
613: *nextTokPtr = ptr;
614: return tok;
615: }
616: break;
617: }
618: case BT_LT:
619: *nextTokPtr = ptr;
620: return XML_TOK_INVALID;
621: default:
622: ptr += MINBPC(enc);
623: break;
624: }
625: }
626: ptr += MINBPC(enc);
627: if (ptr == end)
628: return XML_TOK_PARTIAL;
629: switch (BYTE_TYPE(enc, ptr)) {
630: case BT_S:
631: case BT_CR:
632: case BT_LF:
633: break;
634: case BT_SOL:
635: goto sol;
636: case BT_GT:
637: goto gt;
638: default:
639: *nextTokPtr = ptr;
640: return XML_TOK_INVALID;
641: }
642: /* ptr points to closing quote */
643: for (;;) {
644: ptr += MINBPC(enc);
645: if (ptr == end)
646: return XML_TOK_PARTIAL;
647: switch (BYTE_TYPE(enc, ptr)) {
648: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
649: case BT_S: case BT_CR: case BT_LF:
650: continue;
651: case BT_GT:
652: gt:
653: *nextTokPtr = ptr + MINBPC(enc);
654: return XML_TOK_START_TAG_WITH_ATTS;
655: case BT_SOL:
656: sol:
657: ptr += MINBPC(enc);
658: if (ptr == end)
659: return XML_TOK_PARTIAL;
660: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
661: *nextTokPtr = ptr;
662: return XML_TOK_INVALID;
663: }
664: *nextTokPtr = ptr + MINBPC(enc);
665: return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
666: default:
667: *nextTokPtr = ptr;
668: return XML_TOK_INVALID;
669: }
670: break;
671: }
672: break;
673: }
674: default:
675: *nextTokPtr = ptr;
676: return XML_TOK_INVALID;
677: }
678: }
679: return XML_TOK_PARTIAL;
680: }
681:
682: /* ptr points to character following "<" */
683:
684: static int PTRCALL
685: PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
686: const char **nextTokPtr)
687: {
688: #ifdef XML_NS
689: int hadColon;
690: #endif
691: if (ptr == end)
692: return XML_TOK_PARTIAL;
693: switch (BYTE_TYPE(enc, ptr)) {
694: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
695: case BT_EXCL:
696: if ((ptr += MINBPC(enc)) == end)
697: return XML_TOK_PARTIAL;
698: switch (BYTE_TYPE(enc, ptr)) {
699: case BT_MINUS:
700: return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
701: case BT_LSQB:
702: return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
703: end, nextTokPtr);
704: }
705: *nextTokPtr = ptr;
706: return XML_TOK_INVALID;
707: case BT_QUEST:
708: return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
709: case BT_SOL:
710: return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
711: default:
712: *nextTokPtr = ptr;
713: return XML_TOK_INVALID;
714: }
715: #ifdef XML_NS
716: hadColon = 0;
717: #endif
718: /* we have a start-tag */
719: while (ptr != end) {
720: switch (BYTE_TYPE(enc, ptr)) {
721: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
722: #ifdef XML_NS
723: case BT_COLON:
724: if (hadColon) {
725: *nextTokPtr = ptr;
726: return XML_TOK_INVALID;
727: }
728: hadColon = 1;
729: ptr += MINBPC(enc);
730: if (ptr == end)
731: return XML_TOK_PARTIAL;
732: switch (BYTE_TYPE(enc, ptr)) {
733: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
734: default:
735: *nextTokPtr = ptr;
736: return XML_TOK_INVALID;
737: }
738: break;
739: #endif
740: case BT_S: case BT_CR: case BT_LF:
741: {
742: ptr += MINBPC(enc);
743: while (ptr != end) {
744: switch (BYTE_TYPE(enc, ptr)) {
745: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
746: case BT_GT:
747: goto gt;
748: case BT_SOL:
749: goto sol;
750: case BT_S: case BT_CR: case BT_LF:
751: ptr += MINBPC(enc);
752: continue;
753: default:
754: *nextTokPtr = ptr;
755: return XML_TOK_INVALID;
756: }
757: return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
758: }
759: return XML_TOK_PARTIAL;
760: }
761: case BT_GT:
762: gt:
763: *nextTokPtr = ptr + MINBPC(enc);
764: return XML_TOK_START_TAG_NO_ATTS;
765: case BT_SOL:
766: sol:
767: ptr += MINBPC(enc);
768: if (ptr == end)
769: return XML_TOK_PARTIAL;
770: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
771: *nextTokPtr = ptr;
772: return XML_TOK_INVALID;
773: }
774: *nextTokPtr = ptr + MINBPC(enc);
775: return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
776: default:
777: *nextTokPtr = ptr;
778: return XML_TOK_INVALID;
779: }
780: }
781: return XML_TOK_PARTIAL;
782: }
783:
784: static int PTRCALL
785: PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
786: const char **nextTokPtr)
787: {
788: if (ptr == end)
789: return XML_TOK_NONE;
790: if (MINBPC(enc) > 1) {
791: size_t n = end - ptr;
792: if (n & (MINBPC(enc) - 1)) {
793: n &= ~(MINBPC(enc) - 1);
794: if (n == 0)
795: return XML_TOK_PARTIAL;
796: end = ptr + n;
797: }
798: }
799: switch (BYTE_TYPE(enc, ptr)) {
800: case BT_LT:
801: return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
802: case BT_AMP:
803: return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
804: case BT_CR:
805: ptr += MINBPC(enc);
806: if (ptr == end)
807: return XML_TOK_TRAILING_CR;
808: if (BYTE_TYPE(enc, ptr) == BT_LF)
809: ptr += MINBPC(enc);
810: *nextTokPtr = ptr;
811: return XML_TOK_DATA_NEWLINE;
812: case BT_LF:
813: *nextTokPtr = ptr + MINBPC(enc);
814: return XML_TOK_DATA_NEWLINE;
815: case BT_RSQB:
816: ptr += MINBPC(enc);
817: if (ptr == end)
818: return XML_TOK_TRAILING_RSQB;
819: if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
820: break;
821: ptr += MINBPC(enc);
822: if (ptr == end)
823: return XML_TOK_TRAILING_RSQB;
824: if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
825: ptr -= MINBPC(enc);
826: break;
827: }
828: *nextTokPtr = ptr;
829: return XML_TOK_INVALID;
830: INVALID_CASES(ptr, nextTokPtr)
831: default:
832: ptr += MINBPC(enc);
833: break;
834: }
835: while (ptr != end) {
836: switch (BYTE_TYPE(enc, ptr)) {
837: #define LEAD_CASE(n) \
838: case BT_LEAD ## n: \
839: if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
840: *nextTokPtr = ptr; \
841: return XML_TOK_DATA_CHARS; \
842: } \
843: ptr += n; \
844: break;
845: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
846: #undef LEAD_CASE
847: case BT_RSQB:
848: if (ptr + MINBPC(enc) != end) {
849: if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
850: ptr += MINBPC(enc);
851: break;
852: }
853: if (ptr + 2*MINBPC(enc) != end) {
854: if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
855: ptr += MINBPC(enc);
856: break;
857: }
858: *nextTokPtr = ptr + 2*MINBPC(enc);
859: return XML_TOK_INVALID;
860: }
861: }
862: /* fall through */
863: case BT_AMP:
864: case BT_LT:
865: case BT_NONXML:
866: case BT_MALFORM:
867: case BT_TRAIL:
868: case BT_CR:
869: case BT_LF:
870: *nextTokPtr = ptr;
871: return XML_TOK_DATA_CHARS;
872: default:
873: ptr += MINBPC(enc);
874: break;
875: }
876: }
877: *nextTokPtr = ptr;
878: return XML_TOK_DATA_CHARS;
879: }
880:
881: /* ptr points to character following "%" */
882:
883: static int PTRCALL
884: PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
885: const char **nextTokPtr)
886: {
887: if (ptr == end)
888: return -XML_TOK_PERCENT;
889: switch (BYTE_TYPE(enc, ptr)) {
890: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
891: case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
892: *nextTokPtr = ptr;
893: return XML_TOK_PERCENT;
894: default:
895: *nextTokPtr = ptr;
896: return XML_TOK_INVALID;
897: }
898: while (ptr != end) {
899: switch (BYTE_TYPE(enc, ptr)) {
900: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
901: case BT_SEMI:
902: *nextTokPtr = ptr + MINBPC(enc);
903: return XML_TOK_PARAM_ENTITY_REF;
904: default:
905: *nextTokPtr = ptr;
906: return XML_TOK_INVALID;
907: }
908: }
909: return XML_TOK_PARTIAL;
910: }
911:
912: static int PTRCALL
913: PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
914: const char **nextTokPtr)
915: {
916: if (ptr == end)
917: return XML_TOK_PARTIAL;
918: switch (BYTE_TYPE(enc, ptr)) {
919: CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
920: default:
921: *nextTokPtr = ptr;
922: return XML_TOK_INVALID;
923: }
924: while (ptr != end) {
925: switch (BYTE_TYPE(enc, ptr)) {
926: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
927: case BT_CR: case BT_LF: case BT_S:
928: case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
929: *nextTokPtr = ptr;
930: return XML_TOK_POUND_NAME;
931: default:
932: *nextTokPtr = ptr;
933: return XML_TOK_INVALID;
934: }
935: }
936: return -XML_TOK_POUND_NAME;
937: }
938:
939: static int PTRCALL
940: PREFIX(scanLit)(int open, const ENCODING *enc,
941: const char *ptr, const char *end,
942: const char **nextTokPtr)
943: {
944: while (ptr != end) {
945: int t = BYTE_TYPE(enc, ptr);
946: switch (t) {
947: INVALID_CASES(ptr, nextTokPtr)
948: case BT_QUOT:
949: case BT_APOS:
950: ptr += MINBPC(enc);
951: if (t != open)
952: break;
953: if (ptr == end)
954: return -XML_TOK_LITERAL;
955: *nextTokPtr = ptr;
956: switch (BYTE_TYPE(enc, ptr)) {
957: case BT_S: case BT_CR: case BT_LF:
958: case BT_GT: case BT_PERCNT: case BT_LSQB:
959: return XML_TOK_LITERAL;
960: default:
961: return XML_TOK_INVALID;
962: }
963: default:
964: ptr += MINBPC(enc);
965: break;
966: }
967: }
968: return XML_TOK_PARTIAL;
969: }
970:
971: static int PTRCALL
972: PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
973: const char **nextTokPtr)
974: {
975: int tok;
976: if (ptr == end)
977: return XML_TOK_NONE;
978: if (MINBPC(enc) > 1) {
979: size_t n = end - ptr;
980: if (n & (MINBPC(enc) - 1)) {
981: n &= ~(MINBPC(enc) - 1);
982: if (n == 0)
983: return XML_TOK_PARTIAL;
984: end = ptr + n;
985: }
986: }
987: switch (BYTE_TYPE(enc, ptr)) {
988: case BT_QUOT:
989: return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
990: case BT_APOS:
991: return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
992: case BT_LT:
993: {
994: ptr += MINBPC(enc);
995: if (ptr == end)
996: return XML_TOK_PARTIAL;
997: switch (BYTE_TYPE(enc, ptr)) {
998: case BT_EXCL:
999: return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1000: case BT_QUEST:
1001: return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1002: case BT_NMSTRT:
1003: case BT_HEX:
1004: case BT_NONASCII:
1005: case BT_LEAD2:
1006: case BT_LEAD3:
1007: case BT_LEAD4:
1008: *nextTokPtr = ptr - MINBPC(enc);
1009: return XML_TOK_INSTANCE_START;
1010: }
1011: *nextTokPtr = ptr;
1012: return XML_TOK_INVALID;
1013: }
1014: case BT_CR:
1015: if (ptr + MINBPC(enc) == end) {
1016: *nextTokPtr = end;
1017: /* indicate that this might be part of a CR/LF pair */
1018: return -XML_TOK_PROLOG_S;
1019: }
1020: /* fall through */
1021: case BT_S: case BT_LF:
1022: for (;;) {
1023: ptr += MINBPC(enc);
1024: if (ptr == end)
1025: break;
1026: switch (BYTE_TYPE(enc, ptr)) {
1027: case BT_S: case BT_LF:
1028: break;
1029: case BT_CR:
1030: /* don't split CR/LF pair */
1031: if (ptr + MINBPC(enc) != end)
1032: break;
1033: /* fall through */
1034: default:
1035: *nextTokPtr = ptr;
1036: return XML_TOK_PROLOG_S;
1037: }
1038: }
1039: *nextTokPtr = ptr;
1040: return XML_TOK_PROLOG_S;
1041: case BT_PERCNT:
1042: return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1043: case BT_COMMA:
1044: *nextTokPtr = ptr + MINBPC(enc);
1045: return XML_TOK_COMMA;
1046: case BT_LSQB:
1047: *nextTokPtr = ptr + MINBPC(enc);
1048: return XML_TOK_OPEN_BRACKET;
1049: case BT_RSQB:
1050: ptr += MINBPC(enc);
1051: if (ptr == end)
1052: return -XML_TOK_CLOSE_BRACKET;
1053: if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1054: if (ptr + MINBPC(enc) == end)
1055: return XML_TOK_PARTIAL;
1056: if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1057: *nextTokPtr = ptr + 2*MINBPC(enc);
1058: return XML_TOK_COND_SECT_CLOSE;
1059: }
1060: }
1061: *nextTokPtr = ptr;
1062: return XML_TOK_CLOSE_BRACKET;
1063: case BT_LPAR:
1064: *nextTokPtr = ptr + MINBPC(enc);
1065: return XML_TOK_OPEN_PAREN;
1066: case BT_RPAR:
1067: ptr += MINBPC(enc);
1068: if (ptr == end)
1069: return -XML_TOK_CLOSE_PAREN;
1070: switch (BYTE_TYPE(enc, ptr)) {
1071: case BT_AST:
1072: *nextTokPtr = ptr + MINBPC(enc);
1073: return XML_TOK_CLOSE_PAREN_ASTERISK;
1074: case BT_QUEST:
1075: *nextTokPtr = ptr + MINBPC(enc);
1076: return XML_TOK_CLOSE_PAREN_QUESTION;
1077: case BT_PLUS:
1078: *nextTokPtr = ptr + MINBPC(enc);
1079: return XML_TOK_CLOSE_PAREN_PLUS;
1080: case BT_CR: case BT_LF: case BT_S:
1081: case BT_GT: case BT_COMMA: case BT_VERBAR:
1082: case BT_RPAR:
1083: *nextTokPtr = ptr;
1084: return XML_TOK_CLOSE_PAREN;
1085: }
1086: *nextTokPtr = ptr;
1087: return XML_TOK_INVALID;
1088: case BT_VERBAR:
1089: *nextTokPtr = ptr + MINBPC(enc);
1090: return XML_TOK_OR;
1091: case BT_GT:
1092: *nextTokPtr = ptr + MINBPC(enc);
1093: return XML_TOK_DECL_CLOSE;
1094: case BT_NUM:
1095: return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1096: #define LEAD_CASE(n) \
1097: case BT_LEAD ## n: \
1098: if (end - ptr < n) \
1099: return XML_TOK_PARTIAL_CHAR; \
1100: if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1101: ptr += n; \
1102: tok = XML_TOK_NAME; \
1103: break; \
1104: } \
1105: if (IS_NAME_CHAR(enc, ptr, n)) { \
1106: ptr += n; \
1107: tok = XML_TOK_NMTOKEN; \
1108: break; \
1109: } \
1110: *nextTokPtr = ptr; \
1111: return XML_TOK_INVALID;
1112: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1113: #undef LEAD_CASE
1114: case BT_NMSTRT:
1115: case BT_HEX:
1116: tok = XML_TOK_NAME;
1117: ptr += MINBPC(enc);
1118: break;
1119: case BT_DIGIT:
1120: case BT_NAME:
1121: case BT_MINUS:
1122: #ifdef XML_NS
1123: case BT_COLON:
1124: #endif
1125: tok = XML_TOK_NMTOKEN;
1126: ptr += MINBPC(enc);
1127: break;
1128: case BT_NONASCII:
1129: if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1130: ptr += MINBPC(enc);
1131: tok = XML_TOK_NAME;
1132: break;
1133: }
1134: if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1135: ptr += MINBPC(enc);
1136: tok = XML_TOK_NMTOKEN;
1137: break;
1138: }
1139: /* fall through */
1140: default:
1141: *nextTokPtr = ptr;
1142: return XML_TOK_INVALID;
1143: }
1144: while (ptr != end) {
1145: switch (BYTE_TYPE(enc, ptr)) {
1146: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1147: case BT_GT: case BT_RPAR: case BT_COMMA:
1148: case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1149: case BT_S: case BT_CR: case BT_LF:
1150: *nextTokPtr = ptr;
1151: return tok;
1152: #ifdef XML_NS
1153: case BT_COLON:
1154: ptr += MINBPC(enc);
1155: switch (tok) {
1156: case XML_TOK_NAME:
1157: if (ptr == end)
1158: return XML_TOK_PARTIAL;
1159: tok = XML_TOK_PREFIXED_NAME;
1160: switch (BYTE_TYPE(enc, ptr)) {
1161: CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1162: default:
1163: tok = XML_TOK_NMTOKEN;
1164: break;
1165: }
1166: break;
1167: case XML_TOK_PREFIXED_NAME:
1168: tok = XML_TOK_NMTOKEN;
1169: break;
1170: }
1171: break;
1172: #endif
1173: case BT_PLUS:
1174: if (tok == XML_TOK_NMTOKEN) {
1175: *nextTokPtr = ptr;
1176: return XML_TOK_INVALID;
1177: }
1178: *nextTokPtr = ptr + MINBPC(enc);
1179: return XML_TOK_NAME_PLUS;
1180: case BT_AST:
1181: if (tok == XML_TOK_NMTOKEN) {
1182: *nextTokPtr = ptr;
1183: return XML_TOK_INVALID;
1184: }
1185: *nextTokPtr = ptr + MINBPC(enc);
1186: return XML_TOK_NAME_ASTERISK;
1187: case BT_QUEST:
1188: if (tok == XML_TOK_NMTOKEN) {
1189: *nextTokPtr = ptr;
1190: return XML_TOK_INVALID;
1191: }
1192: *nextTokPtr = ptr + MINBPC(enc);
1193: return XML_TOK_NAME_QUESTION;
1194: default:
1195: *nextTokPtr = ptr;
1196: return XML_TOK_INVALID;
1197: }
1198: }
1199: return -tok;
1200: }
1201:
1202: static int PTRCALL
1203: PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1204: const char *end, const char **nextTokPtr)
1205: {
1206: const char *start;
1207: if (ptr == end)
1208: return XML_TOK_NONE;
1209: start = ptr;
1210: while (ptr != end) {
1211: switch (BYTE_TYPE(enc, ptr)) {
1212: #define LEAD_CASE(n) \
1213: case BT_LEAD ## n: ptr += n; break;
1214: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1215: #undef LEAD_CASE
1216: case BT_AMP:
1217: if (ptr == start)
1218: return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1219: *nextTokPtr = ptr;
1220: return XML_TOK_DATA_CHARS;
1221: case BT_LT:
1222: /* this is for inside entity references */
1223: *nextTokPtr = ptr;
1224: return XML_TOK_INVALID;
1225: case BT_LF:
1226: if (ptr == start) {
1227: *nextTokPtr = ptr + MINBPC(enc);
1228: return XML_TOK_DATA_NEWLINE;
1229: }
1230: *nextTokPtr = ptr;
1231: return XML_TOK_DATA_CHARS;
1232: case BT_CR:
1233: if (ptr == start) {
1234: ptr += MINBPC(enc);
1235: if (ptr == end)
1236: return XML_TOK_TRAILING_CR;
1237: if (BYTE_TYPE(enc, ptr) == BT_LF)
1238: ptr += MINBPC(enc);
1239: *nextTokPtr = ptr;
1240: return XML_TOK_DATA_NEWLINE;
1241: }
1242: *nextTokPtr = ptr;
1243: return XML_TOK_DATA_CHARS;
1244: case BT_S:
1245: if (ptr == start) {
1246: *nextTokPtr = ptr + MINBPC(enc);
1247: return XML_TOK_ATTRIBUTE_VALUE_S;
1248: }
1249: *nextTokPtr = ptr;
1250: return XML_TOK_DATA_CHARS;
1251: default:
1252: ptr += MINBPC(enc);
1253: break;
1254: }
1255: }
1256: *nextTokPtr = ptr;
1257: return XML_TOK_DATA_CHARS;
1258: }
1259:
1260: static int PTRCALL
1261: PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1262: const char *end, const char **nextTokPtr)
1263: {
1264: const char *start;
1265: if (ptr == end)
1266: return XML_TOK_NONE;
1267: start = ptr;
1268: while (ptr != end) {
1269: switch (BYTE_TYPE(enc, ptr)) {
1270: #define LEAD_CASE(n) \
1271: case BT_LEAD ## n: ptr += n; break;
1272: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1273: #undef LEAD_CASE
1274: case BT_AMP:
1275: if (ptr == start)
1276: return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1277: *nextTokPtr = ptr;
1278: return XML_TOK_DATA_CHARS;
1279: case BT_PERCNT:
1280: if (ptr == start) {
1281: int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1282: end, nextTokPtr);
1283: return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1284: }
1285: *nextTokPtr = ptr;
1286: return XML_TOK_DATA_CHARS;
1287: case BT_LF:
1288: if (ptr == start) {
1289: *nextTokPtr = ptr + MINBPC(enc);
1290: return XML_TOK_DATA_NEWLINE;
1291: }
1292: *nextTokPtr = ptr;
1293: return XML_TOK_DATA_CHARS;
1294: case BT_CR:
1295: if (ptr == start) {
1296: ptr += MINBPC(enc);
1297: if (ptr == end)
1298: return XML_TOK_TRAILING_CR;
1299: if (BYTE_TYPE(enc, ptr) == BT_LF)
1300: ptr += MINBPC(enc);
1301: *nextTokPtr = ptr;
1302: return XML_TOK_DATA_NEWLINE;
1303: }
1304: *nextTokPtr = ptr;
1305: return XML_TOK_DATA_CHARS;
1306: default:
1307: ptr += MINBPC(enc);
1308: break;
1309: }
1310: }
1311: *nextTokPtr = ptr;
1312: return XML_TOK_DATA_CHARS;
1313: }
1314:
1315: #ifdef XML_DTD
1316:
1317: static int PTRCALL
1318: PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1319: const char *end, const char **nextTokPtr)
1320: {
1321: int level = 0;
1322: if (MINBPC(enc) > 1) {
1323: size_t n = end - ptr;
1324: if (n & (MINBPC(enc) - 1)) {
1325: n &= ~(MINBPC(enc) - 1);
1326: end = ptr + n;
1327: }
1328: }
1329: while (ptr != end) {
1330: switch (BYTE_TYPE(enc, ptr)) {
1331: INVALID_CASES(ptr, nextTokPtr)
1332: case BT_LT:
1333: if ((ptr += MINBPC(enc)) == end)
1334: return XML_TOK_PARTIAL;
1335: if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1336: if ((ptr += MINBPC(enc)) == end)
1337: return XML_TOK_PARTIAL;
1338: if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1339: ++level;
1340: ptr += MINBPC(enc);
1341: }
1342: }
1343: break;
1344: case BT_RSQB:
1345: if ((ptr += MINBPC(enc)) == end)
1346: return XML_TOK_PARTIAL;
1347: if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1348: if ((ptr += MINBPC(enc)) == end)
1349: return XML_TOK_PARTIAL;
1350: if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1351: ptr += MINBPC(enc);
1352: if (level == 0) {
1353: *nextTokPtr = ptr;
1354: return XML_TOK_IGNORE_SECT;
1355: }
1356: --level;
1357: }
1358: }
1359: break;
1360: default:
1361: ptr += MINBPC(enc);
1362: break;
1363: }
1364: }
1365: return XML_TOK_PARTIAL;
1366: }
1367:
1368: #endif /* XML_DTD */
1369:
1370: static int PTRCALL
1371: PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1372: const char **badPtr)
1373: {
1374: ptr += MINBPC(enc);
1375: end -= MINBPC(enc);
1376: for (; ptr != end; ptr += MINBPC(enc)) {
1377: switch (BYTE_TYPE(enc, ptr)) {
1378: case BT_DIGIT:
1379: case BT_HEX:
1380: case BT_MINUS:
1381: case BT_APOS:
1382: case BT_LPAR:
1383: case BT_RPAR:
1384: case BT_PLUS:
1385: case BT_COMMA:
1386: case BT_SOL:
1387: case BT_EQUALS:
1388: case BT_QUEST:
1389: case BT_CR:
1390: case BT_LF:
1391: case BT_SEMI:
1392: case BT_EXCL:
1393: case BT_AST:
1394: case BT_PERCNT:
1395: case BT_NUM:
1396: #ifdef XML_NS
1397: case BT_COLON:
1398: #endif
1399: break;
1400: case BT_S:
1401: if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1402: *badPtr = ptr;
1403: return 0;
1404: }
1405: break;
1406: case BT_NAME:
1407: case BT_NMSTRT:
1408: if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1409: break;
1410: default:
1411: switch (BYTE_TO_ASCII(enc, ptr)) {
1412: case 0x24: /* $ */
1413: case 0x40: /* @ */
1414: break;
1415: default:
1416: *badPtr = ptr;
1417: return 0;
1418: }
1419: break;
1420: }
1421: }
1422: return 1;
1423: }
1424:
1425: /* This must only be called for a well-formed start-tag or empty
1426: element tag. Returns the number of attributes. Pointers to the
1427: first attsMax attributes are stored in atts.
1428: */
1429:
1430: static int PTRCALL
1431: PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1432: int attsMax, ATTRIBUTE *atts)
1433: {
1434: enum { other, inName, inValue } state = inName;
1435: int nAtts = 0;
1436: int open = 0; /* defined when state == inValue;
1437: initialization just to shut up compilers */
1438:
1439: for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1440: switch (BYTE_TYPE(enc, ptr)) {
1441: #define START_NAME \
1442: if (state == other) { \
1443: if (nAtts < attsMax) { \
1444: atts[nAtts].name = ptr; \
1445: atts[nAtts].normalized = 1; \
1446: } \
1447: state = inName; \
1448: }
1449: #define LEAD_CASE(n) \
1450: case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1451: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1452: #undef LEAD_CASE
1453: case BT_NONASCII:
1454: case BT_NMSTRT:
1455: case BT_HEX:
1456: START_NAME
1457: break;
1458: #undef START_NAME
1459: case BT_QUOT:
1460: if (state != inValue) {
1461: if (nAtts < attsMax)
1462: atts[nAtts].valuePtr = ptr + MINBPC(enc);
1463: state = inValue;
1464: open = BT_QUOT;
1465: }
1466: else if (open == BT_QUOT) {
1467: state = other;
1468: if (nAtts < attsMax)
1469: atts[nAtts].valueEnd = ptr;
1470: nAtts++;
1471: }
1472: break;
1473: case BT_APOS:
1474: if (state != inValue) {
1475: if (nAtts < attsMax)
1476: atts[nAtts].valuePtr = ptr + MINBPC(enc);
1477: state = inValue;
1478: open = BT_APOS;
1479: }
1480: else if (open == BT_APOS) {
1481: state = other;
1482: if (nAtts < attsMax)
1483: atts[nAtts].valueEnd = ptr;
1484: nAtts++;
1485: }
1486: break;
1487: case BT_AMP:
1488: if (nAtts < attsMax)
1489: atts[nAtts].normalized = 0;
1490: break;
1491: case BT_S:
1492: if (state == inName)
1493: state = other;
1494: else if (state == inValue
1495: && nAtts < attsMax
1496: && atts[nAtts].normalized
1497: && (ptr == atts[nAtts].valuePtr
1498: || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1499: || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1500: || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1501: atts[nAtts].normalized = 0;
1502: break;
1503: case BT_CR: case BT_LF:
1504: /* This case ensures that the first attribute name is counted
1505: Apart from that we could just change state on the quote. */
1506: if (state == inName)
1507: state = other;
1508: else if (state == inValue && nAtts < attsMax)
1509: atts[nAtts].normalized = 0;
1510: break;
1511: case BT_GT:
1512: case BT_SOL:
1513: if (state != inValue)
1514: return nAtts;
1515: break;
1516: default:
1517: break;
1518: }
1519: }
1520: /* not reached */
1521: }
1522:
1523: static int PTRFASTCALL
1524: PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1525: {
1526: int result = 0;
1527: /* skip &# */
1528: ptr += 2*MINBPC(enc);
1529: if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1530: for (ptr += MINBPC(enc);
1531: !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1532: ptr += MINBPC(enc)) {
1533: int c = BYTE_TO_ASCII(enc, ptr);
1534: switch (c) {
1535: case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1536: case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1537: result <<= 4;
1538: result |= (c - ASCII_0);
1539: break;
1540: case ASCII_A: case ASCII_B: case ASCII_C:
1541: case ASCII_D: case ASCII_E: case ASCII_F:
1542: result <<= 4;
1543: result += 10 + (c - ASCII_A);
1544: break;
1545: case ASCII_a: case ASCII_b: case ASCII_c:
1546: case ASCII_d: case ASCII_e: case ASCII_f:
1547: result <<= 4;
1548: result += 10 + (c - ASCII_a);
1549: break;
1550: }
1551: if (result >= 0x110000)
1552: return -1;
1553: }
1554: }
1555: else {
1556: for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1557: int c = BYTE_TO_ASCII(enc, ptr);
1558: result *= 10;
1559: result += (c - ASCII_0);
1560: if (result >= 0x110000)
1561: return -1;
1562: }
1563: }
1564: return checkCharRefNumber(result);
1565: }
1566:
1567: static int PTRCALL
1568: PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1569: const char *end)
1570: {
1571: switch ((end - ptr)/MINBPC(enc)) {
1572: case 2:
1573: if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1574: switch (BYTE_TO_ASCII(enc, ptr)) {
1575: case ASCII_l:
1576: return ASCII_LT;
1577: case ASCII_g:
1578: return ASCII_GT;
1579: }
1580: }
1581: break;
1582: case 3:
1583: if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1584: ptr += MINBPC(enc);
1585: if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1586: ptr += MINBPC(enc);
1587: if (CHAR_MATCHES(enc, ptr, ASCII_p))
1588: return ASCII_AMP;
1589: }
1590: }
1591: break;
1592: case 4:
1593: switch (BYTE_TO_ASCII(enc, ptr)) {
1594: case ASCII_q:
1595: ptr += MINBPC(enc);
1596: if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1597: ptr += MINBPC(enc);
1598: if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1599: ptr += MINBPC(enc);
1600: if (CHAR_MATCHES(enc, ptr, ASCII_t))
1601: return ASCII_QUOT;
1602: }
1603: }
1604: break;
1605: case ASCII_a:
1606: ptr += MINBPC(enc);
1607: if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1608: ptr += MINBPC(enc);
1609: if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1610: ptr += MINBPC(enc);
1611: if (CHAR_MATCHES(enc, ptr, ASCII_s))
1612: return ASCII_APOS;
1613: }
1614: }
1615: break;
1616: }
1617: }
1618: return 0;
1619: }
1620:
1621: static int PTRCALL
1622: PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1623: {
1624: for (;;) {
1625: switch (BYTE_TYPE(enc, ptr1)) {
1626: #define LEAD_CASE(n) \
1627: case BT_LEAD ## n: \
1628: if (*ptr1++ != *ptr2++) \
1629: return 0;
1630: LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1631: #undef LEAD_CASE
1632: /* fall through */
1633: if (*ptr1++ != *ptr2++)
1634: return 0;
1635: break;
1636: case BT_NONASCII:
1637: case BT_NMSTRT:
1638: #ifdef XML_NS
1639: case BT_COLON:
1640: #endif
1641: case BT_HEX:
1642: case BT_DIGIT:
1643: case BT_NAME:
1644: case BT_MINUS:
1645: if (*ptr2++ != *ptr1++)
1646: return 0;
1647: if (MINBPC(enc) > 1) {
1648: if (*ptr2++ != *ptr1++)
1649: return 0;
1650: if (MINBPC(enc) > 2) {
1651: if (*ptr2++ != *ptr1++)
1652: return 0;
1653: if (MINBPC(enc) > 3) {
1654: if (*ptr2++ != *ptr1++)
1655: return 0;
1656: }
1657: }
1658: }
1659: break;
1660: default:
1661: if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1662: return 1;
1663: switch (BYTE_TYPE(enc, ptr2)) {
1664: case BT_LEAD2:
1665: case BT_LEAD3:
1666: case BT_LEAD4:
1667: case BT_NONASCII:
1668: case BT_NMSTRT:
1669: #ifdef XML_NS
1670: case BT_COLON:
1671: #endif
1672: case BT_HEX:
1673: case BT_DIGIT:
1674: case BT_NAME:
1675: case BT_MINUS:
1676: return 0;
1677: default:
1678: return 1;
1679: }
1680: }
1681: }
1682: /* not reached */
1683: }
1684:
1685: static int PTRCALL
1686: PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1687: const char *end1, const char *ptr2)
1688: {
1689: for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1690: if (ptr1 == end1)
1691: return 0;
1692: if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1693: return 0;
1694: }
1695: return ptr1 == end1;
1696: }
1697:
1698: static int PTRFASTCALL
1699: PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1700: {
1701: const char *start = ptr;
1702: for (;;) {
1703: switch (BYTE_TYPE(enc, ptr)) {
1704: #define LEAD_CASE(n) \
1705: case BT_LEAD ## n: ptr += n; break;
1706: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1707: #undef LEAD_CASE
1708: case BT_NONASCII:
1709: case BT_NMSTRT:
1710: #ifdef XML_NS
1711: case BT_COLON:
1712: #endif
1713: case BT_HEX:
1714: case BT_DIGIT:
1715: case BT_NAME:
1716: case BT_MINUS:
1717: ptr += MINBPC(enc);
1718: break;
1719: default:
1720: return (int)(ptr - start);
1721: }
1722: }
1723: }
1724:
1725: static const char * PTRFASTCALL
1726: PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1727: {
1728: for (;;) {
1729: switch (BYTE_TYPE(enc, ptr)) {
1730: case BT_LF:
1731: case BT_CR:
1732: case BT_S:
1733: ptr += MINBPC(enc);
1734: break;
1735: default:
1736: return ptr;
1737: }
1738: }
1739: }
1740:
1741: static void PTRCALL
1742: PREFIX(updatePosition)(const ENCODING *enc,
1743: const char *ptr,
1744: const char *end,
1745: POSITION *pos)
1746: {
1.1.1.1.2.1! misho 1747: while (ptr < end) {
1.1 misho 1748: switch (BYTE_TYPE(enc, ptr)) {
1749: #define LEAD_CASE(n) \
1750: case BT_LEAD ## n: \
1751: ptr += n; \
1752: break;
1753: LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1754: #undef LEAD_CASE
1755: case BT_LF:
1756: pos->columnNumber = (XML_Size)-1;
1757: pos->lineNumber++;
1758: ptr += MINBPC(enc);
1759: break;
1760: case BT_CR:
1761: pos->lineNumber++;
1762: ptr += MINBPC(enc);
1763: if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1764: ptr += MINBPC(enc);
1765: pos->columnNumber = (XML_Size)-1;
1766: break;
1767: default:
1768: ptr += MINBPC(enc);
1769: break;
1770: }
1771: pos->columnNumber++;
1772: }
1773: }
1774:
1775: #undef DO_LEAD_CASE
1776: #undef MULTIBYTE_CASES
1777: #undef INVALID_CASES
1778: #undef CHECK_NAME_CASE
1779: #undef CHECK_NAME_CASES
1780: #undef CHECK_NMSTRT_CASE
1781: #undef CHECK_NMSTRT_CASES
1782:
1783: #endif /* XML_TOK_IMPL_C */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>