File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / expat / lib / xmltok_impl.c
Revision 1.1.1.2 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 16:09:43 2014 UTC (10 years ago) by misho
Branches: expat, MAIN
CVS tags: v2_1_0, HEAD
expat 2.1.0

    1: /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
    2:    See the file COPYING for copying permission.
    3: */
    4: 
    5: /* This file is included! */
    6: #ifdef XML_TOK_IMPL_C
    7: 
/* Fallback used when the including file has not supplied IS_INVALID_CHAR:
   no n-byte sequence is treated as invalid. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif

/* Expands to the `case BT_LEADn:` arm of a switch on a byte type.
   - fewer than n bytes left before `end`  -> return XML_TOK_PARTIAL_CHAR
   - IS_INVALID_CHAR accepts the sequence as invalid
                                           -> *nextTokPtr = ptr,
                                              return XML_TOK_INVALID
   - otherwise                             -> skip the n bytes and break
   Note that `enc` and `end` are not macro parameters; they are taken from
   the scope in which the macro is expanded. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - ptr < n) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      ptr += n; \
      break;

/* Switch arms that reject malformed input: the 2-, 3- and 4-byte lead arms
   above, followed by BT_NONXML, BT_MALFORM and BT_TRAIL, which always
   yield XML_TOK_INVALID with *nextTokPtr set to ptr. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;

/* `case BT_LEADn:` arm for a character inside a name.  Returns
   XML_TOK_PARTIAL_CHAR if fewer than n bytes remain, XML_TOK_INVALID (with
   *nextTokPtr = ptr) if IS_NAME_CHAR rejects the sequence; otherwise
   advances ptr by n and breaks out of the switch. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NAME_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;

/* All switch arms that consume one name character.
   BT_NONASCII is first checked with IS_NAME_CHAR_MINBPC and then
   deliberately falls through into the single-unit arms (BT_NMSTRT, BT_HEX,
   BT_DIGIT, BT_NAME, BT_MINUS), which advance ptr by MINBPC(enc).  The
   multi-byte lead arms follow.  Every accepting arm ends in `break`, so
   the enclosing switch is left with ptr on the next character. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)

/* Same as CHECK_NAME_CASE, but validates with IS_NMSTRT_CHAR, i.e. for the
   first character of a name. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;

/* All switch arms that consume one name-start character.  Only BT_NMSTRT
   and BT_HEX are accepted as single-unit starts; BT_NONASCII is checked
   with IS_NMSTRT_CHAR_MINBPC and then deliberately falls through into
   them.  Digits, BT_NAME and BT_MINUS are not listed here, so they reach
   the `default:` arm that callers place after this macro. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)

/* PREFIX decorates every function name defined below.  When the including
   file has not defined it, names are used undecorated. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif
   89: 
   90: /* ptr points to character following "<!-" */
   91: 
   92: static int PTRCALL
   93: PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
   94:                     const char *end, const char **nextTokPtr)
   95: {
   96:   if (ptr != end) {
   97:     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
   98:       *nextTokPtr = ptr;
   99:       return XML_TOK_INVALID;
  100:     }
  101:     ptr += MINBPC(enc);
  102:     while (ptr != end) {
  103:       switch (BYTE_TYPE(enc, ptr)) {
  104:       INVALID_CASES(ptr, nextTokPtr)
  105:       case BT_MINUS:
  106:         if ((ptr += MINBPC(enc)) == end)
  107:           return XML_TOK_PARTIAL;
  108:         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
  109:           if ((ptr += MINBPC(enc)) == end)
  110:             return XML_TOK_PARTIAL;
  111:           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  112:             *nextTokPtr = ptr;
  113:             return XML_TOK_INVALID;
  114:           }
  115:           *nextTokPtr = ptr + MINBPC(enc);
  116:           return XML_TOK_COMMENT;
  117:         }
  118:         break;
  119:       default:
  120:         ptr += MINBPC(enc);
  121:         break;
  122:       }
  123:     }
  124:   }
  125:   return XML_TOK_PARTIAL;
  126: }
  127: 
  128: /* ptr points to character following "<!" */
  129: 
  130: static int PTRCALL
  131: PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
  132:                  const char *end, const char **nextTokPtr)
  133: {
  134:   if (ptr == end)
  135:     return XML_TOK_PARTIAL;
  136:   switch (BYTE_TYPE(enc, ptr)) {
  137:   case BT_MINUS:
  138:     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  139:   case BT_LSQB:
  140:     *nextTokPtr = ptr + MINBPC(enc);
  141:     return XML_TOK_COND_SECT_OPEN;
  142:   case BT_NMSTRT:
  143:   case BT_HEX:
  144:     ptr += MINBPC(enc);
  145:     break;
  146:   default:
  147:     *nextTokPtr = ptr;
  148:     return XML_TOK_INVALID;
  149:   }
  150:   while (ptr != end) {
  151:     switch (BYTE_TYPE(enc, ptr)) {
  152:     case BT_PERCNT:
  153:       if (ptr + MINBPC(enc) == end)
  154:         return XML_TOK_PARTIAL;
  155:       /* don't allow <!ENTITY% foo "whatever"> */
  156:       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
  157:       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
  158:         *nextTokPtr = ptr;
  159:         return XML_TOK_INVALID;
  160:       }
  161:       /* fall through */
  162:     case BT_S: case BT_CR: case BT_LF:
  163:       *nextTokPtr = ptr;
  164:       return XML_TOK_DECL_OPEN;
  165:     case BT_NMSTRT:
  166:     case BT_HEX:
  167:       ptr += MINBPC(enc);
  168:       break;
  169:     default:
  170:       *nextTokPtr = ptr;
  171:       return XML_TOK_INVALID;
  172:     }
  173:   }
  174:   return XML_TOK_PARTIAL;
  175: }
  176: 
  177: static int PTRCALL
  178: PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
  179:                       const char *end, int *tokPtr)
  180: {
  181:   int upper = 0;
  182:   *tokPtr = XML_TOK_PI;
  183:   if (end - ptr != MINBPC(enc)*3)
  184:     return 1;
  185:   switch (BYTE_TO_ASCII(enc, ptr)) {
  186:   case ASCII_x:
  187:     break;
  188:   case ASCII_X:
  189:     upper = 1;
  190:     break;
  191:   default:
  192:     return 1;
  193:   }
  194:   ptr += MINBPC(enc);
  195:   switch (BYTE_TO_ASCII(enc, ptr)) {
  196:   case ASCII_m:
  197:     break;
  198:   case ASCII_M:
  199:     upper = 1;
  200:     break;
  201:   default:
  202:     return 1;
  203:   }
  204:   ptr += MINBPC(enc);
  205:   switch (BYTE_TO_ASCII(enc, ptr)) {
  206:   case ASCII_l:
  207:     break;
  208:   case ASCII_L:
  209:     upper = 1;
  210:     break;
  211:   default:
  212:     return 1;
  213:   }
  214:   if (upper)
  215:     return 0;
  216:   *tokPtr = XML_TOK_XML_DECL;
  217:   return 1;
  218: }
  219: 
  220: /* ptr points to character following "<?" */
  221: 
  222: static int PTRCALL
  223: PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
  224:                const char *end, const char **nextTokPtr)
  225: {
  226:   int tok;
  227:   const char *target = ptr;
  228:   if (ptr == end)
  229:     return XML_TOK_PARTIAL;
  230:   switch (BYTE_TYPE(enc, ptr)) {
  231:   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  232:   default:
  233:     *nextTokPtr = ptr;
  234:     return XML_TOK_INVALID;
  235:   }
  236:   while (ptr != end) {
  237:     switch (BYTE_TYPE(enc, ptr)) {
  238:     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  239:     case BT_S: case BT_CR: case BT_LF:
  240:       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  241:         *nextTokPtr = ptr;
  242:         return XML_TOK_INVALID;
  243:       }
  244:       ptr += MINBPC(enc);
  245:       while (ptr != end) {
  246:         switch (BYTE_TYPE(enc, ptr)) {
  247:         INVALID_CASES(ptr, nextTokPtr)
  248:         case BT_QUEST:
  249:           ptr += MINBPC(enc);
  250:           if (ptr == end)
  251:             return XML_TOK_PARTIAL;
  252:           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  253:             *nextTokPtr = ptr + MINBPC(enc);
  254:             return tok;
  255:           }
  256:           break;
  257:         default:
  258:           ptr += MINBPC(enc);
  259:           break;
  260:         }
  261:       }
  262:       return XML_TOK_PARTIAL;
  263:     case BT_QUEST:
  264:       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
  265:         *nextTokPtr = ptr;
  266:         return XML_TOK_INVALID;
  267:       }
  268:       ptr += MINBPC(enc);
  269:       if (ptr == end)
  270:         return XML_TOK_PARTIAL;
  271:       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  272:         *nextTokPtr = ptr + MINBPC(enc);
  273:         return tok;
  274:       }
  275:       /* fall through */
  276:     default:
  277:       *nextTokPtr = ptr;
  278:       return XML_TOK_INVALID;
  279:     }
  280:   }
  281:   return XML_TOK_PARTIAL;
  282: }
  283: 
  284: static int PTRCALL
  285: PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
  286:                          const char *end, const char **nextTokPtr)
  287: {
  288:   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
  289:                                      ASCII_T, ASCII_A, ASCII_LSQB };
  290:   int i;
  291:   /* CDATA[ */
  292:   if (end - ptr < 6 * MINBPC(enc))
  293:     return XML_TOK_PARTIAL;
  294:   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
  295:     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
  296:       *nextTokPtr = ptr;
  297:       return XML_TOK_INVALID;
  298:     }
  299:   }
  300:   *nextTokPtr = ptr;
  301:   return XML_TOK_CDATA_SECT_OPEN;
  302: }
  303: 
  304: static int PTRCALL
  305: PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
  306:                         const char *end, const char **nextTokPtr)
  307: {
  308:   if (ptr == end)
  309:     return XML_TOK_NONE;
  310:   if (MINBPC(enc) > 1) {
  311:     size_t n = end - ptr;
  312:     if (n & (MINBPC(enc) - 1)) {
  313:       n &= ~(MINBPC(enc) - 1);
  314:       if (n == 0)
  315:         return XML_TOK_PARTIAL;
  316:       end = ptr + n;
  317:     }
  318:   }
  319:   switch (BYTE_TYPE(enc, ptr)) {
  320:   case BT_RSQB:
  321:     ptr += MINBPC(enc);
  322:     if (ptr == end)
  323:       return XML_TOK_PARTIAL;
  324:     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  325:       break;
  326:     ptr += MINBPC(enc);
  327:     if (ptr == end)
  328:       return XML_TOK_PARTIAL;
  329:     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  330:       ptr -= MINBPC(enc);
  331:       break;
  332:     }
  333:     *nextTokPtr = ptr + MINBPC(enc);
  334:     return XML_TOK_CDATA_SECT_CLOSE;
  335:   case BT_CR:
  336:     ptr += MINBPC(enc);
  337:     if (ptr == end)
  338:       return XML_TOK_PARTIAL;
  339:     if (BYTE_TYPE(enc, ptr) == BT_LF)
  340:       ptr += MINBPC(enc);
  341:     *nextTokPtr = ptr;
  342:     return XML_TOK_DATA_NEWLINE;
  343:   case BT_LF:
  344:     *nextTokPtr = ptr + MINBPC(enc);
  345:     return XML_TOK_DATA_NEWLINE;
  346:   INVALID_CASES(ptr, nextTokPtr)
  347:   default:
  348:     ptr += MINBPC(enc);
  349:     break;
  350:   }
  351:   while (ptr != end) {
  352:     switch (BYTE_TYPE(enc, ptr)) {
  353: #define LEAD_CASE(n) \
  354:     case BT_LEAD ## n: \
  355:       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  356:         *nextTokPtr = ptr; \
  357:         return XML_TOK_DATA_CHARS; \
  358:       } \
  359:       ptr += n; \
  360:       break;
  361:     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  362: #undef LEAD_CASE
  363:     case BT_NONXML:
  364:     case BT_MALFORM:
  365:     case BT_TRAIL:
  366:     case BT_CR:
  367:     case BT_LF:
  368:     case BT_RSQB:
  369:       *nextTokPtr = ptr;
  370:       return XML_TOK_DATA_CHARS;
  371:     default:
  372:       ptr += MINBPC(enc);
  373:       break;
  374:     }
  375:   }
  376:   *nextTokPtr = ptr;
  377:   return XML_TOK_DATA_CHARS;
  378: }
  379: 
  380: /* ptr points to character following "</" */
  381: 
  382: static int PTRCALL
  383: PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
  384:                    const char *end, const char **nextTokPtr)
  385: {
  386:   if (ptr == end)
  387:     return XML_TOK_PARTIAL;
  388:   switch (BYTE_TYPE(enc, ptr)) {
  389:   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  390:   default:
  391:     *nextTokPtr = ptr;
  392:     return XML_TOK_INVALID;
  393:   }
  394:   while (ptr != end) {
  395:     switch (BYTE_TYPE(enc, ptr)) {
  396:     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  397:     case BT_S: case BT_CR: case BT_LF:
  398:       for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  399:         switch (BYTE_TYPE(enc, ptr)) {
  400:         case BT_S: case BT_CR: case BT_LF:
  401:           break;
  402:         case BT_GT:
  403:           *nextTokPtr = ptr + MINBPC(enc);
  404:           return XML_TOK_END_TAG;
  405:         default:
  406:           *nextTokPtr = ptr;
  407:           return XML_TOK_INVALID;
  408:         }
  409:       }
  410:       return XML_TOK_PARTIAL;
  411: #ifdef XML_NS
  412:     case BT_COLON:
  413:       /* no need to check qname syntax here,
  414:          since end-tag must match exactly */
  415:       ptr += MINBPC(enc);
  416:       break;
  417: #endif
  418:     case BT_GT:
  419:       *nextTokPtr = ptr + MINBPC(enc);
  420:       return XML_TOK_END_TAG;
  421:     default:
  422:       *nextTokPtr = ptr;
  423:       return XML_TOK_INVALID;
  424:     }
  425:   }
  426:   return XML_TOK_PARTIAL;
  427: }
  428: 
  429: /* ptr points to character following "&#X" */
  430: 
  431: static int PTRCALL
  432: PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
  433:                        const char *end, const char **nextTokPtr)
  434: {
  435:   if (ptr != end) {
  436:     switch (BYTE_TYPE(enc, ptr)) {
  437:     case BT_DIGIT:
  438:     case BT_HEX:
  439:       break;
  440:     default:
  441:       *nextTokPtr = ptr;
  442:       return XML_TOK_INVALID;
  443:     }
  444:     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  445:       switch (BYTE_TYPE(enc, ptr)) {
  446:       case BT_DIGIT:
  447:       case BT_HEX:
  448:         break;
  449:       case BT_SEMI:
  450:         *nextTokPtr = ptr + MINBPC(enc);
  451:         return XML_TOK_CHAR_REF;
  452:       default:
  453:         *nextTokPtr = ptr;
  454:         return XML_TOK_INVALID;
  455:       }
  456:     }
  457:   }
  458:   return XML_TOK_PARTIAL;
  459: }
  460: 
  461: /* ptr points to character following "&#" */
  462: 
  463: static int PTRCALL
  464: PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
  465:                     const char *end, const char **nextTokPtr)
  466: {
  467:   if (ptr != end) {
  468:     if (CHAR_MATCHES(enc, ptr, ASCII_x))
  469:       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  470:     switch (BYTE_TYPE(enc, ptr)) {
  471:     case BT_DIGIT:
  472:       break;
  473:     default:
  474:       *nextTokPtr = ptr;
  475:       return XML_TOK_INVALID;
  476:     }
  477:     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
  478:       switch (BYTE_TYPE(enc, ptr)) {
  479:       case BT_DIGIT:
  480:         break;
  481:       case BT_SEMI:
  482:         *nextTokPtr = ptr + MINBPC(enc);
  483:         return XML_TOK_CHAR_REF;
  484:       default:
  485:         *nextTokPtr = ptr;
  486:         return XML_TOK_INVALID;
  487:       }
  488:     }
  489:   }
  490:   return XML_TOK_PARTIAL;
  491: }
  492: 
  493: /* ptr points to character following "&" */
  494: 
  495: static int PTRCALL
  496: PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
  497:                 const char **nextTokPtr)
  498: {
  499:   if (ptr == end)
  500:     return XML_TOK_PARTIAL;
  501:   switch (BYTE_TYPE(enc, ptr)) {
  502:   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  503:   case BT_NUM:
  504:     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  505:   default:
  506:     *nextTokPtr = ptr;
  507:     return XML_TOK_INVALID;
  508:   }
  509:   while (ptr != end) {
  510:     switch (BYTE_TYPE(enc, ptr)) {
  511:     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  512:     case BT_SEMI:
  513:       *nextTokPtr = ptr + MINBPC(enc);
  514:       return XML_TOK_ENTITY_REF;
  515:     default:
  516:       *nextTokPtr = ptr;
  517:       return XML_TOK_INVALID;
  518:     }
  519:   }
  520:   return XML_TOK_PARTIAL;
  521: }
  522: 
/* Scan the attributes of a start tag through to the end of the tag.
   ptr points to character following first character of attribute name.

   Returns
     XML_TOK_START_TAG_WITH_ATTS      - tag closed by '>'
     XML_TOK_EMPTY_ELEMENT_WITH_ATTS  - tag closed by "/>"
   with *nextTokPtr just past the '>'; XML_TOK_INVALID with *nextTokPtr at
   the offending character; XML_TOK_PARTIAL when the input ends first.  A
   non-positive result from scanRef inside an attribute value is returned
   unchanged. */

static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once a ':' has been seen in the current attribute name */
  int hadColon = 0;
#endif
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      /* whitespace after the name: skip it; the next non-space
         character must be '=' */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
    /* fall through */
    case BT_EQUALS:
      {
        /* byte type of the opening quote (BT_QUOT or BT_APOS); the value
           ends at the next character of the same type */
        int open;
#ifdef XML_NS
        hadColon = 0;
#endif
        /* skip whitespace between '=' and the opening quote */
        for (;;) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          open = BYTE_TYPE(enc, ptr);
          if (open == BT_QUOT || open == BT_APOS)
            break;
          switch (open) {
          case BT_S:
          case BT_LF:
          case BT_CR:
            break;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
        }
        ptr += MINBPC(enc);
        /* in attribute value */
        for (;;) {
          int t;
          if (ptr == end)
            return XML_TOK_PARTIAL;
          t = BYTE_TYPE(enc, ptr);
          if (t == open)
            break;
          switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
          case BT_AMP:
            {
              /* scanRef stores its end position directly into ptr, so on
                 success scanning resumes after the reference */
              int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
              if (tok <= 0) {
                /* only an INVALID result publishes the error position */
                if (tok == XML_TOK_INVALID)
                  *nextTokPtr = ptr;
                return tok;
              }
              break;
            }
          case BT_LT:
            /* '<' is not allowed inside an attribute value */
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          default:
            ptr += MINBPC(enc);
            break;
          }
        }
        /* step over the closing quote and look at what follows it */
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S:
        case BT_CR:
        case BT_LF:
          break;
        case BT_SOL:
          goto sol;
        case BT_GT:
          goto gt;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        /* ptr points to the whitespace character after the closing quote */
        for (;;) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character begins the next attribute: it is
             consumed here, then the break statements below return
             control to the outer while loop */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_S: case BT_CR: case BT_LF:
            continue;
          case BT_GT:
          gt:
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_START_TAG_WITH_ATTS;
          case BT_SOL:
          sol:
            /* '/' must be immediately followed by '>' */
            ptr += MINBPC(enc);
            if (ptr == end)
              return XML_TOK_PARTIAL;
            if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
              *nextTokPtr = ptr;
              return XML_TOK_INVALID;
            }
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          break;
        }
        break;
      }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
  681: 
/* Scan markup in content that starts with '<'.
   ptr points to character following "<".

   "<!-" is passed to scanComment, "<![" to scanCdataSection, "<?" to
   scanPi and "</" to scanEndTag.  Otherwise a start tag is scanned:
     XML_TOK_START_TAG_NO_ATTS      - name followed by '>'
     XML_TOK_EMPTY_ELEMENT_NO_ATTS  - name followed by "/>"
   (both with *nextTokPtr just past the '>'), or the result of scanAtts
   when an attribute name follows the element name.  XML_TOK_INVALID is
   returned with *nextTokPtr at the offending character; XML_TOK_PARTIAL
   when the input ends first. */

static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once a ':' has been seen in the element name */
  int hadColon;
#endif
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  /* a name-start character: consumed here, then scanning continues with
     the start-tag loop below */
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    if ((ptr += MINBPC(enc)) == end)
      return XML_TOK_PARTIAL;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
                                      end, nextTokPtr);
    }
    /* "<!" followed by anything else is not allowed here */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#ifdef XML_NS
  hadColon = 0;
#endif
  /* we have a start-tag */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      {
        /* whitespace after the element name: skip it and see whether the
           tag ends or an attribute begins */
        ptr += MINBPC(enc);
        while (ptr != end) {
          switch (BYTE_TYPE(enc, ptr)) {
          /* first character of an attribute name: consumed here, then the
             break leaves the switch and scanAtts takes over below */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_GT:
            goto gt;
          case BT_SOL:
            goto sol;
          case BT_S: case BT_CR: case BT_LF:
            ptr += MINBPC(enc);
            continue;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
        }
        return XML_TOK_PARTIAL;
      }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      /* '/' must be immediately followed by '>' */
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
  783: 
  784: static int PTRCALL
  785: PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
  786:                    const char **nextTokPtr)
  787: {
  788:   if (ptr == end)
  789:     return XML_TOK_NONE;
  790:   if (MINBPC(enc) > 1) {
  791:     size_t n = end - ptr;
  792:     if (n & (MINBPC(enc) - 1)) {
  793:       n &= ~(MINBPC(enc) - 1);
  794:       if (n == 0)
  795:         return XML_TOK_PARTIAL;
  796:       end = ptr + n;
  797:     }
  798:   }
  799:   switch (BYTE_TYPE(enc, ptr)) {
  800:   case BT_LT:
  801:     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  802:   case BT_AMP:
  803:     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  804:   case BT_CR:
  805:     ptr += MINBPC(enc);
  806:     if (ptr == end)
  807:       return XML_TOK_TRAILING_CR;
  808:     if (BYTE_TYPE(enc, ptr) == BT_LF)
  809:       ptr += MINBPC(enc);
  810:     *nextTokPtr = ptr;
  811:     return XML_TOK_DATA_NEWLINE;
  812:   case BT_LF:
  813:     *nextTokPtr = ptr + MINBPC(enc);
  814:     return XML_TOK_DATA_NEWLINE;
  815:   case BT_RSQB:
  816:     ptr += MINBPC(enc);
  817:     if (ptr == end)
  818:       return XML_TOK_TRAILING_RSQB;
  819:     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
  820:       break;
  821:     ptr += MINBPC(enc);
  822:     if (ptr == end)
  823:       return XML_TOK_TRAILING_RSQB;
  824:     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
  825:       ptr -= MINBPC(enc);
  826:       break;
  827:     }
  828:     *nextTokPtr = ptr;
  829:     return XML_TOK_INVALID;
  830:   INVALID_CASES(ptr, nextTokPtr)
  831:   default:
  832:     ptr += MINBPC(enc);
  833:     break;
  834:   }
  835:   while (ptr != end) {
  836:     switch (BYTE_TYPE(enc, ptr)) {
  837: #define LEAD_CASE(n) \
  838:     case BT_LEAD ## n: \
  839:       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
  840:         *nextTokPtr = ptr; \
  841:         return XML_TOK_DATA_CHARS; \
  842:       } \
  843:       ptr += n; \
  844:       break;
  845:     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
  846: #undef LEAD_CASE
  847:     case BT_RSQB:
  848:       if (ptr + MINBPC(enc) != end) {
  849:          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
  850:            ptr += MINBPC(enc);
  851:            break;
  852:          }
  853:          if (ptr + 2*MINBPC(enc) != end) {
  854:            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
  855:              ptr += MINBPC(enc);
  856:              break;
  857:            }
  858:            *nextTokPtr = ptr + 2*MINBPC(enc);
  859:            return XML_TOK_INVALID;
  860:          }
  861:       }
  862:       /* fall through */
  863:     case BT_AMP:
  864:     case BT_LT:
  865:     case BT_NONXML:
  866:     case BT_MALFORM:
  867:     case BT_TRAIL:
  868:     case BT_CR:
  869:     case BT_LF:
  870:       *nextTokPtr = ptr;
  871:       return XML_TOK_DATA_CHARS;
  872:     default:
  873:       ptr += MINBPC(enc);
  874:       break;
  875:     }
  876:   }
  877:   *nextTokPtr = ptr;
  878:   return XML_TOK_DATA_CHARS;
  879: }
  880: 
  881: /* ptr points to character following "%" */
  882: 
  883: static int PTRCALL
  884: PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
  885:                     const char **nextTokPtr)
  886: {
  887:   if (ptr == end)
  888:     return XML_TOK_PARTIAL;
  889:   switch (BYTE_TYPE(enc, ptr)) {
  890:   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  891:   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
  892:     *nextTokPtr = ptr;
  893:     return XML_TOK_PERCENT;
  894:   default:
  895:     *nextTokPtr = ptr;
  896:     return XML_TOK_INVALID;
  897:   }
  898:   while (ptr != end) {
  899:     switch (BYTE_TYPE(enc, ptr)) {
  900:     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  901:     case BT_SEMI:
  902:       *nextTokPtr = ptr + MINBPC(enc);
  903:       return XML_TOK_PARAM_ENTITY_REF;
  904:     default:
  905:       *nextTokPtr = ptr;
  906:       return XML_TOK_INVALID;
  907:     }
  908:   }
  909:   return XML_TOK_PARTIAL;
  910: }
  911: 
  912: static int PTRCALL
  913: PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
  914:                       const char **nextTokPtr)
  915: {
  916:   if (ptr == end)
  917:     return XML_TOK_PARTIAL;
  918:   switch (BYTE_TYPE(enc, ptr)) {
  919:   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  920:   default:
  921:     *nextTokPtr = ptr;
  922:     return XML_TOK_INVALID;
  923:   }
  924:   while (ptr != end) {
  925:     switch (BYTE_TYPE(enc, ptr)) {
  926:     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
  927:     case BT_CR: case BT_LF: case BT_S:
  928:     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
  929:       *nextTokPtr = ptr;
  930:       return XML_TOK_POUND_NAME;
  931:     default:
  932:       *nextTokPtr = ptr;
  933:       return XML_TOK_INVALID;
  934:     }
  935:   }
  936:   return -XML_TOK_POUND_NAME;
  937: }
  938: 
  939: static int PTRCALL
  940: PREFIX(scanLit)(int open, const ENCODING *enc,
  941:                 const char *ptr, const char *end,
  942:                 const char **nextTokPtr)
  943: {
  944:   while (ptr != end) {
  945:     int t = BYTE_TYPE(enc, ptr);
  946:     switch (t) {
  947:     INVALID_CASES(ptr, nextTokPtr)
  948:     case BT_QUOT:
  949:     case BT_APOS:
  950:       ptr += MINBPC(enc);
  951:       if (t != open)
  952:         break;
  953:       if (ptr == end)
  954:         return -XML_TOK_LITERAL;
  955:       *nextTokPtr = ptr;
  956:       switch (BYTE_TYPE(enc, ptr)) {
  957:       case BT_S: case BT_CR: case BT_LF:
  958:       case BT_GT: case BT_PERCNT: case BT_LSQB:
  959:         return XML_TOK_LITERAL;
  960:       default:
  961:         return XML_TOK_INVALID;
  962:       }
  963:     default:
  964:       ptr += MINBPC(enc);
  965:       break;
  966:     }
  967:   }
  968:   return XML_TOK_PARTIAL;
  969: }
  970: 
  971: static int PTRCALL
  972: PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
  973:                   const char **nextTokPtr)
  974: {
  975:   int tok;
  976:   if (ptr == end)
  977:     return XML_TOK_NONE;
  978:   if (MINBPC(enc) > 1) {
  979:     size_t n = end - ptr;
  980:     if (n & (MINBPC(enc) - 1)) {
  981:       n &= ~(MINBPC(enc) - 1);
  982:       if (n == 0)
  983:         return XML_TOK_PARTIAL;
  984:       end = ptr + n;
  985:     }
  986:   }
  987:   switch (BYTE_TYPE(enc, ptr)) {
  988:   case BT_QUOT:
  989:     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  990:   case BT_APOS:
  991:     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  992:   case BT_LT:
  993:     {
  994:       ptr += MINBPC(enc);
  995:       if (ptr == end)
  996:         return XML_TOK_PARTIAL;
  997:       switch (BYTE_TYPE(enc, ptr)) {
  998:       case BT_EXCL:
  999:         return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 1000:       case BT_QUEST:
 1001:         return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 1002:       case BT_NMSTRT:
 1003:       case BT_HEX:
 1004:       case BT_NONASCII:
 1005:       case BT_LEAD2:
 1006:       case BT_LEAD3:
 1007:       case BT_LEAD4:
 1008:         *nextTokPtr = ptr - MINBPC(enc);
 1009:         return XML_TOK_INSTANCE_START;
 1010:       }
 1011:       *nextTokPtr = ptr;
 1012:       return XML_TOK_INVALID;
 1013:     }
 1014:   case BT_CR:
 1015:     if (ptr + MINBPC(enc) == end) {
 1016:       *nextTokPtr = end;
 1017:       /* indicate that this might be part of a CR/LF pair */
 1018:       return -XML_TOK_PROLOG_S;
 1019:     }
 1020:     /* fall through */
 1021:   case BT_S: case BT_LF:
 1022:     for (;;) {
 1023:       ptr += MINBPC(enc);
 1024:       if (ptr == end)
 1025:         break;
 1026:       switch (BYTE_TYPE(enc, ptr)) {
 1027:       case BT_S: case BT_LF:
 1028:         break;
 1029:       case BT_CR:
 1030:         /* don't split CR/LF pair */
 1031:         if (ptr + MINBPC(enc) != end)
 1032:           break;
 1033:         /* fall through */
 1034:       default:
 1035:         *nextTokPtr = ptr;
 1036:         return XML_TOK_PROLOG_S;
 1037:       }
 1038:     }
 1039:     *nextTokPtr = ptr;
 1040:     return XML_TOK_PROLOG_S;
 1041:   case BT_PERCNT:
 1042:     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 1043:   case BT_COMMA:
 1044:     *nextTokPtr = ptr + MINBPC(enc);
 1045:     return XML_TOK_COMMA;
 1046:   case BT_LSQB:
 1047:     *nextTokPtr = ptr + MINBPC(enc);
 1048:     return XML_TOK_OPEN_BRACKET;
 1049:   case BT_RSQB:
 1050:     ptr += MINBPC(enc);
 1051:     if (ptr == end)
 1052:       return -XML_TOK_CLOSE_BRACKET;
 1053:     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
 1054:       if (ptr + MINBPC(enc) == end)
 1055:         return XML_TOK_PARTIAL;
 1056:       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
 1057:         *nextTokPtr = ptr + 2*MINBPC(enc);
 1058:         return XML_TOK_COND_SECT_CLOSE;
 1059:       }
 1060:     }
 1061:     *nextTokPtr = ptr;
 1062:     return XML_TOK_CLOSE_BRACKET;
 1063:   case BT_LPAR:
 1064:     *nextTokPtr = ptr + MINBPC(enc);
 1065:     return XML_TOK_OPEN_PAREN;
 1066:   case BT_RPAR:
 1067:     ptr += MINBPC(enc);
 1068:     if (ptr == end)
 1069:       return -XML_TOK_CLOSE_PAREN;
 1070:     switch (BYTE_TYPE(enc, ptr)) {
 1071:     case BT_AST:
 1072:       *nextTokPtr = ptr + MINBPC(enc);
 1073:       return XML_TOK_CLOSE_PAREN_ASTERISK;
 1074:     case BT_QUEST:
 1075:       *nextTokPtr = ptr + MINBPC(enc);
 1076:       return XML_TOK_CLOSE_PAREN_QUESTION;
 1077:     case BT_PLUS:
 1078:       *nextTokPtr = ptr + MINBPC(enc);
 1079:       return XML_TOK_CLOSE_PAREN_PLUS;
 1080:     case BT_CR: case BT_LF: case BT_S:
 1081:     case BT_GT: case BT_COMMA: case BT_VERBAR:
 1082:     case BT_RPAR:
 1083:       *nextTokPtr = ptr;
 1084:       return XML_TOK_CLOSE_PAREN;
 1085:     }
 1086:     *nextTokPtr = ptr;
 1087:     return XML_TOK_INVALID;
 1088:   case BT_VERBAR:
 1089:     *nextTokPtr = ptr + MINBPC(enc);
 1090:     return XML_TOK_OR;
 1091:   case BT_GT:
 1092:     *nextTokPtr = ptr + MINBPC(enc);
 1093:     return XML_TOK_DECL_CLOSE;
 1094:   case BT_NUM:
 1095:     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 1096: #define LEAD_CASE(n) \
 1097:   case BT_LEAD ## n: \
 1098:     if (end - ptr < n) \
 1099:       return XML_TOK_PARTIAL_CHAR; \
 1100:     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
 1101:       ptr += n; \
 1102:       tok = XML_TOK_NAME; \
 1103:       break; \
 1104:     } \
 1105:     if (IS_NAME_CHAR(enc, ptr, n)) { \
 1106:       ptr += n; \
 1107:       tok = XML_TOK_NMTOKEN; \
 1108:       break; \
 1109:     } \
 1110:     *nextTokPtr = ptr; \
 1111:     return XML_TOK_INVALID;
 1112:     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 1113: #undef LEAD_CASE
 1114:   case BT_NMSTRT:
 1115:   case BT_HEX:
 1116:     tok = XML_TOK_NAME;
 1117:     ptr += MINBPC(enc);
 1118:     break;
 1119:   case BT_DIGIT:
 1120:   case BT_NAME:
 1121:   case BT_MINUS:
 1122: #ifdef XML_NS
 1123:   case BT_COLON:
 1124: #endif
 1125:     tok = XML_TOK_NMTOKEN;
 1126:     ptr += MINBPC(enc);
 1127:     break;
 1128:   case BT_NONASCII:
 1129:     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
 1130:       ptr += MINBPC(enc);
 1131:       tok = XML_TOK_NAME;
 1132:       break;
 1133:     }
 1134:     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
 1135:       ptr += MINBPC(enc);
 1136:       tok = XML_TOK_NMTOKEN;
 1137:       break;
 1138:     }
 1139:     /* fall through */
 1140:   default:
 1141:     *nextTokPtr = ptr;
 1142:     return XML_TOK_INVALID;
 1143:   }
 1144:   while (ptr != end) {
 1145:     switch (BYTE_TYPE(enc, ptr)) {
 1146:     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
 1147:     case BT_GT: case BT_RPAR: case BT_COMMA:
 1148:     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
 1149:     case BT_S: case BT_CR: case BT_LF:
 1150:       *nextTokPtr = ptr;
 1151:       return tok;
 1152: #ifdef XML_NS
 1153:     case BT_COLON:
 1154:       ptr += MINBPC(enc);
 1155:       switch (tok) {
 1156:       case XML_TOK_NAME:
 1157:         if (ptr == end)
 1158:           return XML_TOK_PARTIAL;
 1159:         tok = XML_TOK_PREFIXED_NAME;
 1160:         switch (BYTE_TYPE(enc, ptr)) {
 1161:         CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
 1162:         default:
 1163:           tok = XML_TOK_NMTOKEN;
 1164:           break;
 1165:         }
 1166:         break;
 1167:       case XML_TOK_PREFIXED_NAME:
 1168:         tok = XML_TOK_NMTOKEN;
 1169:         break;
 1170:       }
 1171:       break;
 1172: #endif
 1173:     case BT_PLUS:
 1174:       if (tok == XML_TOK_NMTOKEN)  {
 1175:         *nextTokPtr = ptr;
 1176:         return XML_TOK_INVALID;
 1177:       }
 1178:       *nextTokPtr = ptr + MINBPC(enc);
 1179:       return XML_TOK_NAME_PLUS;
 1180:     case BT_AST:
 1181:       if (tok == XML_TOK_NMTOKEN)  {
 1182:         *nextTokPtr = ptr;
 1183:         return XML_TOK_INVALID;
 1184:       }
 1185:       *nextTokPtr = ptr + MINBPC(enc);
 1186:       return XML_TOK_NAME_ASTERISK;
 1187:     case BT_QUEST:
 1188:       if (tok == XML_TOK_NMTOKEN)  {
 1189:         *nextTokPtr = ptr;
 1190:         return XML_TOK_INVALID;
 1191:       }
 1192:       *nextTokPtr = ptr + MINBPC(enc);
 1193:       return XML_TOK_NAME_QUESTION;
 1194:     default:
 1195:       *nextTokPtr = ptr;
 1196:       return XML_TOK_INVALID;
 1197:     }
 1198:   }
 1199:   return -tok;
 1200: }
 1201: 
 1202: static int PTRCALL
 1203: PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
 1204:                           const char *end, const char **nextTokPtr)
 1205: {
 1206:   const char *start;
 1207:   if (ptr == end)
 1208:     return XML_TOK_NONE;
 1209:   start = ptr;
 1210:   while (ptr != end) {
 1211:     switch (BYTE_TYPE(enc, ptr)) {
 1212: #define LEAD_CASE(n) \
 1213:     case BT_LEAD ## n: ptr += n; break;
 1214:     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 1215: #undef LEAD_CASE
 1216:     case BT_AMP:
 1217:       if (ptr == start)
 1218:         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 1219:       *nextTokPtr = ptr;
 1220:       return XML_TOK_DATA_CHARS;
 1221:     case BT_LT:
 1222:       /* this is for inside entity references */
 1223:       *nextTokPtr = ptr;
 1224:       return XML_TOK_INVALID;
 1225:     case BT_LF:
 1226:       if (ptr == start) {
 1227:         *nextTokPtr = ptr + MINBPC(enc);
 1228:         return XML_TOK_DATA_NEWLINE;
 1229:       }
 1230:       *nextTokPtr = ptr;
 1231:       return XML_TOK_DATA_CHARS;
 1232:     case BT_CR:
 1233:       if (ptr == start) {
 1234:         ptr += MINBPC(enc);
 1235:         if (ptr == end)
 1236:           return XML_TOK_TRAILING_CR;
 1237:         if (BYTE_TYPE(enc, ptr) == BT_LF)
 1238:           ptr += MINBPC(enc);
 1239:         *nextTokPtr = ptr;
 1240:         return XML_TOK_DATA_NEWLINE;
 1241:       }
 1242:       *nextTokPtr = ptr;
 1243:       return XML_TOK_DATA_CHARS;
 1244:     case BT_S:
 1245:       if (ptr == start) {
 1246:         *nextTokPtr = ptr + MINBPC(enc);
 1247:         return XML_TOK_ATTRIBUTE_VALUE_S;
 1248:       }
 1249:       *nextTokPtr = ptr;
 1250:       return XML_TOK_DATA_CHARS;
 1251:     default:
 1252:       ptr += MINBPC(enc);
 1253:       break;
 1254:     }
 1255:   }
 1256:   *nextTokPtr = ptr;
 1257:   return XML_TOK_DATA_CHARS;
 1258: }
 1259: 
 1260: static int PTRCALL
 1261: PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
 1262:                        const char *end, const char **nextTokPtr)
 1263: {
 1264:   const char *start;
 1265:   if (ptr == end)
 1266:     return XML_TOK_NONE;
 1267:   start = ptr;
 1268:   while (ptr != end) {
 1269:     switch (BYTE_TYPE(enc, ptr)) {
 1270: #define LEAD_CASE(n) \
 1271:     case BT_LEAD ## n: ptr += n; break;
 1272:     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 1273: #undef LEAD_CASE
 1274:     case BT_AMP:
 1275:       if (ptr == start)
 1276:         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
 1277:       *nextTokPtr = ptr;
 1278:       return XML_TOK_DATA_CHARS;
 1279:     case BT_PERCNT:
 1280:       if (ptr == start) {
 1281:         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
 1282:                                        end, nextTokPtr);
 1283:         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
 1284:       }
 1285:       *nextTokPtr = ptr;
 1286:       return XML_TOK_DATA_CHARS;
 1287:     case BT_LF:
 1288:       if (ptr == start) {
 1289:         *nextTokPtr = ptr + MINBPC(enc);
 1290:         return XML_TOK_DATA_NEWLINE;
 1291:       }
 1292:       *nextTokPtr = ptr;
 1293:       return XML_TOK_DATA_CHARS;
 1294:     case BT_CR:
 1295:       if (ptr == start) {
 1296:         ptr += MINBPC(enc);
 1297:         if (ptr == end)
 1298:           return XML_TOK_TRAILING_CR;
 1299:         if (BYTE_TYPE(enc, ptr) == BT_LF)
 1300:           ptr += MINBPC(enc);
 1301:         *nextTokPtr = ptr;
 1302:         return XML_TOK_DATA_NEWLINE;
 1303:       }
 1304:       *nextTokPtr = ptr;
 1305:       return XML_TOK_DATA_CHARS;
 1306:     default:
 1307:       ptr += MINBPC(enc);
 1308:       break;
 1309:     }
 1310:   }
 1311:   *nextTokPtr = ptr;
 1312:   return XML_TOK_DATA_CHARS;
 1313: }
 1314: 
 1315: #ifdef XML_DTD
 1316: 
 1317: static int PTRCALL
 1318: PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
 1319:                          const char *end, const char **nextTokPtr)
 1320: {
 1321:   int level = 0;
 1322:   if (MINBPC(enc) > 1) {
 1323:     size_t n = end - ptr;
 1324:     if (n & (MINBPC(enc) - 1)) {
 1325:       n &= ~(MINBPC(enc) - 1);
 1326:       end = ptr + n;
 1327:     }
 1328:   }
 1329:   while (ptr != end) {
 1330:     switch (BYTE_TYPE(enc, ptr)) {
 1331:     INVALID_CASES(ptr, nextTokPtr)
 1332:     case BT_LT:
 1333:       if ((ptr += MINBPC(enc)) == end)
 1334:         return XML_TOK_PARTIAL;
 1335:       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
 1336:         if ((ptr += MINBPC(enc)) == end)
 1337:           return XML_TOK_PARTIAL;
 1338:         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
 1339:           ++level;
 1340:           ptr += MINBPC(enc);
 1341:         }
 1342:       }
 1343:       break;
 1344:     case BT_RSQB:
 1345:       if ((ptr += MINBPC(enc)) == end)
 1346:         return XML_TOK_PARTIAL;
 1347:       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
 1348:         if ((ptr += MINBPC(enc)) == end)
 1349:           return XML_TOK_PARTIAL;
 1350:         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
 1351:           ptr += MINBPC(enc);
 1352:           if (level == 0) {
 1353:             *nextTokPtr = ptr;
 1354:             return XML_TOK_IGNORE_SECT;
 1355:           }
 1356:           --level;
 1357:         }
 1358:       }
 1359:       break;
 1360:     default:
 1361:       ptr += MINBPC(enc);
 1362:       break;
 1363:     }
 1364:   }
 1365:   return XML_TOK_PARTIAL;
 1366: }
 1367: 
 1368: #endif /* XML_DTD */
 1369: 
 1370: static int PTRCALL
 1371: PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
 1372:                    const char **badPtr)
 1373: {
 1374:   ptr += MINBPC(enc);
 1375:   end -= MINBPC(enc);
 1376:   for (; ptr != end; ptr += MINBPC(enc)) {
 1377:     switch (BYTE_TYPE(enc, ptr)) {
 1378:     case BT_DIGIT:
 1379:     case BT_HEX:
 1380:     case BT_MINUS:
 1381:     case BT_APOS:
 1382:     case BT_LPAR:
 1383:     case BT_RPAR:
 1384:     case BT_PLUS:
 1385:     case BT_COMMA:
 1386:     case BT_SOL:
 1387:     case BT_EQUALS:
 1388:     case BT_QUEST:
 1389:     case BT_CR:
 1390:     case BT_LF:
 1391:     case BT_SEMI:
 1392:     case BT_EXCL:
 1393:     case BT_AST:
 1394:     case BT_PERCNT:
 1395:     case BT_NUM:
 1396: #ifdef XML_NS
 1397:     case BT_COLON:
 1398: #endif
 1399:       break;
 1400:     case BT_S:
 1401:       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
 1402:         *badPtr = ptr;
 1403:         return 0;
 1404:       }
 1405:       break;
 1406:     case BT_NAME:
 1407:     case BT_NMSTRT:
 1408:       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
 1409:         break;
 1410:     default:
 1411:       switch (BYTE_TO_ASCII(enc, ptr)) {
 1412:       case 0x24: /* $ */
 1413:       case 0x40: /* @ */
 1414:         break;
 1415:       default:
 1416:         *badPtr = ptr;
 1417:         return 0;
 1418:       }
 1419:       break;
 1420:     }
 1421:   }
 1422:   return 1;
 1423: }
 1424: 
 1425: /* This must only be called for a well-formed start-tag or empty
 1426:    element tag.  Returns the number of attributes.  Pointers to the
 1427:    first attsMax attributes are stored in atts.
 1428: */
 1429: 
 1430: static int PTRCALL
 1431: PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
 1432:                 int attsMax, ATTRIBUTE *atts)
 1433: {
 1434:   enum { other, inName, inValue } state = inName;
 1435:   int nAtts = 0;
 1436:   int open = 0; /* defined when state == inValue;
 1437:                    initialization just to shut up compilers */
 1438: 
 1439:   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
 1440:     switch (BYTE_TYPE(enc, ptr)) {
 1441: #define START_NAME \
 1442:       if (state == other) { \
 1443:         if (nAtts < attsMax) { \
 1444:           atts[nAtts].name = ptr; \
 1445:           atts[nAtts].normalized = 1; \
 1446:         } \
 1447:         state = inName; \
 1448:       }
 1449: #define LEAD_CASE(n) \
 1450:     case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
 1451:     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 1452: #undef LEAD_CASE
 1453:     case BT_NONASCII:
 1454:     case BT_NMSTRT:
 1455:     case BT_HEX:
 1456:       START_NAME
 1457:       break;
 1458: #undef START_NAME
 1459:     case BT_QUOT:
 1460:       if (state != inValue) {
 1461:         if (nAtts < attsMax)
 1462:           atts[nAtts].valuePtr = ptr + MINBPC(enc);
 1463:         state = inValue;
 1464:         open = BT_QUOT;
 1465:       }
 1466:       else if (open == BT_QUOT) {
 1467:         state = other;
 1468:         if (nAtts < attsMax)
 1469:           atts[nAtts].valueEnd = ptr;
 1470:         nAtts++;
 1471:       }
 1472:       break;
 1473:     case BT_APOS:
 1474:       if (state != inValue) {
 1475:         if (nAtts < attsMax)
 1476:           atts[nAtts].valuePtr = ptr + MINBPC(enc);
 1477:         state = inValue;
 1478:         open = BT_APOS;
 1479:       }
 1480:       else if (open == BT_APOS) {
 1481:         state = other;
 1482:         if (nAtts < attsMax)
 1483:           atts[nAtts].valueEnd = ptr;
 1484:         nAtts++;
 1485:       }
 1486:       break;
 1487:     case BT_AMP:
 1488:       if (nAtts < attsMax)
 1489:         atts[nAtts].normalized = 0;
 1490:       break;
 1491:     case BT_S:
 1492:       if (state == inName)
 1493:         state = other;
 1494:       else if (state == inValue
 1495:                && nAtts < attsMax
 1496:                && atts[nAtts].normalized
 1497:                && (ptr == atts[nAtts].valuePtr
 1498:                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
 1499:                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
 1500:                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
 1501:         atts[nAtts].normalized = 0;
 1502:       break;
 1503:     case BT_CR: case BT_LF:
 1504:       /* This case ensures that the first attribute name is counted
 1505:          Apart from that we could just change state on the quote. */
 1506:       if (state == inName)
 1507:         state = other;
 1508:       else if (state == inValue && nAtts < attsMax)
 1509:         atts[nAtts].normalized = 0;
 1510:       break;
 1511:     case BT_GT:
 1512:     case BT_SOL:
 1513:       if (state != inValue)
 1514:         return nAtts;
 1515:       break;
 1516:     default:
 1517:       break;
 1518:     }
 1519:   }
 1520:   /* not reached */
 1521: }
 1522: 
 1523: static int PTRFASTCALL
 1524: PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
 1525: {
 1526:   int result = 0;
 1527:   /* skip &# */
 1528:   ptr += 2*MINBPC(enc);
 1529:   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
 1530:     for (ptr += MINBPC(enc);
 1531:          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
 1532:          ptr += MINBPC(enc)) {
 1533:       int c = BYTE_TO_ASCII(enc, ptr);
 1534:       switch (c) {
 1535:       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
 1536:       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
 1537:         result <<= 4;
 1538:         result |= (c - ASCII_0);
 1539:         break;
 1540:       case ASCII_A: case ASCII_B: case ASCII_C:
 1541:       case ASCII_D: case ASCII_E: case ASCII_F:
 1542:         result <<= 4;
 1543:         result += 10 + (c - ASCII_A);
 1544:         break;
 1545:       case ASCII_a: case ASCII_b: case ASCII_c:
 1546:       case ASCII_d: case ASCII_e: case ASCII_f:
 1547:         result <<= 4;
 1548:         result += 10 + (c - ASCII_a);
 1549:         break;
 1550:       }
 1551:       if (result >= 0x110000)
 1552:         return -1;
 1553:     }
 1554:   }
 1555:   else {
 1556:     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
 1557:       int c = BYTE_TO_ASCII(enc, ptr);
 1558:       result *= 10;
 1559:       result += (c - ASCII_0);
 1560:       if (result >= 0x110000)
 1561:         return -1;
 1562:     }
 1563:   }
 1564:   return checkCharRefNumber(result);
 1565: }
 1566: 
 1567: static int PTRCALL
 1568: PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
 1569:                              const char *end)
 1570: {
 1571:   switch ((end - ptr)/MINBPC(enc)) {
 1572:   case 2:
 1573:     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
 1574:       switch (BYTE_TO_ASCII(enc, ptr)) {
 1575:       case ASCII_l:
 1576:         return ASCII_LT;
 1577:       case ASCII_g:
 1578:         return ASCII_GT;
 1579:       }
 1580:     }
 1581:     break;
 1582:   case 3:
 1583:     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
 1584:       ptr += MINBPC(enc);
 1585:       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
 1586:         ptr += MINBPC(enc);
 1587:         if (CHAR_MATCHES(enc, ptr, ASCII_p))
 1588:           return ASCII_AMP;
 1589:       }
 1590:     }
 1591:     break;
 1592:   case 4:
 1593:     switch (BYTE_TO_ASCII(enc, ptr)) {
 1594:     case ASCII_q:
 1595:       ptr += MINBPC(enc);
 1596:       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
 1597:         ptr += MINBPC(enc);
 1598:         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
 1599:           ptr += MINBPC(enc);
 1600:           if (CHAR_MATCHES(enc, ptr, ASCII_t))
 1601:             return ASCII_QUOT;
 1602:         }
 1603:       }
 1604:       break;
 1605:     case ASCII_a:
 1606:       ptr += MINBPC(enc);
 1607:       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
 1608:         ptr += MINBPC(enc);
 1609:         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
 1610:           ptr += MINBPC(enc);
 1611:           if (CHAR_MATCHES(enc, ptr, ASCII_s))
 1612:             return ASCII_APOS;
 1613:         }
 1614:       }
 1615:       break;
 1616:     }
 1617:   }
 1618:   return 0;
 1619: }
 1620: 
 1621: static int PTRCALL
 1622: PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
 1623: {
 1624:   for (;;) {
 1625:     switch (BYTE_TYPE(enc, ptr1)) {
 1626: #define LEAD_CASE(n) \
 1627:     case BT_LEAD ## n: \
 1628:       if (*ptr1++ != *ptr2++) \
 1629:         return 0;
 1630:     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
 1631: #undef LEAD_CASE
 1632:       /* fall through */
 1633:       if (*ptr1++ != *ptr2++)
 1634:         return 0;
 1635:       break;
 1636:     case BT_NONASCII:
 1637:     case BT_NMSTRT:
 1638: #ifdef XML_NS
 1639:     case BT_COLON:
 1640: #endif
 1641:     case BT_HEX:
 1642:     case BT_DIGIT:
 1643:     case BT_NAME:
 1644:     case BT_MINUS:
 1645:       if (*ptr2++ != *ptr1++)
 1646:         return 0;
 1647:       if (MINBPC(enc) > 1) {
 1648:         if (*ptr2++ != *ptr1++)
 1649:           return 0;
 1650:         if (MINBPC(enc) > 2) {
 1651:           if (*ptr2++ != *ptr1++)
 1652:             return 0;
 1653:           if (MINBPC(enc) > 3) {
 1654:             if (*ptr2++ != *ptr1++)
 1655:               return 0;
 1656:           }
 1657:         }
 1658:       }
 1659:       break;
 1660:     default:
 1661:       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
 1662:         return 1;
 1663:       switch (BYTE_TYPE(enc, ptr2)) {
 1664:       case BT_LEAD2:
 1665:       case BT_LEAD3:
 1666:       case BT_LEAD4:
 1667:       case BT_NONASCII:
 1668:       case BT_NMSTRT:
 1669: #ifdef XML_NS
 1670:       case BT_COLON:
 1671: #endif
 1672:       case BT_HEX:
 1673:       case BT_DIGIT:
 1674:       case BT_NAME:
 1675:       case BT_MINUS:
 1676:         return 0;
 1677:       default:
 1678:         return 1;
 1679:       }
 1680:     }
 1681:   }
 1682:   /* not reached */
 1683: }
 1684: 
 1685: static int PTRCALL
 1686: PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
 1687:                          const char *end1, const char *ptr2)
 1688: {
 1689:   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
 1690:     if (ptr1 == end1)
 1691:       return 0;
 1692:     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
 1693:       return 0;
 1694:   }
 1695:   return ptr1 == end1;
 1696: }
 1697: 
 1698: static int PTRFASTCALL
 1699: PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
 1700: {
 1701:   const char *start = ptr;
 1702:   for (;;) {
 1703:     switch (BYTE_TYPE(enc, ptr)) {
 1704: #define LEAD_CASE(n) \
 1705:     case BT_LEAD ## n: ptr += n; break;
 1706:     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 1707: #undef LEAD_CASE
 1708:     case BT_NONASCII:
 1709:     case BT_NMSTRT:
 1710: #ifdef XML_NS
 1711:     case BT_COLON:
 1712: #endif
 1713:     case BT_HEX:
 1714:     case BT_DIGIT:
 1715:     case BT_NAME:
 1716:     case BT_MINUS:
 1717:       ptr += MINBPC(enc);
 1718:       break;
 1719:     default:
 1720:       return (int)(ptr - start);
 1721:     }
 1722:   }
 1723: }
 1724: 
 1725: static const char * PTRFASTCALL
 1726: PREFIX(skipS)(const ENCODING *enc, const char *ptr)
 1727: {
 1728:   for (;;) {
 1729:     switch (BYTE_TYPE(enc, ptr)) {
 1730:     case BT_LF:
 1731:     case BT_CR:
 1732:     case BT_S:
 1733:       ptr += MINBPC(enc);
 1734:       break;
 1735:     default:
 1736:       return ptr;
 1737:     }
 1738:   }
 1739: }
 1740: 
 1741: static void PTRCALL
 1742: PREFIX(updatePosition)(const ENCODING *enc,
 1743:                        const char *ptr,
 1744:                        const char *end,
 1745:                        POSITION *pos)
 1746: {
 1747:   while (ptr < end) {
 1748:     switch (BYTE_TYPE(enc, ptr)) {
 1749: #define LEAD_CASE(n) \
 1750:     case BT_LEAD ## n: \
 1751:       ptr += n; \
 1752:       break;
 1753:     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 1754: #undef LEAD_CASE
 1755:     case BT_LF:
 1756:       pos->columnNumber = (XML_Size)-1;
 1757:       pos->lineNumber++;
 1758:       ptr += MINBPC(enc);
 1759:       break;
 1760:     case BT_CR:
 1761:       pos->lineNumber++;
 1762:       ptr += MINBPC(enc);
 1763:       if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
 1764:         ptr += MINBPC(enc);
 1765:       pos->columnNumber = (XML_Size)-1;
 1766:       break;
 1767:     default:
 1768:       ptr += MINBPC(enc);
 1769:       break;
 1770:     }
 1771:     pos->columnNumber++;
 1772:   }
 1773: }
 1774: 
 1775: #undef DO_LEAD_CASE
 1776: #undef MULTIBYTE_CASES
 1777: #undef INVALID_CASES
 1778: #undef CHECK_NAME_CASE
 1779: #undef CHECK_NAME_CASES
 1780: #undef CHECK_NMSTRT_CASE
 1781: #undef CHECK_NMSTRT_CASES
 1782: 
 1783: #endif /* XML_TOK_IMPL_C */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>