embedaddon/libxml2/HTMLparser.c - annotate

Return to HTMLparser.c CVS log
Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2
Annotation of embedaddon/libxml2/HTMLparser.c, revision 1.1.1.3

1.1       misho       1: /*
                      2:  * HTMLparser.c : an HTML 4.0 non-verifying parser
                      3:  *
                      4:  * See Copyright for the status of this software.
                      5:  *
                      6:  * daniel@veillard.com
                      7:  */
                      8: 
                      9: #define IN_LIBXML
                     10: #include "libxml.h"
                     11: #ifdef LIBXML_HTML_ENABLED
                     12: 
                     13: #include <string.h>
                     14: #ifdef HAVE_CTYPE_H
                     15: #include <ctype.h>
                     16: #endif
                     17: #ifdef HAVE_STDLIB_H
                     18: #include <stdlib.h>
                     19: #endif
                     20: #ifdef HAVE_SYS_STAT_H
                     21: #include <sys/stat.h>
                     22: #endif
                     23: #ifdef HAVE_FCNTL_H
                     24: #include <fcntl.h>
                     25: #endif
                     26: #ifdef HAVE_UNISTD_H
                     27: #include <unistd.h>
                     28: #endif
                     29: #ifdef HAVE_ZLIB_H
                     30: #include <zlib.h>
                     31: #endif
                     32: 
                     33: #include <libxml/xmlmemory.h>
                     34: #include <libxml/tree.h>
                     35: #include <libxml/parser.h>
                     36: #include <libxml/parserInternals.h>
                     37: #include <libxml/xmlerror.h>
                     38: #include <libxml/HTMLparser.h>
                     39: #include <libxml/HTMLtree.h>
                     40: #include <libxml/entities.h>
                     41: #include <libxml/encoding.h>
                     42: #include <libxml/valid.h>
                     43: #include <libxml/xmlIO.h>
                     44: #include <libxml/globals.h>
                     45: #include <libxml/uri.h>
                     46: 
1.1.1.3 ! misho      47: #include "buf.h"
        !            48: #include "enc.h"
        !            49: 
1.1       misho      50: #define HTML_MAX_NAMELEN 1000
                     51: #define HTML_PARSER_BIG_BUFFER_SIZE 1000
                     52: #define HTML_PARSER_BUFFER_SIZE 100
                     53: 
                         /*
                          * NOTE(review): the three size limits above are not referenced in the
                          * part of the file shown here; presumably they bound name and text
                          * buffers further down - confirm at the use sites.
                          */
                     54: /* #define DEBUG */
                     55: /* #define DEBUG_PUSH */
                     56: 
                         /*
                          * File-local flag, initialised to 1. It is only defined here; the
                          * code reading it lies outside this section.
                          */
                     57: static int htmlOmittedDefaultValue = 1;
                     58: 
                         /*
                          * Forward declarations. htmlDecodeEntities is declared without
                          * static and therefore has external linkage; htmlParseComment is
                          * private to this file.
                          */
                     59: xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                     60:                             xmlChar end, xmlChar  end2, xmlChar end3);
                     61: static void htmlParseComment(htmlParserCtxtPtr ctxt);
                     62: 
                     63: /************************************************************************
                     64:  *                                                                     *
                     65:  *             Some factorized error routines                          *
                     66:  *                                                                     *
                     67:  ************************************************************************/
                     68: 
                     69: /**
                     70:  * htmlErrMemory:
                     71:  * @ctxt:  an HTML parser context
                     72:  * @extra:  extra informations
                     73:  *
                     74:  * Handle a redefinition of attribute error
                     75:  */
                     76: static void
                     77: htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
                     78: {
                     79:     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
                     80:         (ctxt->instate == XML_PARSER_EOF))
                     81:        return;
                     82:     if (ctxt != NULL) {
                     83:         ctxt->errNo = XML_ERR_NO_MEMORY;
                     84:         ctxt->instate = XML_PARSER_EOF;
                     85:         ctxt->disableSAX = 1;
                     86:     }
                     87:     if (extra)
                     88:         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                     89:                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
                     90:                         NULL, NULL, 0, 0,
                     91:                         "Memory allocation failed : %s\n", extra);
                     92:     else
                     93:         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                     94:                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
                     95:                         NULL, NULL, 0, 0, "Memory allocation failed\n");
                     96: }
                     97: 
                     98: /**
                     99:  * htmlParseErr:
                    100:  * @ctxt:  an HTML parser context
                    101:  * @error:  the error number
                    102:  * @msg:  the error message
                    103:  * @str1:  string infor
                    104:  * @str2:  string infor
                    105:  *
                    106:  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
                    107:  */
                    108: static void
                    109: htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
                    110:              const char *msg, const xmlChar *str1, const xmlChar *str2)
                    111: {
                    112:     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
                    113:         (ctxt->instate == XML_PARSER_EOF))
                    114:        return;
                    115:     if (ctxt != NULL)
                    116:        ctxt->errNo = error;
                    117:     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
                    118:                     XML_ERR_ERROR, NULL, 0,
                    119:                    (const char *) str1, (const char *) str2,
                    120:                    NULL, 0, 0,
                    121:                    msg, str1, str2);
                    122:     if (ctxt != NULL)
                    123:        ctxt->wellFormed = 0;
                    124: }
                    125: 
                    126: /**
                    127:  * htmlParseErrInt:
                    128:  * @ctxt:  an HTML parser context
                    129:  * @error:  the error number
                    130:  * @msg:  the error message
                    131:  * @val:  integer info
                    132:  *
                    133:  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
                    134:  */
                    135: static void
                    136: htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
                    137:              const char *msg, int val)
                    138: {
                    139:     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
                    140:         (ctxt->instate == XML_PARSER_EOF))
                    141:        return;
                    142:     if (ctxt != NULL)
                    143:        ctxt->errNo = error;
                    144:     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
                    145:                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
                    146:                    NULL, val, 0, msg, val);
                    147:     if (ctxt != NULL)
                    148:        ctxt->wellFormed = 0;
                    149: }
                    150: 
                    151: /************************************************************************
                    152:  *                                                                     *
                    153:  *     Parser stacks related functions and macros              *
                    154:  *                                                                     *
                    155:  ************************************************************************/
                    156: 
                    157: /**
                    158:  * htmlnamePush:
                    159:  * @ctxt:  an HTML parser context
                    160:  * @value:  the element name
                    161:  *
                    162:  * Pushes a new element name on top of the name stack
                    163:  *
                    164:  * Returns 0 in case of error, the index in the stack otherwise
                    165:  */
                    166: static int
                    167: htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
                    168: {
                    169:     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
                    170:         ctxt->html = 3;
                    171:     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
                    172:         ctxt->html = 10;
                    173:     if (ctxt->nameNr >= ctxt->nameMax) {
                    174:         ctxt->nameMax *= 2;
                    175:         ctxt->nameTab = (const xmlChar * *)
                    176:                          xmlRealloc((xmlChar * *)ctxt->nameTab,
                    177:                                     ctxt->nameMax *
                    178:                                     sizeof(ctxt->nameTab[0]));
                    179:         if (ctxt->nameTab == NULL) {
                    180:             htmlErrMemory(ctxt, NULL);
                    181:             return (0);
                    182:         }
                    183:     }
                    184:     ctxt->nameTab[ctxt->nameNr] = value;
                    185:     ctxt->name = value;
                    186:     return (ctxt->nameNr++);
                    187: }
                    188: /**
                    189:  * htmlnamePop:
                    190:  * @ctxt: an HTML parser context
                    191:  *
                    192:  * Pops the top element name from the name stack
                    193:  *
                    194:  * Returns the name just removed
                    195:  */
                    196: static const xmlChar *
                    197: htmlnamePop(htmlParserCtxtPtr ctxt)
                    198: {
                    199:     const xmlChar *ret;
                    200: 
                    201:     if (ctxt->nameNr <= 0)
                    202:         return (NULL);
                    203:     ctxt->nameNr--;
                    204:     if (ctxt->nameNr < 0)
                    205:         return (NULL);
                    206:     if (ctxt->nameNr > 0)
                    207:         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
                    208:     else
                    209:         ctxt->name = NULL;
                    210:     ret = ctxt->nameTab[ctxt->nameNr];
                    211:     ctxt->nameTab[ctxt->nameNr] = NULL;
                    212:     return (ret);
                    213: }
                    214: 
                    215: /**
                    216:  * htmlNodeInfoPush:
                    217:  * @ctxt:  an HTML parser context
                    218:  * @value:  the node info
                    219:  *
                    220:  * Pushes a new element name on top of the node info stack
                    221:  *
                    222:  * Returns 0 in case of error, the index in the stack otherwise
                    223:  */
                    224: static int
                    225: htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
                    226: {
                    227:     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
                    228:         if (ctxt->nodeInfoMax == 0)
                    229:                 ctxt->nodeInfoMax = 5;
                    230:         ctxt->nodeInfoMax *= 2;
                    231:         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
                    232:                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
                    233:                                     ctxt->nodeInfoMax *
                    234:                                     sizeof(ctxt->nodeInfoTab[0]));
                    235:         if (ctxt->nodeInfoTab == NULL) {
                    236:             htmlErrMemory(ctxt, NULL);
                    237:             return (0);
                    238:         }
                    239:     }
                    240:     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
                    241:     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
                    242:     return (ctxt->nodeInfoNr++);
                    243: }
                    244: 
                    245: /**
                    246:  * htmlNodeInfoPop:
                    247:  * @ctxt:  an HTML parser context
                    248:  *
                    249:  * Pops the top element name from the node info stack
                    250:  *
                    251:  * Returns 0 in case of error, the pointer to NodeInfo otherwise
                    252:  */
                    253: static htmlParserNodeInfo *
                    254: htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
                    255: {
                    256:     if (ctxt->nodeInfoNr <= 0)
                    257:         return (NULL);
                    258:     ctxt->nodeInfoNr--;
                    259:     if (ctxt->nodeInfoNr < 0)
                    260:         return (NULL);
                    261:     if (ctxt->nodeInfoNr > 0)
                    262:         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
                    263:     else
                    264:         ctxt->nodeInfo = NULL;
                    265:     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
                    266: }
                    267: 
                    268: /*
                    269:  * Macros for accessing the content. Those should be used only by the parser,
                    270:  * and not exported.
                    271:  *
                    272:  * Dirty macros, i.e. one needs to make assumptions about the context to use them
                    273:  *
                    274:  *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
                    275:  *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
                    276:  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
                    277:  *           in UNICODE mode. This should be used internally by the parser
                    278:  *           only to compare to ASCII values otherwise it would break when
                    279:  *           running with UTF-8 encoding.
                    280:  *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
                    281:  *           to compare on ASCII based substring.
                    282:  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
                    283:  *           it should be used only to compare on ASCII based substring.
                         *   UPPER   returns the current xmlChar converted to uppercase, i.e. the
                         *           same value as UPP(0).
                    284:  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
                    285:  *           strings without newlines within the parser.
                    286:  *
                    287:  * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
                    288:  *
                    289:  *   CURRENT Returns the current char value, with the full decoding of
                    290:  *           UTF-8 if we are using this mode. It returns an int.
                         *           NOTE(review): as defined below, CURRENT only reads the single
                         *           byte at the cursor and casts it to int, exactly like CUR; no
                         *           UTF-8 decoding happens in the macro itself.
                    291:  *   NEXT    Skip to the next character, this does the proper decoding
                    292:  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
                    293:  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
                    294:  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
                         *           NOTE(review): no COPY macro is defined in the group below.
                    295:  */
                    296: 
                    297: #define UPPER (toupper(*ctxt->input->cur))
                    298: 
                         /*
                          * SKIP is a comma expression: it advances the char counter, the
                          * cursor and the column by the same amount. The line number is
                          * never touched, hence the no-newline restriction stated above.
                          */
                    299: #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
                    300: 
                    301: #define NXT(val) ctxt->input->cur[(val)]
                    302: 
                    303: #define UPP(val) (toupper(ctxt->input->cur[(val)]))
                    304: 
                    305: #define CUR_PTR ctxt->input->cur
                    306: 
                         /*
                          * SHRINK and GROW each expand to a bare if statement, without
                          * braces or a do/while wrapper. An else written right after either
                          * macro would therefore bind to the if inside the macro.
                          *
                          * SHRINK calls xmlParserInputShrink() once more than 2 * INPUT_CHUNK
                          * bytes lie behind the cursor and fewer than 2 * INPUT_CHUNK remain
                          * ahead of it.
                          */
                    307: #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                    308:                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
                    309:        xmlParserInputShrink(ctxt->input)
                    310: 
                         /*
                          * GROW asks for INPUT_CHUNK more bytes when fewer than INPUT_CHUNK
                          * remain ahead of the cursor. It does nothing while
                          * ctxt->progressive is non-zero.
                          */
                    311: #define GROW if ((ctxt->progressive == 0) &&                           \
                    312:                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
                    313:        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
                    314: 
                    315: #define CURRENT ((int) (*ctxt->input->cur))
                    316: 
                    317: #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
                    318: 
                    319: /* Imported from XML */
                    320: 
                    321: /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
                    322: #define CUR ((int) (*ctxt->input->cur))
                    323: #define NEXT xmlNextChar(ctxt)
                    324: 
                         /* RAW yields -1 while ctxt->token is set, else the byte at the cursor. */
                    325: #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
                    326: 
                    327: 
                         /*
                          * NEXTL updates line and column from the byte under the cursor
                          * before moving: a newline bumps the line and resets the column to
                          * 1, anything else adds one column. It then clears ctxt->token,
                          * advances the cursor by l bytes and counts one char in nbChars
                          * whatever the value of l.
                          */
                    328: #define NEXTL(l) do {                                                  \
                    329:     if (*(ctxt->input->cur) == '\n') {                                 \
                    330:        ctxt->input->line++; ctxt->input->col = 1;                      \
                    331:     } else ctxt->input->col++;                                         \
                    332:     ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;           \
                    333:   } while (0)
                    334: 
                    335: /************
                    336:     \
                    337:     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);    \
                    338:     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
                    339:  ************/
                    340: 
                         /*
                          * CUR_CHAR and CUR_SCHAR take the name of an int variable: its
                          * address is passed down to receive the length of the char read.
                          */
                    341: #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
                    342: #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
                    343: 
                         /*
                          * COPY_BUF appends char v of encoded length l to buffer b at index
                          * i and advances i: a length of 1 is stored directly as one byte,
                          * anything else goes through xmlCopyChar(). Like SHRINK and GROW it
                          * expands to a bare if/else statement, so take the same care around
                          * an enclosing if/else.
                          */
                    344: #define COPY_BUF(l,b,i,v)                                              \
                    345:     if (l == 1) b[i++] = (xmlChar) v;                                  \
                    346:     else i += xmlCopyChar(l,&b[i],v)
                    347: 
                    348: /**
                    349:  * htmlFindEncoding:
                    350:  * @the HTML parser context
                    351:  *
                    352:  * Ty to find and encoding in the current data available in the input
                    353:  * buffer this is needed to try to switch to the proper encoding when
                    354:  * one face a character error.
                    355:  * That's an heuristic, since it's operating outside of parsing it could
                    356:  * try to use a meta which had been commented out, that's the reason it
                    357:  * should only be used in case of error, not as a default.
                    358:  *
                    359:  * Returns an encoding string or NULL if not found, the string need to
                    360:  *   be freed
                    361:  */
                    362: static xmlChar *
                    363: htmlFindEncoding(xmlParserCtxtPtr ctxt) {
                    364:     const xmlChar *start, *cur, *end;
                    365: 
                    366:     if ((ctxt == NULL) || (ctxt->input == NULL) ||
                    367:         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
                    368:         (ctxt->input->buf->encoder != NULL))
                    369:         return(NULL);
                    370:     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
                    371:         return(NULL);
                    372: 
                    373:     start = ctxt->input->cur;
                    374:     end = ctxt->input->end;
                    375:     /* we also expect the input buffer to be zero terminated */
                    376:     if (*end != 0)
                    377:         return(NULL);
                    378: 
                    379:     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
                    380:     if (cur == NULL)
                    381:         return(NULL);
                    382:     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
                    383:     if (cur == NULL)
                    384:         return(NULL);
                    385:     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
                    386:     if (cur == NULL)
                    387:         return(NULL);
                    388:     cur += 8;
                    389:     start = cur;
                    390:     while (((*cur >= 'A') && (*cur <= 'Z')) ||
                    391:            ((*cur >= 'a') && (*cur <= 'z')) ||
                    392:            ((*cur >= '0') && (*cur <= '9')) ||
                    393:            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
                    394:            cur++;
                    395:     if (cur == start)
                    396:         return(NULL);
                    397:     return(xmlStrndup(start, cur - start));
                    398: }
                    399: 
                    400: /**
                    401:  * htmlCurrentChar:
                    402:  * @ctxt:  the HTML parser context
                    403:  * @len:  pointer to the length of the char read
                    404:  *
                    405:  * The current char value, if using UTF-8 this may actually span multiple
                    406:  * bytes in the input buffer. Implement the end of line normalization:
                    407:  * 2.11 End-of-Line Handling
                    408:  * If the encoding is unspecified, in the case we find an ISO-Latin-1
                    409:  * char, then the encoding converter is plugged in automatically.
                    410:  *
                         * NOTE(review): no end of line normalization is visible in the body
                         * of this function - confirm whether that sentence still applies.
                         *
                         * @len is set to 0 when a pending token is returned and to the number
                         * of input bytes (1 to 4) when a char is read from the buffer. It is
                         * left untouched when the parser is already in the EOF state.
                         *
                    411:  * Returns the current char value and its length
                    412:  */
                    413: 
                    414: static int
                    415: htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
                             /* Parser already stopped: report 0 without writing to *len. */
                    416:     if (ctxt->instate == XML_PARSER_EOF)
                    417:        return(0);
                    418: 
                             /* A pending token takes precedence and consumes no input bytes. */
                    419:     if (ctxt->token != 0) {
                    420:        *len = 0;
                    421:        return(ctxt->token);
                    422:     }
                    423:     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
                    424:        /*
                    425:         * We are supposed to handle UTF8, check it's valid
                    426:         * From rfc2044: encoding of the Unicode values on UTF-8:
                    427:         *
                    428:         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
                    429:         * 0000 0000-0000 007F   0xxxxxxx
                    430:         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
                    431:         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
                    432:         *
                    433:         * Check for the 0x110000 limit too
                    434:         */
                    435:        const unsigned char *cur = ctxt->input->cur;
                    436:        unsigned char c;
                    437:        unsigned int val;
                    438: 
                    439:        c = *cur;
                    440:        if (c & 0x80) {
                                    /*
                                     * A NUL where a continuation byte is expected: ask for more
                                     * input, then reload the cursor. Presumably the grow may move
                                     * the buffer, hence the reload - confirm in
                                     * xmlParserInputGrow. The same pattern repeats for the third
                                     * and fourth bytes below.
                                     */
                    441:            if (cur[1] == 0) {
                    442:                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    443:                 cur = ctxt->input->cur;
                    444:             }
                                    /* Every continuation byte must look like 10xxxxxx. */
                    445:            if ((cur[1] & 0xc0) != 0x80)
                    446:                goto encoding_error;
                                    /* Lead byte 111xxxxx: the sequence is at least 3 bytes long. */
                    447:            if ((c & 0xe0) == 0xe0) {
                    448: 
                    449:                if (cur[2] == 0) {
                    450:                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    451:                     cur = ctxt->input->cur;
                    452:                 }
                    453:                if ((cur[2] & 0xc0) != 0x80)
                    454:                    goto encoding_error;
                                        /* Lead byte 1111xxxx: a 4-byte sequence is expected. */
                    455:                if ((c & 0xf0) == 0xf0) {
                    456:                    if (cur[3] == 0) {
                    457:                        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    458:                         cur = ctxt->input->cur;
                    459:                     }
                                            /*
                                             * The lead byte must be exactly 11110xxx; lead bytes
                                             * announcing longer sequences are rejected here.
                                             */
                    460:                    if (((c & 0xf8) != 0xf0) ||
                    461:                        ((cur[3] & 0xc0) != 0x80))
                    462:                        goto encoding_error;
                    463:                    /* 4-byte code */
                    464:                    *len = 4;
                    465:                    val = (cur[0] & 0x7) << 18;
                    466:                    val |= (cur[1] & 0x3f) << 12;
                    467:                    val |= (cur[2] & 0x3f) << 6;
                    468:                    val |= cur[3] & 0x3f;
                    469:                } else {
                    470:                  /* 3-byte code */
                    471:                    *len = 3;
                    472:                    val = (cur[0] & 0xf) << 12;
                    473:                    val |= (cur[1] & 0x3f) << 6;
                    474:                    val |= cur[2] & 0x3f;
                    475:                }
                    476:            } else {
                    477:              /* 2-byte code */
                    478:                *len = 2;
                    479:                val = (cur[0] & 0x1f) << 6;
                    480:                val |= cur[1] & 0x3f;
                    481:            }
                                    /*
                                     * An out of range value is reported but still returned to the
                                     * caller, with *len set to the decoded length.
                                     */
                    482:            if (!IS_CHAR(val)) {
                    483:                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                    484:                                "Char 0x%X out of allowed range\n", val);
                    485:            }
                    486:            return(val);
                    487:        } else {
                                    /*
                                     * A NUL byte located before the end of the buffered data is
                                     * reported as invalid and handed back as a single space.
                                     */
                    488:             if ((*ctxt->input->cur == 0) &&
                    489:                 (ctxt->input->cur < ctxt->input->end)) {
                    490:                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                    491:                                "Char 0x%X out of allowed range\n", 0);
                    492:                 *len = 1;
                    493:                 return(' ');
                    494:             }
                    495:            /* 1-byte code */
                    496:            *len = 1;
                    497:            return((int) *ctxt->input->cur);
                    498:        }
                    499:     }
                    500:     /*
                    501:      * Assume it's a fixed length encoding (1) with
                    502:      * a compatible encoding for the ASCII set, since
                    503:      * XML constructs only use < 128 chars
                    504:      */
                    505:     *len = 1;
                    506:     if ((int) *ctxt->input->cur < 0x80)
                    507:        return((int) *ctxt->input->cur);
                    508: 
                    509:     /*
                    510:      * Humm this is bad, do an automatic flow conversion
                    511:      */
                             /*
                              * A byte >= 0x80 was met while the charset is not UTF-8. Use the
                              * charset found by htmlFindEncoding() if there is one, otherwise
                              * fall back to ISO-8859-1. In both cases the context charset is
                              * then set to UTF-8 and the char is decoded by xmlCurrentChar().
                              */
                    512:     {
                    513:         xmlChar * guess;
                    514:         xmlCharEncodingHandlerPtr handler;
                    515: 
                    516:         guess = htmlFindEncoding(ctxt);
                    517:         if (guess == NULL) {
                    518:             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
                    519:         } else {
                                     /* The input takes ownership of the guessed string. */
                    520:             if (ctxt->input->encoding != NULL)
                    521:                 xmlFree((xmlChar *) ctxt->input->encoding);
                    522:             ctxt->input->encoding = guess;
                    523:             handler = xmlFindCharEncodingHandler((const char *) guess);
                    524:             if (handler != NULL) {
                    525:                 xmlSwitchToEncoding(ctxt, handler);
                    526:             } else {
                    527:                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                    528:                              "Unsupported encoding %s", guess, NULL);
                    529:             }
                    530:         }
                    531:         ctxt->charset = XML_CHAR_ENCODING_UTF8;
                    532:     }
                    533: 
                    534:     return(xmlCurrentChar(ctxt, len));
                    535: 
                    536: encoding_error:
                    537:     /*
                    538:      * If we detect an UTF8 error that probably mean that the
                    539:      * input encoding didn't get properly advertized in the
                    540:      * declaration header. Report the error and switch the encoding
                    541:      * to ISO-Latin-1 (if you don't like this policy, just declare the
                    542:      * encoding !)
                    543:      */
                    544:     {
                    545:         char buffer[150];
                    546: 
                                 /*
                                  * Dump the next four bytes only when that many are buffered,
                                  * otherwise just the offending one.
                                  */
                    547:        if (ctxt->input->end - ctxt->input->cur >= 4) {
                    548:            snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
                    549:                            ctxt->input->cur[0], ctxt->input->cur[1],
                    550:                            ctxt->input->cur[2], ctxt->input->cur[3]);
                    551:        } else {
                    552:            snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
                    553:        }
                    554:        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                    555:                     "Input is not proper UTF-8, indicate encoding !\n",
                    556:                     BAD_CAST buffer, NULL);
                    557:     }
                    558: 
                             /*
                              * Only the charset field is changed here; the offending byte is
                              * returned as a one byte char.
                              */
                    559:     ctxt->charset = XML_CHAR_ENCODING_8859_1;
                    560:     *len = 1;
                    561:     return((int) *ctxt->input->cur);
                    562: }
                    563: 
                    564: /**
                    565:  * htmlSkipBlankChars:
                    566:  * @ctxt:  the HTML parser context
                    567:  *
                    568:  * skip all blanks character found at that point in the input streams.
                    569:  *
                    570:  * Returns the number of space chars skipped
                    571:  */
                    572: 
                    573: static int
                    574: htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
                    575:     int res = 0;
                    576: 
                    577:     while (IS_BLANK_CH(*(ctxt->input->cur))) {
                    578:        if ((*ctxt->input->cur == 0) &&
                    579:            (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
                    580:                xmlPopInput(ctxt);
                    581:        } else {
                    582:            if (*(ctxt->input->cur) == '\n') {
                    583:                ctxt->input->line++; ctxt->input->col = 1;
                    584:            } else ctxt->input->col++;
                    585:            ctxt->input->cur++;
                    586:            ctxt->nbChars++;
                    587:            if (*ctxt->input->cur == 0)
                    588:                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    589:        }
                    590:        res++;
                    591:     }
                    592:     return(res);
                    593: }
                    594: 
                    595: 
                    596: 
                    597: /************************************************************************
                    598:  *                                                                     *
                    599:  *     The list of HTML elements and their properties          *
                    600:  *                                                                     *
                    601:  ************************************************************************/
                    602: 
                    603: /*
                    604:  *  Start Tag: 1 means the start tag can be omitted
                    605:  *  End Tag:   1 means the end tag can be omitted
                    606:  *             2 means it's forbidden (empty elements)
                    607:  *             3 means the tag is stylistic and should be closed easily
                    608:  *  Depr:      this element is deprecated
                    609:  *  DTD:       1 means that this element is valid only in the Loose DTD
                    610:  *             2 means that this element is valid only in the Frameset DTD
                    611:  *
                    612:  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
                    613:        , subElements , impliedsubelt , Attributes, userdata
                    614:  */
                    615: 
                    616: /* Definitions and a couple of vars for HTML Elements */
                    617: 
                    618: #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
                    619: #define NB_FONTSTYLE 8
                    620: #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
                    621: #define NB_PHRASE 10
                    622: #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
                    623: #define NB_SPECIAL 16
                    624: #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
                    625: #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
                    626: #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
                    627: #define NB_BLOCK NB_HEADING + NB_LIST + 14
                    628: #define FORMCTRL "input", "select", "textarea", "label", "button"
                    629: #define NB_FORMCTRL 5
                    630: #define PCDATA
                    631: #define NB_PCDATA 0
                    632: #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
                    633: #define NB_HEADING 6
                    634: #define LIST "ul", "ol", "dir", "menu"
                    635: #define NB_LIST 4
                    636: #define MODIFIER
                    637: #define NB_MODIFIER 0
                    638: #define FLOW BLOCK,INLINE
                    639: #define NB_FLOW NB_BLOCK + NB_INLINE
                    640: #define EMPTY NULL
                    641: 
                    642: 
                    643: static const char* const html_flow[] = { FLOW, NULL } ;
                    644: static const char* const html_inline[] = { INLINE, NULL } ;
                    645: 
                    646: /* placeholders: elts with content but no subelements */
                    647: static const char* const html_pcdata[] = { NULL } ;
                    648: #define html_cdata html_pcdata
                    649: 
                    650: 
                    651: /* ... and for HTML Attributes */
                    652: 
                    653: #define COREATTRS "id", "class", "style", "title"
                    654: #define NB_COREATTRS 4
                    655: #define I18N "lang", "dir"
                    656: #define NB_I18N 2
                    657: #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
                    658: #define NB_EVENTS 9
                    659: #define ATTRS COREATTRS,I18N,EVENTS
                    660: #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
                    661: #define CELLHALIGN "align", "char", "charoff"
                    662: #define NB_CELLHALIGN 3
                    663: #define CELLVALIGN "valign"
                    664: #define NB_CELLVALIGN 1
                    665: 
                    666: static const char* const html_attrs[] = { ATTRS, NULL } ;
                    667: static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
                    668: static const char* const core_attrs[] = { COREATTRS, NULL } ;
                    669: static const char* const i18n_attrs[] = { I18N, NULL } ;
                    670: 
                    671: 
                    672: /* Other declarations that should go inline ... */
                    673: static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
                    674:        "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
                    675:        "tabindex", "onfocus", "onblur", NULL } ;
                    676: static const char* const target_attr[] = { "target", NULL } ;
                    677: static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
                    678: static const char* const alt_attr[] = { "alt", NULL } ;
                    679: static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
                    680: static const char* const href_attrs[] = { "href", NULL } ;
                    681: static const char* const clear_attrs[] = { "clear", NULL } ;
                    682: static const char* const inline_p[] = { INLINE, "p", NULL } ;
                    683: 
                    684: static const char* const flow_param[] = { FLOW, "param", NULL } ;
                    685: static const char* const applet_attrs[] = { COREATTRS , "codebase",
                    686:                "archive", "alt", "name", "height", "width", "align",
                    687:                "hspace", "vspace", NULL } ;
                    688: static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
                    689:        "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
                    690: static const char* const basefont_attrs[] =
                    691:        { "id", "size", "color", "face", NULL } ;
                    692: static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
                    693: static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
                    694: static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
                    695: static const char* const body_depr[] = { "background", "bgcolor", "text",
                    696:        "link", "vlink", "alink", NULL } ;
                    697: static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
                    698:        "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
                    699: 
                    700: 
                    701: static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
                    702: static const char* const col_elt[] = { "col", NULL } ;
                    703: static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
                    704: static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
                    705: static const char* const dl_contents[] = { "dt", "dd", NULL } ;
                    706: static const char* const compact_attr[] = { "compact", NULL } ;
                    707: static const char* const label_attr[] = { "label", NULL } ;
                    708: static const char* const fieldset_contents[] = { FLOW, "legend" } ;
                    709: static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
                    710: static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
                    711: static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
                    712: static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
                    713: static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
                    714: static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
                    715: static const char* const head_attrs[] = { I18N, "profile", NULL } ;
                    716: static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
                    717: static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
                    718: static const char* const version_attr[] = { "version", NULL } ;
                    719: static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
                    720: static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
                    721: static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
                    722: static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
                    723: static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
                    724: static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
                    725: static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
                    726: static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
                    727: static const char* const align_attr[] = { "align", NULL } ;
                    728: static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
                    729: static const char* const map_contents[] = { BLOCK, "area", NULL } ;
                    730: static const char* const name_attr[] = { "name", NULL } ;
                    731: static const char* const action_attr[] = { "action", NULL } ;
                    732: static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
1.1.1.2   misho     733: static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
1.1       misho     734: static const char* const content_attr[] = { "content", NULL } ;
                    735: static const char* const type_attr[] = { "type", NULL } ;
                    736: static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
                    737: static const char* const object_contents[] = { FLOW, "param", NULL } ;
                    738: static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
                    739: static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
                    740: static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
                    741: static const char* const option_elt[] = { "option", NULL } ;
                    742: static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
                    743: static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
                    744: static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
                    745: static const char* const width_attr[] = { "width", NULL } ;
                    746: static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
                    747: static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
                    748: static const char* const language_attr[] = { "language", NULL } ;
                    749: static const char* const select_content[] = { "optgroup", "option", NULL } ;
                    750: static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
                    751: static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
                    752: static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
                    753: static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
                    754: static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
                    755: static const char* const tr_elt[] = { "tr", NULL } ;
                    756: static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
                    757: static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
                    758: static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
                    759: static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
                    760: static const char* const tr_contents[] = { "th", "td", NULL } ;
                    761: static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
                    762: static const char* const li_elt[] = { "li", NULL } ;
                    763: static const char* const ul_depr[] = { "type", "compact", NULL} ;
                    764: static const char* const dir_attr[] = { "dir", NULL} ;
                    765: 
                    766: #define DECL (const char**)
                    767: 
                    768: static const htmlElemDesc
                    769: html40ElementTable[] = {
                    770: { "a",         0, 0, 0, 0, 0, 0, 1, "anchor ",
                    771:        DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
                    772: },
                    773: { "abbr",      0, 0, 0, 0, 0, 0, 1, "abbreviated form",
                    774:        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
                    775: },
                    776: { "acronym",   0, 0, 0, 0, 0, 0, 1, "",
                    777:        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
                    778: },
                    779: { "address",   0, 0, 0, 0, 0, 0, 0, "information on author ",
                    780:        DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
                    781: },
                    782: { "applet",    0, 0, 0, 0, 1, 1, 2, "java applet ",
                    783:        DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
                    784: },
                    785: { "area",      0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
                    786:        EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
                    787: },
                    788: { "b",         0, 3, 0, 0, 0, 0, 1, "bold text style",
                    789:        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
                    790: },
                    791: { "base",      0, 2, 2, 1, 0, 0, 0, "document base uri ",
                    792:        EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
                    793: },
                    794: { "basefont",  0, 2, 2, 1, 1, 1, 1, "base font size " ,
                    795:        EMPTY , NULL , NULL, DECL basefont_attrs, NULL
                    796: },
                    797: { "bdo",       0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
                    798:        DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
                    799: },
                    800: { "big",       0, 3, 0, 0, 0, 0, 1, "large text style",
                    801:        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
                    802: },
                    803: { "blockquote",        0, 0, 0, 0, 0, 0, 0, "long quotation ",
                    804:        DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
                    805: },
                    806: { "body",      1, 1, 0, 0, 0, 0, 0, "document body ",
                    807:        DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
                    808: },
                    809: { "br",                0, 2, 2, 1, 0, 0, 1, "forced line break ",
                    810:        EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
                    811: },
                    812: { "button",    0, 0, 0, 0, 0, 0, 2, "push button ",
                    813:        DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
                    814: },
                    815: { "caption",   0, 0, 0, 0, 0, 0, 0, "table caption ",
                    816:        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
                    817: },
                    818: { "center",    0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
                    819:        DECL html_flow , NULL , NULL, DECL html_attrs, NULL
                    820: },
                    821: { "cite",      0, 0, 0, 0, 0, 0, 1, "citation",
                    822:        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
                    823: },
                    824: { "code",      0, 0, 0, 0, 0, 0, 1, "computer code fragment",
                    825:        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
                    826: },
                    827: { "col",       0, 2, 2, 1, 0, 0, 0, "table column ",
                    828:        EMPTY , NULL , DECL col_attrs , NULL, NULL
                    829: },
                    830: { "colgroup",  0, 1, 0, 0, 0, 0, 0, "table column group ",
                    831:        DECL col_elt , "col" , DECL col_attrs , NULL, NULL
                    832: },
                    833: { "dd",                0, 1, 0, 0, 0, 0, 0, "definition description ",
                    834:        DECL html_flow , NULL , DECL html_attrs, NULL, NULL
                    835: },
                    836: { "del",       0, 0, 0, 0, 0, 0, 2, "deleted text ",
                    837:        DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
                    838: },
                    839: { "dfn",       0, 0, 0, 0, 0, 0, 1, "instance definition",
                    840:        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
                    841: },
                    842: { "dir",       0, 0, 0, 0, 1, 1, 0, "directory list",
                    843:        DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
                    844: },
                    845: { "div",       0, 0, 0, 0, 0, 0, 0, "generic language/style container",
                    846:        DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
                    847: },
                    848: { "dl",                0, 0, 0, 0, 0, 0, 0, "definition list ",
                    849:        DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
                    850: },
                    851: { "dt",                0, 1, 0, 0, 0, 0, 0, "definition term ",
                    852:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                    853: },
                    854: { "em",                0, 3, 0, 0, 0, 0, 1, "emphasis",
                    855:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                    856: },
                    857: { "embed",     0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
                    858:        EMPTY, NULL, DECL embed_attrs, NULL, NULL
                    859: },
                    860: { "fieldset",  0, 0, 0, 0, 0, 0, 0, "form control group ",
                    861:        DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
                    862: },
                    863: { "font",      0, 3, 0, 0, 1, 1, 1, "local change to font ",
                    864:        DECL html_inline, NULL, NULL, DECL font_attrs, NULL
                    865: },
                    866: { "form",      0, 0, 0, 0, 0, 0, 0, "interactive form ",
                    867:        DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
                    868: },
                    869: { "frame",     0, 2, 2, 1, 0, 2, 0, "subwindow " ,
                    870:        EMPTY, NULL, NULL, DECL frame_attrs, NULL
                    871: },
                    872: { "frameset",  0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
                    873:        DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
                    874: },
                    875: { "h1",                0, 0, 0, 0, 0, 0, 0, "heading ",
                    876:        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
                    877: },
                    878: { "h2",                0, 0, 0, 0, 0, 0, 0, "heading ",
                    879:        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
                    880: },
                    881: { "h3",                0, 0, 0, 0, 0, 0, 0, "heading ",
                    882:        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
                    883: },
                    884: { "h4",                0, 0, 0, 0, 0, 0, 0, "heading ",
                    885:        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
                    886: },
                    887: { "h5",                0, 0, 0, 0, 0, 0, 0, "heading ",
                    888:        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
                    889: },
                    890: { "h6",                0, 0, 0, 0, 0, 0, 0, "heading ",
                    891:        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
                    892: },
                    893: { "head",      1, 1, 0, 0, 0, 0, 0, "document head ",
                    894:        DECL head_contents, NULL, DECL head_attrs, NULL, NULL
                    895: },
                    896: { "hr",                0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
                    897:        EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
                    898: },
                    899: { "html",      1, 1, 0, 0, 0, 0, 0, "document root element ",
                    900:        DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
                    901: },
                    902: { "i",         0, 3, 0, 0, 0, 0, 1, "italic text style",
                    903:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                    904: },
                    905: { "iframe",    0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
                    906:        DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
                    907: },
                    908: { "img",       0, 2, 2, 1, 0, 0, 1, "embedded image ",
                    909:        EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
                    910: },
                    911: { "input",     0, 2, 2, 1, 0, 0, 1, "form control ",
                    912:        EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
                    913: },
                    914: { "ins",       0, 0, 0, 0, 0, 0, 2, "inserted text",
                    915:        DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
                    916: },
                    917: { "isindex",   0, 2, 2, 1, 1, 1, 0, "single line prompt ",
                    918:        EMPTY, NULL, NULL, DECL prompt_attrs, NULL
                    919: },
                    920: { "kbd",       0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
                    921:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                    922: },
                    923: { "label",     0, 0, 0, 0, 0, 0, 1, "form field label text ",
                    924:        DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
                    925: },
                    926: { "legend",    0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
                    927:        DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
                    928: },
                    929: { "li",                0, 1, 1, 0, 0, 0, 0, "list item ",
                    930:        DECL html_flow, NULL, DECL html_attrs, NULL, NULL
                    931: },
                    932: { "link",      0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
                    933:        EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
                    934: },
                    935: { "map",       0, 0, 0, 0, 0, 0, 2, "client-side image map ",
                    936:        DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
                    937: },
                    938: { "menu",      0, 0, 0, 0, 1, 1, 0, "menu list ",
                    939:        DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
                    940: },
                    941: { "meta",      0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
                    942:        EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
                    943: },
                    944: { "noframes",  0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
                    945:        DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
                    946: },
                    947: { "noscript",  0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
                    948:        DECL html_flow, "div", DECL html_attrs, NULL, NULL
                    949: },
                    950: { "object",    0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
                    951:        DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
                    952: },
                    953: { "ol",                0, 0, 0, 0, 0, 0, 0, "ordered list ",
                    954:        DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
                    955: },
                    956: { "optgroup",  0, 0, 0, 0, 0, 0, 0, "option group ",
                    957:        DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
                    958: },
                    959: { "option",    0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
                    960:        DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
                    961: },
                    962: { "p",         0, 1, 0, 0, 0, 0, 0, "paragraph ",
                    963:        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
                    964: },
                    965: { "param",     0, 2, 2, 1, 0, 0, 0, "named property value ",
                    966:        EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
                    967: },
                    968: { "pre",       0, 0, 0, 0, 0, 0, 0, "preformatted text ",
                    969:        DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
                    970: },
                    971: { "q",         0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
                    972:        DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
                    973: },
                    974: { "s",         0, 3, 0, 0, 1, 1, 1, "strike-through text style",
                    975:        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
                    976: },
                    977: { "samp",      0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
                    978:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                    979: },
                    980: { "script",    0, 0, 0, 0, 0, 0, 2, "script statements ",
                    981:        DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
                    982: },
                    983: { "select",    0, 0, 0, 0, 0, 0, 1, "option selector ",
                    984:        DECL select_content, NULL, DECL select_attrs, NULL, NULL
                    985: },
                    986: { "small",     0, 3, 0, 0, 0, 0, 1, "small text style",
                    987:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                    988: },
                    989: { "span",      0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
                    990:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                    991: },
                    992: { "strike",    0, 3, 0, 0, 1, 1, 1, "strike-through text",
                    993:        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
                    994: },
                    995: { "strong",    0, 3, 0, 0, 0, 0, 1, "strong emphasis",
                    996:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                    997: },
                    998: { "style",     0, 0, 0, 0, 0, 0, 0, "style info ",
                    999:        DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
                   1000: },
                   1001: { "sub",       0, 3, 0, 0, 0, 0, 1, "subscript",
                   1002:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                   1003: },
                   1004: { "sup",       0, 3, 0, 0, 0, 0, 1, "superscript ",
                   1005:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                   1006: },
                   1007: { "table",     0, 0, 0, 0, 0, 0, 0, "",
                   1008:        DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
                   1009: },
                   1010: { "tbody",     1, 0, 0, 0, 0, 0, 0, "table body ",
                   1011:        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
                   1012: },
                   1013: { "td",                0, 0, 0, 0, 0, 0, 0, "table data cell",
                   1014:        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
                   1015: },
                   1016: { "textarea",  0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
                   1017:        DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
                   1018: },
                   1019: { "tfoot",     0, 1, 0, 0, 0, 0, 0, "table footer ",
                   1020:        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
                   1021: },
                   1022: { "th",                0, 1, 0, 0, 0, 0, 0, "table header cell",
                   1023:        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
                   1024: },
                   1025: { "thead",     0, 1, 0, 0, 0, 0, 0, "table header ",
                   1026:        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
                   1027: },
                   1028: { "title",     0, 0, 0, 0, 0, 0, 0, "document title ",
                   1029:        DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
                   1030: },
                   1031: { "tr",                0, 0, 0, 0, 0, 0, 0, "table row ",
                   1032:        DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
                   1033: },
                   1034: { "tt",                0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
                   1035:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                   1036: },
                   1037: { "u",         0, 3, 0, 0, 1, 1, 1, "underlined text style",
                   1038:        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
                   1039: },
                   1040: { "ul",                0, 0, 0, 0, 0, 0, 0, "unordered list ",
                   1041:        DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
                   1042: },
                   1043: { "var",       0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
                   1044:        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
                   1045: }
                   1046: };
                   1047: 
/*
 * start tags that imply the end of current element
 *
 * Layout: a flat sequence of NULL-terminated groups. The first string of
 * each group is the name of a start tag; the strings that follow it, up to
 * the group's NULL, are the names of currently open elements which that
 * start tag implicitly closes. A final lone NULL ends the whole table.
 *
 * htmlInitAutoClose() records the address of the first string of each
 * group in htmlStartCloseIndex (at most 99 groups are indexed), and
 * htmlCheckAutoClose() scans the matching group for the open element.
 *
 * NOTE(review): FONTSTYLE is a macro defined outside this table; it
 * presumably expands to a comma separated list of tag name strings.
 */
static const char * const htmlStartClose[] = {
"form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
"head",         "p", NULL,
"title",        "p", NULL,
"body",         "head", "style", "link", "title", "p", NULL,
"frameset",     "head", "style", "link", "title", "p", NULL,
"li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
"hr",           "p", "head", NULL,
"h1",           "p", "head", NULL,
"h2",           "p", "head", NULL,
"h3",           "p", "head", NULL,
"h4",           "p", "head", NULL,
"h5",           "p", "head", NULL,
"h6",           "p", "head", NULL,
"dir",          "p", "head", NULL,
"address",      "p", "head", "ul", NULL,
"pre",          "p", "head", "ul", NULL,
"listing",      "p", "head", NULL,
"xmp",          "p", "head", NULL,
"blockquote",   "p", "head", NULL,
"dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                "xmp", "head", NULL,
"dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", NULL,
"ol",           "p", "head", "ul", NULL,
"menu",         "p", "head", "ul", NULL,
"p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",          "p", "head", NULL,
"noscript",     "p", NULL,
"center",       "font", "b", "i", "p", "head", NULL,
"a",            "a", "head", NULL,
"caption",      "p", NULL,
"colgroup",     "caption", "colgroup", "col", "p", NULL,
"col",          "caption", "col", "p", NULL,
"table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                "listing", "xmp", "a", NULL,
"th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",        "caption", "col", "colgroup", NULL,
"tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tbody", "p", NULL,
"tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tfoot", "tbody", "p", NULL,
"optgroup",     "option", NULL,
"option",       "option", NULL,
"fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                "pre", "listing", "xmp", "a", NULL,
/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
"tt",           "head", NULL,
"i",            "head", NULL,
"b",            "head", NULL,
"u",            "head", NULL,
"s",            "head", NULL,
"strike",       "head", NULL,
"big",          "head", NULL,
"small",        "head", NULL,

"em",           "head", NULL,
"strong",       "head", NULL,
"dfn",          "head", NULL,
"code",         "head", NULL,
"samp",         "head", NULL,
"kbd",          "head", NULL,
"var",          "head", NULL,
"cite",         "head", NULL,
"abbr",         "head", NULL,
"acronym",      "head", NULL,

/* "a" */
"img",          "head", NULL,
/* "applet" */
/* "embed" */
/* "object" */
"font",         "head", NULL,
/* "basefont" */
"br",           "head", NULL,
/* "script" */
"map",          "head", NULL,
"q",            "head", NULL,
"sub",          "head", NULL,
"sup",          "head", NULL,
"span",         "head", NULL,
"bdo",          "head", NULL,
"iframe",       "head", NULL,
NULL
};
                   1145: 
/*
 * The list of HTML elements which are not supposed to have
 * CDATA content and where a p element will be implied.
 *
 * The list is NULL-terminated; htmlCheckParagraph() compares the name of
 * the current element against each entry and opens an implied <p> on a
 * match.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
                   1158: 
                   1159: /*
                   1160:  * The list of HTML attributes which are of content %Script;
                   1161:  * NOTE: when adding ones, check htmlIsScriptAttribute() since
                   1162:  *       it assumes the name starts with 'on'
                   1163:  */
                   1164: static const char *const htmlScriptAttributes[] = {
                   1165:     "onclick",
                   1166:     "ondblclick",
                   1167:     "onmousedown",
                   1168:     "onmouseup",
                   1169:     "onmouseover",
                   1170:     "onmousemove",
                   1171:     "onmouseout",
                   1172:     "onkeypress",
                   1173:     "onkeydown",
                   1174:     "onkeyup",
                   1175:     "onload",
                   1176:     "onunload",
                   1177:     "onfocus",
                   1178:     "onblur",
                   1179:     "onsubmit",
                   1180:     "onrest",
                   1181:     "onchange",
                   1182:     "onselect"
                   1183: };
                   1184: 
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 *
 * The table ends with an entry whose name is NULL; its priority is the
 * value htmlGetEndPriority() returns for every element not listed here.
 */

/* One (element name, end tag priority) pair of the table below. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
                   1212: 
/*
 * Index over htmlStartClose: each used slot points at the first string of
 * one group of that table, and the first unused slot is NULL. It is filled
 * by htmlInitAutoClose(), which also sets the flag below to 1 once done.
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
                   1215: 
                   1216: /************************************************************************
                   1217:  *                                                                     *
                   1218:  *     functions to handle HTML specific data                  *
                   1219:  *                                                                     *
                   1220:  ************************************************************************/
                   1221: 
                   1222: /**
                   1223:  * htmlInitAutoClose:
                   1224:  *
                   1225:  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
                   1226:  * This is not reentrant. Call xmlInitParser() once before processing in
                   1227:  * case of use in multithreaded programs.
                   1228:  */
                   1229: void
                   1230: htmlInitAutoClose(void) {
                   1231:     int indx, i = 0;
                   1232: 
                   1233:     if (htmlStartCloseIndexinitialized) return;
                   1234: 
                   1235:     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
                   1236:     indx = 0;
                   1237:     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
                   1238:         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
                   1239:        while (htmlStartClose[i] != NULL) i++;
                   1240:        i++;
                   1241:     }
                   1242:     htmlStartCloseIndexinitialized = 1;
                   1243: }
                   1244: 
                   1245: /**
                   1246:  * htmlTagLookup:
                   1247:  * @tag:  The tag name in lowercase
                   1248:  *
                   1249:  * Lookup the HTML tag in the ElementTable
                   1250:  *
                   1251:  * Returns the related htmlElemDescPtr or NULL if not found.
                   1252:  */
                   1253: const htmlElemDesc *
                   1254: htmlTagLookup(const xmlChar *tag) {
                   1255:     unsigned int i;
                   1256: 
                   1257:     for (i = 0; i < (sizeof(html40ElementTable) /
                   1258:                      sizeof(html40ElementTable[0]));i++) {
                   1259:         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
                   1260:            return((htmlElemDescPtr) &html40ElementTable[i]);
                   1261:     }
                   1262:     return(NULL);
                   1263: }
                   1264: 
                   1265: /**
                   1266:  * htmlGetEndPriority:
                   1267:  * @name: The name of the element to look up the priority for.
                   1268:  *
                   1269:  * Return value: The "endtag" priority.
                   1270:  **/
                   1271: static int
                   1272: htmlGetEndPriority (const xmlChar *name) {
                   1273:     int i = 0;
                   1274: 
                   1275:     while ((htmlEndPriority[i].name != NULL) &&
                   1276:           (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
                   1277:        i++;
                   1278: 
                   1279:     return(htmlEndPriority[i].priority);
                   1280: }
                   1281: 
                   1282: 
                   1283: /**
                   1284:  * htmlCheckAutoClose:
                   1285:  * @newtag:  The new tag name
                   1286:  * @oldtag:  The old tag name
                   1287:  *
                   1288:  * Checks whether the new tag is one of the registered valid tags for
                   1289:  * closing old.
                   1290:  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
                   1291:  *
                   1292:  * Returns 0 if no, 1 if yes.
                   1293:  */
                   1294: static int
                   1295: htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
                   1296: {
                   1297:     int i, indx;
                   1298:     const char **closed = NULL;
                   1299: 
                   1300:     if (htmlStartCloseIndexinitialized == 0)
                   1301:         htmlInitAutoClose();
                   1302: 
                   1303:     /* inefficient, but not a big deal */
                   1304:     for (indx = 0; indx < 100; indx++) {
                   1305:         closed = htmlStartCloseIndex[indx];
                   1306:         if (closed == NULL)
                   1307:             return (0);
                   1308:         if (xmlStrEqual(BAD_CAST * closed, newtag))
                   1309:             break;
                   1310:     }
                   1311: 
                   1312:     i = closed - htmlStartClose;
                   1313:     i++;
                   1314:     while (htmlStartClose[i] != NULL) {
                   1315:         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
                   1316:             return (1);
                   1317:         }
                   1318:         i++;
                   1319:     }
                   1320:     return (0);
                   1321: }
                   1322: 
                   1323: /**
                   1324:  * htmlAutoCloseOnClose:
                   1325:  * @ctxt:  an HTML parser context
                   1326:  * @newtag:  The new tag name
                   1327:  * @force:  force the tag closure
                   1328:  *
                   1329:  * The HTML DTD allows an ending tag to implicitly close other tags.
                   1330:  */
                   1331: static void
                   1332: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
                   1333: {
                   1334:     const htmlElemDesc *info;
                   1335:     int i, priority;
                   1336: 
                   1337:     priority = htmlGetEndPriority(newtag);
                   1338: 
                   1339:     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
                   1340: 
                   1341:         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
                   1342:             break;
                   1343:         /*
                   1344:          * A missplaced endtag can only close elements with lower
                   1345:          * or equal priority, so if we find an element with higher
                   1346:          * priority before we find an element with
                   1347:          * matching name, we just ignore this endtag
                   1348:          */
                   1349:         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
                   1350:             return;
                   1351:     }
                   1352:     if (i < 0)
                   1353:         return;
                   1354: 
                   1355:     while (!xmlStrEqual(newtag, ctxt->name)) {
                   1356:         info = htmlTagLookup(ctxt->name);
                   1357:         if ((info != NULL) && (info->endTag == 3)) {
                   1358:             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
                   1359:                         "Opening and ending tag mismatch: %s and %s\n",
                   1360:                         newtag, ctxt->name);
                   1361:         }
                   1362:         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   1363:             ctxt->sax->endElement(ctxt->userData, ctxt->name);
                   1364:        htmlnamePop(ctxt);
                   1365:     }
                   1366: }
                   1367: 
                   1368: /**
                   1369:  * htmlAutoCloseOnEnd:
                   1370:  * @ctxt:  an HTML parser context
                   1371:  *
                   1372:  * Close all remaining tags at the end of the stream
                   1373:  */
                   1374: static void
                   1375: htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
                   1376: {
                   1377:     int i;
                   1378: 
                   1379:     if (ctxt->nameNr == 0)
                   1380:         return;
                   1381:     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
                   1382:         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   1383:             ctxt->sax->endElement(ctxt->userData, ctxt->name);
                   1384:        htmlnamePop(ctxt);
                   1385:     }
                   1386: }
                   1387: 
                   1388: /**
                   1389:  * htmlAutoClose:
                   1390:  * @ctxt:  an HTML parser context
                   1391:  * @newtag:  The new tag name or NULL
                   1392:  *
                   1393:  * The HTML DTD allows a tag to implicitly close other tags.
                   1394:  * The list is kept in htmlStartClose array. This function is
                   1395:  * called when a new tag has been detected and generates the
                   1396:  * appropriates closes if possible/needed.
                   1397:  * If newtag is NULL this mean we are at the end of the resource
                   1398:  * and we should check
                   1399:  */
                   1400: static void
                   1401: htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
                   1402: {
                   1403:     while ((newtag != NULL) && (ctxt->name != NULL) &&
                   1404:            (htmlCheckAutoClose(newtag, ctxt->name))) {
                   1405:         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   1406:             ctxt->sax->endElement(ctxt->userData, ctxt->name);
                   1407:        htmlnamePop(ctxt);
                   1408:     }
                   1409:     if (newtag == NULL) {
                   1410:         htmlAutoCloseOnEnd(ctxt);
                   1411:         return;
                   1412:     }
                   1413:     while ((newtag == NULL) && (ctxt->name != NULL) &&
                   1414:            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
                   1415:             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
                   1416:             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
                   1417:         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   1418:             ctxt->sax->endElement(ctxt->userData, ctxt->name);
                   1419:        htmlnamePop(ctxt);
                   1420:     }
                   1421: }
                   1422: 
                   1423: /**
                   1424:  * htmlAutoCloseTag:
                   1425:  * @doc:  the HTML document
                   1426:  * @name:  The tag name
                   1427:  * @elem:  the HTML element
                   1428:  *
                   1429:  * The HTML DTD allows a tag to implicitly close other tags.
                   1430:  * The list is kept in htmlStartClose array. This function checks
                   1431:  * if the element or one of it's children would autoclose the
                   1432:  * given tag.
                   1433:  *
                   1434:  * Returns 1 if autoclose, 0 otherwise
                   1435:  */
                   1436: int
                   1437: htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
                   1438:     htmlNodePtr child;
                   1439: 
                   1440:     if (elem == NULL) return(1);
                   1441:     if (xmlStrEqual(name, elem->name)) return(0);
                   1442:     if (htmlCheckAutoClose(elem->name, name)) return(1);
                   1443:     child = elem->children;
                   1444:     while (child != NULL) {
                   1445:         if (htmlAutoCloseTag(doc, name, child)) return(1);
                   1446:        child = child->next;
                   1447:     }
                   1448:     return(0);
                   1449: }
                   1450: 
                   1451: /**
                   1452:  * htmlIsAutoClosed:
                   1453:  * @doc:  the HTML document
                   1454:  * @elem:  the HTML element
                   1455:  *
                   1456:  * The HTML DTD allows a tag to implicitly close other tags.
                   1457:  * The list is kept in htmlStartClose array. This function checks
                   1458:  * if a tag is autoclosed by one of it's child
                   1459:  *
                   1460:  * Returns 1 if autoclosed, 0 otherwise
                   1461:  */
                   1462: int
                   1463: htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
                   1464:     htmlNodePtr child;
                   1465: 
                   1466:     if (elem == NULL) return(1);
                   1467:     child = elem->children;
                   1468:     while (child != NULL) {
                   1469:        if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
                   1470:        child = child->next;
                   1471:     }
                   1472:     return(0);
                   1473: }
                   1474: 
                   1475: /**
                   1476:  * htmlCheckImplied:
                   1477:  * @ctxt:  an HTML parser context
                   1478:  * @newtag:  The new tag name
                   1479:  *
                   1480:  * The HTML DTD allows a tag to exists only implicitly
                   1481:  * called when a new tag has been detected and generates the
                   1482:  * appropriates implicit tags if missing
                   1483:  */
                   1484: static void
                   1485: htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
                   1486:     int i;
                   1487: 
                   1488:     if (ctxt->options & HTML_PARSE_NOIMPLIED)
                   1489:         return;
                   1490:     if (!htmlOmittedDefaultValue)
                   1491:        return;
                   1492:     if (xmlStrEqual(newtag, BAD_CAST"html"))
                   1493:        return;
                   1494:     if (ctxt->nameNr <= 0) {
                   1495:        htmlnamePush(ctxt, BAD_CAST"html");
                   1496:        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
                   1497:            ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
                   1498:     }
                   1499:     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
                   1500:         return;
                   1501:     if ((ctxt->nameNr <= 1) &&
                   1502:         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
                   1503:         (xmlStrEqual(newtag, BAD_CAST"style")) ||
                   1504:         (xmlStrEqual(newtag, BAD_CAST"meta")) ||
                   1505:         (xmlStrEqual(newtag, BAD_CAST"link")) ||
                   1506:         (xmlStrEqual(newtag, BAD_CAST"title")) ||
                   1507:         (xmlStrEqual(newtag, BAD_CAST"base")))) {
                   1508:         if (ctxt->html >= 3) {
                   1509:             /* we already saw or generated an <head> before */
                   1510:             return;
                   1511:         }
                   1512:         /*
                   1513:          * dropped OBJECT ... i you put it first BODY will be
                   1514:          * assumed !
                   1515:          */
                   1516:         htmlnamePush(ctxt, BAD_CAST"head");
                   1517:         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
                   1518:             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
                   1519:     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
                   1520:               (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
                   1521:               (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
                   1522:         if (ctxt->html >= 10) {
                   1523:             /* we already saw or generated a <body> before */
                   1524:             return;
                   1525:         }
                   1526:        for (i = 0;i < ctxt->nameNr;i++) {
                   1527:            if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
                   1528:                return;
                   1529:            }
                   1530:            if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
                   1531:                return;
                   1532:            }
                   1533:        }
                   1534: 
                   1535:        htmlnamePush(ctxt, BAD_CAST"body");
                   1536:        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
                   1537:            ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
                   1538:     }
                   1539: }
                   1540: 
                   1541: /**
                   1542:  * htmlCheckParagraph:
                   1543:  * @ctxt:  an HTML parser context
                   1544:  *
                   1545:  * Check whether a p element needs to be implied before inserting
                   1546:  * characters in the current element.
                          *
                          * A "p" is opened (its name is pushed and, when a SAX startElement
                          * handler is set, that handler is called) in two cases:
                          *   - @ctxt has no current element name (ctxt->name is NULL);
                          *   - htmlOmittedDefaultValue is non-zero and the current element
                          *     name is listed in htmlNoContentElements.
                   1547:  *
                   1548:  * Returns 1 if a paragraph has been inserted, 0 if not and -1
                   1549:  *         in case of error (@ctxt is NULL).
                   1550:  */
                   1551: 
                   1552: static int
                   1553: htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
                   1554:     const xmlChar *tag;
                   1555:     int i;
                   1556: 
                   1557:     if (ctxt == NULL)
                   1558:        return(-1);
                   1559:     tag = ctxt->name;
                   1560:     if (tag == NULL) {
                                /*
                                 * No current element name: the paragraph is implied
                                 * regardless of htmlOmittedDefaultValue, which is only
                                 * tested further down.
                                 */
                   1561:        htmlAutoClose(ctxt, BAD_CAST"p");
                   1562:        htmlCheckImplied(ctxt, BAD_CAST"p");
                   1563:        htmlnamePush(ctxt, BAD_CAST"p");
                   1564:        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
                   1565:            ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
                   1566:        return(1);
                   1567:     }
                             /* With a current element, only imply "p" when the option is on. */
                   1568:     if (!htmlOmittedDefaultValue)
                   1569:        return(0);
                             /* htmlNoContentElements is scanned up to its NULL terminator. */
                   1570:     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
                   1571:        if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
                                    /* Same open-a-paragraph sequence as in the NULL case. */
                   1572:            htmlAutoClose(ctxt, BAD_CAST"p");
                   1573:            htmlCheckImplied(ctxt, BAD_CAST"p");
                   1574:            htmlnamePush(ctxt, BAD_CAST"p");
                   1575:            if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
                   1576:                ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
                   1577:            return(1);
                   1578:        }
                   1579:     }
                   1580:     return(0);
                   1581: }
                   1582: 
                   1583: /**
                   1584:  * htmlIsScriptAttribute:
                   1585:  * @name:  an attribute name
                   1586:  *
                   1587:  * Check if an attribute is of content type Script, i.e. whether @name
                          * is equal (per xmlStrEqual) to one of the entries of the
                          * htmlScriptAttributes array.
                   1588:  *
                   1589:  * Returns 1 if the attribute is a script attribute, 0 otherwise
                          *         (including when @name is NULL).
                   1590:  */
                   1591: int
                   1592: htmlIsScriptAttribute(const xmlChar *name) {
                   1593:     unsigned int i;
                   1594: 
                   1595:     if (name == NULL)
                   1596:       return(0);
                   1597:     /*
                   1598:      * all script attributes start with 'on'; this cheap test on the
                              * first two bytes (lowercase only) avoids the scan for other names.
                              * name[1] is only read when name[0] is 'o', so the read stays
                              * within the string.
                   1599:      */
                   1600:     if ((name[0] != 'o') || (name[1] != 'n'))
                   1601:       return(0);
                             /* Linear scan; the element count is computed from the array size. */
                   1602:     for (i = 0;
                   1603:         i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
                   1604:         i++) {
                   1605:        if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
                   1606:            return(1);
                   1607:     }
                   1608:     return(0);
                   1609: }
                   1610: 
                   1611: /************************************************************************
                   1612:  *                                                                     *
                   1613:  *     The list of HTML predefined entities                    *
                   1614:  *                                                                     *
                   1615:  ************************************************************************/
                   1616: 
                   1617: 
                   1618: static const htmlEntityDesc  html40EntitiesTable[] = {
                   1619: /*
                          * Each entry is { Unicode value, entity name, description }.
                          * Entries are kept in ascending order of value: htmlEntityValueLookup()
                          * stops scanning at the first entry whose value exceeds the one
                          * searched, so new entries must be inserted at their sorted position.
                          *
                   1620:  * the 4 absolute ones, plus apostrophe.
                   1621:  */
                   1622: { 34,  "quot", "quotation mark = APL quote, U+0022 ISOnum" },
                   1623: { 38,  "amp",  "ampersand, U+0026 ISOnum" },
                   1624: { 39,  "apos", "single quote" },
                   1625: { 60,  "lt",   "less-than sign, U+003C ISOnum" },
                   1626: { 62,  "gt",   "greater-than sign, U+003E ISOnum" },
                   1627: 
                   1628: /*
                   1629:  * A bunch still in the 128-255 range
                   1630:  * Replacing them really depends on the charset used.
                   1631:  */
                   1632: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
                   1633: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
                   1634: { 162, "cent", "cent sign, U+00A2 ISOnum" },
                   1635: { 163, "pound","pound sign, U+00A3 ISOnum" },
                   1636: { 164, "curren","currency sign, U+00A4 ISOnum" },
                   1637: { 165, "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
                   1638: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
                   1639: { 167, "sect", "section sign, U+00A7 ISOnum" },
                   1640: { 168, "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
                   1641: { 169, "copy", "copyright sign, U+00A9 ISOnum" },
                   1642: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
                   1643: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
                   1644: { 172, "not",  "not sign, U+00AC ISOnum" },
                   1645: { 173, "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
                   1646: { 174, "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
                   1647: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
                   1648: { 176, "deg",  "degree sign, U+00B0 ISOnum" },
                   1649: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
                   1650: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
                   1651: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
                   1652: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
                   1653: { 181, "micro","micro sign, U+00B5 ISOnum" },
                   1654: { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
                   1655: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
                   1656: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
                   1657: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
                   1658: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
                   1659: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
                   1660: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
                   1661: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
                   1662: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
                   1663: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
                   1664: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
                   1665: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
                   1666: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
                   1667: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
                   1668: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
                   1669: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
                   1670: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
                   1671: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
                   1672: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
                   1673: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
                   1674: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
                   1675: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
                   1676: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
                   1677: { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
                   1678: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
                   1679: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
                   1680: { 208, "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
                   1681: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
                   1682: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
                   1683: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
                   1684: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
                   1685: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
                   1686: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
                   1687: { 215, "times","multiplication sign, U+00D7 ISOnum" },
                   1688: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
                   1689: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
                   1690: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
                   1691: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
                   1692: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
                   1693: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
                   1694: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
                   1695: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
                   1696: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
                   1697: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
                   1698: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
                   1699: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
                   1700: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
                   1701: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
                   1702: { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
                   1703: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
                   1704: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
                   1705: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
                   1706: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
                   1707: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
                   1708: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
                   1709: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
                   1710: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
                   1711: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
                   1712: { 240, "eth",  "latin small letter eth, U+00F0 ISOlat1" },
                   1713: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
                   1714: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
                   1715: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
                   1716: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
                   1717: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
                   1718: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
                   1719: { 247, "divide","division sign, U+00F7 ISOnum" },
                   1720: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
                   1721: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
                   1722: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
                   1723: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
                   1724: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
                   1725: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
                   1726: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
                   1727: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
                   1728: 
                   1729: { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
                   1730: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
                   1731: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
                   1732: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
                   1733: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
                   1734: 
                   1735: /*
                   1736:  * Anything below should really be kept as entity references
                   1737:  */
                   1738: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
                   1739: 
                   1740: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
                   1741: { 732, "tilde","small tilde, U+02DC ISOdia" },
                   1742: 
                   1743: { 913, "Alpha","greek capital letter alpha, U+0391" },
                   1744: { 914, "Beta", "greek capital letter beta, U+0392" },
                   1745: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
                   1746: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
                   1747: { 917, "Epsilon","greek capital letter epsilon, U+0395" },
                   1748: { 918, "Zeta", "greek capital letter zeta, U+0396" },
                   1749: { 919, "Eta",  "greek capital letter eta, U+0397" },
                   1750: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
                   1751: { 921, "Iota", "greek capital letter iota, U+0399" },
                   1752: { 922, "Kappa","greek capital letter kappa, U+039A" },
                   1753: { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
                   1754: { 924, "Mu",   "greek capital letter mu, U+039C" },
                   1755: { 925, "Nu",   "greek capital letter nu, U+039D" },
                   1756: { 926, "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
                   1757: { 927, "Omicron","greek capital letter omicron, U+039F" },
                   1758: { 928, "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
                   1759: { 929, "Rho",  "greek capital letter rho, U+03A1" },
                   1760: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
                   1761: { 932, "Tau",  "greek capital letter tau, U+03A4" },
                   1762: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
                   1763: { 934, "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
                   1764: { 935, "Chi",  "greek capital letter chi, U+03A7" },
                   1765: { 936, "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
                   1766: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
                   1767: 
                   1768: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
                   1769: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
                   1770: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
                   1771: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
                   1772: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
                   1773: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
                   1774: { 951, "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
                   1775: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
                   1776: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
                   1777: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
                   1778: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
                   1779: { 956, "mu",   "greek small letter mu, U+03BC ISOgrk3" },
                   1780: { 957, "nu",   "greek small letter nu, U+03BD ISOgrk3" },
                   1781: { 958, "xi",   "greek small letter xi, U+03BE ISOgrk3" },
                   1782: { 959, "omicron","greek small letter omicron, U+03BF NEW" },
                   1783: { 960, "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
                   1784: { 961, "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
                   1785: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
                   1786: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
                   1787: { 964, "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
                   1788: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
                   1789: { 966, "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
                   1790: { 967, "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
                   1791: { 968, "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
                   1792: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
                   1793: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
                   1794: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
                   1795: { 982, "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
                   1796: 
                   1797: { 8194,        "ensp", "en space, U+2002 ISOpub" },
                   1798: { 8195,        "emsp", "em space, U+2003 ISOpub" },
                   1799: { 8201,        "thinsp","thin space, U+2009 ISOpub" },
                   1800: { 8204,        "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
                   1801: { 8205,        "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
                   1802: { 8206,        "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
                   1803: { 8207,        "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
                   1804: { 8211,        "ndash","en dash, U+2013 ISOpub" },
                   1805: { 8212,        "mdash","em dash, U+2014 ISOpub" },
                   1806: { 8216,        "lsquo","left single quotation mark, U+2018 ISOnum" },
                   1807: { 8217,        "rsquo","right single quotation mark, U+2019 ISOnum" },
                   1808: { 8218,        "sbquo","single low-9 quotation mark, U+201A NEW" },
                   1809: { 8220,        "ldquo","left double quotation mark, U+201C ISOnum" },
                   1810: { 8221,        "rdquo","right double quotation mark, U+201D ISOnum" },
                   1811: { 8222,        "bdquo","double low-9 quotation mark, U+201E NEW" },
                   1812: { 8224,        "dagger","dagger, U+2020 ISOpub" },
                   1813: { 8225,        "Dagger","double dagger, U+2021 ISOpub" },
                   1814: 
                   1815: { 8226,        "bull", "bullet = black small circle, U+2022 ISOpub" },
                   1816: { 8230,        "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
                   1817: 
                   1818: { 8240,        "permil","per mille sign, U+2030 ISOtech" },
                   1819: 
                   1820: { 8242,        "prime","prime = minutes = feet, U+2032 ISOtech" },
                   1821: { 8243,        "Prime","double prime = seconds = inches, U+2033 ISOtech" },
                   1822: 
                   1823: { 8249,        "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
                   1824: { 8250,        "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
                   1825: 
                   1826: { 8254,        "oline","overline = spacing overscore, U+203E NEW" },
                   1827: { 8260,        "frasl","fraction slash, U+2044 NEW" },
                   1828: 
                   1829: { 8364,        "euro", "euro sign, U+20AC NEW" },
                   1830: 
                   1831: { 8465,        "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
                   1832: { 8472,        "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
                   1833: { 8476,        "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
                   1834: { 8482,        "trade","trade mark sign, U+2122 ISOnum" },
                   1835: { 8501,        "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
                   1836: { 8592,        "larr", "leftwards arrow, U+2190 ISOnum" },
                   1837: { 8593,        "uarr", "upwards arrow, U+2191 ISOnum" },
                   1838: { 8594,        "rarr", "rightwards arrow, U+2192 ISOnum" },
                   1839: { 8595,        "darr", "downwards arrow, U+2193 ISOnum" },
                   1840: { 8596,        "harr", "left right arrow, U+2194 ISOamsa" },
                   1841: { 8629,        "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
                   1842: { 8656,        "lArr", "leftwards double arrow, U+21D0 ISOtech" },
                   1843: { 8657,        "uArr", "upwards double arrow, U+21D1 ISOamsa" },
                   1844: { 8658,        "rArr", "rightwards double arrow, U+21D2 ISOtech" },
                   1845: { 8659,        "dArr", "downwards double arrow, U+21D3 ISOamsa" },
                   1846: { 8660,        "hArr", "left right double arrow, U+21D4 ISOamsa" },
                   1847: 
                   1848: { 8704,        "forall","for all, U+2200 ISOtech" },
                   1849: { 8706,        "part", "partial differential, U+2202 ISOtech" },
                   1850: { 8707,        "exist","there exists, U+2203 ISOtech" },
                   1851: { 8709,        "empty","empty set = null set = diameter, U+2205 ISOamso" },
                   1852: { 8711,        "nabla","nabla = backward difference, U+2207 ISOtech" },
                   1853: { 8712,        "isin", "element of, U+2208 ISOtech" },
                   1854: { 8713,        "notin","not an element of, U+2209 ISOtech" },
                   1855: { 8715,        "ni",   "contains as member, U+220B ISOtech" },
                   1856: { 8719,        "prod", "n-ary product = product sign, U+220F ISOamsb" },
                   1857: { 8721,        "sum",  "n-ary summation, U+2211 ISOamsb" },
                   1858: { 8722,        "minus","minus sign, U+2212 ISOtech" },
                   1859: { 8727,        "lowast","asterisk operator, U+2217 ISOtech" },
                   1860: { 8730,        "radic","square root = radical sign, U+221A ISOtech" },
                   1861: { 8733,        "prop", "proportional to, U+221D ISOtech" },
                   1862: { 8734,        "infin","infinity, U+221E ISOtech" },
                   1863: { 8736,        "ang",  "angle, U+2220 ISOamso" },
                   1864: { 8743,        "and",  "logical and = wedge, U+2227 ISOtech" },
                   1865: { 8744,        "or",   "logical or = vee, U+2228 ISOtech" },
                   1866: { 8745,        "cap",  "intersection = cap, U+2229 ISOtech" },
                   1867: { 8746,        "cup",  "union = cup, U+222A ISOtech" },
                   1868: { 8747,        "int",  "integral, U+222B ISOtech" },
                   1869: { 8756,        "there4","therefore, U+2234 ISOtech" },
                   1870: { 8764,        "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
                   1871: { 8773,        "cong", "approximately equal to, U+2245 ISOtech" },
                   1872: { 8776,        "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
                   1873: { 8800,        "ne",   "not equal to, U+2260 ISOtech" },
                   1874: { 8801,        "equiv","identical to, U+2261 ISOtech" },
                   1875: { 8804,        "le",   "less-than or equal to, U+2264 ISOtech" },
                   1876: { 8805,        "ge",   "greater-than or equal to, U+2265 ISOtech" },
                   1877: { 8834,        "sub",  "subset of, U+2282 ISOtech" },
                   1878: { 8835,        "sup",  "superset of, U+2283 ISOtech" },
                   1879: { 8836,        "nsub", "not a subset of, U+2284 ISOamsn" },
                   1880: { 8838,        "sube", "subset of or equal to, U+2286 ISOtech" },
                   1881: { 8839,        "supe", "superset of or equal to, U+2287 ISOtech" },
                   1882: { 8853,        "oplus","circled plus = direct sum, U+2295 ISOamsb" },
                   1883: { 8855,        "otimes","circled times = vector product, U+2297 ISOamsb" },
                   1884: { 8869,        "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
                   1885: { 8901,        "sdot", "dot operator, U+22C5 ISOamsb" },
                   1886: { 8968,        "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
                   1887: { 8969,        "rceil","right ceiling, U+2309 ISOamsc" },
                   1888: { 8970,        "lfloor","left floor = apl downstile, U+230A ISOamsc" },
                   1889: { 8971,        "rfloor","right floor, U+230B ISOamsc" },
                   1890: { 9001,        "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
                   1891: { 9002,        "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
                   1892: { 9674,        "loz",  "lozenge, U+25CA ISOpub" },
                   1893: 
                   1894: { 9824,        "spades","black spade suit, U+2660 ISOpub" },
                   1895: { 9827,        "clubs","black club suit = shamrock, U+2663 ISOpub" },
                   1896: { 9829,        "hearts","black heart suit = valentine, U+2665 ISOpub" },
                   1897: { 9830,        "diams","black diamond suit, U+2666 ISOpub" },
                   1898: 
                   1899: };
                   1900: 
                   1901: /************************************************************************
                   1902:  *                                                                     *
                   1903:  *             Commodity functions to handle entities                  *
                   1904:  *                                                                     *
                   1905:  ************************************************************************/
                   1906: 
                   1907: /*
                   1908:  * Macro used to grow the current buffer.
                          *
                          * Doubles the variable <buffer>_size and reallocates <buffer> to that
                          * many xmlChar with xmlRealloc(). Besides <buffer> and <buffer>_size,
                          * the expansion uses a variable named ctxt from the enclosing scope.
                          *
                          * On allocation failure it reports through htmlErrMemory(), frees the
                          * old buffer and executes return(NULL) in the function that used the
                          * macro, so it is only usable where returning NULL is valid.
                   1909:  */
                   1910: #define growBuffer(buffer) {                                           \
                   1911:     xmlChar *tmp;                                                      \
                   1912:     buffer##_size *= 2;                                                        \
                   1913:     tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
                   1914:     if (tmp == NULL) {                                         \
                   1915:        htmlErrMemory(ctxt, "growing buffer\n");                        \
                   1916:        xmlFree(buffer);                                                \
                   1917:        return(NULL);                                                   \
                   1918:     }                                                                  \
                   1919:     buffer = tmp;                                                      \
                   1920: }
                   1921: 
                   1922: /**
                   1923:  * htmlEntityLookup:
                   1924:  * @name: the entity name
                   1925:  *
                   1926:  * Lookup the given entity in html40EntitiesTable by name. @name is
                          * compared with each entry's name using xmlStrEqual() and the first
                          * matching entry is returned.
                   1927:  *
                   1928:  * TODO: the linear scan is really ugly, a hash table is really needed.
                   1929:  *
                   1930:  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
                          *         A non-NULL result points into the static table.
                   1931:  */
                   1932: const htmlEntityDesc *
                   1933: htmlEntityLookup(const xmlChar *name) {
                   1934:     unsigned int i;
                   1935: 
                             /* The entry count is derived from the array size at compile time. */
                   1936:     for (i = 0;i < (sizeof(html40EntitiesTable)/
                   1937:                     sizeof(html40EntitiesTable[0]));i++) {
                   1938:         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
                   1939:             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
                   1940:        }
                   1941:     }
                   1942:     return(NULL);
                   1943: }
                   1944: 
                   1945: /**
                   1946:  * htmlEntityValueLookup:
                   1947:  * @value: the entity's unicode value
                   1948:  *
                   1949:  * Lookup the given entity in html40EntitiesTable by value. The scan
                          * gives up at the first entry whose value is greater than @value, so
                          * it depends on the table being sorted by ascending value.
                   1950:  *
                   1951:  * TODO: the linear scan is really ugly, a hash table is really needed.
                   1952:  *
                   1953:  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
                          *         A non-NULL result points into the static table.
                   1954:  */
                   1955: const htmlEntityDesc *
                   1956: htmlEntityValueLookup(unsigned int value) {
                   1957:     unsigned int i;
                   1958: 
                   1959:     for (i = 0;i < (sizeof(html40EntitiesTable)/
                   1960:                     sizeof(html40EntitiesTable[0]));i++) {
                                 /* First entry at or past @value decides the outcome. */
                   1961:         if (html40EntitiesTable[i].value >= value) {
                   1962:            if (html40EntitiesTable[i].value > value)
                   1963:                break;  /* passed it without a match: not in the table */
                   1964:             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
                   1965:        }
                   1966:     }
                   1967:     return(NULL);
                   1968: }
                   1969: 
                   1970: /**
                   1971:  * UTF8ToHtml:
                   1972:  * @out:  a pointer to an array of bytes to store the result
                   1973:  * @outlen:  the length of @out
                   1974:  * @in:  a pointer to an array of UTF-8 chars
                   1975:  * @inlen:  the length of @in
                   1976:  *
                   1977:  * Take a block of UTF-8 chars in and try to convert it to an ASCII
                   1978:  * plus HTML entities block of chars out.
                   1979:  *
                   1980:  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
                   1981:  * The value of @inlen after return is the number of octets consumed
                   1982:  *     as the return value is positive, else unpredictable.
                   1983:  * The value of @outlen after return is the number of octets consumed.
                   1984:  */
                   1985: int
                   1986: UTF8ToHtml(unsigned char* out, int *outlen,
                   1987:               const unsigned char* in, int *inlen) {
                   1988:     const unsigned char* processed = in;
                   1989:     const unsigned char* outend;
                   1990:     const unsigned char* outstart = out;
                   1991:     const unsigned char* instart = in;
                   1992:     const unsigned char* inend;
                   1993:     unsigned int c, d;
                   1994:     int trailing;
                   1995: 
                   1996:     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
                   1997:     if (in == NULL) {
                   1998:         /*
                   1999:         * initialization nothing to do
                   2000:         */
                   2001:        *outlen = 0;
                   2002:        *inlen = 0;
                   2003:        return(0);
                   2004:     }
                   2005:     inend = in + (*inlen);
                   2006:     outend = out + (*outlen);
                   2007:     while (in < inend) {
                   2008:        d = *in++;
                   2009:        if      (d < 0x80)  { c= d; trailing= 0; }
                   2010:        else if (d < 0xC0) {
                   2011:            /* trailing byte in leading position */
                   2012:            *outlen = out - outstart;
                   2013:            *inlen = processed - instart;
                   2014:            return(-2);
                   2015:         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
                   2016:         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                   2017:         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
                   2018:        else {
                   2019:            /* no chance for this in Ascii */
                   2020:            *outlen = out - outstart;
                   2021:            *inlen = processed - instart;
                   2022:            return(-2);
                   2023:        }
                   2024: 
                   2025:        if (inend - in < trailing) {
                   2026:            break;
                   2027:        }
                   2028: 
                   2029:        for ( ; trailing; trailing--) {
                   2030:            if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
                   2031:                break;
                   2032:            c <<= 6;
                   2033:            c |= d & 0x3F;
                   2034:        }
                   2035: 
                   2036:        /* assertion: c is a single UTF-4 value */
                   2037:        if (c < 0x80) {
                   2038:            if (out + 1 >= outend)
                   2039:                break;
                   2040:            *out++ = c;
                   2041:        } else {
                   2042:            int len;
                   2043:            const htmlEntityDesc * ent;
                   2044:            const char *cp;
                   2045:            char nbuf[16];
                   2046: 
                   2047:            /*
                   2048:             * Try to lookup a predefined HTML entity for it
                   2049:             */
                   2050: 
                   2051:            ent = htmlEntityValueLookup(c);
                   2052:            if (ent == NULL) {
                   2053:              snprintf(nbuf, sizeof(nbuf), "#%u", c);
                   2054:              cp = nbuf;
                   2055:            }
                   2056:            else
                   2057:              cp = ent->name;
                   2058:            len = strlen(cp);
                   2059:            if (out + 2 + len >= outend)
                   2060:                break;
                   2061:            *out++ = '&';
                   2062:            memcpy(out, cp, len);
                   2063:            out += len;
                   2064:            *out++ = ';';
                   2065:        }
                   2066:        processed = in;
                   2067:     }
                   2068:     *outlen = out - outstart;
                   2069:     *inlen = processed - instart;
                   2070:     return(0);
                   2071: }
                   2072: 
                   2073: /**
                   2074:  * htmlEncodeEntities:
                   2075:  * @out:  a pointer to an array of bytes to store the result
                   2076:  * @outlen:  the length of @out
                   2077:  * @in:  a pointer to an array of UTF-8 chars
                   2078:  * @inlen:  the length of @in
                   2079:  * @quoteChar: the quote character to escape (' or ") or zero.
                   2080:  *
                   2081:  * Take a block of UTF-8 chars in and try to convert it to an ASCII
                   2082:  * plus HTML entities block of chars out.
                   2083:  *
                   2084:  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
                   2085:  * The value of @inlen after return is the number of octets consumed
                   2086:  *     as the return value is positive, else unpredictable.
                   2087:  * The value of @outlen after return is the number of octets consumed.
                   2088:  */
                   2089: int
                   2090: htmlEncodeEntities(unsigned char* out, int *outlen,
                   2091:                   const unsigned char* in, int *inlen, int quoteChar) {
                   2092:     const unsigned char* processed = in;
                   2093:     const unsigned char* outend;
                   2094:     const unsigned char* outstart = out;
                   2095:     const unsigned char* instart = in;
                   2096:     const unsigned char* inend;
                   2097:     unsigned int c, d;
                   2098:     int trailing;
                   2099: 
                   2100:     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
                   2101:         return(-1);
                   2102:     outend = out + (*outlen);
                   2103:     inend = in + (*inlen);
                   2104:     while (in < inend) {
                   2105:        d = *in++;
                   2106:        if      (d < 0x80)  { c= d; trailing= 0; }
                   2107:        else if (d < 0xC0) {
                   2108:            /* trailing byte in leading position */
                   2109:            *outlen = out - outstart;
                   2110:            *inlen = processed - instart;
                   2111:            return(-2);
                   2112:         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
                   2113:         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                   2114:         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
                   2115:        else {
                   2116:            /* no chance for this in Ascii */
                   2117:            *outlen = out - outstart;
                   2118:            *inlen = processed - instart;
                   2119:            return(-2);
                   2120:        }
                   2121: 
                   2122:        if (inend - in < trailing)
                   2123:            break;
                   2124: 
                   2125:        while (trailing--) {
                   2126:            if (((d= *in++) & 0xC0) != 0x80) {
                   2127:                *outlen = out - outstart;
                   2128:                *inlen = processed - instart;
                   2129:                return(-2);
                   2130:            }
                   2131:            c <<= 6;
                   2132:            c |= d & 0x3F;
                   2133:        }
                   2134: 
                   2135:        /* assertion: c is a single UTF-4 value */
                   2136:        if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
                   2137:            (c != '&') && (c != '<') && (c != '>')) {
                   2138:            if (out >= outend)
                   2139:                break;
                   2140:            *out++ = c;
                   2141:        } else {
                   2142:            const htmlEntityDesc * ent;
                   2143:            const char *cp;
                   2144:            char nbuf[16];
                   2145:            int len;
                   2146: 
                   2147:            /*
                   2148:             * Try to lookup a predefined HTML entity for it
                   2149:             */
                   2150:            ent = htmlEntityValueLookup(c);
                   2151:            if (ent == NULL) {
                   2152:                snprintf(nbuf, sizeof(nbuf), "#%u", c);
                   2153:                cp = nbuf;
                   2154:            }
                   2155:            else
                   2156:                cp = ent->name;
                   2157:            len = strlen(cp);
                   2158:            if (out + 2 + len > outend)
                   2159:                break;
                   2160:            *out++ = '&';
                   2161:            memcpy(out, cp, len);
                   2162:            out += len;
                   2163:            *out++ = ';';
                   2164:        }
                   2165:        processed = in;
                   2166:     }
                   2167:     *outlen = out - outstart;
                   2168:     *inlen = processed - instart;
                   2169:     return(0);
                   2170: }
                   2171: 
                   2172: /************************************************************************
                   2173:  *                                                                     *
                   2174:  *             Commodity functions to handle streams                   *
                   2175:  *                                                                     *
                   2176:  ************************************************************************/
                   2177: 
                   2178: /**
                   2179:  * htmlNewInputStream:
                   2180:  * @ctxt:  an HTML parser context
                   2181:  *
                   2182:  * Create a new input stream structure
                   2183:  * Returns the new input stream or NULL
                   2184:  */
                   2185: static htmlParserInputPtr
                   2186: htmlNewInputStream(htmlParserCtxtPtr ctxt) {
                   2187:     htmlParserInputPtr input;
                   2188: 
                   2189:     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
                   2190:     if (input == NULL) {
                   2191:         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
                   2192:        return(NULL);
                   2193:     }
                   2194:     memset(input, 0, sizeof(htmlParserInput));
                   2195:     input->filename = NULL;
                   2196:     input->directory = NULL;
                   2197:     input->base = NULL;
                   2198:     input->cur = NULL;
                   2199:     input->buf = NULL;
                   2200:     input->line = 1;
                   2201:     input->col = 1;
                   2202:     input->buf = NULL;
                   2203:     input->free = NULL;
                   2204:     input->version = NULL;
                   2205:     input->consumed = 0;
                   2206:     input->length = 0;
                   2207:     return(input);
                   2208: }
                   2209: 
                   2210: 
                   2211: /************************************************************************
                   2212:  *                                                                     *
                   2213:  *             Commodity functions, cleanup needed ?                   *
                   2214:  *                                                                     *
                   2215:  ************************************************************************/
                   2216: /*
                   2217:  * all tags allowing pc data from the html 4.01 loose dtd
                   2218:  * NOTE: it might be more apropriate to integrate this information
                   2219:  * into the html40ElementTable array but I don't want to risk any
                   2220:  * binary incomptibility
                   2221:  */
                   2222: static const char *allowPCData[] = {
                   2223:     "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
                   2224:     "blockquote", "body", "button", "caption", "center", "cite", "code",
                   2225:     "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
                   2226:     "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
                   2227:     "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
                   2228:     "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
                   2229: };
                   2230: 
                   2231: /**
                   2232:  * areBlanks:
                   2233:  * @ctxt:  an HTML parser context
                   2234:  * @str:  a xmlChar *
                   2235:  * @len:  the size of @str
                   2236:  *
                   2237:  * Is this a sequence of blank chars that one can ignore ?
                   2238:  *
                   2239:  * Returns 1 if ignorable 0 otherwise.
                   2240:  */
                   2241: 
                   2242: static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
                   2243:     unsigned int i;
                   2244:     int j;
                   2245:     xmlNodePtr lastChild;
                   2246:     xmlDtdPtr dtd;
                   2247: 
                   2248:     for (j = 0;j < len;j++)
                   2249:         if (!(IS_BLANK_CH(str[j]))) return(0);
                   2250: 
                   2251:     if (CUR == 0) return(1);
                   2252:     if (CUR != '<') return(0);
                   2253:     if (ctxt->name == NULL)
                   2254:        return(1);
                   2255:     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
                   2256:        return(1);
                   2257:     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
                   2258:        return(1);
                   2259: 
                   2260:     /* Only strip CDATA children of the body tag for strict HTML DTDs */
                   2261:     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
                   2262:         dtd = xmlGetIntSubset(ctxt->myDoc);
                   2263:         if (dtd != NULL && dtd->ExternalID != NULL) {
                   2264:             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
                   2265:                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
                   2266:                 return(1);
                   2267:         }
                   2268:     }
                   2269: 
                   2270:     if (ctxt->node == NULL) return(0);
                   2271:     lastChild = xmlGetLastChild(ctxt->node);
                   2272:     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
                   2273:        lastChild = lastChild->prev;
                   2274:     if (lastChild == NULL) {
                   2275:         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
                   2276:             (ctxt->node->content != NULL)) return(0);
                   2277:        /* keep ws in constructs like ...<b> </b>...
                   2278:           for all tags "b" allowing PCDATA */
                   2279:        for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
                   2280:            if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
                   2281:                return(0);
                   2282:            }
                   2283:        }
                   2284:     } else if (xmlNodeIsText(lastChild)) {
                   2285:         return(0);
                   2286:     } else {
                   2287:        /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
                   2288:           for all tags "p" allowing PCDATA */
                   2289:        for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
                   2290:            if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
                   2291:                return(0);
                   2292:            }
                   2293:        }
                   2294:     }
                   2295:     return(1);
                   2296: }
                   2297: 
                   2298: /**
                   2299:  * htmlNewDocNoDtD:
                   2300:  * @URI:  URI for the dtd, or NULL
                   2301:  * @ExternalID:  the external ID of the DTD, or NULL
                   2302:  *
                   2303:  * Creates a new HTML document without a DTD node if @URI and @ExternalID
                   2304:  * are NULL
                   2305:  *
                   2306:  * Returns a new document, do not initialize the DTD if not provided
                   2307:  */
                   2308: htmlDocPtr
                   2309: htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
                   2310:     xmlDocPtr cur;
                   2311: 
                   2312:     /*
                   2313:      * Allocate a new document and fill the fields.
                   2314:      */
                   2315:     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
                   2316:     if (cur == NULL) {
                   2317:        htmlErrMemory(NULL, "HTML document creation failed\n");
                   2318:        return(NULL);
                   2319:     }
                   2320:     memset(cur, 0, sizeof(xmlDoc));
                   2321: 
                   2322:     cur->type = XML_HTML_DOCUMENT_NODE;
                   2323:     cur->version = NULL;
                   2324:     cur->intSubset = NULL;
                   2325:     cur->doc = cur;
                   2326:     cur->name = NULL;
                   2327:     cur->children = NULL;
                   2328:     cur->extSubset = NULL;
                   2329:     cur->oldNs = NULL;
                   2330:     cur->encoding = NULL;
                   2331:     cur->standalone = 1;
                   2332:     cur->compression = 0;
                   2333:     cur->ids = NULL;
                   2334:     cur->refs = NULL;
                   2335:     cur->_private = NULL;
                   2336:     cur->charset = XML_CHAR_ENCODING_UTF8;
                   2337:     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
                   2338:     if ((ExternalID != NULL) ||
                   2339:        (URI != NULL))
                   2340:        xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
                   2341:     return(cur);
                   2342: }
                   2343: 
                   2344: /**
                   2345:  * htmlNewDoc:
                   2346:  * @URI:  URI for the dtd, or NULL
                   2347:  * @ExternalID:  the external ID of the DTD, or NULL
                   2348:  *
                   2349:  * Creates a new HTML document
                   2350:  *
                   2351:  * Returns a new document
                   2352:  */
                   2353: htmlDocPtr
                   2354: htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
                   2355:     if ((URI == NULL) && (ExternalID == NULL))
                   2356:        return(htmlNewDocNoDtD(
                   2357:                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
                   2358:                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
                   2359: 
                   2360:     return(htmlNewDocNoDtD(URI, ExternalID));
                   2361: }
                   2362: 
                   2363: 
                   2364: /************************************************************************
                   2365:  *                                                                     *
                   2366:  *                     The parser itself                               *
                   2367:  *     Relates to http://www.w3.org/TR/html40                          *
                   2368:  *                                                                     *
                   2369:  ************************************************************************/
                   2370: 
                   2371: /************************************************************************
                   2372:  *                                                                     *
                   2373:  *                     The parser itself                               *
                   2374:  *                                                                     *
                   2375:  ************************************************************************/
                   2376: 
                   2377: static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
                   2378: 
                   2379: /**
                   2380:  * htmlParseHTMLName:
                   2381:  * @ctxt:  an HTML parser context
                   2382:  *
                   2383:  * parse an HTML tag or attribute name, note that we convert it to lowercase
                   2384:  * since HTML names are not case-sensitive.
                   2385:  *
                   2386:  * Returns the Tag Name parsed or NULL
                   2387:  */
                   2388: 
                   2389: static const xmlChar *
                   2390: htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
                   2391:     int i = 0;
                   2392:     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
                   2393: 
                   2394:     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
                   2395:         (CUR != ':') && (CUR != '.')) return(NULL);
                   2396: 
                   2397:     while ((i < HTML_PARSER_BUFFER_SIZE) &&
                   2398:            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
                   2399:           (CUR == ':') || (CUR == '-') || (CUR == '_') ||
                   2400:            (CUR == '.'))) {
                   2401:        if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
                   2402:         else loc[i] = CUR;
                   2403:        i++;
                   2404: 
                   2405:        NEXT;
                   2406:     }
                   2407: 
                   2408:     return(xmlDictLookup(ctxt->dict, loc, i));
                   2409: }
                   2410: 
                   2411: 
                   2412: /**
                   2413:  * htmlParseHTMLName_nonInvasive:
                   2414:  * @ctxt:  an HTML parser context
                   2415:  *
                   2416:  * parse an HTML tag or attribute name, note that we convert it to lowercase
                   2417:  * since HTML names are not case-sensitive, this doesn't consume the data
                   2418:  * from the stream, it's a look-ahead
                   2419:  *
                   2420:  * Returns the Tag Name parsed or NULL
                   2421:  */
                   2422: 
                   2423: static const xmlChar *
                   2424: htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
                   2425:     int i = 0;
                   2426:     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
                   2427: 
                   2428:     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
                   2429:         (NXT(1) != ':')) return(NULL);
                   2430: 
                   2431:     while ((i < HTML_PARSER_BUFFER_SIZE) &&
                   2432:            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
                   2433:           (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
                   2434:        if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
                   2435:         else loc[i] = NXT(1+i);
                   2436:        i++;
                   2437:     }
                   2438: 
                   2439:     return(xmlDictLookup(ctxt->dict, loc, i));
                   2440: }
                   2441: 
                   2442: 
                   2443: /**
                   2444:  * htmlParseName:
                   2445:  * @ctxt:  an HTML parser context
                   2446:  *
                   2447:  * parse an HTML name, this routine is case sensitive.
                   2448:  *
                   2449:  * Returns the Name parsed or NULL
                   2450:  */
                   2451: 
                   2452: static const xmlChar *
                   2453: htmlParseName(htmlParserCtxtPtr ctxt) {
                   2454:     const xmlChar *in;
                   2455:     const xmlChar *ret;
                   2456:     int count = 0;
                   2457: 
                   2458:     GROW;
                   2459: 
                   2460:     /*
                   2461:      * Accelerator for simple ASCII names
                   2462:      */
                   2463:     in = ctxt->input->cur;
                   2464:     if (((*in >= 0x61) && (*in <= 0x7A)) ||
                   2465:        ((*in >= 0x41) && (*in <= 0x5A)) ||
                   2466:        (*in == '_') || (*in == ':')) {
                   2467:        in++;
                   2468:        while (((*in >= 0x61) && (*in <= 0x7A)) ||
                   2469:               ((*in >= 0x41) && (*in <= 0x5A)) ||
                   2470:               ((*in >= 0x30) && (*in <= 0x39)) ||
                   2471:               (*in == '_') || (*in == '-') ||
                   2472:               (*in == ':') || (*in == '.'))
                   2473:            in++;
                   2474:        if ((*in > 0) && (*in < 0x80)) {
                   2475:            count = in - ctxt->input->cur;
                   2476:            ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
                   2477:            ctxt->input->cur = in;
                   2478:            ctxt->nbChars += count;
                   2479:            ctxt->input->col += count;
                   2480:            return(ret);
                   2481:        }
                   2482:     }
                   2483:     return(htmlParseNameComplex(ctxt));
                   2484: }
                   2485: 
                   2486: static const xmlChar *
                   2487: htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
                   2488:     int len = 0, l;
                   2489:     int c;
                   2490:     int count = 0;
                   2491: 
                   2492:     /*
                   2493:      * Handler for more complex cases
                   2494:      */
                   2495:     GROW;
                   2496:     c = CUR_CHAR(l);
                   2497:     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
                   2498:        (!IS_LETTER(c) && (c != '_') &&
                   2499:          (c != ':'))) {
                   2500:        return(NULL);
                   2501:     }
                   2502: 
                   2503:     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
                   2504:           ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
                   2505:             (c == '.') || (c == '-') ||
                   2506:            (c == '_') || (c == ':') ||
                   2507:            (IS_COMBINING(c)) ||
                   2508:            (IS_EXTENDER(c)))) {
                   2509:        if (count++ > 100) {
                   2510:            count = 0;
                   2511:            GROW;
                   2512:        }
                   2513:        len += l;
                   2514:        NEXTL(l);
                   2515:        c = CUR_CHAR(l);
                   2516:     }
                   2517:     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
                   2518: }
                   2519: 
                   2520: 
                   2521: /**
                   2522:  * htmlParseHTMLAttribute:
                   2523:  * @ctxt:  an HTML parser context
                   2524:  * @stop:  the closing quote character, or 0 for an unquoted value
                   2525:  *
                   2526:  * Parse an HTML attribute value up to, but not including, @stop; when
                   2527:  * @stop is 0 the value ends at the first blank or '>'. Character and known
                   2528:  * entity references are replaced by their UTF-8 encoding in the output.
                   2529:  * Returns a freshly allocated 0-terminated buffer, or NULL if that allocation failed
                   2530:  */
                   2531: 
                   2532: static xmlChar *
                   2533: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
                   2534:     xmlChar *buffer = NULL;
                   2535:     int buffer_size = 0;
                   2536:     xmlChar *out = NULL; /* write cursor inside buffer */
                   2537:     const xmlChar *name = NULL; /* entity name reported by htmlParseEntityRef() */
                   2538:     const xmlChar *cur = NULL; /* read cursor over name */
                   2539:     const htmlEntityDesc * ent;
                   2540: 
                   2541:     /*
                   2542:      * allocate the output buffer the value is translated into.
                   2543:      */
                   2544:     buffer_size = HTML_PARSER_BUFFER_SIZE;
                   2545:     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
                   2546:     if (buffer == NULL) {
                   2547:        htmlErrMemory(ctxt, "buffer allocation failed\n");
                   2548:        return(NULL);
                   2549:     }
                   2550:     out = buffer;
                   2551: 
                   2552:     /*
                   2553:      * Loop until the input yields 0 or the stop character; neither is consumed here
                   2554:      */
                   2555:     while ((CUR != 0) && (CUR != stop)) {
                   2556:        if ((stop == 0) && (CUR == '>')) break; /* unquoted value ends at '>' */
                   2557:        if ((stop == 0) && (IS_BLANK_CH(CUR))) break; /* ... or at a blank */
                   2558:         if (CUR == '&') {
                   2559:            if (NXT(1) == '#') { /* "&#": character reference */
                   2560:                unsigned int c;
                   2561:                int bits;
                   2562: 
                   2563:                c = htmlParseCharRef(ctxt);
                   2564:                if      (c <    0x80) /* UTF-8 lead byte; bits = shift of the first continuation byte */
                   2565:                        { *out++  = c;                bits= -6; } /* single byte: no continuation */
                   2566:                else if (c <   0x800)
                   2567:                        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                   2568:                else if (c < 0x10000)
                   2569:                        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                   2570:                else
                   2571:                        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                   2572: 
                   2573:                for ( ; bits >= 0; bits-= 6) { /* continuation bytes, 6 payload bits each */
                   2574:                    *out++  = ((c >> bits) & 0x3F) | 0x80;
                   2575:                }
                   2576: 
                   2577:                if (out - buffer > buffer_size - 100) { /* fewer than 100 bytes left; checked after the write in this branch */
                   2578:                        int indx = out - buffer;
                   2579: 
                   2580:                        growBuffer(buffer); /* macro defined outside this chunk; presumably enlarges buffer and buffer_size - confirm */
                   2581:                        out = &buffer[indx]; /* recompute out from the saved offset */
                   2582:                }
                   2583:            } else { /* "&" not followed by '#': try a named entity */
                   2584:                ent = htmlParseEntityRef(ctxt, &name);
                   2585:                if (name == NULL) { /* no name was reported: emit a literal '&' */
                   2586:                    *out++ = '&';
                   2587:                    if (out - buffer > buffer_size - 100) {
                   2588:                        int indx = out - buffer;
                   2589: 
                   2590:                        growBuffer(buffer);
                   2591:                        out = &buffer[indx];
                   2592:                    }
                   2593:                } else if (ent == NULL) { /* a name but no entity descriptor: copy "&name" through unchanged */
                   2594:                    *out++ = '&';
                   2595:                    cur = name;
                   2596:                    while (*cur != 0) {
                   2597:                        if (out - buffer > buffer_size - 100) {
                   2598:                            int indx = out - buffer;
                   2599: 
                   2600:                            growBuffer(buffer);
                   2601:                            out = &buffer[indx];
                   2602:                        }
                   2603:                        *out++ = *cur++;
                   2604:                    }
                   2605:                } else { /* entity found: emit ent->value encoded as UTF-8 */
                   2606:                    unsigned int c;
                   2607:                    int bits;
                   2608: 
                   2609:                    if (out - buffer > buffer_size - 100) {
                   2610:                        int indx = out - buffer;
                   2611: 
                   2612:                        growBuffer(buffer);
                   2613:                        out = &buffer[indx];
                   2614:                    }
                   2615:                    c = ent->value;
                   2616:                    if      (c <    0x80)
                   2617:                        { *out++  = c;                bits= -6; }
                   2618:                    else if (c <   0x800)
                   2619:                        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                   2620:                    else if (c < 0x10000)
                   2621:                        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                   2622:                    else
                   2623:                        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                   2624: 
                   2625:                    for ( ; bits >= 0; bits-= 6) {
                   2626:                        *out++  = ((c >> bits) & 0x3F) | 0x80;
                   2627:                    }
                   2628:                }
                   2629:            }
                   2630:        } else { /* any other character: read it with CUR_CHAR and write it back as UTF-8 */
                   2631:            unsigned int c;
                   2632:            int bits, l; /* l receives the length from CUR_CHAR; it is not used again in this branch */
                   2633: 
                   2634:            if (out - buffer > buffer_size - 100) {
                   2635:                int indx = out - buffer;
                   2636: 
                   2637:                growBuffer(buffer);
                   2638:                out = &buffer[indx];
                   2639:            }
                   2640:            c = CUR_CHAR(l);
                   2641:            if      (c <    0x80)
                   2642:                    { *out++  = c;                bits= -6; }
                   2643:            else if (c <   0x800)
                   2644:                    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                   2645:            else if (c < 0x10000)
                   2646:                    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                   2647:            else
                   2648:                    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                   2649: 
                   2650:            for ( ; bits >= 0; bits-= 6) {
                   2651:                *out++  = ((c >> bits) & 0x3F) | 0x80;
                   2652:            }
                   2653:            NEXT;
                   2654:        }
                   2655:     }
                   2656:     *out = 0; /* terminate the value */
                   2657:     return(buffer);
                   2658: }
                   2659: 
                   2660: /**
                   2661:  * htmlParseEntityRef:
                   2662:  * @ctxt:  an HTML parser context
                   2663:  * @str:  location to store the entity name, may be NULL
                   2664:  *
                   2665:  * Parse an HTML entity reference. *str is first reset to NULL and then set to the parsed name, if any.
                   2666:  * The trailing ';' is consumed only when the name is found in the entity table.
                   2667:  * [68] EntityRef ::= '&' Name ';'
                   2668:  *
                   2669:  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise.
                   2670:  * NOTE(review): this header used to say *str must be freed by the caller, but the name is a const pointer from htmlParseName() and htmlParseHTMLAttribute() never frees it - confirm ownership.
                   2671:  */
                   2672: const htmlEntityDesc *
                   2673: htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
                   2674:     const xmlChar *name;
                   2675:     const htmlEntityDesc * ent = NULL;
                   2676: 
                   2677:     if (str != NULL) *str = NULL; /* default: no name reported */
                   2678:     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
                   2679: 
                   2680:     if (CUR == '&') { /* nothing is parsed unless the input is at '&' */
                   2681:         NEXT;
                   2682:         name = htmlParseName(ctxt);
                   2683:        if (name == NULL) {
                   2684:            htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                   2685:                         "htmlParseEntityRef: no name\n", NULL, NULL);
                   2686:        } else {
                   2687:            GROW;
                   2688:            if (CUR == ';') {
                   2689:                if (str != NULL)
                   2690:                    *str = name;
                   2691: 
                   2692:                /*
                   2693:                 * Lookup the entity in the table.
                   2694:                 */
                   2695:                ent = htmlEntityLookup(name);
                   2696:                if (ent != NULL) /* skip the ';' only for a known entity; otherwise it stays in the input */
                   2697:                    NEXT;
                   2698:            } else { /* no ';': report the error, still hand the name back, ent stays NULL */
                   2699:                htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
                   2700:                             "htmlParseEntityRef: expecting ';'\n",
                   2701:                             NULL, NULL);
                   2702:                if (str != NULL)
                   2703:                    *str = name;
                   2704:            }
                   2705:        }
                   2706:     }
                   2707:     return(ent);
                   2708: }
                   2709: 
                   2710: /**
                   2711:  * htmlParseAttValue:
                   2712:  * @ctxt:  an HTML parser context
                   2713:  *
                   2714:  * Parse an attribute value: double-quoted, single-quoted or unquoted.
                   2715:  * The text is read by htmlParseHTMLAttribute(); for the quoted forms the
                   2716:  * closing quote is consumed, or an error is reported when it is missing.
                   2717:  * NOTE(review): references are expanded by htmlParseHTMLAttribute(), so the earlier remark that no entity substitution happens here was stale.
                   2718:  *
                   2719:  * Returns the AttValue parsed or NULL.
                   2720:  */
                   2721: 
                   2722: static xmlChar *
                   2723: htmlParseAttValue(htmlParserCtxtPtr ctxt) {
                   2724:     xmlChar *ret = NULL;
                   2725: 
                   2726:     if (CUR == '"') {
                   2727:         NEXT; /* skip the opening quote */
                   2728:        ret = htmlParseHTMLAttribute(ctxt, '"');
                   2729:         if (CUR != '"') { /* value ended without the closing quote */
                   2730:            htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
                   2731:                         "AttValue: \" expected\n", NULL, NULL);
                   2732:        } else
                   2733:            NEXT; /* skip the closing quote */
                   2734:     } else if (CUR == '\'') {
                   2735:         NEXT;
                   2736:        ret = htmlParseHTMLAttribute(ctxt, '\'');
                   2737:         if (CUR != '\'') {
                   2738:            htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
                   2739:                         "AttValue: ' expected\n", NULL, NULL);
                   2740:        } else
                   2741:            NEXT;
                   2742:     } else {
                   2743:         /*
                   2744:         * That's an HTMLism, the attribute value may not be quoted
                   2745:         */
                   2746:        ret = htmlParseHTMLAttribute(ctxt, 0);
                   2747:        if (ret == NULL) { /* an empty unquoted value comes back as "" rather than NULL, so this fires only on a NULL return */
                   2748:            htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
                   2749:                         "AttValue: no value found\n", NULL, NULL);
                   2750:        }
                   2751:     }
                   2752:     return(ret);
                   2753: }
                   2754: 
                   2755: /**
                   2756:  * htmlParseSystemLiteral:
                   2757:  * @ctxt:  an HTML parser context
                   2758:  *
                   2759:  * Parse a quoted system literal; the opening and closing quotes are consumed.
                   2760:  *
                   2761:  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
                   2762:  *
                   2763:  * Returns a copy (xmlStrndup) of the text between the quotes, or NULL after reporting an error
                   2764:  */
                   2765: 
                   2766: static xmlChar *
                   2767: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
                   2768:     const xmlChar *q; /* start of the literal text, just after the opening quote */
                   2769:     xmlChar *ret = NULL;
                   2770: 
                   2771:     if (CUR == '"') {
                   2772:         NEXT;
                   2773:        q = CUR_PTR;
                   2774:        while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
                   2775:            NEXT;
                   2776:        if (!IS_CHAR_CH(CUR)) { /* stopped on a non-char instead of the closing quote */
                   2777:            htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
                   2778:                         "Unfinished SystemLiteral\n", NULL, NULL);
                   2779:        } else {
                   2780:            ret = xmlStrndup(q, CUR_PTR - q);
                   2781:            NEXT; /* skip the closing quote */
                   2782:         }
                   2783:     } else if (CUR == '\'') {
                   2784:         NEXT;
                   2785:        q = CUR_PTR;
                   2786:        while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
                   2787:            NEXT;
                   2788:        if (!IS_CHAR_CH(CUR)) {
                   2789:            htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
                   2790:                         "Unfinished SystemLiteral\n", NULL, NULL);
                   2791:        } else {
                   2792:            ret = xmlStrndup(q, CUR_PTR - q);
                   2793:            NEXT;
                   2794:         }
                   2795:     } else { /* no opening quote: name the production in the message, as htmlParsePubidLiteral does */
                   2796:        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
                   2797:                     "SystemLiteral \" or ' expected\n", NULL, NULL);
                   2798:     }
                   2799: 
                   2800:     return(ret);
                   2801: }
                   2802: 
                   2803: /**
                   2804:  * htmlParsePubidLiteral:
                   2805:  * @ctxt:  an HTML parser context
                   2806:  *
                   2807:  * Parse a quoted public identifier literal; both quotes are consumed on success.
                   2808:  *
                   2809:  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
                   2810:  *
                   2811:  * Returns a copy (xmlStrndup) of the text between the quotes, or NULL after reporting an error.
                   2812:  */
                   2813: 
                   2814: static xmlChar *
                   2815: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
                   2816:     const xmlChar *q; /* start of the literal text, just after the opening quote */
                   2817:     xmlChar *ret = NULL;
                   2818:     /*
                   2819:      * The literal is a quoted run of PubidChar characters (production [12]).
                   2820:      */
                   2821:     if (CUR == '"') {
                   2822:         NEXT;
                   2823:        q = CUR_PTR;
                   2824:        while (IS_PUBIDCHAR_CH(CUR)) NEXT;
                   2825:        if (CUR != '"') { /* the run ended on something other than the closing quote */
                   2826:            htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
                   2827:                         "Unfinished PubidLiteral\n", NULL, NULL);
                   2828:        } else {
                   2829:            ret = xmlStrndup(q, CUR_PTR - q);
                   2830:            NEXT;
                   2831:        }
                   2832:     } else if (CUR == '\'') {
                   2833:         NEXT;
                   2834:        q = CUR_PTR;
                   2835:        while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) /* explicit quote test: the grammar above excludes "'" from the chars allowed here */
                   2836:            NEXT;
                   2837:        if (CUR != '\'') {
                   2838:            htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
                   2839:                         "Unfinished PubidLiteral\n", NULL, NULL);
                   2840:        } else {
                   2841:            ret = xmlStrndup(q, CUR_PTR - q);
                   2842:            NEXT;
                   2843:        }
                   2844:     } else { /* neither quote character at the current position */
                   2845:        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
                   2846:                     "PubidLiteral \" or ' expected\n", NULL, NULL);
                   2847:     }
                   2848: 
                   2849:     return(ret);
                   2850: }
                   2851: 
                   2852: /**
                   2853:  * htmlParseScript:
                   2854:  * @ctxt:  an HTML parser context
                   2855:  *
                   2856:  * Parse the content of an HTML SCRIPT or STYLE element and hand it to the SAX cdataBlock callback, or to characters when cdataBlock is not set
                   2857:  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
                   2858:  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
                   2859:  * http://www.w3.org/TR/html4/types.html#type-script
                   2860:  * http://www.w3.org/TR/html4/types.html#h-6.15
                   2861:  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
                   2862:  *
                   2863:  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
                   2864:  * element and the value of intrinsic event attributes. User agents must
                   2865:  * not evaluate script data as HTML markup but instead must pass it on as
                   2866:  * data to a script engine.
                   2867:  * NOTES:
                   2868:  * - The content is passed like CDATA
                   2869:  * - the attributes for style and scripting "onXXX" are also described
                   2870:  *   as CDATA but SGML allows entities references in attributes so their
                   2871:  *   processing is identical as other attributes
                   2872:  */
                   2873: static void
                   2874: htmlParseScript(htmlParserCtxtPtr ctxt) {
                   2875:     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; /* flushed once nbchar reaches HTML_PARSER_BIG_BUFFER_SIZE */
                   2876:     int nbchar = 0; /* bytes currently held in buf */
                   2877:     int cur,l;
                   2878: 
                   2879:     SHRINK;
                   2880:     cur = CUR_CHAR(l);
                   2881:     while (IS_CHAR_CH(cur)) {
                   2882:        if ((cur == '<') && (NXT(1) == '/')) {
                   2883:             /*
                   2884:              * One should break here, the specification is clear:
                   2885:              * Authors should therefore escape "</" within the content.
                   2886:              * Escape mechanisms are specific to each scripting or
                   2887:              * style sheet language.
                   2888:              *
                   2889:              * In recovery mode, only break if end tag match the
                   2890:              * current tag, effectively ignoring all tags inside the
                   2891:              * script/style block and treating the entire block as
                   2892:              * CDATA.
                   2893:              */
                   2894:             if (ctxt->recovery) {
                   2895:                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
                   2896:                                   xmlStrlen(ctxt->name)) == 0)
                   2897:                 {
                   2898:                     break; /* while */
                   2899:                 } else {
                   2900:                    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
                   2901:                                 "Element %s embeds close tag\n",
                   2902:                                 ctxt->name, NULL);
                   2903:                }
                   2904:             } else { /* not recovering: "</" followed by an ASCII letter ends the content */
                   2905:                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
                   2906:                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
                   2907:                 {
                   2908:                     break; /* while */
                   2909:                 }
                   2910:             }
                   2911:        }
                   2912:        COPY_BUF(l,buf,nbchar,cur);
                   2913:        if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
                   2914:            /* Buffer full: deliver it under the same SAX guard as the final flush */
                   2915:            if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
                   2916:                /* Insert as CDATA, which is the same as HTML_PRESERVE_NODE */
                   2917:                if (ctxt->sax->cdataBlock != NULL)
                   2918:                    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
                   2919:                else if (ctxt->sax->characters != NULL)
                   2920:                    ctxt->sax->characters(ctxt->userData, buf, nbchar);
                   2921:            }
                   2922:            nbchar = 0;
                   2923:        }
                   2924:        GROW;
                   2925:        NEXTL(l);
                   2926:        cur = CUR_CHAR(l);
                   2927:     }
                   2928: 
                   2929:     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { /* stopped on a non-char; 0 is tolerated in progressive mode */
                   2930:         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                   2931:                     "Invalid char in CDATA 0x%X\n", cur);
                   2932:         if (ctxt->input->cur < ctxt->input->end) { /* step over the offending char only if input remains */
                   2933:             NEXT;
                   2934:         }
                   2935:     }
                   2936: 
                   2937:     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { /* final flush of what is left in buf */
                   2938:        if (ctxt->sax->cdataBlock!= NULL) {
                   2939:            /*
                   2940:             * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
                   2941:             */
                   2942:            ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
                   2943:        } else if (ctxt->sax->characters != NULL) {
                   2944:            ctxt->sax->characters(ctxt->userData, buf, nbchar);
                   2945:        }
                   2946:     }
                   2947: }
                   2948: 
                   2949: 
                   2950: /**
                   2951:  * htmlParseCharData:
                   2952:  * @ctxt:  an HTML parser context
                   2953:  *
                   2954:  * Parse a run of character data and deliver it through the SAX callbacks. The run ends at '<' or '&' (unless that char is the pending ctxt->token) or at 0.
                   2955:  * NOTE(review): an earlier remark said ']]>' ends a CDATA section here; no such test exists in this function.
                   2956:  *
                   2957:  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
                   2958:  */
                   2959: 
                   2960: static void
                   2961: htmlParseCharData(htmlParserCtxtPtr ctxt) {
                   2962:     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; /* flushed once nbchar reaches HTML_PARSER_BIG_BUFFER_SIZE */
                   2963:     int nbchar = 0; /* bytes currently held in buf */
                   2964:     int cur, l;
                   2965:     int chunk = 0; /* chars consumed since the last periodic SHRINK/GROW */
                   2966: 
                   2967:     SHRINK;
                   2968:     cur = CUR_CHAR(l);
                   2969:     while (((cur != '<') || (ctxt->token == '<')) &&
                   2970:            ((cur != '&') || (ctxt->token == '&')) &&
                   2971:           (cur != 0)) {
                   2972:        if (!(IS_CHAR(cur))) { /* invalid char: reported, not copied, but still consumed by NEXTL below */
                   2973:            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                   2974:                        "Invalid char in CDATA 0x%X\n", cur);
                   2975:        } else {
                   2976:            COPY_BUF(l,buf,nbchar,cur);
                   2977:        }
                   2978:        if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
                   2979:            /*
                   2980:             * Buffer full: deliver the segment (unlike the final flush, buf is not 0-terminated here).
                   2981:             */
                   2982:            if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
                   2983:                if (areBlanks(ctxt, buf, nbchar)) {
1.1.1.3 ! misho    2984:                    if (ctxt->keepBlanks) { /* blank segment: characters if kept, else ignorableWhitespace */
        !          2985:                        if (ctxt->sax->characters != NULL)
        !          2986:                            ctxt->sax->characters(ctxt->userData, buf, nbchar);
        !          2987:                    } else {
        !          2988:                        if (ctxt->sax->ignorableWhitespace != NULL)
        !          2989:                            ctxt->sax->ignorableWhitespace(ctxt->userData,
        !          2990:                                                           buf, nbchar);
        !          2991:                    }
1.1       misho    2992:                } else { /* non-blank segment: htmlCheckParagraph() runs before delivery */
                   2993:                    htmlCheckParagraph(ctxt);
                   2994:                    if (ctxt->sax->characters != NULL)
                   2995:                        ctxt->sax->characters(ctxt->userData, buf, nbchar);
                   2996:                }
                   2997:            }
                   2998:            nbchar = 0;
                   2999:        }
                   3000:        NEXTL(l);
                   3001:         chunk++;
                   3002:         if (chunk > HTML_PARSER_BUFFER_SIZE) { /* periodically run SHRINK/GROW on the input */
                   3003:             chunk = 0;
                   3004:             SHRINK;
                   3005:             GROW;
                   3006:         }
                   3007:        cur = CUR_CHAR(l);
                   3008:        if (cur == 0) { /* retry once after SHRINK/GROW before treating 0 as the end */
                   3009:            SHRINK;
                   3010:            GROW;
                   3011:            cur = CUR_CHAR(l);
                   3012:        }
                   3013:     }
                   3014:     if (nbchar != 0) {
                   3015:         buf[nbchar] = 0;
                   3016: 
                   3017:        /*
                   3018:         * Final flush: deliver what is left in buf, with the same blank/non-blank dispatch as above.
                   3019:         */
                   3020:        if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
                   3021:            if (areBlanks(ctxt, buf, nbchar)) {
1.1.1.3 ! misho    3022:                if (ctxt->keepBlanks) {
        !          3023:                    if (ctxt->sax->characters != NULL)
        !          3024:                        ctxt->sax->characters(ctxt->userData, buf, nbchar);
        !          3025:                } else {
        !          3026:                    if (ctxt->sax->ignorableWhitespace != NULL)
        !          3027:                        ctxt->sax->ignorableWhitespace(ctxt->userData,
        !          3028:                                                       buf, nbchar);
        !          3029:                }
1.1       misho    3030:            } else {
                   3031:                htmlCheckParagraph(ctxt);
                   3032:                if (ctxt->sax->characters != NULL)
                   3033:                    ctxt->sax->characters(ctxt->userData, buf, nbchar);
                   3034:            }
                   3035:        }
                   3036:     } else {
                   3037:        /*
                   3038:         * Loop detection: nothing is buffered and the input reads 0, so mark the parser as at EOF
                   3039:         */
                   3040:        if (cur == 0)
                   3041:            ctxt->instate = XML_PARSER_EOF;
                   3042:     }
                   3043: }
                   3044: 
                   3045: /**
                   3046:  * htmlParseExternalID:
                   3047:  * @ctxt:  an HTML parser context
                   3048:  * @publicID:  a xmlChar** receiving PubidLiteral; written only in the PUBLIC case and dereferenced there without a NULL check
                   3049:  *
                   3050:  * Parse an External ID or a Public ID
                   3051:  *
                   3052:  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
                   3053:  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
                   3054:  *
                   3055:  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
                   3056:  *
                   3057:  * Returns the SystemLiteral, or NULL if none was parsed. In the PUBLIC
                   3058:  *                case *publicID receives the PubidLiteral and the SystemLiteral is
                   3059:  *                optional, so the result may be NULL while *publicID is set.
                   3060:  */
                   3061: 
                   3062: static xmlChar *
                   3063: htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
                   3064:     xmlChar *URI = NULL;
                   3065: 
                   3066:     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
                   3067:          (UPP(2) == 'S') && (UPP(3) == 'T') &&
                   3068:         (UPP(4) == 'E') && (UPP(5) == 'M')) {
                   3069:         SKIP(6); /* the six letters of the keyword */
                   3070:        if (!IS_BLANK_CH(CUR)) { /* missing blank is reported, parsing continues */
                   3071:            htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
                   3072:                         "Space required after 'SYSTEM'\n", NULL, NULL);
                   3073:        }
                   3074:         SKIP_BLANKS;
                   3075:        URI = htmlParseSystemLiteral(ctxt);
                   3076:        if (URI == NULL) {
                   3077:            htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
                   3078:                         "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
                   3079:         }
                   3080:     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
                   3081:               (UPP(2) == 'B') && (UPP(3) == 'L') &&
                   3082:               (UPP(4) == 'I') && (UPP(5) == 'C')) {
                   3083:         SKIP(6);
                   3084:        if (!IS_BLANK_CH(CUR)) {
                   3085:            htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
                   3086:                         "Space required after 'PUBLIC'\n", NULL, NULL);
                   3087:        }
                   3088:         SKIP_BLANKS;
                   3089:        *publicID = htmlParsePubidLiteral(ctxt);
                   3090:        if (*publicID == NULL) {
                   3091:            htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
                   3092:                         "htmlParseExternalID: PUBLIC, no Public Identifier\n",
                   3093:                         NULL, NULL);
                   3094:        }
                   3095:         SKIP_BLANKS;
                   3096:         if ((CUR == '"') || (CUR == '\'')) { /* a system literal is parsed only when a quote follows */
                   3097:            URI = htmlParseSystemLiteral(ctxt);
                   3098:        }
                   3099:     }
                   3100:     return(URI);
                   3101: }
                   3102: 
                   3103: /**
                   3104:  * htmlParsePI:
                   3105:  * @ctxt:  an HTML parser context
                   3106:  *
                   3107:  * parse an XML Processing Instruction.
                   3108:  *
                   3109:  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
                   3110:  */
                   3111: static void
                   3112: htmlParsePI(htmlParserCtxtPtr ctxt) {
                   3113:     xmlChar *buf = NULL;
                   3114:     int len = 0;
                   3115:     int size = HTML_PARSER_BUFFER_SIZE;
                   3116:     int cur, l;
                   3117:     const xmlChar *target;
                   3118:     xmlParserInputState state;
                   3119:     int count = 0;
                   3120: 
                   3121:     if ((RAW == '<') && (NXT(1) == '?')) {
                   3122:        state = ctxt->instate;
                   3123:         ctxt->instate = XML_PARSER_PI;
                   3124:        /*
                   3125:         * this is a Processing Instruction.
                   3126:         */
                   3127:        SKIP(2);
                   3128:        SHRINK;
                   3129: 
                   3130:        /*
                   3131:         * Parse the target name and check for special support like
                   3132:         * namespace.
                   3133:         */
                   3134:         target = htmlParseName(ctxt);
                   3135:        if (target != NULL) {
                   3136:            if (RAW == '>') {
                   3137:                SKIP(1);
                   3138: 
                   3139:                /*
                   3140:                 * SAX: PI detected.
                   3141:                 */
                   3142:                if ((ctxt->sax) && (!ctxt->disableSAX) &&
                   3143:                    (ctxt->sax->processingInstruction != NULL))
                   3144:                    ctxt->sax->processingInstruction(ctxt->userData,
                   3145:                                                     target, NULL);
                   3146:                ctxt->instate = state;
                   3147:                return;
                   3148:            }
                   3149:            buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
                   3150:            if (buf == NULL) {
                   3151:                htmlErrMemory(ctxt, NULL);
                   3152:                ctxt->instate = state;
                   3153:                return;
                   3154:            }
                   3155:            cur = CUR;
                   3156:            if (!IS_BLANK(cur)) {
                   3157:                htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
                   3158:                          "ParsePI: PI %s space expected\n", target, NULL);
                   3159:            }
                   3160:             SKIP_BLANKS;
                   3161:            cur = CUR_CHAR(l);
                   3162:            while (IS_CHAR(cur) && (cur != '>')) {
                   3163:                if (len + 5 >= size) {
                   3164:                    xmlChar *tmp;
                   3165: 
                   3166:                    size *= 2;
                   3167:                    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
                   3168:                    if (tmp == NULL) {
                   3169:                        htmlErrMemory(ctxt, NULL);
                   3170:                        xmlFree(buf);
                   3171:                        ctxt->instate = state;
                   3172:                        return;
                   3173:                    }
                   3174:                    buf = tmp;
                   3175:                }
                   3176:                count++;
                   3177:                if (count > 50) {
                   3178:                    GROW;
                   3179:                    count = 0;
                   3180:                }
                   3181:                COPY_BUF(l,buf,len,cur);
                   3182:                NEXTL(l);
                   3183:                cur = CUR_CHAR(l);
                   3184:                if (cur == 0) {
                   3185:                    SHRINK;
                   3186:                    GROW;
                   3187:                    cur = CUR_CHAR(l);
                   3188:                }
                   3189:            }
                   3190:            buf[len] = 0;
                   3191:            if (cur != '>') {
                   3192:                htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
                   3193:                      "ParsePI: PI %s never end ...\n", target, NULL);
                   3194:            } else {
                   3195:                SKIP(1);
                   3196: 
                   3197:                /*
                   3198:                 * SAX: PI detected.
                   3199:                 */
                   3200:                if ((ctxt->sax) && (!ctxt->disableSAX) &&
                   3201:                    (ctxt->sax->processingInstruction != NULL))
                   3202:                    ctxt->sax->processingInstruction(ctxt->userData,
                   3203:                                                     target, buf);
                   3204:            }
                   3205:            xmlFree(buf);
                   3206:        } else {
                   3207:            htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
                   3208:                          "PI is not started correctly", NULL, NULL);
                   3209:        }
                   3210:        ctxt->instate = state;
                   3211:     }
                   3212: }
                   3213: 
                   3214: /**
                   3215:  * htmlParseComment:
                   3216:  * @ctxt:  an HTML parser context
                   3217:  *
                   3218:  * Parse an XML (SGML) comment <!-- .... -->
                   3219:  *
                   3220:  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
                   3221:  */
                   3222: static void
                   3223: htmlParseComment(htmlParserCtxtPtr ctxt) {
                   3224:     xmlChar *buf = NULL;
                   3225:     int len;
                   3226:     int size = HTML_PARSER_BUFFER_SIZE;
                   3227:     int q, ql;
                   3228:     int r, rl;
                   3229:     int cur, l;
                   3230:     xmlParserInputState state;
                   3231: 
                   3232:     /*
                   3233:      * Check that there is a comment right here.
                   3234:      */
                   3235:     if ((RAW != '<') || (NXT(1) != '!') ||
                   3236:         (NXT(2) != '-') || (NXT(3) != '-')) return;
                   3237: 
                   3238:     state = ctxt->instate;
                   3239:     ctxt->instate = XML_PARSER_COMMENT;
                   3240:     SHRINK;
                   3241:     SKIP(4);
                   3242:     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
                   3243:     if (buf == NULL) {
                   3244:         htmlErrMemory(ctxt, "buffer allocation failed\n");
                   3245:        ctxt->instate = state;
                   3246:        return;
                   3247:     }
                   3248:     q = CUR_CHAR(ql);
                   3249:     NEXTL(ql);
                   3250:     r = CUR_CHAR(rl);
                   3251:     NEXTL(rl);
                   3252:     cur = CUR_CHAR(l);
                   3253:     len = 0;
                   3254:     while (IS_CHAR(cur) &&
                   3255:            ((cur != '>') ||
                   3256:            (r != '-') || (q != '-'))) {
                   3257:        if (len + 5 >= size) {
                   3258:            xmlChar *tmp;
                   3259: 
                   3260:            size *= 2;
                   3261:            tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
                   3262:            if (tmp == NULL) {
                   3263:                xmlFree(buf);
                   3264:                htmlErrMemory(ctxt, "growing buffer failed\n");
                   3265:                ctxt->instate = state;
                   3266:                return;
                   3267:            }
                   3268:            buf = tmp;
                   3269:        }
                   3270:        COPY_BUF(ql,buf,len,q);
                   3271:        q = r;
                   3272:        ql = rl;
                   3273:        r = cur;
                   3274:        rl = l;
                   3275:        NEXTL(l);
                   3276:        cur = CUR_CHAR(l);
                   3277:        if (cur == 0) {
                   3278:            SHRINK;
                   3279:            GROW;
                   3280:            cur = CUR_CHAR(l);
                   3281:        }
                   3282:     }
                   3283:     buf[len] = 0;
                   3284:     if (!IS_CHAR(cur)) {
                   3285:        htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
                   3286:                     "Comment not terminated \n<!--%.50s\n", buf, NULL);
                   3287:        xmlFree(buf);
                   3288:     } else {
                   3289:         NEXT;
                   3290:        if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
                   3291:            (!ctxt->disableSAX))
                   3292:            ctxt->sax->comment(ctxt->userData, buf);
                   3293:        xmlFree(buf);
                   3294:     }
                   3295:     ctxt->instate = state;
                   3296: }
                   3297: 
                   3298: /**
                   3299:  * htmlParseCharRef:
                   3300:  * @ctxt:  an HTML parser context
                   3301:  *
                   3302:  * parse Reference declarations
                   3303:  *
                   3304:  * [66] CharRef ::= '&#' [0-9]+ ';' |
                   3305:  *                  '&#x' [0-9a-fA-F]+ ';'
                   3306:  *
                   3307:  * Returns the value parsed (as an int)
                   3308:  */
                   3309: int
                   3310: htmlParseCharRef(htmlParserCtxtPtr ctxt) {
                   3311:     int val = 0;
                   3312: 
                   3313:     if ((ctxt == NULL) || (ctxt->input == NULL)) {
                   3314:        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   3315:                     "htmlParseCharRef: context error\n",
                   3316:                     NULL, NULL);
                   3317:         return(0);
                   3318:     }
                   3319:     if ((CUR == '&') && (NXT(1) == '#') &&
                   3320:         ((NXT(2) == 'x') || NXT(2) == 'X')) {
                   3321:        SKIP(3);
                   3322:        while (CUR != ';') {
                   3323:            if ((CUR >= '0') && (CUR <= '9'))
                   3324:                val = val * 16 + (CUR - '0');
                   3325:            else if ((CUR >= 'a') && (CUR <= 'f'))
                   3326:                val = val * 16 + (CUR - 'a') + 10;
                   3327:            else if ((CUR >= 'A') && (CUR <= 'F'))
                   3328:                val = val * 16 + (CUR - 'A') + 10;
                   3329:            else {
                   3330:                htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
                   3331:                             "htmlParseCharRef: missing semicolon\n",
                   3332:                             NULL, NULL);
                   3333:                break;
                   3334:            }
                   3335:            NEXT;
                   3336:        }
                   3337:        if (CUR == ';')
                   3338:            NEXT;
                   3339:     } else if  ((CUR == '&') && (NXT(1) == '#')) {
                   3340:        SKIP(2);
                   3341:        while (CUR != ';') {
                   3342:            if ((CUR >= '0') && (CUR <= '9'))
                   3343:                val = val * 10 + (CUR - '0');
                   3344:            else {
                   3345:                htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
                   3346:                             "htmlParseCharRef: missing semicolon\n",
                   3347:                             NULL, NULL);
                   3348:                break;
                   3349:            }
                   3350:            NEXT;
                   3351:        }
                   3352:        if (CUR == ';')
                   3353:            NEXT;
                   3354:     } else {
                   3355:        htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
                   3356:                     "htmlParseCharRef: invalid value\n", NULL, NULL);
                   3357:     }
                   3358:     /*
                   3359:      * Check the value IS_CHAR ...
                   3360:      */
                   3361:     if (IS_CHAR(val)) {
                   3362:         return(val);
                   3363:     } else {
                   3364:        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                   3365:                        "htmlParseCharRef: invalid xmlChar value %d\n",
                   3366:                        val);
                   3367:     }
                   3368:     return(0);
                   3369: }
                   3370: 
                   3371: 
                   3372: /**
                   3373:  * htmlParseDocTypeDecl:
                   3374:  * @ctxt:  an HTML parser context
                   3375:  *
                   3376:  * parse a DOCTYPE declaration
                   3377:  *
                   3378:  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
                   3379:  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
                   3380:  */
                   3381: 
                   3382: static void
                   3383: htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
                   3384:     const xmlChar *name;
                   3385:     xmlChar *ExternalID = NULL;
                   3386:     xmlChar *URI = NULL;
                   3387: 
                   3388:     /*
                   3389:      * We know that '<!DOCTYPE' has been detected.
                   3390:      */
                   3391:     SKIP(9);
                   3392: 
                   3393:     SKIP_BLANKS;
                   3394: 
                   3395:     /*
                   3396:      * Parse the DOCTYPE name.
                   3397:      */
                   3398:     name = htmlParseName(ctxt);
                   3399:     if (name == NULL) {
                   3400:        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                   3401:                     "htmlParseDocTypeDecl : no DOCTYPE name !\n",
                   3402:                     NULL, NULL);
                   3403:     }
                   3404:     /*
                   3405:      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
                   3406:      */
                   3407: 
                   3408:     SKIP_BLANKS;
                   3409: 
                   3410:     /*
                   3411:      * Check for SystemID and ExternalID
                   3412:      */
                   3413:     URI = htmlParseExternalID(ctxt, &ExternalID);
                   3414:     SKIP_BLANKS;
                   3415: 
                   3416:     /*
                   3417:      * We should be at the end of the DOCTYPE declaration.
                   3418:      */
                   3419:     if (CUR != '>') {
                   3420:        htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
                   3421:                     "DOCTYPE improperly terminated\n", NULL, NULL);
                   3422:         /* We shouldn't try to resynchronize ... */
                   3423:     }
                   3424:     NEXT;
                   3425: 
                   3426:     /*
                   3427:      * Create or update the document accordingly to the DOCTYPE
                   3428:      */
                   3429:     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
                   3430:        (!ctxt->disableSAX))
                   3431:        ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
                   3432: 
                   3433:     /*
                   3434:      * Cleanup, since we don't use all those identifiers
                   3435:      */
                   3436:     if (URI != NULL) xmlFree(URI);
                   3437:     if (ExternalID != NULL) xmlFree(ExternalID);
                   3438: }
                   3439: 
                   3440: /**
                   3441:  * htmlParseAttribute:
                   3442:  * @ctxt:  an HTML parser context
                   3443:  * @value:  a xmlChar ** used to store the value of the attribute
                   3444:  *
                   3445:  * parse an attribute
                   3446:  *
                   3447:  * [41] Attribute ::= Name Eq AttValue
                   3448:  *
                   3449:  * [25] Eq ::= S? '=' S?
                   3450:  *
                   3451:  * With namespace:
                   3452:  *
                   3453:  * [NS 11] Attribute ::= QName Eq AttValue
                   3454:  *
                   3455:  * Also the case QName == xmlns:??? is handled independently as a namespace
                   3456:  * definition.
                   3457:  *
                   3458:  * Returns the attribute name, and the value in *value.
                   3459:  */
                   3460: 
                   3461: static const xmlChar *
                   3462: htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
                   3463:     const xmlChar *name;
                   3464:     xmlChar *val = NULL;
                   3465: 
                   3466:     *value = NULL;
                   3467:     name = htmlParseHTMLName(ctxt);
                   3468:     if (name == NULL) {
                   3469:        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                   3470:                     "error parsing attribute name\n", NULL, NULL);
                   3471:         return(NULL);
                   3472:     }
                   3473: 
                   3474:     /*
                   3475:      * read the value
                   3476:      */
                   3477:     SKIP_BLANKS;
                   3478:     if (CUR == '=') {
                   3479:         NEXT;
                   3480:        SKIP_BLANKS;
                   3481:        val = htmlParseAttValue(ctxt);
                   3482:     }
                   3483: 
                   3484:     *value = val;
                   3485:     return(name);
                   3486: }
                   3487: 
                   3488: /**
1.1.1.2   misho    3489:  * htmlCheckEncodingDirect:
1.1       misho    3490:  * @ctxt:  an HTML parser context
                   3491:  * @attvalue: the attribute value
                   3492:  *
1.1.1.2   misho    3493:  * Checks an attribute value to detect
1.1       misho    3494:  * the encoding
                   3495:  * If a new encoding is detected the parser is switched to decode
                   3496:  * it and pass UTF8
                   3497:  */
                   3498: static void
1.1.1.2   misho    3499: htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
1.1       misho    3500: 
1.1.1.2   misho    3501:     if ((ctxt == NULL) || (encoding == NULL) ||
                   3502:         (ctxt->options & HTML_PARSE_IGNORE_ENC))
1.1       misho    3503:        return;
                   3504: 
                   3505:     /* do not change encoding */
                   3506:     if (ctxt->input->encoding != NULL)
                   3507:         return;
                   3508: 
                   3509:     if (encoding != NULL) {
                   3510:        xmlCharEncoding enc;
                   3511:        xmlCharEncodingHandlerPtr handler;
                   3512: 
                   3513:        while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
                   3514: 
                   3515:        if (ctxt->input->encoding != NULL)
                   3516:            xmlFree((xmlChar *) ctxt->input->encoding);
                   3517:        ctxt->input->encoding = xmlStrdup(encoding);
                   3518: 
                   3519:        enc = xmlParseCharEncoding((const char *) encoding);
                   3520:        /*
                   3521:         * registered set of known encodings
                   3522:         */
                   3523:        if (enc != XML_CHAR_ENCODING_ERROR) {
                   3524:            if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
                   3525:                 (enc == XML_CHAR_ENCODING_UTF16BE) ||
                   3526:                 (enc == XML_CHAR_ENCODING_UCS4LE) ||
                   3527:                 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
                   3528:                (ctxt->input->buf != NULL) &&
                   3529:                (ctxt->input->buf->encoder == NULL)) {
                   3530:                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                   3531:                             "htmlCheckEncoding: wrong encoding meta\n",
                   3532:                             NULL, NULL);
                   3533:            } else {
                   3534:                xmlSwitchEncoding(ctxt, enc);
                   3535:            }
                   3536:            ctxt->charset = XML_CHAR_ENCODING_UTF8;
                   3537:        } else {
                   3538:            /*
                   3539:             * fallback for unknown encodings
                   3540:             */
                   3541:            handler = xmlFindCharEncodingHandler((const char *) encoding);
                   3542:            if (handler != NULL) {
                   3543:                xmlSwitchToEncoding(ctxt, handler);
                   3544:                ctxt->charset = XML_CHAR_ENCODING_UTF8;
                   3545:            } else {
1.1.1.2   misho    3546:                htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
                   3547:                             "htmlCheckEncoding: unknown encoding %s\n",
                   3548:                             encoding, NULL);
1.1       misho    3549:            }
                   3550:        }
                   3551: 
                   3552:        if ((ctxt->input->buf != NULL) &&
                   3553:            (ctxt->input->buf->encoder != NULL) &&
                   3554:            (ctxt->input->buf->raw != NULL) &&
                   3555:            (ctxt->input->buf->buffer != NULL)) {
                   3556:            int nbchars;
                   3557:            int processed;
                   3558: 
                   3559:            /*
                   3560:             * convert as much as possible to the parser reading buffer.
                   3561:             */
                   3562:            processed = ctxt->input->cur - ctxt->input->base;
1.1.1.3 ! misho    3563:            xmlBufShrink(ctxt->input->buf->buffer, processed);
        !          3564:            nbchars = xmlCharEncInput(ctxt->input->buf, 1);
1.1       misho    3565:            if (nbchars < 0) {
                   3566:                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                   3567:                             "htmlCheckEncoding: encoder error\n",
                   3568:                             NULL, NULL);
                   3569:            }
1.1.1.3 ! misho    3570:             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
1.1       misho    3571:        }
                   3572:     }
                   3573: }
                   3574: 
                   3575: /**
1.1.1.2   misho    3576:  * htmlCheckEncoding:
                   3577:  * @ctxt:  an HTML parser context
                   3578:  * @attvalue: the attribute value
                   3579:  *
                   3580:  * Checks an http-equiv attribute from a Meta tag to detect
                   3581:  * the encoding
                   3582:  * If a new encoding is detected the parser is switched to decode
                   3583:  * it and pass UTF8
                   3584:  */
                   3585: static void
                   3586: htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
                   3587:     const xmlChar *encoding;
                   3588: 
                   3589:     if (!attvalue)
                   3590:        return;
                   3591: 
                   3592:     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
                   3593:     if (encoding != NULL) {
                   3594:        encoding += 7;
                   3595:     }
                   3596:     /*
                   3597:      * skip blank
                   3598:      */
                   3599:     if (encoding && IS_BLANK_CH(*encoding))
                   3600:        encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
                   3601:     if (encoding && *encoding == '=') {
                   3602:        encoding ++;
                   3603:        htmlCheckEncodingDirect(ctxt, encoding);
                   3604:     }
                   3605: }
                   3606: 
                   3607: /**
1.1       misho    3608:  * htmlCheckMeta:
                   3609:  * @ctxt:  an HTML parser context
                   3610:  * @atts:  the attributes values
                   3611:  *
                   3612:  * Checks an attributes from a Meta tag
                   3613:  */
                   3614: static void
                   3615: htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
                   3616:     int i;
                   3617:     const xmlChar *att, *value;
                   3618:     int http = 0;
                   3619:     const xmlChar *content = NULL;
                   3620: 
                   3621:     if ((ctxt == NULL) || (atts == NULL))
                   3622:        return;
                   3623: 
                   3624:     i = 0;
                   3625:     att = atts[i++];
                   3626:     while (att != NULL) {
                   3627:        value = atts[i++];
                   3628:        if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
                   3629:         && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
                   3630:            http = 1;
1.1.1.2   misho    3631:        else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
                   3632:            htmlCheckEncodingDirect(ctxt, value);
1.1       misho    3633:        else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
                   3634:            content = value;
                   3635:        att = atts[i++];
                   3636:     }
                   3637:     if ((http) && (content != NULL))
                   3638:        htmlCheckEncoding(ctxt, content);
                   3639: 
                   3640: }
                   3641: 
                   3642: /**
                   3643:  * htmlParseStartTag:
                   3644:  * @ctxt:  an HTML parser context
                   3645:  *
                   3646:  * parse a start of tag either for rule element or
                   3647:  * EmptyElement. In both case we don't parse the tag closing chars.
                   3648:  *
                   3649:  * [40] STag ::= '<' Name (S Attribute)* S? '>'
                   3650:  *
                   3651:  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
                   3652:  *
                   3653:  * With namespace:
                   3654:  *
                   3655:  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
                   3656:  *
                   3657:  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
                   3658:  *
                   3659:  * Returns 0 in case of success, -1 in case of error and 1 if discarded
                   3660:  */
                   3661: 
                   3662: static int
                   3663: htmlParseStartTag(htmlParserCtxtPtr ctxt) {
                   3664:     const xmlChar *name;
                   3665:     const xmlChar *attname;
                   3666:     xmlChar *attvalue;
                   3667:     const xmlChar **atts;
                   3668:     int nbatts = 0;
                   3669:     int maxatts;
                   3670:     int meta = 0;
                   3671:     int i;
                   3672:     int discardtag = 0;
                   3673: 
                   3674:     if (ctxt->instate == XML_PARSER_EOF)
                   3675:         return(-1);
                   3676:     if ((ctxt == NULL) || (ctxt->input == NULL)) {
                   3677:        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   3678:                     "htmlParseStartTag: context error\n", NULL, NULL);
                   3679:        return -1;
                   3680:     }
                   3681:     if (CUR != '<') return -1;
                   3682:     NEXT;
                   3683: 
                   3684:     atts = ctxt->atts;
                   3685:     maxatts = ctxt->maxatts;
                   3686: 
                   3687:     GROW;
                   3688:     name = htmlParseHTMLName(ctxt);
                   3689:     if (name == NULL) {
                   3690:        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                   3691:                     "htmlParseStartTag: invalid element name\n",
                   3692:                     NULL, NULL);
                   3693:        /* Dump the bogus tag like browsers do */
                   3694:        while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
                   3695:                (ctxt->instate != XML_PARSER_EOF))
                   3696:            NEXT;
                   3697:         return -1;
                   3698:     }
                   3699:     if (xmlStrEqual(name, BAD_CAST"meta"))
                   3700:        meta = 1;
                   3701: 
                   3702:     /*
                   3703:      * Check for auto-closure of HTML elements.
                   3704:      */
                   3705:     htmlAutoClose(ctxt, name);
                   3706: 
                   3707:     /*
                   3708:      * Check for implied HTML elements.
                   3709:      */
                   3710:     htmlCheckImplied(ctxt, name);
                   3711: 
                   3712:     /*
                   3713:      * Avoid html at any level > 0, head at any level != 1
                   3714:      * or any attempt to recurse body
                   3715:      */
                   3716:     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
                   3717:        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                   3718:                     "htmlParseStartTag: misplaced <html> tag\n",
                   3719:                     name, NULL);
                   3720:        discardtag = 1;
                   3721:        ctxt->depth++;
                   3722:     }
                   3723:     if ((ctxt->nameNr != 1) &&
                   3724:        (xmlStrEqual(name, BAD_CAST"head"))) {
                   3725:        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                   3726:                     "htmlParseStartTag: misplaced <head> tag\n",
                   3727:                     name, NULL);
                   3728:        discardtag = 1;
                   3729:        ctxt->depth++;
                   3730:     }
                   3731:     if (xmlStrEqual(name, BAD_CAST"body")) {
                   3732:        int indx;
                   3733:        for (indx = 0;indx < ctxt->nameNr;indx++) {
                   3734:            if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
                   3735:                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                   3736:                             "htmlParseStartTag: misplaced <body> tag\n",
                   3737:                             name, NULL);
                   3738:                discardtag = 1;
                   3739:                ctxt->depth++;
                   3740:            }
                   3741:        }
                   3742:     }
                   3743: 
                   3744:     /*
                   3745:      * Now parse the attributes, it ends up with the ending
                   3746:      *
                   3747:      * (S Attribute)* S?
                   3748:      */
                   3749:     SKIP_BLANKS;
                   3750:     while ((IS_CHAR_CH(CUR)) &&
                   3751:            (CUR != '>') &&
                   3752:           ((CUR != '/') || (NXT(1) != '>'))) {
                   3753:        long cons = ctxt->nbChars;
                   3754: 
                   3755:        GROW;
                   3756:        attname = htmlParseAttribute(ctxt, &attvalue);
                   3757:         if (attname != NULL) {
                   3758: 
                   3759:            /*
                   3760:             * Well formedness requires at most one declaration of an attribute
                   3761:             */
                   3762:            for (i = 0; i < nbatts;i += 2) {
                   3763:                if (xmlStrEqual(atts[i], attname)) {
                   3764:                    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
                   3765:                                 "Attribute %s redefined\n", attname, NULL);
                   3766:                    if (attvalue != NULL)
                   3767:                        xmlFree(attvalue);
                   3768:                    goto failed;
                   3769:                }
                   3770:            }
                   3771: 
                   3772:            /*
                   3773:             * Add the pair to atts
                   3774:             */
                   3775:            if (atts == NULL) {
                   3776:                maxatts = 22; /* allow for 10 attrs by default */
                   3777:                atts = (const xmlChar **)
                   3778:                       xmlMalloc(maxatts * sizeof(xmlChar *));
                   3779:                if (atts == NULL) {
                   3780:                    htmlErrMemory(ctxt, NULL);
                   3781:                    if (attvalue != NULL)
                   3782:                        xmlFree(attvalue);
                   3783:                    goto failed;
                   3784:                }
                   3785:                ctxt->atts = atts;
                   3786:                ctxt->maxatts = maxatts;
                   3787:            } else if (nbatts + 4 > maxatts) {
                   3788:                const xmlChar **n;
                   3789: 
                   3790:                maxatts *= 2;
                   3791:                n = (const xmlChar **) xmlRealloc((void *) atts,
                   3792:                                             maxatts * sizeof(const xmlChar *));
                   3793:                if (n == NULL) {
                   3794:                    htmlErrMemory(ctxt, NULL);
                   3795:                    if (attvalue != NULL)
                   3796:                        xmlFree(attvalue);
                   3797:                    goto failed;
                   3798:                }
                   3799:                atts = n;
                   3800:                ctxt->atts = atts;
                   3801:                ctxt->maxatts = maxatts;
                   3802:            }
                   3803:            atts[nbatts++] = attname;
                   3804:            atts[nbatts++] = attvalue;
                   3805:            atts[nbatts] = NULL;
                   3806:            atts[nbatts + 1] = NULL;
                   3807:        }
                   3808:        else {
                   3809:            if (attvalue != NULL)
                   3810:                xmlFree(attvalue);
                   3811:            /* Dump the bogus attribute string up to the next blank or
                   3812:             * the end of the tag. */
                   3813:            while ((IS_CHAR_CH(CUR)) &&
                   3814:                   !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
                   3815:                   ((CUR != '/') || (NXT(1) != '>')))
                   3816:                NEXT;
                   3817:        }
                   3818: 
                   3819: failed:
                   3820:        SKIP_BLANKS;
                   3821:         if (cons == ctxt->nbChars) {
                   3822:            htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   3823:                         "htmlParseStartTag: problem parsing attributes\n",
                   3824:                         NULL, NULL);
                   3825:            break;
                   3826:        }
                   3827:     }
                   3828: 
                   3829:     /*
                   3830:      * Handle specific association to the META tag
                   3831:      */
                   3832:     if (meta && (nbatts != 0))
                   3833:        htmlCheckMeta(ctxt, atts);
                   3834: 
                   3835:     /*
                   3836:      * SAX: Start of Element !
                   3837:      */
                   3838:     if (!discardtag) {
                   3839:        htmlnamePush(ctxt, name);
                   3840:        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
                   3841:            if (nbatts != 0)
                   3842:                ctxt->sax->startElement(ctxt->userData, name, atts);
                   3843:            else
                   3844:                ctxt->sax->startElement(ctxt->userData, name, NULL);
                   3845:        }
                   3846:     }
                   3847: 
                   3848:     if (atts != NULL) {
                   3849:         for (i = 1;i < nbatts;i += 2) {
                   3850:            if (atts[i] != NULL)
                   3851:                xmlFree((xmlChar *) atts[i]);
                   3852:        }
                   3853:     }
                   3854: 
                   3855:     return(discardtag);
                   3856: }
                   3857: 
                   3858: /**
                   3859:  * htmlParseEndTag:
                   3860:  * @ctxt:  an HTML parser context
                   3861:  *
                   3862:  * parse an end of tag
                   3863:  *
                   3864:  * [42] ETag ::= '</' Name S? '>'
                   3865:  *
                   3866:  * With namespace
                   3867:  *
                   3868:  * [NS 9] ETag ::= '</' QName S? '>'
                   3869:  *
                   3870:  * Returns 1 if the current level should be closed.
                   3871:  */
                   3872: 
                   3873: static int
                   3874: htmlParseEndTag(htmlParserCtxtPtr ctxt)
                   3875: {
                   3876:     const xmlChar *name;
                   3877:     const xmlChar *oldname;
                   3878:     int i, ret;
                   3879: 
                   3880:     if ((CUR != '<') || (NXT(1) != '/')) {
                   3881:         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
                   3882:                     "htmlParseEndTag: '</' not found\n", NULL, NULL);
                   3883:         return (0);
                   3884:     }
                   3885:     SKIP(2);
                   3886: 
                   3887:     name = htmlParseHTMLName(ctxt);
                   3888:     if (name == NULL)
                   3889:         return (0);
                   3890:     /*
                   3891:      * We should definitely be at the ending "S? '>'" part
                   3892:      */
                   3893:     SKIP_BLANKS;
                   3894:     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
                   3895:         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                   3896:                     "End tag : expected '>'\n", NULL, NULL);
                   3897:        if (ctxt->recovery) {
                   3898:            /*
                   3899:             * We're not at the ending > !!
                   3900:             * Error, unless in recover mode where we search forwards
                   3901:             * until we find a >
                   3902:             */
                   3903:            while (CUR != '\0' && CUR != '>') NEXT;
                   3904:            NEXT;
                   3905:        }
                   3906:     } else
                   3907:         NEXT;
                   3908: 
                   3909:     /*
                   3910:      * if we ignored misplaced tags in htmlParseStartTag don't pop them
                   3911:      * out now.
                   3912:      */
                   3913:     if ((ctxt->depth > 0) &&
                   3914:         (xmlStrEqual(name, BAD_CAST "html") ||
                   3915:          xmlStrEqual(name, BAD_CAST "body") ||
                   3916:         xmlStrEqual(name, BAD_CAST "head"))) {
                   3917:        ctxt->depth--;
                   3918:        return (0);
                   3919:     }
                   3920: 
                   3921:     /*
                   3922:      * If the name read is not one of the element in the parsing stack
                   3923:      * then return, it's just an error.
                   3924:      */
                   3925:     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
                   3926:         if (xmlStrEqual(name, ctxt->nameTab[i]))
                   3927:             break;
                   3928:     }
                   3929:     if (i < 0) {
                   3930:         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
                   3931:                     "Unexpected end tag : %s\n", name, NULL);
                   3932:         return (0);
                   3933:     }
                   3934: 
                   3935: 
                   3936:     /*
                   3937:      * Check for auto-closure of HTML elements.
                   3938:      */
                   3939: 
                   3940:     htmlAutoCloseOnClose(ctxt, name);
                   3941: 
                   3942:     /*
                   3943:      * Well formedness constraints, opening and closing must match.
                   3944:      * With the exception that the autoclose may have popped stuff out
                   3945:      * of the stack.
                   3946:      */
                   3947:     if (!xmlStrEqual(name, ctxt->name)) {
                   3948:         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
                   3949:             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
                   3950:                         "Opening and ending tag mismatch: %s and %s\n",
                   3951:                         name, ctxt->name);
                   3952:         }
                   3953:     }
                   3954: 
                   3955:     /*
                   3956:      * SAX: End of Tag
                   3957:      */
                   3958:     oldname = ctxt->name;
                   3959:     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
                   3960:         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   3961:             ctxt->sax->endElement(ctxt->userData, name);
1.1.1.2   misho    3962:        htmlNodeInfoPop(ctxt);
1.1       misho    3963:         htmlnamePop(ctxt);
                   3964:         ret = 1;
                   3965:     } else {
                   3966:         ret = 0;
                   3967:     }
                   3968: 
                   3969:     return (ret);
                   3970: }
                   3971: 
                   3972: 
                   3973: /**
                   3974:  * htmlParseReference:
                   3975:  * @ctxt:  an HTML parser context
                   3976:  *
                   3977:  * parse and handle entity references in content,
                   3978:  * this will end-up in a call to character() since this is either a
                   3979:  * CharRef, or a predefined entity.
                   3980:  */
                   3981: static void
                   3982: htmlParseReference(htmlParserCtxtPtr ctxt) {
                   3983:     const htmlEntityDesc * ent;
                   3984:     xmlChar out[6];
                   3985:     const xmlChar *name;
                   3986:     if (CUR != '&') return;
                   3987: 
                   3988:     if (NXT(1) == '#') {
                   3989:        unsigned int c;
                   3990:        int bits, i = 0;
                   3991: 
                   3992:        c = htmlParseCharRef(ctxt);
                   3993:        if (c == 0)
                   3994:            return;
                   3995: 
                   3996:         if      (c <    0x80) { out[i++]= c;                bits= -6; }
                   3997:         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                   3998:         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                   3999:         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                   4000: 
                   4001:         for ( ; bits >= 0; bits-= 6) {
                   4002:             out[i++]= ((c >> bits) & 0x3F) | 0x80;
                   4003:         }
                   4004:        out[i] = 0;
                   4005: 
                   4006:        htmlCheckParagraph(ctxt);
                   4007:        if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                   4008:            ctxt->sax->characters(ctxt->userData, out, i);
                   4009:     } else {
                   4010:        ent = htmlParseEntityRef(ctxt, &name);
                   4011:        if (name == NULL) {
                   4012:            htmlCheckParagraph(ctxt);
                   4013:            if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                   4014:                ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
                   4015:            return;
                   4016:        }
                   4017:        if ((ent == NULL) || !(ent->value > 0)) {
                   4018:            htmlCheckParagraph(ctxt);
                   4019:            if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
                   4020:                ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
                   4021:                ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
                   4022:                /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
                   4023:            }
                   4024:        } else {
                   4025:            unsigned int c;
                   4026:            int bits, i = 0;
                   4027: 
                   4028:            c = ent->value;
                   4029:            if      (c <    0x80)
                   4030:                    { out[i++]= c;                bits= -6; }
                   4031:            else if (c <   0x800)
                   4032:                    { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                   4033:            else if (c < 0x10000)
                   4034:                    { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                   4035:            else
                   4036:                    { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                   4037: 
                   4038:            for ( ; bits >= 0; bits-= 6) {
                   4039:                out[i++]= ((c >> bits) & 0x3F) | 0x80;
                   4040:            }
                   4041:            out[i] = 0;
                   4042: 
                   4043:            htmlCheckParagraph(ctxt);
                   4044:            if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                   4045:                ctxt->sax->characters(ctxt->userData, out, i);
                   4046:        }
                   4047:     }
                   4048: }
                   4049: 
                   4050: /**
                   4051:  * htmlParseContent:
                   4052:  * @ctxt:  an HTML parser context
                   4053:  *
                   4054:  * Parse a content: comment, sub-element, reference or text.
                   4055:  * Kept for compatibility with old code
                   4056:  */
                   4057: 
                   4058: static void
                   4059: htmlParseContent(htmlParserCtxtPtr ctxt) {
                   4060:     xmlChar *currentNode;
                   4061:     int depth;
                   4062:     const xmlChar *name;
                   4063: 
                   4064:     currentNode = xmlStrdup(ctxt->name);
                   4065:     depth = ctxt->nameNr;
                   4066:     while (1) {
                   4067:        long cons = ctxt->nbChars;
                   4068: 
                   4069:         GROW;
                   4070: 
                   4071:         if (ctxt->instate == XML_PARSER_EOF)
                   4072:             break;
                   4073: 
                   4074:        /*
                   4075:         * Our tag or one of it's parent or children is ending.
                   4076:         */
                   4077:         if ((CUR == '<') && (NXT(1) == '/')) {
                   4078:            if (htmlParseEndTag(ctxt) &&
                   4079:                ((currentNode != NULL) || (ctxt->nameNr == 0))) {
                   4080:                if (currentNode != NULL)
                   4081:                    xmlFree(currentNode);
                   4082:                return;
                   4083:            }
                   4084:            continue; /* while */
                   4085:         }
                   4086: 
                   4087:        else if ((CUR == '<') &&
                   4088:                 ((IS_ASCII_LETTER(NXT(1))) ||
                   4089:                  (NXT(1) == '_') || (NXT(1) == ':'))) {
                   4090:            name = htmlParseHTMLName_nonInvasive(ctxt);
                   4091:            if (name == NULL) {
                   4092:                htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                   4093:                         "htmlParseStartTag: invalid element name\n",
                   4094:                         NULL, NULL);
                   4095:                /* Dump the bogus tag like browsers do */
                   4096:         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
                   4097:                    NEXT;
                   4098: 
                   4099:                if (currentNode != NULL)
                   4100:                    xmlFree(currentNode);
                   4101:                return;
                   4102:            }
                   4103: 
                   4104:            if (ctxt->name != NULL) {
                   4105:                if (htmlCheckAutoClose(name, ctxt->name) == 1) {
                   4106:                    htmlAutoClose(ctxt, name);
                   4107:                    continue;
                   4108:                }
                   4109:            }
                   4110:        }
                   4111: 
                   4112:        /*
                   4113:         * Has this node been popped out during parsing of
                   4114:         * the next element
                   4115:         */
                   4116:         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
                   4117:            (!xmlStrEqual(currentNode, ctxt->name)))
                   4118:             {
                   4119:            if (currentNode != NULL) xmlFree(currentNode);
                   4120:            return;
                   4121:        }
                   4122: 
                   4123:        if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
                   4124:            (xmlStrEqual(currentNode, BAD_CAST"style")))) {
                   4125:            /*
                   4126:             * Handle SCRIPT/STYLE separately
                   4127:             */
                   4128:            htmlParseScript(ctxt);
                   4129:        } else {
                   4130:            /*
                   4131:             * Sometimes DOCTYPE arrives in the middle of the document
                   4132:             */
                   4133:            if ((CUR == '<') && (NXT(1) == '!') &&
                   4134:                (UPP(2) == 'D') && (UPP(3) == 'O') &&
                   4135:                (UPP(4) == 'C') && (UPP(5) == 'T') &&
                   4136:                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                   4137:                (UPP(8) == 'E')) {
                   4138:                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                   4139:                             "Misplaced DOCTYPE declaration\n",
                   4140:                             BAD_CAST "DOCTYPE" , NULL);
                   4141:                htmlParseDocTypeDecl(ctxt);
                   4142:            }
                   4143: 
                   4144:            /*
                   4145:             * First case :  a comment
                   4146:             */
                   4147:            if ((CUR == '<') && (NXT(1) == '!') &&
                   4148:                (NXT(2) == '-') && (NXT(3) == '-')) {
                   4149:                htmlParseComment(ctxt);
                   4150:            }
                   4151: 
                   4152:            /*
                   4153:             * Second case : a Processing Instruction.
                   4154:             */
                   4155:            else if ((CUR == '<') && (NXT(1) == '?')) {
                   4156:                htmlParsePI(ctxt);
                   4157:            }
                   4158: 
                   4159:            /*
                   4160:             * Third case :  a sub-element.
                   4161:             */
                   4162:            else if (CUR == '<') {
                   4163:                htmlParseElement(ctxt);
                   4164:            }
                   4165: 
                   4166:            /*
                   4167:             * Fourth case : a reference. If if has not been resolved,
                   4168:             *    parsing returns it's Name, create the node
                   4169:             */
                   4170:            else if (CUR == '&') {
                   4171:                htmlParseReference(ctxt);
                   4172:            }
                   4173: 
                   4174:            /*
                   4175:             * Fifth case : end of the resource
                   4176:             */
                   4177:            else if (CUR == 0) {
                   4178:                htmlAutoCloseOnEnd(ctxt);
                   4179:                break;
                   4180:            }
                   4181: 
                   4182:            /*
                   4183:             * Last case, text. Note that References are handled directly.
                   4184:             */
                   4185:            else {
                   4186:                htmlParseCharData(ctxt);
                   4187:            }
                   4188: 
                   4189:            if (cons == ctxt->nbChars) {
                   4190:                if (ctxt->node != NULL) {
                   4191:                    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   4192:                                 "detected an error in element content\n",
                   4193:                                 NULL, NULL);
                   4194:                }
                   4195:                break;
                   4196:            }
                   4197:        }
                   4198:         GROW;
                   4199:     }
                   4200:     if (currentNode != NULL) xmlFree(currentNode);
                   4201: }
                   4202: 
                   4203: /**
                   4204:  * htmlParseElement:
                   4205:  * @ctxt:  an HTML parser context
                   4206:  *
                   4207:  * parse an HTML element, this is highly recursive
                   4208:  * this is kept for compatibility with previous code versions
                   4209:  *
                   4210:  * [39] element ::= EmptyElemTag | STag content ETag
                   4211:  *
                   4212:  * [41] Attribute ::= Name Eq AttValue
                   4213:  */
                   4214: 
                   4215: void
                   4216: htmlParseElement(htmlParserCtxtPtr ctxt) {
                   4217:     const xmlChar *name;
                   4218:     xmlChar *currentNode = NULL;
                   4219:     const htmlElemDesc * info;
                   4220:     htmlParserNodeInfo node_info;
                   4221:     int failed;
                   4222:     int depth;
                   4223:     const xmlChar *oldptr;
                   4224: 
                   4225:     if ((ctxt == NULL) || (ctxt->input == NULL)) {
                   4226:        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   4227:                     "htmlParseElement: context error\n", NULL, NULL);
                   4228:        return;
                   4229:     }
                   4230: 
                   4231:     if (ctxt->instate == XML_PARSER_EOF)
                   4232:         return;
                   4233: 
                   4234:     /* Capture start position */
                   4235:     if (ctxt->record_info) {
                   4236:         node_info.begin_pos = ctxt->input->consumed +
                   4237:                           (CUR_PTR - ctxt->input->base);
                   4238:        node_info.begin_line = ctxt->input->line;
                   4239:     }
                   4240: 
                   4241:     failed = htmlParseStartTag(ctxt);
                   4242:     name = ctxt->name;
                   4243:     if ((failed == -1) || (name == NULL)) {
                   4244:        if (CUR == '>')
                   4245:            NEXT;
                   4246:         return;
                   4247:     }
                   4248: 
                   4249:     /*
                   4250:      * Lookup the info for that element.
                   4251:      */
                   4252:     info = htmlTagLookup(name);
                   4253:     if (info == NULL) {
                   4254:        htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
                   4255:                     "Tag %s invalid\n", name, NULL);
                   4256:     }
                   4257: 
                   4258:     /*
                   4259:      * Check for an Empty Element labeled the XML/SGML way
                   4260:      */
                   4261:     if ((CUR == '/') && (NXT(1) == '>')) {
                   4262:         SKIP(2);
                   4263:        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   4264:            ctxt->sax->endElement(ctxt->userData, name);
                   4265:        htmlnamePop(ctxt);
                   4266:        return;
                   4267:     }
                   4268: 
                   4269:     if (CUR == '>') {
                   4270:         NEXT;
                   4271:     } else {
                   4272:        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                   4273:                     "Couldn't find end of Start Tag %s\n", name, NULL);
                   4274: 
                   4275:        /*
                   4276:         * end of parsing of this node.
                   4277:         */
                   4278:        if (xmlStrEqual(name, ctxt->name)) {
                   4279:            nodePop(ctxt);
                   4280:            htmlnamePop(ctxt);
                   4281:        }
                   4282: 
                   4283:        /*
                   4284:         * Capture end position and add node
                   4285:         */
                   4286:        if (ctxt->record_info) {
                   4287:           node_info.end_pos = ctxt->input->consumed +
                   4288:                              (CUR_PTR - ctxt->input->base);
                   4289:           node_info.end_line = ctxt->input->line;
                   4290:           node_info.node = ctxt->node;
                   4291:           xmlParserAddNodeInfo(ctxt, &node_info);
                   4292:        }
                   4293:        return;
                   4294:     }
                   4295: 
                   4296:     /*
                   4297:      * Check for an Empty Element from DTD definition
                   4298:      */
                   4299:     if ((info != NULL) && (info->empty)) {
                   4300:        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   4301:            ctxt->sax->endElement(ctxt->userData, name);
                   4302:        htmlnamePop(ctxt);
                   4303:        return;
                   4304:     }
                   4305: 
                   4306:     /*
                   4307:      * Parse the content of the element:
                   4308:      */
                   4309:     currentNode = xmlStrdup(ctxt->name);
                   4310:     depth = ctxt->nameNr;
                   4311:     while (IS_CHAR_CH(CUR)) {
                   4312:        oldptr = ctxt->input->cur;
                   4313:        htmlParseContent(ctxt);
                   4314:        if (oldptr==ctxt->input->cur) break;
                   4315:        if (ctxt->nameNr < depth) break;
                   4316:     }
                   4317: 
                   4318:     /*
                   4319:      * Capture end position and add node
                   4320:      */
                   4321:     if ( currentNode != NULL && ctxt->record_info ) {
                   4322:        node_info.end_pos = ctxt->input->consumed +
                   4323:                           (CUR_PTR - ctxt->input->base);
                   4324:        node_info.end_line = ctxt->input->line;
                   4325:        node_info.node = ctxt->node;
                   4326:        xmlParserAddNodeInfo(ctxt, &node_info);
                   4327:     }
                   4328:     if (!IS_CHAR_CH(CUR)) {
                   4329:        htmlAutoCloseOnEnd(ctxt);
                   4330:     }
                   4331: 
                   4332:     if (currentNode != NULL)
                   4333:        xmlFree(currentNode);
                   4334: }
                   4335: 
                   4336: static void
                   4337: htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
                   4338:     /*
                   4339:      * Capture end position and add node
                   4340:      */
                   4341:     if ( ctxt->node != NULL && ctxt->record_info ) {
                   4342:        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
                   4343:                                 (CUR_PTR - ctxt->input->base);
                   4344:        ctxt->nodeInfo->end_line = ctxt->input->line;
                   4345:        ctxt->nodeInfo->node = ctxt->node;
                   4346:        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
                   4347:        htmlNodeInfoPop(ctxt);
                   4348:     }
                   4349:     if (!IS_CHAR_CH(CUR)) {
                   4350:        htmlAutoCloseOnEnd(ctxt);
                   4351:     }
                   4352: }
                   4353: 
                   4354: /**
                   4355:  * htmlParseElementInternal:
                   4356:  * @ctxt:  an HTML parser context
                   4357:  *
                   4358:  * parse an HTML element, new version, non recursive
                   4359:  *
                   4360:  * [39] element ::= EmptyElemTag | STag content ETag
                   4361:  *
                   4362:  * [41] Attribute ::= Name Eq AttValue
                   4363:  */
                   4364: 
                   4365: static void
                   4366: htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
                   4367:     const xmlChar *name;
                   4368:     const htmlElemDesc * info;
                   4369:     htmlParserNodeInfo node_info;
                   4370:     int failed;
                   4371: 
                   4372:     if ((ctxt == NULL) || (ctxt->input == NULL)) {
                   4373:        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   4374:                     "htmlParseElementInternal: context error\n", NULL, NULL);
                   4375:        return;
                   4376:     }
                   4377: 
                   4378:     if (ctxt->instate == XML_PARSER_EOF)
                   4379:         return;
                   4380: 
                   4381:     /* Capture start position */
                   4382:     if (ctxt->record_info) {
                   4383:         node_info.begin_pos = ctxt->input->consumed +
                   4384:                           (CUR_PTR - ctxt->input->base);
                   4385:        node_info.begin_line = ctxt->input->line;
                   4386:     }
                   4387: 
                   4388:     failed = htmlParseStartTag(ctxt);
                   4389:     name = ctxt->name;
                   4390:     if ((failed == -1) || (name == NULL)) {
                   4391:        if (CUR == '>')
                   4392:            NEXT;
                   4393:         return;
                   4394:     }
                   4395: 
                   4396:     /*
                   4397:      * Lookup the info for that element.
                   4398:      */
                   4399:     info = htmlTagLookup(name);
                   4400:     if (info == NULL) {
                   4401:        htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
                   4402:                     "Tag %s invalid\n", name, NULL);
                   4403:     }
                   4404: 
                   4405:     /*
                   4406:      * Check for an Empty Element labeled the XML/SGML way
                   4407:      */
                   4408:     if ((CUR == '/') && (NXT(1) == '>')) {
                   4409:         SKIP(2);
                   4410:        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   4411:            ctxt->sax->endElement(ctxt->userData, name);
                   4412:        htmlnamePop(ctxt);
                   4413:        return;
                   4414:     }
                   4415: 
                   4416:     if (CUR == '>') {
                   4417:         NEXT;
                   4418:     } else {
                   4419:        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                   4420:                     "Couldn't find end of Start Tag %s\n", name, NULL);
                   4421: 
                   4422:        /*
                   4423:         * end of parsing of this node.
                   4424:         */
                   4425:        if (xmlStrEqual(name, ctxt->name)) {
                   4426:            nodePop(ctxt);
                   4427:            htmlnamePop(ctxt);
                   4428:        }
                   4429: 
                   4430:         if (ctxt->record_info)
                   4431:             htmlNodeInfoPush(ctxt, &node_info);
                   4432:         htmlParserFinishElementParsing(ctxt);
                   4433:        return;
                   4434:     }
                   4435: 
                   4436:     /*
                   4437:      * Check for an Empty Element from DTD definition
                   4438:      */
                   4439:     if ((info != NULL) && (info->empty)) {
                   4440:        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   4441:            ctxt->sax->endElement(ctxt->userData, name);
                   4442:        htmlnamePop(ctxt);
                   4443:        return;
                   4444:     }
                   4445: 
                   4446:     if (ctxt->record_info)
                   4447:         htmlNodeInfoPush(ctxt, &node_info);
                   4448: }
                   4449: 
/**
 * htmlParseContentInternal:
 * @ctxt:  an HTML parser context
 *
 * Parse a content: comment, sub-element, reference or text.
 * New version for non recursive htmlParseElementInternal
 *
 * The function loops over the input and dispatches on the current
 * characters. It keeps a private copy of the name of the element being
 * filled (currentNode) together with the name stack depth at the time the
 * copy was taken (depth), and uses both to detect that the element was
 * popped behind its back. The loop stops when the parser reaches the
 * XML_PARSER_EOF state, when the current character is 0, or when an
 * iteration of the generic branch did not change ctxt->nbChars.
 */

static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
    xmlChar *currentNode;   /* copy of ctxt->name for the tracked element */
    int depth;              /* ctxt->nameNr when currentNode was copied */
    const xmlChar *name;    /* name of the start tag being looked at */

    currentNode = xmlStrdup(ctxt->name);
    depth = ctxt->nameNr;
    while (1) {
        /* snapshot used below to detect an iteration without progress */
        long cons = ctxt->nbChars;

        GROW;

        if (ctxt->instate == XML_PARSER_EOF)
            break;

        /*
         * Our tag or one of its parents or children is ending.
         * When the end tag was accepted, start tracking whatever element
         * is now on top of the name stack.
         */
        if ((CUR == '<') && (NXT(1) == '/')) {
            if (htmlParseEndTag(ctxt) &&
                ((currentNode != NULL) || (ctxt->nameNr == 0))) {
                if (currentNode != NULL)
                    xmlFree(currentNode);

                currentNode = xmlStrdup(ctxt->name);
                depth = ctxt->nameNr;
            }
            continue; /* while */
        }

        /*
         * A start tag is coming: look at its name first to find out
         * whether it auto-closes the element currently open.
         */
        else if ((CUR == '<') &&
                 ((IS_ASCII_LETTER(NXT(1))) ||
                  (NXT(1) == '_') || (NXT(1) == ':'))) {
            name = htmlParseHTMLName_nonInvasive(ctxt);
            if (name == NULL) {
                htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                         "htmlParseStartTag: invalid element name\n",
                         NULL, NULL);
                /* Dump the bogus tag like browsers do */
                while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
                    NEXT;

                htmlParserFinishElementParsing(ctxt);
                if (currentNode != NULL)
                    xmlFree(currentNode);

                currentNode = xmlStrdup(ctxt->name);
                depth = ctxt->nameNr;
                continue;
            }

            if (ctxt->name != NULL) {
                if (htmlCheckAutoClose(name, ctxt->name) == 1) {
                    /*
                     * Close the open element and restart the iteration;
                     * the start tag itself has not been consumed yet.
                     */
                    htmlAutoClose(ctxt, name);
                    continue;
                }
            }
        }

        /*
         * Has this node been popped out during parsing of
         * the next element? If so, finish it and track the element that
         * is now on top of the name stack.
         */
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
            (!xmlStrEqual(currentNode, ctxt->name)))
             {
            htmlParserFinishElementParsing(ctxt);
            if (currentNode != NULL) xmlFree(currentNode);

            currentNode = xmlStrdup(ctxt->name);
            depth = ctxt->nameNr;
            continue;
        }

        if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
            (xmlStrEqual(currentNode, BAD_CAST"style")))) {
            /*
             * Handle SCRIPT/STYLE separately
             * (the no-progress check below is not applied on this path)
             */
            htmlParseScript(ctxt);
        } else {
            /*
             * Sometimes DOCTYPE arrives in the middle of the document.
             * This test is not part of the if/else chain below, which is
             * therefore evaluated at the position following the DOCTYPE.
             */
            if ((CUR == '<') && (NXT(1) == '!') &&
                (UPP(2) == 'D') && (UPP(3) == 'O') &&
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                (UPP(8) == 'E')) {
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                             "Misplaced DOCTYPE declaration\n",
                             BAD_CAST "DOCTYPE" , NULL);
                htmlParseDocTypeDecl(ctxt);
            }

            /*
             * First case :  a comment
             */
            if ((CUR == '<') && (NXT(1) == '!') &&
                (NXT(2) == '-') && (NXT(3) == '-')) {
                htmlParseComment(ctxt);
            }

            /*
             * Second case : a Processing Instruction.
             */
            else if ((CUR == '<') && (NXT(1) == '?')) {
                htmlParsePI(ctxt);
            }

            /*
             * Third case :  a sub-element. Only its start tag is parsed
             * here; the new top of the name stack becomes the tracked
             * element.
             */
            else if (CUR == '<') {
                htmlParseElementInternal(ctxt);
                if (currentNode != NULL) xmlFree(currentNode);

                currentNode = xmlStrdup(ctxt->name);
                depth = ctxt->nameNr;
            }

            /*
             * Fourth case : a reference. If it has not been resolved,
             *    parsing returns its Name, create the node
             */
            else if (CUR == '&') {
                htmlParseReference(ctxt);
            }

            /*
             * Fifth case : end of the resource
             */
            else if (CUR == 0) {
                htmlAutoCloseOnEnd(ctxt);
                break;
            }

            /*
             * Last case, text. Note that References are handled directly.
             */
            else {
                htmlParseCharData(ctxt);
            }

            /*
             * ctxt->nbChars did not change during this iteration: stop
             * instead of looping forever, and report an error if a node
             * is still open.
             */
            if (cons == ctxt->nbChars) {
                if (ctxt->node != NULL) {
                    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                                 "detected an error in element content\n",
                                 NULL, NULL);
                }
                break;
            }
        }
        GROW;
    }
    if (currentNode != NULL) xmlFree(currentNode);
}
                   4616: 
                   4617: /**
                   4618:  * htmlParseContent:
                   4619:  * @ctxt:  an HTML parser context
                   4620:  *
                   4621:  * Parse a content: comment, sub-element, reference or text.
                   4622:  * This is the entry point when called from parser.c
                   4623:  */
                   4624: 
                   4625: void
                   4626: __htmlParseContent(void *ctxt) {
                   4627:     if (ctxt != NULL)
                   4628:        htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
                   4629: }
                   4630: 
                   4631: /**
                   4632:  * htmlParseDocument:
                   4633:  * @ctxt:  an HTML parser context
                   4634:  *
                   4635:  * parse an HTML document (and build a tree if using the standard SAX
                   4636:  * interface).
                   4637:  *
                   4638:  * Returns 0, -1 in case of error. the parser context is augmented
                   4639:  *                as a result of the parsing.
                   4640:  */
                   4641: 
                   4642: int
                   4643: htmlParseDocument(htmlParserCtxtPtr ctxt) {
                   4644:     xmlChar start[4];
                   4645:     xmlCharEncoding enc;
                   4646:     xmlDtdPtr dtd;
                   4647: 
                   4648:     xmlInitParser();
                   4649: 
                   4650:     htmlDefaultSAXHandlerInit();
                   4651: 
                   4652:     if ((ctxt == NULL) || (ctxt->input == NULL)) {
                   4653:        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   4654:                     "htmlParseDocument: context error\n", NULL, NULL);
                   4655:        return(XML_ERR_INTERNAL_ERROR);
                   4656:     }
                   4657:     ctxt->html = 1;
                   4658:     ctxt->linenumbers = 1;
                   4659:     GROW;
                   4660:     /*
                   4661:      * SAX: beginning of the document processing.
                   4662:      */
                   4663:     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                   4664:         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
                   4665: 
                   4666:     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
                   4667:         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
                   4668:        /*
                   4669:         * Get the 4 first bytes and decode the charset
                   4670:         * if enc != XML_CHAR_ENCODING_NONE
                   4671:         * plug some encoding conversion routines.
                   4672:         */
                   4673:        start[0] = RAW;
                   4674:        start[1] = NXT(1);
                   4675:        start[2] = NXT(2);
                   4676:        start[3] = NXT(3);
                   4677:        enc = xmlDetectCharEncoding(&start[0], 4);
                   4678:        if (enc != XML_CHAR_ENCODING_NONE) {
                   4679:            xmlSwitchEncoding(ctxt, enc);
                   4680:        }
                   4681:     }
                   4682: 
                   4683:     /*
                   4684:      * Wipe out everything which is before the first '<'
                   4685:      */
                   4686:     SKIP_BLANKS;
                   4687:     if (CUR == 0) {
                   4688:        htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
                   4689:                     "Document is empty\n", NULL, NULL);
                   4690:     }
                   4691: 
                   4692:     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
                   4693:        ctxt->sax->startDocument(ctxt->userData);
                   4694: 
                   4695: 
                   4696:     /*
                   4697:      * Parse possible comments and PIs before any content
                   4698:      */
                   4699:     while (((CUR == '<') && (NXT(1) == '!') &&
                   4700:             (NXT(2) == '-') && (NXT(3) == '-')) ||
                   4701:           ((CUR == '<') && (NXT(1) == '?'))) {
                   4702:         htmlParseComment(ctxt);
                   4703:         htmlParsePI(ctxt);
                   4704:        SKIP_BLANKS;
                   4705:     }
                   4706: 
                   4707: 
                   4708:     /*
                   4709:      * Then possibly doc type declaration(s) and more Misc
                   4710:      * (doctypedecl Misc*)?
                   4711:      */
                   4712:     if ((CUR == '<') && (NXT(1) == '!') &&
                   4713:        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                   4714:        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                   4715:        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                   4716:        (UPP(8) == 'E')) {
                   4717:        htmlParseDocTypeDecl(ctxt);
                   4718:     }
                   4719:     SKIP_BLANKS;
                   4720: 
                   4721:     /*
                   4722:      * Parse possible comments and PIs before any content
                   4723:      */
                   4724:     while (((CUR == '<') && (NXT(1) == '!') &&
                   4725:             (NXT(2) == '-') && (NXT(3) == '-')) ||
                   4726:           ((CUR == '<') && (NXT(1) == '?'))) {
                   4727:         htmlParseComment(ctxt);
                   4728:         htmlParsePI(ctxt);
                   4729:        SKIP_BLANKS;
                   4730:     }
                   4731: 
                   4732:     /*
                   4733:      * Time to start parsing the tree itself
                   4734:      */
                   4735:     htmlParseContentInternal(ctxt);
                   4736: 
                   4737:     /*
                   4738:      * autoclose
                   4739:      */
                   4740:     if (CUR == 0)
                   4741:        htmlAutoCloseOnEnd(ctxt);
                   4742: 
                   4743: 
                   4744:     /*
                   4745:      * SAX: end of the document processing.
                   4746:      */
                   4747:     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                   4748:         ctxt->sax->endDocument(ctxt->userData);
                   4749: 
                   4750:     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
                   4751:        dtd = xmlGetIntSubset(ctxt->myDoc);
                   4752:        if (dtd == NULL)
                   4753:            ctxt->myDoc->intSubset =
                   4754:                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
                   4755:                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                   4756:                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
                   4757:     }
                   4758:     if (! ctxt->wellFormed) return(-1);
                   4759:     return(0);
                   4760: }
                   4761: 
                   4762: 
                   4763: /************************************************************************
                   4764:  *                                                                     *
                   4765:  *                     Parser contexts handling                        *
                   4766:  *                                                                     *
                   4767:  ************************************************************************/
                   4768: 
                   4769: /**
                   4770:  * htmlInitParserCtxt:
                   4771:  * @ctxt:  an HTML parser context
                   4772:  *
                   4773:  * Initialize a parser context
                   4774:  *
                   4775:  * Returns 0 in case of success and -1 in case of error
                   4776:  */
                   4777: 
                   4778: static int
                   4779: htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
                   4780: {
                   4781:     htmlSAXHandler *sax;
                   4782: 
                   4783:     if (ctxt == NULL) return(-1);
                   4784:     memset(ctxt, 0, sizeof(htmlParserCtxt));
                   4785: 
                   4786:     ctxt->dict = xmlDictCreate();
                   4787:     if (ctxt->dict == NULL) {
                   4788:         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
                   4789:        return(-1);
                   4790:     }
                   4791:     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
                   4792:     if (sax == NULL) {
                   4793:         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
                   4794:        return(-1);
                   4795:     }
                   4796:     else
                   4797:         memset(sax, 0, sizeof(htmlSAXHandler));
                   4798: 
                   4799:     /* Allocate the Input stack */
                   4800:     ctxt->inputTab = (htmlParserInputPtr *)
                   4801:                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
                   4802:     if (ctxt->inputTab == NULL) {
                   4803:         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
                   4804:        ctxt->inputNr = 0;
                   4805:        ctxt->inputMax = 0;
                   4806:        ctxt->input = NULL;
                   4807:        return(-1);
                   4808:     }
                   4809:     ctxt->inputNr = 0;
                   4810:     ctxt->inputMax = 5;
                   4811:     ctxt->input = NULL;
                   4812:     ctxt->version = NULL;
                   4813:     ctxt->encoding = NULL;
                   4814:     ctxt->standalone = -1;
                   4815:     ctxt->instate = XML_PARSER_START;
                   4816: 
                   4817:     /* Allocate the Node stack */
                   4818:     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
                   4819:     if (ctxt->nodeTab == NULL) {
                   4820:         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
                   4821:        ctxt->nodeNr = 0;
                   4822:        ctxt->nodeMax = 0;
                   4823:        ctxt->node = NULL;
                   4824:        ctxt->inputNr = 0;
                   4825:        ctxt->inputMax = 0;
                   4826:        ctxt->input = NULL;
                   4827:        return(-1);
                   4828:     }
                   4829:     ctxt->nodeNr = 0;
                   4830:     ctxt->nodeMax = 10;
                   4831:     ctxt->node = NULL;
                   4832: 
                   4833:     /* Allocate the Name stack */
                   4834:     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
                   4835:     if (ctxt->nameTab == NULL) {
                   4836:         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
                   4837:        ctxt->nameNr = 0;
                   4838:        ctxt->nameMax = 0;
                   4839:        ctxt->name = NULL;
                   4840:        ctxt->nodeNr = 0;
                   4841:        ctxt->nodeMax = 0;
                   4842:        ctxt->node = NULL;
                   4843:        ctxt->inputNr = 0;
                   4844:        ctxt->inputMax = 0;
                   4845:        ctxt->input = NULL;
                   4846:        return(-1);
                   4847:     }
                   4848:     ctxt->nameNr = 0;
                   4849:     ctxt->nameMax = 10;
                   4850:     ctxt->name = NULL;
                   4851: 
                   4852:     ctxt->nodeInfoTab = NULL;
                   4853:     ctxt->nodeInfoNr  = 0;
                   4854:     ctxt->nodeInfoMax = 0;
                   4855: 
                   4856:     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
                   4857:     else {
                   4858:         ctxt->sax = sax;
                   4859:        memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
                   4860:     }
                   4861:     ctxt->userData = ctxt;
                   4862:     ctxt->myDoc = NULL;
                   4863:     ctxt->wellFormed = 1;
                   4864:     ctxt->replaceEntities = 0;
                   4865:     ctxt->linenumbers = xmlLineNumbersDefaultValue;
                   4866:     ctxt->html = 1;
                   4867:     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
                   4868:     ctxt->vctxt.userData = ctxt;
                   4869:     ctxt->vctxt.error = xmlParserValidityError;
                   4870:     ctxt->vctxt.warning = xmlParserValidityWarning;
                   4871:     ctxt->record_info = 0;
                   4872:     ctxt->validate = 0;
                   4873:     ctxt->nbChars = 0;
                   4874:     ctxt->checkIndex = 0;
                   4875:     ctxt->catalogs = NULL;
                   4876:     xmlInitNodeInfoSeq(&ctxt->node_seq);
                   4877:     return(0);
                   4878: }
                   4879: 
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * The work is entirely delegated to xmlFreeParserCtxt(), to which the
 * context is passed unchanged.
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
                   4893: 
                   4894: /**
                   4895:  * htmlNewParserCtxt:
                   4896:  *
                   4897:  * Allocate and initialize a new parser context.
                   4898:  *
                   4899:  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
                   4900:  */
                   4901: 
                   4902: htmlParserCtxtPtr
                   4903: htmlNewParserCtxt(void)
                   4904: {
                   4905:     xmlParserCtxtPtr ctxt;
                   4906: 
                   4907:     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
                   4908:     if (ctxt == NULL) {
                   4909:         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
                   4910:        return(NULL);
                   4911:     }
                   4912:     memset(ctxt, 0, sizeof(xmlParserCtxt));
                   4913:     if (htmlInitParserCtxt(ctxt) < 0) {
                   4914:         htmlFreeParserCtxt(ctxt);
                   4915:        return(NULL);
                   4916:     }
                   4917:     return(ctxt);
                   4918: }
                   4919: 
                   4920: /**
                   4921:  * htmlCreateMemoryParserCtxt:
                   4922:  * @buffer:  a pointer to a char array
                   4923:  * @size:  the size of the array
                   4924:  *
                   4925:  * Create a parser context for an HTML in-memory document.
                   4926:  *
                   4927:  * Returns the new parser context or NULL
                   4928:  */
                   4929: htmlParserCtxtPtr
                   4930: htmlCreateMemoryParserCtxt(const char *buffer, int size) {
                   4931:     xmlParserCtxtPtr ctxt;
                   4932:     xmlParserInputPtr input;
                   4933:     xmlParserInputBufferPtr buf;
                   4934: 
                   4935:     if (buffer == NULL)
                   4936:        return(NULL);
                   4937:     if (size <= 0)
                   4938:        return(NULL);
                   4939: 
                   4940:     ctxt = htmlNewParserCtxt();
                   4941:     if (ctxt == NULL)
                   4942:        return(NULL);
                   4943: 
                   4944:     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
                   4945:     if (buf == NULL) return(NULL);
                   4946: 
                   4947:     input = xmlNewInputStream(ctxt);
                   4948:     if (input == NULL) {
                   4949:        xmlFreeParserCtxt(ctxt);
                   4950:        return(NULL);
                   4951:     }
                   4952: 
                   4953:     input->filename = NULL;
                   4954:     input->buf = buf;
1.1.1.3 ! misho    4955:     xmlBufResetInput(buf->buffer, input);
1.1       misho    4956: 
                   4957:     inputPush(ctxt, input);
                   4958:     return(ctxt);
                   4959: }
                   4960: 
                   4961: /**
                   4962:  * htmlCreateDocParserCtxt:
                   4963:  * @cur:  a pointer to an array of xmlChar
                   4964:  * @encoding:  a free form C string describing the HTML document encoding, or NULL
                   4965:  *
                   4966:  * Create a parser context for an HTML document.
                   4967:  *
                   4968:  * TODO: check the need to add encoding handling there
                   4969:  *
                   4970:  * Returns the new parser context or NULL
                   4971:  */
                   4972: static htmlParserCtxtPtr
                   4973: htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
                   4974:     int len;
                   4975:     htmlParserCtxtPtr ctxt;
                   4976: 
                   4977:     if (cur == NULL)
                   4978:        return(NULL);
                   4979:     len = xmlStrlen(cur);
                   4980:     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
                   4981:     if (ctxt == NULL)
                   4982:        return(NULL);
                   4983: 
                   4984:     if (encoding != NULL) {
                   4985:        xmlCharEncoding enc;
                   4986:        xmlCharEncodingHandlerPtr handler;
                   4987: 
                   4988:        if (ctxt->input->encoding != NULL)
                   4989:            xmlFree((xmlChar *) ctxt->input->encoding);
                   4990:        ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
                   4991: 
                   4992:        enc = xmlParseCharEncoding(encoding);
                   4993:        /*
                   4994:         * registered set of known encodings
                   4995:         */
                   4996:        if (enc != XML_CHAR_ENCODING_ERROR) {
                   4997:            xmlSwitchEncoding(ctxt, enc);
                   4998:            if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
                   4999:                htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
                   5000:                             "Unsupported encoding %s\n",
                   5001:                             (const xmlChar *) encoding, NULL);
                   5002:            }
                   5003:        } else {
                   5004:            /*
                   5005:             * fallback for unknown encodings
                   5006:             */
                   5007:            handler = xmlFindCharEncodingHandler((const char *) encoding);
                   5008:            if (handler != NULL) {
                   5009:                xmlSwitchToEncoding(ctxt, handler);
                   5010:            } else {
                   5011:                htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
                   5012:                             "Unsupported encoding %s\n",
                   5013:                             (const xmlChar *) encoding, NULL);
                   5014:            }
                   5015:        }
                   5016:     }
                   5017:     return(ctxt);
                   5018: }
                   5019: 
                   5020: #ifdef LIBXML_PUSH_ENABLED
                   5021: /************************************************************************
                   5022:  *                                                                     *
                   5023:  *     Progressive parsing interfaces                          *
                   5024:  *                                                                     *
                   5025:  ************************************************************************/
                   5026: 
                   5027: /**
                   5028:  * htmlParseLookupSequence:
                   5029:  * @ctxt:  an HTML parser context
                   5030:  * @first:  the first char to lookup
                   5031:  * @next:  the next char to lookup or zero
                   5032:  * @third:  the next char to lookup or zero
                   5033:  * @comment: flag to force checking inside comments
                   5034:  *
                   5035:  * Try to find if a sequence (first, next, third) or  just (first next) or
                   5036:  * (first) is available in the input stream.
                   5037:  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
                   5038:  * to avoid rescanning sequences of bytes, it DOES change the state of the
                   5039:  * parser, do not use liberally.
                   5040:  * This is basically similar to xmlParseLookupSequence()
                   5041:  *
                   5042:  * Returns the index to the current parsing point if the full sequence
                   5043:  *      is available, -1 otherwise.
                   5044:  */
                   5045: static int
                   5046: htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                   5047:                         xmlChar next, xmlChar third, int iscomment,
                   5048:                         int ignoreattrval)
                   5049: {
                   5050:     int base, len;
                   5051:     htmlParserInputPtr in;
                   5052:     const xmlChar *buf;
                   5053:     int incomment = 0;
                   5054:     int invalue = 0;
                   5055:     char valdellim = 0x0;
                   5056: 
                   5057:     in = ctxt->input;
                   5058:     if (in == NULL)
                   5059:         return (-1);
                   5060: 
                   5061:     base = in->cur - in->base;
                   5062:     if (base < 0)
                   5063:         return (-1);
                   5064: 
                   5065:     if (ctxt->checkIndex > base)
                   5066:         base = ctxt->checkIndex;
                   5067: 
                   5068:     if (in->buf == NULL) {
                   5069:         buf = in->base;
                   5070:         len = in->length;
                   5071:     } else {
1.1.1.3 ! misho    5072:         buf = xmlBufContent(in->buf->buffer);
        !          5073:         len = xmlBufUse(in->buf->buffer);
1.1       misho    5074:     }
                   5075: 
                   5076:     /* take into account the sequence length */
                   5077:     if (third)
                   5078:         len -= 2;
                   5079:     else if (next)
                   5080:         len--;
                   5081:     for (; base < len; base++) {
                   5082:         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
                   5083:             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                   5084:                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                   5085:                 incomment = 1;
                   5086:                 /* do not increment past <! - some people use <!--> */
                   5087:                 base += 2;
                   5088:             }
                   5089:         }
                   5090:         if (ignoreattrval) {
                   5091:             if (buf[base] == '"' || buf[base] == '\'') {
                   5092:                 if (invalue) {
                   5093:                     if (buf[base] == valdellim) {
                   5094:                         invalue = 0;
                   5095:                         continue;
                   5096:                     }
                   5097:                 } else {
                   5098:                     valdellim = buf[base];
                   5099:                     invalue = 1;
                   5100:                     continue;
                   5101:                 }
                   5102:             } else if (invalue) {
                   5103:                 continue;
                   5104:             }
                   5105:         }
                   5106:         if (incomment) {
                   5107:             if (base + 3 > len)
                   5108:                 return (-1);
                   5109:             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                   5110:                 (buf[base + 2] == '>')) {
                   5111:                 incomment = 0;
                   5112:                 base += 2;
                   5113:             }
                   5114:             continue;
                   5115:         }
                   5116:         if (buf[base] == first) {
                   5117:             if (third != 0) {
                   5118:                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
                   5119:                     continue;
                   5120:             } else if (next != 0) {
                   5121:                 if (buf[base + 1] != next)
                   5122:                     continue;
                   5123:             }
                   5124:             ctxt->checkIndex = 0;
                   5125: #ifdef DEBUG_PUSH
                   5126:             if (next == 0)
                   5127:                 xmlGenericError(xmlGenericErrorContext,
                   5128:                                 "HPP: lookup '%c' found at %d\n",
                   5129:                                 first, base);
                   5130:             else if (third == 0)
                   5131:                 xmlGenericError(xmlGenericErrorContext,
                   5132:                                 "HPP: lookup '%c%c' found at %d\n",
                   5133:                                 first, next, base);
                   5134:             else
                   5135:                 xmlGenericError(xmlGenericErrorContext,
                   5136:                                 "HPP: lookup '%c%c%c' found at %d\n",
                   5137:                                 first, next, third, base);
                   5138: #endif
                   5139:             return (base - (in->cur - in->base));
                   5140:         }
                   5141:     }
                   5142:     if ((!incomment) && (!invalue))
                   5143:         ctxt->checkIndex = base;
                   5144: #ifdef DEBUG_PUSH
                   5145:     if (next == 0)
                   5146:         xmlGenericError(xmlGenericErrorContext,
                   5147:                         "HPP: lookup '%c' failed\n", first);
                   5148:     else if (third == 0)
                   5149:         xmlGenericError(xmlGenericErrorContext,
                   5150:                         "HPP: lookup '%c%c' failed\n", first, next);
                   5151:     else
                   5152:         xmlGenericError(xmlGenericErrorContext,
                   5153:                         "HPP: lookup '%c%c%c' failed\n", first, next,
                   5154:                         third);
                   5155: #endif
                   5156:     return (-1);
                   5157: }
                   5158: 
                   5159: /**
                   5160:  * htmlParseLookupChars:
                   5161:  * @ctxt: an HTML parser context
                   5162:  * @stop: Array of chars, which stop the lookup.
                   5163:  * @stopLen: Length of stop-Array
                   5164:  *
1.1.1.3 ! misho    5165:  * Try to find if any char of the stop-Array is available in the input
1.1       misho    5166:  * stream.
                   5167:  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
                   5168:  * to avoid rescanning sequences of bytes, it DOES change the state of the
                   5169:  * parser, do not use liberally.
                   5170:  *
1.1.1.3 ! misho    5171:  * Returns the index to the current parsing point if a stopChar
1.1       misho    5172:  *      is available, -1 otherwise.
                   5173:  */
                   5174: static int
                   5175: htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
                   5176:                      int stopLen)
                   5177: {
                   5178:     int base, len;
                   5179:     htmlParserInputPtr in;
                   5180:     const xmlChar *buf;
                   5181:     int incomment = 0;
                   5182:     int i;
                   5183: 
                   5184:     in = ctxt->input;
                   5185:     if (in == NULL)
                   5186:         return (-1);
                   5187: 
                   5188:     base = in->cur - in->base;
                   5189:     if (base < 0)
                   5190:         return (-1);
                   5191: 
                   5192:     if (ctxt->checkIndex > base)
                   5193:         base = ctxt->checkIndex;
                   5194: 
                   5195:     if (in->buf == NULL) {
                   5196:         buf = in->base;
                   5197:         len = in->length;
                   5198:     } else {
1.1.1.3 ! misho    5199:         buf = xmlBufContent(in->buf->buffer);
        !          5200:         len = xmlBufUse(in->buf->buffer);
1.1       misho    5201:     }
                   5202: 
                   5203:     for (; base < len; base++) {
                   5204:         if (!incomment && (base + 4 < len)) {
                   5205:             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                   5206:                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                   5207:                 incomment = 1;
                   5208:                 /* do not increment past <! - some people use <!--> */
                   5209:                 base += 2;
                   5210:             }
                   5211:         }
                   5212:         if (incomment) {
                   5213:             if (base + 3 > len)
                   5214:                 return (-1);
                   5215:             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                   5216:                 (buf[base + 2] == '>')) {
                   5217:                 incomment = 0;
                   5218:                 base += 2;
                   5219:             }
                   5220:             continue;
                   5221:         }
                   5222:         for (i = 0; i < stopLen; ++i) {
                   5223:             if (buf[base] == stop[i]) {
                   5224:                 ctxt->checkIndex = 0;
                   5225:                 return (base - (in->cur - in->base));
                   5226:             }
                   5227:         }
                   5228:     }
                   5229:     ctxt->checkIndex = base;
                   5230:     return (-1);
                   5231: }
                   5232: 
                   5233: /**
                   5234:  * htmlParseTryOrFinish:
                   5235:  * @ctxt:  an HTML parser context
                   5236:  * @terminate:  last chunk indicator
                   5237:  *
                   5238:  * Try to progress on parsing
                   5239:  *
                   5240:  * Returns zero if no parsing was possible
                   5241:  */
                   5242: static int
                   5243: htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                   5244:     int ret = 0;
                   5245:     htmlParserInputPtr in;
                   5246:     int avail = 0;
                   5247:     xmlChar cur, next;
                   5248: 
1.1.1.2   misho    5249:     htmlParserNodeInfo node_info;
                   5250: 
1.1       misho    5251: #ifdef DEBUG_PUSH
                   5252:     switch (ctxt->instate) {
                   5253:        case XML_PARSER_EOF:
                   5254:            xmlGenericError(xmlGenericErrorContext,
                   5255:                    "HPP: try EOF\n"); break;
                   5256:        case XML_PARSER_START:
                   5257:            xmlGenericError(xmlGenericErrorContext,
                   5258:                    "HPP: try START\n"); break;
                   5259:        case XML_PARSER_MISC:
                   5260:            xmlGenericError(xmlGenericErrorContext,
                   5261:                    "HPP: try MISC\n");break;
                   5262:        case XML_PARSER_COMMENT:
                   5263:            xmlGenericError(xmlGenericErrorContext,
                   5264:                    "HPP: try COMMENT\n");break;
                   5265:        case XML_PARSER_PROLOG:
                   5266:            xmlGenericError(xmlGenericErrorContext,
                   5267:                    "HPP: try PROLOG\n");break;
                   5268:        case XML_PARSER_START_TAG:
                   5269:            xmlGenericError(xmlGenericErrorContext,
                   5270:                    "HPP: try START_TAG\n");break;
                   5271:        case XML_PARSER_CONTENT:
                   5272:            xmlGenericError(xmlGenericErrorContext,
                   5273:                    "HPP: try CONTENT\n");break;
                   5274:        case XML_PARSER_CDATA_SECTION:
                   5275:            xmlGenericError(xmlGenericErrorContext,
                   5276:                    "HPP: try CDATA_SECTION\n");break;
                   5277:        case XML_PARSER_END_TAG:
                   5278:            xmlGenericError(xmlGenericErrorContext,
                   5279:                    "HPP: try END_TAG\n");break;
                   5280:        case XML_PARSER_ENTITY_DECL:
                   5281:            xmlGenericError(xmlGenericErrorContext,
                   5282:                    "HPP: try ENTITY_DECL\n");break;
                   5283:        case XML_PARSER_ENTITY_VALUE:
                   5284:            xmlGenericError(xmlGenericErrorContext,
                   5285:                    "HPP: try ENTITY_VALUE\n");break;
                   5286:        case XML_PARSER_ATTRIBUTE_VALUE:
                   5287:            xmlGenericError(xmlGenericErrorContext,
                   5288:                    "HPP: try ATTRIBUTE_VALUE\n");break;
                   5289:        case XML_PARSER_DTD:
                   5290:            xmlGenericError(xmlGenericErrorContext,
                   5291:                    "HPP: try DTD\n");break;
                   5292:        case XML_PARSER_EPILOG:
                   5293:            xmlGenericError(xmlGenericErrorContext,
                   5294:                    "HPP: try EPILOG\n");break;
                   5295:        case XML_PARSER_PI:
                   5296:            xmlGenericError(xmlGenericErrorContext,
                   5297:                    "HPP: try PI\n");break;
                   5298:        case XML_PARSER_SYSTEM_LITERAL:
                   5299:            xmlGenericError(xmlGenericErrorContext,
                   5300:                    "HPP: try SYSTEM_LITERAL\n");break;
                   5301:     }
                   5302: #endif
                   5303: 
                   5304:     while (1) {
                   5305: 
                   5306:        in = ctxt->input;
                   5307:        if (in == NULL) break;
                   5308:        if (in->buf == NULL)
                   5309:            avail = in->length - (in->cur - in->base);
                   5310:        else
1.1.1.3 ! misho    5311:            avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
1.1       misho    5312:        if ((avail == 0) && (terminate)) {
                   5313:            htmlAutoCloseOnEnd(ctxt);
                   5314:            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                   5315:                /*
                   5316:                 * SAX: end of the document processing.
                   5317:                 */
                   5318:                ctxt->instate = XML_PARSER_EOF;
                   5319:                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                   5320:                    ctxt->sax->endDocument(ctxt->userData);
                   5321:            }
                   5322:        }
                   5323:         if (avail < 1)
                   5324:            goto done;
                   5325:        cur = in->cur[0];
                   5326:        if (cur == 0) {
                   5327:            SKIP(1);
                   5328:            continue;
                   5329:        }
                   5330: 
                   5331:         switch (ctxt->instate) {
                   5332:             case XML_PARSER_EOF:
                   5333:                /*
                   5334:                 * Document parsing is done !
                   5335:                 */
                   5336:                goto done;
                   5337:             case XML_PARSER_START:
                   5338:                /*
                   5339:                 * Very first chars read from the document flow.
                   5340:                 */
                   5341:                cur = in->cur[0];
                   5342:                if (IS_BLANK_CH(cur)) {
                   5343:                    SKIP_BLANKS;
                   5344:                    if (in->buf == NULL)
                   5345:                        avail = in->length - (in->cur - in->base);
                   5346:                    else
1.1.1.3 ! misho    5347:                        avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
1.1       misho    5348:                }
                   5349:                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                   5350:                    ctxt->sax->setDocumentLocator(ctxt->userData,
                   5351:                                                  &xmlDefaultSAXLocator);
                   5352:                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                   5353:                    (!ctxt->disableSAX))
                   5354:                    ctxt->sax->startDocument(ctxt->userData);
                   5355: 
                   5356:                cur = in->cur[0];
                   5357:                next = in->cur[1];
                   5358:                if ((cur == '<') && (next == '!') &&
                   5359:                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                   5360:                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                   5361:                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                   5362:                    (UPP(8) == 'E')) {
                   5363:                    if ((!terminate) &&
                   5364:                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                   5365:                        goto done;
                   5366: #ifdef DEBUG_PUSH
                   5367:                    xmlGenericError(xmlGenericErrorContext,
                   5368:                            "HPP: Parsing internal subset\n");
                   5369: #endif
                   5370:                    htmlParseDocTypeDecl(ctxt);
                   5371:                    ctxt->instate = XML_PARSER_PROLOG;
                   5372: #ifdef DEBUG_PUSH
                   5373:                    xmlGenericError(xmlGenericErrorContext,
                   5374:                            "HPP: entering PROLOG\n");
                   5375: #endif
                   5376:                 } else {
                   5377:                    ctxt->instate = XML_PARSER_MISC;
                   5378: #ifdef DEBUG_PUSH
                   5379:                    xmlGenericError(xmlGenericErrorContext,
                   5380:                            "HPP: entering MISC\n");
                   5381: #endif
                   5382:                }
                   5383:                break;
                   5384:             case XML_PARSER_MISC:
                   5385:                SKIP_BLANKS;
                   5386:                if (in->buf == NULL)
                   5387:                    avail = in->length - (in->cur - in->base);
                   5388:                else
1.1.1.3 ! misho    5389:                    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
1.1.1.2   misho    5390:                /*
                   5391:                 * no chars in buffer
                   5392:                 */
                   5393:                if (avail < 1)
1.1       misho    5394:                    goto done;
1.1.1.2   misho    5395:                /*
                   5396:                 * not enough chars in buffer
                   5397:                 */
                   5398:                if (avail < 2) {
                   5399:                    if (!terminate)
                   5400:                        goto done;
                   5401:                    else
                   5402:                        next = ' ';
                   5403:                } else {
                   5404:                    next = in->cur[1];
                   5405:                }
1.1       misho    5406:                cur = in->cur[0];
                   5407:                if ((cur == '<') && (next == '!') &&
                   5408:                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                   5409:                    if ((!terminate) &&
                   5410:                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
                   5411:                        goto done;
                   5412: #ifdef DEBUG_PUSH
                   5413:                    xmlGenericError(xmlGenericErrorContext,
                   5414:                            "HPP: Parsing Comment\n");
                   5415: #endif
                   5416:                    htmlParseComment(ctxt);
                   5417:                    ctxt->instate = XML_PARSER_MISC;
                   5418:                } else if ((cur == '<') && (next == '?')) {
                   5419:                    if ((!terminate) &&
                   5420:                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                   5421:                        goto done;
                   5422: #ifdef DEBUG_PUSH
                   5423:                    xmlGenericError(xmlGenericErrorContext,
                   5424:                            "HPP: Parsing PI\n");
                   5425: #endif
                   5426:                    htmlParsePI(ctxt);
                   5427:                    ctxt->instate = XML_PARSER_MISC;
                   5428:                } else if ((cur == '<') && (next == '!') &&
                   5429:                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                   5430:                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                   5431:                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                   5432:                    (UPP(8) == 'E')) {
                   5433:                    if ((!terminate) &&
                   5434:                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                   5435:                        goto done;
                   5436: #ifdef DEBUG_PUSH
                   5437:                    xmlGenericError(xmlGenericErrorContext,
                   5438:                            "HPP: Parsing internal subset\n");
                   5439: #endif
                   5440:                    htmlParseDocTypeDecl(ctxt);
                   5441:                    ctxt->instate = XML_PARSER_PROLOG;
                   5442: #ifdef DEBUG_PUSH
                   5443:                    xmlGenericError(xmlGenericErrorContext,
                   5444:                            "HPP: entering PROLOG\n");
                   5445: #endif
                   5446:                } else if ((cur == '<') && (next == '!') &&
                   5447:                           (avail < 9)) {
                   5448:                    goto done;
                   5449:                } else {
                   5450:                    ctxt->instate = XML_PARSER_START_TAG;
                   5451: #ifdef DEBUG_PUSH
                   5452:                    xmlGenericError(xmlGenericErrorContext,
                   5453:                            "HPP: entering START_TAG\n");
                   5454: #endif
                   5455:                }
                   5456:                break;
                   5457:             case XML_PARSER_PROLOG:
                   5458:                SKIP_BLANKS;
                   5459:                if (in->buf == NULL)
                   5460:                    avail = in->length - (in->cur - in->base);
                   5461:                else
1.1.1.3 ! misho    5462:                    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
1.1       misho    5463:                if (avail < 2)
                   5464:                    goto done;
                   5465:                cur = in->cur[0];
                   5466:                next = in->cur[1];
                   5467:                if ((cur == '<') && (next == '!') &&
                   5468:                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                   5469:                    if ((!terminate) &&
                   5470:                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
                   5471:                        goto done;
                   5472: #ifdef DEBUG_PUSH
                   5473:                    xmlGenericError(xmlGenericErrorContext,
                   5474:                            "HPP: Parsing Comment\n");
                   5475: #endif
                   5476:                    htmlParseComment(ctxt);
                   5477:                    ctxt->instate = XML_PARSER_PROLOG;
                   5478:                } else if ((cur == '<') && (next == '?')) {
                   5479:                    if ((!terminate) &&
                   5480:                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                   5481:                        goto done;
                   5482: #ifdef DEBUG_PUSH
                   5483:                    xmlGenericError(xmlGenericErrorContext,
                   5484:                            "HPP: Parsing PI\n");
                   5485: #endif
                   5486:                    htmlParsePI(ctxt);
                   5487:                    ctxt->instate = XML_PARSER_PROLOG;
                   5488:                } else if ((cur == '<') && (next == '!') &&
                   5489:                           (avail < 4)) {
                   5490:                    goto done;
                   5491:                } else {
                   5492:                    ctxt->instate = XML_PARSER_START_TAG;
                   5493: #ifdef DEBUG_PUSH
                   5494:                    xmlGenericError(xmlGenericErrorContext,
                   5495:                            "HPP: entering START_TAG\n");
                   5496: #endif
                   5497:                }
                   5498:                break;
                   5499:             case XML_PARSER_EPILOG:
                   5500:                if (in->buf == NULL)
                   5501:                    avail = in->length - (in->cur - in->base);
                   5502:                else
1.1.1.3 ! misho    5503:                    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
1.1       misho    5504:                if (avail < 1)
                   5505:                    goto done;
                   5506:                cur = in->cur[0];
                   5507:                if (IS_BLANK_CH(cur)) {
                   5508:                    htmlParseCharData(ctxt);
                   5509:                    goto done;
                   5510:                }
                   5511:                if (avail < 2)
                   5512:                    goto done;
                   5513:                next = in->cur[1];
                   5514:                if ((cur == '<') && (next == '!') &&
                   5515:                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                   5516:                    if ((!terminate) &&
                   5517:                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
                   5518:                        goto done;
                   5519: #ifdef DEBUG_PUSH
                   5520:                    xmlGenericError(xmlGenericErrorContext,
                   5521:                            "HPP: Parsing Comment\n");
                   5522: #endif
                   5523:                    htmlParseComment(ctxt);
                   5524:                    ctxt->instate = XML_PARSER_EPILOG;
                   5525:                } else if ((cur == '<') && (next == '?')) {
                   5526:                    if ((!terminate) &&
                   5527:                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                   5528:                        goto done;
                   5529: #ifdef DEBUG_PUSH
                   5530:                    xmlGenericError(xmlGenericErrorContext,
                   5531:                            "HPP: Parsing PI\n");
                   5532: #endif
                   5533:                    htmlParsePI(ctxt);
                   5534:                    ctxt->instate = XML_PARSER_EPILOG;
                   5535:                } else if ((cur == '<') && (next == '!') &&
                   5536:                           (avail < 4)) {
                   5537:                    goto done;
                   5538:                } else {
                   5539:                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                   5540:                    ctxt->wellFormed = 0;
                   5541:                    ctxt->instate = XML_PARSER_EOF;
                   5542: #ifdef DEBUG_PUSH
                   5543:                    xmlGenericError(xmlGenericErrorContext,
                   5544:                            "HPP: entering EOF\n");
                   5545: #endif
                   5546:                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                   5547:                        ctxt->sax->endDocument(ctxt->userData);
                   5548:                    goto done;
                   5549:                }
                   5550:                break;
                   5551:             case XML_PARSER_START_TAG: {
                   5552:                const xmlChar *name;
                   5553:                int failed;
                   5554:                const htmlElemDesc * info;
                   5555: 
1.1.1.2   misho    5556:                /*
                   5557:                 * no chars in buffer
                   5558:                 */
                   5559:                if (avail < 1)
1.1       misho    5560:                    goto done;
1.1.1.2   misho    5561:                /*
                   5562:                 * not enough chars in buffer
                   5563:                 */
                   5564:                if (avail < 2) {
                   5565:                    if (!terminate)
                   5566:                        goto done;
                   5567:                    else
                   5568:                        next = ' ';
                   5569:                } else {
                   5570:                    next = in->cur[1];
                   5571:                }
1.1       misho    5572:                cur = in->cur[0];
                   5573:                if (cur != '<') {
                   5574:                    ctxt->instate = XML_PARSER_CONTENT;
                   5575: #ifdef DEBUG_PUSH
                   5576:                    xmlGenericError(xmlGenericErrorContext,
                   5577:                            "HPP: entering CONTENT\n");
                   5578: #endif
                   5579:                    break;
                   5580:                }
1.1.1.2   misho    5581:                if (next == '/') {
1.1       misho    5582:                    ctxt->instate = XML_PARSER_END_TAG;
                   5583:                    ctxt->checkIndex = 0;
                   5584: #ifdef DEBUG_PUSH
                   5585:                    xmlGenericError(xmlGenericErrorContext,
                   5586:                            "HPP: entering END_TAG\n");
                   5587: #endif
                   5588:                    break;
                   5589:                }
                   5590:                if ((!terminate) &&
                   5591:                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                   5592:                    goto done;
                   5593: 
1.1.1.2   misho    5594:                 /* Capture start position */
                   5595:                if (ctxt->record_info) {
                   5596:                     node_info.begin_pos = ctxt->input->consumed +
                   5597:                                        (CUR_PTR - ctxt->input->base);
                   5598:                     node_info.begin_line = ctxt->input->line;
                   5599:                }
                   5600: 
                   5601: 
1.1       misho    5602:                failed = htmlParseStartTag(ctxt);
                   5603:                name = ctxt->name;
                   5604:                if ((failed == -1) ||
                   5605:                    (name == NULL)) {
                   5606:                    if (CUR == '>')
                   5607:                        NEXT;
                   5608:                    break;
                   5609:                }
                   5610: 
                   5611:                /*
                   5612:                 * Lookup the info for that element.
                   5613:                 */
                   5614:                info = htmlTagLookup(name);
                   5615:                if (info == NULL) {
                   5616:                    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
                   5617:                                 "Tag %s invalid\n", name, NULL);
                   5618:                }
                   5619: 
                   5620:                /*
                   5621:                 * Check for an Empty Element labeled the XML/SGML way
                   5622:                 */
                   5623:                if ((CUR == '/') && (NXT(1) == '>')) {
                   5624:                    SKIP(2);
                   5625:                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   5626:                        ctxt->sax->endElement(ctxt->userData, name);
                   5627:                    htmlnamePop(ctxt);
                   5628:                    ctxt->instate = XML_PARSER_CONTENT;
                   5629: #ifdef DEBUG_PUSH
                   5630:                    xmlGenericError(xmlGenericErrorContext,
                   5631:                            "HPP: entering CONTENT\n");
                   5632: #endif
                   5633:                    break;
                   5634:                }
                   5635: 
                   5636:                if (CUR == '>') {
                   5637:                    NEXT;
                   5638:                } else {
                   5639:                    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                   5640:                                 "Couldn't find end of Start Tag %s\n",
                   5641:                                 name, NULL);
                   5642: 
                   5643:                    /*
                   5644:                     * end of parsing of this node.
                   5645:                     */
                   5646:                    if (xmlStrEqual(name, ctxt->name)) {
                   5647:                        nodePop(ctxt);
                   5648:                        htmlnamePop(ctxt);
                   5649:                    }
                   5650: 
1.1.1.2   misho    5651:                    if (ctxt->record_info)
                   5652:                        htmlNodeInfoPush(ctxt, &node_info);
                   5653: 
1.1       misho    5654:                    ctxt->instate = XML_PARSER_CONTENT;
                   5655: #ifdef DEBUG_PUSH
                   5656:                    xmlGenericError(xmlGenericErrorContext,
                   5657:                            "HPP: entering CONTENT\n");
                   5658: #endif
                   5659:                    break;
                   5660:                }
                   5661: 
                   5662:                /*
                   5663:                 * Check for an Empty Element from DTD definition
                   5664:                 */
                   5665:                if ((info != NULL) && (info->empty)) {
                   5666:                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                   5667:                        ctxt->sax->endElement(ctxt->userData, name);
                   5668:                    htmlnamePop(ctxt);
                   5669:                }
1.1.1.2   misho    5670: 
                   5671:                 if (ctxt->record_info)
                   5672:                    htmlNodeInfoPush(ctxt, &node_info);
                   5673: 
1.1       misho    5674:                ctxt->instate = XML_PARSER_CONTENT;
                   5675: #ifdef DEBUG_PUSH
                   5676:                xmlGenericError(xmlGenericErrorContext,
                   5677:                        "HPP: entering CONTENT\n");
                   5678: #endif
                   5679:                 break;
                   5680:            }
                   5681:             case XML_PARSER_CONTENT: {
                   5682:                long cons;
                   5683:                 /*
                   5684:                 * Handle preparsed entities and charRef
                   5685:                 */
                   5686:                if (ctxt->token != 0) {
                   5687:                    xmlChar chr[2] = { 0 , 0 } ;
                   5688: 
                   5689:                    chr[0] = (xmlChar) ctxt->token;
                   5690:                    htmlCheckParagraph(ctxt);
                   5691:                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                   5692:                        ctxt->sax->characters(ctxt->userData, chr, 1);
                   5693:                    ctxt->token = 0;
                   5694:                    ctxt->checkIndex = 0;
                   5695:                }
                   5696:                if ((avail == 1) && (terminate)) {
                   5697:                    cur = in->cur[0];
                   5698:                    if ((cur != '<') && (cur != '&')) {
                   5699:                        if (ctxt->sax != NULL) {
                   5700:                            if (IS_BLANK_CH(cur)) {
1.1.1.3 ! misho    5701:                                if (ctxt->keepBlanks) {
        !          5702:                                    if (ctxt->sax->characters != NULL)
        !          5703:                                        ctxt->sax->characters(
        !          5704:                                                ctxt->userData, &cur, 1);
        !          5705:                                } else {
        !          5706:                                    if (ctxt->sax->ignorableWhitespace != NULL)
        !          5707:                                        ctxt->sax->ignorableWhitespace(
        !          5708:                                                ctxt->userData, &cur, 1);
        !          5709:                                }
1.1       misho    5710:                            } else {
                   5711:                                htmlCheckParagraph(ctxt);
                   5712:                                if (ctxt->sax->characters != NULL)
                   5713:                                    ctxt->sax->characters(
                   5714:                                            ctxt->userData, &cur, 1);
                   5715:                            }
                   5716:                        }
                   5717:                        ctxt->token = 0;
                   5718:                        ctxt->checkIndex = 0;
                   5719:                        in->cur++;
                   5720:                        break;
                   5721:                    }
                   5722:                }
                   5723:                if (avail < 2)
                   5724:                    goto done;
                   5725:                cur = in->cur[0];
                   5726:                next = in->cur[1];
                   5727:                cons = ctxt->nbChars;
                   5728:                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                   5729:                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                   5730:                    /*
                   5731:                     * Handle SCRIPT/STYLE separately
                   5732:                     */
                   5733:                    if (!terminate) {
                   5734:                        int idx;
                   5735:                        xmlChar val;
                   5736: 
                   5737:                        idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
                   5738:                        if (idx < 0)
                   5739:                            goto done;
                   5740:                        val = in->cur[idx + 2];
                   5741:                        if (val == 0) /* bad cut of input */
                   5742:                            goto done;
                   5743:                    }
                   5744:                    htmlParseScript(ctxt);
                   5745:                    if ((cur == '<') && (next == '/')) {
                   5746:                        ctxt->instate = XML_PARSER_END_TAG;
                   5747:                        ctxt->checkIndex = 0;
                   5748: #ifdef DEBUG_PUSH
                   5749:                        xmlGenericError(xmlGenericErrorContext,
                   5750:                                "HPP: entering END_TAG\n");
                   5751: #endif
                   5752:                        break;
                   5753:                    }
                   5754:                } else {
                   5755:                    /*
                   5756:                     * Sometimes DOCTYPE arrives in the middle of the document
                   5757:                     */
                   5758:                    if ((cur == '<') && (next == '!') &&
                   5759:                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                   5760:                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                   5761:                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                   5762:                        (UPP(8) == 'E')) {
                   5763:                        if ((!terminate) &&
                   5764:                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                   5765:                            goto done;
                   5766:                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                   5767:                                     "Misplaced DOCTYPE declaration\n",
                   5768:                                     BAD_CAST "DOCTYPE" , NULL);
                   5769:                        htmlParseDocTypeDecl(ctxt);
                   5770:                    } else if ((cur == '<') && (next == '!') &&
                   5771:                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                   5772:                        if ((!terminate) &&
                   5773:                            (htmlParseLookupSequence(
                   5774:                                ctxt, '-', '-', '>', 1, 1) < 0))
                   5775:                            goto done;
                   5776: #ifdef DEBUG_PUSH
                   5777:                        xmlGenericError(xmlGenericErrorContext,
                   5778:                                "HPP: Parsing Comment\n");
                   5779: #endif
                   5780:                        htmlParseComment(ctxt);
                   5781:                        ctxt->instate = XML_PARSER_CONTENT;
                   5782:                    } else if ((cur == '<') && (next == '?')) {
                   5783:                        if ((!terminate) &&
                   5784:                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                   5785:                            goto done;
                   5786: #ifdef DEBUG_PUSH
                   5787:                        xmlGenericError(xmlGenericErrorContext,
                   5788:                                "HPP: Parsing PI\n");
                   5789: #endif
                   5790:                        htmlParsePI(ctxt);
                   5791:                        ctxt->instate = XML_PARSER_CONTENT;
                   5792:                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                   5793:                        goto done;
                   5794:                    } else if ((cur == '<') && (next == '/')) {
                   5795:                        ctxt->instate = XML_PARSER_END_TAG;
                   5796:                        ctxt->checkIndex = 0;
                   5797: #ifdef DEBUG_PUSH
                   5798:                        xmlGenericError(xmlGenericErrorContext,
                   5799:                                "HPP: entering END_TAG\n");
                   5800: #endif
                   5801:                        break;
                   5802:                    } else if (cur == '<') {
                   5803:                        ctxt->instate = XML_PARSER_START_TAG;
                   5804:                        ctxt->checkIndex = 0;
                   5805: #ifdef DEBUG_PUSH
                   5806:                        xmlGenericError(xmlGenericErrorContext,
                   5807:                                "HPP: entering START_TAG\n");
                   5808: #endif
                   5809:                        break;
                   5810:                    } else if (cur == '&') {
                   5811:                        if ((!terminate) &&
                   5812:                            (htmlParseLookupChars(ctxt,
                   5813:                                                   BAD_CAST "; >/", 4) < 0))
                   5814:                            goto done;
                   5815: #ifdef DEBUG_PUSH
                   5816:                        xmlGenericError(xmlGenericErrorContext,
                   5817:                                "HPP: Parsing Reference\n");
                   5818: #endif
                   5819:                        /* TODO: check generation of subtrees if noent !!! */
                   5820:                        htmlParseReference(ctxt);
                   5821:                    } else {
                   5822:                        /*
                   5823:                         * check that the text sequence is complete
                   5824:                         * before handing out the data to the parser
                   5825:                         * to avoid problems with erroneous end of
                   5826:                         * data detection.
                   5827:                         */
                   5828:                        if ((!terminate) &&
                   5829:                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
                   5830:                            goto done;
                   5831:                        ctxt->checkIndex = 0;
                   5832: #ifdef DEBUG_PUSH
                   5833:                        xmlGenericError(xmlGenericErrorContext,
                   5834:                                "HPP: Parsing char data\n");
                   5835: #endif
                   5836:                        htmlParseCharData(ctxt);
                   5837:                    }
                   5838:                }
                   5839:                if (cons == ctxt->nbChars) {
                   5840:                    if (ctxt->node != NULL) {
                   5841:                        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5842:                                     "detected an error in element content\n",
                   5843:                                     NULL, NULL);
                   5844:                    }
                   5845:                    NEXT;
                   5846:                    break;
                   5847:                }
                   5848: 
                   5849:                break;
                   5850:            }
                   5851:             case XML_PARSER_END_TAG:
                   5852:                if (avail < 2)
                   5853:                    goto done;
                   5854:                if ((!terminate) &&
                   5855:                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
                   5856:                    goto done;
                   5857:                htmlParseEndTag(ctxt);
                   5858:                if (ctxt->nameNr == 0) {
                   5859:                    ctxt->instate = XML_PARSER_EPILOG;
                   5860:                } else {
                   5861:                    ctxt->instate = XML_PARSER_CONTENT;
                   5862:                }
                   5863:                ctxt->checkIndex = 0;
                   5864: #ifdef DEBUG_PUSH
                   5865:                xmlGenericError(xmlGenericErrorContext,
                   5866:                        "HPP: entering CONTENT\n");
                   5867: #endif
                   5868:                break;
                   5869:             case XML_PARSER_CDATA_SECTION:
                   5870:                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5871:                        "HPP: internal error, state == CDATA\n",
                   5872:                             NULL, NULL);
                   5873:                ctxt->instate = XML_PARSER_CONTENT;
                   5874:                ctxt->checkIndex = 0;
                   5875: #ifdef DEBUG_PUSH
                   5876:                xmlGenericError(xmlGenericErrorContext,
                   5877:                        "HPP: entering CONTENT\n");
                   5878: #endif
                   5879:                break;
                   5880:             case XML_PARSER_DTD:
                   5881:                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5882:                        "HPP: internal error, state == DTD\n",
                   5883:                             NULL, NULL);
                   5884:                ctxt->instate = XML_PARSER_CONTENT;
                   5885:                ctxt->checkIndex = 0;
                   5886: #ifdef DEBUG_PUSH
                   5887:                xmlGenericError(xmlGenericErrorContext,
                   5888:                        "HPP: entering CONTENT\n");
                   5889: #endif
                   5890:                break;
                   5891:             case XML_PARSER_COMMENT:
                   5892:                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5893:                        "HPP: internal error, state == COMMENT\n",
                   5894:                             NULL, NULL);
                   5895:                ctxt->instate = XML_PARSER_CONTENT;
                   5896:                ctxt->checkIndex = 0;
                   5897: #ifdef DEBUG_PUSH
                   5898:                xmlGenericError(xmlGenericErrorContext,
                   5899:                        "HPP: entering CONTENT\n");
                   5900: #endif
                   5901:                break;
                   5902:             case XML_PARSER_PI:
                   5903:                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5904:                        "HPP: internal error, state == PI\n",
                   5905:                             NULL, NULL);
                   5906:                ctxt->instate = XML_PARSER_CONTENT;
                   5907:                ctxt->checkIndex = 0;
                   5908: #ifdef DEBUG_PUSH
                   5909:                xmlGenericError(xmlGenericErrorContext,
                   5910:                        "HPP: entering CONTENT\n");
                   5911: #endif
                   5912:                break;
                   5913:             case XML_PARSER_ENTITY_DECL:
                   5914:                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5915:                        "HPP: internal error, state == ENTITY_DECL\n",
                   5916:                             NULL, NULL);
                   5917:                ctxt->instate = XML_PARSER_CONTENT;
                   5918:                ctxt->checkIndex = 0;
                   5919: #ifdef DEBUG_PUSH
                   5920:                xmlGenericError(xmlGenericErrorContext,
                   5921:                        "HPP: entering CONTENT\n");
                   5922: #endif
                   5923:                break;
                   5924:             case XML_PARSER_ENTITY_VALUE:
                   5925:                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5926:                        "HPP: internal error, state == ENTITY_VALUE\n",
                   5927:                             NULL, NULL);
                   5928:                ctxt->instate = XML_PARSER_CONTENT;
                   5929:                ctxt->checkIndex = 0;
                   5930: #ifdef DEBUG_PUSH
                   5931:                xmlGenericError(xmlGenericErrorContext,
                   5932:                        "HPP: entering DTD\n");
                   5933: #endif
                   5934:                break;
                   5935:             case XML_PARSER_ATTRIBUTE_VALUE:
                   5936:                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5937:                        "HPP: internal error, state == ATTRIBUTE_VALUE\n",
                   5938:                             NULL, NULL);
                   5939:                ctxt->instate = XML_PARSER_START_TAG;
                   5940:                ctxt->checkIndex = 0;
                   5941: #ifdef DEBUG_PUSH
                   5942:                xmlGenericError(xmlGenericErrorContext,
                   5943:                        "HPP: entering START_TAG\n");
                   5944: #endif
                   5945:                break;
                   5946:            case XML_PARSER_SYSTEM_LITERAL:
                   5947:                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5948:                    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
                   5949:                             NULL, NULL);
                   5950:                ctxt->instate = XML_PARSER_CONTENT;
                   5951:                ctxt->checkIndex = 0;
                   5952: #ifdef DEBUG_PUSH
                   5953:                xmlGenericError(xmlGenericErrorContext,
                   5954:                        "HPP: entering CONTENT\n");
                   5955: #endif
                   5956:                break;
                   5957:            case XML_PARSER_IGNORE:
                   5958:                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5959:                        "HPP: internal error, state == XML_PARSER_IGNORE\n",
                   5960:                             NULL, NULL);
                   5961:                ctxt->instate = XML_PARSER_CONTENT;
                   5962:                ctxt->checkIndex = 0;
                   5963: #ifdef DEBUG_PUSH
                   5964:                xmlGenericError(xmlGenericErrorContext,
                   5965:                        "HPP: entering CONTENT\n");
                   5966: #endif
                   5967:                break;
                   5968:            case XML_PARSER_PUBLIC_LITERAL:
                   5969:                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   5970:                        "HPP: internal error, state == XML_PARSER_LITERAL\n",
                   5971:                             NULL, NULL);
                   5972:                ctxt->instate = XML_PARSER_CONTENT;
                   5973:                ctxt->checkIndex = 0;
                   5974: #ifdef DEBUG_PUSH
                   5975:                xmlGenericError(xmlGenericErrorContext,
                   5976:                        "HPP: entering CONTENT\n");
                   5977: #endif
                   5978:                break;
                   5979: 
                   5980:        }
                   5981:     }
                   5982: done:
                   5983:     if ((avail == 0) && (terminate)) {
                   5984:        htmlAutoCloseOnEnd(ctxt);
                   5985:        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                   5986:            /*
                   5987:             * SAX: end of the document processing.
                   5988:             */
                   5989:            ctxt->instate = XML_PARSER_EOF;
                   5990:            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                   5991:                ctxt->sax->endDocument(ctxt->userData);
                   5992:        }
                   5993:     }
                   5994:     if ((ctxt->myDoc != NULL) &&
                   5995:        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
                   5996:         (ctxt->instate == XML_PARSER_EPILOG))) {
                   5997:        xmlDtdPtr dtd;
                   5998:        dtd = xmlGetIntSubset(ctxt->myDoc);
                   5999:        if (dtd == NULL)
                   6000:            ctxt->myDoc->intSubset =
                   6001:                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
                   6002:                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                   6003:                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
                   6004:     }
                   6005: #ifdef DEBUG_PUSH
                   6006:     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
                   6007: #endif
                   6008:     return(ret);
                   6009: }
                   6010: 
                   6011: /**
                   6012:  * htmlParseChunk:
                   6013:  * @ctxt:  an HTML parser context
                   6014:  * @chunk:  an char array
                   6015:  * @size:  the size in byte of the chunk
                   6016:  * @terminate:  last chunk indicator
                   6017:  *
                   6018:  * Parse a Chunk of memory
                   6019:  *
                   6020:  * Returns zero if no error, the xmlParserErrors otherwise.
                   6021:  */
                   6022: int
                   6023: htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
                   6024:               int terminate) {
                   6025:     if ((ctxt == NULL) || (ctxt->input == NULL)) {
                   6026:        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                   6027:                     "htmlParseChunk: context error\n", NULL, NULL);
                   6028:        return(XML_ERR_INTERNAL_ERROR);
                   6029:     }
                   6030:     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
                   6031:         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
1.1.1.3 ! misho    6032:        size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
        !          6033:        size_t cur = ctxt->input->cur - ctxt->input->base;
1.1       misho    6034:        int res;
                   6035: 
                   6036:        res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
                   6037:        if (res < 0) {
                   6038:            ctxt->errNo = XML_PARSER_EOF;
                   6039:            ctxt->disableSAX = 1;
                   6040:            return (XML_PARSER_EOF);
                   6041:        }
1.1.1.3 ! misho    6042:         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
1.1       misho    6043: #ifdef DEBUG_PUSH
                   6044:        xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
                   6045: #endif
                   6046: 
                   6047: #if 0
                   6048:        if ((terminate) || (ctxt->input->buf->buffer->use > 80))
                   6049:            htmlParseTryOrFinish(ctxt, terminate);
                   6050: #endif
                   6051:     } else if (ctxt->instate != XML_PARSER_EOF) {
                   6052:        if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
                   6053:            xmlParserInputBufferPtr in = ctxt->input->buf;
                   6054:            if ((in->encoder != NULL) && (in->buffer != NULL) &&
                   6055:                    (in->raw != NULL)) {
                   6056:                int nbchars;
1.1.1.3 ! misho    6057:                size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
        !          6058:                size_t current = ctxt->input->cur - ctxt->input->base;
1.1       misho    6059: 
1.1.1.3 ! misho    6060:                nbchars = xmlCharEncInput(in, terminate);
1.1       misho    6061:                if (nbchars < 0) {
                   6062:                    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                   6063:                                 "encoder error\n", NULL, NULL);
                   6064:                    return(XML_ERR_INVALID_ENCODING);
                   6065:                }
1.1.1.3 ! misho    6066:                xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
1.1       misho    6067:            }
                   6068:        }
                   6069:     }
                   6070:     htmlParseTryOrFinish(ctxt, terminate);
                   6071:     if (terminate) {
                   6072:        if ((ctxt->instate != XML_PARSER_EOF) &&
                   6073:            (ctxt->instate != XML_PARSER_EPILOG) &&
                   6074:            (ctxt->instate != XML_PARSER_MISC)) {
                   6075:            ctxt->errNo = XML_ERR_DOCUMENT_END;
                   6076:            ctxt->wellFormed = 0;
                   6077:        }
                   6078:        if (ctxt->instate != XML_PARSER_EOF) {
                   6079:            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                   6080:                ctxt->sax->endDocument(ctxt->userData);
                   6081:        }
                   6082:        ctxt->instate = XML_PARSER_EOF;
                   6083:     }
                   6084:     return((xmlParserErrors) ctxt->errNo);
                   6085: }
                   6086: 
                   6087: /************************************************************************
                   6088:  *                                                                     *
                   6089:  *                     User entry points                               *
                   6090:  *                                                                     *
                   6091:  ************************************************************************/
                   6092: 
                   6093: /**
                   6094:  * htmlCreatePushParserCtxt:
                   6095:  * @sax:  a SAX handler
                   6096:  * @user_data:  The user data returned on SAX callbacks
                   6097:  * @chunk:  a pointer to an array of chars
                   6098:  * @size:  number of chars in the array
                   6099:  * @filename:  an optional file name or URI
                   6100:  * @enc:  an optional encoding
                   6101:  *
                   6102:  * Create a parser context for using the HTML parser in push mode
                   6103:  * The value of @filename is used for fetching external entities
                   6104:  * and error/warning reports.
                   6105:  *
                   6106:  * Returns the new parser context or NULL
                   6107:  */
                   6108: htmlParserCtxtPtr
                   6109: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
                   6110:                          const char *chunk, int size, const char *filename,
                   6111:                         xmlCharEncoding enc) {
                   6112:     htmlParserCtxtPtr ctxt;
                   6113:     htmlParserInputPtr inputStream;
                   6114:     xmlParserInputBufferPtr buf;
                   6115: 
                   6116:     xmlInitParser();
                   6117: 
                   6118:     buf = xmlAllocParserInputBuffer(enc);
                   6119:     if (buf == NULL) return(NULL);
                   6120: 
                   6121:     ctxt = htmlNewParserCtxt();
                   6122:     if (ctxt == NULL) {
                   6123:        xmlFreeParserInputBuffer(buf);
                   6124:        return(NULL);
                   6125:     }
                   6126:     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
                   6127:        ctxt->charset=XML_CHAR_ENCODING_UTF8;
                   6128:     if (sax != NULL) {
                   6129:        if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
                   6130:            xmlFree(ctxt->sax);
                   6131:        ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
                   6132:        if (ctxt->sax == NULL) {
                   6133:            xmlFree(buf);
                   6134:            xmlFree(ctxt);
                   6135:            return(NULL);
                   6136:        }
                   6137:        memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
                   6138:        if (user_data != NULL)
                   6139:            ctxt->userData = user_data;
                   6140:     }
                   6141:     if (filename == NULL) {
                   6142:        ctxt->directory = NULL;
                   6143:     } else {
                   6144:         ctxt->directory = xmlParserGetDirectory(filename);
                   6145:     }
                   6146: 
                   6147:     inputStream = htmlNewInputStream(ctxt);
                   6148:     if (inputStream == NULL) {
                   6149:        xmlFreeParserCtxt(ctxt);
                   6150:        xmlFree(buf);
                   6151:        return(NULL);
                   6152:     }
                   6153: 
                   6154:     if (filename == NULL)
                   6155:        inputStream->filename = NULL;
                   6156:     else
                   6157:        inputStream->filename = (char *)
                   6158:            xmlCanonicPath((const xmlChar *) filename);
                   6159:     inputStream->buf = buf;
1.1.1.3 ! misho    6160:     xmlBufResetInput(buf->buffer, inputStream);
1.1       misho    6161: 
                   6162:     inputPush(ctxt, inputStream);
                   6163: 
                   6164:     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
                   6165:         (ctxt->input->buf != NULL))  {
1.1.1.3 ! misho    6166:        size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
        !          6167:        size_t cur = ctxt->input->cur - ctxt->input->base;
1.1       misho    6168: 
                   6169:        xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
                   6170: 
1.1.1.3 ! misho    6171:         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
1.1       misho    6172: #ifdef DEBUG_PUSH
                   6173:        xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
                   6174: #endif
                   6175:     }
                   6176:     ctxt->progressive = 1;
                   6177: 
                   6178:     return(ctxt);
                   6179: }
                   6180: #endif /* LIBXML_PUSH_ENABLED */
                   6181: 
                   6182: /**
                   6183:  * htmlSAXParseDoc:
                   6184:  * @cur:  a pointer to an array of xmlChar
                   6185:  * @encoding:  a free form C string describing the HTML document encoding, or NULL
                   6186:  * @sax:  the SAX handler block
                   6187:  * @userData: if using SAX, this pointer will be provided on callbacks.
                   6188:  *
                   6189:  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
                   6190:  * to handle parse events. If sax is NULL, fallback to the default DOM
                   6191:  * behavior and return a tree.
                   6192:  *
                   6193:  * Returns the resulting document tree unless SAX is NULL or the document is
                   6194:  *     not well formed.
                   6195:  */
                   6196: 
                   6197: htmlDocPtr
                   6198: htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
                   6199:     htmlDocPtr ret;
                   6200:     htmlParserCtxtPtr ctxt;
                   6201: 
                   6202:     xmlInitParser();
                   6203: 
                   6204:     if (cur == NULL) return(NULL);
                   6205: 
                   6206: 
                   6207:     ctxt = htmlCreateDocParserCtxt(cur, encoding);
                   6208:     if (ctxt == NULL) return(NULL);
                   6209:     if (sax != NULL) {
                   6210:         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
                   6211:         ctxt->sax = sax;
                   6212:         ctxt->userData = userData;
                   6213:     }
                   6214: 
                   6215:     htmlParseDocument(ctxt);
                   6216:     ret = ctxt->myDoc;
                   6217:     if (sax != NULL) {
                   6218:        ctxt->sax = NULL;
                   6219:        ctxt->userData = NULL;
                   6220:     }
                   6221:     htmlFreeParserCtxt(ctxt);
                   6222: 
                   6223:     return(ret);
                   6224: }
                   6225: 
                   6226: /**
                   6227:  * htmlParseDoc:
                   6228:  * @cur:  a pointer to an array of xmlChar
                   6229:  * @encoding:  a free form C string describing the HTML document encoding, or NULL
                   6230:  *
                   6231:  * parse an HTML in-memory document and build a tree.
                   6232:  *
                   6233:  * Returns the resulting document tree
                   6234:  */
                   6235: 
                   6236: htmlDocPtr
                   6237: htmlParseDoc(xmlChar *cur, const char *encoding) {
                   6238:     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
                   6239: }
                   6240: 
                   6241: 
                   6242: /**
                   6243:  * htmlCreateFileParserCtxt:
                   6244:  * @filename:  the filename
                   6245:  * @encoding:  a free form C string describing the HTML document encoding, or NULL
                   6246:  *
                   6247:  * Create a parser context for a file content.
                   6248:  * Automatic support for ZLIB/Compress compressed document is provided
                   6249:  * by default if found at compile-time.
                   6250:  *
                   6251:  * Returns the new parser context or NULL
                   6252:  */
                   6253: htmlParserCtxtPtr
                   6254: htmlCreateFileParserCtxt(const char *filename, const char *encoding)
                   6255: {
                   6256:     htmlParserCtxtPtr ctxt;
                   6257:     htmlParserInputPtr inputStream;
                   6258:     char *canonicFilename;
                   6259:     /* htmlCharEncoding enc; */
                   6260:     xmlChar *content, *content_line = (xmlChar *) "charset=";
                   6261: 
                   6262:     if (filename == NULL)
                   6263:         return(NULL);
                   6264: 
                   6265:     ctxt = htmlNewParserCtxt();
                   6266:     if (ctxt == NULL) {
                   6267:        return(NULL);
                   6268:     }
                   6269:     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
                   6270:     if (canonicFilename == NULL) {
                   6271: #ifdef LIBXML_SAX1_ENABLED
                   6272:        if (xmlDefaultSAXHandler.error != NULL) {
                   6273:            xmlDefaultSAXHandler.error(NULL, "out of memory\n");
                   6274:        }
                   6275: #endif
                   6276:        xmlFreeParserCtxt(ctxt);
                   6277:        return(NULL);
                   6278:     }
                   6279: 
                   6280:     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
                   6281:     xmlFree(canonicFilename);
                   6282:     if (inputStream == NULL) {
                   6283:        xmlFreeParserCtxt(ctxt);
                   6284:        return(NULL);
                   6285:     }
                   6286: 
                   6287:     inputPush(ctxt, inputStream);
                   6288: 
                   6289:     /* set encoding */
                   6290:     if (encoding) {
                   6291:         content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
                   6292:        if (content) {
                   6293:            strcpy ((char *)content, (char *)content_line);
                   6294:             strcat ((char *)content, (char *)encoding);
                   6295:             htmlCheckEncoding (ctxt, content);
                   6296:            xmlFree (content);
                   6297:        }
                   6298:     }
                   6299: 
                   6300:     return(ctxt);
                   6301: }
                   6302: 
                   6303: /**
                   6304:  * htmlSAXParseFile:
                   6305:  * @filename:  the filename
                   6306:  * @encoding:  a free form C string describing the HTML document encoding, or NULL
                   6307:  * @sax:  the SAX handler block
                   6308:  * @userData: if using SAX, this pointer will be provided on callbacks.
                   6309:  *
                   6310:  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
                   6311:  * compressed document is provided by default if found at compile-time.
                   6312:  * It use the given SAX function block to handle the parsing callback.
                   6313:  * If sax is NULL, fallback to the default DOM tree building routines.
                   6314:  *
                   6315:  * Returns the resulting document tree unless SAX is NULL or the document is
                   6316:  *     not well formed.
                   6317:  */
                   6318: 
                   6319: htmlDocPtr
                   6320: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
                   6321:                  void *userData) {
                   6322:     htmlDocPtr ret;
                   6323:     htmlParserCtxtPtr ctxt;
                   6324:     htmlSAXHandlerPtr oldsax = NULL;
                   6325: 
                   6326:     xmlInitParser();
                   6327: 
                   6328:     ctxt = htmlCreateFileParserCtxt(filename, encoding);
                   6329:     if (ctxt == NULL) return(NULL);
                   6330:     if (sax != NULL) {
                   6331:        oldsax = ctxt->sax;
                   6332:         ctxt->sax = sax;
                   6333:         ctxt->userData = userData;
                   6334:     }
                   6335: 
                   6336:     htmlParseDocument(ctxt);
                   6337: 
                   6338:     ret = ctxt->myDoc;
                   6339:     if (sax != NULL) {
                   6340:         ctxt->sax = oldsax;
                   6341:         ctxt->userData = NULL;
                   6342:     }
                   6343:     htmlFreeParserCtxt(ctxt);
                   6344: 
                   6345:     return(ret);
                   6346: }
                   6347: 
                   6348: /**
                   6349:  * htmlParseFile:
                   6350:  * @filename:  the filename
                   6351:  * @encoding:  a free form C string describing the HTML document encoding, or NULL
                   6352:  *
                   6353:  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
                   6354:  * compressed document is provided by default if found at compile-time.
                   6355:  *
                   6356:  * Returns the resulting document tree
                   6357:  */
                   6358: 
                   6359: htmlDocPtr
                   6360: htmlParseFile(const char *filename, const char *encoding) {
                   6361:     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
                   6362: }
                   6363: 
                   6364: /**
                   6365:  * htmlHandleOmittedElem:
                   6366:  * @val:  int 0 or 1
                   6367:  *
                   6368:  * Set and return the previous value for handling HTML omitted tags.
                   6369:  *
                   6370:  * Returns the last value for 0 for no handling, 1 for auto insertion.
                   6371:  */
                   6372: 
                   6373: int
                   6374: htmlHandleOmittedElem(int val) {
                   6375:     int old = htmlOmittedDefaultValue;
                   6376: 
                   6377:     htmlOmittedDefaultValue = val;
                   6378:     return(old);
                   6379: }
                   6380: 
                   6381: /**
                   6382:  * htmlElementAllowedHere:
                   6383:  * @parent: HTML parent element
                   6384:  * @elt: HTML element
                   6385:  *
                   6386:  * Checks whether an HTML element may be a direct child of a parent element.
                   6387:  * Note - doesn't check for deprecated elements
                   6388:  *
                   6389:  * Returns 1 if allowed; 0 otherwise.
                   6390:  */
                   6391: int
                   6392: htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
                   6393:   const char** p ;
                   6394: 
                   6395:   if ( ! elt || ! parent || ! parent->subelts )
                   6396:        return 0 ;
                   6397: 
                   6398:   for ( p = parent->subelts; *p; ++p )
                   6399:     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
                   6400:       return 1 ;
                   6401: 
                   6402:   return 0 ;
                   6403: }
                   6404: /**
                   6405:  * htmlElementStatusHere:
                   6406:  * @parent: HTML parent element
                   6407:  * @elt: HTML element
                   6408:  *
                   6409:  * Checks whether an HTML element may be a direct child of a parent element.
                   6410:  * and if so whether it is valid or deprecated.
                   6411:  *
                   6412:  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
                   6413:  */
                   6414: htmlStatus
                   6415: htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
                   6416:   if ( ! parent || ! elt )
                   6417:     return HTML_INVALID ;
                   6418:   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
                   6419:     return HTML_INVALID ;
                   6420: 
                   6421:   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
                   6422: }
                   6423: /**
                   6424:  * htmlAttrAllowed:
                   6425:  * @elt: HTML element
                   6426:  * @attr: HTML attribute
                   6427:  * @legacy: whether to allow deprecated attributes
                   6428:  *
                   6429:  * Checks whether an attribute is valid for an element
                   6430:  * Has full knowledge of Required and Deprecated attributes
                   6431:  *
                   6432:  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
                   6433:  */
                   6434: htmlStatus
                   6435: htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
                   6436:   const char** p ;
                   6437: 
                   6438:   if ( !elt || ! attr )
                   6439:        return HTML_INVALID ;
                   6440: 
                   6441:   if ( elt->attrs_req )
                   6442:     for ( p = elt->attrs_req; *p; ++p)
                   6443:       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
                   6444:         return HTML_REQUIRED ;
                   6445: 
                   6446:   if ( elt->attrs_opt )
                   6447:     for ( p = elt->attrs_opt; *p; ++p)
                   6448:       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
                   6449:         return HTML_VALID ;
                   6450: 
                   6451:   if ( legacy && elt->attrs_depr )
                   6452:     for ( p = elt->attrs_depr; *p; ++p)
                   6453:       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
                   6454:         return HTML_DEPRECATED ;
                   6455: 
                   6456:   return HTML_INVALID ;
                   6457: }
                   6458: /**
                   6459:  * htmlNodeStatus:
                   6460:  * @node: an htmlNodePtr in a tree
                   6461:  * @legacy: whether to allow deprecated elements (YES is faster here
                   6462:  *     for Element nodes)
                   6463:  *
                   6464:  * Checks whether the tree node is valid.  Experimental (the author
                   6465:  *     only uses the HTML enhancements in a SAX parser)
                   6466:  *
                   6467:  * Return: for Element nodes, a return from htmlElementAllowedHere (if
                   6468:  *     legacy allowed) or htmlElementStatusHere (otherwise).
                   6469:  *     for Attribute nodes, a return from htmlAttrAllowed
                   6470:  *     for other nodes, HTML_NA (no checks performed)
                   6471:  */
                   6472: htmlStatus
                   6473: htmlNodeStatus(const htmlNodePtr node, int legacy) {
                   6474:   if ( ! node )
                   6475:     return HTML_INVALID ;
                   6476: 
                   6477:   switch ( node->type ) {
                   6478:     case XML_ELEMENT_NODE:
                   6479:       return legacy
                   6480:        ? ( htmlElementAllowedHere (
                   6481:                htmlTagLookup(node->parent->name) , node->name
                   6482:                ) ? HTML_VALID : HTML_INVALID )
                   6483:        : htmlElementStatusHere(
                   6484:                htmlTagLookup(node->parent->name) ,
                   6485:                htmlTagLookup(node->name) )
                   6486:        ;
                   6487:     case XML_ATTRIBUTE_NODE:
                   6488:       return htmlAttrAllowed(
                   6489:        htmlTagLookup(node->parent->name) , node->name, legacy) ;
                   6490:     default: return HTML_NA ;
                   6491:   }
                   6492: }
                   6493: /************************************************************************
                   6494:  *                                                                     *
                   6495:  *     New set (2.6.0) of simpler and more flexible APIs               *
                   6496:  *                                                                     *
                   6497:  ************************************************************************/
                   6498: /**
                   6499:  * DICT_FREE:
                   6500:  * @str:  a string
                   6501:  *
                   6502:  * Free a string if it is not owned by the "dict" dictionnary in the
                   6503:  * current scope
                   6504:  */
                   6505: #define DICT_FREE(str)                                         \
                   6506:        if ((str) && ((!dict) ||                                \
                   6507:            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))  \
                   6508:            xmlFree((char *)(str));
                   6509: 
                   6510: /**
                   6511:  * htmlCtxtReset:
                   6512:  * @ctxt: an HTML parser context
                   6513:  *
                   6514:  * Reset a parser context
                   6515:  */
                   6516: void
                   6517: htmlCtxtReset(htmlParserCtxtPtr ctxt)
                   6518: {
                   6519:     xmlParserInputPtr input;
                   6520:     xmlDictPtr dict;
                   6521: 
                   6522:     if (ctxt == NULL)
                   6523:         return;
                   6524: 
                   6525:     xmlInitParser();
                   6526:     dict = ctxt->dict;
                   6527: 
                   6528:     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
                   6529:         xmlFreeInputStream(input);
                   6530:     }
                   6531:     ctxt->inputNr = 0;
                   6532:     ctxt->input = NULL;
                   6533: 
                   6534:     ctxt->spaceNr = 0;
                   6535:     if (ctxt->spaceTab != NULL) {
                   6536:        ctxt->spaceTab[0] = -1;
                   6537:        ctxt->space = &ctxt->spaceTab[0];
                   6538:     } else {
                   6539:        ctxt->space = NULL;
                   6540:     }
                   6541: 
                   6542: 
                   6543:     ctxt->nodeNr = 0;
                   6544:     ctxt->node = NULL;
                   6545: 
                   6546:     ctxt->nameNr = 0;
                   6547:     ctxt->name = NULL;
                   6548: 
                   6549:     DICT_FREE(ctxt->version);
                   6550:     ctxt->version = NULL;
                   6551:     DICT_FREE(ctxt->encoding);
                   6552:     ctxt->encoding = NULL;
                   6553:     DICT_FREE(ctxt->directory);
                   6554:     ctxt->directory = NULL;
                   6555:     DICT_FREE(ctxt->extSubURI);
                   6556:     ctxt->extSubURI = NULL;
                   6557:     DICT_FREE(ctxt->extSubSystem);
                   6558:     ctxt->extSubSystem = NULL;
                   6559:     if (ctxt->myDoc != NULL)
                   6560:         xmlFreeDoc(ctxt->myDoc);
                   6561:     ctxt->myDoc = NULL;
                   6562: 
                   6563:     ctxt->standalone = -1;
                   6564:     ctxt->hasExternalSubset = 0;
                   6565:     ctxt->hasPErefs = 0;
                   6566:     ctxt->html = 1;
                   6567:     ctxt->external = 0;
                   6568:     ctxt->instate = XML_PARSER_START;
                   6569:     ctxt->token = 0;
                   6570: 
                   6571:     ctxt->wellFormed = 1;
                   6572:     ctxt->nsWellFormed = 1;
                   6573:     ctxt->disableSAX = 0;
                   6574:     ctxt->valid = 1;
                   6575:     ctxt->vctxt.userData = ctxt;
                   6576:     ctxt->vctxt.error = xmlParserValidityError;
                   6577:     ctxt->vctxt.warning = xmlParserValidityWarning;
                   6578:     ctxt->record_info = 0;
                   6579:     ctxt->nbChars = 0;
                   6580:     ctxt->checkIndex = 0;
                   6581:     ctxt->inSubset = 0;
                   6582:     ctxt->errNo = XML_ERR_OK;
                   6583:     ctxt->depth = 0;
                   6584:     ctxt->charset = XML_CHAR_ENCODING_NONE;
                   6585:     ctxt->catalogs = NULL;
                   6586:     xmlInitNodeInfoSeq(&ctxt->node_seq);
                   6587: 
                   6588:     if (ctxt->attsDefault != NULL) {
                   6589:         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
                   6590:         ctxt->attsDefault = NULL;
                   6591:     }
                   6592:     if (ctxt->attsSpecial != NULL) {
                   6593:         xmlHashFree(ctxt->attsSpecial, NULL);
                   6594:         ctxt->attsSpecial = NULL;
                   6595:     }
                   6596: }
                   6597: 
                   6598: /**
                   6599:  * htmlCtxtUseOptions:
                   6600:  * @ctxt: an HTML parser context
                   6601:  * @options:  a combination of htmlParserOption(s)
                   6602:  *
                   6603:  * Applies the options to the parser context
                   6604:  *
                   6605:  * Returns 0 in case of success, the set of unknown or unimplemented options
                   6606:  *         in case of error.
                   6607:  */
                   6608: int
                   6609: htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
                   6610: {
                   6611:     if (ctxt == NULL)
                   6612:         return(-1);
                   6613: 
                   6614:     if (options & HTML_PARSE_NOWARNING) {
                   6615:         ctxt->sax->warning = NULL;
                   6616:         ctxt->vctxt.warning = NULL;
                   6617:         options -= XML_PARSE_NOWARNING;
                   6618:        ctxt->options |= XML_PARSE_NOWARNING;
                   6619:     }
                   6620:     if (options & HTML_PARSE_NOERROR) {
                   6621:         ctxt->sax->error = NULL;
                   6622:         ctxt->vctxt.error = NULL;
                   6623:         ctxt->sax->fatalError = NULL;
                   6624:         options -= XML_PARSE_NOERROR;
                   6625:        ctxt->options |= XML_PARSE_NOERROR;
                   6626:     }
                   6627:     if (options & HTML_PARSE_PEDANTIC) {
                   6628:         ctxt->pedantic = 1;
                   6629:         options -= XML_PARSE_PEDANTIC;
                   6630:        ctxt->options |= XML_PARSE_PEDANTIC;
                   6631:     } else
                   6632:         ctxt->pedantic = 0;
                   6633:     if (options & XML_PARSE_NOBLANKS) {
                   6634:         ctxt->keepBlanks = 0;
                   6635:         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
                   6636:         options -= XML_PARSE_NOBLANKS;
                   6637:        ctxt->options |= XML_PARSE_NOBLANKS;
                   6638:     } else
                   6639:         ctxt->keepBlanks = 1;
                   6640:     if (options & HTML_PARSE_RECOVER) {
                   6641:         ctxt->recovery = 1;
                   6642:        options -= HTML_PARSE_RECOVER;
                   6643:     } else
                   6644:         ctxt->recovery = 0;
                   6645:     if (options & HTML_PARSE_COMPACT) {
                   6646:        ctxt->options |= HTML_PARSE_COMPACT;
                   6647:         options -= HTML_PARSE_COMPACT;
                   6648:     }
                   6649:     if (options & XML_PARSE_HUGE) {
                   6650:        ctxt->options |= XML_PARSE_HUGE;
                   6651:         options -= XML_PARSE_HUGE;
                   6652:     }
                   6653:     if (options & HTML_PARSE_NODEFDTD) {
                   6654:        ctxt->options |= HTML_PARSE_NODEFDTD;
                   6655:         options -= HTML_PARSE_NODEFDTD;
                   6656:     }
1.1.1.2   misho    6657:     if (options & HTML_PARSE_IGNORE_ENC) {
                   6658:        ctxt->options |= HTML_PARSE_IGNORE_ENC;
                   6659:         options -= HTML_PARSE_IGNORE_ENC;
                   6660:     }
                   6661:     if (options & HTML_PARSE_NOIMPLIED) {
                   6662:         ctxt->options |= HTML_PARSE_NOIMPLIED;
                   6663:         options -= HTML_PARSE_NOIMPLIED;
                   6664:     }
1.1       misho    6665:     ctxt->dictNames = 0;
                   6666:     return (options);
                   6667: }
                   6668: 
                   6669: /**
                   6670:  * htmlDoRead:
                   6671:  * @ctxt:  an HTML parser context
                   6672:  * @URL:  the base URL to use for the document
                   6673:  * @encoding:  the document encoding, or NULL
                   6674:  * @options:  a combination of htmlParserOption(s)
                   6675:  * @reuse:  keep the context for reuse
                   6676:  *
                   6677:  * Common front-end for the htmlRead functions
                   6678:  *
                   6679:  * Returns the resulting document tree or NULL
                   6680:  */
                   6681: static htmlDocPtr
                   6682: htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
                   6683:           int options, int reuse)
                   6684: {
                   6685:     htmlDocPtr ret;
                   6686: 
                   6687:     htmlCtxtUseOptions(ctxt, options);
                   6688:     ctxt->html = 1;
                   6689:     if (encoding != NULL) {
                   6690:         xmlCharEncodingHandlerPtr hdlr;
                   6691: 
                   6692:        hdlr = xmlFindCharEncodingHandler(encoding);
                   6693:        if (hdlr != NULL) {
                   6694:            xmlSwitchToEncoding(ctxt, hdlr);
                   6695:            if (ctxt->input->encoding != NULL)
                   6696:              xmlFree((xmlChar *) ctxt->input->encoding);
                   6697:             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
                   6698:         }
                   6699:     }
                   6700:     if ((URL != NULL) && (ctxt->input != NULL) &&
                   6701:         (ctxt->input->filename == NULL))
                   6702:         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
                   6703:     htmlParseDocument(ctxt);
                   6704:     ret = ctxt->myDoc;
                   6705:     ctxt->myDoc = NULL;
                   6706:     if (!reuse) {
                   6707:         if ((ctxt->dictNames) &&
                   6708:            (ret != NULL) &&
                   6709:            (ret->dict == ctxt->dict))
                   6710:            ctxt->dict = NULL;
                   6711:        xmlFreeParserCtxt(ctxt);
                   6712:     }
                   6713:     return (ret);
                   6714: }
                   6715: 
                   6716: /**
                   6717:  * htmlReadDoc:
                   6718:  * @cur:  a pointer to a zero terminated string
                   6719:  * @URL:  the base URL to use for the document
                   6720:  * @encoding:  the document encoding, or NULL
                   6721:  * @options:  a combination of htmlParserOption(s)
                   6722:  *
                   6723:  * parse an XML in-memory document and build a tree.
                   6724:  *
                   6725:  * Returns the resulting document tree
                   6726:  */
                   6727: htmlDocPtr
                   6728: htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
                   6729: {
                   6730:     htmlParserCtxtPtr ctxt;
                   6731: 
                   6732:     if (cur == NULL)
                   6733:         return (NULL);
                   6734: 
                   6735:     xmlInitParser();
                   6736:     ctxt = htmlCreateDocParserCtxt(cur, NULL);
                   6737:     if (ctxt == NULL)
                   6738:         return (NULL);
                   6739:     return (htmlDoRead(ctxt, URL, encoding, options, 0));
                   6740: }
                   6741: 
                   6742: /**
                   6743:  * htmlReadFile:
                   6744:  * @filename:  a file or URL
                   6745:  * @encoding:  the document encoding, or NULL
                   6746:  * @options:  a combination of htmlParserOption(s)
                   6747:  *
                   6748:  * parse an XML file from the filesystem or the network.
                   6749:  *
                   6750:  * Returns the resulting document tree
                   6751:  */
                   6752: htmlDocPtr
                   6753: htmlReadFile(const char *filename, const char *encoding, int options)
                   6754: {
                   6755:     htmlParserCtxtPtr ctxt;
                   6756: 
                   6757:     xmlInitParser();
                   6758:     ctxt = htmlCreateFileParserCtxt(filename, encoding);
                   6759:     if (ctxt == NULL)
                   6760:         return (NULL);
                   6761:     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
                   6762: }
                   6763: 
                   6764: /**
                   6765:  * htmlReadMemory:
                   6766:  * @buffer:  a pointer to a char array
                   6767:  * @size:  the size of the array
                   6768:  * @URL:  the base URL to use for the document
                   6769:  * @encoding:  the document encoding, or NULL
                   6770:  * @options:  a combination of htmlParserOption(s)
                   6771:  *
                   6772:  * parse an XML in-memory document and build a tree.
                   6773:  *
                   6774:  * Returns the resulting document tree
                   6775:  */
                   6776: htmlDocPtr
                   6777: htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
                   6778: {
                   6779:     htmlParserCtxtPtr ctxt;
                   6780: 
                   6781:     xmlInitParser();
                   6782:     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
                   6783:     if (ctxt == NULL)
                   6784:         return (NULL);
                   6785:     htmlDefaultSAXHandlerInit();
                   6786:     if (ctxt->sax != NULL)
                   6787:         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
                   6788:     return (htmlDoRead(ctxt, URL, encoding, options, 0));
                   6789: }
                   6790: 
                   6791: /**
                   6792:  * htmlReadFd:
                   6793:  * @fd:  an open file descriptor
                   6794:  * @URL:  the base URL to use for the document
                   6795:  * @encoding:  the document encoding, or NULL
                   6796:  * @options:  a combination of htmlParserOption(s)
                   6797:  *
                   6798:  * parse an XML from a file descriptor and build a tree.
                   6799:  *
                   6800:  * Returns the resulting document tree
                   6801:  */
                   6802: htmlDocPtr
                   6803: htmlReadFd(int fd, const char *URL, const char *encoding, int options)
                   6804: {
                   6805:     htmlParserCtxtPtr ctxt;
                   6806:     xmlParserInputBufferPtr input;
                   6807:     xmlParserInputPtr stream;
                   6808: 
                   6809:     if (fd < 0)
                   6810:         return (NULL);
                   6811: 
                   6812:     xmlInitParser();
                   6813:     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
                   6814:     if (input == NULL)
                   6815:         return (NULL);
                   6816:     ctxt = xmlNewParserCtxt();
                   6817:     if (ctxt == NULL) {
                   6818:         xmlFreeParserInputBuffer(input);
                   6819:         return (NULL);
                   6820:     }
                   6821:     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
                   6822:     if (stream == NULL) {
                   6823:         xmlFreeParserInputBuffer(input);
                   6824:        xmlFreeParserCtxt(ctxt);
                   6825:         return (NULL);
                   6826:     }
                   6827:     inputPush(ctxt, stream);
                   6828:     return (htmlDoRead(ctxt, URL, encoding, options, 0));
                   6829: }
                   6830: 
                   6831: /**
                   6832:  * htmlReadIO:
                   6833:  * @ioread:  an I/O read function
                   6834:  * @ioclose:  an I/O close function
                   6835:  * @ioctx:  an I/O handler
                   6836:  * @URL:  the base URL to use for the document
                   6837:  * @encoding:  the document encoding, or NULL
                   6838:  * @options:  a combination of htmlParserOption(s)
                   6839:  *
                   6840:  * parse an HTML document from I/O functions and source and build a tree.
                   6841:  *
                   6842:  * Returns the resulting document tree
                   6843:  */
                   6844: htmlDocPtr
                   6845: htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
                   6846:           void *ioctx, const char *URL, const char *encoding, int options)
                   6847: {
                   6848:     htmlParserCtxtPtr ctxt;
                   6849:     xmlParserInputBufferPtr input;
                   6850:     xmlParserInputPtr stream;
                   6851: 
                   6852:     if (ioread == NULL)
                   6853:         return (NULL);
                   6854:     xmlInitParser();
                   6855: 
                   6856:     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
                   6857:                                          XML_CHAR_ENCODING_NONE);
1.1.1.2   misho    6858:     if (input == NULL) {
                   6859:         if (ioclose != NULL)
                   6860:             ioclose(ioctx);
1.1       misho    6861:         return (NULL);
1.1.1.2   misho    6862:     }
1.1       misho    6863:     ctxt = htmlNewParserCtxt();
                   6864:     if (ctxt == NULL) {
                   6865:         xmlFreeParserInputBuffer(input);
                   6866:         return (NULL);
                   6867:     }
                   6868:     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
                   6869:     if (stream == NULL) {
                   6870:         xmlFreeParserInputBuffer(input);
                   6871:        xmlFreeParserCtxt(ctxt);
                   6872:         return (NULL);
                   6873:     }
                   6874:     inputPush(ctxt, stream);
                   6875:     return (htmlDoRead(ctxt, URL, encoding, options, 0));
                   6876: }
                   6877: 
                   6878: /**
                   6879:  * htmlCtxtReadDoc:
                   6880:  * @ctxt:  an HTML parser context
                   6881:  * @cur:  a pointer to a zero terminated string
                   6882:  * @URL:  the base URL to use for the document
                   6883:  * @encoding:  the document encoding, or NULL
                   6884:  * @options:  a combination of htmlParserOption(s)
                   6885:  *
                   6886:  * parse an XML in-memory document and build a tree.
                   6887:  * This reuses the existing @ctxt parser context
                   6888:  *
                   6889:  * Returns the resulting document tree
                   6890:  */
                   6891: htmlDocPtr
                   6892: htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
                   6893:                const char *URL, const char *encoding, int options)
                   6894: {
                   6895:     xmlParserInputPtr stream;
                   6896: 
                   6897:     if (cur == NULL)
                   6898:         return (NULL);
                   6899:     if (ctxt == NULL)
                   6900:         return (NULL);
                   6901: 
                   6902:     htmlCtxtReset(ctxt);
                   6903: 
                   6904:     stream = xmlNewStringInputStream(ctxt, cur);
                   6905:     if (stream == NULL) {
                   6906:         return (NULL);
                   6907:     }
                   6908:     inputPush(ctxt, stream);
                   6909:     return (htmlDoRead(ctxt, URL, encoding, options, 1));
                   6910: }
                   6911: 
                   6912: /**
                   6913:  * htmlCtxtReadFile:
                   6914:  * @ctxt:  an HTML parser context
                   6915:  * @filename:  a file or URL
                   6916:  * @encoding:  the document encoding, or NULL
                   6917:  * @options:  a combination of htmlParserOption(s)
                   6918:  *
                   6919:  * parse an XML file from the filesystem or the network.
                   6920:  * This reuses the existing @ctxt parser context
                   6921:  *
                   6922:  * Returns the resulting document tree
                   6923:  */
                   6924: htmlDocPtr
                   6925: htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
                   6926:                 const char *encoding, int options)
                   6927: {
                   6928:     xmlParserInputPtr stream;
                   6929: 
                   6930:     if (filename == NULL)
                   6931:         return (NULL);
                   6932:     if (ctxt == NULL)
                   6933:         return (NULL);
                   6934: 
                   6935:     htmlCtxtReset(ctxt);
                   6936: 
                   6937:     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
                   6938:     if (stream == NULL) {
                   6939:         return (NULL);
                   6940:     }
                   6941:     inputPush(ctxt, stream);
                   6942:     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
                   6943: }
                   6944: 
                   6945: /**
                   6946:  * htmlCtxtReadMemory:
                   6947:  * @ctxt:  an HTML parser context
                   6948:  * @buffer:  a pointer to a char array
                   6949:  * @size:  the size of the array
                   6950:  * @URL:  the base URL to use for the document
                   6951:  * @encoding:  the document encoding, or NULL
                   6952:  * @options:  a combination of htmlParserOption(s)
                   6953:  *
                   6954:  * parse an XML in-memory document and build a tree.
                   6955:  * This reuses the existing @ctxt parser context
                   6956:  *
                   6957:  * Returns the resulting document tree
                   6958:  */
                   6959: htmlDocPtr
                   6960: htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
                   6961:                   const char *URL, const char *encoding, int options)
                   6962: {
                   6963:     xmlParserInputBufferPtr input;
                   6964:     xmlParserInputPtr stream;
                   6965: 
                   6966:     if (ctxt == NULL)
                   6967:         return (NULL);
                   6968:     if (buffer == NULL)
                   6969:         return (NULL);
                   6970: 
                   6971:     htmlCtxtReset(ctxt);
                   6972: 
                   6973:     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
                   6974:     if (input == NULL) {
                   6975:        return(NULL);
                   6976:     }
                   6977: 
                   6978:     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
                   6979:     if (stream == NULL) {
                   6980:        xmlFreeParserInputBuffer(input);
                   6981:        return(NULL);
                   6982:     }
                   6983: 
                   6984:     inputPush(ctxt, stream);
                   6985:     return (htmlDoRead(ctxt, URL, encoding, options, 1));
                   6986: }
                   6987: 
                   6988: /**
                   6989:  * htmlCtxtReadFd:
                   6990:  * @ctxt:  an HTML parser context
                   6991:  * @fd:  an open file descriptor
                   6992:  * @URL:  the base URL to use for the document
                   6993:  * @encoding:  the document encoding, or NULL
                   6994:  * @options:  a combination of htmlParserOption(s)
                   6995:  *
                   6996:  * parse an XML from a file descriptor and build a tree.
                   6997:  * This reuses the existing @ctxt parser context
                   6998:  *
                   6999:  * Returns the resulting document tree
                   7000:  */
                   7001: htmlDocPtr
                   7002: htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
                   7003:               const char *URL, const char *encoding, int options)
                   7004: {
                   7005:     xmlParserInputBufferPtr input;
                   7006:     xmlParserInputPtr stream;
                   7007: 
                   7008:     if (fd < 0)
                   7009:         return (NULL);
                   7010:     if (ctxt == NULL)
                   7011:         return (NULL);
                   7012: 
                   7013:     htmlCtxtReset(ctxt);
                   7014: 
                   7015: 
                   7016:     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
                   7017:     if (input == NULL)
                   7018:         return (NULL);
                   7019:     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
                   7020:     if (stream == NULL) {
                   7021:         xmlFreeParserInputBuffer(input);
                   7022:         return (NULL);
                   7023:     }
                   7024:     inputPush(ctxt, stream);
                   7025:     return (htmlDoRead(ctxt, URL, encoding, options, 1));
                   7026: }
                   7027: 
                   7028: /**
                   7029:  * htmlCtxtReadIO:
                   7030:  * @ctxt:  an HTML parser context
                   7031:  * @ioread:  an I/O read function
                   7032:  * @ioclose:  an I/O close function
                   7033:  * @ioctx:  an I/O handler
                   7034:  * @URL:  the base URL to use for the document
                   7035:  * @encoding:  the document encoding, or NULL
                   7036:  * @options:  a combination of htmlParserOption(s)
                   7037:  *
                   7038:  * parse an HTML document from I/O functions and source and build a tree.
                   7039:  * This reuses the existing @ctxt parser context
                   7040:  *
                   7041:  * Returns the resulting document tree
                   7042:  */
                   7043: htmlDocPtr
                   7044: htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
                   7045:               xmlInputCloseCallback ioclose, void *ioctx,
                   7046:              const char *URL,
                   7047:               const char *encoding, int options)
                   7048: {
                   7049:     xmlParserInputBufferPtr input;
                   7050:     xmlParserInputPtr stream;
                   7051: 
                   7052:     if (ioread == NULL)
                   7053:         return (NULL);
                   7054:     if (ctxt == NULL)
                   7055:         return (NULL);
                   7056: 
                   7057:     htmlCtxtReset(ctxt);
                   7058: 
                   7059:     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
                   7060:                                          XML_CHAR_ENCODING_NONE);
1.1.1.2   misho    7061:     if (input == NULL) {
                   7062:         if (ioclose != NULL)
                   7063:             ioclose(ioctx);
1.1       misho    7064:         return (NULL);
1.1.1.2   misho    7065:     }
1.1       misho    7066:     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
                   7067:     if (stream == NULL) {
                   7068:         xmlFreeParserInputBuffer(input);
                   7069:         return (NULL);
                   7070:     }
                   7071:     inputPush(ctxt, stream);
                   7072:     return (htmlDoRead(ctxt, URL, encoding, options, 1));
                   7073: }
                   7074: 
                   7075: #define bottom_HTMLparser
                   7076: #include "elfgcchack.h"
                   7077: #endif /* LIBXML_HTML_ENABLED */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>