Annotation of embedaddon/libxml2/HTMLtree.c, revision 1.1
1.1 ! misho 1: /*
! 2: * HTMLtree.c : implementation of access function for an HTML tree.
! 3: *
! 4: * See Copyright for the status of this software.
! 5: *
! 6: * daniel@veillard.com
! 7: */
! 8:
! 9:
! 10: #define IN_LIBXML
! 11: #include "libxml.h"
! 12: #ifdef LIBXML_HTML_ENABLED
! 13:
! 14: #include <string.h> /* for memset() only ! */
! 15:
! 16: #ifdef HAVE_CTYPE_H
! 17: #include <ctype.h>
! 18: #endif
! 19: #ifdef HAVE_STDLIB_H
! 20: #include <stdlib.h>
! 21: #endif
! 22:
! 23: #include <libxml/xmlmemory.h>
! 24: #include <libxml/HTMLparser.h>
! 25: #include <libxml/HTMLtree.h>
! 26: #include <libxml/entities.h>
! 27: #include <libxml/valid.h>
! 28: #include <libxml/xmlerror.h>
! 29: #include <libxml/parserInternals.h>
! 30: #include <libxml/globals.h>
! 31: #include <libxml/uri.h>
! 32:
! 33: /************************************************************************
! 34: * *
! 35: * Getting/Setting encoding meta tags *
! 36: * *
! 37: ************************************************************************/
! 38:
! 39: /**
! 40: * htmlGetMetaEncoding:
! 41: * @doc: the document
! 42: *
! 43: * Encoding definition lookup in the Meta tags
! 44: *
! 45: * Returns the current encoding as flagged in the HTML source
! 46: */
! 47: const xmlChar *
! 48: htmlGetMetaEncoding(htmlDocPtr doc) {
! 49: htmlNodePtr cur;
! 50: const xmlChar *content;
! 51: const xmlChar *encoding;
! 52:
! 53: if (doc == NULL)
! 54: return(NULL);
! 55: cur = doc->children;
! 56:
! 57: /*
! 58: * Search the html
! 59: */
! 60: while (cur != NULL) {
! 61: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
! 62: if (xmlStrEqual(cur->name, BAD_CAST"html"))
! 63: break;
! 64: if (xmlStrEqual(cur->name, BAD_CAST"head"))
! 65: goto found_head;
! 66: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
! 67: goto found_meta;
! 68: }
! 69: cur = cur->next;
! 70: }
! 71: if (cur == NULL)
! 72: return(NULL);
! 73: cur = cur->children;
! 74:
! 75: /*
! 76: * Search the head
! 77: */
! 78: while (cur != NULL) {
! 79: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
! 80: if (xmlStrEqual(cur->name, BAD_CAST"head"))
! 81: break;
! 82: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
! 83: goto found_meta;
! 84: }
! 85: cur = cur->next;
! 86: }
! 87: if (cur == NULL)
! 88: return(NULL);
! 89: found_head:
! 90: cur = cur->children;
! 91:
! 92: /*
! 93: * Search the meta elements
! 94: */
! 95: found_meta:
! 96: while (cur != NULL) {
! 97: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
! 98: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
! 99: xmlAttrPtr attr = cur->properties;
! 100: int http;
! 101: const xmlChar *value;
! 102:
! 103: content = NULL;
! 104: http = 0;
! 105: while (attr != NULL) {
! 106: if ((attr->children != NULL) &&
! 107: (attr->children->type == XML_TEXT_NODE) &&
! 108: (attr->children->next == NULL)) {
! 109: value = attr->children->content;
! 110: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
! 111: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
! 112: http = 1;
! 113: else if ((value != NULL)
! 114: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
! 115: content = value;
! 116: if ((http != 0) && (content != NULL))
! 117: goto found_content;
! 118: }
! 119: attr = attr->next;
! 120: }
! 121: }
! 122: }
! 123: cur = cur->next;
! 124: }
! 125: return(NULL);
! 126:
! 127: found_content:
! 128: encoding = xmlStrstr(content, BAD_CAST"charset=");
! 129: if (encoding == NULL)
! 130: encoding = xmlStrstr(content, BAD_CAST"Charset=");
! 131: if (encoding == NULL)
! 132: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
! 133: if (encoding != NULL) {
! 134: encoding += 8;
! 135: } else {
! 136: encoding = xmlStrstr(content, BAD_CAST"charset =");
! 137: if (encoding == NULL)
! 138: encoding = xmlStrstr(content, BAD_CAST"Charset =");
! 139: if (encoding == NULL)
! 140: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
! 141: if (encoding != NULL)
! 142: encoding += 9;
! 143: }
! 144: if (encoding != NULL) {
! 145: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
! 146: }
! 147: return(encoding);
! 148: }
! 149:
! 150: /**
! 151: * htmlSetMetaEncoding:
! 152: * @doc: the document
! 153: * @encoding: the encoding string
! 154: *
! 155: * Sets the current encoding in the Meta tags
! 156: * NOTE: this will not change the document content encoding, just
! 157: * the META flag associated.
! 158: *
! 159: * Returns 0 in case of success and -1 in case of error
! 160: */
! 161: int
! 162: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
! 163: htmlNodePtr cur, meta = NULL, head = NULL;
! 164: const xmlChar *content = NULL;
! 165: char newcontent[100];
! 166:
! 167:
! 168: if (doc == NULL)
! 169: return(-1);
! 170:
! 171: /* html isn't a real encoding it's just libxml2 way to get entities */
! 172: if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
! 173: return(-1);
! 174:
! 175: if (encoding != NULL) {
! 176: snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
! 177: (char *)encoding);
! 178: newcontent[sizeof(newcontent) - 1] = 0;
! 179: }
! 180:
! 181: cur = doc->children;
! 182:
! 183: /*
! 184: * Search the html
! 185: */
! 186: while (cur != NULL) {
! 187: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
! 188: if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
! 189: break;
! 190: if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
! 191: goto found_head;
! 192: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
! 193: goto found_meta;
! 194: }
! 195: cur = cur->next;
! 196: }
! 197: if (cur == NULL)
! 198: return(-1);
! 199: cur = cur->children;
! 200:
! 201: /*
! 202: * Search the head
! 203: */
! 204: while (cur != NULL) {
! 205: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
! 206: if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
! 207: break;
! 208: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
! 209: head = cur->parent;
! 210: goto found_meta;
! 211: }
! 212: }
! 213: cur = cur->next;
! 214: }
! 215: if (cur == NULL)
! 216: return(-1);
! 217: found_head:
! 218: head = cur;
! 219: if (cur->children == NULL)
! 220: goto create;
! 221: cur = cur->children;
! 222:
! 223: found_meta:
! 224: /*
! 225: * Search and update all the remaining the meta elements carrying
! 226: * encoding informations
! 227: */
! 228: while (cur != NULL) {
! 229: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
! 230: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
! 231: xmlAttrPtr attr = cur->properties;
! 232: int http;
! 233: const xmlChar *value;
! 234:
! 235: content = NULL;
! 236: http = 0;
! 237: while (attr != NULL) {
! 238: if ((attr->children != NULL) &&
! 239: (attr->children->type == XML_TEXT_NODE) &&
! 240: (attr->children->next == NULL)) {
! 241: value = attr->children->content;
! 242: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
! 243: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
! 244: http = 1;
! 245: else
! 246: {
! 247: if ((value != NULL) &&
! 248: (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
! 249: content = value;
! 250: }
! 251: if ((http != 0) && (content != NULL))
! 252: break;
! 253: }
! 254: attr = attr->next;
! 255: }
! 256: if ((http != 0) && (content != NULL)) {
! 257: meta = cur;
! 258: break;
! 259: }
! 260:
! 261: }
! 262: }
! 263: cur = cur->next;
! 264: }
! 265: create:
! 266: if (meta == NULL) {
! 267: if ((encoding != NULL) && (head != NULL)) {
! 268: /*
! 269: * Create a new Meta element with the right attributes
! 270: */
! 271:
! 272: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
! 273: if (head->children == NULL)
! 274: xmlAddChild(head, meta);
! 275: else
! 276: xmlAddPrevSibling(head->children, meta);
! 277: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
! 278: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
! 279: }
! 280: } else {
! 281: /* change the document only if there is a real encoding change */
! 282: if (xmlStrcasestr(content, encoding) == NULL) {
! 283: xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
! 284: }
! 285: }
! 286:
! 287:
! 288: return(0);
! 289: }
! 290:
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which are output in
 * minimized form, i.e. <option selected="selected"> is written as
 * <option selected>, as per XSLT 1.0 16.2 "HTML Output Method".
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
! 304:
! 305:
! 306: /**
! 307: * htmlIsBooleanAttr:
! 308: * @name: the name of the attribute to check
! 309: *
! 310: * Determine if a given attribute is a boolean attribute.
! 311: *
! 312: * returns: false if the attribute is not boolean, true otherwise.
! 313: */
! 314: int
! 315: htmlIsBooleanAttr(const xmlChar *name)
! 316: {
! 317: int i = 0;
! 318:
! 319: while (htmlBooleanAttrs[i] != NULL) {
! 320: if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
! 321: return 1;
! 322: i++;
! 323: }
! 324: return 0;
! 325: }
! 326:
! 327: #ifdef LIBXML_OUTPUT_ENABLED
! 328: /*
! 329: * private routine exported from xmlIO.c
! 330: */
! 331: xmlOutputBufferPtr
! 332: xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
! 333: /************************************************************************
! 334: * *
! 335: * Output error handlers *
! 336: * *
! 337: ************************************************************************/
! 338: /**
! 339: * htmlSaveErrMemory:
! 340: * @extra: extra informations
! 341: *
! 342: * Handle an out of memory condition
! 343: */
! 344: static void
! 345: htmlSaveErrMemory(const char *extra)
! 346: {
! 347: __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
! 348: }
! 349:
! 350: /**
! 351: * htmlSaveErr:
! 352: * @code: the error number
! 353: * @node: the location of the error.
! 354: * @extra: extra informations
! 355: *
! 356: * Handle an out of memory condition
! 357: */
! 358: static void
! 359: htmlSaveErr(int code, xmlNodePtr node, const char *extra)
! 360: {
! 361: const char *msg = NULL;
! 362:
! 363: switch(code) {
! 364: case XML_SAVE_NOT_UTF8:
! 365: msg = "string is not in UTF-8\n";
! 366: break;
! 367: case XML_SAVE_CHAR_INVALID:
! 368: msg = "invalid character value\n";
! 369: break;
! 370: case XML_SAVE_UNKNOWN_ENCODING:
! 371: msg = "unknown encoding %s\n";
! 372: break;
! 373: case XML_SAVE_NO_DOCTYPE:
! 374: msg = "HTML has no DOCTYPE\n";
! 375: break;
! 376: default:
! 377: msg = "unexpected error number\n";
! 378: }
! 379: __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
! 380: }
! 381:
! 382: /************************************************************************
! 383: * *
! 384: * Dumping HTML tree content to a simple buffer *
! 385: * *
! 386: ************************************************************************/
! 387:
! 388: static int
! 389: htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
! 390: int format);
! 391:
! 392: /**
! 393: * htmlNodeDumpFormat:
! 394: * @buf: the HTML buffer output
! 395: * @doc: the document
! 396: * @cur: the current node
! 397: * @format: should formatting spaces been added
! 398: *
! 399: * Dump an HTML node, recursive behaviour,children are printed too.
! 400: *
! 401: * Returns the number of byte written or -1 in case of error
! 402: */
! 403: static int
! 404: htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
! 405: int format) {
! 406: unsigned int use;
! 407: int ret;
! 408: xmlOutputBufferPtr outbuf;
! 409:
! 410: if (cur == NULL) {
! 411: return (-1);
! 412: }
! 413: if (buf == NULL) {
! 414: return (-1);
! 415: }
! 416: outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
! 417: if (outbuf == NULL) {
! 418: htmlSaveErrMemory("allocating HTML output buffer");
! 419: return (-1);
! 420: }
! 421: memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
! 422: outbuf->buffer = buf;
! 423: outbuf->encoder = NULL;
! 424: outbuf->writecallback = NULL;
! 425: outbuf->closecallback = NULL;
! 426: outbuf->context = NULL;
! 427: outbuf->written = 0;
! 428:
! 429: use = buf->use;
! 430: htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
! 431: xmlFree(outbuf);
! 432: ret = buf->use - use;
! 433: return (ret);
! 434: }
! 435:
! 436: /**
! 437: * htmlNodeDump:
! 438: * @buf: the HTML buffer output
! 439: * @doc: the document
! 440: * @cur: the current node
! 441: *
! 442: * Dump an HTML node, recursive behaviour,children are printed too,
! 443: * and formatting returns are added.
! 444: *
! 445: * Returns the number of byte written or -1 in case of error
! 446: */
! 447: int
! 448: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
! 449: xmlInitParser();
! 450:
! 451: return(htmlNodeDumpFormat(buf, doc, cur, 1));
! 452: }
! 453:
! 454: /**
! 455: * htmlNodeDumpFileFormat:
! 456: * @out: the FILE pointer
! 457: * @doc: the document
! 458: * @cur: the current node
! 459: * @encoding: the document encoding
! 460: * @format: should formatting spaces been added
! 461: *
! 462: * Dump an HTML node, recursive behaviour,children are printed too.
! 463: *
! 464: * TODO: if encoding == NULL try to save in the doc encoding
! 465: *
! 466: * returns: the number of byte written or -1 in case of failure.
! 467: */
! 468: int
! 469: htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
! 470: xmlNodePtr cur, const char *encoding, int format) {
! 471: xmlOutputBufferPtr buf;
! 472: xmlCharEncodingHandlerPtr handler = NULL;
! 473: int ret;
! 474:
! 475: xmlInitParser();
! 476:
! 477: if (encoding != NULL) {
! 478: xmlCharEncoding enc;
! 479:
! 480: enc = xmlParseCharEncoding(encoding);
! 481: if (enc != XML_CHAR_ENCODING_UTF8) {
! 482: handler = xmlFindCharEncodingHandler(encoding);
! 483: if (handler == NULL)
! 484: return(-1);
! 485: }
! 486: }
! 487:
! 488: /*
! 489: * Fallback to HTML or ASCII when the encoding is unspecified
! 490: */
! 491: if (handler == NULL)
! 492: handler = xmlFindCharEncodingHandler("HTML");
! 493: if (handler == NULL)
! 494: handler = xmlFindCharEncodingHandler("ascii");
! 495:
! 496: /*
! 497: * save the content to a temp buffer.
! 498: */
! 499: buf = xmlOutputBufferCreateFile(out, handler);
! 500: if (buf == NULL) return(0);
! 501:
! 502: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
! 503:
! 504: ret = xmlOutputBufferClose(buf);
! 505: return(ret);
! 506: }
! 507:
! 508: /**
! 509: * htmlNodeDumpFile:
! 510: * @out: the FILE pointer
! 511: * @doc: the document
! 512: * @cur: the current node
! 513: *
! 514: * Dump an HTML node, recursive behaviour,children are printed too,
! 515: * and formatting returns are added.
! 516: */
! 517: void
! 518: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
! 519: htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
! 520: }
! 521:
! 522: /**
! 523: * htmlDocDumpMemoryFormat:
! 524: * @cur: the document
! 525: * @mem: OUT: the memory pointer
! 526: * @size: OUT: the memory length
! 527: * @format: should formatting spaces been added
! 528: *
! 529: * Dump an HTML document in memory and return the xmlChar * and it's size.
! 530: * It's up to the caller to free the memory.
! 531: */
! 532: void
! 533: htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
! 534: xmlOutputBufferPtr buf;
! 535: xmlCharEncodingHandlerPtr handler = NULL;
! 536: const char *encoding;
! 537:
! 538: xmlInitParser();
! 539:
! 540: if ((mem == NULL) || (size == NULL))
! 541: return;
! 542: if (cur == NULL) {
! 543: *mem = NULL;
! 544: *size = 0;
! 545: return;
! 546: }
! 547:
! 548: encoding = (const char *) htmlGetMetaEncoding(cur);
! 549:
! 550: if (encoding != NULL) {
! 551: xmlCharEncoding enc;
! 552:
! 553: enc = xmlParseCharEncoding(encoding);
! 554: if (enc != cur->charset) {
! 555: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
! 556: /*
! 557: * Not supported yet
! 558: */
! 559: *mem = NULL;
! 560: *size = 0;
! 561: return;
! 562: }
! 563:
! 564: handler = xmlFindCharEncodingHandler(encoding);
! 565: if (handler == NULL) {
! 566: *mem = NULL;
! 567: *size = 0;
! 568: return;
! 569: }
! 570: } else {
! 571: handler = xmlFindCharEncodingHandler(encoding);
! 572: }
! 573: }
! 574:
! 575: /*
! 576: * Fallback to HTML or ASCII when the encoding is unspecified
! 577: */
! 578: if (handler == NULL)
! 579: handler = xmlFindCharEncodingHandler("HTML");
! 580: if (handler == NULL)
! 581: handler = xmlFindCharEncodingHandler("ascii");
! 582:
! 583: buf = xmlAllocOutputBufferInternal(handler);
! 584: if (buf == NULL) {
! 585: *mem = NULL;
! 586: *size = 0;
! 587: return;
! 588: }
! 589:
! 590: htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
! 591:
! 592: xmlOutputBufferFlush(buf);
! 593: if (buf->conv != NULL) {
! 594: *size = buf->conv->use;
! 595: *mem = xmlStrndup(buf->conv->content, *size);
! 596: } else {
! 597: *size = buf->buffer->use;
! 598: *mem = xmlStrndup(buf->buffer->content, *size);
! 599: }
! 600: (void)xmlOutputBufferClose(buf);
! 601: }
! 602:
! 603: /**
! 604: * htmlDocDumpMemory:
! 605: * @cur: the document
! 606: * @mem: OUT: the memory pointer
! 607: * @size: OUT: the memory length
! 608: *
! 609: * Dump an HTML document in memory and return the xmlChar * and it's size.
! 610: * It's up to the caller to free the memory.
! 611: */
! 612: void
! 613: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
! 614: htmlDocDumpMemoryFormat(cur, mem, size, 1);
! 615: }
! 616:
! 617:
! 618: /************************************************************************
! 619: * *
! 620: * Dumping HTML tree content to an I/O output buffer *
! 621: * *
! 622: ************************************************************************/
! 623:
! 624: void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
! 625:
! 626: /**
! 627: * htmlDtdDumpOutput:
! 628: * @buf: the HTML buffer output
! 629: * @doc: the document
! 630: * @encoding: the encoding string
! 631: *
! 632: * TODO: check whether encoding is needed
! 633: *
! 634: * Dump the HTML document DTD, if any.
! 635: */
! 636: static void
! 637: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
! 638: const char *encoding ATTRIBUTE_UNUSED) {
! 639: xmlDtdPtr cur = doc->intSubset;
! 640:
! 641: if (cur == NULL) {
! 642: htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
! 643: return;
! 644: }
! 645: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
! 646: xmlOutputBufferWriteString(buf, (const char *)cur->name);
! 647: if (cur->ExternalID != NULL) {
! 648: xmlOutputBufferWriteString(buf, " PUBLIC ");
! 649: xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
! 650: if (cur->SystemID != NULL) {
! 651: xmlOutputBufferWriteString(buf, " ");
! 652: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
! 653: }
! 654: } else if (cur->SystemID != NULL) {
! 655: xmlOutputBufferWriteString(buf, " SYSTEM ");
! 656: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
! 657: }
! 658: xmlOutputBufferWriteString(buf, ">\n");
! 659: }
! 660:
! 661: /**
! 662: * htmlAttrDumpOutput:
! 663: * @buf: the HTML buffer output
! 664: * @doc: the document
! 665: * @cur: the attribute pointer
! 666: * @encoding: the encoding string
! 667: *
! 668: * Dump an HTML attribute
! 669: */
! 670: static void
! 671: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
! 672: const char *encoding ATTRIBUTE_UNUSED) {
! 673: xmlChar *value;
! 674:
! 675: /*
! 676: * TODO: The html output method should not escape a & character
! 677: * occurring in an attribute value immediately followed by
! 678: * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
! 679: */
! 680:
! 681: if (cur == NULL) {
! 682: return;
! 683: }
! 684: xmlOutputBufferWriteString(buf, " ");
! 685: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
! 686: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
! 687: xmlOutputBufferWriteString(buf, ":");
! 688: }
! 689: xmlOutputBufferWriteString(buf, (const char *)cur->name);
! 690: if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
! 691: value = xmlNodeListGetString(doc, cur->children, 0);
! 692: if (value) {
! 693: xmlOutputBufferWriteString(buf, "=");
! 694: if ((cur->ns == NULL) && (cur->parent != NULL) &&
! 695: (cur->parent->ns == NULL) &&
! 696: ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
! 697: (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
! 698: (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
! 699: ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
! 700: (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
! 701: xmlChar *escaped;
! 702: xmlChar *tmp = value;
! 703:
! 704: while (IS_BLANK_CH(*tmp)) tmp++;
! 705:
! 706: escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
! 707: if (escaped != NULL) {
! 708: xmlBufferWriteQuotedString(buf->buffer, escaped);
! 709: xmlFree(escaped);
! 710: } else {
! 711: xmlBufferWriteQuotedString(buf->buffer, value);
! 712: }
! 713: } else {
! 714: xmlBufferWriteQuotedString(buf->buffer, value);
! 715: }
! 716: xmlFree(value);
! 717: } else {
! 718: xmlOutputBufferWriteString(buf, "=\"\"");
! 719: }
! 720: }
! 721: }
! 722:
! 723: /**
! 724: * htmlAttrListDumpOutput:
! 725: * @buf: the HTML buffer output
! 726: * @doc: the document
! 727: * @cur: the first attribute pointer
! 728: * @encoding: the encoding string
! 729: *
! 730: * Dump a list of HTML attributes
! 731: */
! 732: static void
! 733: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
! 734: if (cur == NULL) {
! 735: return;
! 736: }
! 737: while (cur != NULL) {
! 738: htmlAttrDumpOutput(buf, doc, cur, encoding);
! 739: cur = cur->next;
! 740: }
! 741: }
! 742:
! 743:
! 744:
! 745: /**
! 746: * htmlNodeListDumpOutput:
! 747: * @buf: the HTML buffer output
! 748: * @doc: the document
! 749: * @cur: the first node
! 750: * @encoding: the encoding string
! 751: * @format: should formatting spaces been added
! 752: *
! 753: * Dump an HTML node list, recursive behaviour,children are printed too.
! 754: */
! 755: static void
! 756: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
! 757: xmlNodePtr cur, const char *encoding, int format) {
! 758: if (cur == NULL) {
! 759: return;
! 760: }
! 761: while (cur != NULL) {
! 762: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
! 763: cur = cur->next;
! 764: }
! 765: }
! 766:
! 767: /**
! 768: * htmlNodeDumpFormatOutput:
! 769: * @buf: the HTML buffer output
! 770: * @doc: the document
! 771: * @cur: the current node
! 772: * @encoding: the encoding string
! 773: * @format: should formatting spaces been added
! 774: *
! 775: * Dump an HTML node, recursive behaviour,children are printed too.
! 776: */
! 777: void
! 778: htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
! 779: xmlNodePtr cur, const char *encoding, int format) {
! 780: const htmlElemDesc * info;
! 781:
! 782: xmlInitParser();
! 783:
! 784: if ((cur == NULL) || (buf == NULL)) {
! 785: return;
! 786: }
! 787: /*
! 788: * Special cases.
! 789: */
! 790: if (cur->type == XML_DTD_NODE)
! 791: return;
! 792: if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
! 793: (cur->type == XML_DOCUMENT_NODE)){
! 794: htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
! 795: return;
! 796: }
! 797: if (cur->type == XML_ATTRIBUTE_NODE) {
! 798: htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
! 799: return;
! 800: }
! 801: if (cur->type == HTML_TEXT_NODE) {
! 802: if (cur->content != NULL) {
! 803: if (((cur->name == (const xmlChar *)xmlStringText) ||
! 804: (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
! 805: ((cur->parent == NULL) ||
! 806: ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
! 807: (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
! 808: xmlChar *buffer;
! 809:
! 810: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
! 811: if (buffer != NULL) {
! 812: xmlOutputBufferWriteString(buf, (const char *)buffer);
! 813: xmlFree(buffer);
! 814: }
! 815: } else {
! 816: xmlOutputBufferWriteString(buf, (const char *)cur->content);
! 817: }
! 818: }
! 819: return;
! 820: }
! 821: if (cur->type == HTML_COMMENT_NODE) {
! 822: if (cur->content != NULL) {
! 823: xmlOutputBufferWriteString(buf, "<!--");
! 824: xmlOutputBufferWriteString(buf, (const char *)cur->content);
! 825: xmlOutputBufferWriteString(buf, "-->");
! 826: }
! 827: return;
! 828: }
! 829: if (cur->type == HTML_PI_NODE) {
! 830: if (cur->name == NULL)
! 831: return;
! 832: xmlOutputBufferWriteString(buf, "<?");
! 833: xmlOutputBufferWriteString(buf, (const char *)cur->name);
! 834: if (cur->content != NULL) {
! 835: xmlOutputBufferWriteString(buf, " ");
! 836: xmlOutputBufferWriteString(buf, (const char *)cur->content);
! 837: }
! 838: xmlOutputBufferWriteString(buf, ">");
! 839: return;
! 840: }
! 841: if (cur->type == HTML_ENTITY_REF_NODE) {
! 842: xmlOutputBufferWriteString(buf, "&");
! 843: xmlOutputBufferWriteString(buf, (const char *)cur->name);
! 844: xmlOutputBufferWriteString(buf, ";");
! 845: return;
! 846: }
! 847: if (cur->type == HTML_PRESERVE_NODE) {
! 848: if (cur->content != NULL) {
! 849: xmlOutputBufferWriteString(buf, (const char *)cur->content);
! 850: }
! 851: return;
! 852: }
! 853:
! 854: /*
! 855: * Get specific HTML info for that node.
! 856: */
! 857: if (cur->ns == NULL)
! 858: info = htmlTagLookup(cur->name);
! 859: else
! 860: info = NULL;
! 861:
! 862: xmlOutputBufferWriteString(buf, "<");
! 863: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
! 864: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
! 865: xmlOutputBufferWriteString(buf, ":");
! 866: }
! 867: xmlOutputBufferWriteString(buf, (const char *)cur->name);
! 868: if (cur->nsDef)
! 869: xmlNsListDumpOutput(buf, cur->nsDef);
! 870: if (cur->properties != NULL)
! 871: htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
! 872:
! 873: if ((info != NULL) && (info->empty)) {
! 874: xmlOutputBufferWriteString(buf, ">");
! 875: if ((format) && (!info->isinline) && (cur->next != NULL)) {
! 876: if ((cur->next->type != HTML_TEXT_NODE) &&
! 877: (cur->next->type != HTML_ENTITY_REF_NODE) &&
! 878: (cur->parent != NULL) &&
! 879: (cur->parent->name != NULL) &&
! 880: (cur->parent->name[0] != 'p')) /* p, pre, param */
! 881: xmlOutputBufferWriteString(buf, "\n");
! 882: }
! 883: return;
! 884: }
! 885: if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
! 886: (cur->children == NULL)) {
! 887: if ((info != NULL) && (info->saveEndTag != 0) &&
! 888: (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
! 889: (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
! 890: xmlOutputBufferWriteString(buf, ">");
! 891: } else {
! 892: xmlOutputBufferWriteString(buf, "></");
! 893: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
! 894: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
! 895: xmlOutputBufferWriteString(buf, ":");
! 896: }
! 897: xmlOutputBufferWriteString(buf, (const char *)cur->name);
! 898: xmlOutputBufferWriteString(buf, ">");
! 899: }
! 900: if ((format) && (cur->next != NULL) &&
! 901: (info != NULL) && (!info->isinline)) {
! 902: if ((cur->next->type != HTML_TEXT_NODE) &&
! 903: (cur->next->type != HTML_ENTITY_REF_NODE) &&
! 904: (cur->parent != NULL) &&
! 905: (cur->parent->name != NULL) &&
! 906: (cur->parent->name[0] != 'p')) /* p, pre, param */
! 907: xmlOutputBufferWriteString(buf, "\n");
! 908: }
! 909: return;
! 910: }
! 911: xmlOutputBufferWriteString(buf, ">");
! 912: if ((cur->type != XML_ELEMENT_NODE) &&
! 913: (cur->content != NULL)) {
! 914: /*
! 915: * Uses the OutputBuffer property to automatically convert
! 916: * invalids to charrefs
! 917: */
! 918:
! 919: xmlOutputBufferWriteString(buf, (const char *) cur->content);
! 920: }
! 921: if (cur->children != NULL) {
! 922: if ((format) && (info != NULL) && (!info->isinline) &&
! 923: (cur->children->type != HTML_TEXT_NODE) &&
! 924: (cur->children->type != HTML_ENTITY_REF_NODE) &&
! 925: (cur->children != cur->last) &&
! 926: (cur->name != NULL) &&
! 927: (cur->name[0] != 'p')) /* p, pre, param */
! 928: xmlOutputBufferWriteString(buf, "\n");
! 929: htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
! 930: if ((format) && (info != NULL) && (!info->isinline) &&
! 931: (cur->last->type != HTML_TEXT_NODE) &&
! 932: (cur->last->type != HTML_ENTITY_REF_NODE) &&
! 933: (cur->children != cur->last) &&
! 934: (cur->name != NULL) &&
! 935: (cur->name[0] != 'p')) /* p, pre, param */
! 936: xmlOutputBufferWriteString(buf, "\n");
! 937: }
! 938: xmlOutputBufferWriteString(buf, "</");
! 939: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
! 940: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
! 941: xmlOutputBufferWriteString(buf, ":");
! 942: }
! 943: xmlOutputBufferWriteString(buf, (const char *)cur->name);
! 944: xmlOutputBufferWriteString(buf, ">");
! 945: if ((format) && (info != NULL) && (!info->isinline) &&
! 946: (cur->next != NULL)) {
! 947: if ((cur->next->type != HTML_TEXT_NODE) &&
! 948: (cur->next->type != HTML_ENTITY_REF_NODE) &&
! 949: (cur->parent != NULL) &&
! 950: (cur->parent->name != NULL) &&
! 951: (cur->parent->name[0] != 'p')) /* p, pre, param */
! 952: xmlOutputBufferWriteString(buf, "\n");
! 953: }
! 954: }
! 955:
! 956: /**
! 957: * htmlNodeDumpOutput:
! 958: * @buf: the HTML buffer output
! 959: * @doc: the document
! 960: * @cur: the current node
! 961: * @encoding: the encoding string
! 962: *
! 963: * Dump an HTML node, recursive behaviour,children are printed too,
! 964: * and formatting returns/spaces are added.
! 965: */
! 966: void
! 967: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
! 968: xmlNodePtr cur, const char *encoding) {
! 969: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
! 970: }
! 971:
! 972: /**
! 973: * htmlDocContentDumpFormatOutput:
! 974: * @buf: the HTML buffer output
! 975: * @cur: the document
! 976: * @encoding: the encoding string
! 977: * @format: should formatting spaces been added
! 978: *
! 979: * Dump an HTML document.
! 980: */
! 981: void
! 982: htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
! 983: const char *encoding, int format) {
! 984: int type;
! 985:
! 986: xmlInitParser();
! 987:
! 988: if ((buf == NULL) || (cur == NULL))
! 989: return;
! 990:
! 991: /*
! 992: * force to output the stuff as HTML, especially for entities
! 993: */
! 994: type = cur->type;
! 995: cur->type = XML_HTML_DOCUMENT_NODE;
! 996: if (cur->intSubset != NULL) {
! 997: htmlDtdDumpOutput(buf, cur, NULL);
! 998: }
! 999: if (cur->children != NULL) {
! 1000: htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
! 1001: }
! 1002: xmlOutputBufferWriteString(buf, "\n");
! 1003: cur->type = (xmlElementType) type;
! 1004: }
! 1005:
! 1006: /**
! 1007: * htmlDocContentDumpOutput:
! 1008: * @buf: the HTML buffer output
! 1009: * @cur: the document
! 1010: * @encoding: the encoding string
! 1011: *
! 1012: * Dump an HTML document. Formating return/spaces are added.
! 1013: */
! 1014: void
! 1015: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
! 1016: const char *encoding) {
! 1017: htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
! 1018: }
! 1019:
! 1020: /************************************************************************
! 1021: * *
! 1022: * Saving functions front-ends *
! 1023: * *
! 1024: ************************************************************************/
! 1025:
! 1026: /**
! 1027: * htmlDocDump:
! 1028: * @f: the FILE*
! 1029: * @cur: the document
! 1030: *
! 1031: * Dump an HTML document to an open FILE.
! 1032: *
! 1033: * returns: the number of byte written or -1 in case of failure.
! 1034: */
! 1035: int
! 1036: htmlDocDump(FILE *f, xmlDocPtr cur) {
! 1037: xmlOutputBufferPtr buf;
! 1038: xmlCharEncodingHandlerPtr handler = NULL;
! 1039: const char *encoding;
! 1040: int ret;
! 1041:
! 1042: xmlInitParser();
! 1043:
! 1044: if ((cur == NULL) || (f == NULL)) {
! 1045: return(-1);
! 1046: }
! 1047:
! 1048: encoding = (const char *) htmlGetMetaEncoding(cur);
! 1049:
! 1050: if (encoding != NULL) {
! 1051: xmlCharEncoding enc;
! 1052:
! 1053: enc = xmlParseCharEncoding(encoding);
! 1054: if (enc != cur->charset) {
! 1055: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
! 1056: /*
! 1057: * Not supported yet
! 1058: */
! 1059: return(-1);
! 1060: }
! 1061:
! 1062: handler = xmlFindCharEncodingHandler(encoding);
! 1063: if (handler == NULL)
! 1064: return(-1);
! 1065: } else {
! 1066: handler = xmlFindCharEncodingHandler(encoding);
! 1067: }
! 1068: }
! 1069:
! 1070: /*
! 1071: * Fallback to HTML or ASCII when the encoding is unspecified
! 1072: */
! 1073: if (handler == NULL)
! 1074: handler = xmlFindCharEncodingHandler("HTML");
! 1075: if (handler == NULL)
! 1076: handler = xmlFindCharEncodingHandler("ascii");
! 1077:
! 1078: buf = xmlOutputBufferCreateFile(f, handler);
! 1079: if (buf == NULL) return(-1);
! 1080: htmlDocContentDumpOutput(buf, cur, NULL);
! 1081:
! 1082: ret = xmlOutputBufferClose(buf);
! 1083: return(ret);
! 1084: }
! 1085:
! 1086: /**
! 1087: * htmlSaveFile:
! 1088: * @filename: the filename (or URL)
! 1089: * @cur: the document
! 1090: *
! 1091: * Dump an HTML document to a file. If @filename is "-" the stdout file is
! 1092: * used.
! 1093: * returns: the number of byte written or -1 in case of failure.
! 1094: */
! 1095: int
! 1096: htmlSaveFile(const char *filename, xmlDocPtr cur) {
! 1097: xmlOutputBufferPtr buf;
! 1098: xmlCharEncodingHandlerPtr handler = NULL;
! 1099: const char *encoding;
! 1100: int ret;
! 1101:
! 1102: if ((cur == NULL) || (filename == NULL))
! 1103: return(-1);
! 1104:
! 1105: xmlInitParser();
! 1106:
! 1107: encoding = (const char *) htmlGetMetaEncoding(cur);
! 1108:
! 1109: if (encoding != NULL) {
! 1110: xmlCharEncoding enc;
! 1111:
! 1112: enc = xmlParseCharEncoding(encoding);
! 1113: if (enc != cur->charset) {
! 1114: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
! 1115: /*
! 1116: * Not supported yet
! 1117: */
! 1118: return(-1);
! 1119: }
! 1120:
! 1121: handler = xmlFindCharEncodingHandler(encoding);
! 1122: if (handler == NULL)
! 1123: return(-1);
! 1124: }
! 1125: }
! 1126:
! 1127: /*
! 1128: * Fallback to HTML or ASCII when the encoding is unspecified
! 1129: */
! 1130: if (handler == NULL)
! 1131: handler = xmlFindCharEncodingHandler("HTML");
! 1132: if (handler == NULL)
! 1133: handler = xmlFindCharEncodingHandler("ascii");
! 1134:
! 1135: /*
! 1136: * save the content to a temp buffer.
! 1137: */
! 1138: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
! 1139: if (buf == NULL) return(0);
! 1140:
! 1141: htmlDocContentDumpOutput(buf, cur, NULL);
! 1142:
! 1143: ret = xmlOutputBufferClose(buf);
! 1144: return(ret);
! 1145: }
! 1146:
! 1147: /**
! 1148: * htmlSaveFileFormat:
! 1149: * @filename: the filename
! 1150: * @cur: the document
! 1151: * @format: should formatting spaces been added
! 1152: * @encoding: the document encoding
! 1153: *
! 1154: * Dump an HTML document to a file using a given encoding.
! 1155: *
! 1156: * returns: the number of byte written or -1 in case of failure.
! 1157: */
! 1158: int
! 1159: htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
! 1160: const char *encoding, int format) {
! 1161: xmlOutputBufferPtr buf;
! 1162: xmlCharEncodingHandlerPtr handler = NULL;
! 1163: int ret;
! 1164:
! 1165: if ((cur == NULL) || (filename == NULL))
! 1166: return(-1);
! 1167:
! 1168: xmlInitParser();
! 1169:
! 1170: if (encoding != NULL) {
! 1171: xmlCharEncoding enc;
! 1172:
! 1173: enc = xmlParseCharEncoding(encoding);
! 1174: if (enc != cur->charset) {
! 1175: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
! 1176: /*
! 1177: * Not supported yet
! 1178: */
! 1179: return(-1);
! 1180: }
! 1181:
! 1182: handler = xmlFindCharEncodingHandler(encoding);
! 1183: if (handler == NULL)
! 1184: return(-1);
! 1185: }
! 1186: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
! 1187: } else {
! 1188: htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
! 1189: }
! 1190:
! 1191: /*
! 1192: * Fallback to HTML or ASCII when the encoding is unspecified
! 1193: */
! 1194: if (handler == NULL)
! 1195: handler = xmlFindCharEncodingHandler("HTML");
! 1196: if (handler == NULL)
! 1197: handler = xmlFindCharEncodingHandler("ascii");
! 1198:
! 1199: /*
! 1200: * save the content to a temp buffer.
! 1201: */
! 1202: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
! 1203: if (buf == NULL) return(0);
! 1204:
! 1205: htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
! 1206:
! 1207: ret = xmlOutputBufferClose(buf);
! 1208: return(ret);
! 1209: }
! 1210:
! 1211: /**
! 1212: * htmlSaveFileEnc:
! 1213: * @filename: the filename
! 1214: * @cur: the document
! 1215: * @encoding: the document encoding
! 1216: *
! 1217: * Dump an HTML document to a file using a given encoding
! 1218: * and formatting returns/spaces are added.
! 1219: *
! 1220: * returns: the number of byte written or -1 in case of failure.
! 1221: */
! 1222: int
! 1223: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
! 1224: return(htmlSaveFileFormat(filename, cur, encoding, 1));
! 1225: }
! 1226:
! 1227: #endif /* LIBXML_OUTPUT_ENABLED */
! 1228:
! 1229: #define bottom_HTMLtree
! 1230: #include "elfgcchack.h"
! 1231: #endif /* LIBXML_HTML_ENABLED */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>