Annotation of embedaddon/libxml2/HTMLtree.c, revision 1.1.1.1
1.1 misho 1: /*
2: * HTMLtree.c : implementation of access function for an HTML tree.
3: *
4: * See Copyright for the status of this software.
5: *
6: * daniel@veillard.com
7: */
8:
9:
10: #define IN_LIBXML
11: #include "libxml.h"
12: #ifdef LIBXML_HTML_ENABLED
13:
14: #include <string.h> /* for memset() only ! */
15:
16: #ifdef HAVE_CTYPE_H
17: #include <ctype.h>
18: #endif
19: #ifdef HAVE_STDLIB_H
20: #include <stdlib.h>
21: #endif
22:
23: #include <libxml/xmlmemory.h>
24: #include <libxml/HTMLparser.h>
25: #include <libxml/HTMLtree.h>
26: #include <libxml/entities.h>
27: #include <libxml/valid.h>
28: #include <libxml/xmlerror.h>
29: #include <libxml/parserInternals.h>
30: #include <libxml/globals.h>
31: #include <libxml/uri.h>
32:
33: /************************************************************************
34: * *
35: * Getting/Setting encoding meta tags *
36: * *
37: ************************************************************************/
38:
39: /**
40: * htmlGetMetaEncoding:
41: * @doc: the document
42: *
43: * Encoding definition lookup in the Meta tags
44: *
45: * Returns the current encoding as flagged in the HTML source
46: */
47: const xmlChar *
48: htmlGetMetaEncoding(htmlDocPtr doc) {
49: htmlNodePtr cur;
50: const xmlChar *content;
51: const xmlChar *encoding;
52:
53: if (doc == NULL)
54: return(NULL);
55: cur = doc->children;
56:
57: /*
58: * Search the html
59: */
60: while (cur != NULL) {
61: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62: if (xmlStrEqual(cur->name, BAD_CAST"html"))
63: break;
64: if (xmlStrEqual(cur->name, BAD_CAST"head"))
65: goto found_head;
66: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67: goto found_meta;
68: }
69: cur = cur->next;
70: }
71: if (cur == NULL)
72: return(NULL);
73: cur = cur->children;
74:
75: /*
76: * Search the head
77: */
78: while (cur != NULL) {
79: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80: if (xmlStrEqual(cur->name, BAD_CAST"head"))
81: break;
82: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83: goto found_meta;
84: }
85: cur = cur->next;
86: }
87: if (cur == NULL)
88: return(NULL);
89: found_head:
90: cur = cur->children;
91:
92: /*
93: * Search the meta elements
94: */
95: found_meta:
96: while (cur != NULL) {
97: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99: xmlAttrPtr attr = cur->properties;
100: int http;
101: const xmlChar *value;
102:
103: content = NULL;
104: http = 0;
105: while (attr != NULL) {
106: if ((attr->children != NULL) &&
107: (attr->children->type == XML_TEXT_NODE) &&
108: (attr->children->next == NULL)) {
109: value = attr->children->content;
110: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112: http = 1;
113: else if ((value != NULL)
114: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115: content = value;
116: if ((http != 0) && (content != NULL))
117: goto found_content;
118: }
119: attr = attr->next;
120: }
121: }
122: }
123: cur = cur->next;
124: }
125: return(NULL);
126:
127: found_content:
128: encoding = xmlStrstr(content, BAD_CAST"charset=");
129: if (encoding == NULL)
130: encoding = xmlStrstr(content, BAD_CAST"Charset=");
131: if (encoding == NULL)
132: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133: if (encoding != NULL) {
134: encoding += 8;
135: } else {
136: encoding = xmlStrstr(content, BAD_CAST"charset =");
137: if (encoding == NULL)
138: encoding = xmlStrstr(content, BAD_CAST"Charset =");
139: if (encoding == NULL)
140: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141: if (encoding != NULL)
142: encoding += 9;
143: }
144: if (encoding != NULL) {
145: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146: }
147: return(encoding);
148: }
149:
150: /**
151: * htmlSetMetaEncoding:
152: * @doc: the document
153: * @encoding: the encoding string
154: *
155: * Sets the current encoding in the Meta tags
156: * NOTE: this will not change the document content encoding, just
157: * the META flag associated.
158: *
159: * Returns 0 in case of success and -1 in case of error
160: */
161: int
162: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163: htmlNodePtr cur, meta = NULL, head = NULL;
164: const xmlChar *content = NULL;
165: char newcontent[100];
166:
167:
168: if (doc == NULL)
169: return(-1);
170:
171: /* html isn't a real encoding it's just libxml2 way to get entities */
172: if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
173: return(-1);
174:
175: if (encoding != NULL) {
176: snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
177: (char *)encoding);
178: newcontent[sizeof(newcontent) - 1] = 0;
179: }
180:
181: cur = doc->children;
182:
183: /*
184: * Search the html
185: */
186: while (cur != NULL) {
187: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
188: if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
189: break;
190: if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
191: goto found_head;
192: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
193: goto found_meta;
194: }
195: cur = cur->next;
196: }
197: if (cur == NULL)
198: return(-1);
199: cur = cur->children;
200:
201: /*
202: * Search the head
203: */
204: while (cur != NULL) {
205: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
206: if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
207: break;
208: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
209: head = cur->parent;
210: goto found_meta;
211: }
212: }
213: cur = cur->next;
214: }
215: if (cur == NULL)
216: return(-1);
217: found_head:
218: head = cur;
219: if (cur->children == NULL)
220: goto create;
221: cur = cur->children;
222:
223: found_meta:
224: /*
225: * Search and update all the remaining the meta elements carrying
226: * encoding informations
227: */
228: while (cur != NULL) {
229: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
230: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
231: xmlAttrPtr attr = cur->properties;
232: int http;
233: const xmlChar *value;
234:
235: content = NULL;
236: http = 0;
237: while (attr != NULL) {
238: if ((attr->children != NULL) &&
239: (attr->children->type == XML_TEXT_NODE) &&
240: (attr->children->next == NULL)) {
241: value = attr->children->content;
242: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
243: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
244: http = 1;
245: else
246: {
247: if ((value != NULL) &&
248: (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
249: content = value;
250: }
251: if ((http != 0) && (content != NULL))
252: break;
253: }
254: attr = attr->next;
255: }
256: if ((http != 0) && (content != NULL)) {
257: meta = cur;
258: break;
259: }
260:
261: }
262: }
263: cur = cur->next;
264: }
265: create:
266: if (meta == NULL) {
267: if ((encoding != NULL) && (head != NULL)) {
268: /*
269: * Create a new Meta element with the right attributes
270: */
271:
272: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
273: if (head->children == NULL)
274: xmlAddChild(head, meta);
275: else
276: xmlAddPrevSibling(head->children, meta);
277: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
278: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
279: }
280: } else {
281: /* change the document only if there is a real encoding change */
282: if (xmlStrcasestr(content, encoding) == NULL) {
283: xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
284: }
285: }
286:
287:
288: return(0);
289: }
290:
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
304:
305:
306: /**
307: * htmlIsBooleanAttr:
308: * @name: the name of the attribute to check
309: *
310: * Determine if a given attribute is a boolean attribute.
311: *
312: * returns: false if the attribute is not boolean, true otherwise.
313: */
314: int
315: htmlIsBooleanAttr(const xmlChar *name)
316: {
317: int i = 0;
318:
319: while (htmlBooleanAttrs[i] != NULL) {
320: if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
321: return 1;
322: i++;
323: }
324: return 0;
325: }
326:
327: #ifdef LIBXML_OUTPUT_ENABLED
328: /*
329: * private routine exported from xmlIO.c
330: */
331: xmlOutputBufferPtr
332: xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
333: /************************************************************************
334: * *
335: * Output error handlers *
336: * *
337: ************************************************************************/
338: /**
339: * htmlSaveErrMemory:
340: * @extra: extra informations
341: *
342: * Handle an out of memory condition
343: */
344: static void
345: htmlSaveErrMemory(const char *extra)
346: {
347: __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
348: }
349:
350: /**
351: * htmlSaveErr:
352: * @code: the error number
353: * @node: the location of the error.
354: * @extra: extra informations
355: *
356: * Handle an out of memory condition
357: */
358: static void
359: htmlSaveErr(int code, xmlNodePtr node, const char *extra)
360: {
361: const char *msg = NULL;
362:
363: switch(code) {
364: case XML_SAVE_NOT_UTF8:
365: msg = "string is not in UTF-8\n";
366: break;
367: case XML_SAVE_CHAR_INVALID:
368: msg = "invalid character value\n";
369: break;
370: case XML_SAVE_UNKNOWN_ENCODING:
371: msg = "unknown encoding %s\n";
372: break;
373: case XML_SAVE_NO_DOCTYPE:
374: msg = "HTML has no DOCTYPE\n";
375: break;
376: default:
377: msg = "unexpected error number\n";
378: }
379: __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
380: }
381:
382: /************************************************************************
383: * *
384: * Dumping HTML tree content to a simple buffer *
385: * *
386: ************************************************************************/
387:
388: static int
389: htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
390: int format);
391:
392: /**
393: * htmlNodeDumpFormat:
394: * @buf: the HTML buffer output
395: * @doc: the document
396: * @cur: the current node
397: * @format: should formatting spaces been added
398: *
399: * Dump an HTML node, recursive behaviour,children are printed too.
400: *
401: * Returns the number of byte written or -1 in case of error
402: */
403: static int
404: htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
405: int format) {
406: unsigned int use;
407: int ret;
408: xmlOutputBufferPtr outbuf;
409:
410: if (cur == NULL) {
411: return (-1);
412: }
413: if (buf == NULL) {
414: return (-1);
415: }
416: outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
417: if (outbuf == NULL) {
418: htmlSaveErrMemory("allocating HTML output buffer");
419: return (-1);
420: }
421: memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
422: outbuf->buffer = buf;
423: outbuf->encoder = NULL;
424: outbuf->writecallback = NULL;
425: outbuf->closecallback = NULL;
426: outbuf->context = NULL;
427: outbuf->written = 0;
428:
429: use = buf->use;
430: htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
431: xmlFree(outbuf);
432: ret = buf->use - use;
433: return (ret);
434: }
435:
436: /**
437: * htmlNodeDump:
438: * @buf: the HTML buffer output
439: * @doc: the document
440: * @cur: the current node
441: *
442: * Dump an HTML node, recursive behaviour,children are printed too,
443: * and formatting returns are added.
444: *
445: * Returns the number of byte written or -1 in case of error
446: */
447: int
448: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
449: xmlInitParser();
450:
451: return(htmlNodeDumpFormat(buf, doc, cur, 1));
452: }
453:
454: /**
455: * htmlNodeDumpFileFormat:
456: * @out: the FILE pointer
457: * @doc: the document
458: * @cur: the current node
459: * @encoding: the document encoding
460: * @format: should formatting spaces been added
461: *
462: * Dump an HTML node, recursive behaviour,children are printed too.
463: *
464: * TODO: if encoding == NULL try to save in the doc encoding
465: *
466: * returns: the number of byte written or -1 in case of failure.
467: */
468: int
469: htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
470: xmlNodePtr cur, const char *encoding, int format) {
471: xmlOutputBufferPtr buf;
472: xmlCharEncodingHandlerPtr handler = NULL;
473: int ret;
474:
475: xmlInitParser();
476:
477: if (encoding != NULL) {
478: xmlCharEncoding enc;
479:
480: enc = xmlParseCharEncoding(encoding);
481: if (enc != XML_CHAR_ENCODING_UTF8) {
482: handler = xmlFindCharEncodingHandler(encoding);
483: if (handler == NULL)
484: return(-1);
485: }
486: }
487:
488: /*
489: * Fallback to HTML or ASCII when the encoding is unspecified
490: */
491: if (handler == NULL)
492: handler = xmlFindCharEncodingHandler("HTML");
493: if (handler == NULL)
494: handler = xmlFindCharEncodingHandler("ascii");
495:
496: /*
497: * save the content to a temp buffer.
498: */
499: buf = xmlOutputBufferCreateFile(out, handler);
500: if (buf == NULL) return(0);
501:
502: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
503:
504: ret = xmlOutputBufferClose(buf);
505: return(ret);
506: }
507:
508: /**
509: * htmlNodeDumpFile:
510: * @out: the FILE pointer
511: * @doc: the document
512: * @cur: the current node
513: *
514: * Dump an HTML node, recursive behaviour,children are printed too,
515: * and formatting returns are added.
516: */
517: void
518: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
519: htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
520: }
521:
522: /**
523: * htmlDocDumpMemoryFormat:
524: * @cur: the document
525: * @mem: OUT: the memory pointer
526: * @size: OUT: the memory length
527: * @format: should formatting spaces been added
528: *
529: * Dump an HTML document in memory and return the xmlChar * and it's size.
530: * It's up to the caller to free the memory.
531: */
532: void
533: htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
534: xmlOutputBufferPtr buf;
535: xmlCharEncodingHandlerPtr handler = NULL;
536: const char *encoding;
537:
538: xmlInitParser();
539:
540: if ((mem == NULL) || (size == NULL))
541: return;
542: if (cur == NULL) {
543: *mem = NULL;
544: *size = 0;
545: return;
546: }
547:
548: encoding = (const char *) htmlGetMetaEncoding(cur);
549:
550: if (encoding != NULL) {
551: xmlCharEncoding enc;
552:
553: enc = xmlParseCharEncoding(encoding);
554: if (enc != cur->charset) {
555: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
556: /*
557: * Not supported yet
558: */
559: *mem = NULL;
560: *size = 0;
561: return;
562: }
563:
564: handler = xmlFindCharEncodingHandler(encoding);
565: if (handler == NULL) {
566: *mem = NULL;
567: *size = 0;
568: return;
569: }
570: } else {
571: handler = xmlFindCharEncodingHandler(encoding);
572: }
573: }
574:
575: /*
576: * Fallback to HTML or ASCII when the encoding is unspecified
577: */
578: if (handler == NULL)
579: handler = xmlFindCharEncodingHandler("HTML");
580: if (handler == NULL)
581: handler = xmlFindCharEncodingHandler("ascii");
582:
583: buf = xmlAllocOutputBufferInternal(handler);
584: if (buf == NULL) {
585: *mem = NULL;
586: *size = 0;
587: return;
588: }
589:
590: htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
591:
592: xmlOutputBufferFlush(buf);
593: if (buf->conv != NULL) {
594: *size = buf->conv->use;
595: *mem = xmlStrndup(buf->conv->content, *size);
596: } else {
597: *size = buf->buffer->use;
598: *mem = xmlStrndup(buf->buffer->content, *size);
599: }
600: (void)xmlOutputBufferClose(buf);
601: }
602:
603: /**
604: * htmlDocDumpMemory:
605: * @cur: the document
606: * @mem: OUT: the memory pointer
607: * @size: OUT: the memory length
608: *
609: * Dump an HTML document in memory and return the xmlChar * and it's size.
610: * It's up to the caller to free the memory.
611: */
612: void
613: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
614: htmlDocDumpMemoryFormat(cur, mem, size, 1);
615: }
616:
617:
618: /************************************************************************
619: * *
620: * Dumping HTML tree content to an I/O output buffer *
621: * *
622: ************************************************************************/
623:
624: void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
625:
626: /**
627: * htmlDtdDumpOutput:
628: * @buf: the HTML buffer output
629: * @doc: the document
630: * @encoding: the encoding string
631: *
632: * TODO: check whether encoding is needed
633: *
634: * Dump the HTML document DTD, if any.
635: */
636: static void
637: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
638: const char *encoding ATTRIBUTE_UNUSED) {
639: xmlDtdPtr cur = doc->intSubset;
640:
641: if (cur == NULL) {
642: htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
643: return;
644: }
645: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
646: xmlOutputBufferWriteString(buf, (const char *)cur->name);
647: if (cur->ExternalID != NULL) {
648: xmlOutputBufferWriteString(buf, " PUBLIC ");
649: xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
650: if (cur->SystemID != NULL) {
651: xmlOutputBufferWriteString(buf, " ");
652: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
653: }
654: } else if (cur->SystemID != NULL) {
655: xmlOutputBufferWriteString(buf, " SYSTEM ");
656: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
657: }
658: xmlOutputBufferWriteString(buf, ">\n");
659: }
660:
661: /**
662: * htmlAttrDumpOutput:
663: * @buf: the HTML buffer output
664: * @doc: the document
665: * @cur: the attribute pointer
666: * @encoding: the encoding string
667: *
668: * Dump an HTML attribute
669: */
670: static void
671: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
672: const char *encoding ATTRIBUTE_UNUSED) {
673: xmlChar *value;
674:
675: /*
676: * TODO: The html output method should not escape a & character
677: * occurring in an attribute value immediately followed by
678: * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
679: */
680:
681: if (cur == NULL) {
682: return;
683: }
684: xmlOutputBufferWriteString(buf, " ");
685: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
686: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
687: xmlOutputBufferWriteString(buf, ":");
688: }
689: xmlOutputBufferWriteString(buf, (const char *)cur->name);
690: if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
691: value = xmlNodeListGetString(doc, cur->children, 0);
692: if (value) {
693: xmlOutputBufferWriteString(buf, "=");
694: if ((cur->ns == NULL) && (cur->parent != NULL) &&
695: (cur->parent->ns == NULL) &&
696: ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
697: (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
698: (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
699: ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
700: (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
701: xmlChar *escaped;
702: xmlChar *tmp = value;
703:
704: while (IS_BLANK_CH(*tmp)) tmp++;
705:
706: escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
707: if (escaped != NULL) {
708: xmlBufferWriteQuotedString(buf->buffer, escaped);
709: xmlFree(escaped);
710: } else {
711: xmlBufferWriteQuotedString(buf->buffer, value);
712: }
713: } else {
714: xmlBufferWriteQuotedString(buf->buffer, value);
715: }
716: xmlFree(value);
717: } else {
718: xmlOutputBufferWriteString(buf, "=\"\"");
719: }
720: }
721: }
722:
723: /**
724: * htmlAttrListDumpOutput:
725: * @buf: the HTML buffer output
726: * @doc: the document
727: * @cur: the first attribute pointer
728: * @encoding: the encoding string
729: *
730: * Dump a list of HTML attributes
731: */
732: static void
733: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
734: if (cur == NULL) {
735: return;
736: }
737: while (cur != NULL) {
738: htmlAttrDumpOutput(buf, doc, cur, encoding);
739: cur = cur->next;
740: }
741: }
742:
743:
744:
745: /**
746: * htmlNodeListDumpOutput:
747: * @buf: the HTML buffer output
748: * @doc: the document
749: * @cur: the first node
750: * @encoding: the encoding string
751: * @format: should formatting spaces been added
752: *
753: * Dump an HTML node list, recursive behaviour,children are printed too.
754: */
755: static void
756: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
757: xmlNodePtr cur, const char *encoding, int format) {
758: if (cur == NULL) {
759: return;
760: }
761: while (cur != NULL) {
762: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
763: cur = cur->next;
764: }
765: }
766:
767: /**
768: * htmlNodeDumpFormatOutput:
769: * @buf: the HTML buffer output
770: * @doc: the document
771: * @cur: the current node
772: * @encoding: the encoding string
773: * @format: should formatting spaces been added
774: *
775: * Dump an HTML node, recursive behaviour,children are printed too.
776: */
777: void
778: htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
779: xmlNodePtr cur, const char *encoding, int format) {
780: const htmlElemDesc * info;
781:
782: xmlInitParser();
783:
784: if ((cur == NULL) || (buf == NULL)) {
785: return;
786: }
787: /*
788: * Special cases.
789: */
790: if (cur->type == XML_DTD_NODE)
791: return;
792: if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
793: (cur->type == XML_DOCUMENT_NODE)){
794: htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
795: return;
796: }
797: if (cur->type == XML_ATTRIBUTE_NODE) {
798: htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
799: return;
800: }
801: if (cur->type == HTML_TEXT_NODE) {
802: if (cur->content != NULL) {
803: if (((cur->name == (const xmlChar *)xmlStringText) ||
804: (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
805: ((cur->parent == NULL) ||
806: ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
807: (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
808: xmlChar *buffer;
809:
810: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
811: if (buffer != NULL) {
812: xmlOutputBufferWriteString(buf, (const char *)buffer);
813: xmlFree(buffer);
814: }
815: } else {
816: xmlOutputBufferWriteString(buf, (const char *)cur->content);
817: }
818: }
819: return;
820: }
821: if (cur->type == HTML_COMMENT_NODE) {
822: if (cur->content != NULL) {
823: xmlOutputBufferWriteString(buf, "<!--");
824: xmlOutputBufferWriteString(buf, (const char *)cur->content);
825: xmlOutputBufferWriteString(buf, "-->");
826: }
827: return;
828: }
829: if (cur->type == HTML_PI_NODE) {
830: if (cur->name == NULL)
831: return;
832: xmlOutputBufferWriteString(buf, "<?");
833: xmlOutputBufferWriteString(buf, (const char *)cur->name);
834: if (cur->content != NULL) {
835: xmlOutputBufferWriteString(buf, " ");
836: xmlOutputBufferWriteString(buf, (const char *)cur->content);
837: }
838: xmlOutputBufferWriteString(buf, ">");
839: return;
840: }
841: if (cur->type == HTML_ENTITY_REF_NODE) {
842: xmlOutputBufferWriteString(buf, "&");
843: xmlOutputBufferWriteString(buf, (const char *)cur->name);
844: xmlOutputBufferWriteString(buf, ";");
845: return;
846: }
847: if (cur->type == HTML_PRESERVE_NODE) {
848: if (cur->content != NULL) {
849: xmlOutputBufferWriteString(buf, (const char *)cur->content);
850: }
851: return;
852: }
853:
854: /*
855: * Get specific HTML info for that node.
856: */
857: if (cur->ns == NULL)
858: info = htmlTagLookup(cur->name);
859: else
860: info = NULL;
861:
862: xmlOutputBufferWriteString(buf, "<");
863: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
864: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
865: xmlOutputBufferWriteString(buf, ":");
866: }
867: xmlOutputBufferWriteString(buf, (const char *)cur->name);
868: if (cur->nsDef)
869: xmlNsListDumpOutput(buf, cur->nsDef);
870: if (cur->properties != NULL)
871: htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
872:
873: if ((info != NULL) && (info->empty)) {
874: xmlOutputBufferWriteString(buf, ">");
875: if ((format) && (!info->isinline) && (cur->next != NULL)) {
876: if ((cur->next->type != HTML_TEXT_NODE) &&
877: (cur->next->type != HTML_ENTITY_REF_NODE) &&
878: (cur->parent != NULL) &&
879: (cur->parent->name != NULL) &&
880: (cur->parent->name[0] != 'p')) /* p, pre, param */
881: xmlOutputBufferWriteString(buf, "\n");
882: }
883: return;
884: }
885: if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
886: (cur->children == NULL)) {
887: if ((info != NULL) && (info->saveEndTag != 0) &&
888: (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
889: (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
890: xmlOutputBufferWriteString(buf, ">");
891: } else {
892: xmlOutputBufferWriteString(buf, "></");
893: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
894: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
895: xmlOutputBufferWriteString(buf, ":");
896: }
897: xmlOutputBufferWriteString(buf, (const char *)cur->name);
898: xmlOutputBufferWriteString(buf, ">");
899: }
900: if ((format) && (cur->next != NULL) &&
901: (info != NULL) && (!info->isinline)) {
902: if ((cur->next->type != HTML_TEXT_NODE) &&
903: (cur->next->type != HTML_ENTITY_REF_NODE) &&
904: (cur->parent != NULL) &&
905: (cur->parent->name != NULL) &&
906: (cur->parent->name[0] != 'p')) /* p, pre, param */
907: xmlOutputBufferWriteString(buf, "\n");
908: }
909: return;
910: }
911: xmlOutputBufferWriteString(buf, ">");
912: if ((cur->type != XML_ELEMENT_NODE) &&
913: (cur->content != NULL)) {
914: /*
915: * Uses the OutputBuffer property to automatically convert
916: * invalids to charrefs
917: */
918:
919: xmlOutputBufferWriteString(buf, (const char *) cur->content);
920: }
921: if (cur->children != NULL) {
922: if ((format) && (info != NULL) && (!info->isinline) &&
923: (cur->children->type != HTML_TEXT_NODE) &&
924: (cur->children->type != HTML_ENTITY_REF_NODE) &&
925: (cur->children != cur->last) &&
926: (cur->name != NULL) &&
927: (cur->name[0] != 'p')) /* p, pre, param */
928: xmlOutputBufferWriteString(buf, "\n");
929: htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
930: if ((format) && (info != NULL) && (!info->isinline) &&
931: (cur->last->type != HTML_TEXT_NODE) &&
932: (cur->last->type != HTML_ENTITY_REF_NODE) &&
933: (cur->children != cur->last) &&
934: (cur->name != NULL) &&
935: (cur->name[0] != 'p')) /* p, pre, param */
936: xmlOutputBufferWriteString(buf, "\n");
937: }
938: xmlOutputBufferWriteString(buf, "</");
939: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
940: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
941: xmlOutputBufferWriteString(buf, ":");
942: }
943: xmlOutputBufferWriteString(buf, (const char *)cur->name);
944: xmlOutputBufferWriteString(buf, ">");
945: if ((format) && (info != NULL) && (!info->isinline) &&
946: (cur->next != NULL)) {
947: if ((cur->next->type != HTML_TEXT_NODE) &&
948: (cur->next->type != HTML_ENTITY_REF_NODE) &&
949: (cur->parent != NULL) &&
950: (cur->parent->name != NULL) &&
951: (cur->parent->name[0] != 'p')) /* p, pre, param */
952: xmlOutputBufferWriteString(buf, "\n");
953: }
954: }
955:
956: /**
957: * htmlNodeDumpOutput:
958: * @buf: the HTML buffer output
959: * @doc: the document
960: * @cur: the current node
961: * @encoding: the encoding string
962: *
963: * Dump an HTML node, recursive behaviour,children are printed too,
964: * and formatting returns/spaces are added.
965: */
966: void
967: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
968: xmlNodePtr cur, const char *encoding) {
969: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
970: }
971:
972: /**
973: * htmlDocContentDumpFormatOutput:
974: * @buf: the HTML buffer output
975: * @cur: the document
976: * @encoding: the encoding string
977: * @format: should formatting spaces been added
978: *
979: * Dump an HTML document.
980: */
981: void
982: htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
983: const char *encoding, int format) {
984: int type;
985:
986: xmlInitParser();
987:
988: if ((buf == NULL) || (cur == NULL))
989: return;
990:
991: /*
992: * force to output the stuff as HTML, especially for entities
993: */
994: type = cur->type;
995: cur->type = XML_HTML_DOCUMENT_NODE;
996: if (cur->intSubset != NULL) {
997: htmlDtdDumpOutput(buf, cur, NULL);
998: }
999: if (cur->children != NULL) {
1000: htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
1001: }
1002: xmlOutputBufferWriteString(buf, "\n");
1003: cur->type = (xmlElementType) type;
1004: }
1005:
1006: /**
1007: * htmlDocContentDumpOutput:
1008: * @buf: the HTML buffer output
1009: * @cur: the document
1010: * @encoding: the encoding string
1011: *
1012: * Dump an HTML document. Formating return/spaces are added.
1013: */
1014: void
1015: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1016: const char *encoding) {
1017: htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1018: }
1019:
1020: /************************************************************************
1021: * *
1022: * Saving functions front-ends *
1023: * *
1024: ************************************************************************/
1025:
1026: /**
1027: * htmlDocDump:
1028: * @f: the FILE*
1029: * @cur: the document
1030: *
1031: * Dump an HTML document to an open FILE.
1032: *
1033: * returns: the number of byte written or -1 in case of failure.
1034: */
1035: int
1036: htmlDocDump(FILE *f, xmlDocPtr cur) {
1037: xmlOutputBufferPtr buf;
1038: xmlCharEncodingHandlerPtr handler = NULL;
1039: const char *encoding;
1040: int ret;
1041:
1042: xmlInitParser();
1043:
1044: if ((cur == NULL) || (f == NULL)) {
1045: return(-1);
1046: }
1047:
1048: encoding = (const char *) htmlGetMetaEncoding(cur);
1049:
1050: if (encoding != NULL) {
1051: xmlCharEncoding enc;
1052:
1053: enc = xmlParseCharEncoding(encoding);
1054: if (enc != cur->charset) {
1055: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1056: /*
1057: * Not supported yet
1058: */
1059: return(-1);
1060: }
1061:
1062: handler = xmlFindCharEncodingHandler(encoding);
1063: if (handler == NULL)
1064: return(-1);
1065: } else {
1066: handler = xmlFindCharEncodingHandler(encoding);
1067: }
1068: }
1069:
1070: /*
1071: * Fallback to HTML or ASCII when the encoding is unspecified
1072: */
1073: if (handler == NULL)
1074: handler = xmlFindCharEncodingHandler("HTML");
1075: if (handler == NULL)
1076: handler = xmlFindCharEncodingHandler("ascii");
1077:
1078: buf = xmlOutputBufferCreateFile(f, handler);
1079: if (buf == NULL) return(-1);
1080: htmlDocContentDumpOutput(buf, cur, NULL);
1081:
1082: ret = xmlOutputBufferClose(buf);
1083: return(ret);
1084: }
1085:
1086: /**
1087: * htmlSaveFile:
1088: * @filename: the filename (or URL)
1089: * @cur: the document
1090: *
1091: * Dump an HTML document to a file. If @filename is "-" the stdout file is
1092: * used.
1093: * returns: the number of byte written or -1 in case of failure.
1094: */
1095: int
1096: htmlSaveFile(const char *filename, xmlDocPtr cur) {
1097: xmlOutputBufferPtr buf;
1098: xmlCharEncodingHandlerPtr handler = NULL;
1099: const char *encoding;
1100: int ret;
1101:
1102: if ((cur == NULL) || (filename == NULL))
1103: return(-1);
1104:
1105: xmlInitParser();
1106:
1107: encoding = (const char *) htmlGetMetaEncoding(cur);
1108:
1109: if (encoding != NULL) {
1110: xmlCharEncoding enc;
1111:
1112: enc = xmlParseCharEncoding(encoding);
1113: if (enc != cur->charset) {
1114: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1115: /*
1116: * Not supported yet
1117: */
1118: return(-1);
1119: }
1120:
1121: handler = xmlFindCharEncodingHandler(encoding);
1122: if (handler == NULL)
1123: return(-1);
1124: }
1125: }
1126:
1127: /*
1128: * Fallback to HTML or ASCII when the encoding is unspecified
1129: */
1130: if (handler == NULL)
1131: handler = xmlFindCharEncodingHandler("HTML");
1132: if (handler == NULL)
1133: handler = xmlFindCharEncodingHandler("ascii");
1134:
1135: /*
1136: * save the content to a temp buffer.
1137: */
1138: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1139: if (buf == NULL) return(0);
1140:
1141: htmlDocContentDumpOutput(buf, cur, NULL);
1142:
1143: ret = xmlOutputBufferClose(buf);
1144: return(ret);
1145: }
1146:
1147: /**
1148: * htmlSaveFileFormat:
1149: * @filename: the filename
1150: * @cur: the document
1151: * @format: should formatting spaces been added
1152: * @encoding: the document encoding
1153: *
1154: * Dump an HTML document to a file using a given encoding.
1155: *
1156: * returns: the number of byte written or -1 in case of failure.
1157: */
1158: int
1159: htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1160: const char *encoding, int format) {
1161: xmlOutputBufferPtr buf;
1162: xmlCharEncodingHandlerPtr handler = NULL;
1163: int ret;
1164:
1165: if ((cur == NULL) || (filename == NULL))
1166: return(-1);
1167:
1168: xmlInitParser();
1169:
1170: if (encoding != NULL) {
1171: xmlCharEncoding enc;
1172:
1173: enc = xmlParseCharEncoding(encoding);
1174: if (enc != cur->charset) {
1175: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1176: /*
1177: * Not supported yet
1178: */
1179: return(-1);
1180: }
1181:
1182: handler = xmlFindCharEncodingHandler(encoding);
1183: if (handler == NULL)
1184: return(-1);
1185: }
1186: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1187: } else {
1188: htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1189: }
1190:
1191: /*
1192: * Fallback to HTML or ASCII when the encoding is unspecified
1193: */
1194: if (handler == NULL)
1195: handler = xmlFindCharEncodingHandler("HTML");
1196: if (handler == NULL)
1197: handler = xmlFindCharEncodingHandler("ascii");
1198:
1199: /*
1200: * save the content to a temp buffer.
1201: */
1202: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1203: if (buf == NULL) return(0);
1204:
1205: htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1206:
1207: ret = xmlOutputBufferClose(buf);
1208: return(ret);
1209: }
1210:
1211: /**
1212: * htmlSaveFileEnc:
1213: * @filename: the filename
1214: * @cur: the document
1215: * @encoding: the document encoding
1216: *
1217: * Dump an HTML document to a file using a given encoding
1218: * and formatting returns/spaces are added.
1219: *
1220: * returns: the number of byte written or -1 in case of failure.
1221: */
1222: int
1223: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1224: return(htmlSaveFileFormat(filename, cur, encoding, 1));
1225: }
1226:
1227: #endif /* LIBXML_OUTPUT_ENABLED */
1228:
1229: #define bottom_HTMLtree
1230: #include "elfgcchack.h"
1231: #endif /* LIBXML_HTML_ENABLED */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>