Annotation of embedaddon/libxml2/HTMLtree.c, revision 1.1.1.2
1.1 misho 1: /*
2: * HTMLtree.c : implementation of access function for an HTML tree.
3: *
4: * See Copyright for the status of this software.
5: *
6: * daniel@veillard.com
7: */
8:
9:
10: #define IN_LIBXML
11: #include "libxml.h"
12: #ifdef LIBXML_HTML_ENABLED
13:
14: #include <string.h> /* for memset() only ! */
15:
16: #ifdef HAVE_CTYPE_H
17: #include <ctype.h>
18: #endif
19: #ifdef HAVE_STDLIB_H
20: #include <stdlib.h>
21: #endif
22:
23: #include <libxml/xmlmemory.h>
24: #include <libxml/HTMLparser.h>
25: #include <libxml/HTMLtree.h>
26: #include <libxml/entities.h>
27: #include <libxml/valid.h>
28: #include <libxml/xmlerror.h>
29: #include <libxml/parserInternals.h>
30: #include <libxml/globals.h>
31: #include <libxml/uri.h>
32:
33: /************************************************************************
34: * *
35: * Getting/Setting encoding meta tags *
36: * *
37: ************************************************************************/
38:
39: /**
40: * htmlGetMetaEncoding:
41: * @doc: the document
42: *
43: * Encoding definition lookup in the Meta tags
44: *
45: * Returns the current encoding as flagged in the HTML source
46: */
47: const xmlChar *
48: htmlGetMetaEncoding(htmlDocPtr doc) {
49: htmlNodePtr cur;
50: const xmlChar *content;
51: const xmlChar *encoding;
52:
53: if (doc == NULL)
54: return(NULL);
55: cur = doc->children;
56:
57: /*
58: * Search the html
59: */
60: while (cur != NULL) {
61: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62: if (xmlStrEqual(cur->name, BAD_CAST"html"))
63: break;
64: if (xmlStrEqual(cur->name, BAD_CAST"head"))
65: goto found_head;
66: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67: goto found_meta;
68: }
69: cur = cur->next;
70: }
71: if (cur == NULL)
72: return(NULL);
73: cur = cur->children;
74:
75: /*
76: * Search the head
77: */
78: while (cur != NULL) {
79: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80: if (xmlStrEqual(cur->name, BAD_CAST"head"))
81: break;
82: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83: goto found_meta;
84: }
85: cur = cur->next;
86: }
87: if (cur == NULL)
88: return(NULL);
89: found_head:
90: cur = cur->children;
91:
92: /*
93: * Search the meta elements
94: */
95: found_meta:
96: while (cur != NULL) {
97: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99: xmlAttrPtr attr = cur->properties;
100: int http;
101: const xmlChar *value;
102:
103: content = NULL;
104: http = 0;
105: while (attr != NULL) {
106: if ((attr->children != NULL) &&
107: (attr->children->type == XML_TEXT_NODE) &&
108: (attr->children->next == NULL)) {
109: value = attr->children->content;
110: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112: http = 1;
113: else if ((value != NULL)
114: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115: content = value;
116: if ((http != 0) && (content != NULL))
117: goto found_content;
118: }
119: attr = attr->next;
120: }
121: }
122: }
123: cur = cur->next;
124: }
125: return(NULL);
126:
127: found_content:
128: encoding = xmlStrstr(content, BAD_CAST"charset=");
129: if (encoding == NULL)
130: encoding = xmlStrstr(content, BAD_CAST"Charset=");
131: if (encoding == NULL)
132: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133: if (encoding != NULL) {
134: encoding += 8;
135: } else {
136: encoding = xmlStrstr(content, BAD_CAST"charset =");
137: if (encoding == NULL)
138: encoding = xmlStrstr(content, BAD_CAST"Charset =");
139: if (encoding == NULL)
140: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141: if (encoding != NULL)
142: encoding += 9;
143: }
144: if (encoding != NULL) {
145: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146: }
147: return(encoding);
148: }
149:
150: /**
151: * htmlSetMetaEncoding:
152: * @doc: the document
153: * @encoding: the encoding string
1.1.1.2 ! misho 154: *
1.1 misho 155: * Sets the current encoding in the Meta tags
156: * NOTE: this will not change the document content encoding, just
157: * the META flag associated.
158: *
159: * Returns 0 in case of success and -1 in case of error
160: */
161: int
162: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163: htmlNodePtr cur, meta = NULL, head = NULL;
164: const xmlChar *content = NULL;
165: char newcontent[100];
166:
1.1.1.2 ! misho 167: newcontent[0] = 0;
1.1 misho 168:
169: if (doc == NULL)
170: return(-1);
171:
172: /* html isn't a real encoding it's just libxml2 way to get entities */
173: if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
174: return(-1);
175:
176: if (encoding != NULL) {
177: snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
178: (char *)encoding);
179: newcontent[sizeof(newcontent) - 1] = 0;
180: }
181:
182: cur = doc->children;
183:
184: /*
185: * Search the html
186: */
187: while (cur != NULL) {
188: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
189: if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
190: break;
191: if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
192: goto found_head;
193: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
194: goto found_meta;
195: }
196: cur = cur->next;
197: }
198: if (cur == NULL)
199: return(-1);
200: cur = cur->children;
201:
202: /*
203: * Search the head
204: */
205: while (cur != NULL) {
206: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
207: if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
208: break;
209: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
210: head = cur->parent;
211: goto found_meta;
212: }
213: }
214: cur = cur->next;
215: }
216: if (cur == NULL)
217: return(-1);
218: found_head:
219: head = cur;
220: if (cur->children == NULL)
221: goto create;
222: cur = cur->children;
223:
224: found_meta:
225: /*
226: * Search and update all the remaining the meta elements carrying
227: * encoding informations
228: */
229: while (cur != NULL) {
230: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
231: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
232: xmlAttrPtr attr = cur->properties;
233: int http;
234: const xmlChar *value;
235:
236: content = NULL;
237: http = 0;
238: while (attr != NULL) {
239: if ((attr->children != NULL) &&
240: (attr->children->type == XML_TEXT_NODE) &&
241: (attr->children->next == NULL)) {
242: value = attr->children->content;
243: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
244: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
245: http = 1;
246: else
247: {
1.1.1.2 ! misho 248: if ((value != NULL) &&
1.1 misho 249: (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
250: content = value;
251: }
252: if ((http != 0) && (content != NULL))
253: break;
254: }
255: attr = attr->next;
256: }
257: if ((http != 0) && (content != NULL)) {
258: meta = cur;
259: break;
260: }
261:
262: }
263: }
264: cur = cur->next;
265: }
266: create:
267: if (meta == NULL) {
268: if ((encoding != NULL) && (head != NULL)) {
269: /*
270: * Create a new Meta element with the right attributes
271: */
272:
273: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
274: if (head->children == NULL)
275: xmlAddChild(head, meta);
276: else
277: xmlAddPrevSibling(head->children, meta);
278: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
279: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
280: }
281: } else {
1.1.1.2 ! misho 282: /* remove the meta tag if NULL is passed */
! 283: if (encoding == NULL) {
! 284: xmlUnlinkNode(meta);
! 285: xmlFreeNode(meta);
! 286: }
1.1 misho 287: /* change the document only if there is a real encoding change */
1.1.1.2 ! misho 288: else if (xmlStrcasestr(content, encoding) == NULL) {
1.1 misho 289: xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
290: }
291: }
292:
293:
294: return(0);
295: }
296:
297: /**
298: * booleanHTMLAttrs:
299: *
300: * These are the HTML attributes which will be output
301: * in minimized form, i.e. <option selected="selected"> will be
302: * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
303: *
304: */
305: static const char* htmlBooleanAttrs[] = {
306: "checked", "compact", "declare", "defer", "disabled", "ismap",
307: "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
308: "selected", NULL
309: };
310:
311:
312: /**
313: * htmlIsBooleanAttr:
314: * @name: the name of the attribute to check
315: *
316: * Determine if a given attribute is a boolean attribute.
317: *
318: * returns: false if the attribute is not boolean, true otherwise.
319: */
320: int
321: htmlIsBooleanAttr(const xmlChar *name)
322: {
323: int i = 0;
324:
325: while (htmlBooleanAttrs[i] != NULL) {
326: if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
327: return 1;
328: i++;
329: }
330: return 0;
331: }
332:
333: #ifdef LIBXML_OUTPUT_ENABLED
334: /*
335: * private routine exported from xmlIO.c
336: */
337: xmlOutputBufferPtr
338: xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
339: /************************************************************************
340: * *
341: * Output error handlers *
342: * *
343: ************************************************************************/
344: /**
345: * htmlSaveErrMemory:
346: * @extra: extra informations
347: *
348: * Handle an out of memory condition
349: */
350: static void
351: htmlSaveErrMemory(const char *extra)
352: {
353: __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
354: }
355:
356: /**
357: * htmlSaveErr:
358: * @code: the error number
359: * @node: the location of the error.
360: * @extra: extra informations
361: *
362: * Handle an out of memory condition
363: */
364: static void
365: htmlSaveErr(int code, xmlNodePtr node, const char *extra)
366: {
367: const char *msg = NULL;
368:
369: switch(code) {
370: case XML_SAVE_NOT_UTF8:
371: msg = "string is not in UTF-8\n";
372: break;
373: case XML_SAVE_CHAR_INVALID:
374: msg = "invalid character value\n";
375: break;
376: case XML_SAVE_UNKNOWN_ENCODING:
377: msg = "unknown encoding %s\n";
378: break;
379: case XML_SAVE_NO_DOCTYPE:
380: msg = "HTML has no DOCTYPE\n";
381: break;
382: default:
383: msg = "unexpected error number\n";
384: }
385: __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
386: }
387:
388: /************************************************************************
389: * *
390: * Dumping HTML tree content to a simple buffer *
391: * *
392: ************************************************************************/
393:
394: static int
395: htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
396: int format);
397:
398: /**
399: * htmlNodeDumpFormat:
400: * @buf: the HTML buffer output
401: * @doc: the document
402: * @cur: the current node
403: * @format: should formatting spaces been added
404: *
405: * Dump an HTML node, recursive behaviour,children are printed too.
406: *
407: * Returns the number of byte written or -1 in case of error
408: */
409: static int
410: htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
411: int format) {
412: unsigned int use;
413: int ret;
414: xmlOutputBufferPtr outbuf;
415:
416: if (cur == NULL) {
417: return (-1);
418: }
419: if (buf == NULL) {
420: return (-1);
421: }
422: outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
423: if (outbuf == NULL) {
424: htmlSaveErrMemory("allocating HTML output buffer");
425: return (-1);
426: }
427: memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
428: outbuf->buffer = buf;
429: outbuf->encoder = NULL;
430: outbuf->writecallback = NULL;
431: outbuf->closecallback = NULL;
432: outbuf->context = NULL;
433: outbuf->written = 0;
434:
435: use = buf->use;
436: htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
437: xmlFree(outbuf);
438: ret = buf->use - use;
439: return (ret);
440: }
441:
442: /**
443: * htmlNodeDump:
444: * @buf: the HTML buffer output
445: * @doc: the document
446: * @cur: the current node
447: *
448: * Dump an HTML node, recursive behaviour,children are printed too,
449: * and formatting returns are added.
450: *
451: * Returns the number of byte written or -1 in case of error
452: */
453: int
454: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
455: xmlInitParser();
456:
457: return(htmlNodeDumpFormat(buf, doc, cur, 1));
458: }
459:
460: /**
461: * htmlNodeDumpFileFormat:
462: * @out: the FILE pointer
463: * @doc: the document
464: * @cur: the current node
465: * @encoding: the document encoding
466: * @format: should formatting spaces been added
467: *
468: * Dump an HTML node, recursive behaviour,children are printed too.
469: *
470: * TODO: if encoding == NULL try to save in the doc encoding
471: *
472: * returns: the number of byte written or -1 in case of failure.
473: */
474: int
475: htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
476: xmlNodePtr cur, const char *encoding, int format) {
477: xmlOutputBufferPtr buf;
478: xmlCharEncodingHandlerPtr handler = NULL;
479: int ret;
480:
481: xmlInitParser();
482:
483: if (encoding != NULL) {
484: xmlCharEncoding enc;
485:
486: enc = xmlParseCharEncoding(encoding);
487: if (enc != XML_CHAR_ENCODING_UTF8) {
488: handler = xmlFindCharEncodingHandler(encoding);
489: if (handler == NULL)
1.1.1.2 ! misho 490: htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1.1 misho 491: }
492: }
493:
494: /*
495: * Fallback to HTML or ASCII when the encoding is unspecified
496: */
497: if (handler == NULL)
498: handler = xmlFindCharEncodingHandler("HTML");
499: if (handler == NULL)
500: handler = xmlFindCharEncodingHandler("ascii");
501:
502: /*
503: * save the content to a temp buffer.
504: */
505: buf = xmlOutputBufferCreateFile(out, handler);
506: if (buf == NULL) return(0);
507:
508: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
509:
510: ret = xmlOutputBufferClose(buf);
511: return(ret);
512: }
513:
514: /**
515: * htmlNodeDumpFile:
516: * @out: the FILE pointer
517: * @doc: the document
518: * @cur: the current node
519: *
520: * Dump an HTML node, recursive behaviour,children are printed too,
521: * and formatting returns are added.
522: */
523: void
524: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
525: htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
526: }
527:
528: /**
529: * htmlDocDumpMemoryFormat:
530: * @cur: the document
531: * @mem: OUT: the memory pointer
532: * @size: OUT: the memory length
533: * @format: should formatting spaces been added
534: *
535: * Dump an HTML document in memory and return the xmlChar * and it's size.
536: * It's up to the caller to free the memory.
537: */
538: void
539: htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
540: xmlOutputBufferPtr buf;
541: xmlCharEncodingHandlerPtr handler = NULL;
542: const char *encoding;
543:
544: xmlInitParser();
545:
546: if ((mem == NULL) || (size == NULL))
547: return;
548: if (cur == NULL) {
549: *mem = NULL;
550: *size = 0;
551: return;
552: }
553:
554: encoding = (const char *) htmlGetMetaEncoding(cur);
555:
556: if (encoding != NULL) {
557: xmlCharEncoding enc;
558:
559: enc = xmlParseCharEncoding(encoding);
560: if (enc != cur->charset) {
561: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
562: /*
563: * Not supported yet
564: */
565: *mem = NULL;
566: *size = 0;
567: return;
568: }
569:
570: handler = xmlFindCharEncodingHandler(encoding);
1.1.1.2 ! misho 571: if (handler == NULL)
! 572: htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
! 573:
1.1 misho 574: } else {
575: handler = xmlFindCharEncodingHandler(encoding);
576: }
577: }
578:
579: /*
580: * Fallback to HTML or ASCII when the encoding is unspecified
581: */
582: if (handler == NULL)
583: handler = xmlFindCharEncodingHandler("HTML");
584: if (handler == NULL)
585: handler = xmlFindCharEncodingHandler("ascii");
586:
587: buf = xmlAllocOutputBufferInternal(handler);
588: if (buf == NULL) {
589: *mem = NULL;
590: *size = 0;
591: return;
592: }
593:
1.1.1.2 ! misho 594: htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
1.1 misho 595:
596: xmlOutputBufferFlush(buf);
597: if (buf->conv != NULL) {
598: *size = buf->conv->use;
599: *mem = xmlStrndup(buf->conv->content, *size);
600: } else {
601: *size = buf->buffer->use;
602: *mem = xmlStrndup(buf->buffer->content, *size);
603: }
604: (void)xmlOutputBufferClose(buf);
605: }
606:
607: /**
608: * htmlDocDumpMemory:
609: * @cur: the document
610: * @mem: OUT: the memory pointer
611: * @size: OUT: the memory length
612: *
613: * Dump an HTML document in memory and return the xmlChar * and it's size.
614: * It's up to the caller to free the memory.
615: */
616: void
617: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
618: htmlDocDumpMemoryFormat(cur, mem, size, 1);
619: }
620:
621:
622: /************************************************************************
623: * *
624: * Dumping HTML tree content to an I/O output buffer *
625: * *
626: ************************************************************************/
627:
628: void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
629:
630: /**
631: * htmlDtdDumpOutput:
632: * @buf: the HTML buffer output
633: * @doc: the document
634: * @encoding: the encoding string
635: *
636: * TODO: check whether encoding is needed
637: *
638: * Dump the HTML document DTD, if any.
639: */
640: static void
641: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
642: const char *encoding ATTRIBUTE_UNUSED) {
643: xmlDtdPtr cur = doc->intSubset;
644:
645: if (cur == NULL) {
646: htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
647: return;
648: }
649: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
650: xmlOutputBufferWriteString(buf, (const char *)cur->name);
651: if (cur->ExternalID != NULL) {
652: xmlOutputBufferWriteString(buf, " PUBLIC ");
653: xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
654: if (cur->SystemID != NULL) {
655: xmlOutputBufferWriteString(buf, " ");
656: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
657: }
658: } else if (cur->SystemID != NULL) {
659: xmlOutputBufferWriteString(buf, " SYSTEM ");
660: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
661: }
662: xmlOutputBufferWriteString(buf, ">\n");
663: }
664:
665: /**
666: * htmlAttrDumpOutput:
667: * @buf: the HTML buffer output
668: * @doc: the document
669: * @cur: the attribute pointer
670: * @encoding: the encoding string
671: *
672: * Dump an HTML attribute
673: */
674: static void
675: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
676: const char *encoding ATTRIBUTE_UNUSED) {
677: xmlChar *value;
678:
679: /*
680: * TODO: The html output method should not escape a & character
681: * occurring in an attribute value immediately followed by
682: * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
683: */
684:
685: if (cur == NULL) {
686: return;
687: }
688: xmlOutputBufferWriteString(buf, " ");
689: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
690: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
691: xmlOutputBufferWriteString(buf, ":");
692: }
693: xmlOutputBufferWriteString(buf, (const char *)cur->name);
694: if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
695: value = xmlNodeListGetString(doc, cur->children, 0);
696: if (value) {
697: xmlOutputBufferWriteString(buf, "=");
698: if ((cur->ns == NULL) && (cur->parent != NULL) &&
699: (cur->parent->ns == NULL) &&
700: ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
701: (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
702: (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
703: ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
704: (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
705: xmlChar *escaped;
706: xmlChar *tmp = value;
707:
708: while (IS_BLANK_CH(*tmp)) tmp++;
709:
710: escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
711: if (escaped != NULL) {
712: xmlBufferWriteQuotedString(buf->buffer, escaped);
713: xmlFree(escaped);
714: } else {
715: xmlBufferWriteQuotedString(buf->buffer, value);
716: }
717: } else {
718: xmlBufferWriteQuotedString(buf->buffer, value);
719: }
720: xmlFree(value);
721: } else {
722: xmlOutputBufferWriteString(buf, "=\"\"");
723: }
724: }
725: }
726:
727: /**
728: * htmlAttrListDumpOutput:
729: * @buf: the HTML buffer output
730: * @doc: the document
731: * @cur: the first attribute pointer
732: * @encoding: the encoding string
733: *
734: * Dump a list of HTML attributes
735: */
736: static void
737: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
738: if (cur == NULL) {
739: return;
740: }
741: while (cur != NULL) {
742: htmlAttrDumpOutput(buf, doc, cur, encoding);
743: cur = cur->next;
744: }
745: }
746:
747:
748:
749: /**
750: * htmlNodeListDumpOutput:
751: * @buf: the HTML buffer output
752: * @doc: the document
753: * @cur: the first node
754: * @encoding: the encoding string
755: * @format: should formatting spaces been added
756: *
757: * Dump an HTML node list, recursive behaviour,children are printed too.
758: */
759: static void
760: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
761: xmlNodePtr cur, const char *encoding, int format) {
762: if (cur == NULL) {
763: return;
764: }
765: while (cur != NULL) {
766: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
767: cur = cur->next;
768: }
769: }
770:
/**
 * htmlNodeDumpFormatOutput:
 * @buf:  the HTML buffer output
 * @doc:  the document
 * @cur:  the current node
 * @encoding:  the encoding string
 * @format:  should formatting spaces be added
 *
 * Dump an HTML node, recursive behaviour, children are printed too.
 *
 * Non-element node kinds (DTD, document, attribute, text, comment, PI,
 * entity reference, preserved content) are handled first and return
 * early; everything else is written as an element with its namespace
 * declarations, attributes, children and, where applicable, end tag.
 * Nothing is done if @cur or @buf is NULL.
 */
void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
                         xmlNodePtr cur, const char *encoding, int format) {
    const htmlElemDesc * info;

    xmlInitParser();

    if ((cur == NULL) || (buf == NULL)) {
        return;
    }
    /*
     * Special cases.
     */
    /* DTD nodes produce no output here */
    if (cur->type == XML_DTD_NODE)
        return;
    /* a document node is dumped as a whole document */
    if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
        (cur->type == XML_DOCUMENT_NODE)){
        htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
        return;
    }
    if (cur->type == XML_ATTRIBUTE_NODE) {
        htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
        return;
    }
    if (cur->type == HTML_TEXT_NODE) {
        if (cur->content != NULL) {
            /*
             * Entity-escape the text unless the node is named
             * xmlStringTextNoenc or its parent is a script or style
             * element; in those cases the content is written verbatim.
             */
            if (((cur->name == (const xmlChar *)xmlStringText) ||
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
                ((cur->parent == NULL) ||
                 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
                  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
                xmlChar *buffer;

                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
                if (buffer != NULL) {
                    xmlOutputBufferWriteString(buf, (const char *)buffer);
                    xmlFree(buffer);
                }
            } else {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
        }
        return;
    }
    if (cur->type == HTML_COMMENT_NODE) {
        /* a comment without content produces no output at all */
        if (cur->content != NULL) {
            xmlOutputBufferWriteString(buf, "<!--");
            xmlOutputBufferWriteString(buf, (const char *)cur->content);
            xmlOutputBufferWriteString(buf, "-->");
        }
        return;
    }
    if (cur->type == HTML_PI_NODE) {
        if (cur->name == NULL)
            return;
        /* processing instructions are closed with ">" rather than "?>" */
        xmlOutputBufferWriteString(buf, "<?");
        xmlOutputBufferWriteString(buf, (const char *)cur->name);
        if (cur->content != NULL) {
            xmlOutputBufferWriteString(buf, " ");
            xmlOutputBufferWriteString(buf, (const char *)cur->content);
        }
        xmlOutputBufferWriteString(buf, ">");
        return;
    }
    if (cur->type == HTML_ENTITY_REF_NODE) {
        xmlOutputBufferWriteString(buf, "&");
        xmlOutputBufferWriteString(buf, (const char *)cur->name);
        xmlOutputBufferWriteString(buf, ";");
        return;
    }
    if (cur->type == HTML_PRESERVE_NODE) {
        /* preserved content is written without any escaping */
        if (cur->content != NULL) {
            xmlOutputBufferWriteString(buf, (const char *)cur->content);
        }
        return;
    }

    /*
     * Get specific HTML info for that node.
     * Namespaced elements are never looked up in the HTML tag table.
     */
    if (cur->ns == NULL)
        info = htmlTagLookup(cur->name);
    else
        info = NULL;

    /* start tag: "<", optional prefix, name, namespace decls, attributes */
    xmlOutputBufferWriteString(buf, "<");
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
        xmlOutputBufferWriteString(buf, ":");
    }
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
    if (cur->nsDef)
        xmlNsListDumpOutput(buf, cur->nsDef);
    if (cur->properties != NULL)
        htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);

    /* elements flagged empty in the tag table: no content, no end tag */
    if ((info != NULL) && (info->empty)) {
        xmlOutputBufferWriteString(buf, ">");
        if ((format) && (!info->isinline) && (cur->next != NULL)) {
            if ((cur->next->type != HTML_TEXT_NODE) &&
                (cur->next->type != HTML_ENTITY_REF_NODE) &&
                (cur->parent != NULL) &&
                (cur->parent->name != NULL) &&
                (cur->parent->name[0] != 'p')) /* p, pre, param */
                xmlOutputBufferWriteString(buf, "\n");
        }
        return;
    }
    /* element without children (and no content to write) */
    if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
        (cur->children == NULL)) {
        /*
         * The end tag is skipped when the tag table sets saveEndTag,
         * except for html and body which always get one.
         */
        if ((info != NULL) && (info->saveEndTag != 0) &&
            (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
            (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
            xmlOutputBufferWriteString(buf, ">");
        } else {
            xmlOutputBufferWriteString(buf, "></");
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                xmlOutputBufferWriteString(buf, ":");
            }
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            xmlOutputBufferWriteString(buf, ">");
        }
        if ((format) && (cur->next != NULL) &&
            (info != NULL) && (!info->isinline)) {
            if ((cur->next->type != HTML_TEXT_NODE) &&
                (cur->next->type != HTML_ENTITY_REF_NODE) &&
                (cur->parent != NULL) &&
                (cur->parent->name != NULL) &&
                (cur->parent->name[0] != 'p')) /* p, pre, param */
                xmlOutputBufferWriteString(buf, "\n");
        }
        return;
    }
    xmlOutputBufferWriteString(buf, ">");
    if ((cur->type != XML_ELEMENT_NODE) &&
        (cur->content != NULL)) {
        /*
         * Uses the OutputBuffer property to automatically convert
         * invalids to charrefs
         */

        xmlOutputBufferWriteString(buf, (const char *) cur->content);
    }
    if (cur->children != NULL) {
        /*
         * With formatting on, a known non-inline element having more than
         * one child gets a newline after its start tag and before its end
         * tag, unless the adjacent child is text or an entity reference
         * or the element name starts with 'p'.
         */
        if ((format) && (info != NULL) && (!info->isinline) &&
            (cur->children->type != HTML_TEXT_NODE) &&
            (cur->children->type != HTML_ENTITY_REF_NODE) &&
            (cur->children != cur->last) &&
            (cur->name != NULL) &&
            (cur->name[0] != 'p')) /* p, pre, param */
            xmlOutputBufferWriteString(buf, "\n");
        htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
        if ((format) && (info != NULL) && (!info->isinline) &&
            (cur->last->type != HTML_TEXT_NODE) &&
            (cur->last->type != HTML_ENTITY_REF_NODE) &&
            (cur->children != cur->last) &&
            (cur->name != NULL) &&
            (cur->name[0] != 'p')) /* p, pre, param */
            xmlOutputBufferWriteString(buf, "\n");
    }
    /* end tag */
    xmlOutputBufferWriteString(buf, "</");
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
        xmlOutputBufferWriteString(buf, ":");
    }
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
    xmlOutputBufferWriteString(buf, ">");
    /* optional newline after the end tag, same rule as for empty elements */
    if ((format) && (info != NULL) && (!info->isinline) &&
        (cur->next != NULL)) {
        if ((cur->next->type != HTML_TEXT_NODE) &&
            (cur->next->type != HTML_ENTITY_REF_NODE) &&
            (cur->parent != NULL) &&
            (cur->parent->name != NULL) &&
            (cur->parent->name[0] != 'p')) /* p, pre, param */
            xmlOutputBufferWriteString(buf, "\n");
    }
}
959:
960: /**
961: * htmlNodeDumpOutput:
962: * @buf: the HTML buffer output
963: * @doc: the document
964: * @cur: the current node
965: * @encoding: the encoding string
966: *
967: * Dump an HTML node, recursive behaviour,children are printed too,
968: * and formatting returns/spaces are added.
969: */
970: void
971: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
972: xmlNodePtr cur, const char *encoding) {
973: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
974: }
975:
976: /**
977: * htmlDocContentDumpFormatOutput:
978: * @buf: the HTML buffer output
979: * @cur: the document
980: * @encoding: the encoding string
981: * @format: should formatting spaces been added
982: *
983: * Dump an HTML document.
984: */
985: void
986: htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
987: const char *encoding, int format) {
988: int type;
989:
990: xmlInitParser();
991:
992: if ((buf == NULL) || (cur == NULL))
993: return;
994:
995: /*
996: * force to output the stuff as HTML, especially for entities
997: */
998: type = cur->type;
999: cur->type = XML_HTML_DOCUMENT_NODE;
1000: if (cur->intSubset != NULL) {
1001: htmlDtdDumpOutput(buf, cur, NULL);
1002: }
1003: if (cur->children != NULL) {
1004: htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
1005: }
1006: xmlOutputBufferWriteString(buf, "\n");
1007: cur->type = (xmlElementType) type;
1008: }
1009:
1010: /**
1011: * htmlDocContentDumpOutput:
1012: * @buf: the HTML buffer output
1013: * @cur: the document
1014: * @encoding: the encoding string
1015: *
1016: * Dump an HTML document. Formating return/spaces are added.
1017: */
1018: void
1019: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1020: const char *encoding) {
1021: htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1022: }
1023:
1024: /************************************************************************
1025: * *
1026: * Saving functions front-ends *
1027: * *
1028: ************************************************************************/
1029:
1030: /**
1031: * htmlDocDump:
1032: * @f: the FILE*
1033: * @cur: the document
1034: *
1035: * Dump an HTML document to an open FILE.
1036: *
1037: * returns: the number of byte written or -1 in case of failure.
1038: */
1039: int
1040: htmlDocDump(FILE *f, xmlDocPtr cur) {
1041: xmlOutputBufferPtr buf;
1042: xmlCharEncodingHandlerPtr handler = NULL;
1043: const char *encoding;
1044: int ret;
1045:
1046: xmlInitParser();
1047:
1048: if ((cur == NULL) || (f == NULL)) {
1049: return(-1);
1050: }
1051:
1052: encoding = (const char *) htmlGetMetaEncoding(cur);
1053:
1054: if (encoding != NULL) {
1055: xmlCharEncoding enc;
1056:
1057: enc = xmlParseCharEncoding(encoding);
1058: if (enc != cur->charset) {
1059: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1060: /*
1061: * Not supported yet
1062: */
1063: return(-1);
1064: }
1065:
1066: handler = xmlFindCharEncodingHandler(encoding);
1067: if (handler == NULL)
1.1.1.2 ! misho 1068: htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1.1 misho 1069: } else {
1070: handler = xmlFindCharEncodingHandler(encoding);
1071: }
1072: }
1073:
1074: /*
1075: * Fallback to HTML or ASCII when the encoding is unspecified
1076: */
1077: if (handler == NULL)
1078: handler = xmlFindCharEncodingHandler("HTML");
1079: if (handler == NULL)
1080: handler = xmlFindCharEncodingHandler("ascii");
1081:
1082: buf = xmlOutputBufferCreateFile(f, handler);
1083: if (buf == NULL) return(-1);
1084: htmlDocContentDumpOutput(buf, cur, NULL);
1085:
1086: ret = xmlOutputBufferClose(buf);
1087: return(ret);
1088: }
1089:
1090: /**
1091: * htmlSaveFile:
1092: * @filename: the filename (or URL)
1093: * @cur: the document
1094: *
1095: * Dump an HTML document to a file. If @filename is "-" the stdout file is
1096: * used.
1097: * returns: the number of byte written or -1 in case of failure.
1098: */
1099: int
1100: htmlSaveFile(const char *filename, xmlDocPtr cur) {
1101: xmlOutputBufferPtr buf;
1102: xmlCharEncodingHandlerPtr handler = NULL;
1103: const char *encoding;
1104: int ret;
1105:
1106: if ((cur == NULL) || (filename == NULL))
1107: return(-1);
1108:
1109: xmlInitParser();
1110:
1111: encoding = (const char *) htmlGetMetaEncoding(cur);
1112:
1113: if (encoding != NULL) {
1114: xmlCharEncoding enc;
1115:
1116: enc = xmlParseCharEncoding(encoding);
1117: if (enc != cur->charset) {
1118: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1119: /*
1120: * Not supported yet
1121: */
1122: return(-1);
1123: }
1124:
1125: handler = xmlFindCharEncodingHandler(encoding);
1126: if (handler == NULL)
1.1.1.2 ! misho 1127: htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1.1 misho 1128: }
1129: }
1130:
1131: /*
1132: * Fallback to HTML or ASCII when the encoding is unspecified
1133: */
1134: if (handler == NULL)
1135: handler = xmlFindCharEncodingHandler("HTML");
1136: if (handler == NULL)
1137: handler = xmlFindCharEncodingHandler("ascii");
1138:
1139: /*
1140: * save the content to a temp buffer.
1141: */
1142: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1143: if (buf == NULL) return(0);
1144:
1145: htmlDocContentDumpOutput(buf, cur, NULL);
1146:
1147: ret = xmlOutputBufferClose(buf);
1148: return(ret);
1149: }
1150:
1151: /**
1152: * htmlSaveFileFormat:
1153: * @filename: the filename
1154: * @cur: the document
1155: * @format: should formatting spaces been added
1156: * @encoding: the document encoding
1157: *
1158: * Dump an HTML document to a file using a given encoding.
1159: *
1160: * returns: the number of byte written or -1 in case of failure.
1161: */
1162: int
1163: htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1164: const char *encoding, int format) {
1165: xmlOutputBufferPtr buf;
1166: xmlCharEncodingHandlerPtr handler = NULL;
1167: int ret;
1168:
1169: if ((cur == NULL) || (filename == NULL))
1170: return(-1);
1171:
1172: xmlInitParser();
1173:
1174: if (encoding != NULL) {
1175: xmlCharEncoding enc;
1176:
1177: enc = xmlParseCharEncoding(encoding);
1178: if (enc != cur->charset) {
1179: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1180: /*
1181: * Not supported yet
1182: */
1183: return(-1);
1184: }
1185:
1186: handler = xmlFindCharEncodingHandler(encoding);
1187: if (handler == NULL)
1.1.1.2 ! misho 1188: htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1.1 misho 1189: }
1190: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1191: } else {
1192: htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1193: }
1194:
1195: /*
1196: * Fallback to HTML or ASCII when the encoding is unspecified
1197: */
1198: if (handler == NULL)
1199: handler = xmlFindCharEncodingHandler("HTML");
1200: if (handler == NULL)
1201: handler = xmlFindCharEncodingHandler("ascii");
1202:
1203: /*
1204: * save the content to a temp buffer.
1205: */
1206: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1207: if (buf == NULL) return(0);
1208:
1209: htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1210:
1211: ret = xmlOutputBufferClose(buf);
1212: return(ret);
1213: }
1214:
1215: /**
1216: * htmlSaveFileEnc:
1217: * @filename: the filename
1218: * @cur: the document
1219: * @encoding: the document encoding
1220: *
1221: * Dump an HTML document to a file using a given encoding
1222: * and formatting returns/spaces are added.
1223: *
1224: * returns: the number of byte written or -1 in case of failure.
1225: */
1226: int
1227: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1228: return(htmlSaveFileFormat(filename, cur, encoding, 1));
1229: }
1230:
1231: #endif /* LIBXML_OUTPUT_ENABLED */
1232:
1233: #define bottom_HTMLtree
1234: #include "elfgcchack.h"
1235: #endif /* LIBXML_HTML_ENABLED */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>