Annotation of embedaddon/libxml2/HTMLtree.c, revision 1.1.1.3
1.1 misho 1: /*
2: * HTMLtree.c : implementation of access function for an HTML tree.
3: *
4: * See Copyright for the status of this software.
5: *
6: * daniel@veillard.com
7: */
8:
9:
10: #define IN_LIBXML
11: #include "libxml.h"
12: #ifdef LIBXML_HTML_ENABLED
13:
14: #include <string.h> /* for memset() only ! */
15:
16: #ifdef HAVE_CTYPE_H
17: #include <ctype.h>
18: #endif
19: #ifdef HAVE_STDLIB_H
20: #include <stdlib.h>
21: #endif
22:
23: #include <libxml/xmlmemory.h>
24: #include <libxml/HTMLparser.h>
25: #include <libxml/HTMLtree.h>
26: #include <libxml/entities.h>
27: #include <libxml/valid.h>
28: #include <libxml/xmlerror.h>
29: #include <libxml/parserInternals.h>
30: #include <libxml/globals.h>
31: #include <libxml/uri.h>
32:
1.1.1.3 ! misho 33: #include "buf.h"
! 34:
1.1 misho 35: /************************************************************************
36: * *
1.1.1.3 ! misho 37: * Getting/Setting encoding meta tags *
1.1 misho 38: * *
39: ************************************************************************/
40:
41: /**
42: * htmlGetMetaEncoding:
43: * @doc: the document
1.1.1.3 ! misho 44: *
1.1 misho 45: * Encoding definition lookup in the Meta tags
46: *
47: * Returns the current encoding as flagged in the HTML source
48: */
49: const xmlChar *
50: htmlGetMetaEncoding(htmlDocPtr doc) {
51: htmlNodePtr cur;
52: const xmlChar *content;
53: const xmlChar *encoding;
54:
55: if (doc == NULL)
56: return(NULL);
57: cur = doc->children;
58:
59: /*
60: * Search the html
61: */
62: while (cur != NULL) {
63: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
64: if (xmlStrEqual(cur->name, BAD_CAST"html"))
65: break;
66: if (xmlStrEqual(cur->name, BAD_CAST"head"))
67: goto found_head;
68: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
69: goto found_meta;
70: }
71: cur = cur->next;
72: }
73: if (cur == NULL)
74: return(NULL);
75: cur = cur->children;
76:
77: /*
78: * Search the head
79: */
80: while (cur != NULL) {
81: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
82: if (xmlStrEqual(cur->name, BAD_CAST"head"))
83: break;
84: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
85: goto found_meta;
86: }
87: cur = cur->next;
88: }
89: if (cur == NULL)
90: return(NULL);
91: found_head:
92: cur = cur->children;
93:
94: /*
95: * Search the meta elements
96: */
97: found_meta:
98: while (cur != NULL) {
99: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
100: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
101: xmlAttrPtr attr = cur->properties;
102: int http;
103: const xmlChar *value;
104:
105: content = NULL;
106: http = 0;
107: while (attr != NULL) {
108: if ((attr->children != NULL) &&
109: (attr->children->type == XML_TEXT_NODE) &&
110: (attr->children->next == NULL)) {
111: value = attr->children->content;
112: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
113: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
114: http = 1;
115: else if ((value != NULL)
116: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
117: content = value;
118: if ((http != 0) && (content != NULL))
119: goto found_content;
120: }
121: attr = attr->next;
122: }
123: }
124: }
125: cur = cur->next;
126: }
127: return(NULL);
128:
129: found_content:
130: encoding = xmlStrstr(content, BAD_CAST"charset=");
1.1.1.3 ! misho 131: if (encoding == NULL)
1.1 misho 132: encoding = xmlStrstr(content, BAD_CAST"Charset=");
1.1.1.3 ! misho 133: if (encoding == NULL)
1.1 misho 134: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
135: if (encoding != NULL) {
136: encoding += 8;
137: } else {
138: encoding = xmlStrstr(content, BAD_CAST"charset =");
1.1.1.3 ! misho 139: if (encoding == NULL)
1.1 misho 140: encoding = xmlStrstr(content, BAD_CAST"Charset =");
1.1.1.3 ! misho 141: if (encoding == NULL)
1.1 misho 142: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
143: if (encoding != NULL)
144: encoding += 9;
145: }
146: if (encoding != NULL) {
147: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
148: }
149: return(encoding);
150: }
151:
152: /**
153: * htmlSetMetaEncoding:
154: * @doc: the document
155: * @encoding: the encoding string
1.1.1.2 misho 156: *
1.1 misho 157: * Sets the current encoding in the Meta tags
158: * NOTE: this will not change the document content encoding, just
159: * the META flag associated.
160: *
161: * Returns 0 in case of success and -1 in case of error
162: */
163: int
164: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
165: htmlNodePtr cur, meta = NULL, head = NULL;
166: const xmlChar *content = NULL;
167: char newcontent[100];
168:
1.1.1.2 misho 169: newcontent[0] = 0;
1.1 misho 170:
171: if (doc == NULL)
172: return(-1);
173:
174: /* html isn't a real encoding it's just libxml2 way to get entities */
175: if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
176: return(-1);
177:
178: if (encoding != NULL) {
179: snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
180: (char *)encoding);
181: newcontent[sizeof(newcontent) - 1] = 0;
182: }
183:
184: cur = doc->children;
185:
186: /*
187: * Search the html
188: */
189: while (cur != NULL) {
190: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
191: if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
192: break;
193: if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
194: goto found_head;
195: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
196: goto found_meta;
197: }
198: cur = cur->next;
199: }
200: if (cur == NULL)
201: return(-1);
202: cur = cur->children;
203:
204: /*
205: * Search the head
206: */
207: while (cur != NULL) {
208: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
209: if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
210: break;
211: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
212: head = cur->parent;
213: goto found_meta;
214: }
215: }
216: cur = cur->next;
217: }
218: if (cur == NULL)
219: return(-1);
220: found_head:
221: head = cur;
222: if (cur->children == NULL)
223: goto create;
224: cur = cur->children;
225:
226: found_meta:
227: /*
228: * Search and update all the remaining the meta elements carrying
229: * encoding informations
230: */
231: while (cur != NULL) {
232: if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
233: if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
234: xmlAttrPtr attr = cur->properties;
235: int http;
236: const xmlChar *value;
237:
238: content = NULL;
239: http = 0;
240: while (attr != NULL) {
241: if ((attr->children != NULL) &&
242: (attr->children->type == XML_TEXT_NODE) &&
243: (attr->children->next == NULL)) {
244: value = attr->children->content;
245: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
246: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
247: http = 1;
248: else
249: {
1.1.1.2 misho 250: if ((value != NULL) &&
1.1 misho 251: (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
252: content = value;
253: }
254: if ((http != 0) && (content != NULL))
255: break;
256: }
257: attr = attr->next;
258: }
259: if ((http != 0) && (content != NULL)) {
260: meta = cur;
261: break;
262: }
263:
264: }
265: }
266: cur = cur->next;
267: }
268: create:
269: if (meta == NULL) {
270: if ((encoding != NULL) && (head != NULL)) {
271: /*
272: * Create a new Meta element with the right attributes
273: */
274:
275: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
276: if (head->children == NULL)
277: xmlAddChild(head, meta);
278: else
279: xmlAddPrevSibling(head->children, meta);
280: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
281: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
282: }
283: } else {
1.1.1.2 misho 284: /* remove the meta tag if NULL is passed */
285: if (encoding == NULL) {
286: xmlUnlinkNode(meta);
287: xmlFreeNode(meta);
288: }
1.1 misho 289: /* change the document only if there is a real encoding change */
1.1.1.2 misho 290: else if (xmlStrcasestr(content, encoding) == NULL) {
1.1 misho 291: xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
292: }
293: }
294:
295:
296: return(0);
297: }
298:
299: /**
300: * booleanHTMLAttrs:
301: *
302: * These are the HTML attributes which will be output
303: * in minimized form, i.e. <option selected="selected"> will be
304: * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
305: *
306: */
307: static const char* htmlBooleanAttrs[] = {
308: "checked", "compact", "declare", "defer", "disabled", "ismap",
309: "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
310: "selected", NULL
311: };
312:
313:
314: /**
315: * htmlIsBooleanAttr:
316: * @name: the name of the attribute to check
317: *
318: * Determine if a given attribute is a boolean attribute.
1.1.1.3 ! misho 319: *
1.1 misho 320: * returns: false if the attribute is not boolean, true otherwise.
321: */
322: int
323: htmlIsBooleanAttr(const xmlChar *name)
324: {
325: int i = 0;
326:
327: while (htmlBooleanAttrs[i] != NULL) {
328: if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
329: return 1;
330: i++;
331: }
332: return 0;
333: }
334:
335: #ifdef LIBXML_OUTPUT_ENABLED
336: /*
337: * private routine exported from xmlIO.c
338: */
339: xmlOutputBufferPtr
340: xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
341: /************************************************************************
342: * *
1.1.1.3 ! misho 343: * Output error handlers *
1.1 misho 344: * *
345: ************************************************************************/
346: /**
347: * htmlSaveErrMemory:
348: * @extra: extra informations
349: *
350: * Handle an out of memory condition
351: */
352: static void
353: htmlSaveErrMemory(const char *extra)
354: {
355: __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
356: }
357:
358: /**
359: * htmlSaveErr:
360: * @code: the error number
361: * @node: the location of the error.
362: * @extra: extra informations
363: *
364: * Handle an out of memory condition
365: */
366: static void
367: htmlSaveErr(int code, xmlNodePtr node, const char *extra)
368: {
369: const char *msg = NULL;
370:
371: switch(code) {
372: case XML_SAVE_NOT_UTF8:
373: msg = "string is not in UTF-8\n";
374: break;
375: case XML_SAVE_CHAR_INVALID:
376: msg = "invalid character value\n";
377: break;
378: case XML_SAVE_UNKNOWN_ENCODING:
379: msg = "unknown encoding %s\n";
380: break;
381: case XML_SAVE_NO_DOCTYPE:
382: msg = "HTML has no DOCTYPE\n";
383: break;
384: default:
385: msg = "unexpected error number\n";
386: }
387: __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
388: }
389:
390: /************************************************************************
391: * *
1.1.1.3 ! misho 392: * Dumping HTML tree content to a simple buffer *
1.1 misho 393: * *
394: ************************************************************************/
395:
396: /**
1.1.1.3 ! misho 397: * htmlBufNodeDumpFormat:
! 398: * @buf: the xmlBufPtr output
1.1 misho 399: * @doc: the document
400: * @cur: the current node
401: * @format: should formatting spaces been added
402: *
403: * Dump an HTML node, recursive behaviour,children are printed too.
404: *
405: * Returns the number of byte written or -1 in case of error
406: */
1.1.1.3 ! misho 407: static size_t
! 408: htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
1.1 misho 409: int format) {
1.1.1.3 ! misho 410: size_t use;
1.1 misho 411: int ret;
412: xmlOutputBufferPtr outbuf;
413:
414: if (cur == NULL) {
415: return (-1);
416: }
417: if (buf == NULL) {
418: return (-1);
419: }
420: outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
421: if (outbuf == NULL) {
422: htmlSaveErrMemory("allocating HTML output buffer");
423: return (-1);
424: }
425: memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
426: outbuf->buffer = buf;
427: outbuf->encoder = NULL;
428: outbuf->writecallback = NULL;
429: outbuf->closecallback = NULL;
430: outbuf->context = NULL;
431: outbuf->written = 0;
432:
1.1.1.3 ! misho 433: use = xmlBufUse(buf);
1.1 misho 434: htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
435: xmlFree(outbuf);
1.1.1.3 ! misho 436: ret = xmlBufUse(buf) - use;
1.1 misho 437: return (ret);
438: }
439:
440: /**
441: * htmlNodeDump:
442: * @buf: the HTML buffer output
443: * @doc: the document
444: * @cur: the current node
445: *
446: * Dump an HTML node, recursive behaviour,children are printed too,
447: * and formatting returns are added.
448: *
449: * Returns the number of byte written or -1 in case of error
450: */
451: int
452: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
1.1.1.3 ! misho 453: xmlBufPtr buffer;
! 454: size_t ret;
! 455:
! 456: if ((buf == NULL) || (cur == NULL))
! 457: return(-1);
! 458:
1.1 misho 459: xmlInitParser();
1.1.1.3 ! misho 460: buffer = xmlBufFromBuffer(buf);
! 461: if (buffer == NULL)
! 462: return(-1);
! 463:
! 464: ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
! 465:
! 466: xmlBufBackToBuffer(buffer);
1.1 misho 467:
1.1.1.3 ! misho 468: if (ret > INT_MAX)
! 469: return(-1);
! 470: return((int) ret);
1.1 misho 471: }
472:
473: /**
474: * htmlNodeDumpFileFormat:
475: * @out: the FILE pointer
476: * @doc: the document
477: * @cur: the current node
478: * @encoding: the document encoding
479: * @format: should formatting spaces been added
480: *
481: * Dump an HTML node, recursive behaviour,children are printed too.
482: *
483: * TODO: if encoding == NULL try to save in the doc encoding
484: *
485: * returns: the number of byte written or -1 in case of failure.
486: */
487: int
488: htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
489: xmlNodePtr cur, const char *encoding, int format) {
490: xmlOutputBufferPtr buf;
491: xmlCharEncodingHandlerPtr handler = NULL;
492: int ret;
493:
494: xmlInitParser();
495:
496: if (encoding != NULL) {
497: xmlCharEncoding enc;
498:
499: enc = xmlParseCharEncoding(encoding);
500: if (enc != XML_CHAR_ENCODING_UTF8) {
501: handler = xmlFindCharEncodingHandler(encoding);
502: if (handler == NULL)
1.1.1.2 misho 503: htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1.1 misho 504: }
505: }
506:
507: /*
508: * Fallback to HTML or ASCII when the encoding is unspecified
509: */
510: if (handler == NULL)
511: handler = xmlFindCharEncodingHandler("HTML");
512: if (handler == NULL)
513: handler = xmlFindCharEncodingHandler("ascii");
514:
1.1.1.3 ! misho 515: /*
1.1 misho 516: * save the content to a temp buffer.
517: */
518: buf = xmlOutputBufferCreateFile(out, handler);
519: if (buf == NULL) return(0);
520:
521: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
522:
523: ret = xmlOutputBufferClose(buf);
524: return(ret);
525: }
526:
527: /**
528: * htmlNodeDumpFile:
529: * @out: the FILE pointer
530: * @doc: the document
531: * @cur: the current node
532: *
533: * Dump an HTML node, recursive behaviour,children are printed too,
534: * and formatting returns are added.
535: */
536: void
537: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
538: htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
539: }
540:
541: /**
542: * htmlDocDumpMemoryFormat:
543: * @cur: the document
544: * @mem: OUT: the memory pointer
545: * @size: OUT: the memory length
546: * @format: should formatting spaces been added
547: *
548: * Dump an HTML document in memory and return the xmlChar * and it's size.
549: * It's up to the caller to free the memory.
550: */
551: void
552: htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
553: xmlOutputBufferPtr buf;
554: xmlCharEncodingHandlerPtr handler = NULL;
555: const char *encoding;
556:
557: xmlInitParser();
558:
559: if ((mem == NULL) || (size == NULL))
560: return;
561: if (cur == NULL) {
562: *mem = NULL;
563: *size = 0;
564: return;
565: }
566:
567: encoding = (const char *) htmlGetMetaEncoding(cur);
568:
569: if (encoding != NULL) {
570: xmlCharEncoding enc;
571:
572: enc = xmlParseCharEncoding(encoding);
573: if (enc != cur->charset) {
574: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
575: /*
576: * Not supported yet
577: */
578: *mem = NULL;
579: *size = 0;
580: return;
581: }
582:
583: handler = xmlFindCharEncodingHandler(encoding);
1.1.1.2 misho 584: if (handler == NULL)
585: htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
586:
1.1 misho 587: } else {
588: handler = xmlFindCharEncodingHandler(encoding);
589: }
590: }
591:
592: /*
593: * Fallback to HTML or ASCII when the encoding is unspecified
594: */
595: if (handler == NULL)
596: handler = xmlFindCharEncodingHandler("HTML");
597: if (handler == NULL)
598: handler = xmlFindCharEncodingHandler("ascii");
599:
600: buf = xmlAllocOutputBufferInternal(handler);
601: if (buf == NULL) {
602: *mem = NULL;
603: *size = 0;
604: return;
605: }
606:
1.1.1.2 misho 607: htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
1.1 misho 608:
609: xmlOutputBufferFlush(buf);
610: if (buf->conv != NULL) {
1.1.1.3 ! misho 611: *size = xmlBufUse(buf->conv);
! 612: *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
1.1 misho 613: } else {
1.1.1.3 ! misho 614: *size = xmlBufUse(buf->buffer);
! 615: *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
1.1 misho 616: }
617: (void)xmlOutputBufferClose(buf);
618: }
619:
620: /**
621: * htmlDocDumpMemory:
622: * @cur: the document
623: * @mem: OUT: the memory pointer
624: * @size: OUT: the memory length
625: *
626: * Dump an HTML document in memory and return the xmlChar * and it's size.
627: * It's up to the caller to free the memory.
628: */
629: void
630: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
631: htmlDocDumpMemoryFormat(cur, mem, size, 1);
632: }
633:
634:
635: /************************************************************************
636: * *
1.1.1.3 ! misho 637: * Dumping HTML tree content to an I/O output buffer *
1.1 misho 638: * *
639: ************************************************************************/
640:
641: void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
642:
643: /**
644: * htmlDtdDumpOutput:
645: * @buf: the HTML buffer output
646: * @doc: the document
647: * @encoding: the encoding string
1.1.1.3 ! misho 648: *
1.1 misho 649: * TODO: check whether encoding is needed
650: *
651: * Dump the HTML document DTD, if any.
652: */
653: static void
654: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
655: const char *encoding ATTRIBUTE_UNUSED) {
656: xmlDtdPtr cur = doc->intSubset;
657:
658: if (cur == NULL) {
659: htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
660: return;
661: }
662: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
663: xmlOutputBufferWriteString(buf, (const char *)cur->name);
664: if (cur->ExternalID != NULL) {
665: xmlOutputBufferWriteString(buf, " PUBLIC ");
1.1.1.3 ! misho 666: xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
1.1 misho 667: if (cur->SystemID != NULL) {
668: xmlOutputBufferWriteString(buf, " ");
1.1.1.3 ! misho 669: xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
! 670: }
1.1 misho 671: } else if (cur->SystemID != NULL) {
672: xmlOutputBufferWriteString(buf, " SYSTEM ");
1.1.1.3 ! misho 673: xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
1.1 misho 674: }
675: xmlOutputBufferWriteString(buf, ">\n");
676: }
677:
678: /**
679: * htmlAttrDumpOutput:
680: * @buf: the HTML buffer output
681: * @doc: the document
682: * @cur: the attribute pointer
683: * @encoding: the encoding string
684: *
685: * Dump an HTML attribute
686: */
687: static void
688: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
689: const char *encoding ATTRIBUTE_UNUSED) {
690: xmlChar *value;
691:
692: /*
1.1.1.3 ! misho 693: * The html output method should not escape a & character
! 694: * occurring in an attribute value immediately followed by
! 695: * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
! 696: * This is implemented in xmlEncodeEntitiesReentrant
1.1 misho 697: */
698:
699: if (cur == NULL) {
700: return;
701: }
702: xmlOutputBufferWriteString(buf, " ");
703: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
704: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
705: xmlOutputBufferWriteString(buf, ":");
706: }
707: xmlOutputBufferWriteString(buf, (const char *)cur->name);
708: if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
709: value = xmlNodeListGetString(doc, cur->children, 0);
710: if (value) {
711: xmlOutputBufferWriteString(buf, "=");
712: if ((cur->ns == NULL) && (cur->parent != NULL) &&
713: (cur->parent->ns == NULL) &&
714: ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
715: (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
716: (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
717: ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
718: (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
719: xmlChar *escaped;
720: xmlChar *tmp = value;
721:
722: while (IS_BLANK_CH(*tmp)) tmp++;
723:
1.1.1.3 ! misho 724: /*
! 725: * the < and > have already been escaped at the entity level
! 726: * And doing so here breaks server side includes
! 727: */
! 728: escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>");
1.1 misho 729: if (escaped != NULL) {
1.1.1.3 ! misho 730: xmlBufWriteQuotedString(buf->buffer, escaped);
1.1 misho 731: xmlFree(escaped);
732: } else {
1.1.1.3 ! misho 733: xmlBufWriteQuotedString(buf->buffer, value);
1.1 misho 734: }
735: } else {
1.1.1.3 ! misho 736: xmlBufWriteQuotedString(buf->buffer, value);
1.1 misho 737: }
738: xmlFree(value);
739: } else {
740: xmlOutputBufferWriteString(buf, "=\"\"");
741: }
742: }
743: }
744:
745: /**
746: * htmlAttrListDumpOutput:
747: * @buf: the HTML buffer output
748: * @doc: the document
749: * @cur: the first attribute pointer
750: * @encoding: the encoding string
751: *
752: * Dump a list of HTML attributes
753: */
754: static void
755: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
756: if (cur == NULL) {
757: return;
758: }
759: while (cur != NULL) {
760: htmlAttrDumpOutput(buf, doc, cur, encoding);
761: cur = cur->next;
762: }
763: }
764:
765:
766:
767: /**
768: * htmlNodeListDumpOutput:
769: * @buf: the HTML buffer output
770: * @doc: the document
771: * @cur: the first node
772: * @encoding: the encoding string
773: * @format: should formatting spaces been added
774: *
775: * Dump an HTML node list, recursive behaviour,children are printed too.
776: */
777: static void
778: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
779: xmlNodePtr cur, const char *encoding, int format) {
780: if (cur == NULL) {
781: return;
782: }
783: while (cur != NULL) {
784: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
785: cur = cur->next;
786: }
787: }
788:
789: /**
790: * htmlNodeDumpFormatOutput:
791: * @buf: the HTML buffer output
792: * @doc: the document
793: * @cur: the current node
794: * @encoding: the encoding string
795: * @format: should formatting spaces been added
796: *
797: * Dump an HTML node, recursive behaviour,children are printed too.
798: */
799: void
800: htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
801: xmlNodePtr cur, const char *encoding, int format) {
802: const htmlElemDesc * info;
803:
804: xmlInitParser();
805:
806: if ((cur == NULL) || (buf == NULL)) {
807: return;
808: }
809: /*
810: * Special cases.
811: */
812: if (cur->type == XML_DTD_NODE)
813: return;
814: if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
815: (cur->type == XML_DOCUMENT_NODE)){
816: htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
817: return;
818: }
819: if (cur->type == XML_ATTRIBUTE_NODE) {
820: htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
821: return;
822: }
823: if (cur->type == HTML_TEXT_NODE) {
824: if (cur->content != NULL) {
825: if (((cur->name == (const xmlChar *)xmlStringText) ||
826: (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
827: ((cur->parent == NULL) ||
828: ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
829: (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
830: xmlChar *buffer;
831:
832: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
833: if (buffer != NULL) {
834: xmlOutputBufferWriteString(buf, (const char *)buffer);
835: xmlFree(buffer);
836: }
837: } else {
838: xmlOutputBufferWriteString(buf, (const char *)cur->content);
839: }
840: }
841: return;
842: }
843: if (cur->type == HTML_COMMENT_NODE) {
844: if (cur->content != NULL) {
845: xmlOutputBufferWriteString(buf, "<!--");
846: xmlOutputBufferWriteString(buf, (const char *)cur->content);
847: xmlOutputBufferWriteString(buf, "-->");
848: }
849: return;
850: }
851: if (cur->type == HTML_PI_NODE) {
852: if (cur->name == NULL)
853: return;
854: xmlOutputBufferWriteString(buf, "<?");
855: xmlOutputBufferWriteString(buf, (const char *)cur->name);
856: if (cur->content != NULL) {
857: xmlOutputBufferWriteString(buf, " ");
858: xmlOutputBufferWriteString(buf, (const char *)cur->content);
859: }
860: xmlOutputBufferWriteString(buf, ">");
861: return;
862: }
863: if (cur->type == HTML_ENTITY_REF_NODE) {
864: xmlOutputBufferWriteString(buf, "&");
865: xmlOutputBufferWriteString(buf, (const char *)cur->name);
866: xmlOutputBufferWriteString(buf, ";");
867: return;
868: }
869: if (cur->type == HTML_PRESERVE_NODE) {
870: if (cur->content != NULL) {
871: xmlOutputBufferWriteString(buf, (const char *)cur->content);
872: }
873: return;
874: }
875:
876: /*
877: * Get specific HTML info for that node.
878: */
879: if (cur->ns == NULL)
880: info = htmlTagLookup(cur->name);
881: else
882: info = NULL;
883:
884: xmlOutputBufferWriteString(buf, "<");
885: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
886: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
887: xmlOutputBufferWriteString(buf, ":");
888: }
889: xmlOutputBufferWriteString(buf, (const char *)cur->name);
890: if (cur->nsDef)
891: xmlNsListDumpOutput(buf, cur->nsDef);
892: if (cur->properties != NULL)
893: htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
894:
895: if ((info != NULL) && (info->empty)) {
896: xmlOutputBufferWriteString(buf, ">");
897: if ((format) && (!info->isinline) && (cur->next != NULL)) {
898: if ((cur->next->type != HTML_TEXT_NODE) &&
899: (cur->next->type != HTML_ENTITY_REF_NODE) &&
900: (cur->parent != NULL) &&
901: (cur->parent->name != NULL) &&
902: (cur->parent->name[0] != 'p')) /* p, pre, param */
903: xmlOutputBufferWriteString(buf, "\n");
904: }
905: return;
906: }
907: if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
908: (cur->children == NULL)) {
909: if ((info != NULL) && (info->saveEndTag != 0) &&
910: (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
911: (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
912: xmlOutputBufferWriteString(buf, ">");
913: } else {
914: xmlOutputBufferWriteString(buf, "></");
915: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
916: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
917: xmlOutputBufferWriteString(buf, ":");
918: }
919: xmlOutputBufferWriteString(buf, (const char *)cur->name);
920: xmlOutputBufferWriteString(buf, ">");
921: }
922: if ((format) && (cur->next != NULL) &&
923: (info != NULL) && (!info->isinline)) {
924: if ((cur->next->type != HTML_TEXT_NODE) &&
925: (cur->next->type != HTML_ENTITY_REF_NODE) &&
926: (cur->parent != NULL) &&
927: (cur->parent->name != NULL) &&
928: (cur->parent->name[0] != 'p')) /* p, pre, param */
929: xmlOutputBufferWriteString(buf, "\n");
930: }
931: return;
932: }
933: xmlOutputBufferWriteString(buf, ">");
934: if ((cur->type != XML_ELEMENT_NODE) &&
935: (cur->content != NULL)) {
936: /*
937: * Uses the OutputBuffer property to automatically convert
938: * invalids to charrefs
939: */
940:
941: xmlOutputBufferWriteString(buf, (const char *) cur->content);
942: }
943: if (cur->children != NULL) {
944: if ((format) && (info != NULL) && (!info->isinline) &&
945: (cur->children->type != HTML_TEXT_NODE) &&
946: (cur->children->type != HTML_ENTITY_REF_NODE) &&
947: (cur->children != cur->last) &&
948: (cur->name != NULL) &&
949: (cur->name[0] != 'p')) /* p, pre, param */
950: xmlOutputBufferWriteString(buf, "\n");
951: htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
952: if ((format) && (info != NULL) && (!info->isinline) &&
953: (cur->last->type != HTML_TEXT_NODE) &&
954: (cur->last->type != HTML_ENTITY_REF_NODE) &&
955: (cur->children != cur->last) &&
956: (cur->name != NULL) &&
957: (cur->name[0] != 'p')) /* p, pre, param */
958: xmlOutputBufferWriteString(buf, "\n");
959: }
960: xmlOutputBufferWriteString(buf, "</");
961: if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
962: xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
963: xmlOutputBufferWriteString(buf, ":");
964: }
965: xmlOutputBufferWriteString(buf, (const char *)cur->name);
966: xmlOutputBufferWriteString(buf, ">");
967: if ((format) && (info != NULL) && (!info->isinline) &&
968: (cur->next != NULL)) {
969: if ((cur->next->type != HTML_TEXT_NODE) &&
970: (cur->next->type != HTML_ENTITY_REF_NODE) &&
971: (cur->parent != NULL) &&
972: (cur->parent->name != NULL) &&
973: (cur->parent->name[0] != 'p')) /* p, pre, param */
974: xmlOutputBufferWriteString(buf, "\n");
975: }
976: }
977:
978: /**
979: * htmlNodeDumpOutput:
980: * @buf: the HTML buffer output
981: * @doc: the document
982: * @cur: the current node
983: * @encoding: the encoding string
984: *
985: * Dump an HTML node, recursive behaviour,children are printed too,
986: * and formatting returns/spaces are added.
987: */
988: void
989: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
990: xmlNodePtr cur, const char *encoding) {
991: htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
992: }
993:
994: /**
995: * htmlDocContentDumpFormatOutput:
996: * @buf: the HTML buffer output
997: * @cur: the document
998: * @encoding: the encoding string
999: * @format: should formatting spaces been added
1000: *
1001: * Dump an HTML document.
1002: */
1003: void
1004: htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1005: const char *encoding, int format) {
1006: int type;
1007:
1008: xmlInitParser();
1009:
1010: if ((buf == NULL) || (cur == NULL))
1011: return;
1012:
1013: /*
1014: * force to output the stuff as HTML, especially for entities
1015: */
1016: type = cur->type;
1017: cur->type = XML_HTML_DOCUMENT_NODE;
1018: if (cur->intSubset != NULL) {
1019: htmlDtdDumpOutput(buf, cur, NULL);
1020: }
1021: if (cur->children != NULL) {
1022: htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
1023: }
1024: xmlOutputBufferWriteString(buf, "\n");
1025: cur->type = (xmlElementType) type;
1026: }
1027:
1028: /**
1029: * htmlDocContentDumpOutput:
1030: * @buf: the HTML buffer output
1031: * @cur: the document
1032: * @encoding: the encoding string
1033: *
1034: * Dump an HTML document. Formating return/spaces are added.
1035: */
1036: void
1037: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1038: const char *encoding) {
1039: htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1040: }
1041:
1042: /************************************************************************
1043: * *
1044: * Saving functions front-ends *
1045: * *
1046: ************************************************************************/
1047:
1048: /**
1049: * htmlDocDump:
1050: * @f: the FILE*
1051: * @cur: the document
1052: *
1053: * Dump an HTML document to an open FILE.
1054: *
1055: * returns: the number of byte written or -1 in case of failure.
1056: */
1057: int
1058: htmlDocDump(FILE *f, xmlDocPtr cur) {
1059: xmlOutputBufferPtr buf;
1060: xmlCharEncodingHandlerPtr handler = NULL;
1061: const char *encoding;
1062: int ret;
1063:
1064: xmlInitParser();
1065:
1066: if ((cur == NULL) || (f == NULL)) {
1067: return(-1);
1068: }
1069:
1070: encoding = (const char *) htmlGetMetaEncoding(cur);
1071:
1072: if (encoding != NULL) {
1073: xmlCharEncoding enc;
1074:
1075: enc = xmlParseCharEncoding(encoding);
1076: if (enc != cur->charset) {
1077: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1078: /*
1079: * Not supported yet
1080: */
1081: return(-1);
1082: }
1083:
1084: handler = xmlFindCharEncodingHandler(encoding);
1085: if (handler == NULL)
1.1.1.2 misho 1086: htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1.1 misho 1087: } else {
1088: handler = xmlFindCharEncodingHandler(encoding);
1089: }
1090: }
1091:
1092: /*
1093: * Fallback to HTML or ASCII when the encoding is unspecified
1094: */
1095: if (handler == NULL)
1096: handler = xmlFindCharEncodingHandler("HTML");
1097: if (handler == NULL)
1098: handler = xmlFindCharEncodingHandler("ascii");
1099:
1100: buf = xmlOutputBufferCreateFile(f, handler);
1101: if (buf == NULL) return(-1);
1102: htmlDocContentDumpOutput(buf, cur, NULL);
1103:
1104: ret = xmlOutputBufferClose(buf);
1105: return(ret);
1106: }
1107:
1108: /**
1109: * htmlSaveFile:
1110: * @filename: the filename (or URL)
1111: * @cur: the document
1112: *
1113: * Dump an HTML document to a file. If @filename is "-" the stdout file is
1114: * used.
1115: * returns: the number of byte written or -1 in case of failure.
1116: */
1117: int
1118: htmlSaveFile(const char *filename, xmlDocPtr cur) {
1119: xmlOutputBufferPtr buf;
1120: xmlCharEncodingHandlerPtr handler = NULL;
1121: const char *encoding;
1122: int ret;
1123:
1124: if ((cur == NULL) || (filename == NULL))
1125: return(-1);
1.1.1.3 ! misho 1126:
1.1 misho 1127: xmlInitParser();
1128:
1129: encoding = (const char *) htmlGetMetaEncoding(cur);
1130:
1131: if (encoding != NULL) {
1132: xmlCharEncoding enc;
1133:
1134: enc = xmlParseCharEncoding(encoding);
1135: if (enc != cur->charset) {
1136: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1137: /*
1138: * Not supported yet
1139: */
1140: return(-1);
1141: }
1142:
1143: handler = xmlFindCharEncodingHandler(encoding);
1144: if (handler == NULL)
1.1.1.2 misho 1145: htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1.1 misho 1146: }
1147: }
1148:
1149: /*
1150: * Fallback to HTML or ASCII when the encoding is unspecified
1151: */
1152: if (handler == NULL)
1153: handler = xmlFindCharEncodingHandler("HTML");
1154: if (handler == NULL)
1155: handler = xmlFindCharEncodingHandler("ascii");
1156:
1.1.1.3 ! misho 1157: /*
1.1 misho 1158: * save the content to a temp buffer.
1159: */
1160: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1161: if (buf == NULL) return(0);
1162:
1163: htmlDocContentDumpOutput(buf, cur, NULL);
1164:
1165: ret = xmlOutputBufferClose(buf);
1166: return(ret);
1167: }
1168:
1169: /**
1170: * htmlSaveFileFormat:
1171: * @filename: the filename
1172: * @cur: the document
1173: * @format: should formatting spaces been added
1174: * @encoding: the document encoding
1175: *
1176: * Dump an HTML document to a file using a given encoding.
1.1.1.3 ! misho 1177: *
1.1 misho 1178: * returns: the number of byte written or -1 in case of failure.
1179: */
1180: int
1181: htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1182: const char *encoding, int format) {
1183: xmlOutputBufferPtr buf;
1184: xmlCharEncodingHandlerPtr handler = NULL;
1185: int ret;
1186:
1187: if ((cur == NULL) || (filename == NULL))
1188: return(-1);
1189:
1190: xmlInitParser();
1191:
1192: if (encoding != NULL) {
1193: xmlCharEncoding enc;
1194:
1195: enc = xmlParseCharEncoding(encoding);
1196: if (enc != cur->charset) {
1197: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1198: /*
1199: * Not supported yet
1200: */
1201: return(-1);
1202: }
1203:
1204: handler = xmlFindCharEncodingHandler(encoding);
1205: if (handler == NULL)
1.1.1.2 misho 1206: htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1.1 misho 1207: }
1208: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1209: } else {
1210: htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1211: }
1212:
1213: /*
1214: * Fallback to HTML or ASCII when the encoding is unspecified
1215: */
1216: if (handler == NULL)
1217: handler = xmlFindCharEncodingHandler("HTML");
1218: if (handler == NULL)
1219: handler = xmlFindCharEncodingHandler("ascii");
1220:
1.1.1.3 ! misho 1221: /*
1.1 misho 1222: * save the content to a temp buffer.
1223: */
1224: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1225: if (buf == NULL) return(0);
1226:
1227: htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1228:
1229: ret = xmlOutputBufferClose(buf);
1230: return(ret);
1231: }
1232:
1233: /**
1234: * htmlSaveFileEnc:
1235: * @filename: the filename
1236: * @cur: the document
1237: * @encoding: the document encoding
1238: *
1239: * Dump an HTML document to a file using a given encoding
1240: * and formatting returns/spaces are added.
1.1.1.3 ! misho 1241: *
1.1 misho 1242: * returns: the number of byte written or -1 in case of failure.
1243: */
1244: int
1245: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1246: return(htmlSaveFileFormat(filename, cur, encoding, 1));
1247: }
1248:
1249: #endif /* LIBXML_OUTPUT_ENABLED */
1250:
1251: #define bottom_HTMLtree
1252: #include "elfgcchack.h"
1253: #endif /* LIBXML_HTML_ENABLED */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>