Return to htmltidy.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / curl / docs / examples |
1.1 ! misho 1: /*************************************************************************** ! 2: * _ _ ____ _ ! 3: * Project ___| | | | _ \| | ! 4: * / __| | | | |_) | | ! 5: * | (__| |_| | _ <| |___ ! 6: * \___|\___/|_| \_\_____| ! 7: * ! 8: * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al. ! 9: * ! 10: * This software is licensed as described in the file COPYING, which ! 11: * you should have received as part of this distribution. The terms ! 12: * are also available at https://curl.haxx.se/docs/copyright.html. ! 13: * ! 14: * You may opt to use, copy, modify, merge, publish, distribute and/or sell ! 15: * copies of the Software, and permit persons to whom the Software is ! 16: * furnished to do so, under the terms of the COPYING file. ! 17: * ! 18: * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY ! 19: * KIND, either express or implied. ! 20: * ! 21: ***************************************************************************/ ! 22: /* <DESC> ! 23: * Download a document and use libtidy to parse the HTML. ! 24: * </DESC> ! 25: */ ! 26: /* ! 27: * LibTidy => https://www.html-tidy.org/ ! 28: */ ! 29: ! 30: #include <stdio.h> ! 31: #include <tidy.h> ! 32: #include <tidybuffio.h> ! 33: #include <curl/curl.h> ! 34: ! 35: /* curl write callback, to fill tidy's input buffer... */ ! 36: uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out) ! 37: { ! 38: uint r; ! 39: r = size * nmemb; ! 40: tidyBufAppend(out, in, r); ! 41: return r; ! 42: } ! 43: ! 44: /* Traverse the document tree */ ! 45: void dumpNode(TidyDoc doc, TidyNode tnod, int indent) ! 46: { ! 47: TidyNode child; ! 48: for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) { ! 49: ctmbstr name = tidyNodeGetName(child); ! 50: if(name) { ! 51: /* if it has a name, then it's an HTML tag ... */ ! 52: TidyAttr attr; ! 53: printf("%*.*s%s ", indent, indent, "<", name); ! 54: /* walk the attribute list */ ! 55: for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) { ! 56: printf(tidyAttrName(attr)); ! 57: tidyAttrValue(attr)?printf("=\"%s\" ", ! 58: tidyAttrValue(attr)):printf(" "); ! 59: } ! 60: printf(">\n"); ! 61: } ! 62: else { ! 63: /* if it doesn't have a name, then it's probably text, cdata, etc... */ ! 64: TidyBuffer buf; ! 65: tidyBufInit(&buf); ! 66: tidyNodeGetText(doc, child, &buf); ! 67: printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:""); ! 68: tidyBufFree(&buf); ! 69: } ! 70: dumpNode(doc, child, indent + 4); /* recursive */ ! 71: } ! 72: } ! 73: ! 74: ! 75: int main(int argc, char **argv) ! 76: { ! 77: if(argc == 2) { ! 78: CURL *curl; ! 79: char curl_errbuf[CURL_ERROR_SIZE]; ! 80: TidyDoc tdoc; ! 81: TidyBuffer docbuf = {0}; ! 82: TidyBuffer tidy_errbuf = {0}; ! 83: int err; ! 84: ! 85: curl = curl_easy_init(); ! 86: curl_easy_setopt(curl, CURLOPT_URL, argv[1]); ! 87: curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf); ! 88: curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); ! 89: curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); ! 90: curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); ! 91: ! 92: tdoc = tidyCreate(); ! 93: tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */ ! 94: tidyOptSetInt(tdoc, TidyWrapLen, 4096); ! 95: tidySetErrorBuffer(tdoc, &tidy_errbuf); ! 96: tidyBufInit(&docbuf); ! 97: ! 98: curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf); ! 99: err = curl_easy_perform(curl); ! 100: if(!err) { ! 101: err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */ ! 102: if(err >= 0) { ! 103: err = tidyCleanAndRepair(tdoc); /* fix any problems */ ! 104: if(err >= 0) { ! 105: err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */ ! 106: if(err >= 0) { ! 107: dumpNode(tdoc, tidyGetRoot(tdoc), 0); /* walk the tree */ ! 108: fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */ ! 109: } ! 110: } ! 111: } ! 112: } ! 113: else ! 114: fprintf(stderr, "%s\n", curl_errbuf); ! 115: ! 116: /* clean-up */ ! 117: curl_easy_cleanup(curl); ! 118: tidyBufFree(&docbuf); ! 119: tidyBufFree(&tidy_errbuf); ! 120: tidyRelease(tdoc); ! 121: return err; ! 122: ! 123: } ! 124: else ! 125: printf("usage: %s <url>\n", argv[0]); ! 126: ! 127: return 0; ! 128: }