File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / curl / docs / examples / htmltidy.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Jun 3 10:01:15 2020 UTC (4 years, 10 months ago) by misho
Branches: curl, MAIN
CVS tags: v7_70_0p4, HEAD
curl

    1: /***************************************************************************
    2:  *                                  _   _ ____  _
    3:  *  Project                     ___| | | |  _ \| |
    4:  *                             / __| | | | |_) | |
    5:  *                            | (__| |_| |  _ <| |___
    6:  *                             \___|\___/|_| \_\_____|
    7:  *
    8:  * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
    9:  *
   10:  * This software is licensed as described in the file COPYING, which
   11:  * you should have received as part of this distribution. The terms
   12:  * are also available at https://curl.haxx.se/docs/copyright.html.
   13:  *
   14:  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
   15:  * copies of the Software, and permit persons to whom the Software is
   16:  * furnished to do so, under the terms of the COPYING file.
   17:  *
   18:  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
   19:  * KIND, either express or implied.
   20:  *
   21:  ***************************************************************************/
   22: /* <DESC>
   23:  * Download a document and use libtidy to parse the HTML.
   24:  * </DESC>
   25:  */
   26: /*
   27:  * LibTidy => https://www.html-tidy.org/
   28:  */
   29: 
   30: #include <stdio.h>
   31: #include <tidy.h>
   32: #include <tidybuffio.h>
   33: #include <curl/curl.h>
   34: 
   35: /* curl write callback, to fill tidy's input buffer...  */
   36: uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out)
   37: {
   38:   uint r;
   39:   r = size * nmemb;
   40:   tidyBufAppend(out, in, r);
   41:   return r;
   42: }
   43: 
   44: /* Traverse the document tree */
   45: void dumpNode(TidyDoc doc, TidyNode tnod, int indent)
   46: {
   47:   TidyNode child;
   48:   for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) {
   49:     ctmbstr name = tidyNodeGetName(child);
   50:     if(name) {
   51:       /* if it has a name, then it's an HTML tag ... */
   52:       TidyAttr attr;
   53:       printf("%*.*s%s ", indent, indent, "<", name);
   54:       /* walk the attribute list */
   55:       for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) {
   56:         printf(tidyAttrName(attr));
   57:         tidyAttrValue(attr)?printf("=\"%s\" ",
   58:                                    tidyAttrValue(attr)):printf(" ");
   59:       }
   60:       printf(">\n");
   61:     }
   62:     else {
   63:       /* if it doesn't have a name, then it's probably text, cdata, etc... */
   64:       TidyBuffer buf;
   65:       tidyBufInit(&buf);
   66:       tidyNodeGetText(doc, child, &buf);
   67:       printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:"");
   68:       tidyBufFree(&buf);
   69:     }
   70:     dumpNode(doc, child, indent + 4); /* recursive */
   71:   }
   72: }
   73: 
   74: 
   75: int main(int argc, char **argv)
   76: {
   77:   if(argc == 2) {
   78:     CURL *curl;
   79:     char curl_errbuf[CURL_ERROR_SIZE];
   80:     TidyDoc tdoc;
   81:     TidyBuffer docbuf = {0};
   82:     TidyBuffer tidy_errbuf = {0};
   83:     int err;
   84: 
   85:     curl = curl_easy_init();
   86:     curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
   87:     curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf);
   88:     curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
   89:     curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
   90:     curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
   91: 
   92:     tdoc = tidyCreate();
   93:     tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */
   94:     tidyOptSetInt(tdoc, TidyWrapLen, 4096);
   95:     tidySetErrorBuffer(tdoc, &tidy_errbuf);
   96:     tidyBufInit(&docbuf);
   97: 
   98:     curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf);
   99:     err = curl_easy_perform(curl);
  100:     if(!err) {
  101:       err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */
  102:       if(err >= 0) {
  103:         err = tidyCleanAndRepair(tdoc); /* fix any problems */
  104:         if(err >= 0) {
  105:           err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */
  106:           if(err >= 0) {
  107:             dumpNode(tdoc, tidyGetRoot(tdoc), 0); /* walk the tree */
  108:             fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */
  109:           }
  110:         }
  111:       }
  112:     }
  113:     else
  114:       fprintf(stderr, "%s\n", curl_errbuf);
  115: 
  116:     /* clean-up */
  117:     curl_easy_cleanup(curl);
  118:     tidyBufFree(&docbuf);
  119:     tidyBufFree(&tidy_errbuf);
  120:     tidyRelease(tdoc);
  121:     return err;
  122: 
  123:   }
  124:   else
  125:     printf("usage: %s <url>\n", argv[0]);
  126: 
  127:   return 0;
  128: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>