Annotation of embedaddon/curl/docs/examples/htmltidy.c, revision 1.1.1.1

1.1       misho       1: /***************************************************************************
                      2:  *                                  _   _ ____  _
                      3:  *  Project                     ___| | | |  _ \| |
                      4:  *                             / __| | | | |_) | |
                      5:  *                            | (__| |_| |  _ <| |___
                      6:  *                             \___|\___/|_| \_\_____|
                      7:  *
                      8:  * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
                      9:  *
                     10:  * This software is licensed as described in the file COPYING, which
                     11:  * you should have received as part of this distribution. The terms
                     12:  * are also available at https://curl.haxx.se/docs/copyright.html.
                     13:  *
                     14:  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
                     15:  * copies of the Software, and permit persons to whom the Software is
                     16:  * furnished to do so, under the terms of the COPYING file.
                     17:  *
                     18:  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
                     19:  * KIND, either express or implied.
                     20:  *
                     21:  ***************************************************************************/
                     22: /* <DESC>
                     23:  * Download a document and use libtidy to parse the HTML.
                     24:  * </DESC>
                     25:  */
                     26: /*
                     27:  * LibTidy => https://www.html-tidy.org/
                     28:  */
                     29: 
                     30: #include <stdio.h>
                     31: #include <tidy.h>
                     32: #include <tidybuffio.h>
                     33: #include <curl/curl.h>
                     34: 
                     35: /* curl write callback, to fill tidy's input buffer...  */
                     36: uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out)
                     37: {
                     38:   uint r;
                     39:   r = size * nmemb;
                     40:   tidyBufAppend(out, in, r);
                     41:   return r;
                     42: }
                     43: 
                     44: /* Traverse the document tree */
                     45: void dumpNode(TidyDoc doc, TidyNode tnod, int indent)
                     46: {
                     47:   TidyNode child;
                     48:   for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) {
                     49:     ctmbstr name = tidyNodeGetName(child);
                     50:     if(name) {
                     51:       /* if it has a name, then it's an HTML tag ... */
                     52:       TidyAttr attr;
                     53:       printf("%*.*s%s ", indent, indent, "<", name);
                     54:       /* walk the attribute list */
                     55:       for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) {
                     56:         printf(tidyAttrName(attr));
                     57:         tidyAttrValue(attr)?printf("=\"%s\" ",
                     58:                                    tidyAttrValue(attr)):printf(" ");
                     59:       }
                     60:       printf(">\n");
                     61:     }
                     62:     else {
                     63:       /* if it doesn't have a name, then it's probably text, cdata, etc... */
                     64:       TidyBuffer buf;
                     65:       tidyBufInit(&buf);
                     66:       tidyNodeGetText(doc, child, &buf);
                     67:       printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:"");
                     68:       tidyBufFree(&buf);
                     69:     }
                     70:     dumpNode(doc, child, indent + 4); /* recursive */
                     71:   }
                     72: }
                     73: 
                     74: 
                     75: int main(int argc, char **argv)
                     76: {
                     77:   if(argc == 2) {
                     78:     CURL *curl;
                     79:     char curl_errbuf[CURL_ERROR_SIZE];
                     80:     TidyDoc tdoc;
                     81:     TidyBuffer docbuf = {0};
                     82:     TidyBuffer tidy_errbuf = {0};
                     83:     int err;
                     84: 
                     85:     curl = curl_easy_init();
                     86:     curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
                     87:     curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf);
                     88:     curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
                     89:     curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
                     90:     curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
                     91: 
                     92:     tdoc = tidyCreate();
                     93:     tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */
                     94:     tidyOptSetInt(tdoc, TidyWrapLen, 4096);
                     95:     tidySetErrorBuffer(tdoc, &tidy_errbuf);
                     96:     tidyBufInit(&docbuf);
                     97: 
                     98:     curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf);
                     99:     err = curl_easy_perform(curl);
                    100:     if(!err) {
                    101:       err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */
                    102:       if(err >= 0) {
                    103:         err = tidyCleanAndRepair(tdoc); /* fix any problems */
                    104:         if(err >= 0) {
                    105:           err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */
                    106:           if(err >= 0) {
                    107:             dumpNode(tdoc, tidyGetRoot(tdoc), 0); /* walk the tree */
                    108:             fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */
                    109:           }
                    110:         }
                    111:       }
                    112:     }
                    113:     else
                    114:       fprintf(stderr, "%s\n", curl_errbuf);
                    115: 
                    116:     /* clean-up */
                    117:     curl_easy_cleanup(curl);
                    118:     tidyBufFree(&docbuf);
                    119:     tidyBufFree(&tidy_errbuf);
                    120:     tidyRelease(tdoc);
                    121:     return err;
                    122: 
                    123:   }
                    124:   else
                    125:     printf("usage: %s <url>\n", argv[0]);
                    126: 
                    127:   return 0;
                    128: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>