Annotation of embedaddon/curl/docs/examples/crawler.c, revision 1.1

1.1     ! misho       1: /***************************************************************************
        !             2:  *                                  _   _ ____  _
        !             3:  *  Project                     ___| | | |  _ \| |
        !             4:  *                             / __| | | | |_) | |
        !             5:  *                            | (__| |_| |  _ <| |___
        !             6:  *                             \___|\___/|_| \_\_____|
        !             7:  *
        !             8:  * Web crawler based on curl and libxml2.
        !             9:  * Copyright (C) 2018 - 2020 Jeroen Ooms <jeroenooms@gmail.com>
        !            10:  * License: MIT
        !            11:  *
        !            12:  * To compile:
        !            13:  *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
        !            14:  *
        !            15:  */
        !            16: /* <DESC>
        !            17:  * Web crawler based on curl and libxml2 to stress-test curl with
        !            18:  * hundreds of concurrent connections to various servers.
        !            19:  * </DESC>
        !            20:  */
        !            21: 
        !            22: /* Parameters */
        !            23: int max_con = 200;
        !            24: int max_total = 20000;
        !            25: int max_requests = 500;
        !            26: int max_link_per_page = 5;
        !            27: int follow_relative_links = 0;
        !            28: char *start_page = "https://www.reuters.com";
        !            29: 
        !            30: #include <libxml/HTMLparser.h>
        !            31: #include <libxml/xpath.h>
        !            32: #include <libxml/uri.h>
        !            33: #include <curl/curl.h>
        !            34: #include <stdlib.h>
        !            35: #include <string.h>
        !            36: #include <math.h>
        !            37: #include <signal.h>
        !            38: 
        !            39: int pending_interrupt = 0;
        !            40: void sighandler(int dummy)
        !            41: {
        !            42:   pending_interrupt = 1;
        !            43: }
        !            44: 
        !            45: /* resizable buffer */
        !            46: typedef struct {
        !            47:   char *buf;
        !            48:   size_t size;
        !            49: } memory;
        !            50: 
        !            51: size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
        !            52: {
        !            53:   size_t realsize = sz * nmemb;
        !            54:   memory *mem = (memory*) ctx;
        !            55:   char *ptr = realloc(mem->buf, mem->size + realsize);
        !            56:   if(!ptr) {
        !            57:     /* out of memory */
        !            58:     printf("not enough memory (realloc returned NULL)\n");
        !            59:     return 0;
        !            60:   }
        !            61:   mem->buf = ptr;
        !            62:   memcpy(&(mem->buf[mem->size]), contents, realsize);
        !            63:   mem->size += realsize;
        !            64:   return realsize;
        !            65: }
        !            66: 
        !            67: CURL *make_handle(char *url)
        !            68: {
        !            69:   CURL *handle = curl_easy_init();
        !            70: 
        !            71:   /* Important: use HTTP2 over HTTPS */
        !            72:   curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
        !            73:   curl_easy_setopt(handle, CURLOPT_URL, url);
        !            74: 
        !            75:   /* buffer body */
        !            76:   memory *mem = malloc(sizeof(memory));
        !            77:   mem->size = 0;
        !            78:   mem->buf = malloc(1);
        !            79:   curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
        !            80:   curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
        !            81:   curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
        !            82: 
        !            83:   /* For completeness */
        !            84:   curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
        !            85:   curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
        !            86:   curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
        !            87:   curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
        !            88:   curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
        !            89:   curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
        !            90:   curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
        !            91:   curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
        !            92:   curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
        !            93:   curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
        !            94:   curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
        !            95:   curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
        !            96:   return handle;
        !            97: }
        !            98: 
        !            99: /* HREF finder implemented in libxml2 but could be any HTML parser */
        !           100: size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
        !           101: {
        !           102:   int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
        !           103:              HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
        !           104:   htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
        !           105:   if(!doc)
        !           106:     return 0;
        !           107:   xmlChar *xpath = (xmlChar*) "//a/@href";
        !           108:   xmlXPathContextPtr context = xmlXPathNewContext(doc);
        !           109:   xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
        !           110:   xmlXPathFreeContext(context);
        !           111:   if(!result)
        !           112:     return 0;
        !           113:   xmlNodeSetPtr nodeset = result->nodesetval;
        !           114:   if(xmlXPathNodeSetIsEmpty(nodeset)) {
        !           115:     xmlXPathFreeObject(result);
        !           116:     return 0;
        !           117:   }
        !           118:   size_t count = 0;
        !           119:   int i;
        !           120:   for(i = 0; i < nodeset->nodeNr; i++) {
        !           121:     double r = rand();
        !           122:     int x = r * nodeset->nodeNr / RAND_MAX;
        !           123:     const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
        !           124:     xmlChar *href = xmlNodeListGetString(doc, node, 1);
        !           125:     if(follow_relative_links) {
        !           126:       xmlChar *orig = href;
        !           127:       href = xmlBuildURI(href, (xmlChar *) url);
        !           128:       xmlFree(orig);
        !           129:     }
        !           130:     char *link = (char *) href;
        !           131:     if(!link || strlen(link) < 20)
        !           132:       continue;
        !           133:     if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
        !           134:       curl_multi_add_handle(multi_handle, make_handle(link));
        !           135:       if(count++ == max_link_per_page)
        !           136:         break;
        !           137:     }
        !           138:     xmlFree(link);
        !           139:   }
        !           140:   xmlXPathFreeObject(result);
        !           141:   return count;
        !           142: }
        !           143: 
        !           144: int is_html(char *ctype)
        !           145: {
        !           146:   return ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html");
        !           147: }
        !           148: 
        !           149: int main(void)
        !           150: {
        !           151:   signal(SIGINT, sighandler);
        !           152:   LIBXML_TEST_VERSION;
        !           153:   curl_global_init(CURL_GLOBAL_DEFAULT);
        !           154:   CURLM *multi_handle = curl_multi_init();
        !           155:   curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
        !           156:   curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
        !           157: 
        !           158:   /* enables http/2 if available */
        !           159: #ifdef CURLPIPE_MULTIPLEX
        !           160:   curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
        !           161: #endif
        !           162: 
        !           163:   /* sets html start page */
        !           164:   curl_multi_add_handle(multi_handle, make_handle(start_page));
        !           165: 
        !           166:   int msgs_left;
        !           167:   int pending = 0;
        !           168:   int complete = 0;
        !           169:   int still_running = 1;
        !           170:   while(still_running && !pending_interrupt) {
        !           171:     int numfds;
        !           172:     curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
        !           173:     curl_multi_perform(multi_handle, &still_running);
        !           174: 
        !           175:     /* See how the transfers went */
        !           176:     CURLMsg *m = NULL;
        !           177:     while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
        !           178:       if(m->msg == CURLMSG_DONE) {
        !           179:         CURL *handle = m->easy_handle;
        !           180:         char *url;
        !           181:         memory *mem;
        !           182:         curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
        !           183:         curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
        !           184:         if(m->data.result == CURLE_OK) {
        !           185:           long res_status;
        !           186:           curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
        !           187:           if(res_status == 200) {
        !           188:             char *ctype;
        !           189:             curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
        !           190:             printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
        !           191:             if(is_html(ctype) && mem->size > 100) {
        !           192:               if(pending < max_requests && (complete + pending) < max_total) {
        !           193:                 pending += follow_links(multi_handle, mem, url);
        !           194:                 still_running = 1;
        !           195:               }
        !           196:             }
        !           197:           }
        !           198:           else {
        !           199:             printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
        !           200:           }
        !           201:         }
        !           202:         else {
        !           203:           printf("[%d] Connection failure: %s\n", complete, url);
        !           204:         }
        !           205:         curl_multi_remove_handle(multi_handle, handle);
        !           206:         curl_easy_cleanup(handle);
        !           207:         free(mem->buf);
        !           208:         free(mem);
        !           209:         complete++;
        !           210:         pending--;
        !           211:       }
        !           212:     }
        !           213:   }
        !           214:   curl_multi_cleanup(multi_handle);
        !           215:   curl_global_cleanup();
        !           216:   return 0;
        !           217: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>