Annotation of embedaddon/curl/docs/examples/crawler.c, revision 1.1.1.1

1.1       misho       1: /***************************************************************************
                      2:  *                                  _   _ ____  _
                      3:  *  Project                     ___| | | |  _ \| |
                      4:  *                             / __| | | | |_) | |
                      5:  *                            | (__| |_| |  _ <| |___
                      6:  *                             \___|\___/|_| \_\_____|
                      7:  *
                      8:  * Web crawler based on curl and libxml2.
                      9:  * Copyright (C) 2018 - 2020 Jeroen Ooms <jeroenooms@gmail.com>
                     10:  * License: MIT
                     11:  *
                     12:  * To compile:
                     13:  *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
                     14:  *
                     15:  */
                     16: /* <DESC>
                     17:  * Web crawler based on curl and libxml2 to stress-test curl with
                     18:  * hundreds of concurrent connections to various servers.
                     19:  * </DESC>
                     20:  */
                     21: 
                     22: /* Parameters */
                     23: int max_con = 200;
                     24: int max_total = 20000;
                     25: int max_requests = 500;
                     26: int max_link_per_page = 5;
                     27: int follow_relative_links = 0;
                     28: char *start_page = "https://www.reuters.com";
                     29: 
                     30: #include <libxml/HTMLparser.h>
                     31: #include <libxml/xpath.h>
                     32: #include <libxml/uri.h>
                     33: #include <curl/curl.h>
                     34: #include <stdlib.h>
                     35: #include <string.h>
                     36: #include <math.h>
                     37: #include <signal.h>
                     38: 
                     39: int pending_interrupt = 0;
                     40: void sighandler(int dummy)
                     41: {
                     42:   pending_interrupt = 1;
                     43: }
                     44: 
                     45: /* resizable buffer */
                     46: typedef struct {
                     47:   char *buf;
                     48:   size_t size;
                     49: } memory;
                     50: 
                     51: size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
                     52: {
                     53:   size_t realsize = sz * nmemb;
                     54:   memory *mem = (memory*) ctx;
                     55:   char *ptr = realloc(mem->buf, mem->size + realsize);
                     56:   if(!ptr) {
                     57:     /* out of memory */
                     58:     printf("not enough memory (realloc returned NULL)\n");
                     59:     return 0;
                     60:   }
                     61:   mem->buf = ptr;
                     62:   memcpy(&(mem->buf[mem->size]), contents, realsize);
                     63:   mem->size += realsize;
                     64:   return realsize;
                     65: }
                     66: 
                     67: CURL *make_handle(char *url)
                     68: {
                     69:   CURL *handle = curl_easy_init();
                     70: 
                     71:   /* Important: use HTTP2 over HTTPS */
                     72:   curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
                     73:   curl_easy_setopt(handle, CURLOPT_URL, url);
                     74: 
                     75:   /* buffer body */
                     76:   memory *mem = malloc(sizeof(memory));
                     77:   mem->size = 0;
                     78:   mem->buf = malloc(1);
                     79:   curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
                     80:   curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
                     81:   curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
                     82: 
                     83:   /* For completeness */
                     84:   curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
                     85:   curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
                     86:   curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
                     87:   curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
                     88:   curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
                     89:   curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
                     90:   curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
                     91:   curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
                     92:   curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
                     93:   curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
                     94:   curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
                     95:   curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
                     96:   return handle;
                     97: }
                     98: 
                     99: /* HREF finder implemented in libxml2 but could be any HTML parser */
                    100: size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
                    101: {
                    102:   int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
                    103:              HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
                    104:   htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
                    105:   if(!doc)
                    106:     return 0;
                    107:   xmlChar *xpath = (xmlChar*) "//a/@href";
                    108:   xmlXPathContextPtr context = xmlXPathNewContext(doc);
                    109:   xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
                    110:   xmlXPathFreeContext(context);
                    111:   if(!result)
                    112:     return 0;
                    113:   xmlNodeSetPtr nodeset = result->nodesetval;
                    114:   if(xmlXPathNodeSetIsEmpty(nodeset)) {
                    115:     xmlXPathFreeObject(result);
                    116:     return 0;
                    117:   }
                    118:   size_t count = 0;
                    119:   int i;
                    120:   for(i = 0; i < nodeset->nodeNr; i++) {
                    121:     double r = rand();
                    122:     int x = r * nodeset->nodeNr / RAND_MAX;
                    123:     const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
                    124:     xmlChar *href = xmlNodeListGetString(doc, node, 1);
                    125:     if(follow_relative_links) {
                    126:       xmlChar *orig = href;
                    127:       href = xmlBuildURI(href, (xmlChar *) url);
                    128:       xmlFree(orig);
                    129:     }
                    130:     char *link = (char *) href;
                    131:     if(!link || strlen(link) < 20)
                    132:       continue;
                    133:     if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
                    134:       curl_multi_add_handle(multi_handle, make_handle(link));
                    135:       if(count++ == max_link_per_page)
                    136:         break;
                    137:     }
                    138:     xmlFree(link);
                    139:   }
                    140:   xmlXPathFreeObject(result);
                    141:   return count;
                    142: }
                    143: 
                    144: int is_html(char *ctype)
                    145: {
                    146:   return ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html");
                    147: }
                    148: 
                    149: int main(void)
                    150: {
                    151:   signal(SIGINT, sighandler);
                    152:   LIBXML_TEST_VERSION;
                    153:   curl_global_init(CURL_GLOBAL_DEFAULT);
                    154:   CURLM *multi_handle = curl_multi_init();
                    155:   curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
                    156:   curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
                    157: 
                    158:   /* enables http/2 if available */
                    159: #ifdef CURLPIPE_MULTIPLEX
                    160:   curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
                    161: #endif
                    162: 
                    163:   /* sets html start page */
                    164:   curl_multi_add_handle(multi_handle, make_handle(start_page));
                    165: 
                    166:   int msgs_left;
                    167:   int pending = 0;
                    168:   int complete = 0;
                    169:   int still_running = 1;
                    170:   while(still_running && !pending_interrupt) {
                    171:     int numfds;
                    172:     curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
                    173:     curl_multi_perform(multi_handle, &still_running);
                    174: 
                    175:     /* See how the transfers went */
                    176:     CURLMsg *m = NULL;
                    177:     while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
                    178:       if(m->msg == CURLMSG_DONE) {
                    179:         CURL *handle = m->easy_handle;
                    180:         char *url;
                    181:         memory *mem;
                    182:         curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
                    183:         curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
                    184:         if(m->data.result == CURLE_OK) {
                    185:           long res_status;
                    186:           curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
                    187:           if(res_status == 200) {
                    188:             char *ctype;
                    189:             curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
                    190:             printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
                    191:             if(is_html(ctype) && mem->size > 100) {
                    192:               if(pending < max_requests && (complete + pending) < max_total) {
                    193:                 pending += follow_links(multi_handle, mem, url);
                    194:                 still_running = 1;
                    195:               }
                    196:             }
                    197:           }
                    198:           else {
                    199:             printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
                    200:           }
                    201:         }
                    202:         else {
                    203:           printf("[%d] Connection failure: %s\n", complete, url);
                    204:         }
                    205:         curl_multi_remove_handle(multi_handle, handle);
                    206:         curl_easy_cleanup(handle);
                    207:         free(mem->buf);
                    208:         free(mem);
                    209:         complete++;
                    210:         pending--;
                    211:       }
                    212:     }
                    213:   }
                    214:   curl_multi_cleanup(multi_handle);
                    215:   curl_global_cleanup();
                    216:   return 0;
                    217: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>