File: [ELWIX - Embedded LightWeight unIX] / embedaddon / curl / docs / examples / crawler.c
Revision 1.1.1.1 (vendor branch), Wed Jun 3 10:01:15 2020 UTC, by misho
Branches: curl, MAIN
CVS tags: v7_70_0p4, HEAD

/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Web crawler based on curl and libxml2.
 * Copyright (C) 2018 - 2020 Jeroen Ooms <jeroenooms@gmail.com>
 * License: MIT
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */

/* Parameters */
int max_con = 200;
int max_total = 20000;
int max_requests = 500;
int max_link_per_page = 5;
int follow_relative_links = 0;
char *start_page = "https://www.reuters.com";

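/*
 * How the knobs above are used: max_con caps concurrent connections via
 * CURLMOPT_MAX_TOTAL_CONNECTIONS, new links stop being queued once completed
 * plus pending transfers reach max_total or pending alone reaches
 * max_requests, at most max_link_per_page links are queued from each parsed
 * page, and follow_relative_links resolves relative hrefs against the page
 * URL before queueing them.
 */
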
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>

/* set from the SIGINT handler so the main loop can stop cleanly */
volatile sig_atomic_t pending_interrupt = 0;
void sighandler(int dummy)
{
  (void)dummy;
  pending_interrupt = 1;
}

/* resizable buffer */
typedef struct {
  char *buf;
  size_t size;
} memory;

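/*
 * CURLOPT_WRITEFUNCTION callback: libcurl passes the response body to this
 * function in chunks. It must return the number of bytes it handled;
 * returning 0 here on a failed realloc makes libcurl abort the transfer
 * with a write error.
 */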
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  size_t realsize = sz * nmemb;
  memory *mem = (memory*) ctx;
  char *ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* out of memory */
    printf("not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}

CURL *make_handle(char *url)
{
  CURL *handle = curl_easy_init();

  /* Important: request HTTP/2 over HTTPS, falling back to HTTP/1.1
     for plain HTTP or servers without HTTP/2 support */
  curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
  curl_easy_setopt(handle, CURLOPT_URL, url);

  /* buffer body */
  memory *mem = malloc(sizeof(memory));
  mem->size = 0;
  mem->buf = malloc(1);
  curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
  curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
  curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
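  /* CURLOPT_PRIVATE stashes the buffer pointer on the handle itself so that
     main() can get it back with CURLINFO_PRIVATE once the transfer is done */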

  /* For completeness */
  curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
  curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
  curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
  curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
  curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
  curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
  curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
  return handle;
}

/* HREF finder implemented in libxml2 but could be any HTML parser */
size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
{
  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
  if(!doc)
    return 0;
  xmlChar *xpath = (xmlChar*) "//a/@href";
  xmlXPathContextPtr context = xmlXPathNewContext(doc);
  xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
  xmlXPathFreeContext(context);
  if(!result) {
    xmlFreeDoc(doc);
    return 0;
  }
  xmlNodeSetPtr nodeset = result->nodesetval;
  if(xmlXPathNodeSetIsEmpty(nodeset)) {
    xmlXPathFreeObject(result);
    xmlFreeDoc(doc);
    return 0;
  }
  size_t count = 0;
  int i;
  for(i = 0; i < nodeset->nodeNr; i++) {
    /* pick a random href from the page instead of crawling them in order */
    int x = rand() % nodeset->nodeNr;
    const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
    xmlChar *href = xmlNodeListGetString(doc, node, 1);
    if(follow_relative_links) {
      xmlChar *orig = href;
      href = xmlBuildURI(href, (xmlChar *) url);
      xmlFree(orig);
    }
    char *link = (char *) href;
    if(!link || strlen(link) < 20) {
      if(link)
        xmlFree(link);
      continue;
    }
    if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
      curl_multi_add_handle(multi_handle, make_handle(link));
      count++;
    }
    xmlFree(link);
    if(count == (size_t) max_link_per_page)
      break;
  }
  xmlXPathFreeObject(result);
  xmlFreeDoc(doc);
  return count;
}

int is_html(char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}

int main(void)
{
  signal(SIGINT, sighandler);
  LIBXML_TEST_VERSION;
  curl_global_init(CURL_GLOBAL_DEFAULT);
  CURLM *multi_handle = curl_multi_init();
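  /* cap total concurrency and keep per-host connections modest so no single
     server is hammered */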
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, (long)max_con);
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);

  /* enable HTTP/2 multiplexing when the libcurl build supports it */
#ifdef CURLPIPE_MULTIPLEX
  curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
#endif

  /* seed the crawl with the start page */
  curl_multi_add_handle(multi_handle, make_handle(start_page));

  int msgs_left;
  int pending = 0;
  int complete = 0;
  int still_running = 1;
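  /* transfer loop: wait up to a second for activity, drive all transfers,
     then reap the ones that finished */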
  while(still_running && !pending_interrupt) {
    int numfds;
    curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    curl_multi_perform(multi_handle, &still_running);

    /* See how the transfers went */
    CURLMsg *m = NULL;
    while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
      if(m->msg == CURLMSG_DONE) {
        CURL *handle = m->easy_handle;
        char *url;
        memory *mem;
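        /* recover the buffer attached with CURLOPT_PRIVATE and the
           effective URL after any redirects */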
        curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
        curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
        if(m->data.result == CURLE_OK) {
          long res_status;
          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
          if(res_status == 200) {
            char *ctype;
            curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
            printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
            if(is_html(ctype) && mem->size > 100) {
              if(pending < max_requests && (complete + pending) < max_total) {
                pending += follow_links(multi_handle, mem, url);
                still_running = 1;
              }
            }
          }
          else {
            printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
          }
        }
        else {
          printf("[%d] Connection failure: %s\n", complete, url);
        }
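        /* detach the finished easy handle from the multi handle before
           freeing it and its buffer */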
        curl_multi_remove_handle(multi_handle, handle);
        curl_easy_cleanup(handle);
        free(mem->buf);
        free(mem);
        complete++;
        pending--;
      }
    }
  }
  curl_multi_cleanup(multi_handle);
  curl_global_cleanup();
  return 0;
}
