Annotation of embedaddon/curl/docs/examples/crawler.c, revision 1.1
1.1 ! misho 1: /***************************************************************************
! 2: * _ _ ____ _
! 3: * Project ___| | | | _ \| |
! 4: * / __| | | | |_) | |
! 5: * | (__| |_| | _ <| |___
! 6: * \___|\___/|_| \_\_____|
! 7: *
! 8: * Web crawler based on curl and libxml2.
! 9: * Copyright (C) 2018 - 2020 Jeroen Ooms <jeroenooms@gmail.com>
! 10: * License: MIT
! 11: *
! 12: * To compile:
! 13: * gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
! 14: *
! 15: */
! 16: /* <DESC>
! 17: * Web crawler based on curl and libxml2 to stress-test curl with
! 18: * hundreds of concurrent connections to various servers.
! 19: * </DESC>
! 20: */
! 21:
/* Parameters */
int max_con = 200;             /* cap on simultaneous connections
                                  (passed to CURLMOPT_MAX_TOTAL_CONNECTIONS) */
int max_total = 20000;         /* stop following links once
                                  complete + pending reaches this */
int max_requests = 500;        /* do not queue new links while this many
                                  transfers are already pending */
int max_link_per_page = 5;     /* limit on links queued from a single page */
int follow_relative_links = 0; /* when non-zero, resolve relative hrefs
                                  against the page URL via xmlBuildURI() */
char *start_page = "https://www.reuters.com"; /* crawl seed URL */
! 29:
#include <curl/curl.h>

#include <libxml/HTMLparser.h>
#include <libxml/uri.h>
#include <libxml/xpath.h>

#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
! 38:
/* Set from the SIGINT handler and polled by the crawl loop in main().
   volatile sig_atomic_t is the only object type the C standard guarantees
   can be safely written from a signal handler (C11 7.14.1.1); a plain int
   here is undefined behavior. */
volatile sig_atomic_t pending_interrupt = 0;

/* SIGINT handler: request a clean shutdown of the crawl loop. */
void sighandler(int dummy)
{
  (void)dummy; /* signal number, unused */
  pending_interrupt = 1;
}
! 44:
/* resizable buffer: accumulates the response body of one transfer */
typedef struct {
  char *buf;   /* heap-allocated data, grown with realloc() */
  size_t size; /* number of valid bytes in buf */
} memory;

/* libcurl write callback: append the received chunk to the memory buffer.
   Returns the number of bytes consumed; returning 0 on allocation failure
   makes libcurl abort the transfer with CURLE_WRITE_ERROR. */
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  size_t realsize = sz * nmemb;
  memory *mem = (memory*) ctx;
  /* realloc into a temporary so the old buffer stays valid on failure */
  char *ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* report on stderr, not stdout, so the diagnostic is not interleaved
       with the normal per-page crawl output */
    fprintf(stderr, "not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}
! 66:
! 67: CURL *make_handle(char *url)
! 68: {
! 69: CURL *handle = curl_easy_init();
! 70:
! 71: /* Important: use HTTP2 over HTTPS */
! 72: curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
! 73: curl_easy_setopt(handle, CURLOPT_URL, url);
! 74:
! 75: /* buffer body */
! 76: memory *mem = malloc(sizeof(memory));
! 77: mem->size = 0;
! 78: mem->buf = malloc(1);
! 79: curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
! 80: curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
! 81: curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
! 82:
! 83: /* For completeness */
! 84: curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
! 85: curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
! 86: curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
! 87: curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
! 88: curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
! 89: curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
! 90: curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
! 91: curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
! 92: curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
! 93: curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
! 94: curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
! 95: curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
! 96: return handle;
! 97: }
! 98:
! 99: /* HREF finder implemented in libxml2 but could be any HTML parser */
! 100: size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
! 101: {
! 102: int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
! 103: HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
! 104: htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
! 105: if(!doc)
! 106: return 0;
! 107: xmlChar *xpath = (xmlChar*) "//a/@href";
! 108: xmlXPathContextPtr context = xmlXPathNewContext(doc);
! 109: xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
! 110: xmlXPathFreeContext(context);
! 111: if(!result)
! 112: return 0;
! 113: xmlNodeSetPtr nodeset = result->nodesetval;
! 114: if(xmlXPathNodeSetIsEmpty(nodeset)) {
! 115: xmlXPathFreeObject(result);
! 116: return 0;
! 117: }
! 118: size_t count = 0;
! 119: int i;
! 120: for(i = 0; i < nodeset->nodeNr; i++) {
! 121: double r = rand();
! 122: int x = r * nodeset->nodeNr / RAND_MAX;
! 123: const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
! 124: xmlChar *href = xmlNodeListGetString(doc, node, 1);
! 125: if(follow_relative_links) {
! 126: xmlChar *orig = href;
! 127: href = xmlBuildURI(href, (xmlChar *) url);
! 128: xmlFree(orig);
! 129: }
! 130: char *link = (char *) href;
! 131: if(!link || strlen(link) < 20)
! 132: continue;
! 133: if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
! 134: curl_multi_add_handle(multi_handle, make_handle(link));
! 135: if(count++ == max_link_per_page)
! 136: break;
! 137: }
! 138: xmlFree(link);
! 139: }
! 140: xmlXPathFreeObject(result);
! 141: return count;
! 142: }
! 143:
/* Does the Content-Type header value denote an HTML document?
   A substring match accepts parameterized values such as
   "text/html; charset=utf-8" as well as a bare "text/html" — the previous
   strlen(ctype) > 10 guard wrongly rejected the latter, since
   strlen("text/html") is only 9. */
int is_html(char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}
! 148:
! 149: int main(void)
! 150: {
! 151: signal(SIGINT, sighandler);
! 152: LIBXML_TEST_VERSION;
! 153: curl_global_init(CURL_GLOBAL_DEFAULT);
! 154: CURLM *multi_handle = curl_multi_init();
! 155: curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
! 156: curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
! 157:
! 158: /* enables http/2 if available */
! 159: #ifdef CURLPIPE_MULTIPLEX
! 160: curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
! 161: #endif
! 162:
! 163: /* sets html start page */
! 164: curl_multi_add_handle(multi_handle, make_handle(start_page));
! 165:
! 166: int msgs_left;
! 167: int pending = 0;
! 168: int complete = 0;
! 169: int still_running = 1;
! 170: while(still_running && !pending_interrupt) {
! 171: int numfds;
! 172: curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
! 173: curl_multi_perform(multi_handle, &still_running);
! 174:
! 175: /* See how the transfers went */
! 176: CURLMsg *m = NULL;
! 177: while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
! 178: if(m->msg == CURLMSG_DONE) {
! 179: CURL *handle = m->easy_handle;
! 180: char *url;
! 181: memory *mem;
! 182: curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
! 183: curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
! 184: if(m->data.result == CURLE_OK) {
! 185: long res_status;
! 186: curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
! 187: if(res_status == 200) {
! 188: char *ctype;
! 189: curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
! 190: printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
! 191: if(is_html(ctype) && mem->size > 100) {
! 192: if(pending < max_requests && (complete + pending) < max_total) {
! 193: pending += follow_links(multi_handle, mem, url);
! 194: still_running = 1;
! 195: }
! 196: }
! 197: }
! 198: else {
! 199: printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
! 200: }
! 201: }
! 202: else {
! 203: printf("[%d] Connection failure: %s\n", complete, url);
! 204: }
! 205: curl_multi_remove_handle(multi_handle, handle);
! 206: curl_easy_cleanup(handle);
! 207: free(mem->buf);
! 208: free(mem);
! 209: complete++;
! 210: pending--;
! 211: }
! 212: }
! 213: }
! 214: curl_multi_cleanup(multi_handle);
! 215: curl_global_cleanup();
! 216: return 0;
! 217: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>