/* Return to crawler.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / curl / docs / examples | */
1.1 ! misho 1: /*************************************************************************** ! 2: * _ _ ____ _ ! 3: * Project ___| | | | _ \| | ! 4: * / __| | | | |_) | | ! 5: * | (__| |_| | _ <| |___ ! 6: * \___|\___/|_| \_\_____| ! 7: * ! 8: * Web crawler based on curl and libxml2. ! 9: * Copyright (C) 2018 - 2020 Jeroen Ooms <jeroenooms@gmail.com> ! 10: * License: MIT ! 11: * ! 12: * To compile: ! 13: * gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl) ! 14: * ! 15: */ ! 16: /* <DESC> ! 17: * Web crawler based on curl and libxml2 to stress-test curl with ! 18: * hundreds of concurrent connections to various servers. ! 19: * </DESC> ! 20: */ ! 21: ! 22: /* Parameters */ ! 23: int max_con = 200; ! 24: int max_total = 20000; ! 25: int max_requests = 500; ! 26: int max_link_per_page = 5; ! 27: int follow_relative_links = 0; ! 28: char *start_page = "https://www.reuters.com"; ! 29: ! 30: #include <libxml/HTMLparser.h> ! 31: #include <libxml/xpath.h> ! 32: #include <libxml/uri.h> ! 33: #include <curl/curl.h> ! 34: #include <stdlib.h> ! 35: #include <string.h> ! 36: #include <math.h> ! 37: #include <signal.h> ! 38: ! 39: int pending_interrupt = 0; ! 40: void sighandler(int dummy) ! 41: { ! 42: pending_interrupt = 1; ! 43: } ! 44: ! 45: /* resizable buffer */ ! 46: typedef struct { ! 47: char *buf; ! 48: size_t size; ! 49: } memory; ! 50: ! 51: size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx) ! 52: { ! 53: size_t realsize = sz * nmemb; ! 54: memory *mem = (memory*) ctx; ! 55: char *ptr = realloc(mem->buf, mem->size + realsize); ! 56: if(!ptr) { ! 57: /* out of memory */ ! 58: printf("not enough memory (realloc returned NULL)\n"); ! 59: return 0; ! 60: } ! 61: mem->buf = ptr; ! 62: memcpy(&(mem->buf[mem->size]), contents, realsize); ! 63: mem->size += realsize; ! 64: return realsize; ! 65: } ! 66: ! 67: CURL *make_handle(char *url) ! 68: { ! 69: CURL *handle = curl_easy_init(); ! 70: ! 71: /* Important: use HTTP2 over HTTPS */ ! 
72: curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS); ! 73: curl_easy_setopt(handle, CURLOPT_URL, url); ! 74: ! 75: /* buffer body */ ! 76: memory *mem = malloc(sizeof(memory)); ! 77: mem->size = 0; ! 78: mem->buf = malloc(1); ! 79: curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer); ! 80: curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem); ! 81: curl_easy_setopt(handle, CURLOPT_PRIVATE, mem); ! 82: ! 83: /* For completeness */ ! 84: curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, ""); ! 85: curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L); ! 86: curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L); ! 87: curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L); ! 88: curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L); ! 89: curl_easy_setopt(handle, CURLOPT_COOKIEFILE, ""); ! 90: curl_easy_setopt(handle, CURLOPT_FILETIME, 1L); ! 91: curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler"); ! 92: curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY); ! 93: curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L); ! 94: curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY); ! 95: curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L); ! 96: return handle; ! 97: } ! 98: ! 99: /* HREF finder implemented in libxml2 but could be any HTML parser */ ! 100: size_t follow_links(CURLM *multi_handle, memory *mem, char *url) ! 101: { ! 102: int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \ ! 103: HTML_PARSE_NOWARNING | HTML_PARSE_NONET; ! 104: htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts); ! 105: if(!doc) ! 106: return 0; ! 107: xmlChar *xpath = (xmlChar*) "//a/@href"; ! 108: xmlXPathContextPtr context = xmlXPathNewContext(doc); ! 109: xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context); ! 110: xmlXPathFreeContext(context); ! 111: if(!result) ! 112: return 0; ! 113: xmlNodeSetPtr nodeset = result->nodesetval; ! 114: if(xmlXPathNodeSetIsEmpty(nodeset)) { ! 115: xmlXPathFreeObject(result); ! 
116: return 0; ! 117: } ! 118: size_t count = 0; ! 119: int i; ! 120: for(i = 0; i < nodeset->nodeNr; i++) { ! 121: double r = rand(); ! 122: int x = r * nodeset->nodeNr / RAND_MAX; ! 123: const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode; ! 124: xmlChar *href = xmlNodeListGetString(doc, node, 1); ! 125: if(follow_relative_links) { ! 126: xmlChar *orig = href; ! 127: href = xmlBuildURI(href, (xmlChar *) url); ! 128: xmlFree(orig); ! 129: } ! 130: char *link = (char *) href; ! 131: if(!link || strlen(link) < 20) ! 132: continue; ! 133: if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) { ! 134: curl_multi_add_handle(multi_handle, make_handle(link)); ! 135: if(count++ == max_link_per_page) ! 136: break; ! 137: } ! 138: xmlFree(link); ! 139: } ! 140: xmlXPathFreeObject(result); ! 141: return count; ! 142: } ! 143: ! 144: int is_html(char *ctype) ! 145: { ! 146: return ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html"); ! 147: } ! 148: ! 149: int main(void) ! 150: { ! 151: signal(SIGINT, sighandler); ! 152: LIBXML_TEST_VERSION; ! 153: curl_global_init(CURL_GLOBAL_DEFAULT); ! 154: CURLM *multi_handle = curl_multi_init(); ! 155: curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con); ! 156: curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L); ! 157: ! 158: /* enables http/2 if available */ ! 159: #ifdef CURLPIPE_MULTIPLEX ! 160: curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX); ! 161: #endif ! 162: ! 163: /* sets html start page */ ! 164: curl_multi_add_handle(multi_handle, make_handle(start_page)); ! 165: ! 166: int msgs_left; ! 167: int pending = 0; ! 168: int complete = 0; ! 169: int still_running = 1; ! 170: while(still_running && !pending_interrupt) { ! 171: int numfds; ! 172: curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); ! 173: curl_multi_perform(multi_handle, &still_running); ! 174: ! 175: /* See how the transfers went */ ! 176: CURLMsg *m = NULL; ! 
177: while((m = curl_multi_info_read(multi_handle, &msgs_left))) { ! 178: if(m->msg == CURLMSG_DONE) { ! 179: CURL *handle = m->easy_handle; ! 180: char *url; ! 181: memory *mem; ! 182: curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem); ! 183: curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url); ! 184: if(m->data.result == CURLE_OK) { ! 185: long res_status; ! 186: curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status); ! 187: if(res_status == 200) { ! 188: char *ctype; ! 189: curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype); ! 190: printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url); ! 191: if(is_html(ctype) && mem->size > 100) { ! 192: if(pending < max_requests && (complete + pending) < max_total) { ! 193: pending += follow_links(multi_handle, mem, url); ! 194: still_running = 1; ! 195: } ! 196: } ! 197: } ! 198: else { ! 199: printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url); ! 200: } ! 201: } ! 202: else { ! 203: printf("[%d] Connection failure: %s\n", complete, url); ! 204: } ! 205: curl_multi_remove_handle(multi_handle, handle); ! 206: curl_easy_cleanup(handle); ! 207: free(mem->buf); ! 208: free(mem); ! 209: complete++; ! 210: pending--; ! 211: } ! 212: } ! 213: } ! 214: curl_multi_cleanup(multi_handle); ! 215: curl_global_cleanup(); ! 216: return 0; ! 217: }