Annotation of embedaddon/curl/docs/examples/crawler.c, revision 1.1.1.1
1.1 misho 1: /***************************************************************************
2: * _ _ ____ _
3: * Project ___| | | | _ \| |
4: * / __| | | | |_) | |
5: * | (__| |_| | _ <| |___
6: * \___|\___/|_| \_\_____|
7: *
8: * Web crawler based on curl and libxml2.
9: * Copyright (C) 2018 - 2020 Jeroen Ooms <jeroenooms@gmail.com>
10: * License: MIT
11: *
12: * To compile:
13: * gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
14: *
15: */
16: /* <DESC>
17: * Web crawler based on curl and libxml2 to stress-test curl with
18: * hundreds of concurrent connections to various servers.
19: * </DESC>
20: */
21:
/* Parameters */
int max_con = 200;              /* cap on simultaneous connections
                                   (fed to CURLMOPT_MAX_TOTAL_CONNECTIONS) */
int max_total = 20000;          /* stop queuing once completed + pending
                                   transfers reach this total */
int max_requests = 500;         /* do not follow new links while this many
                                   transfers are still pending */
int max_link_per_page = 5;      /* links followed from a single parsed page */
int follow_relative_links = 0;  /* nonzero: resolve relative hrefs against
                                   the page URL; 0: absolute hrefs only */
char *start_page = "https://www.reuters.com";  /* crawl seed URL */
29:
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <stdio.h>   /* printf/fprintf were used without this include */
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>
38:
/* Set when the user presses Ctrl-C; polled by the main transfer loop.
 * volatile sig_atomic_t is the only object type the C standard guarantees
 * may be safely written from an asynchronous signal handler (C11 7.14.1.1);
 * a plain int here was undefined behavior. */
volatile sig_atomic_t pending_interrupt = 0;
void sighandler(int dummy)
{
  (void)dummy;  /* unused; signature required by signal() */
  pending_interrupt = 1;
}
44:
/* resizable buffer */
typedef struct {
  char *buf;    /* heap buffer, grown with realloc; freed by the completion
                   path in main() via the handle's CURLOPT_PRIVATE pointer */
  size_t size;  /* bytes currently stored in buf */
} memory;

/* CURLOPT_WRITEFUNCTION callback: append the received chunk to the
 * memory buffer passed via CURLOPT_WRITEDATA.
 * Returns the number of bytes consumed; returning 0 on allocation
 * failure makes libcurl abort the transfer with CURLE_WRITE_ERROR. */
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  size_t realsize = sz * nmemb;
  memory *mem = (memory *) ctx;
  /* realloc into a temporary so mem->buf stays valid (and freeable)
     if the allocation fails */
  char *ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* diagnostics belong on stderr: stdout carries the crawl progress
       lines printed by main() */
    fprintf(stderr, "not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}
66:
67: CURL *make_handle(char *url)
68: {
69: CURL *handle = curl_easy_init();
70:
71: /* Important: use HTTP2 over HTTPS */
72: curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
73: curl_easy_setopt(handle, CURLOPT_URL, url);
74:
75: /* buffer body */
76: memory *mem = malloc(sizeof(memory));
77: mem->size = 0;
78: mem->buf = malloc(1);
79: curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
80: curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
81: curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
82:
83: /* For completeness */
84: curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
85: curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
86: curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
87: curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
88: curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
89: curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
90: curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
91: curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
92: curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
93: curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
94: curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
95: curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
96: return handle;
97: }
98:
99: /* HREF finder implemented in libxml2 but could be any HTML parser */
100: size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
101: {
102: int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
103: HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
104: htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
105: if(!doc)
106: return 0;
107: xmlChar *xpath = (xmlChar*) "//a/@href";
108: xmlXPathContextPtr context = xmlXPathNewContext(doc);
109: xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
110: xmlXPathFreeContext(context);
111: if(!result)
112: return 0;
113: xmlNodeSetPtr nodeset = result->nodesetval;
114: if(xmlXPathNodeSetIsEmpty(nodeset)) {
115: xmlXPathFreeObject(result);
116: return 0;
117: }
118: size_t count = 0;
119: int i;
120: for(i = 0; i < nodeset->nodeNr; i++) {
121: double r = rand();
122: int x = r * nodeset->nodeNr / RAND_MAX;
123: const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
124: xmlChar *href = xmlNodeListGetString(doc, node, 1);
125: if(follow_relative_links) {
126: xmlChar *orig = href;
127: href = xmlBuildURI(href, (xmlChar *) url);
128: xmlFree(orig);
129: }
130: char *link = (char *) href;
131: if(!link || strlen(link) < 20)
132: continue;
133: if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
134: curl_multi_add_handle(multi_handle, make_handle(link));
135: if(count++ == max_link_per_page)
136: break;
137: }
138: xmlFree(link);
139: }
140: xmlXPathFreeObject(result);
141: return count;
142: }
143:
/* Return nonzero when the Content-Type header value names an HTML body.
 * strstr matches "text/html" anywhere, so parameters such as
 * "; charset=utf-8" are accepted.  The former `strlen(ctype) > 10` guard
 * wrongly rejected a bare "text/html" (9 chars) and added nothing the
 * substring match does not already ensure. */
int is_html(char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}
148:
149: int main(void)
150: {
151: signal(SIGINT, sighandler);
152: LIBXML_TEST_VERSION;
153: curl_global_init(CURL_GLOBAL_DEFAULT);
154: CURLM *multi_handle = curl_multi_init();
155: curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
156: curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
157:
158: /* enables http/2 if available */
159: #ifdef CURLPIPE_MULTIPLEX
160: curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
161: #endif
162:
163: /* sets html start page */
164: curl_multi_add_handle(multi_handle, make_handle(start_page));
165:
166: int msgs_left;
167: int pending = 0;
168: int complete = 0;
169: int still_running = 1;
170: while(still_running && !pending_interrupt) {
171: int numfds;
172: curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
173: curl_multi_perform(multi_handle, &still_running);
174:
175: /* See how the transfers went */
176: CURLMsg *m = NULL;
177: while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
178: if(m->msg == CURLMSG_DONE) {
179: CURL *handle = m->easy_handle;
180: char *url;
181: memory *mem;
182: curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
183: curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
184: if(m->data.result == CURLE_OK) {
185: long res_status;
186: curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
187: if(res_status == 200) {
188: char *ctype;
189: curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
190: printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
191: if(is_html(ctype) && mem->size > 100) {
192: if(pending < max_requests && (complete + pending) < max_total) {
193: pending += follow_links(multi_handle, mem, url);
194: still_running = 1;
195: }
196: }
197: }
198: else {
199: printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
200: }
201: }
202: else {
203: printf("[%d] Connection failure: %s\n", complete, url);
204: }
205: curl_multi_remove_handle(multi_handle, handle);
206: curl_easy_cleanup(handle);
207: free(mem->buf);
208: free(mem);
209: complete++;
210: pending--;
211: }
212: }
213: }
214: curl_multi_cleanup(multi_handle);
215: curl_global_cleanup();
216: return 0;
217: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>