Annotation of embedaddon/curl/docs/examples/htmltitle.cpp, revision 1.1.1.1

1.1       misho       1: /***************************************************************************
                      2:  *                                  _   _ ____  _
                      3:  *  Project                     ___| | | |  _ \| |
                      4:  *                             / __| | | | |_) | |
                      5:  *                            | (__| |_| |  _ <| |___
                      6:  *                             \___|\___/|_| \_\_____|
                      7:  *
                      8:  * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
                      9:  *
                     10:  * This software is licensed as described in the file COPYING, which
                     11:  * you should have received as part of this distribution. The terms
                     12:  * are also available at https://curl.haxx.se/docs/copyright.html.
                     13:  *
                     14:  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
                     15:  * copies of the Software, and permit persons to whom the Software is
                     16:  * furnished to do so, under the terms of the COPYING file.
                     17:  *
                     18:  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
                     19:  * KIND, either express or implied.
                     20:  *
                     21:  ***************************************************************************/
                     22: /* <DESC>
                     23:  * Get a web page, extract the title with libxml.
                     24:  * </DESC>
                     25: 
                     26:  Written by Lars Nilsson
                     27: 
                     28:  GNU C++ compile command line suggestion (edit paths accordingly):
                     29: 
                     30:  g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
                     31:  -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
                     32: */
                     33: #include <stdio.h>
                     34: #include <string.h>
                     35: #include <stdlib.h>
                     36: #include <string>
                     37: #include <curl/curl.h>
                     38: #include <libxml/HTMLparser.h>
                     39: 
                     40: //
                     41: //  Case-insensitive string comparison
                     42: //
                     43: 
                     // COMPARE(a, b) is true when the two NUL-terminated C strings are equal
                     // ignoring letter case. MSVC builds use _stricmp, every other toolchain
                     // uses strcasecmp; both return 0 for a match.
                     #if defined(_MSC_VER)
                     #define COMPARE(a, b) (_stricmp((a), (b)) == 0)
                     #else
                     #define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
                     #endif
                     49: 
                     50: //
                     51: //  libxml callback context structure
                     52: //
                     53: 
                     // State handed to every SAX callback through its void* context argument.
                     struct Context
                     {
                       // True while text events should be appended to `title`.
                       bool addTitle;

                       // Title text collected so far.
                       std::string title;

                       // Starts outside any title element with an empty title.
                       Context() : addTitle(false), title() {}
                     };
                     61: 
                     62: //
                     63: //  libcurl variables for error strings and returned data
                     64: 
                     65: static char errorBuffer[CURL_ERROR_SIZE];
                     66: static std::string buffer;
                     67: 
                     68: //
                     69: //  libcurl write callback function
                     70: //
                     71: 
                     // libcurl write callback: appends the size * nmemb bytes at `data` to
                     // `*writerData`.
                     //
                     // Returns the number of bytes taken. When `writerData` is NULL nothing is
                     // stored and 0 is returned.
                     //
                     // The return type is size_t rather than int because that is what
                     // libcurl's write-callback prototype declares; an int return both
                     // mismatches the function type libcurl calls through and narrows the
                     // byte count.
                     static size_t writer(char *data, size_t size, size_t nmemb,
                                          std::string *writerData)
                     {
                       if(writerData == NULL)
                         return 0;

                       const size_t bytes = size * nmemb;
                       writerData->append(data, bytes);

                       return bytes;
                     }
                     82: 
                     83: //
                     84: //  libcurl connection initialization
                     85: //
                     86: 
                     87: static bool init(CURL *&conn, char *url)
                     88: {
                     89:   CURLcode code;
                     90: 
                     91:   conn = curl_easy_init();
                     92: 
                     93:   if(conn == NULL) {
                     94:     fprintf(stderr, "Failed to create CURL connection\n");
                     95:     exit(EXIT_FAILURE);
                     96:   }
                     97: 
                     98:   code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
                     99:   if(code != CURLE_OK) {
                    100:     fprintf(stderr, "Failed to set error buffer [%d]\n", code);
                    101:     return false;
                    102:   }
                    103: 
                    104:   code = curl_easy_setopt(conn, CURLOPT_URL, url);
                    105:   if(code != CURLE_OK) {
                    106:     fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
                    107:     return false;
                    108:   }
                    109: 
                    110:   code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
                    111:   if(code != CURLE_OK) {
                    112:     fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
                    113:     return false;
                    114:   }
                    115: 
                    116:   code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
                    117:   if(code != CURLE_OK) {
                    118:     fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
                    119:     return false;
                    120:   }
                    121: 
                    122:   code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
                    123:   if(code != CURLE_OK) {
                    124:     fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
                    125:     return false;
                    126:   }
                    127: 
                    128:   return true;
                    129: }
                    130: 
                    131: //
                    132: //  libxml start element callback function
                    133: //
                    134: 
                    135: static void StartElement(void *voidContext,
                    136:                          const xmlChar *name,
                    137:                          const xmlChar **attributes)
                    138: {
                    139:   Context *context = static_cast<Context *>(voidContext);
                    140: 
                    141:   if(COMPARE(reinterpret_cast<char *>(name), "TITLE")) {
                    142:     context->title = "";
                    143:     context->addTitle = true;
                    144:   }
                    145:   (void) attributes;
                    146: }
                    147: 
                    148: //
                    149: //  libxml end element callback function
                    150: //
                    151: 
                    152: static void EndElement(void *voidContext,
                    153:                        const xmlChar *name)
                    154: {
                    155:   Context *context = static_cast<Context *>(voidContext);
                    156: 
                    157:   if(COMPARE(reinterpret_cast<char *>(name), "TITLE"))
                    158:     context->addTitle = false;
                    159: }
                    160: 
                    161: //
                    162: //  Text handling helper function
                    163: //
                    164: 
                    165: static void handleCharacters(Context *context,
                    166:                              const xmlChar *chars,
                    167:                              int length)
                    168: {
                    169:   if(context->addTitle)
                    170:     context->title.append(reinterpret_cast<char *>(chars), length);
                    171: }
                    172: 
                    173: //
                    174: //  libxml PCDATA callback function
                    175: //
                    176: 
                    177: static void Characters(void *voidContext,
                    178:                        const xmlChar *chars,
                    179:                        int length)
                    180: {
                    181:   Context *context = static_cast<Context *>(voidContext);
                    182: 
                    183:   handleCharacters(context, chars, length);
                    184: }
                    185: 
                    186: //
                    187: //  libxml CDATA callback function
                    188: //
                    189: 
                    190: static void cdata(void *voidContext,
                    191:                   const xmlChar *chars,
                    192:                   int length)
                    193: {
                    194:   Context *context = static_cast<Context *>(voidContext);
                    195: 
                    196:   handleCharacters(context, chars, length);
                    197: }
                    198: 
                    199: //
                    200: //  libxml SAX callback structure
                    201: //
                    202: 
                    203: static htmlSAXHandler saxHandler =
                    204: {
                    205:   NULL,
                    206:   NULL,
                    207:   NULL,
                    208:   NULL,
                    209:   NULL,
                    210:   NULL,
                    211:   NULL,
                    212:   NULL,
                    213:   NULL,
                    214:   NULL,
                    215:   NULL,
                    216:   NULL,
                    217:   NULL,
                    218:   NULL,
                    219:   StartElement,
                    220:   EndElement,
                    221:   NULL,
                    222:   Characters,
                    223:   NULL,
                    224:   NULL,
                    225:   NULL,
                    226:   NULL,
                    227:   NULL,
                    228:   NULL,
                    229:   NULL,
                    230:   cdata,
                    231:   NULL
                    232: };
                    233: 
                    234: //
                    235: //  Parse given (assumed to be) HTML text and return the title
                    236: //
                    237: 
                    238: static void parseHtml(const std::string &html,
                    239:                       std::string &title)
                    240: {
                    241:   htmlParserCtxtPtr ctxt;
                    242:   Context context;
                    243: 
                    244:   ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
                    245:                                   XML_CHAR_ENCODING_NONE);
                    246: 
                    247:   htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
                    248:   htmlParseChunk(ctxt, "", 0, 1);
                    249: 
                    250:   htmlFreeParserCtxt(ctxt);
                    251: 
                    252:   title = context.title;
                    253: }
                    254: 
                    255: int main(int argc, char *argv[])
                    256: {
                    257:   CURL *conn = NULL;
                    258:   CURLcode code;
                    259:   std::string title;
                    260: 
                    261:   // Ensure one argument is given
                    262: 
                    263:   if(argc != 2) {
                    264:     fprintf(stderr, "Usage: %s <url>\n", argv[0]);
                    265:     exit(EXIT_FAILURE);
                    266:   }
                    267: 
                    268:   curl_global_init(CURL_GLOBAL_DEFAULT);
                    269: 
                    270:   // Initialize CURL connection
                    271: 
                    272:   if(!init(conn, argv[1])) {
                    273:     fprintf(stderr, "Connection initializion failed\n");
                    274:     exit(EXIT_FAILURE);
                    275:   }
                    276: 
                    277:   // Retrieve content for the URL
                    278: 
                    279:   code = curl_easy_perform(conn);
                    280:   curl_easy_cleanup(conn);
                    281: 
                    282:   if(code != CURLE_OK) {
                    283:     fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
                    284:     exit(EXIT_FAILURE);
                    285:   }
                    286: 
                    287:   // Parse the (assumed) HTML code
                    288:   parseHtml(buffer, title);
                    289: 
                    290:   // Display the extracted title
                    291:   printf("Title: %s\n", title.c_str());
                    292: 
                    293:   return EXIT_SUCCESS;
                    294: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>