File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / curl / docs / examples / htmltitle.cpp
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Jun 3 10:01:15 2020 UTC (4 years, 10 months ago) by misho
Branches: curl, MAIN
CVS tags: v7_70_0p4, HEAD
curl

    1: /***************************************************************************
    2:  *                                  _   _ ____  _
    3:  *  Project                     ___| | | |  _ \| |
    4:  *                             / __| | | | |_) | |
    5:  *                            | (__| |_| |  _ <| |___
    6:  *                             \___|\___/|_| \_\_____|
    7:  *
    8:  * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
    9:  *
   10:  * This software is licensed as described in the file COPYING, which
   11:  * you should have received as part of this distribution. The terms
   12:  * are also available at https://curl.haxx.se/docs/copyright.html.
   13:  *
   14:  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
   15:  * copies of the Software, and permit persons to whom the Software is
   16:  * furnished to do so, under the terms of the COPYING file.
   17:  *
   18:  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
   19:  * KIND, either express or implied.
   20:  *
   21:  ***************************************************************************/
   22: /* <DESC>
   23:  * Get a web page, extract the title with libxml.
   24:  * </DESC>
   25: 
   26:  Written by Lars Nilsson
   27: 
   28:  GNU C++ compile command line suggestion (edit paths accordingly):
   29: 
   30:  g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
   31:  -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
   32: */
   33: #include <stdio.h>
   34: #include <string.h>
   35: #include <stdlib.h>
   36: #include <string>
   37: #include <curl/curl.h>
   38: #include <libxml/HTMLparser.h>
   39: 
   40: //
   41: //  Case-insensitive string comparison
   42: //
   43: 
   44: #ifdef _MSC_VER
   45: #define COMPARE(a, b) (!_stricmp((a), (b)))
   46: #else
   47: #define COMPARE(a, b) (!strcasecmp((a), (b)))
   48: #endif
   49: 
   50: //
   51: //  libxml callback context structure
   52: //
   53: 
   54: struct Context
   55: {
   56:   Context(): addTitle(false) { }
   57: 
   58:   bool addTitle;
   59:   std::string title;
   60: };
   61: 
   62: //
   63: //  libcurl variables for error strings and returned data
   64: 
   65: static char errorBuffer[CURL_ERROR_SIZE];
   66: static std::string buffer;
   67: 
   68: //
   69: //  libcurl write callback function
   70: //
   71: 
   72: static int writer(char *data, size_t size, size_t nmemb,
   73:                   std::string *writerData)
   74: {
   75:   if(writerData == NULL)
   76:     return 0;
   77: 
   78:   writerData->append(data, size*nmemb);
   79: 
   80:   return size * nmemb;
   81: }
   82: 
   83: //
   84: //  libcurl connection initialization
   85: //
   86: 
   87: static bool init(CURL *&conn, char *url)
   88: {
   89:   CURLcode code;
   90: 
   91:   conn = curl_easy_init();
   92: 
   93:   if(conn == NULL) {
   94:     fprintf(stderr, "Failed to create CURL connection\n");
   95:     exit(EXIT_FAILURE);
   96:   }
   97: 
   98:   code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
   99:   if(code != CURLE_OK) {
  100:     fprintf(stderr, "Failed to set error buffer [%d]\n", code);
  101:     return false;
  102:   }
  103: 
  104:   code = curl_easy_setopt(conn, CURLOPT_URL, url);
  105:   if(code != CURLE_OK) {
  106:     fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
  107:     return false;
  108:   }
  109: 
  110:   code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
  111:   if(code != CURLE_OK) {
  112:     fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
  113:     return false;
  114:   }
  115: 
  116:   code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
  117:   if(code != CURLE_OK) {
  118:     fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
  119:     return false;
  120:   }
  121: 
  122:   code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
  123:   if(code != CURLE_OK) {
  124:     fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
  125:     return false;
  126:   }
  127: 
  128:   return true;
  129: }
  130: 
  131: //
  132: //  libxml start element callback function
  133: //
  134: 
  135: static void StartElement(void *voidContext,
  136:                          const xmlChar *name,
  137:                          const xmlChar **attributes)
  138: {
  139:   Context *context = static_cast<Context *>(voidContext);
  140: 
  141:   if(COMPARE(reinterpret_cast<char *>(name), "TITLE")) {
  142:     context->title = "";
  143:     context->addTitle = true;
  144:   }
  145:   (void) attributes;
  146: }
  147: 
  148: //
  149: //  libxml end element callback function
  150: //
  151: 
  152: static void EndElement(void *voidContext,
  153:                        const xmlChar *name)
  154: {
  155:   Context *context = static_cast<Context *>(voidContext);
  156: 
  157:   if(COMPARE(reinterpret_cast<char *>(name), "TITLE"))
  158:     context->addTitle = false;
  159: }
  160: 
  161: //
  162: //  Text handling helper function
  163: //
  164: 
  165: static void handleCharacters(Context *context,
  166:                              const xmlChar *chars,
  167:                              int length)
  168: {
  169:   if(context->addTitle)
  170:     context->title.append(reinterpret_cast<char *>(chars), length);
  171: }
  172: 
  173: //
  174: //  libxml PCDATA callback function
  175: //
  176: 
  177: static void Characters(void *voidContext,
  178:                        const xmlChar *chars,
  179:                        int length)
  180: {
  181:   Context *context = static_cast<Context *>(voidContext);
  182: 
  183:   handleCharacters(context, chars, length);
  184: }
  185: 
  186: //
  187: //  libxml CDATA callback function
  188: //
  189: 
  190: static void cdata(void *voidContext,
  191:                   const xmlChar *chars,
  192:                   int length)
  193: {
  194:   Context *context = static_cast<Context *>(voidContext);
  195: 
  196:   handleCharacters(context, chars, length);
  197: }
  198: 
  199: //
  200: //  libxml SAX callback structure
  201: //
  202: 
  203: static htmlSAXHandler saxHandler =
  204: {
  205:   NULL,
  206:   NULL,
  207:   NULL,
  208:   NULL,
  209:   NULL,
  210:   NULL,
  211:   NULL,
  212:   NULL,
  213:   NULL,
  214:   NULL,
  215:   NULL,
  216:   NULL,
  217:   NULL,
  218:   NULL,
  219:   StartElement,
  220:   EndElement,
  221:   NULL,
  222:   Characters,
  223:   NULL,
  224:   NULL,
  225:   NULL,
  226:   NULL,
  227:   NULL,
  228:   NULL,
  229:   NULL,
  230:   cdata,
  231:   NULL
  232: };
  233: 
  234: //
  235: //  Parse given (assumed to be) HTML text and return the title
  236: //
  237: 
  238: static void parseHtml(const std::string &html,
  239:                       std::string &title)
  240: {
  241:   htmlParserCtxtPtr ctxt;
  242:   Context context;
  243: 
  244:   ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
  245:                                   XML_CHAR_ENCODING_NONE);
  246: 
  247:   htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
  248:   htmlParseChunk(ctxt, "", 0, 1);
  249: 
  250:   htmlFreeParserCtxt(ctxt);
  251: 
  252:   title = context.title;
  253: }
  254: 
  255: int main(int argc, char *argv[])
  256: {
  257:   CURL *conn = NULL;
  258:   CURLcode code;
  259:   std::string title;
  260: 
  261:   // Ensure one argument is given
  262: 
  263:   if(argc != 2) {
  264:     fprintf(stderr, "Usage: %s <url>\n", argv[0]);
  265:     exit(EXIT_FAILURE);
  266:   }
  267: 
  268:   curl_global_init(CURL_GLOBAL_DEFAULT);
  269: 
  270:   // Initialize CURL connection
  271: 
  272:   if(!init(conn, argv[1])) {
  273:     fprintf(stderr, "Connection initializion failed\n");
  274:     exit(EXIT_FAILURE);
  275:   }
  276: 
  277:   // Retrieve content for the URL
  278: 
  279:   code = curl_easy_perform(conn);
  280:   curl_easy_cleanup(conn);
  281: 
  282:   if(code != CURLE_OK) {
  283:     fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
  284:     exit(EXIT_FAILURE);
  285:   }
  286: 
  287:   // Parse the (assumed) HTML code
  288:   parseHtml(buffer, title);
  289: 
  290:   // Display the extracted title
  291:   printf("Title: %s\n", title.c_str());
  292: 
  293:   return EXIT_SUCCESS;
  294: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>