/* -*- Mode: c; c-basic-offset: 2 -*- * * raptor_www.c - Raptor WWW retrieval core * * Copyright (C) 2003-2008, David Beckett http://www.dajobe.org/ * Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/ * * This package is Free Software and part of Redland http://librdf.org/ * * It is licensed under the following three licenses as alternatives: * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version * 2. GNU General Public License (GPL) V2 or any newer version * 3. Apache License, V2.0 or any newer version * * You may not use this file except in compliance with at least one of * the above three licenses. * * See LICENSE.html or LICENSE.txt at the top of this package for the * complete terms and further detail along with the license texts for * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. * * */ #ifdef HAVE_CONFIG_H #include #endif #ifdef WIN32 #include #endif #include #include #include #ifdef HAVE_ERRNO_H #include #endif #ifdef HAVE_SYS_STAT_H #include #endif /* Raptor includes */ #include "raptor.h" #include "raptor_internal.h" static int raptor_www_init_common(int skip_www_init_finish, int *www_initialized); static void raptor_www_finish_common(int skip_www_init_finish); static int raptor_www_file_fetch(raptor_www* www); #ifndef RAPTOR_DISABLE_V1 /* should raptor_www do initializing and cleanup of the WWW library */ static int raptor_www_skip_www_init_finish=0; static int raptor_www_initialized=0; #endif #ifndef RAPTOR_DISABLE_V1 /** * raptor_www_init: * * Initialise the WWW class. * * Must be called before creating any #raptor_www object. * * See also: raptor_www_init_v2() **/ void raptor_www_init(void) { raptor_www_init_common(raptor_www_skip_www_init_finish, &raptor_www_initialized); } #endif /** * raptor_www_init_v2: * @world: raptor_world object * * Initialise the WWW class. * * Must be called before creating any #raptor_www object. 
* * See also: raptor_www_init() * * Return value: non-0 on failure **/ int raptor_www_init_v2(raptor_world* world) { #ifndef RAPTOR_DISABLE_V1 /* support legacy v1 raptor_www_no_www_library_init_finish() */ if(raptor_www_skip_www_init_finish) world->www_skip_www_init_finish = raptor_www_skip_www_init_finish; /* skip init if already inited with legacy init() */ if(raptor_www_initialized) return 0; #endif return raptor_www_init_common(world->www_skip_www_init_finish, &world->www_initialized); } static int raptor_www_init_common(int skip_www_init_finish, int *www_initialized) { int rc = 0; if(*www_initialized) return 0; if(!skip_www_init_finish) { #ifdef RAPTOR_WWW_LIBCURL rc = curl_global_init(CURL_GLOBAL_ALL); #endif } *www_initialized = 1; return rc; } #ifndef RAPTOR_DISABLE_V1 /** * raptor_www_no_www_library_init_finish: * * Do not initialise or finish the lower level WWW library. * * If this is called then the raptor_www library will neither * initialise or terminate the lower level WWW library. Usually in * raptor_init either curl_global_init (for libcurl) * are called and in raptor_finish curl_global_cleanup is called. * * This allows the application finer control over these libraries such * as setting other global options or potentially calling and terminating * raptor several times. It does mean that applications which use * this call must do their own extra work in order to allocate and free * all resources to the system. * * This function must be called before raptor_init. * * See also: raptor_www_no_www_library_init_finish_v2() * **/ void raptor_www_no_www_library_init_finish(void) { raptor_www_skip_www_init_finish = 1; } #endif /** * raptor_www_no_www_library_init_finish_v2: * @world: raptor_world object * * Do not initialise or finish the lower level WWW library. * * If this is called then the raptor_www library will neither * initialise or terminate the lower level WWW library. 
Usually in * raptor_world_open() either curl_global_init (for libcurl) * are called and in raptor_finish curl_global_cleanup is called. * * This allows the application finer control over these libraries such * as setting other global options or potentially calling and terminating * raptor several times. It does mean that applications which use * this call must do their own extra work in order to allocate and free * all resources to the system. * * This function must be called before raptor_world_open(). * **/ void raptor_www_no_www_library_init_finish_v2(raptor_world* world) { world->www_skip_www_init_finish = 1; } #ifndef RAPTOR_DISABLE_V1 /** * raptor_www_finish: * * Terminate the WWW class. * * Must be called to clean any resources used by the WWW implementation. * * See also: raptor_www_finish_v2() **/ void raptor_www_finish(void) { raptor_www_finish_common(raptor_www_skip_www_init_finish); } #endif /** * raptor_www_finish_v2: * @world: raptor_world object * * Terminate the WWW class. * * Must be called to clean any resources used by the WWW implementation. * * See also: raptor_www_finish() **/ void raptor_www_finish_v2(raptor_world* world) { raptor_www_finish_common(world->www_skip_www_init_finish); } static void raptor_www_finish_common(int skip_www_init_finish) { if(!skip_www_init_finish) { #ifdef RAPTOR_WWW_LIBCURL curl_global_cleanup(); #endif } } #ifndef RAPTOR_DISABLE_V1 /** * raptor_www_new_with_connection: * @connection: external WWW connection object. * * Constructor - create a new #raptor_www object over an existing WWW connection. * * At present this only works with a libcurl CURL handle object * when raptor is compiled with libcurl suppport. Otherwise the * @connection is ignored. This allows such things as setting * up special flags on the curl handle before passing into the constructor. * * raptor_init() MUST have been called before calling this function. * Use raptor_www_new_with_connection_v2() if using raptor_world APIs. 
* * Return value: a new #raptor_www object or NULL on failure. **/ raptor_www* raptor_www_new_with_connection(void *connection) { return raptor_www_new_with_connection_v2(raptor_world_instance(), connection); } #endif /** * raptor_www_new_with_connection_v2: * @world: raptor_world object * @connection: external WWW connection object. * * Constructor - create a new #raptor_www object over an existing WWW connection. * * At present this only works with a libcurl CURL handle object * when raptor is compiled with libcurl suppport. Otherwise the * @connection is ignored. This allows such things as setting * up special flags on the curl handle before passing into the constructor. * * Return value: a new #raptor_www object or NULL on failure. **/ raptor_www* raptor_www_new_with_connection_v2(raptor_world* world, void *connection) { raptor_www* www=(raptor_www* )RAPTOR_CALLOC(www, 1, sizeof(raptor_www)); if(!www) return NULL; www->world=world; www->type=NULL; www->free_type=1; /* default is to free content type */ www->total_bytes=0; www->failed=0; www->status_code=0; www->write_bytes=NULL; www->content_type=NULL; www->uri_filter=NULL; www->connection_timeout=10; www->cache_control=NULL; #ifdef RAPTOR_WWW_LIBCURL www->curl_handle=(CURL*)connection; raptor_www_curl_init(www); #endif #ifdef RAPTOR_WWW_LIBXML raptor_www_libxml_init(www); #endif #ifdef RAPTOR_WWW_LIBFETCH raptor_www_libfetch_init(www); #endif www->error_handlers.locator=&www->locator; raptor_error_handlers_init_v2(world, &www->error_handlers); return www; } #ifndef RAPTOR_DISABLE_V1 /** * raptor_www_new: * * Constructor - create a new #raptor_www object. * * raptor_init() MUST have been called before calling this function. * Use raptor_www_new_v2() if using raptor_world APIs. * * Return value: a new #raptor_www or NULL on failure. 
**/ raptor_www* raptor_www_new(void) { return raptor_www_new_v2(raptor_world_instance()); } #endif /** * raptor_www_new_v2: * @world: raptor_world object * * Constructor - create a new #raptor_www object. * * Return value: a new #raptor_www or NULL on failure. **/ raptor_www* raptor_www_new_v2(raptor_world* world) { return raptor_www_new_with_connection_v2(world, NULL); } /** * raptor_www_free: * @www: WWW object. * * Destructor - destroy a #raptor_www object. **/ void raptor_www_free(raptor_www* www) { /* free context */ if(www->type) { if(www->free_type) RAPTOR_FREE(cstring, www->type); www->type=NULL; } if(www->user_agent) { RAPTOR_FREE(cstring, www->user_agent); www->user_agent=NULL; } if(www->cache_control) { RAPTOR_FREE(cstring, www->cache_control); www->cache_control=NULL; } if(www->proxy) { RAPTOR_FREE(cstring, www->proxy); www->proxy=NULL; } if(www->http_accept) { RAPTOR_FREE(cstring, www->http_accept); www->http_accept=NULL; } #ifdef RAPTOR_WWW_LIBCURL raptor_www_curl_free(www); #endif #ifdef RAPTOR_WWW_LIBXML raptor_www_libxml_free(www); #endif #ifdef RAPTOR_WWW_LIBFETCH raptor_www_libfetch_free(www); #endif if(www->uri) raptor_free_uri_v2(www->world, www->uri); if(www->final_uri) raptor_free_uri_v2(www->world, www->final_uri); RAPTOR_FREE(www, www); } /** * raptor_www_set_error_handler: * @www: WWW object * @error_handler: error handler function * @error_data: error handler data * * Set the error handler routine for the raptor_www class. * * This takes the same arguments as the raptor_parser_set_error() and * raptor_parser_set_warning_handler() methods. 
**/ void raptor_www_set_error_handler(raptor_www* www, raptor_message_handler error_handler, void *error_data) { www->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].user_data=error_data; www->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].handler=error_handler; } /** * raptor_www_set_write_bytes_handler: * @www: WWW object * @handler: bytes handler function * @user_data: bytes handler data * * Set the handler to receive bytes written by the #raptor_www implementation. * **/ void raptor_www_set_write_bytes_handler(raptor_www* www, raptor_www_write_bytes_handler handler, void *user_data) { www->write_bytes=handler; www->write_bytes_userdata=user_data; } /** * raptor_www_set_content_type_handler: * @www: WWW object * @handler: content type handler function * @user_data: content type handler data * * Set the handler to receive the HTTP Content-Type header value. * * This is called if or when the value is discovered during retrieval * by the raptor_www implementation. Not all implementations provide * access to this. **/ void raptor_www_set_content_type_handler(raptor_www* www, raptor_www_content_type_handler handler, void *user_data) { www->content_type=handler; www->content_type_userdata=user_data; } /** * raptor_www_set_user_agent: * @www: WWW object * @user_agent: User-Agent string * * Set the user agent value, for HTTP requests typically. **/ void raptor_www_set_user_agent(raptor_www* www, const char *user_agent) { char *ua_copy=NULL; if(!user_agent || !*user_agent) { www->user_agent=NULL; return; } ua_copy=(char*)RAPTOR_MALLOC(cstring, strlen(user_agent)+1); if(!ua_copy) return; strcpy(ua_copy, user_agent); www->user_agent=ua_copy; } /** * raptor_www_set_proxy: * @www: WWW object * @proxy: proxy string. * * Set the proxy for the WWW object. * * The @proxy usually a string of the form http://server.domain:port. 
**/ void raptor_www_set_proxy(raptor_www* www, const char *proxy) { char *proxy_copy; if(!proxy) return; proxy_copy=(char*)RAPTOR_MALLOC(cstring, strlen(proxy)+1); if(!proxy_copy) return; strcpy(proxy_copy, proxy); www->proxy=proxy_copy; } /** * raptor_www_set_http_accept: * @www: #raptor_www class * @value: Accept: header value or NULL to have an empty one. * * Set HTTP Accept header. * **/ void raptor_www_set_http_accept(raptor_www* www, const char *value) { char *value_copy; size_t len=8; /* strlen("Accept:")+1 */ if(value) len+=1+strlen(value); /* " "+value */ value_copy=(char*)RAPTOR_MALLOC(cstring, len); if(!value_copy) return; www->http_accept=value_copy; strcpy(value_copy, "Accept:"); value_copy+=7; if(value) { *value_copy++=' '; strcpy(value_copy, value); } #if RAPTOR_DEBUG > 1 RAPTOR_DEBUG2("Using Accept header: '%s'\n", www->http_accept); #endif } /** * raptor_www_set_connection_timeout: * @www: WWW object * @timeout: Timeout in seconds * * Set WWW connection timeout **/ void raptor_www_set_connection_timeout(raptor_www* www, int timeout) { www->connection_timeout=timeout; } /** * raptor_www_set_http_cache_control: * @www: WWW object * @cache_control: Cache-Control header value (or NULL to disable) * * Set HTTP Cache-Control:header (default none) * * The @cache_control value can be a string to set it, "" to send * a blank header or NULL to not set the header at all. 
* * Return value: non-0 on failure **/ int raptor_www_set_http_cache_control(raptor_www* www, const char* cache_control) { char *cache_control_copy; const char* const header="Cache-Control:"; const size_t header_len=14; /* strlen("Cache-Control:") */ size_t len; RAPTOR_ASSERT((strlen(header) != header_len), "Cache-Control header length is wrong"); if(www->cache_control) { RAPTOR_FREE(cstring, www->cache_control); www->cache_control=NULL; } if(!cache_control) { www->cache_control=NULL; return 0; } len=header_len + 1 +strlen(cache_control) + 1; /* header+" "+cache_control+"\0" */ cache_control_copy=(char*)RAPTOR_MALLOC(cstring, len); if(!cache_control_copy) return 1; www->cache_control=cache_control_copy; strncpy(cache_control_copy, header, header_len); cache_control_copy+= header_len; if(*cache_control) { *cache_control_copy++=' '; strcpy(cache_control_copy, cache_control); } #if RAPTOR_DEBUG > 1 RAPTOR_DEBUG2("Using Cache-Control header: '%s'\n", www->cache_control); #endif return 0; } /** * raptor_www_set_uri_filter: * @www: WWW object * @filter: URI filter function * @user_data: User data to pass to filter function * * Set URI filter function for WWW retrieval. **/ void raptor_www_set_uri_filter(raptor_www* www, raptor_uri_filter_func filter, void *user_data) { www->uri_filter=filter; www->uri_filter_user_data=user_data; } /** * raptor_www_get_connection: * @www: #raptor_www object * * Get WWW library connection object. * * Return the internal WWW connection handle. For libcurl, this * returns the CURL handle and for libxml the context. Otherwise * it returns NULL. 
* * Return value: connection pointer **/ void* raptor_www_get_connection(raptor_www* www) { #ifdef RAPTOR_WWW_NONE return NULL; #endif #ifdef RAPTOR_WWW_LIBCURL return www->curl_handle; #endif #ifdef RAPTOR_WWW_LIBXML return www->ctxt; #endif #ifdef RAPTOR_WWW_LIBFETCH return NULL; #endif return NULL; } /** * raptor_www_abort: * @www: WWW object * @reason: abort reason message * * Abort an ongoing raptor WWW operation and pass back a reason. * * This is typically used within one of the raptor WWW handlers * when retrieval need no longer continue due to another * processing issue or error. **/ void raptor_www_abort(raptor_www* www, const char *reason) { www->failed=1; } void raptor_www_error(raptor_www* www, const char *message, ...) { va_list arguments; va_start(arguments, message); raptor_log_error_varargs(www->world, RAPTOR_LOG_LEVEL_ERROR, www->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].handler, www->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].user_data, &www->locator, message, arguments); va_end(arguments); } static int raptor_www_file_handle_fetch(raptor_www* www, FILE* fh) { unsigned char buffer[RAPTOR_WWW_BUFFER_SIZE+1]; while(!feof(fh)) { int len=fread(buffer, 1, RAPTOR_WWW_BUFFER_SIZE, fh); if(len > 0) { www->total_bytes += len; buffer[len]='\0'; if(www->write_bytes) www->write_bytes(www, www->write_bytes_userdata, buffer, len, 1); } if(feof(fh) || www->failed) break; } if(!www->failed) www->status_code=200; return www->failed; } static int raptor_www_file_fetch(raptor_www* www) { char *filename; FILE *fh; unsigned char *uri_string=raptor_uri_as_string_v2(www->world, www->uri); #if defined(HAVE_UNISTD_H) && defined(HAVE_SYS_STAT_H) struct stat buf; #endif www->status_code=200; filename=raptor_uri_uri_string_to_filename(uri_string); if(!filename) { raptor_www_error(www, "Not a file: URI"); return 1; } #if defined(HAVE_UNISTD_H) && defined(HAVE_SYS_STAT_H) if(!stat(filename, &buf) && S_ISDIR(buf.st_mode)) { raptor_www_error(www, "Cannot read from a 
directory '%s'", filename); RAPTOR_FREE(cstring, filename); www->status_code=404; return 1; } #endif fh=fopen(filename, "rb"); if(!fh) { raptor_www_error(www, "file '%s' open failed - %s", filename, strerror(errno)); RAPTOR_FREE(cstring, filename); www->status_code=(errno == EACCES) ? 403: 404; www->failed=1; return www->failed; } raptor_www_file_handle_fetch(www, fh); fclose(fh); RAPTOR_FREE(cstring, filename); return www->failed; } /** * raptor_www_fetch: * @www: WWW object * @uri: URI to read from * * Start a WWW content retrieval for the given URI, returning data via the write_bytes handler. * * Return value: non-0 on failure. **/ int raptor_www_fetch(raptor_www *www, raptor_uri *uri) { int status=1; www->uri=raptor_new_uri_for_retrieval_v2(www->world, uri); www->locator.uri=uri; www->locator.line= -1; www->locator.column= -1; if(www->uri_filter) if(www->uri_filter(www->uri_filter_user_data, uri)) return status; #ifdef RAPTOR_WWW_NONE status=raptor_www_file_fetch(www); #else if(raptor_uri_uri_string_is_file_uri(raptor_uri_as_string_v2(www->world, www->uri))) status=raptor_www_file_fetch(www); else { #ifdef RAPTOR_WWW_LIBCURL status=raptor_www_curl_fetch(www); #endif #ifdef RAPTOR_WWW_LIBXML status=raptor_www_libxml_fetch(www); #endif #ifdef RAPTOR_WWW_LIBFETCH status=raptor_www_libfetch_fetch(www); #endif } #endif if(!status && www->status_code && www->status_code != 200){ raptor_www_error(www, "Resolving URI failed with HTTP status %d", www->status_code); status=1; } www->failed=status; return www->failed; } static void raptor_www_fetch_to_string_write_bytes(raptor_www* www, void *userdata, const void *ptr, size_t size, size_t nmemb) { raptor_stringbuffer* sb=(raptor_stringbuffer*)userdata; int len=size*nmemb; raptor_stringbuffer_append_counted_string(sb, (unsigned char*)ptr, len, 1); } /** * raptor_www_fetch_to_string: * @www: raptor_www object * @uri: raptor_uri to retrieve * @string_p: pointer to location to hold string * @length_p: pointer to location to 
hold length of string (or NULL) * @malloc_handler: pointer to malloc to use to make string (or NULL) * * Start a WWW content retrieval for the given URI, returning the data in a new string. * * If malloc_handler is null, raptor will allocate it using it's * own memory allocator. *string_p is set to NULL on failure (and * *length_p to 0 if length_p is not NULL). * * Return value: non-0 on failure **/ RAPTOR_EXTERN_C int raptor_www_fetch_to_string(raptor_www *www, raptor_uri *uri, void **string_p, size_t *length_p, void *(*malloc_handler)(size_t size)) { raptor_stringbuffer *sb=NULL; void *str=NULL; raptor_www_write_bytes_handler saved_write_bytes; void *saved_write_bytes_userdata; sb=raptor_new_stringbuffer(); if(!sb) return 1; if(length_p) *length_p=0; saved_write_bytes=www->write_bytes; saved_write_bytes_userdata=www->write_bytes_userdata; raptor_www_set_write_bytes_handler(www, raptor_www_fetch_to_string_write_bytes, sb); if(raptor_www_fetch(www, uri)) str=NULL; else { size_t len=raptor_stringbuffer_length(sb); if(len) { str=(void*)malloc_handler(len+1); if(str) { raptor_stringbuffer_copy_to_string(sb, (unsigned char*)str, len+1); *string_p=str; if(length_p) *length_p=len; } } } if(sb) raptor_free_stringbuffer(sb); raptor_www_set_write_bytes_handler(www, saved_write_bytes, saved_write_bytes_userdata); return (str == NULL); } /** * raptor_www_get_final_uri: * @www: #raptor_www object * * Get the WWW final resolved URI. * * This returns the URI used after any protocol redirection. * * Return value: a new URI or NULL if not known. **/ raptor_uri* raptor_www_get_final_uri(raptor_www* www) { return www->final_uri ? raptor_uri_copy_v2(www->world, www->final_uri) : NULL; } /** * raptor_www_set_final_uri_handler: * @www: WWW object * @handler: content type handler function * @user_data: content type handler data * * Set the handler to receive the HTTP Content-Type header value. 
*
 * This is called if or when the value is discovered during retrieval
 * by the raptor_www implementation.  Not all implementations provide
 * access to this.
 **/
void
raptor_www_set_final_uri_handler(raptor_www* www,
                                 raptor_www_final_uri_handler handler,
                                 void *user_data)
{
  /* store the callback together with the data handed back to it */
  www->final_uri_userdata = user_data;
  www->final_uri_handler = handler;
}