/* -*- Mode: c; c-basic-offset: 2 -*- * * raptor_grddl.c - Raptor GRDDL (+microformats) Parser implementation * * Copyright (C) 2005-2008, David Beckett http://www.dajobe.org/ * Copyright (C) 2005, University of Bristol, UK http://www.bristol.ac.uk/ * * This package is Free Software and part of Redland http://librdf.org/ * * It is licensed under the following three licenses as alternatives: * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version * 2. GNU General Public License (GPL) V2 or any newer version * 3. Apache License, V2.0 or any newer version * * You may not use this file except in compliance with at least one of * the above three licenses. * * See LICENSE.html or LICENSE.txt at the top of this package for the * complete terms and further detail along with the license texts for * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. * */ /* * Specifications: * Gleaning Resource Descriptions from Dialects of Languages (GRDDL) * W3C Proposed Recommendation 16 July 2007 * http://www.w3.org/TR/2007/PR-grddl-20070716/ * http://www.w3.org/TR/grddl/ * */ #ifdef HAVE_CONFIG_H #include #endif #ifdef WIN32 #include #endif #include #include #include #include #ifdef HAVE_ERRNO_H #include #endif #ifdef HAVE_STDLIB_H #include #endif /* Raptor includes */ #include "raptor.h" #include "raptor_internal.h" #include /* for xmlXPathRegisterNs() */ #include #include #include #include #include #include #include /* * libxslt API notes * * Inputs to an XSLT transformation process with libxslt are: * 1. A set of (key:value) parameters. * 2. An xsltStylesheetPtr for the XSLT sheet * Which could be made from a file or an xmlDoc; and the xmlDoc. * made from a file or memory buffer. * 3. An xmlDoc for the XML source * Which could be made from a file or a memory buffer. 
* */ static void raptor_grddl_filter_triples(void *user_data, const raptor_statement *statement); static void raptor_libxslt_error_common(raptor_parser* rdf_parser, const char *msg, va_list args, const char *prefix) RAPTOR_PRINTF_FORMAT(2, 0); static void raptor_grddl_xsltGenericError_handler(void *user_data, const char *msg, ...) RAPTOR_PRINTF_FORMAT(2, 0); typedef struct { /* transformation (XSLT) or profile URI */ raptor_uri* uri; /* base URI in effect when the above was found */ raptor_uri* base_uri; } grddl_xml_context; /* * XSLT parser object */ struct raptor_grddl_parser_context_s { raptor_world* world; raptor_parser* rdf_parser; xmlSAXHandler sax; /* HTML document ctxt */ htmlParserCtxtPtr html_ctxt; /* XML document ctxt */ xmlParserCtxtPtr xml_ctxt; /* Create xpath evaluation context */ xmlXPathContextPtr xpathCtx; /* parser for dealing with the result */ raptor_parser* internal_parser; /* ... constructed with this name */ const char* internal_parser_name; /* sax2 structure - only for recording error pointers */ raptor_sax2* sax2; /* URI of root namespace of document */ raptor_uri* root_ns_uri; /* List of transformation URIs for document */ raptor_sequence* doc_transform_uris; /* Copy of the user data statement_handler overwritten to point to * raptor_grddl_filter_triples() */ void* saved_user_data; raptor_statement_handler saved_statement_handler; /* URI data-view:namespaceTransformation */ raptor_uri* namespace_transformation_uri; /* URI data-view:profileTransformation */ raptor_uri* profile_transformation_uri; /* List of namespace / URIs */ raptor_sequence* profile_uris; /* List of visited URIs */ raptor_sequence* visited_uris; /* Depth of GRDDL parsers - 0 means that the lists above * are owned by this parser: visited_uris * */ int grddl_depth; /* Content-Type of top-level document */ char* content_type; /* Check content type once */ int content_type_check; /* stringbuffer to use to store retrieved document */ raptor_stringbuffer* sb; /* non-0 to 
perform an additional RDF/XML parse on a retrieved document * because it has been identified as RDF/XML. */ int process_this_as_rdfxml; /* non-0 to perform GRDL processing on document */ int grddl_processing; /* non-0 to perform XML Include processing on document */ int xinclude_processing; /* non-0 to perform HTML Base processing on document */ int html_base_processing; /* non-0 to perform HTML processing on document */ int html_link_processing; }; typedef struct raptor_grddl_parser_context_s raptor_grddl_parser_context; static void raptor_libxslt_error_common(raptor_parser* rdf_parser, const char *msg, va_list args, const char *prefix) { int prefix_length=strlen(prefix); int length; char *nmsg; length=prefix_length+strlen(msg)+1; nmsg=(char*)RAPTOR_MALLOC(cstring, length); if(nmsg) { strcpy(nmsg, prefix); strcpy(nmsg+prefix_length, msg); if(nmsg[length-1]=='\n') nmsg[length-1]='\0'; } raptor_parser_error_varargs(rdf_parser, nmsg ? nmsg : msg, args); if(nmsg) RAPTOR_FREE(cstring,nmsg); } static void raptor_grddl_xsltGenericError_handler(void *user_data, const char *msg, ...) 
{ raptor_parser* rdf_parser=(raptor_parser*)user_data; va_list arguments; if(!msg || *msg == '\n') return; va_start(arguments, msg); raptor_libxslt_error_common(rdf_parser, msg, arguments, "libxslt error: "); va_end(arguments); } static grddl_xml_context* raptor_new_xml_context(raptor_world* world, raptor_uri* uri, raptor_uri* base_uri) { grddl_xml_context* xml_context; xml_context=(grddl_xml_context*)RAPTOR_MALLOC(xml_context, sizeof(grddl_xml_context)); if(uri) uri=raptor_uri_copy_v2(world, uri); if(base_uri) base_uri=raptor_uri_copy_v2(world, base_uri); xml_context->uri=uri; xml_context->base_uri=base_uri; return xml_context; } static void grddl_free_xml_context(void *context, void* userdata) { raptor_world* world=(raptor_world*)context; grddl_xml_context* xml_context=(grddl_xml_context*)userdata; if(xml_context->uri) raptor_free_uri_v2(world, xml_context->uri); if(xml_context->base_uri) raptor_free_uri_v2(world, xml_context->base_uri); RAPTOR_FREE(grddl_xml_context, xml_context); } static int raptor_grddl_parse_init_common(raptor_parser* rdf_parser, const char *name) { raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; grddl_parser->world=rdf_parser->world; grddl_parser->rdf_parser=rdf_parser; /* sax2 structure - only for recording error pointers */ grddl_parser->sax2=raptor_new_sax2(rdf_parser, &rdf_parser->error_handlers); /* The following error fields are normally initialised by * raptor_libxml_init() via raptor_sax2_parse_start() which is * not used here as we go to libxml calls direct. 
*/ raptor_libxml_init_sax_error_handlers(&grddl_parser->sax); /* Sequence of URIs of XSLT sheets to transform the document */ grddl_parser->doc_transform_uris = raptor_new_sequence_v2((raptor_sequence_free_handler_v2*)grddl_free_xml_context, NULL, rdf_parser->world); grddl_parser->grddl_processing=1; grddl_parser->xinclude_processing=1; grddl_parser->html_base_processing=0; grddl_parser->html_link_processing=1; return 0; } static int raptor_grddl_parse_init(raptor_parser* rdf_parser, const char *name) { raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; raptor_world* world=rdf_parser->world; raptor_grddl_parse_init_common(rdf_parser, name); /* Sequence of URIs from */ grddl_parser->profile_uris = raptor_new_sequence_v2((raptor_sequence_free_handler_v2*)grddl_free_xml_context, NULL, (void*)world); grddl_parser->namespace_transformation_uri=raptor_new_uri_v2(world, (const unsigned char*)"http://www.w3.org/2003/g/data-view#namespaceTransformation"); grddl_parser->profile_transformation_uri=raptor_new_uri_v2(world, (const unsigned char*)"http://www.w3.org/2003/g/data-view#profileTransformation"); /* Sequence of URIs visited - may be overwritten if this is not * the depth 0 grddl parser */ grddl_parser->visited_uris = raptor_new_sequence_v2((raptor_sequence_free_handler_v2*)raptor_free_uri_v2, (raptor_sequence_print_handler_v2*)raptor_uri_print_v2, (void*)world); return 0; } static void raptor_grddl_parse_terminate(raptor_parser *rdf_parser) { raptor_grddl_parser_context *grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; if(grddl_parser->xml_ctxt) { if(grddl_parser->xml_ctxt->myDoc) { xmlFreeDoc(grddl_parser->xml_ctxt->myDoc); grddl_parser->xml_ctxt->myDoc=NULL; } xmlFreeParserCtxt(grddl_parser->xml_ctxt); } if(grddl_parser->html_ctxt) { if(grddl_parser->html_ctxt->myDoc) { xmlFreeDoc(grddl_parser->html_ctxt->myDoc); grddl_parser->html_ctxt->myDoc=NULL; } htmlFreeParserCtxt(grddl_parser->html_ctxt); } 
if(grddl_parser->xpathCtx) xmlXPathFreeContext(grddl_parser->xpathCtx); if(grddl_parser->internal_parser) raptor_free_parser(grddl_parser->internal_parser); if(grddl_parser->sax2) raptor_free_sax2(grddl_parser->sax2); if(grddl_parser->root_ns_uri) raptor_free_uri_v2(rdf_parser->world, grddl_parser->root_ns_uri); if(grddl_parser->doc_transform_uris) raptor_free_sequence(grddl_parser->doc_transform_uris); if(grddl_parser->profile_uris) raptor_free_sequence(grddl_parser->profile_uris); if(grddl_parser->namespace_transformation_uri) raptor_free_uri_v2(rdf_parser->world, grddl_parser->namespace_transformation_uri); if(grddl_parser->profile_transformation_uri) raptor_free_uri_v2(rdf_parser->world, grddl_parser->profile_transformation_uri); if(!grddl_parser->grddl_depth) { if(grddl_parser->visited_uris) raptor_free_sequence(grddl_parser->visited_uris); } if(grddl_parser->content_type) RAPTOR_FREE(cstring, grddl_parser->content_type); if(grddl_parser->sb) raptor_free_stringbuffer(grddl_parser->sb); } static void raptor_grddl_parser_add_parent(raptor_parser *rdf_parser, raptor_grddl_parser_context* parent_grddl_parser) { raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; /* Do not set parent twice */ if(grddl_parser->visited_uris == parent_grddl_parser->visited_uris) return; /* free any sequence here */ if(grddl_parser->visited_uris) raptor_free_sequence(grddl_parser->visited_uris); /* share parent's list and do not free it here */ grddl_parser->visited_uris= parent_grddl_parser->visited_uris; grddl_parser->grddl_depth= parent_grddl_parser->grddl_depth+1; grddl_parser->saved_user_data= parent_grddl_parser->rdf_parser; grddl_parser->saved_statement_handler= raptor_grddl_filter_triples; } static int raptor_grddl_parse_start(raptor_parser *rdf_parser) { raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; raptor_locator *locator=&rdf_parser->locator; locator->line=1; 
grddl_parser->content_type_check=0; grddl_parser->process_this_as_rdfxml=0; return 0; } #define MATCH_IS_VALUE_LIST 1 #define MATCH_IS_PROFILE 2 #define MATCH_IS_HARDCODED 4 /* stop looking for other hardcoded matches */ #define MATCH_LAST 8 static struct { const xmlChar* xpath; int flags; const xmlChar* xslt_sheet_uri; } match_table[]={ /* XHTML document where the GRDDL profile is in * inside the html * Value of @rel is a space-separated list of link types. */ { (const xmlChar*)"/html:html/html:head[contains(@profile,\"http://www.w3.org/2003/g/data-view\")]/html:link[contains(@rel,\"transformation\")]/@href", 0, NULL } , /* XHTML document where the GRDDL profile is in * inside the html * Value of @rel is a space-separated list of link types. */ { (const xmlChar*)"/html:html/html:head[contains(@profile,\"http://www.w3.org/2003/g/data-view\")]/../..//html:a[contains(@rel,\"transformation\")]/@href", 0, NULL } , /* XML document linking to transform via attribute dataview:transformation * on the root element. 
* Example: http://www.w3.org/2004/01/rdxh/grddl-p3p-example **/ { (const xmlChar*)"/*/@dataview:transformation", MATCH_IS_VALUE_LIST, NULL } , /* hCalendar microformat http://microformats.org/wiki/hcalendar */ { (const xmlChar*)"//*[contains(concat(' ', concat(normalize-space(@class),' ')),' vevent ')]", MATCH_IS_HARDCODED, (const xmlChar*)"http://www.w3.org/2002/12/cal/glean-hcal.xsl" } , /* hReview microformat http://microformats.org/wiki/review */ { (const xmlChar*)"//*[contains(concat(' ', concat(normalize-space(@class),' ')),' hreview ')]", MATCH_IS_HARDCODED | MATCH_LAST, /* stop here since hCard is inside hReview */ (const xmlChar*)"http://www.w3.org/2001/sw/grddl-wg/doc29/hreview2rdfxml.xsl" } , /* hCard microformat http://microformats.org/wiki/hcard */ { (const xmlChar*)"//*[contains(concat(' ', concat(normalize-space(@class),' ')),' vcard ')]", MATCH_IS_HARDCODED, (const xmlChar*)"http://www.w3.org/2006/vcard/hcard2rdf.xsl" } , { NULL, 0, NULL } }; static const char* grddl_namespace_uris_ignore_list[]={ "http://www.w3.org/1999/xhtml", "http://www.w3.org/1999/02/22-rdf-syntax-ns#", "http://www.w3.org/2001/XMLSchema", NULL }; /* add URI to XSLT transformation URI list */ static void raptor_grddl_add_transform_xml_context(raptor_grddl_parser_context* grddl_parser, grddl_xml_context* xml_context) { int i; raptor_uri* uri=xml_context->uri; #if RAPTOR_DEBUG > 1 RAPTOR_DEBUG2("Found document transformation URI '%s'\n", raptor_uri_as_string_v2(grddl_parser->world, uri)); #endif for(i=0; i < raptor_sequence_size(grddl_parser->doc_transform_uris); i++) { grddl_xml_context* xc=(grddl_xml_context*)raptor_sequence_get_at(grddl_parser->doc_transform_uris, i); if(raptor_uri_equals_v2(grddl_parser->world, uri, xc->uri)) { #if RAPTOR_DEBUG > 1 RAPTOR_DEBUG2("Already seen XSLT URI '%s'\n", raptor_uri_as_string_v2(grddl_parser->world, uri)); #endif grddl_free_xml_context(grddl_parser->world, xml_context); return; } } RAPTOR_DEBUG3("Adding new document transformation XSLT 
URI %s with base URI %s\n", (uri ? (const char*)raptor_uri_as_string_v2(grddl_parser->world, uri): "(NONE)"), (xml_context->base_uri ? (const char*)raptor_uri_as_string_v2(grddl_parser->world, xml_context->base_uri) : "(NONE)")); raptor_sequence_push(grddl_parser->doc_transform_uris, xml_context); } static void raptor_grddl_filter_triples(void *user_data, const raptor_statement *statement) { raptor_parser* rdf_parser=(raptor_parser*)user_data; raptor_grddl_parser_context* grddl_parser; int i; raptor_uri* predicate_uri; grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; /* Look for a triple */ if(!statement->subject_type == RAPTOR_IDENTIFIER_TYPE_RESOURCE || !statement->predicate_type == RAPTOR_IDENTIFIER_TYPE_RESOURCE || !statement->object_type == RAPTOR_IDENTIFIER_TYPE_RESOURCE) return; #if RAPTOR_DEBUG > 2 RAPTOR_DEBUG2("Parser %p: Relaying statement: ", rdf_parser); raptor_print_statement(statement, stderr); fputc('\n', stderr); #endif #if RAPTOR_DEBUG > 1 RAPTOR_DEBUG3("Parser %p: Checking against %d profile URIs\n", rdf_parser, raptor_sequence_size(grddl_parser->profile_uris)); #endif /* Look for (i=0, root namespace URI) * data-view:namespaceTransformation ?tr * or (i>0, profile URIs) * data-view:profileTransformation ?tr * and then ?tr becomes a new document transformation URI */ predicate_uri=grddl_parser->namespace_transformation_uri; for(i=0; i < raptor_sequence_size(grddl_parser->profile_uris); i++) { grddl_xml_context* xml_context=(grddl_xml_context*)raptor_sequence_get_at(grddl_parser->profile_uris, i); raptor_uri* profile_uri=xml_context->uri; grddl_xml_context* new_xml_context; if(i==1) predicate_uri=grddl_parser->profile_transformation_uri; if(!profile_uri) continue; if(raptor_uri_equals_v2(rdf_parser->world, (raptor_uri*)statement->subject, profile_uri) && raptor_uri_equals_v2(rdf_parser->world, (raptor_uri*)statement->predicate, predicate_uri)) { raptor_uri* uri=(raptor_uri*)statement->object; #if RAPTOR_DEBUG > 1 
RAPTOR_DEBUG4("Parser %p: Matches profile URI #%d '%s'\n", rdf_parser, i, raptor_uri_as_string_v2(rdf_parser->world, profile_uri)); #endif new_xml_context=raptor_new_xml_context(rdf_parser->world, uri, rdf_parser->base_uri); raptor_grddl_add_transform_xml_context(grddl_parser, new_xml_context); } else { #if RAPTOR_DEBUG > 1 RAPTOR_DEBUG4("Parser %p: Failed to match profile URI #%d '%s'\n", rdf_parser, i, raptor_uri_as_string_v2(rdf_parser->world, profile_uri)); #endif } } } static int raptor_grddl_ensure_internal_parser(raptor_parser* rdf_parser, const char* parser_name, int filter) { raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; if(!grddl_parser->internal_parser_name || !strcmp(parser_name, "guess") || strcmp(grddl_parser->internal_parser_name, parser_name)) { /* construct a new parser if none in use or not what is required */ if(grddl_parser->internal_parser) { RAPTOR_DEBUG3("Parser %p: Freeing internal %s parser.\n", rdf_parser, grddl_parser->internal_parser_name); raptor_free_parser(grddl_parser->internal_parser); grddl_parser->internal_parser=NULL; grddl_parser->internal_parser_name=NULL; } RAPTOR_DEBUG3("Parser %p: Allocating new internal %s parser.\n", rdf_parser, parser_name); grddl_parser->internal_parser=raptor_new_parser_v2(rdf_parser->world, parser_name); if(!grddl_parser->internal_parser) { raptor_parser_error(rdf_parser, "Failed to create %s parser", parser_name); return 1; } /* initialise the new parser with the outer state */ grddl_parser->internal_parser_name=parser_name; if(raptor_parser_copy_user_state(grddl_parser->internal_parser, rdf_parser)) return 1; grddl_parser->saved_user_data=rdf_parser->user_data; grddl_parser->saved_statement_handler=rdf_parser->statement_handler; } /* Filter the triples for profile/namespace URIs */ if(filter) { grddl_parser->internal_parser->user_data= rdf_parser; grddl_parser->internal_parser->statement_handler= raptor_grddl_filter_triples; } else { 
grddl_parser->internal_parser->user_data= grddl_parser->saved_user_data; grddl_parser->internal_parser->statement_handler= grddl_parser->saved_statement_handler; } return 0; } /* Run a GRDDL transform using a pre-parsed XSLT stylesheet already * formed into a libxml document (with URI) */ static int raptor_grddl_run_grddl_transform_doc(raptor_parser* rdf_parser, grddl_xml_context* xml_context, xmlDocPtr xslt_doc, xmlDocPtr doc) { raptor_world* world = rdf_parser->world; raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; int ret=0; xsltStylesheetPtr sheet=NULL; xmlDocPtr res=NULL; xmlChar *doc_txt=NULL; int doc_txt_len=0; const char* parser_name; const char* params[7]; const unsigned char* base_uri_string; size_t base_uri_len; raptor_uri* xslt_uri; raptor_uri* base_uri; char *quoted_base_uri=NULL; xmlGenericErrorFunc saved_xsltGenericError; void *saved_xsltGenericErrorContext; xsltTransformContextPtr userCtxt; xslt_uri=xml_context->uri; base_uri=xml_context->base_uri ? 
xml_context->base_uri : xml_context->uri; base_uri_string=raptor_uri_as_counted_string_v2(rdf_parser->world, base_uri, &base_uri_len); RAPTOR_DEBUG3("Running GRDDL transform with XSLT URI '%s' with doc base URI '%s'\n", raptor_uri_as_string_v2(rdf_parser->world, xslt_uri), base_uri_string); sheet = xsltParseStylesheetDoc(xslt_doc); if(!sheet) { raptor_parser_error(rdf_parser, "Failed to parse stylesheet in '%s'", raptor_uri_as_string_v2(rdf_parser->world, xslt_uri)); ret=1; goto cleanup_xslt; } userCtxt = xsltNewTransformContext(sheet, doc); if(world->xslt_security_preferences) xsltSetCtxtSecurityPrefs((xsltSecurityPrefs*)world->xslt_security_preferences, userCtxt); saved_xsltGenericError = xsltGenericError; saved_xsltGenericErrorContext = xsltGenericErrorContext; xsltSetGenericErrorFunc(rdf_parser, raptor_grddl_xsltGenericError_handler); #if 1 /* FIXME: * Define 'base', 'Base' and 'url' params to allow some XSLT sheets to work: * base: * http://www.w3.org/2000/07/uri43/uri.xsl * Base: * http://www.w3.org/2000/08/w3c-synd/home2rss.xsl * url: (optional) * http://www.w3.org/2001/sw/grddl-wg/td/RDFa2RDFXML.xsl */ quoted_base_uri=(char*)RAPTOR_MALLOC(cstring, base_uri_len+3); quoted_base_uri[0]='\''; strncpy(quoted_base_uri+1, (const char*)base_uri_string, base_uri_len); quoted_base_uri[base_uri_len+1]='\''; quoted_base_uri[base_uri_len+2]='\0'; params[0]="base"; params[1]=(const char*)quoted_base_uri; params[2]="Base"; params[3]=(const char*)quoted_base_uri; params[4]="url"; params[5]=(const char*)quoted_base_uri; params[6]=NULL; res = xsltApplyStylesheetUser(sheet, doc, params, NULL, NULL, userCtxt); #else /* No params */ res = xsltApplyStylesheetUser(sheet, doc, NULL, NULL, NULL, userCtxt); #endif if(!res) { raptor_parser_error(rdf_parser, "Failed to apply stylesheet in '%s'", raptor_uri_as_string_v2(rdf_parser->world, xslt_uri)); ret=1; goto cleanup_xslt; } if(res->type == XML_HTML_DOCUMENT_NODE) { if(sheet->method != NULL) xmlFree(sheet->method); sheet->method = 
(xmlChar*)xmlMalloc(5); strncpy((char*)sheet->method, "html", 5); } /* write the resulting XML to a string */ xsltSaveResultToString(&doc_txt, &doc_txt_len, res, sheet); if(!doc_txt || !doc_txt_len) { /* FIXME: continue with an empty document? */ raptor_parser_warning(rdf_parser, "XSLT returned an empty document"); goto cleanup_xslt; } RAPTOR_DEBUG4("XSLT returned %d bytes document method %s media type %s\n", doc_txt_len, (sheet->method ? (const char*)sheet->method : "NULL"), (sheet->mediaType ? (const char*)sheet->mediaType : "NULL")); /* FIXME: Assumes mime types for XSLT */ if(sheet->mediaType == NULL && sheet->method) { if(!(strcmp((const char*)sheet->method, "text"))) { sheet->mediaType = (xmlChar*)xmlMalloc(11); strncpy((char*)sheet->mediaType, "text/plain",11); } else if(!(strcmp((const char*)sheet->method, "xml"))) { sheet->mediaType = (xmlChar*)xmlMalloc(16); strncpy((char*)sheet->mediaType, "application/xml",16); } else if(!(strcmp((const char*)sheet->method, "html"))) { sheet->mediaType = (xmlChar*)xmlMalloc(10); /* FIXME: use xhtml mime type? 
*/ strncpy((char*)sheet->mediaType, "text/html",10); } } /* FIXME: Assume all that all media XML is RDF/XML and also that * with no information at all we have RDF/XML */ if(!sheet->mediaType || (sheet->mediaType && !strcmp((const char*)sheet->mediaType, "application/xml"))) { if(sheet->mediaType) xmlFree(sheet->mediaType); sheet->mediaType = (xmlChar*)xmlMalloc(20); strncpy((char*)sheet->mediaType, "application/rdf+xml",20); } parser_name=raptor_guess_parser_name_v2(rdf_parser->world, NULL, (const char*)sheet->mediaType, doc_txt, doc_txt_len, NULL); if(!parser_name) { RAPTOR_DEBUG3("Parser %p: Guessed no parser from mime type '%s' and content - ending", rdf_parser, sheet->mediaType); goto cleanup_xslt; } RAPTOR_DEBUG4("Parser %p: Guessed parser %s from mime type '%s' and content\n", rdf_parser, parser_name, sheet->mediaType); if(!strcmp((const char*)parser_name, "grddl")) { RAPTOR_DEBUG2("Parser %p: Ignoring guess to run grddl parser - ending", rdf_parser); goto cleanup_xslt; } ret=raptor_grddl_ensure_internal_parser(rdf_parser, parser_name, 0); if(ret) goto cleanup_xslt; if(grddl_parser->internal_parser) { grddl_parser->internal_parser->default_generate_id_handler_base= raptor_parser_get_current_base_id(rdf_parser); /* generate the triples */ raptor_start_parse(grddl_parser->internal_parser, base_uri); raptor_parse_chunk(grddl_parser->internal_parser, doc_txt, doc_txt_len, 1); rdf_parser->default_generate_id_handler_base= raptor_parser_get_current_base_id(grddl_parser->internal_parser); } cleanup_xslt: if(userCtxt) xsltFreeTransformContext(userCtxt); if(quoted_base_uri) RAPTOR_FREE(cstring, quoted_base_uri); if(doc_txt) xmlFree(doc_txt); if(res) xmlFreeDoc(res); if(sheet) xsltFreeStylesheet(sheet); xsltSetGenericErrorFunc(saved_xsltGenericErrorContext, saved_xsltGenericError); return ret; } typedef struct { raptor_parser* rdf_parser; xmlParserCtxtPtr xc; raptor_uri* base_uri; } raptor_grddl_xml_parse_bytes_context; static void 
raptor_grddl_uri_xml_parse_bytes(raptor_www* www, void *userdata, const void *ptr, size_t size, size_t nmemb) { raptor_grddl_xml_parse_bytes_context* xpbc=(raptor_grddl_xml_parse_bytes_context*)userdata; int len=size*nmemb; int rc=0; if(!xpbc->xc) { xmlParserCtxtPtr xc; xc = xmlCreatePushParserCtxt(NULL, NULL, (const char*)ptr, len, (const char*)raptor_uri_as_string_v2(www->world, xpbc->base_uri)); if(!xc) rc=1; else { int libxml_options = 0; #ifdef RAPTOR_LIBXML_XML_PARSE_NONET if(xpbc->rdf_parser->features[RAPTOR_FEATURE_NO_NET]) libxml_options |= XML_PARSE_NONET; #endif #ifdef HAVE_XMLCTXTUSEOPTIONS xmlCtxtUseOptions(xc, libxml_options); #endif xc->replaceEntities = 1; xc->loadsubset = 1; } xpbc->xc=xc; } else rc=xmlParseChunk(xpbc->xc, (const char*)ptr, len, 0); if(rc) raptor_parser_error(xpbc->rdf_parser, "XML Parsing failed"); } static void raptor_grddl_discard_message(void *user_data, raptor_locator* locator, const char *message) { #ifdef RAPTOR_DEBUG raptor_world* world=(raptor_world*)user_data; RAPTOR_DEBUG3("%s: Discarded error message: %s\n", raptor_uri_as_string_v2(world, locator->uri), message); #endif return; } #define FETCH_IGNORE_ERRORS 1 #define FETCH_ACCEPT_XSLT 2 static int raptor_grddl_fetch_uri(raptor_parser* rdf_parser, raptor_uri* uri, raptor_www_write_bytes_handler write_bytes_handler, void* write_bytes_user_data, raptor_www_content_type_handler content_type_handler, void* content_type_user_data, int flags) { raptor_www *www; const char *accept_h; int ret=0; int ignore_errors=(flags & FETCH_IGNORE_ERRORS); if(rdf_parser->features[RAPTOR_FEATURE_NO_NET]) { if(!raptor_uri_uri_string_is_file_uri(raptor_uri_as_string_v2(rdf_parser->world, uri))) return 1; } www=raptor_www_new_v2(rdf_parser->world); if(!www) return 1; raptor_www_set_user_agent(www, "grddl/0.1"); if(flags & FETCH_ACCEPT_XSLT) { raptor_www_set_http_accept(www, "application/xml"); } else { accept_h=raptor_parser_get_accept_header(rdf_parser); if(accept_h) { 
raptor_www_set_http_accept(www, accept_h); RAPTOR_FREE(cstring, accept_h); } } if(rdf_parser->uri_filter) raptor_www_set_uri_filter(www, rdf_parser->uri_filter, rdf_parser->uri_filter_user_data); if(ignore_errors) raptor_www_set_error_handler(www, raptor_grddl_discard_message, rdf_parser->world); else raptor_www_set_error_handler(www, rdf_parser->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].handler, rdf_parser->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].user_data); raptor_www_set_write_bytes_handler(www, write_bytes_handler, write_bytes_user_data); raptor_www_set_content_type_handler(www, content_type_handler, content_type_user_data); if(rdf_parser->features[RAPTOR_FEATURE_WWW_TIMEOUT] > 0) raptor_www_set_connection_timeout(www, rdf_parser->features[RAPTOR_FEATURE_WWW_TIMEOUT]); ret=raptor_www_fetch(www, uri); raptor_www_free(www); return ret; } /* Run a GRDDL transform using a XSLT stylesheet at a given URI */ static int raptor_grddl_run_grddl_transform_uri(raptor_parser* rdf_parser, grddl_xml_context* xml_context, xmlDocPtr doc) { raptor_grddl_parser_context* grddl_parser; xmlParserCtxtPtr xslt_ctxt=NULL; raptor_grddl_xml_parse_bytes_context xpbc; int ret=0; raptor_uri* xslt_uri; raptor_uri* base_uri; raptor_uri* old_locator_uri; raptor_locator *locator=&rdf_parser->locator; xslt_uri=xml_context->uri; base_uri=xml_context->base_uri ? 
xml_context->base_uri : xml_context->uri; grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; RAPTOR_DEBUG3("Running GRDDL transform with XSLT URI %s and base URI %s\n", raptor_uri_as_string_v2(rdf_parser->world, xslt_uri), raptor_uri_as_string_v2(rdf_parser->world, base_uri)); /* make an xsltStylesheetPtr via the raptor_grddl_uri_xml_parse_bytes * callback as bytes are returned */ xpbc.xc=NULL; xpbc.rdf_parser=rdf_parser; xpbc.base_uri=xslt_uri; old_locator_uri=locator->uri; locator->uri=xslt_uri; ret=raptor_grddl_fetch_uri(rdf_parser, xslt_uri, raptor_grddl_uri_xml_parse_bytes, &xpbc, NULL, NULL, FETCH_ACCEPT_XSLT); xslt_ctxt = xpbc.xc; if(ret) { locator->uri=old_locator_uri; raptor_parser_warning(rdf_parser, "Fetching XSLT document URI '%s' failed", raptor_uri_as_string_v2(rdf_parser->world, xslt_uri)); ret=0; } else { xmlParseChunk(xpbc.xc, NULL, 0, 1); ret=raptor_grddl_run_grddl_transform_doc(rdf_parser, xml_context, xslt_ctxt->myDoc, doc); locator->uri=old_locator_uri; } if(xslt_ctxt) xmlFreeParserCtxt(xslt_ctxt); return ret; } static int raptor_grddl_seen_uri(raptor_grddl_parser_context* grddl_parser, raptor_uri* uri) { int i; int seen=0; raptor_sequence* seq=grddl_parser->visited_uris; for(i=0; i < raptor_sequence_size(seq); i++) { raptor_uri* vuri=(raptor_uri*)raptor_sequence_get_at(seq, i); if(raptor_uri_equals_v2(grddl_parser->world, uri, vuri)) { seen=1; break; } } if(seen) RAPTOR_DEBUG2("Already seen URI '%s'\n", raptor_uri_as_string_v2(grddl_parser->world, uri)); return seen; } static void raptor_grddl_done_uri(raptor_grddl_parser_context* grddl_parser, raptor_uri* uri) { if(!grddl_parser->visited_uris) return; if(!raptor_grddl_seen_uri(grddl_parser, uri)) { raptor_sequence* seq=grddl_parser->visited_uris; raptor_sequence_push(seq, raptor_uri_copy_v2(grddl_parser->world, uri)); } } static raptor_sequence* raptor_grddl_run_xpath_match(raptor_parser* rdf_parser, xmlDocPtr doc, const xmlChar* xpathExpr, int flags) { 
raptor_grddl_parser_context* grddl_parser; /* Evaluate xpath expression */ xmlXPathObjectPtr xpathObj=NULL; raptor_sequence* seq=NULL; xmlNodeSetPtr nodes; int i; grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; seq=raptor_new_sequence((raptor_sequence_free_handler*)grddl_free_xml_context, NULL); /* Evaluate xpath expression */ xpathObj = xmlXPathEvalExpression(xpathExpr, grddl_parser->xpathCtx); if(!xpathObj) { raptor_parser_error(rdf_parser, "Unable to evaluate XPath expression \"%s\"", xpathExpr); raptor_free_sequence(seq); seq=NULL; goto cleanup_xpath_match; } nodes=xpathObj->nodesetval; if(!nodes || xmlXPathNodeSetIsEmpty(nodes)) { #if RAPTOR_DEBUG > 1 RAPTOR_DEBUG3("No match found with XPath expression \"%s\" over '%s'\n", xpathExpr, raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri)); #endif raptor_free_sequence(seq); seq=NULL; goto cleanup_xpath_match; } #if RAPTOR_DEBUG > 1 RAPTOR_DEBUG3("Found match with XPath expression \"%s\" over '%s'\n", xpathExpr, raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri)); #endif for(i=0; i < xmlXPathNodeSetGetLength(nodes); i++) { xmlNodePtr node=nodes->nodeTab[i]; const unsigned char* uri_string=NULL; xmlChar *base_uri_string; raptor_uri* base_uri=NULL; raptor_uri* uri=NULL; if(node->type != XML_ATTRIBUTE_NODE && node->type != XML_ELEMENT_NODE) { raptor_parser_error(rdf_parser, "Got unexpected node type %d", node->type); continue; } /* xmlNodeGetBase() returns base URI or NULL and must be freed * with xmlFree() */ if(grddl_parser->html_base_processing) { xmlElementType savedType=doc->type; doc->type=XML_HTML_DOCUMENT_NODE; base_uri_string=xmlNodeGetBase(doc, node); doc->type=savedType; } else base_uri_string=xmlNodeGetBase(doc, node); if(node->type == XML_ATTRIBUTE_NODE) uri_string=(const unsigned char*)node->children->content; else { /* XML_ELEMENT_NODE */ if(node->ns) uri_string=(const unsigned char*)node->ns->href; } if(base_uri_string) { 
base_uri=raptor_new_uri_v2(rdf_parser->world, base_uri_string); xmlFree(base_uri_string); #if RAPTOR_DEBUG > 1 RAPTOR_DEBUG2("XML base URI of match is '%s'\n", raptor_uri_as_string_v2(rdf_parser->world, base_uri)); #endif } else if(rdf_parser->base_uri) base_uri=raptor_uri_copy_v2(rdf_parser->world, rdf_parser->base_uri); else base_uri=NULL; if(flags & MATCH_IS_VALUE_LIST) { char *start; char *end; char* buffer; size_t list_len=strlen((const char*)uri_string); buffer=(char*)RAPTOR_MALLOC(cstring, list_len+1); strncpy(buffer, (const char*)uri_string, list_len+1); for(start=end=buffer; end; start=end+1) { grddl_xml_context* xml_context; end=strchr(start, ' '); if(end) *end='\0'; if(start == end) continue; #if RAPTOR_DEBUG RAPTOR_DEBUG2("Got list match URI '%s'\n", start); #endif uri=raptor_new_uri_relative_to_base_v2(rdf_parser->world, base_uri, (const unsigned char*)start); if(flags & MATCH_IS_PROFILE && !strcmp((const char*)raptor_uri_as_string_v2(rdf_parser->world, uri), "http://www.w3.org/2003/g/data-view'")) { raptor_free_uri_v2(rdf_parser->world, uri); continue; } xml_context=raptor_new_xml_context(rdf_parser->world, uri, base_uri); raptor_sequence_push(seq, xml_context); } RAPTOR_FREE(cstring, buffer); } else if (flags & MATCH_IS_HARDCODED) { #if RAPTOR_DEBUG RAPTOR_DEBUG2("Got hardcoded XSLT match for %s\n", xpathExpr); #endif /* return at first match, that's enough */ break; } else { grddl_xml_context* xml_context; #if RAPTOR_DEBUG RAPTOR_DEBUG2("Got single match URI '%s'\n", uri_string); #endif uri=raptor_new_uri_relative_to_base_v2(rdf_parser->world, base_uri, uri_string); xml_context=raptor_new_xml_context(rdf_parser->world, uri, base_uri); raptor_sequence_push(seq, xml_context); raptor_free_uri_v2(rdf_parser->world, uri); } if(base_uri) raptor_free_uri_v2(rdf_parser->world, base_uri); } cleanup_xpath_match: if(xpathObj) xmlXPathFreeObject(xpathObj); return seq; } static void raptor_grddl_check_recursive_content_type_handler(raptor_www* www, void* 
userdata, const char* content_type) { raptor_parser* rdf_parser=(raptor_parser*)userdata; raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; size_t len; if(!content_type) return; len=strlen(content_type)+1; if(grddl_parser->content_type) RAPTOR_FREE(cstring,grddl_parser->content_type); grddl_parser->content_type=(char*)RAPTOR_MALLOC(cstring, len+1); strncpy(grddl_parser->content_type, content_type, len+1); if(!strncmp(content_type, "application/rdf+xml", 19)) { grddl_parser->process_this_as_rdfxml=1; RAPTOR_DEBUG2("Parser %p: Found RDF/XML content type\n", rdf_parser); raptor_parser_save_content(rdf_parser, 1); } if(!strncmp(content_type, "text/html", 9) || !strncmp(content_type, "application/html+xml", 20)) { RAPTOR_DEBUG3("Parser %p: Found HTML content type '%s'\n", rdf_parser, content_type); grddl_parser->html_base_processing=1; } } #define RECURSIVE_FLAGS_IGNORE_ERRORS 1 #define RECURSIVE_FLAGS_FILTER 2 static int raptor_grddl_run_recursive(raptor_parser* rdf_parser, raptor_uri* uri, const char *parser_name, int flags) { raptor_grddl_parser_context* grddl_parser; raptor_www_content_type_handler content_type_handler=NULL; int ret=0; const unsigned char* ibuffer=NULL; size_t ibuffer_len=0; raptor_parse_bytes_context rpbc; int ignore_errors=(flags & RECURSIVE_FLAGS_IGNORE_ERRORS) > 0; int filter=(flags & RECURSIVE_FLAGS_FILTER) > 0; int fetch_uri_flags=0; int is_grddl=!strcmp(parser_name, "grddl"); grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; if(raptor_grddl_seen_uri(grddl_parser, uri)) return 0; if(is_grddl) content_type_handler=raptor_grddl_check_recursive_content_type_handler; if(raptor_grddl_ensure_internal_parser(rdf_parser, parser_name, filter)) return !ignore_errors; RAPTOR_DEBUG3("Running recursive %s operation on URI '%s'\n", parser_name, raptor_uri_as_string_v2(rdf_parser->world, uri)); grddl_parser->internal_parser->default_generate_id_handler_base= 
raptor_parser_get_current_base_id(rdf_parser); if(is_grddl) raptor_grddl_parser_add_parent(grddl_parser->internal_parser, grddl_parser); rpbc.rdf_parser=grddl_parser->internal_parser; rpbc.base_uri=NULL; rpbc.final_uri=NULL; rpbc.started=0; if(ignore_errors) fetch_uri_flags |=FETCH_IGNORE_ERRORS; if(raptor_grddl_fetch_uri(grddl_parser->internal_parser, uri, raptor_parse_uri_write_bytes, &rpbc, content_type_handler, grddl_parser->internal_parser, fetch_uri_flags)) { if(!ignore_errors) raptor_parser_warning(rdf_parser, "Fetching GRDDL document URI '%s' failed\n", raptor_uri_as_string_v2(rdf_parser->world, uri)); ret=0; goto tidy; } if(ignore_errors) { raptor_error_handlers* eh=&grddl_parser->internal_parser->error_handlers; int i; /* NOTE not setting RAPTOR_LOG_LEVEL_NONE handler */ for(i=1; i <= (int)eh->last_log_level; i++) { eh->handlers[i].handler=raptor_grddl_discard_message; eh->handlers[i].user_data=rdf_parser->world; } } raptor_parse_chunk(grddl_parser->internal_parser, NULL, 0, 1); rdf_parser->default_generate_id_handler_base= raptor_parser_get_current_base_id(grddl_parser->internal_parser); /* If content was saved, process it as RDF/XML */ ibuffer=raptor_parser_get_content(grddl_parser->internal_parser, &ibuffer_len); if(ibuffer && strcmp(parser_name, "rdfxml")) { RAPTOR_DEBUG2("Running additional RDF/XML parse on URI '%s' content\n", raptor_uri_as_string_v2(rdf_parser->world, uri)); if(raptor_grddl_ensure_internal_parser(rdf_parser, "rdfxml", 1)) ret=1; else { grddl_parser->internal_parser->default_generate_id_handler_base= raptor_parser_get_current_base_id(rdf_parser); if(raptor_start_parse(grddl_parser->internal_parser, uri)) ret=1; else { ret=raptor_parse_chunk(grddl_parser->internal_parser, ibuffer, ibuffer_len, 1); rdf_parser->default_generate_id_handler_base= raptor_parser_get_current_base_id(grddl_parser->internal_parser); } } RAPTOR_FREE(cstring, ibuffer); raptor_parser_save_content(grddl_parser->internal_parser, 0); } if(rpbc.final_uri) 
raptor_free_uri_v2(rdf_parser->world, rpbc.final_uri); if(ignore_errors) ret=0; tidy: return ret; } static void raptor_grddl_libxml_discard_error(void* user_data, const char *msg, ...) { return; } static int raptor_grddl_parse_chunk(raptor_parser* rdf_parser, const unsigned char *s, size_t len, int is_end) { raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; int i; int ret=0; const unsigned char* uri_string; raptor_uri* uri; /* XML document DOM */ xmlDocPtr doc; int expri; const unsigned char* buffer=NULL; size_t buffer_len=0; int buffer_is_libxml=0; int loop; raptor_error_handlers eh; if(grddl_parser->content_type && !grddl_parser->content_type_check) { grddl_parser->content_type_check++; if(!strncmp(grddl_parser->content_type, "application/rdf+xml", 19)) { RAPTOR_DEBUG3("Parser %p: Found document with type '%s' is RDF/XML\n", rdf_parser, grddl_parser->content_type); grddl_parser->process_this_as_rdfxml=1; } if(!strncmp(grddl_parser->content_type, "text/html", 9) || !strncmp(grddl_parser->content_type, "application/html+xml", 20)) { RAPTOR_DEBUG3("Parser %p: Found document with type '%s' is HTML\n", rdf_parser, grddl_parser->content_type); grddl_parser->html_base_processing=1; } } if(!grddl_parser->sb) grddl_parser->sb=raptor_new_stringbuffer(); raptor_stringbuffer_append_counted_string(grddl_parser->sb, s, len, 1); if(!is_end) return 0; buffer_len=raptor_stringbuffer_length(grddl_parser->sb); buffer=(const unsigned char*)RAPTOR_MALLOC(cstring, buffer_len+1); if(buffer) raptor_stringbuffer_copy_to_string(grddl_parser->sb, (unsigned char*)buffer, buffer_len); uri_string=raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri); if(1) { raptor_error_handlers_init_v2(rdf_parser->world, &eh); eh.last_log_level=rdf_parser->error_handlers.last_log_level; /* Save error handlers and discard parsing errors * NOTE not setting RAPTOR_LOG_LEVEL_NONE handler */ for(i = RAPTOR_LOG_LEVEL_NONE; i <= (int)eh.last_log_level; i++) { 
/* (Middle part of raptor_grddl_parse_chunk(): this span starts inside the
 * loop that saves the parser's error handlers and runs up to the start of
 * the final cleanup.)  Remember handler i so it can be put back later. */
      eh.handlers[i].handler = rdf_parser->error_handlers.handlers[i].handler;
      eh.handlers[i].user_data = rdf_parser->error_handlers.handlers[i].user_data;

      /* All levels above RAPTOR_LOG_LEVEL_NONE are pointed at the discard
       * handler while the XML / HTML parse attempts run */
      if(i > RAPTOR_LOG_LEVEL_NONE) {
        rdf_parser->error_handlers.handlers[i].handler = raptor_grddl_discard_message;
        rdf_parser->error_handlers.handlers[i].user_data = rdf_parser->world;
      }
    }
  }

  RAPTOR_DEBUG4("Parser %p: URI %s: processing %d bytes of content\n",
                rdf_parser, uri_string, (int)buffer_len);

  /* Two attempts: loop 0 parses the buffer as XML, loop 1 as HTML.
   * The loop is left as soon as one attempt parses without error. */
  for(loop=0; loop<2; loop++) {
    int rc;

    if(loop == 0) {
      int libxml_options = 0;

      RAPTOR_DEBUG2("Parser %p: Creating an XML parser\n", rdf_parser);

      /* try to create an XML parser context; the whole buffer is handed
       * over as the initial chunk */
      grddl_parser->xml_ctxt = xmlCreatePushParserCtxt(NULL, NULL,
                                                       (const char*)buffer,
                                                       buffer_len,
                                                       (const char*)uri_string);
      if(!grddl_parser->xml_ctxt) {
        RAPTOR_DEBUG2("Parser %p: Creating an XML parser failed\n", rdf_parser);
        continue;
      }

#ifdef RAPTOR_LIBXML_XML_PARSE_NONET
      if(rdf_parser->features[RAPTOR_FEATURE_NO_NET])
        libxml_options |= XML_PARSE_NONET;
#endif
#ifdef HAVE_XMLCTXTUSEOPTIONS
      xmlCtxtUseOptions(grddl_parser->xml_ctxt, libxml_options);
#endif

      /* Validation messages are dropped */
      grddl_parser->xml_ctxt->vctxt.warning = raptor_grddl_libxml_discard_error;
      grddl_parser->xml_ctxt->vctxt.error = raptor_grddl_libxml_discard_error;

      grddl_parser->xml_ctxt->replaceEntities = 1;
      grddl_parser->xml_ctxt->loadsubset = 1;
    } else if (loop == 1) {
      /* try to create an HTML parser context - only when the
       * RAPTOR_FEATURE_HTML_TAG_SOUP feature is enabled */
      if(rdf_parser->features[RAPTOR_FEATURE_HTML_TAG_SOUP]) {
        xmlCharEncoding enc;
        int options;

        RAPTOR_DEBUG2("Parser %p: Creating an HTML parser\n", rdf_parser);

        enc = xmlDetectCharEncoding((const unsigned char*)buffer, buffer_len);
        grddl_parser->html_ctxt = htmlCreatePushParserCtxt(/*sax*/ NULL,
                                                           /*user_data*/ NULL,
                                                           (const char *)buffer,
                                                           buffer_len,
                                                           (const char *)uri_string,
                                                           enc);
        if(!grddl_parser->html_ctxt) {
          RAPTOR_DEBUG2("Parser %p: Creating an HTML parser failed\n", rdf_parser);
          continue;
        }

        /* HTML parser */
        grddl_parser->html_ctxt->replaceEntities = 1;
        grddl_parser->html_ctxt->loadsubset = 1;
        grddl_parser->html_ctxt->vctxt.error = raptor_grddl_libxml_discard_error;

        /* HTML_PARSE_NOWARNING disables sax->warning, vctxt.warning */
        /* HTML_PARSE_NOERROR disables sax->error, vctxt.error */
        options = HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING;
#ifdef HTML_PARSE_RECOVER
        options |= HTML_PARSE_RECOVER;
#endif
#ifdef RAPTOR_LIBXML_HTML_PARSE_NONET
        if(rdf_parser->features[RAPTOR_FEATURE_NO_NET])
          options |= HTML_PARSE_NONET;
#endif

        htmlCtxtUseOptions(grddl_parser->html_ctxt, options);
      } else
        continue;
    } else
      continue;

    xmlSetStructuredErrorFunc(&rdf_parser->error_handlers,
                              raptor_libxml_xmlStructuredErrorFunc);

    /* The content was already pushed when the context was created, so
     * the chunk call below passes 0 bytes with the final argument set
     * to 1 to finish the parse.  On failure the context (and any partial
     * document) is freed and its pointer cleared, which is what the
     * "no parser" test after the loop relies on. */
    rc=0;
    if(grddl_parser->html_ctxt) {
      RAPTOR_DEBUG2("Parser %p: Parsing as HTML\n", rdf_parser);
      rc=htmlParseChunk(grddl_parser->html_ctxt, (const char*)s, 0, 1);
      RAPTOR_DEBUG3("Parser %p: Parsing as HTML %s\n", rdf_parser,
                    (rc ? "failed" : "succeeded"));
      if(rc) {
        if(grddl_parser->html_ctxt->myDoc) {
          xmlFreeDoc(grddl_parser->html_ctxt->myDoc);
          grddl_parser->html_ctxt->myDoc=NULL;
        }
        htmlFreeParserCtxt(grddl_parser->html_ctxt);
        grddl_parser->html_ctxt=NULL;
      }
    } else {
      RAPTOR_DEBUG2("Parser %p: Parsing as XML\n", rdf_parser);
      rc=xmlParseChunk(grddl_parser->xml_ctxt, (const char*)s, 0, 1);
      RAPTOR_DEBUG3("Parser %p: Parsing as XML %s\n", rdf_parser,
                    (rc ? "failed" : "succeeded"));
      if(rc) {
        if(grddl_parser->xml_ctxt->myDoc) {
          xmlFreeDoc(grddl_parser->xml_ctxt->myDoc);
          grddl_parser->xml_ctxt->myDoc=NULL;
        }
        xmlFreeParserCtxt(grddl_parser->xml_ctxt);
        grddl_parser->xml_ctxt=NULL;
      }
    }

    if(!rc)
      break;
  }

  if(1) {
    /* Restore error handlers (the same levels that were overridden) */
    for(i = RAPTOR_LOG_LEVEL_NONE + 1; i<= (int)eh.last_log_level; i++) {
      rdf_parser->error_handlers.handlers[i].handler = eh.handlers[i].handler;
      rdf_parser->error_handlers.handlers[i].user_data = eh.handlers[i].user_data;
    }
  }

  /* Both pointers are NULL when neither attempt produced a usable parse */
  if(!grddl_parser->html_ctxt && !grddl_parser->xml_ctxt) {
    raptor_parser_error(rdf_parser, "Failed to create HTML or XML parsers");
    ret=1;
    goto tidy;
  }

  raptor_grddl_done_uri(grddl_parser, rdf_parser->base_uri);

  /* The DOM is owned by whichever parser context succeeded */
  if(grddl_parser->html_ctxt)
    doc=grddl_parser->html_ctxt->myDoc;
  else
    doc=grddl_parser->xml_ctxt->myDoc;
  if(!doc) {
    raptor_parser_error(rdf_parser,
                        "Failed to create XML DOM for GRDDL document");
    ret=1;
    goto tidy;
  }

  /* With GRDDL processing off, skip discovery and only apply the
   * transformations already queued in doc_transform_uris */
  if(!grddl_parser->grddl_processing)
    goto transform;

  if(grddl_parser->xinclude_processing) {
    RAPTOR_DEBUG3("Parser %p: Running XInclude processing on URI '%s'\n",
                  rdf_parser,
                  raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri));
    if(xmlXIncludeProcess(doc) < 0) {
      raptor_parser_error(rdf_parser,
                          "XInclude processing failed for GRDDL document");
      ret=1;
      goto tidy;
    } else {
      int blen;

      /* write the result of XML Include to buffer; from here on buffer
       * is libxml2-allocated, which buffer_is_libxml records so that the
       * cleanup code releases it with xmlFree() */
      RAPTOR_FREE(cstring, buffer);
      xmlDocDumpFormatMemory(doc, (xmlChar**)&buffer, &blen,
                             1 /* indent the result */);
      buffer_len=blen;
      buffer_is_libxml=1;

      RAPTOR_DEBUG3("Parser %p: XML Include processing returned %d bytes document\n",
                    rdf_parser, (int)buffer_len);
    }
  }

  RAPTOR_DEBUG3("Parser %p: Running top-level GRDDL on URI '%s'\n",
                rdf_parser,
                raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri));

  /* Work out if there is a root namespace URI */
  if(1) {
    xmlNodePtr xnp;
    xmlNsPtr rootNs = NULL;
    const unsigned char* ns_uri_string=NULL;

    xnp = xmlDocGetRootElement(doc);
    if(xnp) {
      rootNs = xnp->ns;
      if(rootNs)
        ns_uri_string = (const unsigned char*)(rootNs->href);
    }

    if(ns_uri_string) {
      int n;

      RAPTOR_DEBUG3("Parser %p: Root namespace URI is %s\n",
                    rdf_parser, ns_uri_string);

      /* A root element rdf:RDF means the document itself is RDF/XML */
      if(!strcmp((const char*)ns_uri_string,
                 (const char*)raptor_rdf_namespace_uri) &&
         !strcmp((const char*)xnp->name, "RDF")) {
        RAPTOR_DEBUG3("Parser %p: Root element of %s is rdf:RDF - process this as RDF/XML later\n",
                      rdf_parser,
                      raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri));
        grddl_parser->process_this_as_rdfxml=1;
      }

      /* Namespaces on the ignore list are never dereferenced */
      for(n=0; grddl_namespace_uris_ignore_list[n]; n++) {
        if(!strcmp(grddl_namespace_uris_ignore_list[n],
                   (const char*)ns_uri_string)) {
          /* ignore this namespace */
          RAPTOR_DEBUG3("Parser %p: Ignoring GRDDL for namespace URI '%s'\n",
                        rdf_parser, ns_uri_string);
          ns_uri_string=NULL;
          break;
        }
      }

      if(ns_uri_string) {
        grddl_xml_context* xml_context;

        /* Record the namespace document as the first profile_uris entry
         * and run GRDDL on it, ignoring errors and filtering triples */
        grddl_parser->root_ns_uri=raptor_new_uri_relative_to_base_v2(rdf_parser->world,
                                                                     rdf_parser->base_uri,
                                                                     ns_uri_string);
        xml_context=raptor_new_xml_context(rdf_parser->world,
                                           grddl_parser->root_ns_uri,
                                           rdf_parser->base_uri);
        raptor_sequence_push(grddl_parser->profile_uris, xml_context);

        RAPTOR_DEBUG3("Parser %p: Processing GRDDL namespace URI '%s'\n",
                      rdf_parser,
                      raptor_uri_as_string_v2(rdf_parser->world, grddl_parser->root_ns_uri));
        raptor_grddl_run_recursive(rdf_parser, grddl_parser->root_ns_uri,
                                   "grddl",
                                   RECURSIVE_FLAGS_IGNORE_ERRORS |
                                   RECURSIVE_FLAGS_FILTER);
      }
    }
  }

  /* Always put something at the start of the list even if NULL
   * so later it can be searched for in output triples */
  if(!grddl_parser->root_ns_uri) {
    grddl_xml_context* xml_context;

    xml_context=raptor_new_xml_context(rdf_parser->world, NULL, NULL);
    raptor_sequence_push(grddl_parser->profile_uris, xml_context);
  }

  /* Create the XPath evaluation context, with the "html" and "dataview"
   * prefixes registered for the expressions used below */
  if(!grddl_parser->xpathCtx) {
    grddl_parser->xpathCtx = xmlXPathNewContext(doc);
    if(!grddl_parser->xpathCtx) {
      raptor_parser_error(rdf_parser,
                          "Failed to create XPath context for GRDDL document");
      ret=1;
      goto tidy;
    }

    xmlXPathRegisterNs(grddl_parser->xpathCtx,
                       (const xmlChar*)"html",
                       (const xmlChar*)"http://www.w3.org/1999/xhtml");
    xmlXPathRegisterNs(grddl_parser->xpathCtx,
                       (const xmlChar*)"dataview",
                       (const xmlChar*)"http://www.w3.org/2003/g/data-view#");
  }

  /* Try head profile URIs (the space-separated @profile attribute
   * of the XHTML head element) */
  if(1) {
    raptor_sequence* result;

    result=raptor_grddl_run_xpath_match(rdf_parser, doc,
                                        (const xmlChar*)"/html:html/html:head/@profile",
                                        MATCH_IS_VALUE_LIST | MATCH_IS_PROFILE);
    if(result) {
      RAPTOR_DEBUG4("Parser %p: Found %d URIs in URI '%s'\n",
                    rdf_parser, raptor_sequence_size(result),
                    raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri));

      /* Store profile URIs, skipping NULLs or the GRDDL profile itself.
       * NOTE(review): xml_context->uri is not NULL-checked before the
       * strcmp() here, unlike the two loops further down - confirm that
       * raptor_grddl_run_xpath_match() never yields a NULL uri. */
      while(raptor_sequence_size(result)) {
        grddl_xml_context* xml_context;

        xml_context=(grddl_xml_context*)raptor_sequence_unshift(result);
        if(!xml_context)
          continue;
        uri=xml_context->uri;
        if(!strcmp("http://www.w3.org/2003/g/data-view",
                   (const char*)raptor_uri_as_string_v2(rdf_parser->world, uri))) {
          RAPTOR_DEBUG3("Ignoring of URI %s: URI %s\n",
                        raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri),
                        raptor_uri_as_string_v2(rdf_parser->world, uri));
          grddl_free_xml_context(rdf_parser->world, xml_context);
          continue;
        }
        /* Ownership of xml_context moves to profile_uris */
        raptor_sequence_push(grddl_parser->profile_uris, xml_context);
      }
      raptor_free_sequence(result);

      /* Recursive GRDDL through all the URIs; starts at 1 because
       * entry 0 is the root namespace entry (or its placeholder)
       * pushed above */
      for(i=1; i < raptor_sequence_size(grddl_parser->profile_uris); i++) {
        grddl_xml_context* xml_context=(grddl_xml_context*)raptor_sequence_get_at(grddl_parser->profile_uris, i);
        uri=xml_context->uri;
        if(!uri)
          continue;

        RAPTOR_DEBUG4("Processing #%d of URI %s: URI %s\n",
                      i,
                      raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri),
                      raptor_uri_as_string_v2(rdf_parser->world, uri));
        ret=raptor_grddl_run_recursive(rdf_parser, uri,
                                       "grddl",
                                       RECURSIVE_FLAGS_IGNORE_ERRORS|
                                       RECURSIVE_FLAGS_FILTER);
      }
    }
  } /* end head profile URIs */

  /* Try XHTML document with alternate forms: head link elements
   * with type "application/rdf+xml"
   *
   * Value of @href is a URI
   */
  if(grddl_parser->html_link_processing &&
     rdf_parser->features[RAPTOR_FEATURE_HTML_LINK]) {
    raptor_sequence* result;

    result=raptor_grddl_run_xpath_match(rdf_parser, doc,
                                        (const xmlChar*)"/html:html/html:head/html:link[@type=\"application/rdf+xml\"]/@href",
                                        0);
    if(result) {
      RAPTOR_DEBUG4("Parser %p: Found %d URIs in URI '%s'\n",
                    rdf_parser, raptor_sequence_size(result),
                    raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri));

      /* Recursively parse all the URIs, skipping NULLs; i only counts
       * the URIs processed, for the debug message */
      i=0;
      while(raptor_sequence_size(result)) {
        grddl_xml_context* xml_context;

        xml_context=(grddl_xml_context*)raptor_sequence_unshift(result);
        if(!xml_context)
          continue;
        uri=xml_context->uri;
        if(uri) {
          RAPTOR_DEBUG4("Processing #%d of URI %s: URI %s\n",
                        i,
                        raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri),
                        raptor_uri_as_string_v2(rdf_parser->world, uri));
          i++;
          /* Linked documents use the "guess" parser rather than "grddl" */
          ret=raptor_grddl_run_recursive(rdf_parser, uri, "guess",
                                         RECURSIVE_FLAGS_IGNORE_ERRORS);
        }
        grddl_free_xml_context(rdf_parser->world, xml_context);
      }

      raptor_free_sequence(result);
    }
  }

  /* Try all XPaths in match_table; hardcoded (microformat) entries are
   * only used when RAPTOR_FEATURE_MICROFORMATS is enabled */
  for(expri=0; match_table[expri].xpath; expri++) {
    raptor_sequence* result;
    int flags=match_table[expri].flags;

    if((flags & MATCH_IS_HARDCODED) &&
       !rdf_parser->features[RAPTOR_FEATURE_MICROFORMATS])
      continue;

    result=raptor_grddl_run_xpath_match(rdf_parser, doc,
                                        match_table[expri].xpath, flags);
    if(result) {
      if(match_table[expri].xslt_sheet_uri) {
        grddl_xml_context* xml_context;

        /* Ignore what matched, use a hardcoded XSLT URI */
        uri_string=match_table[expri].xslt_sheet_uri;
        RAPTOR_DEBUG3("Parser %p: Using hard-coded XSLT URI '%s'\n",
                      rdf_parser, uri_string);

        /* Replace the match results with a one-entry sequence */
        raptor_free_sequence(result);
        result=raptor_new_sequence((raptor_sequence_free_handler*)grddl_free_xml_context, NULL);

        uri=raptor_new_uri_relative_to_base_v2(rdf_parser->world,
                                               rdf_parser->base_uri, uri_string);

        xml_context=raptor_new_xml_context(rdf_parser->world, uri,
                                           rdf_parser->base_uri);
        raptor_sequence_push(result, xml_context);

        raptor_free_uri_v2(rdf_parser->world, uri);
      }

      /* Queue every match as a transformation for this document */
      while(raptor_sequence_size(result)) {
        grddl_xml_context* xml_context=(grddl_xml_context*)raptor_sequence_unshift(result);
        if(!xml_context)
          break;

        raptor_grddl_add_transform_xml_context(grddl_parser, xml_context);
      }
      raptor_free_sequence(result);

      /* MATCH_LAST entries stop the search once they have matched */
      if(flags & MATCH_LAST)
        break;
    }

    if(rdf_parser->failed)
      break;

  } /* end XPath expression loop */

  if(rdf_parser->failed) {
    ret=1;
    goto tidy;
  }

  /* Process this document's content buffer as RDF/XML */
  if(grddl_parser->process_this_as_rdfxml && buffer) {
    RAPTOR_DEBUG3("Parser %p: Running additional RDF/XML parse on root document URI '%s' content\n",
                  rdf_parser,
                  raptor_uri_as_string_v2(rdf_parser->world, rdf_parser->base_uri));

    if(raptor_grddl_ensure_internal_parser(rdf_parser, "rdfxml", 0))
      ret=1;
    else {
      /* Pass the generated-ID base to the internal parser and take it
       * back after the parse */
      grddl_parser->internal_parser->default_generate_id_handler_base=
        raptor_parser_get_current_base_id(rdf_parser);

      if(raptor_start_parse(grddl_parser->internal_parser,
                            rdf_parser->base_uri))
        ret=1;
      else {
        ret=raptor_parse_chunk(grddl_parser->internal_parser, buffer,
                               buffer_len, 1);
        rdf_parser->default_generate_id_handler_base=
          raptor_parser_get_current_base_id(grddl_parser->internal_parser);
      }
    }
  }

  /* Apply all transformation URIs seen; stops at the first failure */
  transform:
  while(raptor_sequence_size(grddl_parser->doc_transform_uris)) {
    grddl_xml_context* xml_context=(grddl_xml_context*)raptor_sequence_unshift(grddl_parser->doc_transform_uris);
    ret=raptor_grddl_run_grddl_transform_uri(rdf_parser, xml_context, doc);
    grddl_free_xml_context(rdf_parser->world, xml_context);
    if(ret)
      break;
  }

  /* Cleanup shared by the success path and every error path above */
  tidy:
  if(buffer) {
    /* The buffer came from libxml2 if XInclude processing replaced it */
    if(buffer_is_libxml)
      xmlFree((xmlChar*)buffer);
    else
      RAPTOR_FREE(cstring, buffer);
  }

  if(grddl_parser->sb) {
    raptor_free_stringbuffer(grddl_parser->sb);
    grddl_parser->sb=NULL;
  }

  if(grddl_parser->xml_ctxt) {
    if(grddl_parser->xml_ctxt->myDoc) {
      xmlFreeDoc(grddl_parser->xml_ctxt->myDoc);
      grddl_parser->xml_ctxt->myDoc=NULL;
    }
    xmlFreeParserCtxt(grddl_parser->xml_ctxt);
    grddl_parser->xml_ctxt=NULL;
  }

  if(grddl_parser->html_ctxt) {
    if(grddl_parser->html_ctxt->myDoc)
{ xmlFreeDoc(grddl_parser->html_ctxt->myDoc); grddl_parser->html_ctxt->myDoc=NULL; } xmlFreeParserCtxt(grddl_parser->html_ctxt); grddl_parser->html_ctxt=NULL; } if(grddl_parser->xpathCtx) { xmlXPathFreeContext(grddl_parser->xpathCtx); grddl_parser->xpathCtx=NULL; } return (ret != 0); } static int raptor_grddl_parse_recognise_syntax(raptor_parser_factory* factory, const unsigned char *buffer, size_t len, const unsigned char *identifier, const unsigned char *suffix, const char *mime_type) { int score= 0; if(suffix) { if(!strcmp((const char*)suffix, "xhtml")) score=4; if(!strcmp((const char*)suffix, "html")) score=2; } else if(identifier) { if(strstr((const char*)identifier, "xhtml")) score=4; } return score; } static void raptor_grddl_parse_content_type_handler(raptor_parser* rdf_parser, const char* content_type) { raptor_grddl_parser_context* grddl_parser=(raptor_grddl_parser_context*)rdf_parser->context; if(content_type) { size_t len=strlen(content_type)+1; if(grddl_parser->content_type) RAPTOR_FREE(cstring,grddl_parser->content_type); grddl_parser->content_type=(char*)RAPTOR_MALLOC(cstring, len+1); strncpy(grddl_parser->content_type, content_type, len+1); } } static int raptor_grddl_parser_register_factory(raptor_parser_factory *factory) { int rc=0; factory->context_length = sizeof(raptor_grddl_parser_context); factory->need_base_uri = 1; factory->init = raptor_grddl_parse_init; factory->terminate = raptor_grddl_parse_terminate; factory->start = raptor_grddl_parse_start; factory->chunk = raptor_grddl_parse_chunk; factory->recognise_syntax = raptor_grddl_parse_recognise_syntax; factory->content_type_handler= raptor_grddl_parse_content_type_handler; rc+= raptor_parser_factory_add_mime_type(factory, "text/html", 2) != 0; rc+= raptor_parser_factory_add_mime_type(factory, "application/xhtml+xml", 4) != 0; return rc; } int raptor_init_parser_grddl_common(raptor_world* world) { #ifdef HAVE_XSLTINIT xsltInit(); #endif if(world->xslt_security_preferences == NULL) { 
xsltSecurityPrefsPtr raptor_xslt_sec = NULL; raptor_xslt_sec = xsltNewSecurityPrefs(); xsltSetDefaultSecurityPrefs(raptor_xslt_sec); /* no read from file (read from URI with scheme = file) */ xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_READ_FILE, xsltSecurityForbid); /* no create/write to file */ xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_WRITE_FILE, xsltSecurityForbid); /* no create directory */ xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_CREATE_DIRECTORY, xsltSecurityForbid); /* yes read from URI with scheme != file (XSLT_SECPREF_READ_NETWORK) */ /* no write to network (you can 'write' with GET params anyway) */ xsltSetSecurityPrefs(raptor_xslt_sec, XSLT_SECPREF_WRITE_NETWORK, xsltSecurityForbid); world->xslt_security_preferences = (void*)raptor_xslt_sec; world->free_xslt_security_preferences = 1; } return 0; } int raptor_init_parser_grddl(raptor_world* world) { return !raptor_parser_register_factory(world, "grddl", "Gleaning Resource Descriptions from Dialects of Languages", &raptor_grddl_parser_register_factory); } void raptor_terminate_parser_grddl_common(raptor_world *world) { xsltCleanupGlobals(); if(world->xslt_security_preferences && world->free_xslt_security_preferences) xsltFreeSecurityPrefs((xsltSecurityPrefsPtr)world->xslt_security_preferences); }