/* -*- mode: C++; tab-width: 4; c-basic-offset: 4; -*- */ /* AbiWord: ie_imp_MHT - plugin for Multipart [X]HTML * * Copyright (C) 2002 Francis James Franklin * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ #include #include #include #include #ifdef HAVE_UNISTD_H #include #endif // AbiWord includes #include "ut_assert.h" #include "ut_debugmsg.h" #include "ut_base64.h" #include "ut_bytebuf.h" #include "ut_hash.h" #include "ut_vector.h" #include "pd_Document.h" #include "ie_impGraphic.h" #include "ie_imp_MHT.h" #include "ie_impexp_HTML.h" #ifdef XHTML_HTML_TIDY_SUPPORTED #include "tidyReader.h" #endif #ifdef XHTML_HTML_XML2_SUPPORTED #include "ut_html.h" #endif /*****************************************************************/ /*****************************************************************/ IE_Imp_MHT_Sniffer::IE_Imp_MHT_Sniffer () : IE_ImpSniffer("AbiMHT::Multipart HTML") { // } // supported suffixes static IE_SuffixConfidence IE_Imp_MHT_Sniffer__SuffixConfidence[] = { { "mht", UT_CONFIDENCE_GOOD }, { "", UT_CONFIDENCE_ZILCH } }; const IE_SuffixConfidence * IE_Imp_MHT_Sniffer::getSuffixConfidence () { return IE_Imp_MHT_Sniffer__SuffixConfidence; } // supported mimetypes static IE_MimeConfidence IE_Imp_MHT_Sniffer__MimeConfidence[] = { { IE_MIME_MATCH_FULL, IE_MIMETYPE_RELATED, UT_CONFIDENCE_GOOD }, { IE_MIME_MATCH_BOGUS, "", UT_CONFIDENCE_ZILCH } }; const IE_MimeConfidence * IE_Imp_MHT_Sniffer::getMimeConfidence () { return IE_Imp_MHT_Sniffer__MimeConfidence; } static const char * s_strnstr (const char * haystack, UT_uint32 iNumbytes, const char * needle) { UT_uint32 needle_length = static_cast(strlen (needle)); UT_uint32 i = 0; if (needle_length > iNumbytes) return NULL; const char * ptr = haystack; const char * match = NULL; while (i < (iNumbytes - needle_length)) { if (*ptr == *needle) if (strncmp (ptr, needle, needle_length) == 0) { match = ptr; break; } ptr++; i++; } return match; } UT_Confidence_t IE_Imp_MHT_Sniffer::recognizeContents (const char * szBuf, UT_uint32 iNumbytes) { if (s_strnstr (szBuf, iNumbytes, IE_MIMETYPE_RELATED)) if (s_strnstr (szBuf, iNumbytes, IE_MIMETYPE_HTML) || s_strnstr (szBuf, iNumbytes, IE_MIMETYPE_XHTML)) { return UT_CONFIDENCE_GOOD; } return UT_CONFIDENCE_ZILCH; } UT_Error IE_Imp_MHT_Sniffer::constructImporter (PD_Document * pDocument, IE_Imp ** ppie) { IE_Imp_MHT * p = new IE_Imp_MHT (pDocument); *ppie = p; return UT_OK; } bool IE_Imp_MHT_Sniffer::getDlgLabels (const char ** pszDesc, const char ** pszSuffixList, IEFileType * ft) { *pszDesc = "Multipart HTML (.mht)"; *pszSuffixList = "*.mht"; *ft = getFileType (); return true; } /*****************************************************************/ /*****************************************************************/ IE_Imp_MHT::IE_Imp_MHT (PD_Document * pDocument) : IE_Imp_XHTML(pDocument), m_document(0), m_parts(new UT_Vector), m_eps(0) { // } IE_Imp_MHT::~IE_Imp_MHT () { UT_VECTOR_PURGEALL(UT_Multipart *,(*m_parts)); DELETEP(m_parts); } UT_Error IE_Imp_MHT::importFile (const char * szFilename) { int fd_in = open (szFilename, O_RDONLY); if (fd_in < 0) return UT_ERROR; m_eps = eps_begin (INTERFACE_STREAM, &fd_in); if (m_eps == 0) { close (fd_in); return UT_ERROR; } bool bValid = false; for (header_t * h = eps_next_header (m_eps); h; h = eps_next_header (m_eps)) { const char * name = reinterpret_cast(h->name); const char * data = reinterpret_cast(h->data); if (name && data) if (g_ascii_strcasecmp (name, "content-type") == 0) { UT_uint32 length = static_cast(strlen (data)); if (s_strnstr (data, length, IE_MIMETYPE_RELATED)) if (s_strnstr (data, length, IE_MIMETYPE_HTML) || s_strnstr (data, length, IE_MIMETYPE_XHTML)) { bValid = true; } } eps_header_free (m_eps); } UT_Error import_status = UT_OK; if (bValid) { while (eps_next_line (m_eps)) { // nothing interesting here } int parts = 0; while ((!(m_eps->u->b->eof)) && (m_eps->content_type & CON_MULTI)) { UT_Multipart * part = importMultipart (); if (part == 0) break; if (part->isXHTML () || part->isHTML4 ()) { if (m_document) { UT_DEBUGMSG(("Multipart HTML document has multiple HTML regions!\n")); DELETEP(part); import_status = UT_IE_BOGUSDOCUMENT; break; } m_document = part; } if (m_parts->addItem (part) < 0) { UT_DEBUGMSG(("Multipart HTML: error appending part!\n")); DELETEP(part); import_status = UT_OUTOFMEM; break; } } } eps_end (m_eps); close (fd_in); if (m_document == 0) { UT_DEBUGMSG(("Multipart HTML document has no HTML regions!\n")); import_status = UT_IE_BOGUSDOCUMENT; } if (import_status == UT_OK) { if (m_document->isXHTML ()) { import_status = importXHTML (szFilename); } else if (m_document->isHTML4 ()) { import_status = importHTML4 (szFilename); } else import_status = UT_ERROR; } return import_status; } FG_Graphic * IE_Imp_MHT::importImage (const gchar * szSrc) { bool bContentID = (strncmp ((const char *) szSrc, "cid:", 4) == 0); const UT_Multipart * part = 0; UT_uint32 count = m_parts->getItemCount (); for (UT_uint32 i = 0; i < count; i++) { const UT_Multipart * ptr = reinterpret_cast((*m_parts)[i]); if (!ptr->isImage ()) continue; if (bContentID) { if (ptr->contentID ()) if (strncmp (reinterpret_cast(szSrc) + 4, ptr->contentID () + 1, strlen (static_cast (szSrc)) - 4) == 0) { part = ptr; break; } } else { if (ptr->contentLocation ()) if (strcmp (reinterpret_cast(szSrc), ptr->contentLocation ()) == 0) { part = ptr; break; } } } if (part == 0) { UT_DEBUGMSG(("Multipart HTML: importImage: `%s' not an image, or not in archive\n",szSrc)); return 0; } const UT_ByteBuf * pBB = part->getBuffer (); if (pBB == 0) { UT_DEBUGMSG(("Multipart HTML: importImage: `%s' - image in archive but not (or no longer?) loaded!\n",szSrc)); return 0; } if (pBB->getLength () == 0) { UT_DEBUGMSG(("Multipart HTML: importImage: `%s' - image in archive but has no size!\n",szSrc)); return 0; } IE_ImpGraphic * pieg = 0; if (IE_ImpGraphic::constructImporter (pBB, IEGFT_Unknown, &pieg) != UT_OK) { UT_DEBUGMSG(("unable to construct image importer!\n")); return 0; } if (pieg == 0) return 0; UT_Multipart * vol_part = const_cast(part); FG_Graphic * pfg = 0; UT_Error import_status = pieg->importGraphic (vol_part->detachBuffer (), &pfg); delete pieg; if (import_status != UT_OK) { UT_DEBUGMSG(("unable to import image!\n")); return 0; } UT_DEBUGMSG(("image loaded successfully\n")); return pfg; } UT_Error IE_Imp_MHT::importXHTML (const char * szFilename) { const UT_Byte * buffer = m_document->getBuffer()->getPointer (0); UT_uint32 length = m_document->getBuffer()->getLength (); MultiReader wrapper(buffer,length); setReader (&wrapper); return IE_Imp_XHTML::importFile (szFilename); } UT_Error IE_Imp_MHT::importHTML4 (const char * szFilename) { UT_Error e = UT_ERROR; #ifdef XHTML_HTML_TIDY_SUPPORTED const UT_Byte * buffer = m_document->getBuffer()->getPointer (0); UT_uint32 length = m_document->getBuffer()->getLength (); TidyReader wrapper(buffer,length); setReader (&wrapper); e = IE_Imp_XHTML::importFile (szFilename); setReader (0); #endif #ifdef XHTML_HTML_XML2_SUPPORTED const UT_Byte * buffer = m_document->getBuffer()->getPointer (0); UT_uint32 length = m_document->getBuffer()->getLength (); UT_XML_BufReader wrapper(reinterpret_cast(buffer),length); setReader (&wrapper); UT_HTML parser; setParser (&parser); e = IE_Imp_XHTML::importFile (szFilename); setParser (0); setReader (0); #endif return e; } UT_Multipart * IE_Imp_MHT::importMultipart () { if (!mime_init_stream (m_eps)) return 0; UT_Multipart * part = new UT_Multipart; if (part == 0) return 0; for (header_t * h = mime_next_header (m_eps); h; h = mime_next_header (m_eps)) { const char * name = reinterpret_cast(h->name); const char * data = reinterpret_cast(h->data); if (name && data) part->insert (name, data); header_kill (h); } bool bLoad = (part->isImage () || part->isXHTML () || part->isHTML4 ()); for (unsigned char * l = mime_next_line (m_eps); l; l = mime_next_line (m_eps)) { char * line = reinterpret_cast(l); UT_uint32 length = static_cast(strlen (line)); if (bLoad && length) part->append (line, length); } return part; } UT_Multipart::UT_Multipart () : m_map(new UT_StringPtrMap), m_buf(new UT_ByteBuf), m_location(0), m_id(0), m_type(0), m_encoding(0), m_cte(cte_other), m_ct(ct_other), m_b64length(0) { // } UT_Multipart::~UT_Multipart () { clear (); DELETEP(m_map); DELETEP(m_buf); } bool UT_Multipart::insert (const char * name, const char * value) { if (( name == 0) || ( value == 0)) return false; if ((*name == 0) || (*value == 0)) return false; char * new_value = g_strdup (value); if (new_value == 0) return false; if (!m_map->insert (name, new_value)) { FREEP(new_value); return false; } if (g_ascii_strcasecmp (name, "content-transfer-encoding") == 0) { m_encoding = new_value; if (g_ascii_strcasecmp (new_value, "base64") == 0) { m_cte = cte_base64; } else if (g_ascii_strcasecmp (new_value, "quoted-printable") == 0) { m_cte = cte_quoted; } else m_cte = cte_other; } else if (g_ascii_strcasecmp (name, "content-location") == 0) { m_location = new_value; } else if (g_ascii_strcasecmp (name, "content-id") == 0) { m_id = new_value; } else if (g_ascii_strcasecmp (name, "content-type") == 0) { m_type = new_value; if (strncmp (new_value, IE_MIMETYPE_HTML, strlen (IE_MIMETYPE_HTML)) == 0) { m_ct = ct_html4; } else if (strncmp (new_value, IE_MIMETYPE_XHTML, strlen (IE_MIMETYPE_XHTML)) == 0) { m_ct = ct_xhtml; } else if (strncmp (new_value, "image/", 6) == 0) { m_ct = ct_image; } else m_ct = ct_other; } return true; } const char * UT_Multipart::lookup (const char * name) { if ( name == 0) return 0; if (*name == 0) return 0; const void * vptr = m_map->pick (name); return reinterpret_cast(vptr); } bool UT_Multipart::append (const char * buffer, UT_uint32 length) { static const char * s_newline = "\n"; if (m_buf == 0) return false; if ((buffer == 0) || (length == 0)) return true; // ?? if (isBase64 ()) return append_Base64 (buffer, length); if (isQuoted ()) return append_Quoted (buffer, length); return (m_buf->append (reinterpret_cast(buffer), length) && m_buf->append (reinterpret_cast(s_newline), 1)); } bool UT_Multipart::append_Base64 (const char * buffer, UT_uint32 length) { bool success = true; char binbuffer[60]; const char * bufptr = buffer; for (UT_uint32 i = 0; i < length; i++) { char c = *bufptr++; bool bEnd = (c == '='); unsigned char u = static_cast(c); if (!isspace ((int) u)) m_b64buffer[m_b64length++] = c; if (bEnd || (m_b64length == 80) || ((i + 1 == length) && m_b64length && ((m_b64length & 0x03) == 0))) { const char * b64bufptr = m_b64buffer; char * binbufptr = binbuffer; size_t binlength = 60; UT_UTF8_Base64Decode (binbufptr, binlength, b64bufptr, m_b64length); if (m_b64length) memmove (m_b64buffer, b64bufptr, m_b64length); if (m_b64length > 3) { UT_DEBUGMSG(("Multipart HTML: append_Base64: oddness while decoding!\n")); success = false; } if (binlength < 60) if (!m_buf->append (reinterpret_cast(binbuffer), 60 - binlength)) success = false; } if (bEnd || !success) break; } return success; } bool UT_Multipart::append_Quoted (const char * buffer, UT_uint32 length) { char * str = 0; if (length > 78) // shouldn't be { str = (char *) g_try_malloc (length + 2); if (str == 0) return false; } else str = m_b64buffer; char hexbuf[3]; hexbuf[2] = 0; bool suppressNewLine = false; const char * bufptr = buffer; const char * bufend = buffer + length; char * strptr = str; while ((bufptr < bufend) && !suppressNewLine) switch (*bufptr) { case '=': if (bufptr + 1 == bufend) suppressNewLine = true; else { bufptr++; hexbuf[0] = *bufptr++; hexbuf[1] = *bufptr++; unsigned int escape; if (sscanf (hexbuf, "%x", &escape) == 1) *strptr++ = static_cast(escape & 0xff); } break; default: *strptr++ = *bufptr++; break; } if (!suppressNewLine) *strptr++ = '\n'; *strptr = 0; bool success = m_buf->append (reinterpret_cast(str), strlen (str)); if (length > 80) FREEP(str); return success; } UT_ByteBuf * UT_Multipart::detachBuffer () { UT_ByteBuf * bufret = m_buf; m_buf = 0; return bufret; } void UT_Multipart::clear () { //UT_HASH_PURGEDATA (char *, m_map, free); m_map->purgeData(); m_map->clear (); if (m_buf) m_buf->truncate (0); } MultiReader::MultiReader (const UT_Byte * buffer, UT_uint32 length) : m_buffer(buffer), m_bufptr(buffer), m_length(length) { // } MultiReader::~MultiReader () { // } bool MultiReader::openFile (const char * /* szFilename */) { m_bufptr = m_buffer; return (m_buffer && m_length); } UT_uint32 MultiReader::readBytes (char * buffer, UT_uint32 length) { UT_uint32 length_remaining = m_length - (m_bufptr - m_buffer); UT_uint32 length_copy = (length > length_remaining) ? length_remaining : length; if (buffer) memcpy (buffer, m_bufptr, length_copy); m_bufptr += length_copy; return length_copy; } void MultiReader::closeFile () { m_bufptr = m_buffer + m_length; }