// -*- mode: c++; tab-width: 4; indent-tabs-mode: t; eval: (progn (c-set-style "stroustrup") (c-set-offset 'innamespace 0)); -*- // vi:set ts=4 sts=4 sw=4 noet : // // Copyright 2010, 2011 wkhtmltopdf authors // // This file is part of wkhtmltopdf. // // wkhtmltopdf is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // wkhtmltopdf is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with wkhtmltopdf. If not, see . #include "multipageloader_p.hh" #include #include #include #include #include #include #if QT_VERSION >= 0x050000 #include #endif namespace wkhtmltopdf { /*! \file multipageloader.hh \brief Defines the MultiPageLoader class */ /*! \file multipageloader_p.hh \brief Defines the MultiPageLoaderPrivate class */ LoaderObject::LoaderObject(QWebPage & p): page(p), skip(false) {}; MyNetworkAccessManager::MyNetworkAccessManager(const settings::LoadPage & s): disposed(false), settings(s) { if ( !s.cacheDir.isEmpty() ){ QNetworkDiskCache *cache = new QNetworkDiskCache(this); cache->setCacheDirectory(s.cacheDir); QNetworkAccessManager::setCache(cache); } } void MyNetworkAccessManager::dispose() { disposed = true; } void MyNetworkAccessManager::allow(QString path) { QString x = QFileInfo(path).canonicalFilePath(); if (x.isEmpty()) return; allowed.insert(x); } QNetworkReply * MyNetworkAccessManager::createRequest(Operation op, const QNetworkRequest & req, QIODevice * outgoingData) { if (disposed) { emit warning("Received createRequest signal on a disposed ResourceObject's NetworkAccessManager. 
" "This might be an indication of an iframe taking too long to load."); // Needed to avoid race conditions by spurious network requests // by scripts or iframes taking too long to load. QNetworkRequest r2 = req; r2.setUrl(QUrl("about:blank")); return QNetworkAccessManager::createRequest(op, r2, outgoingData); } bool isLocalFileAccess = req.url().scheme().length() <= 1 || req.url().scheme() == "file"; if (isLocalFileAccess && settings.blockLocalFileAccess) { bool ok=false; QString path = QFileInfo(req.url().toLocalFile()).canonicalFilePath(); QString old = ""; while (path != old) { if (allowed.contains(path)) { ok=true; break; } old = path; path = QFileInfo(path).path(); } if (!ok) { QNetworkRequest r2 = req; emit warning(QString("Blocked access to file %1").arg(QFileInfo(req.url().toLocalFile()).canonicalFilePath())); r2.setUrl(QUrl("about:blank")); return QNetworkAccessManager::createRequest(op, r2, outgoingData); } } QNetworkRequest r3 = req; if (settings.repeatCustomHeaders) { typedef QPair HT; foreach (const HT & j, settings.customHeaders) r3.setRawHeader(j.first.toLatin1(), j.second.toLatin1()); } return QNetworkAccessManager::createRequest(op, r3, outgoingData); } MyNetworkProxyFactory::MyNetworkProxyFactory (QNetworkProxy proxy, QList bph): bypassHosts(bph), originalProxy(QList() << proxy), noProxy(QList() << QNetworkProxy(QNetworkProxy::DefaultProxy)){} QList MyNetworkProxyFactory::queryProxy (const QNetworkProxyQuery & query) { QString host = query.url().host(); foreach (const QString & bypassHost, bypassHosts) { if (host.compare(bypassHost, Qt::CaseInsensitive) == 0) return noProxy; } return originalProxy; } MyQWebPage::MyQWebPage(ResourceObject & res): resource(res) {} void MyQWebPage::javaScriptAlert(QWebFrame *, const QString & msg) { resource.warning(QString("Javascript alert: %1").arg(msg)); } bool MyQWebPage::javaScriptConfirm(QWebFrame *, const QString & msg) { resource.warning(QString("Javascript confirm: %1 (answered yes)").arg(msg)); return 
true; } bool MyQWebPage::javaScriptPrompt(QWebFrame *, const QString & msg, const QString & defaultValue, QString * result) { resource.warning(QString("Javascript prompt: %1 (answered %2)").arg(msg,defaultValue)); result = (QString*)&defaultValue; Q_UNUSED(result); return true; } void MyQWebPage::javaScriptConsoleMessage(const QString & message, int lineNumber, const QString & sourceID) { if (resource.settings.debugJavascript) resource.warning(QString("%1:%2 %3").arg(sourceID).arg(lineNumber).arg(message)); } bool MyQWebPage::shouldInterruptJavaScript() { if (resource.settings.stopSlowScripts) { resource.warning("A slow script was stopped"); return true; } return false; } ResourceObject::ResourceObject(MultiPageLoaderPrivate & mpl, const QUrl & u, const settings::LoadPage & s): networkAccessManager(s), url(u), loginTry(0), progress(0), finished(false), signalPrint(false), multiPageLoader(mpl), webPage(*this), lo(webPage), httpErrorCode(0), settings(s) { connect(&networkAccessManager, SIGNAL(authenticationRequired(QNetworkReply*, QAuthenticator *)),this, SLOT(handleAuthenticationRequired(QNetworkReply *, QAuthenticator *))); foreach (const QString & path, s.allowed) networkAccessManager.allow(path); if (url.scheme() == "file") networkAccessManager.allow(url.toLocalFile()); connect(&webPage, SIGNAL(loadStarted()), this, SLOT(loadStarted())); connect(&webPage, SIGNAL(loadProgress(int)), this, SLOT(loadProgress(int))); connect(&webPage, SIGNAL(loadFinished(bool)), this, SLOT(loadFinished(bool))); connect(&webPage, SIGNAL(printRequested(QWebFrame*)), this, SLOT(printRequested(QWebFrame*))); //If some ssl error occurs we want sslErrors to be called, so the we can ignore it connect(&networkAccessManager, SIGNAL(sslErrors(QNetworkReply*, const QList&)),this, SLOT(sslErrors(QNetworkReply*, const QList&))); connect(&networkAccessManager, SIGNAL(finished (QNetworkReply *)), this, SLOT(amfinished (QNetworkReply *) ) ); connect(&networkAccessManager, SIGNAL(warning(const 
QString &)), this, SLOT(warning(const QString &))); networkAccessManager.setCookieJar(multiPageLoader.cookieJar); //If we must use a proxy, create a host of objects if (!settings.proxy.host.isEmpty()) { QNetworkProxy proxy; proxy.setHostName(settings.proxy.host); proxy.setPort(settings.proxy.port); proxy.setType(settings.proxy.type); // to retrieve a web page, it's not needed to use a fully transparent // http proxy. Moreover, the CONNECT() method is frequently disabled // by proxies administrators. if (settings.proxy.type == QNetworkProxy::HttpProxy) proxy.setCapabilities(QNetworkProxy::CachingCapability | QNetworkProxy::TunnelingCapability); if (!settings.proxy.user.isEmpty()) proxy.setUser(settings.proxy.user); if (!settings.proxy.password.isEmpty()) proxy.setPassword(settings.proxy.password); if (!settings.bypassProxyForHosts.isEmpty()) networkAccessManager.setProxyFactory( new MyNetworkProxyFactory(proxy, settings.bypassProxyForHosts)); else networkAccessManager.setProxy(proxy); } webPage.setNetworkAccessManager(&networkAccessManager); webPage.mainFrame()->setZoomFactor(settings.zoomFactor); } /*! * Once loading starting, this is called */ void ResourceObject::loadStarted() { if (finished == true) { ++multiPageLoader.loading; finished = false; } if (multiPageLoader.loadStartedEmitted) return; multiPageLoader.loadStartedEmitted=true; emit multiPageLoader.outer.loadStarted(); } /*! * Called when the page is loading, display some progress to the using * \param progress the loading progress in percent */ void ResourceObject::loadProgress(int p) { // If we are finished, ignore this signal. if (finished || multiPageLoader.resources.size() <= 0) { warning("A finished ResourceObject received a loading progress signal. 
" "This might be an indication of an iframe taking too long to load."); return; } multiPageLoader.progressSum -= progress; progress = p; multiPageLoader.progressSum += progress; emit multiPageLoader.outer.loadProgress(multiPageLoader.progressSum / multiPageLoader.resources.size()); } void ResourceObject::loadFinished(bool ok) { // If we are finished, this might be a potential bug. if (finished || multiPageLoader.resources.size() <= 0) { warning("A finished ResourceObject received a loading finished signal. " "This might be an indication of an iframe taking too long to load."); return; } multiPageLoader.hasError = multiPageLoader.hasError || (!ok && settings.loadErrorHandling == settings::LoadPage::abort); if (!ok) { if (settings.loadErrorHandling == settings::LoadPage::abort) error(QString("Failed loading page ") + url.toString() + " (sometimes it will work just to ignore this error with --load-error-handling ignore)"); else if (settings.loadErrorHandling == settings::LoadPage::skip) { warning(QString("Failed loading page ") + url.toString() + " (skipped)"); lo.skip = true; } else warning(QString("Failed loading page ") + url.toString() + " (ignored)"); } bool isMain = multiPageLoader.isMainLoader; // Evaluate extra user supplied javascript for the main loader if (isMain) foreach (const QString & str, settings.runScript) webPage.mainFrame()->evaluateJavaScript(str); // XXX: If loading failed there's no need to wait // for javascript on this resource. 
if (!ok || signalPrint || settings.jsdelay == 0) loadDone(); else if (isMain && !settings.windowStatus.isEmpty()) waitWindowStatus(); else QTimer::singleShot(settings.jsdelay, this, SLOT(loadDone())); } void ResourceObject::waitWindowStatus() { QString windowStatus = webPage.mainFrame()->evaluateJavaScript("window.status").toString(); //warning(QString("window.status:" + windowStatus + " settings.windowStatus:" + settings.windowStatus)); if (windowStatus != settings.windowStatus) { QTimer::singleShot(50, this, SLOT(waitWindowStatus())); } else { QTimer::singleShot(settings.jsdelay, this, SLOT(loadDone())); } } void ResourceObject::printRequested(QWebFrame *) { signalPrint=true; loadDone(); } void ResourceObject::loadDone() { if (finished) return; finished=true; // Ensure no more loading goes.. webPage.triggerAction(QWebPage::Stop); webPage.triggerAction(QWebPage::StopScheduledPageRefresh); networkAccessManager.dispose(); //disconnect(this, 0, 0, 0); --multiPageLoader.loading; if (multiPageLoader.loading == 0) multiPageLoader.loadDone(); } /*! * Called when the page requires authentication, fills in the username * and password supplied on the command line */ void ResourceObject::handleAuthenticationRequired(QNetworkReply *reply, QAuthenticator *authenticator) { Q_UNUSED(reply); // XXX: Avoid calling 'reply->abort()' from within this signal. // As stated by doc, request would be finished when no // user/pass properties are assigned to authenticator object. 
// See: http://qt-project.org/doc/qt-5.0/qtnetwork/qnetworkaccessmanager.html#authenticationRequired if (settings.username.isEmpty()) { //If no username is given, complain the such is required error("Authentication Required"); } else if (loginTry >= 2) { //If the login has failed a sufficient number of times, //the username or password must be wrong error("Invalid username or password"); } else { authenticator->setUser(settings.username); authenticator->setPassword(settings.password); ++loginTry; } } void ResourceObject::warning(const QString & str) { emit multiPageLoader.outer.warning(str); } void ResourceObject::error(const QString & str) { emit multiPageLoader.outer.error(str); } /*! * Track and handle network errors * \param reply The networkreply that has finished */ void ResourceObject::amfinished(QNetworkReply * reply) { int networkStatus = reply->error(); int httpStatus = reply->attribute(QNetworkRequest::HttpStatusCodeAttribute).toInt(); if ((networkStatus != 0 && networkStatus != 5) || (httpStatus > 399 && httpErrorCode == 0)) { QFileInfo fi(reply->url().toString()); bool mediaFile = settings::LoadPage::mediaFilesExtensions.contains(fi.completeSuffix().toLower()); if ( ! mediaFile) { // XXX: Notify network errors as higher priority than HTTP errors. // QT's QNetworkReply::NetworkError enum uses values overlapping // HTTP status codes, so adding 1000 to QT's codes will avoid // confusion. Also a network error at this point will probably mean // no HTTP access at all, so we want network errors to be reported // with a higher priority than HTTP ones. // See: http://doc-snapshot.qt-project.org/4.8/qnetworkreply.html#NetworkError-enum httpErrorCode = networkStatus > 0 ? (networkStatus + 1000) : httpStatus; return; } if (settings.mediaLoadErrorHandling == settings::LoadPage::abort) { httpErrorCode = networkStatus > 0 ? 
(networkStatus + 1000) : httpStatus; error(QString("Failed to load ") + reply->url().toString() + ", with code: " + QString::number(httpErrorCode) + " (sometimes it will work just to ignore this error with --load-media-error-handling ignore)"); } else { warning(QString("Failed to load %1 (%2)") .arg(reply->url().toString()) .arg(settings::loadErrorHandlingToStr(settings.mediaLoadErrorHandling)) ); } } } /*! * Handle any ssl error by ignoring */ void ResourceObject::sslErrors(QNetworkReply *reply, const QList &) { //We ignore any ssl error, as it is next to impossible to send or receive //any private information with wkhtmltopdf anyhow, seeing as you cannot authenticate reply->ignoreSslErrors(); warning("SSL error ignored"); } void ResourceObject::load() { finished=false; ++multiPageLoader.loading; bool hasFiles=false; foreach (const settings::PostItem & pi, settings.post) hasFiles |= pi.file; QByteArray postData; QString boundary; if (hasFiles) { boundary = QUuid::createUuid().toString().remove('-').remove('{').remove('}'); foreach (const settings::PostItem & pi, settings.post) { //TODO escape values here postData.append("--"); postData.append(boundary); postData.append("\ncontent-disposition: form-data; name=\""); postData.append(pi.name); postData.append('\"'); if (pi.file) { QFile f(pi.value); if (!f.open(QIODevice::ReadOnly) ) { error(QString("Unable to open file ")+pi.value); multiPageLoader.fail(); } postData.append("; filename=\""); postData.append( QFileInfo(pi.value).fileName()); postData.append("\"\n\n"); postData.append( f.readAll() ); //TODO ADD MIME TYPE } else { postData.append("\n\n"); postData.append(pi.value); } postData.append('\n'); } if (!postData.isEmpty()) { postData.append("--"); postData.append(boundary); postData.append("--\n"); } } else { #if QT_VERSION >= 0x050000 QUrlQuery q; foreach (const settings::PostItem & pi, settings.post) q.addQueryItem(pi.name, pi.value); postData = q.query(QUrl::FullyEncoded).toLocal8Bit(); #else QUrl u; 
foreach (const settings::PostItem & pi, settings.post) u.addQueryItem(pi.name, pi.value); postData = u.encodedQuery(); #endif } typedef QPair SSP; foreach (const SSP & pair, settings.cookies) multiPageLoader.cookieJar->useCookie(url, pair.first, pair.second); QNetworkRequest r = QNetworkRequest(url); typedef QPair HT; foreach (const HT & j, settings.customHeaders) r.setRawHeader(j.first.toLatin1(), j.second.toLatin1()); if (postData.isEmpty()) webPage.mainFrame()->load(r); else { if (hasFiles) r.setHeader(QNetworkRequest::ContentTypeHeader, QString("multipart/form-data, boundary=")+boundary); webPage.mainFrame()->load(r, QNetworkAccessManager::PostOperation, postData); } } void MyCookieJar::useCookie(const QUrl &, const QString & name, const QString & value) { extraCookies.push_back(QNetworkCookie(name.toUtf8(), value.toUtf8())); } QList MyCookieJar::cookiesForUrl(const QUrl & url) const { QList list = QNetworkCookieJar::cookiesForUrl(url); list.append(extraCookies); return list; } void MyCookieJar::loadFromFile(const QString & path) { QFile cookieJar(path); if (cookieJar.open(QIODevice::ReadOnly | QIODevice::Text) ) setAllCookies(QNetworkCookie::parseCookies(cookieJar.readAll())); } void MyCookieJar::saveToFile(const QString & path) { QFile cookieJar(path); if (cookieJar.open(QIODevice::WriteOnly | QIODevice::Text) ) foreach (const QNetworkCookie & cookie, allCookies()) { cookieJar.write(cookie.toRawForm()); cookieJar.write(";\n"); } } void MultiPageLoaderPrivate::loadDone() { if (!settings.cookieJar.isEmpty()) cookieJar->saveToFile(settings.cookieJar); if (!finishedEmitted) { finishedEmitted = true; emit outer.loadFinished(!hasError); } } /*! 
* Copy a file from some place to another * \param src The source to copy from * \param dst The destination to copy to */ bool MultiPageLoader::copyFile(QFile & src, QFile & dst) { // TODO enable again when // http://bugreports.qt.nokia.com/browse/QTBUG-6894 // is fixed // QByteArray buf(1024*1024*5,0); // while ( qint64 r=src.read(buf.data(),buf.size())) { // if (r == -1) return false; // if (dst.write(buf.data(),r) != r) return false; // } if (dst.write( src.readAll() ) == -1) return false; src.close(); dst.close(); return true; } MultiPageLoaderPrivate::MultiPageLoaderPrivate(const settings::LoadGlobal & s, MultiPageLoader & o): outer(o), settings(s) { cookieJar = new MyCookieJar(); if (!settings.cookieJar.isEmpty()) cookieJar->loadFromFile(settings.cookieJar); } MultiPageLoaderPrivate::~MultiPageLoaderPrivate() { clearResources(); } LoaderObject * MultiPageLoaderPrivate::addResource(const QUrl & url, const settings::LoadPage & page) { ResourceObject * ro = new ResourceObject(*this, url, page); resources.push_back(ro); return &ro->lo; } void MultiPageLoaderPrivate::load() { progressSum=0; loadStartedEmitted=false; finishedEmitted=false; hasError=false; loading=0; for (int i=0; i < resources.size(); ++i) resources[i]->load(); if (resources.size() == 0) loadDone(); } void MultiPageLoaderPrivate::clearResources() { while (resources.size() > 0) { // XXX: Using deleteLater() to dispose // resources, to avoid race conditions with // pending signals reaching a deleted resource. // Also, and we must avoid calling clear() // on resources list, is it tries to delete // each objet on removal. ResourceObject *tmp = resources.takeFirst(); tmp->deleteLater(); } tempIn.removeAll(); } void MultiPageLoaderPrivate::cancel() { //foreach (QWebPage * page, pages) // page->triggerAction(QWebPage::Stop); } void MultiPageLoaderPrivate::fail() { hasError = true; cancel(); clearResources(); } /*! 
\brief Construct a multipage loader object, load settings read from the supplied settings \param s The settings to be used while loading pages */ MultiPageLoader::MultiPageLoader(settings::LoadGlobal & s, bool mainLoader): d(new MultiPageLoaderPrivate(s, *this)) { d->isMainLoader = mainLoader; } MultiPageLoader::~MultiPageLoader() { MultiPageLoaderPrivate *tmp = d; d = 0; tmp->deleteLater(); } /*! \brief Add a resource, to be loaded described by a string @param string Url describing the resource to load */ LoaderObject * MultiPageLoader::addResource(const QString & string, const settings::LoadPage & s, const QString * data) { QString url=string; if (data && !data->isEmpty()) { url = d->tempIn.create(".html"); QFile tmp(url); if (!tmp.open(QIODevice::WriteOnly) || tmp.write(data->toUtf8())==0) { emit error("Unable to create temporary file"); return NULL; } } else if (url == "-") { QFile in; in.open(stdin,QIODevice::ReadOnly); url = d->tempIn.create(".html"); QFile tmp(url); if (!tmp.open(QIODevice::WriteOnly) || !copyFile(in, tmp)) { emit error("Unable to create temporary file"); return NULL; } } return addResource(guessUrlFromString(url), s); } /*! \brief Add a page to be loaded @param url Url of the page to load */ LoaderObject * MultiPageLoader::addResource(const QUrl & url, const settings::LoadPage & s) { return d->addResource(url, s); } /*! \brief Guess a url, by looking at a string (shamelessly copied from Arora Project) \param string The string the is suppose to be some kind of url */ QUrl MultiPageLoader::guessUrlFromString(const QString &string) { QString urlStr = string.trimmed(); // check if the string is just a host with a port QRegExp hostWithPort(QLatin1String("^[a-zA-Z\\.]+\\:[0-9]*$")); if (hostWithPort.exactMatch(urlStr)) urlStr = QLatin1String("http://") + urlStr; // Check if it looks like a qualified URL. Try parsing it and see. 
QRegExp test(QLatin1String("^[a-zA-Z]+\\://.*")); bool hasSchema = test.exactMatch(urlStr); if (hasSchema) { bool isAscii = true; foreach (const QChar &c, urlStr) { if (c >= 0x80) { isAscii = false; break; } } QUrl url; if (isAscii) { url = QUrl::fromEncoded(urlStr.toLatin1(), QUrl::TolerantMode); } else { url = QUrl(urlStr, QUrl::TolerantMode); } if (url.isValid()) return url; } // Might be a file. if (QFile::exists(urlStr)) { QFileInfo info(urlStr); return QUrl::fromLocalFile(info.absoluteFilePath()); } // Might be a shorturl - try to detect the schema. if (!hasSchema) { int dotIndex = urlStr.indexOf(QLatin1Char('.')); if (dotIndex != -1) { QString prefix = urlStr.left(dotIndex).toLower(); QString schema = (prefix == QLatin1String("ftp")) ? prefix : QLatin1String("http"); QUrl url(schema + QLatin1String("://") + urlStr, QUrl::TolerantMode); if (url.isValid()) return url; } } // Fall back to QUrl's own tolerant parser. QUrl url = QUrl(string, QUrl::TolerantMode); // finally for cases where the user just types in a hostname add http if (url.scheme().isEmpty()) url = QUrl(QLatin1String("http://") + string, QUrl::TolerantMode); return url; } /*! \brief Return the most severe http error code returned during loading */ int MultiPageLoader::httpErrorCode() { int res=0; foreach (const ResourceObject * ro, d->resources) if (ro->httpErrorCode > res) res = ro->httpErrorCode; return res; } /*! \brief Begin loading all the resources added */ void MultiPageLoader::load() { d->load(); } /*! \brief Clear all the resources */ void MultiPageLoader::clearResources() { d->clearResources(); } /*! \brief Cancel the loading of the pages */ void MultiPageLoader::cancel() { d->cancel(); } /*! \fn MultiPageLoader::loadFinished(bool ok) \brief Signal emitted when all pages have been loaded \param ok True if all the pages have been loaded sucessfully */ /*! 
 \fn MultiPageLoader::loadProgress(int progress)
 \brief Signal emitted once load has progressed
 \param progress Progress in percent
*/

/*!
  \fn MultiPageLoader::loadStarted()
  \brief Signal emitted when loading has started
*/

/*!
  \fn void MultiPageLoader::warning(QString text)
  \brief Signal emitted when a non-fatal warning has occurred
  \param text A string describing the warning
*/

/*!
  \fn void MultiPageLoader::error(QString text)
  \brief Signal emitted when a fatal error has occurred
  \param text A string describing the error
*/
}