# -*- test-case-name: twisted.web.test.test_webclient -*-
# Copyright (c) 2001-2004 Twisted Matrix Laboratories.
# See LICENSE for details.
#

"""HTTP client.

API Stability: stable
"""

import os
import types
import urlparse

from twisted.web import http
from twisted.internet import defer, protocol, reactor
from twisted.python import failure
from twisted.python.util import InsensitiveDict
from twisted.web import error


class PartialDownloadError(error.Error):
    """Page was only partially downloaded, we got disconnected in middle.

    The bit that was downloaded is in the response attribute.
    """


class HTTPPageGetter(http.HTTPClient):
    """Protocol that issues one HTTP request and reports to its factory.

    The factory is told about the status line (C{gotStatus}), the response
    headers (C{gotHeaders}) and finally either the body (C{page}) or a
    failure (C{noPage}).

    @ivar quietLoss: when true, C{connectionLost} and C{handleResponse} do
        not report anything to the factory (set on redirects and timeouts).
    @ivar followRedirect: when true, 301/302/303 responses cause a new
        connection to the redirect target instead of a failure.
    @ivar failed: set by C{handleStatusDefault} for any status code that
        has no dedicated handler.
    """

    quietLoss = 0
    followRedirect = 1
    failed = 0

    def connectionMade(self):
        """Send the request line, headers and optional body."""
        method = getattr(self.factory, 'method', 'GET')
        self.sendCommand(method, self.factory.path)
        self.sendHeader('Host',
                        self.factory.headers.get("host", self.factory.host))
        self.sendHeader('User-Agent', self.factory.agent)
        if self.factory.cookies:
            pairs = ['%s=%s' % (cookie, cookval)
                     for cookie, cookval in self.factory.cookies.items()]
            self.sendHeader('Cookie', '; '.join(pairs))
        data = getattr(self.factory, 'postdata', None)
        if data is not None:
            self.sendHeader("Content-Length", str(len(data)))
        for (key, value) in self.factory.headers.items():
            # Content-Length was computed from the body above, so any
            # caller-supplied value is ignored.
            if key.lower() != "content-length":
                self.sendHeader(key, value)
        self.endHeaders()
        # Response headers are collected here by handleHeader.
        self.headers = {}

        if data is not None:
            self.transport.write(data)

    def handleHeader(self, key, value):
        """Record a response header; repeated headers accumulate in a list."""
        key = key.lower()
        values = self.headers[key] = self.headers.get(key, [])
        values.append(value)

    def handleStatus(self, version, status, message):
        """Remember the status line and forward it to the factory."""
        self.version, self.status, self.message = version, status, message
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        """Hand the headers to the factory and dispatch on the status code.

        A method named C{handleStatus_<code>} is used if it exists,
        otherwise C{handleStatusDefault}.
        """
        self.factory.gotHeaders(self.headers)
        handler = getattr(self, 'handleStatus_' + self.status,
                          self.handleStatusDefault)
        handler()

    def handleStatus_200(self):
        """Success: nothing to do until the body arrives."""
        pass

    # 201 and 202 are treated exactly like 200.  They go through
    # self.handleStatus_200 so that subclass overrides are honoured.
    handleStatus_201 = lambda self: self.handleStatus_200()
    handleStatus_202 = lambda self: self.handleStatus_200()

    def handleStatusDefault(self):
        """Mark the response as failed; reported when the response ends."""
        self.failed = 1

    def handleStatus_301(self):
        """Handle a redirect (also used for 302 and 303).

        With C{followRedirect} set, the factory is pointed at the new URL
        and a fresh connection is started; otherwise the factory receives
        an L{error.PageRedirect} failure.  In both cases this connection is
        then dropped quietly.  A redirect without a C{Location} header is
        treated like any other unhandled status.
        """
        locations = self.headers.get('location')
        if not locations:
            # Nothing to redirect to.  Returning here avoids indexing None
            # and lets handleResponse report the failure normally.
            self.handleStatusDefault()
            return
        url = locations[0]
        if self.followRedirect:
            self.factory.setURL(url)

            if self.factory.scheme == 'https':
                from twisted.internet import ssl
                contextFactory = ssl.ClientContextFactory()
                reactor.connectSSL(self.factory.host, self.factory.port,
                                   self.factory, contextFactory)
            else:
                reactor.connectTCP(self.factory.host, self.factory.port,
                                   self.factory)
        else:
            self.handleStatusDefault()
            self.factory.noPage(
                failure.Failure(
                    error.PageRedirect(
                        self.status, self.message, location = url)))
        self.quietLoss = 1
        self.transport.loseConnection()

    handleStatus_302 = lambda self: self.handleStatus_301()

    def handleStatus_303(self):
        """Handle "See Other": the follow-up request is always a GET."""
        self.factory.method = 'GET'
        self.handleStatus_301()

    def connectionLost(self, reason):
        """Report the lost connection unless it was closed on purpose."""
        if not self.quietLoss:
            http.HTTPClient.connectionLost(self, reason)
            self.factory.noPage(reason)

    def handleResponse(self, response):
        """Deliver the complete response body, or a failure, to the factory.

        @param response: the response body passed in by L{http.HTTPClient}.
        """
        if self.quietLoss:
            return
        if self.failed:
            self.factory.noPage(
                failure.Failure(
                    error.Error(
                        self.status, self.message, response)))
        elif self.length is not None and self.length != 0:
            # NOTE(review): self.length is maintained by http.HTTPClient;
            # presumably the number of body bytes still expected -- confirm.
            self.factory.noPage(failure.Failure(
                PartialDownloadError(self.status, self.message, response)))
        else:
            self.factory.page(response)
        # The server might not close the connection itself, and we only do
        # one request per connection anyway, so close it from our side.
        self.transport.loseConnection()

    def timeout(self):
        """Abort the request and report a L{defer.TimeoutError}.

        Scheduled by the factory's C{buildProtocol} when a timeout is set.
        """
        self.quietLoss = True
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds."
                               % (self.factory.url, self.factory.timeout)))


class HTTPPageDownloader(HTTPPageGetter):
    """Page getter that streams the body to its factory piece by piece.

    @ivar transmittingPage: true between C{pageStart} and C{pageEnd} calls
        on the factory.
    """

    transmittingPage = 0

    def handleStatus_200(self, partialContent=0):
        """Start streaming the body to the factory.

        @param partialContent: true when the response is a 206 reply.
        """
        HTTPPageGetter.handleStatus_200(self)
        self.transmittingPage = 1
        self.factory.pageStart(partialContent)

    def handleStatus_206(self):
        """Partial content: same as 200 but flagged as partial."""
        self.handleStatus_200(partialContent=1)

    def handleResponsePart(self, data):
        """Forward a chunk of body data while a page is being transmitted."""
        if self.transmittingPage:
            self.factory.pagePart(data)

    def handleResponseEnd(self):
        """Finish the transfer and report a failed status if there was one."""
        if self.transmittingPage:
            self.factory.pageEnd()
            self.transmittingPage = 0
        if self.failed:
            self.factory.noPage(
                failure.Failure(
                    error.Error(
                        self.status, self.message, None)))
            self.transport.loseConnection()


class HTTPClientFactory(protocol.ClientFactory):
    """Download a given URL.

    @type deferred: Deferred
    @ivar deferred: A Deferred that will fire when the content has
          been retrieved. Once this is fired, the ivars `status', `version',
          and `message' will be set.

    @type status: str
    @ivar status: The status of the response.

    @type version: str
    @ivar version: The version of the response.

    @type message: str
    @ivar message: The text message returned with the status.

    @type response_headers: dict
    @ivar response_headers: The headers that were specified in the
          response from the server.

    @ivar followRedirect: whether protocols built by this factory follow
          301/302/303 redirects.
    """

    protocol = HTTPPageGetter

    url = None
    scheme = None
    host = ''
    port = None
    path = None
    # Class-level default so subclasses that skip __init__ still work in
    # buildProtocol.
    followRedirect = 1

    def __init__(self, url, method='GET', postdata=None, headers=None,
                 agent="Twisted PageGetter", timeout=0, cookies=None,
                 followRedirect=1):
        """
        @param url: the URL to request.
        @param method: the HTTP method to send.
        @param postdata: request body, or C{None} for no body.
        @param headers: extra request headers; copied into a
            case-insensitive dict.
        @param agent: value of the User-Agent header.
        @param timeout: seconds before the request is aborted; a false
            value disables the timeout.
        @param cookies: dict of cookies to send; it is updated in place
            with cookies set by the server.
        @param followRedirect: whether to follow redirects.
        """
        # Kept on the factory and applied per protocol instance in
        # buildProtocol.  Assigning to self.protocol here would modify the
        # protocol *class* and leak the setting into every other factory.
        self.followRedirect = followRedirect
        self.timeout = timeout
        self.agent = agent

        if cookies is None:
            cookies = {}
        self.cookies = cookies
        if headers is not None:
            self.headers = InsensitiveDict(headers)
        else:
            self.headers = InsensitiveDict()
        if postdata is not None:
            self.headers.setdefault('Content-Length', len(postdata))
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("connection", "close")
        self.postdata = postdata
        self.method = method

        self.setURL(url)

        self.waiting = 1
        self.deferred = defer.Deferred()
        self.response_headers = None

    def __repr__(self):
        return "<%s: %s>" % (self.__class__.__name__, self.url)

    def setURL(self, url):
        """Point the factory at C{url}.

        Scheme, host and port are only replaced when the URL carries both a
        scheme and a host, so a relative URL keeps the current server and
        only changes the path.
        """
        self.url = url
        scheme, host, port, path = _parse(url)
        if scheme and host:
            self.scheme = scheme
            self.host = host
            self.port = port
        self.path = path

    def buildProtocol(self, addr):
        """Build a protocol, apply the redirect policy and arm the timeout."""
        p = protocol.ClientFactory.buildProtocol(self, addr)
        p.followRedirect = self.followRedirect
        if self.timeout:
            timeoutCall = reactor.callLater(self.timeout, p.timeout)
            # Whatever the outcome, make sure the timer does not fire later.
            self.deferred.addBoth(self._cancelTimeout, timeoutCall)
        return p

    def _cancelTimeout(self, result, timeoutCall):
        """Cancel a still-pending timeout call and pass C{result} through."""
        if timeoutCall.active():
            timeoutCall.cancel()
        return result

    def gotHeaders(self, headers):
        """Store the response headers and record any cookies they set.

        @param headers: dict mapping lower-cased header names to lists of
            values, as built by L{HTTPPageGetter.handleHeader}.
        """
        self.response_headers = headers
        if 'set-cookie' in headers:
            for cookie in headers['set-cookie']:
                # Only the leading "name=value" part is kept; attributes
                # after the first ';' are ignored.
                cook = cookie.split(';')[0]
                if '=' not in cook:
                    # Malformed cookie: skip it rather than raise in the
                    # middle of header processing.
                    continue
                k, v = cook.split('=', 1)
                self.cookies[k.lstrip()] = v.lstrip()

    def gotStatus(self, version, status, message):
        """Store the pieces of the response status line."""
        self.version, self.status, self.message = version, status, message

    def page(self, page):
        """Fire the deferred with the page body (at most once)."""
        if self.waiting:
            self.waiting = 0
            self.deferred.callback(page)

    def noPage(self, reason):
        """Fire the deferred with a failure (at most once)."""
        if self.waiting:
            self.waiting = 0
            self.deferred.errback(reason)

    def clientConnectionFailed(self, _, reason):
        """Report a failed connection attempt through the deferred."""
        if self.waiting:
            self.waiting = 0
            self.deferred.errback(reason)


class HTTPDownloader(HTTPClientFactory):
    """Download to a file.

    @ivar requestedPartial: the byte offset a partial download was
        requested from, or 0 when the whole page is wanted.
    @ivar value: the result the deferred is fired with after a successful
        download.
    """

    protocol = HTTPPageDownloader
    value = None

    def __init__(self, url, fileOrName,
                 method='GET', postdata=None, headers=None,
                 agent="Twisted client", supportPartial=0):
        """
        @param url: the URL to download.
        @param fileOrName: a file name (the file is opened when the
            download starts) or an already open file-like object.
        @param method: the HTTP method to send.
        @param postdata: request body, or C{None} for no body.
        @param headers: extra request headers.  Note that when a partial
            download is requested, the "range" header is added to this
            dict in place.
        @param agent: value of the User-Agent header.
        @param supportPartial: when true and the named file already exists
            and is non-empty, request only the bytes after its current
            length.
        """
        self.requestedPartial = 0
        if isinstance(fileOrName, types.StringTypes):
            self.fileName = fileOrName
            self.file = None
            if supportPartial and os.path.exists(self.fileName):
                fileLength = os.path.getsize(self.fileName)
                if fileLength:
                    self.requestedPartial = fileLength
                    if headers is None:
                        headers = {}
                    headers["range"] = "bytes=%d-" % fileLength
        else:
            self.file = fileOrName
        # The base initializer creates self.deferred and sets self.waiting.
        HTTPClientFactory.__init__(self, url, method=method,
                                   postdata=postdata, headers=headers,
                                   agent=agent)

    def gotHeaders(self, headers):
        """Record the headers and validate the reply to a range request.

        If the server did not answer with the Content-Range we asked for,
        the partial request is forgotten.
        """
        # Let the base class store response_headers and cookies first.
        HTTPClientFactory.gotHeaders(self, headers)
        if self.requestedPartial:
            contentRange = headers.get("content-range", None)
            if not contentRange:
                # server doesn't support partial requests, oh well
                self.requestedPartial = 0
                return
            start, end, realLength = http.parseContentRange(contentRange[0])
            if start != self.requestedPartial:
                # server is acting weirdly
                self.requestedPartial = 0

    def openFile(self, partialContent):
        """Open the target file and return it.

        @param partialContent: when true the existing file is opened for
            update and positioned at its end; otherwise it is truncated.
        """
        if partialContent:
            f = open(self.fileName, 'rb+')
            f.seek(0, 2)
        else:
            f = open(self.fileName, 'wb')
        return f

    def pageStart(self, partialContent):
        """Called on page download start.

        @param partialContent: tells us if the download is partial download
            we requested.
        @raise ValueError: if a partial response arrives although no
            partial download was requested.
        """
        if partialContent and not self.requestedPartial:
            raise ValueError("we shouldn't get partial content response "
                             "if we didn't want it!")
        if self.waiting:
            self.waiting = 0
            try:
                if not self.file:
                    self.file = self.openFile(partialContent)
            except IOError:
                self.deferred.errback(failure.Failure())

    def pagePart(self, data):
        """Write a chunk of the page to the file, if one is open."""
        if not self.file:
            return
        try:
            self.file.write(data)
        except IOError:
            # Drop the file so later parts and pageEnd become no-ops.
            self.file = None
            self.deferred.errback(failure.Failure())

    def pageEnd(self):
        """Close the file and fire the deferred with C{self.value}."""
        if not self.file:
            return
        try:
            self.file.close()
        except IOError:
            self.deferred.errback(failure.Failure())
            return
        self.deferred.callback(self.value)


def _parse(url, defaultPort=None):
    """Split C{url} into C{(scheme, host, port, path)}.

    @param url: the URL to split.
    @param defaultPort: port to use when the URL names none; if C{None},
        443 is used for https and 80 otherwise.
    @return: a 4-tuple; C{path} includes params, query and fragment and
        is C{'/'} when the URL has no path at all.
    """
    parsed = urlparse.urlparse(url)
    scheme = parsed[0]
    path = urlparse.urlunparse(('', '') + parsed[2:])
    if defaultPort is None:
        if scheme == 'https':
            defaultPort = 443
        else:
            defaultPort = 80
    host, port = parsed[1], defaultPort
    if ':' in host:
        # NOTE(review): raises ValueError when the network location holds
        # more than one colon or an empty/non-numeric port.
        host, port = host.split(':')
        port = int(port)
    if path == '':
        # An empty request-target is not valid on the request line.
        path = '/'
    return scheme, host, port, path


def getPage(url, contextFactory=None, *args, **kwargs):
    """Download a web page as a string.

    Download a page. Return a deferred, which will callback with a
    page (as a string) or errback with a description of the error.

    See HTTPClientFactory to see what extra args can be passed.

    @param contextFactory: SSL context factory used for https URLs; a
        default C{ssl.ClientContextFactory} is created when omitted.
    """
    scheme, host, port, path = _parse(url)
    factory = HTTPClientFactory(url, *args, **kwargs)
    if scheme == 'https':
        from twisted.internet import ssl
        if contextFactory is None:
            contextFactory = ssl.ClientContextFactory()
        reactor.connectSSL(host, port, factory, contextFactory)
    else:
        reactor.connectTCP(host, port, factory)
    return factory.deferred


def downloadPage(url, file, contextFactory=None, *args, **kwargs):
    """Download a web page to a file.

    @param file: path to file on filesystem, or file-like object.
    @param contextFactory: SSL context factory used for https URLs; a
        default C{ssl.ClientContextFactory} is created when omitted.

    See HTTPDownloader to see what extra args can be passed.
    """
    scheme, host, port, path = _parse(url)
    factory = HTTPDownloader(url, file, *args, **kwargs)
    if scheme == 'https':
        from twisted.internet import ssl
        if contextFactory is None:
            contextFactory = ssl.ClientContextFactory()
        reactor.connectSSL(host, port, factory, contextFactory)
    else:
        reactor.connectTCP(host, port, factory)
    return factory.deferred