"""An extremely asynch approach to unzipping files.  This allows you
to unzip a little bit of a file at a time, which means it can
integrate nicely with a reactor.
"""

from __future__ import generators

import zipfile
import os.path
import binascii
import zlib
import struct


# Fixed part of a zip local file header: 30 bytes, starting with this
# signature and ending with two little-endian 16-bit lengths (file name,
# extra field).  The entry's data follows those two variable-length fields.
_LOCAL_HEADER_SIZE = 30
_LOCAL_HEADER_SIGNATURE = b"PK\x03\x04"


def _entryDataOffset(fp, zinfo):
    """Return the absolute offset in fp at which the data of zinfo starts.

    @param fp: the seekable file object of the open archive
    @param zinfo: the ZipInfo describing the entry
    @raise zipfile.BadZipfile: if no local file header is found at
        zinfo.header_offset
    """
    # Old zipfile implementations pre-computed this value; use it when
    # it is there, otherwise derive it from the local file header.
    offset = getattr(zinfo, "file_offset", None)
    if offset is not None:
        return offset
    fp.seek(zinfo.header_offset, 0)
    header = fp.read(_LOCAL_HEADER_SIZE)
    if (len(header) != _LOCAL_HEADER_SIZE
            or header[:4] != _LOCAL_HEADER_SIGNATURE):
        raise zipfile.BadZipfile("Bad magic number for file header")
    nameLength, extraLength = struct.unpack("<HH", header[26:30])
    return (zinfo.header_offset + _LOCAL_HEADER_SIZE
            + nameLength + extraLength)


class ChunkingZipFile(zipfile.ZipFile):
    """A ZipFile object which, with readfile(), also gives you access
    to a filelike object for each entry.
    """

    def readfile(self, name):
        """Return file-like object for name.

        The returned object reads directly from this archive's own file
        object, so only one entry should be read at a time.

        @param name: the name of the archive entry to read
        @raise RuntimeError: if the archive is not open for reading
        @raise zipfile.BadZipfile: if the entry uses a compression method
            other than stored or deflated
        """
        if self.mode not in ("r", "a"):
            raise RuntimeError('read() requires mode "r" or "a"')
        if not self.fp:
            raise RuntimeError(
                "Attempt to read ZIP archive that was already closed")
        zinfo = self.getinfo(name)
        self.fp.seek(_entryDataOffset(self.fp, zinfo), 0)
        if zinfo.compress_type == zipfile.ZIP_STORED:
            return ZipFileEntry(self.fp, zinfo.compress_size)
        elif zinfo.compress_type == zipfile.ZIP_DEFLATED:
            if not zlib:
                raise RuntimeError(
                    "De-compression requires the (missing) zlib module")
            return DeflatedZipFileEntry(self.fp, zinfo.compress_size)
        else:
            raise zipfile.BadZipfile(
                "Unsupported compression method %d for file %s" %
                (zinfo.compress_type, name))

    def read(self, name):
        """Return file bytes (as a string) for name.

        @param name: the name of the archive entry to read
        @raise zipfile.BadZipfile: if the data does not match the CRC-32
            recorded for the entry
        """
        entry = self.readfile(name)
        zinfo = self.getinfo(name)
        data = entry.read()
        # Mask both sides: depending on the interpreter either value may
        # be reported as a signed 32-bit number.
        if (binascii.crc32(data) & 0xffffffff) != (zinfo.CRC & 0xffffffff):
            raise zipfile.BadZipfile("Bad CRC-32 for file %s" % name)
        return data


class ZipFileEntry:
    """File-like object used to read an uncompressed entry in a ZipFile"""

    def __init__(self, fp, length):
        """
        @param fp: file object already positioned at the entry's data
        @param length: number of bytes the entry occupies in fp
        """
        self.fp = fp
        self.readBytes = 0
        self.length = length
        self.finished = 0

    def tell(self):
        """Return the number of bytes read from this entry so far."""
        return self.readBytes

    def read(self, n=None):
        """Read up to n bytes of the entry, or all that is left when n
        is None.  Returns an empty string once the entry is exhausted.
        """
        if n is None:
            n = self.length - self.readBytes
        if n == 0 or self.finished:
            return b''

        # Never read past the end of this entry into the next one.
        data = self.fp.read(min(n, self.length - self.readBytes))
        self.readBytes += len(data)
        if self.readBytes == self.length or len(data) < n:
            self.finished = 1
        return data

    def close(self):
        """Mark the entry as finished and drop the reference to fp."""
        self.finished = 1
        del self.fp


class DeflatedZipFileEntry:
    """File-like object used to read a deflated entry in a ZipFile"""

    def __init__(self, fp, length):
        """
        @param fp: file object already positioned at the entry's data
        @param length: number of compressed bytes the entry occupies in fp
        """
        self.fp = fp
        self.returnedBytes = 0
        self.readBytes = 0
        # Negative window size: the data has no zlib header or checksum.
        self.decomp = zlib.decompressobj(-15)
        self.buffer = b""
        self.length = length
        self.finished = 0

    def tell(self):
        """Return the number of uncompressed bytes handed out so far."""
        return self.returnedBytes

    def read(self, n=None):
        """Return up to n uncompressed bytes, or everything that is left
        when n is None.  Returns an empty string once the entry is
        exhausted.
        """
        if self.finished:
            return b""
        if n is None:
            compressed = self.fp.read(self.length - self.readBytes)
            self.readBytes += len(compressed)
            result = b"".join([
                self.buffer,
                self.decomp.decompress(compressed),
                # one pad byte is fed in before flushing the decompressor
                self.decomp.decompress(b"Z"),
                self.decomp.flush(),
            ])
            self.buffer = b""
            self.finished = 1
            self.returnedBytes += len(result)
            return result

        while len(self.buffer) < n:
            data = self.fp.read(min(n, 1024, self.length - self.readBytes))
            self.readBytes += len(data)
            if not data:
                # No compressed input left: drain the decompressor and
                # hand out whatever remains.
                result = (self.buffer
                          + self.decomp.decompress(b"Z")
                          + self.decomp.flush())
                self.finished = 1
                self.buffer = b""
                self.returnedBytes += len(result)
                return result
            self.buffer += self.decomp.decompress(data)
        result = self.buffer[:n]
        self.buffer = self.buffer[n:]
        self.returnedBytes += len(result)
        return result

    def close(self):
        """Mark the entry as finished and drop the reference to fp."""
        self.finished = 1
        del self.fp


def unzip(filename, directory=".", overwrite=0):
    """Unzip the file

    @param filename: the name of the zip file
    @param directory: the directory into which the files will be
        extracted
    @param overwrite: if on, overwrite files when they exist.  You can
        still get an error if you try to create a directory over a file
        with the same name or vice-versa.
    """
    for i in unzipIter(filename, directory, overwrite):
        pass


# Bit of ZipInfo.external_attr (the MS-DOS directory attribute) that this
# module uses to tell directory entries from file entries.
DIR_BIT = 16


def unzipIter(filename, directory='.', overwrite=0):
    """Return a generator for the zipfile.  This implementation will
    yield after every file.

    The value it yields is the number of files left to unzip.

    @param filename: the name of the zip file
    @param directory: the directory into which the files will be
        extracted; it is created if it does not exist
    @param overwrite: if on, overwrite files when they exist
    """
    # NOTE(review): entry names are joined onto directory unchecked, so an
    # archive containing '..' or absolute names can write outside of it.
    # Validate the names first if the archive is not trusted.
    zf = zipfile.ZipFile(filename, 'r')
    try:
        names = zf.namelist()
        if not os.path.exists(directory):
            os.makedirs(directory)
        remaining = len(names)
        for entry in names:
            remaining = remaining - 1
            isdir = zf.getinfo(entry).external_attr & DIR_BIT
            f = os.path.join(directory, entry)
            if isdir:
                # overwrite flag only applies to files
                if not os.path.exists(f):
                    os.makedirs(f)
            else:
                # create the directory the file will be in first,
                # since we can't guarantee it exists
                fdir = os.path.split(f)[0]
                if not os.path.exists(fdir):
                    os.makedirs(fdir)
                if overwrite or not os.path.exists(f):
                    outfile = open(f, 'wb')
                    try:
                        outfile.write(zf.read(entry))
                    finally:
                        outfile.close()
            yield remaining
    finally:
        zf.close()


def countZipFileChunks(filename, chunksize):
    """Predict the number of chunks that will be extracted from the
    entire zipfile, given chunksize blocks.

    @param filename: the name of the zip file
    @param chunksize: the number of uncompressed bytes per chunk
    """
    totalchunks = 0
    zf = ChunkingZipFile(filename)
    try:
        for info in zf.infolist():
            totalchunks = totalchunks + countFileChunks(info, chunksize)
    finally:
        zf.close()
    return totalchunks


def countFileChunks(zipinfo, chunksize):
    """Return the number of chunksize blocks needed for one entry.

    @param zipinfo: the ZipInfo of the entry; only file_size is used
    @param chunksize: the number of uncompressed bytes per chunk
    """
    size = zipinfo.file_size
    # floor division, so the count stays an integer on every interpreter
    count = size // chunksize
    if size % chunksize > 0:
        count = count + 1
    # each file counts as at least one chunk
    return count or 1


def countZipFileEntries(filename):
    """Return the number of entries in the zip file named filename."""
    zf = zipfile.ZipFile(filename)
    try:
        return len(zf.namelist())
    finally:
        zf.close()


def unzipIterChunky(filename, directory='.', overwrite=0,
                    chunksize=4096):
    """Return a generator for the zipfile.  This implementation will
    yield after every chunksize uncompressed bytes, or at the end of a
    file, whichever comes first.

    The value it yields is the number of chunks left to unzip.

    @param filename: the name of the zip file
    @param directory: the directory into which the files will be
        extracted; it is created if it does not exist
    @param overwrite: if on, overwrite files when they exist
    @param chunksize: the number of uncompressed bytes per chunk
    @raise zipfile.BadZipfile: if an entry runs out of data before its
        recorded size has been extracted
    """
    # NOTE(review): entry names are joined onto directory unchecked, so an
    # archive containing '..' or absolute names can write outside of it.
    # Validate the names first if the archive is not trusted.
    czf = ChunkingZipFile(filename, 'r')
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
        names = czf.namelist()
        infos = czf.infolist()
        remaining = 0
        for info in infos:
            remaining = remaining + countFileChunks(info, chunksize)

        for entry, info in zip(names, infos):
            isdir = info.external_attr & DIR_BIT
            f = os.path.join(directory, entry)
            if isdir:
                # overwrite flag only applies to files
                if not os.path.exists(f):
                    os.makedirs(f)
                remaining = remaining - 1
                assert remaining >= 0
                yield remaining
            else:
                # create the directory the file will be in first,
                # since we can't guarantee it exists
                fdir = os.path.split(f)[0]
                if not os.path.exists(fdir):
                    os.makedirs(fdir)
                if overwrite or not os.path.exists(f):
                    outfile = open(f, 'wb')
                    try:
                        fp = czf.readfile(entry)
                        size = info.file_size
                        if size == 0:
                            # an empty file still counts as one chunk
                            remaining = remaining - 1
                            assert remaining >= 0
                            yield remaining
                        while fp.tell() < size:
                            hunk = fp.read(chunksize)
                            if not hunk:
                                # The entry ended before file_size bytes
                                # were produced; looping on would never
                                # terminate.
                                raise zipfile.BadZipfile(
                                    "Truncated data for file %s" % entry)
                            outfile.write(hunk)
                            remaining = remaining - 1
                            assert remaining >= 0
                            yield remaining
                    finally:
                        outfile.close()
                else:
                    # skipped file: account for all of its chunks at once
                    remaining = remaining - countFileChunks(info, chunksize)
                    assert remaining >= 0
                    yield remaining
    finally:
        czf.close()