I want to allow users to download an archive of multiple large files at once. However, the files and the archive may be too large to store in memory or on disk on my server (they are streamed in from other servers on the fly). I'd like to generate the archive as I stream it to the user.
I can use Tar or Zip or whatever is simplest. I am using django, which allows me to return a generator or file-like object in my response. This object could be used to pump the process along. However, I am having trouble figuring out how to build this sort of thing around the zipfile or tarfile libraries, and I'm afraid they may not support reading files as they go, or reading the archive as it is built.
This answer on converting an iterator to a file-like object might help. tarfile#addfile
takes an iterable, but it appears to immediately pass that to shutil.copyfileobj
, so this may not be as generator-friendly as I had hoped.
I ended up using SpiderOak ZipStream.
You can do it by generating and streaming a zip file with no compression, which is basically to just add the headers before each file's content. You're right, the libraries don't support this, but you can hack around them to get it working.
This code wraps zipfile.ZipFile with a class that manages the stream and creates instances of zipfile.ZipInfo for the files as they come. CRC and size can be set at the end. You can push data from the input stream into it with put_file(), write() and flush(), and read data out of it to the output stream with read().
import struct
import zipfile
import time
from StringIO import StringIO
class ZipStreamer(object):
def __init__(self):
self.out_stream = StringIO()
# write to the stringIO with no compression
self.zipfile = zipfile.ZipFile(self.out_stream, 'w', zipfile.ZIP_STORED)
self.current_file = None
self._last_streamed = 0
def put_file(self, name, date_time=None):
if date_time is None:
date_time = time.localtime(time.time())[:6]
zinfo = zipfile.ZipInfo(name, date_time)
zinfo.compress_type = zipfile.ZIP_STORED
zinfo.flag_bits = 0x08
zinfo.external_attr = 0600 << 16
zinfo.header_offset = self.out_stream.pos
# write right values later
zinfo.CRC = 0
zinfo.file_size = 0
zinfo.compress_size = 0
self.zipfile._writecheck(zinfo)
# write header to stream
self.out_stream.write(zinfo.FileHeader())
self.current_file = zinfo
def flush(self):
zinfo = self.current_file
self.out_stream.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size, zinfo.file_size))
self.zipfile.filelist.append(zinfo)
self.zipfile.NameToInfo[zinfo.filename] = zinfo
self.current_file = None
def write(self, bytes):
self.out_stream.write(bytes)
self.out_stream.flush()
zinfo = self.current_file
# update these...
zinfo.CRC = zipfile.crc32(bytes, zinfo.CRC) & 0xffffffff
zinfo.file_size += len(bytes)
zinfo.compress_size += len(bytes)
def read(self):
i = self.out_stream.pos
self.out_stream.seek(self._last_streamed)
bytes = self.out_stream.read()
self.out_stream.seek(i)
self._last_streamed = i
return bytes
def close(self):
self.zipfile.close()
Keep in mind that this code was just a quick proof of concept and I did no further development or testing once I decided to let the http server itself deal with this problem. A few things you should look into if you decide to use it is to check if nested folders are archived correctly, and filename encoding (which is always a pain with zip files anyway).
You can stream a ZipFile to a Pylons or Django response fileobj by wrapping the fileobj in something file-like that implements tell()
. This will buffer each individual file in the zip in memory, but stream the zip itself. We use it to stream download a zip file full of images, so we never buffer more than a single image in memory.
This example streams to sys.stdout
. For Pylons use response.body_file
, for Django you can use the HttpResponse
itself as a file.
import zipfile
import sys
class StreamFile(object):
def __init__(self, fileobj):
self.fileobj = fileobj
self.pos = 0
def write(self, str):
self.fileobj.write(str)
self.pos += len(str)
def tell(self):
return self.pos
def flush(self):
self.fileobj.flush()
# Wrap a stream so ZipFile can use it
out = StreamFile(sys.stdout)
z = zipfile.ZipFile(out, 'w', zipfile.ZIP_DEFLATED)
for i in range(5):
z.writestr("hello{0}.txt".format(i), "this is hello{0} contents\n".format(i) * 3)
z.close()
Here is the solution from Pedro Werneck (from above) but with a fix to avoid collecting all data in memory (read
method is fixed a little bit):
class ZipStreamer(object):
def __init__(self):
self.out_stream = StringIO.StringIO()
# write to the stringIO with no compression
self.zipfile = zipfile.ZipFile(self.out_stream, 'w', zipfile.ZIP_STORED)
self.current_file = None
self._last_streamed = 0
def put_file(self, name, date_time=None):
if date_time is None:
date_time = time.localtime(time.time())[:6]
zinfo = zipfile.ZipInfo(name, date_time)
zinfo.compress_type = zipfile.ZIP_STORED
zinfo.flag_bits = 0x08
zinfo.external_attr = 0600 << 16
zinfo.header_offset = self.out_stream.pos
# write right values later
zinfo.CRC = 0
zinfo.file_size = 0
zinfo.compress_size = 0
self.zipfile._writecheck(zinfo)
# write header to mega_streamer
self.out_stream.write(zinfo.FileHeader())
self.current_file = zinfo
def flush(self):
zinfo = self.current_file
self.out_stream.write(
struct.pack("<LLL", zinfo.CRC, zinfo.compress_size,
zinfo.file_size))
self.zipfile.filelist.append(zinfo)
self.zipfile.NameToInfo[zinfo.filename] = zinfo
self.current_file = None
def write(self, bytes):
self.out_stream.write(bytes)
self.out_stream.flush()
zinfo = self.current_file
# update these...
zinfo.CRC = zipfile.crc32(bytes, zinfo.CRC) & 0xffffffff
zinfo.file_size += len(bytes)
zinfo.compress_size += len(bytes)
def read(self):
self.out_stream.seek(self._last_streamed)
bytes = self.out_stream.read()
self._last_streamed = 0
# cleaning up memory in each iteration
self.out_stream.seek(0)
self.out_stream.truncate()
self.out_stream.flush()
return bytes
def close(self):
self.zipfile.close()
then you can use stream_generator
function as a stream for a zip file
def stream_generator(files_paths):
s = ZipStreamer()
for f in files_paths:
s.put_file(f)
with open(f) as _f:
s.write(_f.read())
s.flush()
yield s.read()
s.close()
example for Falcon:
class StreamZipEndpoint(object):
def on_get(self, req, resp):
files_pathes = [
'/path/to/file/1',
'/path/to/file/2',
]
zip_filename = 'output_filename.zip'
resp.content_type = 'application/zip'
resp.set_headers([
('Content-Disposition', 'attachment; filename="%s"' % (
zip_filename,))
])
resp.stream = stream_generator(files_pathes)