How to zip a very large file in python

Published 2019-03-31 02:56

I would like to zip a couple of files that may amount to about 99 GB, using Python. What is the most efficient way to do this with the zipfile library? This is a sample of the code I have:

# Build the archive directly on GCS.  allowZip64 is required because the
# combined payload (~99 GB) far exceeds the 4 GB classic-ZIP limit.
with gcs.open(zip_file_name, 'w', content_type=b'application/zip') as f:

    with zipfile.ZipFile(f, 'w', allowZip64=True) as z:

        # Evaluate the clock once so both window checks agree.
        now = datetime.now()

        for file in files:

            is_owner = (is_page_allowed_to_visitor(page, visitor)
                        or file.owner_id == visitor.id)

            if is_owner:
                file.show = True
            else:
                # Non-owners see the file only inside its availability
                # window.  (The original elif-chain skipped the
                # available_to check whenever available_from was set, and
                # left file.show stale when a window test did not match.)
                file.show = True
                if file.available_from and file.available_from > now:
                    file.show = False
                if file.available_to and file.available_to < now:
                    file.show = False

            if file.show:

                file_name = "/%s/%s" % (gcs_store.get_bucket_name(), file.gcs_name)

                # Close the reader even if writestr raises.
                # NOTE(review): .read() pulls the whole object into memory;
                # for multi-GB members a chunked write (e.g. the
                # writebuffered() helper in the answer below) is needed.
                gcs_reader = gcs.open(file_name, 'r')
                try:
                    z.writestr('%s-%s' % (file.created_on, file.name),
                               gcs_reader.read())
                finally:
                    gcs_reader.close()

# No explicit f.close() here: both `with` blocks above already closed the
# zip stream and the GCS file (the original trailing close was redundant).

Some points to note:

1) I am using Google App Engine to host the files, so I cannot use the zipfile.write() method. I can only get the file contents in bytes.

Thanks in advance

1 answer

Answered by 你好瞎i on 2019-03-31 03:28

I have added a new method to the zipfile library. This enhanced zipfile library is open source and can be found on GitHub (EnhancedZipFile). I added a new method, inspired by the zipfile.write() and zipfile.writestr() methods:

def writebuffered(self, zinfo_or_arcname, file_pointer, file_size, compress_type=None):
    """Stream the contents of a readable file object into the archive.

    Works like ZipFile.writestr() but reads from *file_pointer* in 8 KiB
    chunks instead of requiring the whole payload in memory, so it can
    archive multi-gigabyte objects.

    Args:
        zinfo_or_arcname: archive member name, or a prepared ZipInfo.
        file_pointer: readable binary file object positioned at the data.
        file_size: expected uncompressed size in bytes (only used to
            decide up front whether the ZIP64 extension is needed; the
            real size is recomputed while streaming).
        compress_type: optional override of the archive-level compression.

    Raises:
        RuntimeError: if the streamed data crosses ZIP64_LIMIT after the
            header was already written without the ZIP64 extension.
    """
    if not isinstance(zinfo_or_arcname, ZipInfo):
        zinfo = ZipInfo(filename=zinfo_or_arcname,
                        date_time=time.localtime(time.time())[:6])

        zinfo.compress_type = self.compression
        if zinfo.filename[-1] == '/':
            zinfo.external_attr = 0o40775 << 16   # drwxrwxr-x
            zinfo.external_attr |= 0x10           # MS-DOS directory flag
        else:
            zinfo.external_attr = 0o600 << 16     # ?rw-------
    else:
        zinfo = zinfo_or_arcname
    if compress_type is not None:
        # BUG FIX: the parameter was previously accepted but ignored.
        zinfo.compress_type = compress_type

    zinfo.file_size = file_size             # Uncompressed size (caller estimate)
    zinfo.header_offset = self.fp.tell()    # Start of header bytes
    self._writecheck(zinfo)
    self._didModify = True

    # The local header goes out first with zeroed CRC/sizes; the real
    # values are patched in after the data has been streamed (this
    # mirrors zipfile.ZipFile.write()).
    zinfo.CRC = CRC = 0
    zinfo.compress_size = compress_size = 0
    # Compressed size can be larger than uncompressed size — hence the 5% pad.
    zip64 = self._allowZip64 and \
            zinfo.file_size * 1.05 > ZIP64_LIMIT
    self.fp.write(zinfo.FileHeader(zip64))
    if zinfo.compress_type == ZIP_DEFLATED:
        # Raw deflate stream (negative wbits: no zlib header/trailer).
        cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
                                zlib.DEFLATED, -15)
    else:
        cmpr = None
    file_size = 0
    while True:
        buf = file_pointer.read(1024 * 8)
        if not buf:
            break
        file_size += len(buf)
        CRC = crc32(buf, CRC) & 0xffffffff
        if cmpr:
            buf = cmpr.compress(buf)
            compress_size += len(buf)
        self.fp.write(buf)

    if cmpr:
        buf = cmpr.flush()
        compress_size += len(buf)
        self.fp.write(buf)
        zinfo.compress_size = compress_size
    else:
        zinfo.compress_size = file_size
    zinfo.CRC = CRC
    zinfo.file_size = file_size
    if not zip64 and self._allowZip64:
        if file_size > ZIP64_LIMIT:
            raise RuntimeError('File size has increased during compressing')
        if compress_size > ZIP64_LIMIT:
            raise RuntimeError('Compressed size larger than uncompressed size')
    # BUG FIX: the original version computed `position` but never seeked
    # back, leaving zeroed CRC/size fields in the local header and thus a
    # corrupt archive.  Rewrite the header with the real values (same
    # zip64 flag, so the header length is unchanged), then restore the
    # write position.  Requires a seekable self.fp.
    position = self.fp.tell()       # Preserve current position in file
    self.fp.seek(zinfo.header_offset, 0)
    self.fp.write(zinfo.FileHeader(zip64))
    self.fp.seek(position, 0)
    self.fp.flush()
    self.filelist.append(zinfo)
    self.NameToInfo[zinfo.filename] = zinfo

Points to note

See more

Log in to post an answer.