Python seek on remote file using HTTP

2019-01-14 04:21发布

问题:

How do I seek to a particular position on a remote (HTTP) file so I can download only that part?

Lets say the bytes on a remote file were: 1234567890

I wanna seek to 4 and download 3 bytes from there so I would have: 456

and also, how do I check if a remote file exists? I tried, os.path.isfile() but it returns False when I'm passing a remote file url.

回答1:

If you are downloading the remote file through HTTP, you need to set the Range header.

Check in this example how it can be done. Looks like this:

myUrlclass.addheader("Range","bytes=%s-" % (existSize))

EDIT: I just found a better implementation. This class is very simple to use, as it can be seen in the docstring.

class HTTPRangeHandler(urllib2.BaseHandler):
"""Handler that enables HTTP Range headers.

This was extremely simple. The Range header is a HTTP feature to
begin with so all this class does is tell urllib2 that the 
"206 Partial Content" reponse from the HTTP server is what we 
expected.

Example:
    import urllib2
    import byterange

    range_handler = range.HTTPRangeHandler()
    opener = urllib2.build_opener(range_handler)

    # install it
    urllib2.install_opener(opener)

    # create Request and set Range header
    req = urllib2.Request('http://www.python.org/')
    req.header['Range'] = 'bytes=30-50'
    f = urllib2.urlopen(req)
"""

def http_error_206(self, req, fp, code, msg, hdrs):
    # 206 Partial Content Response
    r = urllib.addinfourl(fp, hdrs, req.get_full_url())
    r.code = code
    r.msg = msg
    return r

def http_error_416(self, req, fp, code, msg, hdrs):
    # HTTP's Range Not Satisfiable error
    raise RangeError('Requested Range Not Satisfiable')

Update: The "better implementation" has moved to github: excid3/urlgrabber in the byterange.py file.



回答2:

I highly recommend using the requests library. It is easily the best HTTP library I have ever used. In particular, to accomplish what you have described, you would do something like:

import requests

url = "http://www.sffaudio.com/podcasts/ShellGameByPhilipK.Dick.pdf"

# Retrieve bytes between offsets 3 and 5 (inclusive).
r = requests.get(url, headers={"range": "bytes=3-5"})

# If a 4XX client error or a 5XX server error is encountered, we raise it.
r.raise_for_status()


回答3:

AFAIK, this is not possible using fseek() or similar. You need to use the HTTP Range header to achieve this. This header may or may not be supported by the server, so your mileage may vary.

import urllib2

myHeaders = {'Range':'bytes=0-9'}

req = urllib2.Request('http://www.promotionalpromos.com/mirrors/gnu/gnu/bash/bash-1.14.3-1.14.4.diff.gz',headers=myHeaders)

partialFile = urllib2.urlopen(req)

s2 = (partialFile.read())

EDIT: This is of course assuming that by remote file you mean a file stored on a HTTP server...

If the file you want is on an FTP server, FTP only allows to to specify a start offset and not a range. If this is what you want, then the following code should do it (not tested!)

import ftplib
fileToRetrieve = 'somefile.zip'
fromByte = 15
ftp = ftplib.FTP('ftp.someplace.net')
outFile = open('partialFile', 'wb')
ftp.retrbinary('RETR '+ fileToRetrieve, outFile.write, rest=str(fromByte))
outFile.close()


回答4:

I did not find any existing implementations of a file-like interface with seek() to HTTP URLs, so I rolled my own simple version: https://github.com/valgur/pyhttpio. It depends on urllib.request but could probably easily be modified to use requests, if necessary.

The full code:

import cgi
import time
import urllib.request
from io import IOBase
from sys import stderr


class SeekableHTTPFile(IOBase):
    def __init__(self, url, name=None, repeat_time=-1, debug=False):
        """Allow a file accessible via HTTP to be used like a local file by utilities
         that use `seek()` to read arbitrary parts of the file, such as `ZipFile`.
        Seeking is done via the 'range: bytes=xx-yy' HTTP header.

        Parameters
        ----------
        url : str
            A HTTP or HTTPS URL
        name : str, optional
            The filename of the file.
            Will be filled from the Content-Disposition header if not provided.
        repeat_time : int, optional
            In case of HTTP errors wait `repeat_time` seconds before trying again.
            Negative value or `None` disables retrying and simply passes on the exception (the default).
        """
        super().__init__()
        self.url = url
        self.name = name
        self.repeat_time = repeat_time
        self.debug = debug
        self._pos = 0
        self._seekable = True
        with self._urlopen() as f:
            if self.debug:
                print(f.getheaders())
            self.content_length = int(f.getheader("Content-Length", -1))
            if self.content_length < 0:
                self._seekable = False
            if f.getheader("Accept-Ranges", "none").lower() != "bytes":
                self._seekable = False
            if name is None:
                header = f.getheader("Content-Disposition")
                if header:
                    value, params = cgi.parse_header(header)
                    self.name = params["filename"]

    def seek(self, offset, whence=0):
        if not self.seekable():
            raise OSError
        if whence == 0:
            self._pos = 0
        elif whence == 1:
            pass
        elif whence == 2:
            self._pos = self.content_length
        self._pos += offset
        return self._pos

    def seekable(self, *args, **kwargs):
        return self._seekable

    def readable(self, *args, **kwargs):
        return not self.closed

    def writable(self, *args, **kwargs):
        return False

    def read(self, amt=-1):
        if self._pos >= self.content_length:
            return b""
        if amt < 0:
            end = self.content_length - 1
        else:
            end = min(self._pos + amt - 1, self.content_length - 1)
        byte_range = (self._pos, end)
        self._pos = end + 1
        with self._urlopen(byte_range) as f:
            return f.read()

    def readall(self):
        return self.read(-1)

    def tell(self):
        return self._pos

    def __getattribute__(self, item):
        attr = object.__getattribute__(self, item)
        if not object.__getattribute__(self, "debug"):
            return attr

        if hasattr(attr, '__call__'):
            def trace(*args, **kwargs):
                a = ", ".join(map(str, args))
                if kwargs:
                    a += ", ".join(["{}={}".format(k, v) for k, v in kwargs.items()])
                print("Calling: {}({})".format(item, a))
                return attr(*args, **kwargs)

            return trace
        else:
            return attr

    def _urlopen(self, byte_range=None):
        header = {}
        if byte_range:
            header = {"range": "bytes={}-{}".format(*byte_range)}
        while True:
            try:
                r = urllib.request.Request(self.url, headers=header)
                return urllib.request.urlopen(r)
            except urllib.error.HTTPError as e:
                if self.repeat_time is None or self.repeat_time < 0:
                    raise
                print("Server responded with " + str(e), file=stderr)
                print("Sleeping for {} seconds before trying again".format(self.repeat_time), file=stderr)
                time.sleep(self.repeat_time)

A small usage example:

url = "https://www.python.org/ftp/python/3.5.0/python-3.5.0-embed-amd64.zip"
f = SeekableHTTPFile(url, debug=True)
zf = ZipFile(f)
zf.printdir()
zf.extract("python.exe")

Edit: There is actually a mostly identical, if slightly more minimal, implementation in this answer: https://stackoverflow.com/a/7852229/2997179



标签: python http seek