Using Tornado futures to fetch URLs: two different ways

2019-06-13 08:04发布

问题:

I want to use Tornado to fetch a batch of URLs. My code is shown below:

from tornado.concurrent import Future
from tornado.httpclient import AsyncHTTPClient    
from tornado.ioloop import IOLoop    


class BatchHttpClient(object):    
    """Fetch a batch of URLs with Tornado's AsyncHTTPClient."""
    def __init__(self, urls, timeout=20):    
        self.async_http_client = AsyncHTTPClient()    
        self.urls = urls    
        # NOTE(review): the `timeout` argument is ignored; 20 is always stored.
        self.timeout = 20    

    def __mid(self):    
        """Start one fetch per URL and return a list of intermediate Futures."""
        results = []    
        for url in self.urls:    
            future = Future()    

            # NOTE: `future` is looked up when f_callback runs, not when it is
            # defined, so every callback sees whatever the loop last bound.
            def f_callback(f1):    
                future.set_result(f1.result())    

            f = self.async_http_client.fetch(url)    
            f.add_done_callback(f_callback)    
            results.append(future)    
        return results    

    def get_batch(self):    
        """Run __mid on the current IOLoop via run_sync and return its result."""
        results = IOLoop.current().run_sync(self.__mid)    
        return results    


# Build ten URLs that differ only in the `v` query parameter and fetch them.
urls = ["http://www.baidu.com?v={}".format(i) for i in range(10)]    
batch_http_client = BatchHttpClient(urls)    
# Python 2 print statement: prints whatever get_batch() returns.
print batch_http_client.get_batch()    

When I run the code, an error occurs:

ERROR:tornado.application:Exception in callback <function f_callback at 0x7f35458cae60> for <tornado.concurrent.Future object at 0x7f35458c9650>
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/tornado/concurrent.py", line 317, in _set_done
    cb(self)
  File "/home/q/www/base_data_manager/utils/async_util.py", line 21, in f_callback
    future.set_result(f1.result())
  File "/usr/local/lib/python2.7/dist-packages/tornado/concurrent.py", line 271, in set_result
    self._set_done()
  File "/usr/local/lib/python2.7/dist-packages/tornado/concurrent.py", line 315, in _set_done
    for cb in self._callbacks:
TypeError: 'NoneType' object is not iterable

But if I change the code like this:

class BatchHttpClient(object):
    """Fetch a batch of URLs with Tornado's AsyncHTTPClient."""
    def __init__(self, urls, timeout=20):
        self.async_http_client = AsyncHTTPClient()
        self.urls = urls
        # NOTE(review): the `timeout` argument is ignored; 20 is always stored.
        self.timeout = 20

    def _get_batch(self, url):
        """Start a fetch for one URL and return a new Future that receives its result."""
        # `future` is local to this call, so each callback closes over its own Future.
        future = Future()
        f = self.async_http_client.fetch(url)
        def callback(f1):
            # Debug output (Python 2 print statements).
            print future
            print f1.result()

            future.set_result(f1.result())
            print '---------'
        f.add_done_callback(callback)
        return future

    def __mid(self):
        """Call _get_batch for every URL and return the list of Futures."""
        results = []
        for url in self.urls:
            results.append(self._get_batch(url))
        return results

    def get_batch(self):
        """Run __mid on the current IOLoop via run_sync and return its result."""
        results = IOLoop.current().run_sync(self.__mid)
        return results


# Build ten URLs that differ only in the `v` query parameter, fetch them,
# and print the body of each result (Python 2 print statement).
urls = ["http://www.baidu.com?v={}".format(i) for i in range(10)]
batch_http_client = BatchHttpClient(urls)
for result in batch_http_client.get_batch():
    print result.body

Then it works. All I did was add an intermediate function, so why are the results different?

回答1:

In your first code snippet, the problem is that by the time your callbacks execute, the value of future is the last value set by the loop. In other words, when this executes:

def f_callback(f1):    
    # `future` is resolved when this runs, i.e. after the loop has rebound it.
    future.set_result(f1.result())    

the value of future is always the same. You can see this if you add a print future: the object's address will always be the same.

In your second snippet, each future and each callback are created in a function called by the loop. So each callback gets its value for future from a new scope, which fixes the problem.

Another way to fix the issue would be to modify __mid like this:

def __mid(self):
    """Start one fetch per URL and return a list of intermediate Futures."""
    results = []
    for url in self.urls:
        future = Future()

        # The factory takes `future` as a parameter, so the inner callback
        # closes over that call's own binding rather than the loop variable.
        def make_callback(future):
            def f_callback(f1):
                future.set_result(f1.result())
            return f_callback

        f = self.async_http_client.fetch(url)
        f.add_done_callback(make_callback(future))
        results.append(future)
    return results

By creating the callback in make_callback(future), the value of future in the callbacks comes from a different scope for each callback.



回答2:

Louis's answer is correct, but I'd like to suggest a few simpler alternatives.

First, you could use functools.partial instead of a make_callback wrapper function:

def __mid(self):
    """Start one fetch per URL and return a list of intermediate Futures.

    Each returned Future is resolved with the result of the corresponding
    fetch once that fetch completes.
    """
    import functools

    results = []
    for url in self.urls:
        future = Future()

        def f_callback(output, source):
            # `source` is the completed fetch future that add_done_callback
            # passes in; copy its result into the bound `output` future.
            output.set_result(source.result())

        f = self.async_http_client.fetch(url)
        # partial() binds the current value of future to
        # the output argument.
        f.add_done_callback(functools.partial(f_callback, future))
        results.append(future)
    return results

But the intermediate Future looks completely unnecessary. This is equivalent to:

def __mid(self):
    """Return the list of Futures produced by calling fetch() on each URL."""
    return [self.async_http_client.fetch(url) for url in self.urls]

Personally I would make __mid a coroutine:

@gen.coroutine
def __mid(self):
    """Fetch every URL and resolve to the list of responses.

    Yielding a list of Futures makes the coroutine wait for all of them.
    """
    responses = yield [self.async_http_client.fetch(url) for url in self.urls]
    # gen.Return also works on Python 2, where a generator cannot
    # `return` a value.
    raise gen.Return(responses)

If you don't want to use coroutines, you may prefer to pass a callback to AsyncHTTPClient.fetch instead of using Future.add_done_callback on its result.