I want to use coroutines to crawl and parse webpages. I wrote a sample and tested it. The program runs well under Python 3.5 on Ubuntu 16.04 and quits when all the work is done. The source code is below.
import aiohttp
import asyncio
from bs4 import BeautifulSoup

async def coro():
    coro_loop = asyncio.get_event_loop()
    url = u'https://www.python.org/'
    for _ in range(4):
        async with aiohttp.ClientSession(loop=coro_loop) as coro_session:
            with aiohttp.Timeout(30, loop=coro_session.loop):
                async with coro_session.get(url) as resp:
                    print('get response from url: %s' % url)
                    source_code = await resp.read()
                    soup = BeautifulSoup(source_code, 'lxml')

def main():
    loop = asyncio.get_event_loop()
    worker = loop.create_task(coro())
    try:
        loop.run_until_complete(worker)
    except KeyboardInterrupt:
        print('keyboard interrupt')
        worker.cancel()
    finally:
        loop.stop()
        loop.run_forever()
        loop.close()

if __name__ == '__main__':
    main()
While testing, I found that when I shut the program down with Ctrl+C, the error 'Task exception was never retrieved' is printed.
^Ckeyboard interrupt
Task exception was never retrieved
future: <Task finished coro=<coro() done, defined at ./test.py:8> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "./test.py", line 23, in main
    loop.run_until_complete(worker)
  File "/usr/lib/python3.5/asyncio/base_events.py", line 375, in run_until_complete
    self.run_forever()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 345, in run_forever
    self._run_once()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 1312, in _run_once
    handle._run()
  File "/usr/lib/python3.5/asyncio/events.py", line 125, in _run
    self._callback(*self._args)
  File "/usr/lib/python3.5/asyncio/tasks.py", line 307, in _wakeup
    self._step()
  File "/usr/lib/python3.5/asyncio/tasks.py", line 239, in _step
    result = coro.send(None)
  File "./test.py", line 17, in coro
    soup = BeautifulSoup(source_code, 'lxml')
  File "/usr/lib/python3/dist-packages/bs4/__init__.py", line 215, in __init__
    self._feed()
  File "/usr/lib/python3/dist-packages/bs4/__init__.py", line 239, in _feed
    self.builder.feed(self.markup)
  File "/usr/lib/python3/dist-packages/bs4/builder/_lxml.py", line 240, in feed
    self.parser.feed(markup)
  File "src/lxml/parser.pxi", line 1194, in lxml.etree._FeedParser.feed (src/lxml/lxml.etree.c:119773)
  File "src/lxml/parser.pxi", line 1316, in lxml.etree._FeedParser.feed (src/lxml/lxml.etree.c:119644)
  File "src/lxml/parsertarget.pxi", line 141, in lxml.etree._TargetParserContext._handleParseResult (src/lxml/lxml.etree.c:137264)
  File "src/lxml/parsertarget.pxi", line 135, in lxml.etree._TargetParserContext._handleParseResult (src/lxml/lxml.etree.c:137128)
  File "src/lxml/lxml.etree.pyx", line 324, in lxml.etree._ExceptionContext._raise_if_stored (src/lxml/lxml.etree.c:11090)
  File "src/lxml/saxparser.pxi", line 499, in lxml.etree._handleSaxData (src/lxml/lxml.etree.c:131013)
  File "src/lxml/parsertarget.pxi", line 88, in lxml.etree._PythonSaxParserTarget._handleSaxData (src/lxml/lxml.etree.c:136397)
  File "/usr/lib/python3/dist-packages/bs4/builder/_lxml.py", line 206, in data
    def data(self, content):
KeyboardInterrupt
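If I read the traceback correctly, Ctrl+C raises KeyboardInterrupt inside the task's step (here, while BeautifulSoup is parsing), so the task finishes with that exception stored on it, and since nothing ever reads the stored exception, asyncio complains when the task is garbage-collected. Below is a minimal sketch of the except block I would use to inspect the task; this is my own guess about the mechanism, not a confirmed fix:

    except KeyboardInterrupt:
        print('keyboard interrupt')
        # By now the task has usually already finished with the
        # KeyboardInterrupt stored on it, so cancel() has no effect.
        print('worker done: %s' % worker.done())
        if worker.done():
            # Reading the exception marks it as retrieved, which should
            # suppress the 'Task exception was never retrieved' message.
            print('worker exception: %r' % worker.exception())
        else:
            worker.cancel()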
I looked through the official Python docs but haven't found a clue. I also tried to capture the KeyboardInterrupt in coro():
try:
    soup = BeautifulSoup(source_code, 'lxml')
except KeyboardInterrupt:
    print('capture exception')
    raise
Every time the try/except around BeautifulSoup() captures the KeyboardInterrupt, the error occurs. It seems that BeautifulSoup contributes to the error, but how can I tackle it?
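One workaround I am considering, sketched below and not verified: after the interrupt, run the loop again until the task really finishes, so that awaiting it retrieves whatever exception it holds before the loop is closed.

    def main():
        loop = asyncio.get_event_loop()
        worker = loop.create_task(coro())
        try:
            loop.run_until_complete(worker)
        except KeyboardInterrupt:
            print('keyboard interrupt')
            worker.cancel()
            # Let the loop finish the task; awaiting it here retrieves the
            # stored exception (KeyboardInterrupt or CancelledError).
            try:
                loop.run_until_complete(worker)
            except (asyncio.CancelledError, KeyboardInterrupt):
                pass
        finally:
            loop.stop()
            loop.run_forever()
            loop.close()

Is this the right approach, or am I misunderstanding how tasks consume their exceptions?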