How can I download multiple links simultaneously? My script below works, but it only downloads one file at a time and is extremely slow. I can't figure out how to incorporate multithreading into it.
The Python script:
from BeautifulSoup import BeautifulSoup
import lxml.html as html
import urlparse
import os, sys
import urllib2
import re
print ("downloading and parsing Bibles...")
root = html.parse(open('links.html'))
for link in root.findall('//a'):
    url = link.get('href')
    name = urlparse.urlparse(url).path.split('/')[-1]
    dirname = urlparse.urlparse(url).path.split('.')[-1]
    f = urllib2.urlopen(url)
    s = f.read()
    if (os.path.isdir(dirname) == 0):
        os.mkdir(dirname)
    soup = BeautifulSoup(s)
    articleTag = soup.html.body.article
    converted = str(articleTag)
    full_path = os.path.join(dirname, name)
    open(full_path, 'w').write(converted)
    print(name)
The HTML file, called links.html:
<a href="http://www.youversion.com/bible/gen.1.nmv-fas">http://www.youversion.com/bible/gen.1.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.2.nmv-fas">http://www.youversion.com/bible/gen.2.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.3.nmv-fas">http://www.youversion.com/bible/gen.3.nmv-fas</a>
<a href="http://www.youversion.com/bible/gen.4.nmv-fas">http://www.youversion.com/bible/gen.4.nmv-fas</a>
This looks like the producer-consumer problem (see Wikipedia). You could use something like:
from BeautifulSoup import BeautifulSoup
import lxml.html as html
import urlparse
import urllib2
import os
import Queue, thread

# create a Queue.Queue here
queue = Queue.Queue()

print ("downloading and parsing Bibles...")
root = html.parse(open('links.html'))
for link in root.findall('//a'):
    url = link.get('href')
    queue.put(url)  # produce

def downloader():
    while True:
        url = queue.get()  # consume
        name = urlparse.urlparse(url).path.split('/')[-1]
        dirname = urlparse.urlparse(url).path.split('.')[-1]
        f = urllib2.urlopen(url)
        s = f.read()
        if not os.path.isdir(dirname):
            try:
                os.mkdir(dirname)
            except OSError:
                pass  # another thread may have created it first
        soup = BeautifulSoup(s)
        articleTag = soup.html.body.article
        converted = str(articleTag)
        full_path = os.path.join(dirname, name)
        open(full_path, 'wb').write(converted)
        print(name)
        queue.task_done()

for _ in range(4):
    thread.start_new_thread(downloader, ())  # start 4 consumer threads

queue.join()  # wait until every URL has been processed
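If you prefer the higher-level threading module over the low-level thread module, a rough sketch of the same producer-consumer idea looks like this (download_one is a hypothetical helper wrapping the fetch-parse-save logic from the question, and worker_count is just a number I picked):

import Queue
import threading

def worker(queue):
    # consume URLs until the producer sends a sentinel meaning "no more work"
    while True:
        url = queue.get()
        if url is None:
            break
        download_one(url)  # hypothetical helper: fetch, parse and save one URL

queue = Queue.Queue()
worker_count = 4
threads = [threading.Thread(target=worker, args=(queue,)) for _ in range(worker_count)]
for t in threads:
    t.start()

for link in root.findall('//a'):
    queue.put(link.get('href'))      # produce

for _ in threads:
    queue.put(None)                  # one sentinel per worker
for t in threads:
    t.join()                         # wait for all downloads to finish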
I use multiprocessing for parallelizing things; for some reason I like it better than threading.
from BeautifulSoup import BeautifulSoup
import lxml.html as html
import urlparse
import os, sys
import urllib2
import re
import multiprocessing

print ("downloading and parsing Bibles...")

def download_stuff(url):
    name = urlparse.urlparse(url).path.split('/')[-1]
    dirname = urlparse.urlparse(url).path.split('.')[-1]
    f = urllib2.urlopen(url)
    s = f.read()
    if not os.path.isdir(dirname):
        os.mkdir(dirname)
    soup = BeautifulSoup(s)
    articleTag = soup.html.body.article
    converted = str(articleTag)
    full_path = os.path.join(dirname, name)
    open(full_path, 'w').write(converted)
    print(name)

root = html.parse(open('links.html'))
# pass plain URL strings to the pool; lxml elements can't be pickled
links = [link.get('href') for link in root.findall('//a')]
pool = multiprocessing.Pool(processes=5)  # use 5 processes to download the data
output = pool.map(download_stuff, links)  # output is a list of [None, None, ...] since download_stuff doesn't return anything
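Since downloading is mostly I/O-bound, a variation worth trying (my suggestion, not part of the answer above) is multiprocessing.dummy, which offers the same Pool API backed by threads instead of processes:

from multiprocessing.dummy import Pool  # same interface as multiprocessing.Pool, but thread-based

# reuses download_stuff and links from the snippet above
pool = Pool(5)                            # 5 worker threads
output = pool.map(download_stuff, links)
pool.close()
pool.join()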
In 2017 there are other options, like asyncio and ThreadPoolExecutor.
Here is an example using ThreadPoolExecutor (part of the concurrent.futures module in the standard library):
from concurrent.futures import ThreadPoolExecutor

def download(url, filename):
    # ... your download function ...
    pass

with ThreadPoolExecutor(max_workers=12) as executor:
    future = executor.submit(download, url, filename)
    print(future.result())
The submit() function puts the task on an internal queue for you (queue management is done by the executor).
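For many links you would typically submit one task per URL and collect the results as they finish. Here is a small sketch, assuming the download(url, filename) function above and a urls list of your own:

from concurrent.futures import ThreadPoolExecutor, as_completed

urls = ['http://www.youversion.com/bible/gen.1.nmv-fas',
        'http://www.youversion.com/bible/gen.2.nmv-fas']   # your list of links

with ThreadPoolExecutor(max_workers=12) as executor:
    # one future per URL; the executor hands them out to its worker threads
    futures = {executor.submit(download, url, url.split('/')[-1]): url for url in urls}
    for future in as_completed(futures):
        url = futures[future]
        try:
            future.result()                  # re-raises any exception from the worker
            print('done: %s' % url)
        except Exception as exc:
            print('failed: %s (%s)' % (url, exc))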
In Python 3.5 and above, if max_workers is None or not given, it will default to the number of processors on the machine, multiplied by 5.
In practice you can set max_workers to a few times the number of CPU cores; run some tests to see how far you can go, depending on the context-switching overhead.
For more info:
https://docs.python.org/3/library/concurrent.futures.html
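And since asyncio was mentioned above, here is a rough sketch of that route; it assumes the third-party aiohttp package (asyncio itself has no HTTP client) and only fetches the pages, leaving the parse-and-save step to you:

import asyncio
import aiohttp  # third-party; pip install aiohttp

async def fetch(session, url):
    # download one page and return its body as text
    async with session.get(url) as response:
        return await response.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        # schedule all downloads concurrently and wait for all of them
        pages = await asyncio.gather(*(fetch(session, url) for url in urls))
    for url, page in zip(urls, pages):
        print('%s: %d characters' % (url, len(page)))

urls = ['http://www.youversion.com/bible/gen.1.nmv-fas',
        'http://www.youversion.com/bible/gen.2.nmv-fas']
asyncio.run(main(urls))  # Python 3.7+; on older versions use loop.run_until_complete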