I have this code that reads a file and processes it. The file is quite big, 12 million lines, so currently I split it manually into 1000-line files and start a separate process for each chunk sequentially (via a bash script).
Is there a way to use Twisted to load one file and process it 1000 items at a time (a progress bar would be nice) without me having to split it manually?
scanner.py
import argparse
import re  # used for the e-mail regex in cbBody
from tqdm import tqdm
from sys import argv
from pprint import pformat
from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers
import lxml.html
from geoip import geolite2
import pycountry
from tld import get_tld
import json
import socket

poweredby = ""
server = ""
ip = ""


def cbRequest(response, url):
    global poweredby, server, ip
    # print 'Response version:', response.version
    # print 'Response code:', response.code
    # print 'Response phrase:', response.phrase
    # print 'Response headers:'
    # print pformat(list(response.headers.getAllRawHeaders()))
    poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
    server = response.headers.getRawHeaders("Server")[0]
    # print poweredby
    # print server
    d = readBody(response)
    d.addCallback(cbBody, url)
    return d


def cbBody(body, ourl):
    global poweredby, server, ip
    # print body
    html_element = lxml.html.fromstring(body)
    generator = html_element.xpath("//meta[@name='generator']/@content")
    ip = socket.gethostbyname(ourl)
    try:
        match = geolite2.lookup(ip)
        if match is not None:
            country = match.country
            try:
                c = pycountry.countries.lookup(country)
                country = c.name
            except:
                country = ""
    except:
        country = ""
    try:
        res = get_tld("http://www" + ourl, as_object=True)
        tld = res.suffix
    except:
        tld = ""
    try:
        match = re.search(r'[\w\.-]+@[\w\.-]+', body)
        email = match.group(0)
    except:
        email = ""
    permalink = ourl.rstrip().replace(".", "-")
    try:
        item = generator[0]
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    except:
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    print val


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Scanner v0.99')
    parser.add_argument(
        '-i', '--input', help='Input list of domains', required=True)
    args = parser.parse_args()
    input = args.input
    with open(input) as f:
        urls = f.read().splitlines()

    def mainjob(reactor, urls=urls):
        for url in tqdm(urls):
            agent = Agent(reactor)
            d = agent.request(
                'GET', "http://" + url,
                Headers({'User-Agent': ['bot']}),
                None)
            d.addCallback(cbRequest, url)
            d.addErrback(lambda x: None)  # ignore errors
        return d

    react(mainjob, argv[3:])
Update 1:
Now I execute it like this:
file.txt - 12,000,000 lines
chunk01.txt - file with 1000 lines
chunk02.txt - file with 1000 lines
...
I execute the script once for each chunk file:
python scanner.py chunk01.txt
python scanner.py chunk02.txt
...
I want to execute the script just once:
python scanner.py file.txt
The problem is that I need to pass the URLs as arguments to react(). If I read the whole 12,000,000-line file into memory (via f.read()), it is too big. Hence I split the file and execute the script on each small file.
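To illustrate the constraint, a minimal sketch (not my actual code) of reading everything up front versus iterating lazily:

# current approach: all 12,000,000 lines are held in RAM at once
with open("file.txt") as f:
    urls = f.read().splitlines()

# what I would like instead: read lines only as they are needed,
# e.g. through a generator, so one big file can be passed in
urls = (line.strip() for line in open("file.txt"))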
Hope it is now clearer...
Update 2:
Based on @Jean-Paul Calderone's answer, I put together the code below.
It seems to work, however I am puzzled by the following: after 180,000 iterations - which I would assume means 180,000 domains (one per line of the input file) - the script has only printed ca. 35,707 lines (entries). I would expect something close to 180,000; I know some domains will time out. When I ran it the "old" way, it was more consistent: the number of input domains was close to the number of lines in the output file.
Could something be "bad" with the code? Any ideas?
python scanner.py > out.txt
181668it [1:47:36, 4.82it/s]
and counting the lines:
wc -l out.txt
36840 out.txt
scanner.py
import argparse
import re  # used for the e-mail regex in cbBody
from tqdm import tqdm
from sys import argv
from pprint import pformat
from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers
from twisted.internet.task import cooperate
from twisted.internet.defer import gatherResults
import lxml.html
from geoip import geolite2
import pycountry
from tld import get_tld
import json
import socket

poweredby = ""
server = ""
ip = ""


def cbRequest(response, url):
    global poweredby, server, ip
    # print 'Response version:', response.version
    # print 'Response code:', response.code
    # print 'Response phrase:', response.phrase
    # print 'Response headers:'
    # print pformat(list(response.headers.getAllRawHeaders()))
    poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
    server = response.headers.getRawHeaders("Server")[0]
    # print poweredby
    # print server
    d = readBody(response)
    d.addCallback(cbBody, url)
    return d


def cbBody(body, ourl):
    global poweredby, server, ip
    # print body
    html_element = lxml.html.fromstring(body)
    generator = html_element.xpath("//meta[@name='generator']/@content")
    ip = socket.gethostbyname(ourl)
    try:
        match = geolite2.lookup(ip)
        if match is not None:
            country = match.country
            try:
                c = pycountry.countries.lookup(country)
                country = c.name
            except:
                country = ""
    except:
        country = ""
    try:
        res = get_tld("http://www" + ourl, as_object=True)
        tld = res.suffix
    except:
        tld = ""
    try:
        match = re.search(r'[\w\.-]+@[\w\.-]+', body)
        email = match.group(0)
    except:
        email = ""
    permalink = ourl.rstrip().replace(".", "-")
    try:
        item = generator[0]
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    except:
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    print val


def main(reactor, url_path):
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls))


def mainjob(reactor, urls=argv[2:]):
    # for url in urls:
    #     print url
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))


def process(agent, url):
    d = agent.request(
        'GET', "http://" + url,
        Headers({'User-Agent': ['bot']}),
        None)
    d.addCallback(cbRequest, url)
    d.addErrback(lambda x: None)  # ignore errors
    return d


react(main, ["./domains.txt"])
Update 3:
I updated the code to write errors to errors.txt:
import argparse
import re  # used for the e-mail regex in cbBody
from tqdm import tqdm
from sys import argv
from pprint import pformat
from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers
from twisted.internet.task import cooperate
from twisted.internet.defer import gatherResults
import lxml.html
from geoip import geolite2
import pycountry
from tld import get_tld
import json
import socket

poweredby = ""
server = ""
ip = ""

f = open("errors.txt", "w")


def error(response, url):
    # errback: record the domain that failed
    f.write("Error: " + url + "\n")


def cbRequest(response, url):
    global poweredby, server, ip
    # print 'Response version:', response.version
    # print 'Response code:', response.code
    # print 'Response phrase:', response.phrase
    # print 'Response headers:'
    # print pformat(list(response.headers.getAllRawHeaders()))
    poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
    server = response.headers.getRawHeaders("Server")[0]
    # print poweredby
    # print server
    d = readBody(response)
    d.addCallback(cbBody, url)
    return d


def cbBody(body, ourl):
    global poweredby, server, ip
    # print body
    html_element = lxml.html.fromstring(body)
    generator = html_element.xpath("//meta[@name='generator']/@content")
    ip = socket.gethostbyname(ourl)
    try:
        match = geolite2.lookup(ip)
        if match is not None:
            country = match.country
            try:
                c = pycountry.countries.lookup(country)
                country = c.name
            except:
                country = ""
    except:
        country = ""
    try:
        res = get_tld("http://www" + ourl, as_object=True)
        tld = res.suffix
    except:
        tld = ""
    try:
        match = re.search(r'[\w\.-]+@[\w\.-]+', body)
        email = match.group(0)
    except:
        email = ""
    permalink = ourl.rstrip().replace(".", "-")
    try:
        item = generator[0]
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    except:
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    print val


def main(reactor, url_path):
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls))


def mainjob(reactor, urls=argv[2:]):
    # for url in urls:
    #     print url
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))


def process(agent, url):
    d = agent.request(
        'GET', "http://" + url,
        Headers({'User-Agent': ['crawler']}),
        None)
    d.addCallback(cbRequest, url)
    d.addErrback(error, url)
    return d


react(main, ["./domains.txt"])
f.close()
Update 4:
I captured the traffic with Wireshark, using just 2 domains that had errored previously:
user@laptop:~/crawler$ python scanner.py
2it [00:00, 840.71it/s]
user@laptop:~/crawler$ cat errors.txt
Error: google.al
Error: fau.edu.al
As you can see they had errors, but in the Wireshark capture I can see the responses coming back.
Answer (from @Jean-Paul Calderone):
You need to add a limit to the amount of concurrency your program creates. Currently, you process all URLs given at the same time - or try to, at least:
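For reference, this is the mainjob loop from the first scanner.py above:

def mainjob(reactor, urls=urls):
    for url in tqdm(urls):
        agent = Agent(reactor)
        d = agent.request(
            'GET', "http://" + url,
            Headers({'User-Agent': ['bot']}),
            None)
        d.addCallback(cbRequest, url)
        d.addErrback(lambda x: None)  # ignore errors
    return d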
This issues a request for each URL without waiting for any of them to complete. Instead, use twisted.internet.task.cooperate to run a limited number at a time. This runs one request at a time:
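A minimal sketch of that single-task version, assuming the same process() helper defined in the Update 2 code:

def mainjob(reactor, urls):
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    task = cooperate(work)  # one CooperativeTask pulls a Deferred from work and waits for it
    return task.whenDone()

You probably want more than that. So, call cooperate() a few more times - this is the pattern that appears in the Update 2 mainjob above:

def mainjob(reactor, urls=argv[2:]):
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))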
This runs up to 100 requests at a time. Each task pulls the next element from work and waits on it. gatherResults waits for all 100 tasks to finish.
Now just avoid loading the complete input into memory at a time:
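This is the main() wrapper from the Update 2 code above, which hands mainjob a generator of stripped lines instead of a list:

def main(reactor, url_path):
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls))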
This opens the url file but only reads lines from it as they're needed.