I'm trying to scrape a large website of government records which requires a "snowball" method, i.e., starting at the main search page and then following each link that the scraper finds to the next page.
I've been able to load the main page using PyQt/PySide, following this SiteScraper tutorial:
import sys
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from BeautifulSoup import BeautifulSoup
class Render(QWebPage):
    """Web page that loads a single URL and blocks until loading has finished.

    Every instance builds its own ``QApplication``; constructing a second
    ``Render`` in the same process therefore fails with
    ``RuntimeError: A QApplication instance already exists``.
    """

    def __init__(self, url):
        """Create the application, start loading *url* and run the event loop."""
        # The application object is created before the page is initialised.
        self.app = QApplication(sys.argv)
        super(Render, self).__init__()
        self.loadFinished.connect(self._loadFinished)
        frame = self.mainFrame()
        frame.load(QUrl(url))
        # Blocks here until _loadFinished() asks the application to quit.
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: remember the main frame, stop the loop."""
        self.frame = self.mainFrame()
        self.app.quit()
def main():
    """Render the search page, then render every regulation page it links to."""
    baseUrl = 'http://www.thesite.gov'
    searchUrl = 'http://www.thesite.gov/search'

    searchPage = Render(searchUrl)
    searchHtml = searchPage.frame.toHtml()

    # Collect the regulation anchors from the results grid with BeautifulSoup.
    grid = BeautifulSoup(searchHtml).find('div', {'class': 'x-grid3-body'})
    anchors = grid.findAll('a')

    # Visit each regulation in turn, loading its page through Qt.
    for anchor in anchors:
        target = str(baseUrl + anchor['href'])
        regPage = Render(target)
        html = regPage.frame.toHtml()  # the actual rendered web page
The problem is I get this error when I try to render a new webpage:
RuntimeError: A QApplication instance already exists.
I understand that each call to Render tries to create another QApplication instance. But how do I navigate to a new page with the same instance?
class Render(QWebPage):
    """Web page that loads one URL using a caller-supplied ``QApplication``.

    The application object is shared, so several ``Render`` instances can be
    created one after another in the same process.
    """

    def __init__(self, app, url):
        """Start loading *url* and block until the page has finished loading.

        app -- the single QApplication owned by the caller.
        url -- address to load (anything accepted by ``QUrl``).
        """
        QWebPage.__init__(self)
        # Keep a reference so the loadFinished slot can stop the event loop.
        self.app = app
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Runs the event loop; returns once _loadFinished() calls quit().
        app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: remember the main frame, stop the loop."""
        self.frame = self.mainFrame()
        # Without this the exec_() call in __init__ would never return.
        self.app.quit()
def main():
    """Create the one QApplication and render the search page with it."""
    # A single application object is built here and handed to Render.
    qtApp = QApplication(sys.argv)

    baseUrl = 'http://www.thesite.gov'
    searchUrl = 'http://www.thesite.gov/search'

    rendered = Render(qtApp, searchUrl)
    html = rendered.frame.toHtml()
OK then, if you really need JavaScript. (Can you get the answer from JSON at all? That would probably be easier still with `simplejson` or `json`.) The answer is: don't make more than one QApplication. You're not allowed to. Make `main` create a `QApplication`, and then use the `QWebPage` without bothering to call `QApplication.exec_()`. If that doesn't work, run it all in another `QThread`.
I had the same problem (needing to load multiple pages with QWebPage), but I couldn't get any of these answers to work for me. Here's what did work; the key is to use a QEventLoop and connect loadFinished to loop.quit:
from PySide import QtCore, QtGui, QtWebKit
import sys
def loadPage(url):
    """Load *url* in a fresh QWebPage and return the main frame's HTML.

    Blocks in a local QEventLoop until the frame emits ``loadFinished``.
    """
    webPage = QtWebKit.QWebPage()
    frame = webPage.mainFrame()
    waiter = QtCore.QEventLoop()
    # The local loop ends as soon as the frame reports that loading finished.
    frame.loadFinished.connect(waiter.quit)
    frame.load(url)
    waiter.exec_()
    return frame.toHtml()
app = QtGui.QApplication(sys.argv)
urls = ['https://google.com', 'http://reddit.com', 'http://wikipedia.org']
for url in urls:
print '-----------------------------------------------------'
print 'Loading ' + url
html = loadPage(url)
print html
app.exit()
Posting a simplified example here compared to OP's to demonstrate the essential problem and solution.
You're crazy, man! Qt has a much better DOM than BeautifulSoup.
Replace:
soup = BeautifulSoup(html)
With
# Feed the already-fetched HTML into a WebKit page instead of BeautifulSoup.
page = QWebPage()
# Images and plugins are switched off for this page.
for attribute in (QWebSettings.AutoLoadImages, QWebSettings.PluginsEnabled):
    page.settings().setAttribute(attribute, False)
frame = page.mainFrame()
frame.setHtml(html)
# Root element of the parsed document; query it with CSS selectors.
dom = frame.documentElement()
Then you can simply scrape data like so:
# Look up the first matching list item with a CSS selector.
li = dom.findFirst("body div#content div#special ul > li")
if not li.isNull():
    # ``class`` is a reserved word in Python, so it cannot be a variable name.
    cssClass = li.attribute("class")
    text = li.toPlainText()
Finally you should use QWebView instead of QWebPage. You can set it up to act like a server which can be controlled with a socket. This is what I do:
class QTimerWithPause(QTimer):
    """QTimer that can be paused and later resumed for the remaining time.

    ``startTime`` holds the wall-clock time (seconds) of the last start and
    ``interval`` the number of milliseconds still to run.
    """

    def __init__(self, parent = None):
        """Create a stopped timer owned by *parent*."""
        super(QTimerWithPause, self).__init__(parent)
        self.startTime = 0
        # NOTE(review): this instance attribute hides QTimer.interval() on the
        # object; it is kept because existing code may read ``.interval``.
        self.interval = 0

    def start(self, interval):
        """Start the timer for *interval* milliseconds and record when."""
        from time import time
        self.interval = interval
        self.startTime = time()
        super(QTimerWithPause, self).start(interval)

    def pause(self):
        """Stop the timer and remember how many milliseconds were left.

        Does nothing when the timer is not running.
        """
        from time import time
        if not self.isActive():
            return
        self.stop()
        now = time()
        # Elapsed time is "now minus start"; the reverse subtraction yields a
        # negative value and would lengthen the interval instead of shortening it.
        elapsedTime = now - self.startTime
        self.startTime = now
        # time() returns float secs, interval is int msec
        remaining = self.interval - (int(elapsedTime * 1000) + 1)
        # Never hand a negative interval to start() on resume.
        self.interval = max(remaining, 0)

    def resume(self):
        """Restart a paused timer for the time that was left at pause()."""
        if not self.isActive():
            self.start(self.interval)
class CrawlerWebServer(QWebView):
TIMEOUT = 60
STUPID = r"(bing|yahoo|google)"
def __init__(self, host="0.0.0.0", port=50007, parent=None, enableImages=True, enablePlugins=True):
# Constructor
super(CrawlerWebServer, self).__init__(parent)
self.command = None
self.isLoading = True
self.isConnected = False
self.url = QUrl("http://mast3rpee.tk/")
self.timeout = QTimerWithPause(self)
self.socket = QTcpServer(self)
# 1: Settings
self.settings().enablePersistentStorage()
self.settings().setAttribute(QWebSettings.AutoLoadImages, enableImages)
self.settings().setAttribute(QWebSettings.PluginsEnabled, enablePlugins)
self.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
# 2: Server
if args.verbosity > 0: print "Starting server..."
self.socket.setProxy(QNetworkProxy(QNetworkProxy.NoProxy))
self.socket.listen(QHostAddress(host), int(port))
self.connect(self.socket, SIGNAL("newConnection()"), self._connect)
if args.verbosity > 1:
print " Waiting for connection(" + host + ":" + str(port) + ")..."
# 3: Default page
self._load(10*1000, self._loadFinished)
return
def __del__(self):
try:
self.conn.close()
self.socket.close()
except:
pass
return
def _sendAuth(self):
self.conn.write("Welcome to WebCrawler server (http://mast3rpee.tk)\r\n\rLicenced under GPL\r\n\r\n")
def _connect(self):
self.disconnect(self.socket, SIGNAL("newConnection()"), self._connect)
self.conn = self.socket.nextPendingConnection()
self.conn.nextBlockSize = 0
self.connect(self.conn, SIGNAL("readyRead()"), self.io)
self.connect(self.conn, SIGNAL("disconnected()"), self.close)
self.connect(self.conn, SIGNAL("error()"), self.close)
self._sendAuth()
if args.verbosity > 1:
print " Connection by:", self.conn.peerAddress().toString()
self.isConnected = True
if self.isLoading == False:
self.conn.write("\r\nEnter command:")
return
def io(self):
if self.isLoading: return None
if args.verbosity > 0:
print "Reading command..."
data = self.conn.read(1024).strip(" \r\n\t")
if not data: return None
elif self.command is not None:
r = self.command(data)
self.command = None
return r
return self._getCommand(data)
def _getCommand(self, d):
from re import search
d = unicode(d, errors="ignore")
if search(r"(help|HELP)", d) is not None:
self.conn.write("URL | JS | WAIT | QUIT\r\n\r\nEnter Command:")
elif search(r"(url|URL)", d) is not None:
self.command = self._print
self.conn.write("Enter address:")
elif search(r"(js|JS|javascript|JAVASCRIPT)", d) is not None:
self.command = self._js
self.conn.write("Enter javascript to execte:")
elif search(r"(wait|WAIT)", d) is not None:
self.loadFinished.connect(self._loadFinishedPrint)
self.loadFinished.connect(self._loadFinished)
elif search(r"(quit|QUIT|exit|EXIT)", d) is not None:
self.close()
else:
self.conn.write("Invalid command!\r\n\r\nEnter Command:")
return
def _print(self, d):
u = d[:250]
self.out(u)
return True
def _js(self, d):
try:
self.page().mainFrame().evaluateJavaScript(d)
except:
pass
self.conn.write("Enter Javascript:")
return True
def _stop(self):
from time import sleep
if self.isLoading == False: return
if args.verbosity > 0:
print " Stopping..."
self.timeout.stop()
self.stop()
def _load(self, timeout, after):
# Loads a page into frame / sets up timeout
self.timeout.timeout.connect(self._stop)
self.timeout.start(timeout)
self.loadFinished.connect(after)
self.load(self.url)
return
def _loadDone(self, disconnect = None):
from re import search
from time import sleep
self.timeout.timeout.disconnect(self._stop)
self.timeout.stop()
if disconnect is not None:
self.loadFinished.disconnect(disconnect)
# Stick a while on the page
if search(CrawlerWebServer.STUPID, self.url.toString(QUrl.RemovePath)) is not None:
sleep(5)
else:
sleep(1)
return
def _loadError(self):
from time import sleep, time
if not self.timeout.isActive(): return True
if args.verbosity > 0: print " Error retrying..."
# 1: Pause timeout
self.timeout.pause()
# 2: Check for internet connection
while self.page().networkAccessManager().networkAccessible() == QNetworkAccessManager.NotAccessible: sleep(1)
# 3: Wait then try again
sleep(2)
self.reload()
self.timeout.resume()
return False
def go(self, url, after = None):
# Go to a specific address
global args
if after is None:
after = self._loadFinished
if args.verbosity > 0:
print "Loading url..."
self.url = QUrl(url)
self.isLoading = True
if args.verbosity > 1:
print " ", self.url.toString()
self._load(CrawlerWebServer.TIMEOUT * 1000, after)
return
def out(self, url):
# Print html of a a specific url
self.go(url, self._loadFinishedPrint)
return
def createWindow(self, windowType):
# Load links in the same web-view.
return self
def _loadFinished(self, ok):
# Default LoadFinished
from time import sleep
from re import search
if self.isLoading == False: return
if ok == False:
if not self._loadError(): return
self._loadDone(self._loadFinished)
if args.verbosity > 1:
print " Done"
if self.isConnected == True:
self.conn.write("\r\nEnter command:")
self.isLoading = False
return
def _loadFinishedPrint(self, ok):
# Print the evaluated HTML to stdout
if self.isLoading == False: return
if ok == False:
if not self._loadError(): return
self._loadDone(self._loadFinishedPrint)
if args.verbosity > 1:
print " Done"
h = unicode( self.page().mainFrame().toHtml(), errors="ignore" )
if args.verbosity > 2:
print "------------------\n" + h + "\n--------------------"
self.conn.write(h)
self.conn.write("\r\nEnter command:")
self.isLoading = False
return
def contextMenuEvent(self, event):
# Context Menu
menu = self.page().createStandardContextMenu()
menu.addSeparator()
action = menu.addAction('ReLoad')
@action.triggered.connect
def refresh():
self.load(self.url)
menu.exec_(QCursor.pos())
class CrawlerWebClient(object):
    """Blocking TCP client for the CrawlerWebServer text protocol.

    Diagnostic output depends on the module-level ``args.verbosity`` object,
    which is defined outside this class.  Output uses the single-argument
    ``print(...)`` form, which parses under both Python 2 and Python 3.
    """

    def __init__(self, host, port):
        """Connect to ``host:port`` and consume the server's greeting."""
        import socket
        # CONNECT TO SERVER
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.socket.connect((host, port))
        o = self.read()
        if args.verbosity > 2:
            print("\n------------------------------\n" + o + "\n------------------------------\n")

    def __del__(self):
        """Best-effort close of the socket."""
        try:
            self.socket.close()
        except Exception:
            pass

    def read(self):
        """Collect server output up to the next prompt and return it.

        Reading stops when a chunk starts with ``Enter`` (a bare prompt, which
        is discarded), when a chunk ends with ``Enter command:`` (the prompt
        is cut off and the rest kept), or when the server closes the
        connection.
        """
        # NOTE(review): prompts are detected per received chunk, so a prompt
        # split across two recv() calls would not be recognised.
        r = ""
        while True:
            out = self.socket.recv(64*1024)
            if not out:
                # recv() returns an empty string once the peer has closed the
                # connection; without this check the loop would spin forever.
                break
            out = out.strip("\r\n")
            if out.startswith(r"Enter"):
                break
            if out.endswith(r"Enter command:"):
                r += out[:-14]
                break
            r += out
        return r

    def command(self, command):
        """Send *command* to the server and return everything up to the next prompt."""
        if args.verbosity > 2:
            print(" Command: [" + command + "]\n------------------------------")
        self.socket.sendall(unicode(command))
        r = self.read()
        if args.verbosity > 2:
            print(r + " \n------------------------------\n")
        return r
I am not familiar with PyQt, but as an option, you could write your script without using a class. That way, you can more easily re-use that application instance.
Hope it helps.