How to scrape several websites with pyqt4, scope c

2019-09-16 01:54发布

问题:

I would like to scrape two websites in java for links using PyQt4.QtWebKit to render the pages and then get the desired links. The code works fine with one page or url, but stops (but continues running until force quit) after printing the links of the first website. It seems the scope stays in the event loop of the render class. How can I get the program to change scope and continue with the for loop and rendering the second website? Using exit() in _loadFinished method just quits the program after the first iteration. Maybe the python app has to close and reopen to render the next page, which is impossible because the app is opened/reopened outside of the program?

import sys  
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *
from PyQt4 import QtGui
from lxml import html 

class Render(QWebPage):
    def __init__(self, url):

        self.frame = None
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        result = self.frame.toHtml()
        formatted_result = str(result)
        tree = html.fromstring(formatted_result)
        archive_links = tree.xpath('//div/div/a/@href')[0:4]
        print(archive_links)


urls = ['http://pycoders.com/archive/', 'http://www.pythonjobshq.com']

def main(urls):

    app = QtGui.QApplication(sys.argv)
    for url in urls:
        r = Render(url)
    #s = Render(urls[1]) #The pages can be rendered parallel, but rendering more than a handful of pages a the same time is a bad idea
    sys.exit(app.exec_())

if __name__ == '__main__':
    main(urls)

Thankful for any help!