How to scrape data using pyqt5

2019-06-14 00:22发布

问题:

I would like to get the pop-up data from a website.

As shown in the first figure, I need to click a link.

After that, a pop-up will appear, as shown in the second figure.

The content of this pop-up is what I want.

I tried to follow an example that uses PyQt5 to get the data.

However, the program continues to run indefinitely and never exits.

How to solve this problem?

Thank you very much.

import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup


class Render(QWebEngineView):
    """Load a URL in an off-screen web view and capture the page HTML.

    Constructing an instance starts the Qt event loop and does not return
    until ``callable`` stops it; the captured markup is then in ``self.html``.
    """

    def __init__(self, url: str) -> None:
        # Result slot; stays None until callable() stores the page HTML.
        self.html = None
        # True until the first loadFinished signal has been handled.
        self.first_pass = True
        # The QApplication is created before the widget base class is set up.
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        # Enters the event loop; the only exit path in this class is
        # the self.app.quit() call inside callable().
        self.app.exec_()

    def _load_finished(self, result: bool) -> None:
        """Dispatch the first loadFinished to one handler, later ones to another."""
        if self.first_pass:
            self._first_finished()
            self.first_pass = False
        else:
            # NOTE(review): reached only if loadFinished fires a second time.
            # If the page never loads again, callable() is never invoked and
            # app.exec_() never returns -- presumably the reported hang.
            self._second_finished()

    def _first_finished(self) -> None:
        """Run the JavaScript snippet after the first page load."""
        # The script only looks the element up by id; it does not click it,
        # and no result callback is passed, so the value is discarded.
        self.page().runJavaScript("document.getElementById('auto-header-citypop-citylist');")


    def _second_finished(self) -> None:
        """Request the page HTML; it is delivered asynchronously to callable()."""
        self.page().toHtml(self.callable)

    def callable(self, data: str) -> None:
        """Store the HTML handed over by toHtml() and stop the event loop."""
        self.html = data
        self.app.quit()

# Render the site's home page; Render() blocks until its event loop quits.
url = r'https://www.autohome.com.cn'
web = Render(url)

# Save the captured markup; the utf-8-sig codec prepends a BOM to the file.
with open('data2.html', mode='w', encoding='utf-8-sig') as out_file:
    out_file.write(web.html)

回答1:

There was this JavaScript on the page:

       if (rf === "" || rf.toLocaleLowerCase().indexOf(".autohome.com.cn") === -1) {
            if (screen == undefined || screen.width < 810) {
                if (browser.versions.mobile == true || browser.versions.iPhone == true || browser.versions.ucweb == true || browser.versions.android == true || browser.versions.Symbian == true) {
                    window.location.href = "//m.autohome.com.cn/?from=pc";
                    return
                }
            }
        }  

This script redirected you to https://m.autohome.com.cn/?from=pc, as you can see by printing self.url(). To get around this, I set the Referer header like this:

import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl, QByteArray
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWebEngineCore import QWebEngineHttpRequest
from bs4 import BeautifulSoup


class Render(QWebEngineView):
    """Load a page with a custom Referer, click the city switcher, grab the HTML.

    Constructing an instance runs the Qt event loop until the HTML has been
    captured; afterwards the markup is available as ``self.html``.
    """

    def __init__(self, url):
        """Start loading ``url`` and block until ``callable`` quits the loop.

        Args:
            url: Address of the page to render.
        """
        # Result slot; stays None until callable() stores the page HTML.
        self.html = None
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        # Send a Referer from the site itself so the page's own script does
        # not redirect to the mobile site (it checks the referrer first).
        self.request = QWebEngineHttpRequest(QUrl(url))
        self.request.setHeader(QByteArray().append('Referer'), QByteArray().append('https://www.autohome.com.cn/beijing/'))
        self.load(self.request)
        self.app.exec_()

    def _load_finished(self, result):
        """Click the area switcher, then request the resulting HTML.

        Args:
            result: Success flag delivered by the loadFinished signal (unused).
        """
        # The click opens the pop-up without triggering another page load,
        # so the HTML can be requested right away from this same handler.
        self.page().runJavaScript("document.getElementById('auto-header-switcharea').click();")
        self.page().toHtml(self.callable)

    def callable(self, data):
        """Store the HTML handed over by toHtml() and stop the event loop.

        Args:
            data: The page markup as a string.
        """
        self.html = data
        self.app.quit()

# Render the Beijing landing page; Render() blocks until the HTML is captured.
url = 'https://www.autohome.com.cn/beijing/'
web = Render(url)

# Parse the captured markup and print every city anchor from the pop-up.
soup = BeautifulSoup(web.html, 'html.parser')
city_links = soup.find_all('a', attrs={'name': 'auto-header-citypop-city'})
for city in city_links:
    print(city)

Outputs:

<a data-info="[110100, 646, '北京', 'beijing']" data-key="110100" href="javascript:void(0);" name="auto-header-citypop-city" target="_self">北京</a>
<a data-info="[440100, 62, '广州', 'guangzhou']" data-key="440100" href="javascript:void(0);" name="auto-header-citypop-city" target="_self">广州</a>
<a data-info="[440300, 670, '深圳', 'shenzhen']" data-key="440300" href="javascript:void(0);" name="auto-header-citypop-city" target="_self">深圳</a>
<a data-info="[320100, 335, '南京', 'nanjing']" data-key="320100" href="javascript:void(0);" name="auto-header-citypop-city" target="_self">南京</a>
<a data-info="[310100, 649, '上海', 'shanghai']" data-key="310100" href="javascript:void(0);" name="auto-header-citypop-city" target="_self">上海</a>

....

There is no page load after the click event, so there is no need for the two-pass logic in _load_finished (a first and a second "finished" handler).



标签: python pyqt5