How to get the web page using requests.post?

2019-08-11 09:06发布

I want to get the result of the web page http://www3.hkexnews.hk/listedco/listconews/advancedsearch/search_active_main.aspx with the input of stock code being 5.

The problem is that I don't know which URL is requested after pressing Search, because the page runs JavaScript.

Furthermore, how do I find the parameters that need to be passed to requests.post, e.g. data? Are headers needed?

enter image description here

1条回答
劳资没心,怎么记你
2楼-- · 2019-08-11 09:56

You have multiple options:

1) You can use Selenium. First install Selenium.

sudo pip3 install selenium

Then get a driver https://sites.google.com/a/chromium.org/chromedriver/downloads (Depending upon your OS you may need to specify the location of your driver)

from selenium import webdriver
from bs4 import BeautifulSoup
import time

# Drive a real Chrome instance: open the search page, type the stock code
# into the form, submit it, and print every element with class "news".
url = "http://www3.hkexnews.hk/listedco/listconews/advancedsearch/search_active_main.aspx"
browser = webdriver.Chrome()
try:
    browser.get(url)
    # 'id' is the locator strategy string behind By.ID. find_element() is
    # available in both Selenium 3 and 4, whereas find_element_by_id() was
    # removed in Selenium 4.3.
    element = browser.find_element('id', 'ctl00_txt_stock_code')  # find the text box
    time.sleep(2)
    element.send_keys('5')  # populate the text box
    time.sleep(2)
    element.submit()  # submit the form
    soup = BeautifulSoup(browser.page_source, 'html.parser')
finally:
    # Always close the browser, even if a step above raised, so no
    # Chrome/chromedriver process is left running.
    browser.quit()
for news in soup.find_all(class_='news'):
    print(news.text)

2) Or use PyQt with QWebEngineView.

Install PyQt on Ubuntu:

    sudo apt-get install python3-pyqt5
    sudo apt-get install python3-pyqt5.qtwebengine

or on other OS (64 bit versions of Python)

    pip3 install PyQt5

Basically, you load the first page, which contains the form, fill in the form by running JavaScript, and then submit it. The loadFinished signal is emitted twice (the second time because you submitted the form), so you can use an if statement to differentiate between the two calls.

import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup


class Render(QWebEngineView):
    """Load a page, submit its search form via JavaScript, and keep the result HTML.

    Constructing an instance starts the Qt event loop and only returns once
    the loop has been quit. After that, ``html`` holds the HTML of the page
    shown after the form submission.
    """

    def __init__(self, url):
        """Create the application, start loading *url*, and run the event loop."""
        # Qt requires the application object to exist before any widget.
        self.app = QApplication(sys.argv)
        super().__init__()
        self.html = None        # filled in by callable()
        self.first_pass = True  # True until the first loadFinished was handled
        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _load_finished(self, result):
        """Dispatch loadFinished: the first call fills the form, later calls grab the HTML."""
        if not self.first_pass:
            self._second_finished()
            return
        self._first_finished()
        self.first_pass = False

    def _first_finished(self):
        """Fill in stock code and start year, then submit the first form on the page."""
        page = self.page()
        for script in (
            "document.getElementById('ctl00_txt_stock_code').value = '5';",
            "document.getElementById('ctl00_sel_DateOfReleaseFrom_y').value='1999';",
            "preprocessMainForm();",
            "document.forms[0].submit();",
        ):
            page.runJavaScript(script)

    def _second_finished(self):
        """Request the current page's HTML; it is delivered to callable()."""
        self.page().toHtml(self.callable)

    def callable(self, data):
        """Store the received HTML and quit the event loop so __init__ can return."""
        self.html = data
        self.app.quit()

# Render the search results and print the text of every element with class "news".
url = "http://www3.hkexnews.hk/listedco/listconews/advancedsearch/search_active_main.aspx"
web = Render(url=url)
soup = BeautifulSoup(markup=web.html, features='html.parser')
for item in soup.find_all(class_='news'):
    print(item.text)

Outputs:

Voting Rights and Capital
Next Day Disclosure Return
NOTICE OF REDEMPTION AND CANCELLATION OF LISTING
THIRD INTERIM DIVIDEND FOR 2018
Notification of Transactions by Persons Discharging Managerial Responsibilities
Next Day Disclosure Return
THIRD INTERIM DIVIDEND FOR 2018
Monthly Return of Equity Issuer on Movements in Securities for the month ended 31 October 2018
Voting Rights and Capital
PUBLICATION OF BASE PROSPECTUS SUPPLEMENT
3Q 2018 EARNINGS RELEASE AUDIO WEBCAST AND CONFERENCE CALL
3Q EARNINGS RELEASE - HIGHLIGHTS
Scrip Dividend Circular
2018 Third Interim Dividend; Scrip Dividend
THIRD INTERIM DIVIDEND FOR 2018 SCRIP DIVIDEND ALTERNATIVE
NOTIFICATION OF MAJOR HOLDINGS
EARNINGS RELEASE FOR THIRD QUARTER 2018
NOTIFICATION OF MAJOR HOLDINGS
Monthly Return of Equity Issuer on Movements in Securities for the month ended 30 September 2018
THIRD INTERIM DIVIDEND FOR 2018; DIVIDEND ON PREFERENCE SHARES

Alternatively you can use Scrapy splash https://github.com/scrapy-plugins/scrapy-splash

Or Requests-HTML https://html.python-requests.org/ .

But I am not sure how you would fill in the form using these last two approaches.

Update: how to read the next pages:

import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup


class Render(QWebEngineView):
    """Submit the search form, then print the headlines of several result pages.

    Constructing an instance starts the Qt event loop and only returns once
    the loop has been quit. Each result page is parsed as it finishes loading
    and the text of every element with class ``news`` is printed.
    """

    def __init__(self, url, max_pages=6):
        """Create the application, start loading *url*, and run the event loop.

        Args:
            url: Address of the search form page.
            max_pages: Number of result pages to print before quitting. The
                default of 6 matches the previously hard-coded limit. At
                least one page is always printed.
        """
        self.html = None
        self.count = 0  # number of result pages handed to parse() so far
        self.max_pages = max_pages
        self.first_pass = True
        # Qt requires the application object to exist before any widget.
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _load_finished(self, result):
        """Dispatch loadFinished: the first call fills the form, later calls read results."""
        if self.first_pass:
            self._first_finished()
            self.first_pass = False
        else:
            self._second_finished()

    def _first_finished(self):
        """Fill in stock code and start year, then submit the first form on the page."""
        self.page().runJavaScript("document.getElementById('ctl00_txt_stock_code').value = '5';")
        self.page().runJavaScript("document.getElementById('ctl00_sel_DateOfReleaseFrom_y').value='1999';")
        self.page().runJavaScript("preprocessMainForm();")
        self.page().runJavaScript("document.forms[0].submit();")

    def _second_finished(self):
        """Parse the current result page, then move to the next page or quit."""
        try:
            self.page().toHtml(self.parse)
            self.count += 1
            if self.count >= self.max_pages:
                # Page limit reached: the callback below stops the event loop.
                self.page().toHtml(self.callable)
            else:
                # Clicking "next" triggers another loadFinished, which brings
                # us back here for the following page.
                # NOTE(review): if the button is missing (e.g. on the last
                # page) no new load seems to start and the loop may never
                # quit - confirm and handle if needed.
                self.page().runJavaScript("document.getElementById('ctl00_btnNext2').click();")
        except Exception:
            # Best effort: on any error, still stop the event loop instead of
            # hanging (but do not swallow KeyboardInterrupt/SystemExit).
            self.page().toHtml(self.callable)

    def parse(self, data):
        """Print the text of every element with class ``news`` found in *data*."""
        soup = BeautifulSoup(data, 'html.parser')
        for news in soup.find_all(class_='news'):
            print(news.text)

    def callable(self, data):
        """Quit the event loop so __init__ can return; *data* is ignored."""
        self.app.quit()

# Run the renderer against the advanced-search page.
url = "http://www3.hkexnews.hk/listedco/listconews/advancedsearch/search_active_main.aspx"
web = Render(url=url)
查看更多
登录 后发表回答