刮与无限滚动网站(scrape websites with infinite scrolling)

我已经写了很多，但铲运机我真的不知道如何处理无限滚动条。这些天，大多数网站等，Facebook和Pinterest的有着无穷的滚动条。

Answer 1:

您可以使用硒报废无限滚动网站Twitter或Facebook等。

步骤1：使用PIP安装硒

pip install selenium

步骤2：使用下面的代码来自动无限滚动并提取的源代码

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import sys

import unittest, time, re

class Sel(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(30)
        self.base_url = "https://twitter.com"
        self.verificationErrors = []
        self.accept_next_alert = True
    def test_sel(self):
        driver = self.driver
        delay = 3
        driver.get(self.base_url + "/search?q=stckoverflow&src=typd")
        driver.find_element_by_link_text("All").click()
        for i in range(1,100):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(4)
        html_source = driver.page_source
        data = html_source.encode('utf-8')


if __name__ == "__main__":
    unittest.main()

步骤3：如果需要打印的数据。