scrapy: “load more result” pages

2019-08-20 16:49发布

I was trying to write the following Scrapy script to scrape items from the web site below. I was able to scrape the first page of items, but there are about 2,000 more pages that I want to scrape as well. There is a "load more results" option; I also tried to scrape those "load more results" pages, but was unable to do so. Please help me.

from scrapy.shell import open_in_browser
import scrapy
from scrapy import Selector
import math
import json

class MyItems(scrapy.Item):
    """Field container for one scraped search result.

    NOTE(review): nothing in this file instantiates MyItems -- the spiders
    below print values or yield plain dicts instead -- confirm whether this
    item class is still needed.
    """
    # date: publication date of the article (presumably a display string;
    # the spider never fills it in, so the format is unconfirmed).
    date = scrapy.Field()
    # title: article headline.
    title = scrapy.Field()
    # link: article URL.
    link  = scrapy.Field()

class ProductSpider(scrapy.Spider):
    """Crawl Reuters search results for 'National Health Investors, Inc.'.

    The search page shows 10 results at a time and loads the rest through a
    "load more results" JSON endpoint; after scraping the first (HTML) page
    this spider schedules one request per remaining endpoint page.
    """

    name = 'reuters'
    allowed_domains = ['reuters.com']
    start_urls = ['https://www.reuters.com/search/news?blob=National+Health+Investors%2c+Inc.']
    download_delay = 1.5  # throttle requests to be polite to the server

    # Site-reported total number of hits and page size; together they
    # determine how many "load more" pages must be requested.
    job_count = 1970
    job_per_page = 10

    # Headers copied from the browser's XHR for the "load more" endpoint.
    # NOTE(review): the 'cookie' value is a captured session cookie and is
    # almost certainly stale -- confirm whether the endpoint works without it.
    load_more_headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,bn;q=0.8,af;q=0.7',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
        'accept': '*/*',
        'referer': 'https://www.reuters.com/search/news?blob=National+Health+Investors%2c+Inc.',
        'authority': 'www.reuters.com',
        'cookie': '_ga=GA1.2.592162541.1518081459; _gid=GA1.2.1478931362.1518081459; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22e58b8e9e-8674-49b4-aaff-0248b6976654%22; _cb_ls=1; OX_plg=pm; __gads=ID=3c74f81d13d6c1b4:T=1518081460:S=ALNI_MZsx67ijryijAj2JcD2YXXZw20zIA; _cb=sjG2aCNHffBaLnBl; AAMC_reuters_0=REGION%7C3; aam_uuid=06971314173867630360429126725673522696; _cb_svref=null; D_DUID=334503eb-dac8-49cd-babd-02081b0b6d24; D_TOKEN=1.0:a25bacf1dbb943e3ba1e93edb2093843:9841e8a348072081c4b770cfdd017d59831a31e6d41f368c89065cd08eec79bb34c9020669a0d8cbd7a670e4e11de2e762b5f67038115c02ba5fcbd9da8de4078116daf500471d1d6440734c181cb49859090467365cbf9d646c0d3fc7e7bb7e4e2643ea7a20bf00f9a695f9bf30b0df402746b31e429526a87ed7aa3c9da9bb:4b5290392fda7a6ff1f0f529cfad0d027a406ae35b6edb8e7cd3f6493ca8b99d; OX_sd=2; mnet_session_depth=2%7C1518104359854; _chartbeat2=.1518081466539.1518104385876.1.k_ivd8UuDjDegChcDsjhRBbcy9U',
    }

    def parse(self, response):
        """Yield one request per article link and -- from the start page
        only -- one request per "load more results" page.

        Fixes to the original: the loop body was not indented under its
        ``for``; the pagination code sat at class level, where ``yield`` is
        a syntax error; ``range(2, pages)`` skipped the final page; and a
        ``data`` dict was built but never passed to the request (every
        parameter is already in the URL query string, so a plain GET
        ``scrapy.Request`` suffices).
        """
        for url in response.css('h3.search-result-title a ::attr(href)').extract():
            yield scrapy.Request(response.urljoin(url), callback=self.parse_article)

        # Schedule the remaining pages only once, from the HTML start page,
        # so each subsequent response does not re-queue the whole pagination.
        if response.url in self.start_urls:
            pages = math.ceil(self.job_count / self.job_per_page)
            for page in range(2, pages + 1):  # inclusive of the final page
                url = ('https://www.reuters.com/assets/searchArticleLoadMoreJson'
                       '?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big'
                       '&articleWithBlog=true&sortBy=&dateRange='
                       '&numResultsToShow=10&pn={}&callback=addMoreNewsResults').format(page)
                # NOTE(review): the endpoint returns JSONP, not HTML, so
                # css() selectors in parse() will not match on these
                # responses -- a JSON-parsing callback is needed (see the
                # accepted answer below).
                yield scrapy.Request(url, headers=self.load_more_headers,
                                     callback=self.parse)

    def parse_article(self, response):
        """Print headline, URL and date for one article page.

        The original defined this function at module level, so the
        ``self.parse_article`` callback could never resolve; it belongs
        inside the spider class.
        """
        print('\n')
        print('***Heading:***', response.css('h1.ArticleHeader_headline_2zdFM ::text').extract_first())
        print('***Url-Link:***', response.url)
        print('***Date :***', response.css('div.ArticleHeader_date_V9eGk ::text').extract())
        print('\n')

def parse_article(self, response):
    """Print the headline, URL, and date extracted from one article page.

    NOTE(review): this looks intended to be a ProductSpider method (it takes
    ``self`` and is registered as ``callback=self.parse_article``), but it is
    defined at module level -- confirm the indentation in the original code.
    """
    headline = response.css('h1.ArticleHeader_headline_2zdFM ::text').extract_first()
    dates = response.css('div.ArticleHeader_date_V9eGk ::text').extract()
    print('\n')
    print('***Heading:***', headline)
    print('***Url-Link:***', response.url)
    print('***Date :***', dates)
    print('\n')

1 answer
我命由我不由天 (answerer)
Reply #2 — posted 2019-08-20 17:05

Each click on "LOAD MORE RESULTS" returns Javascript response with JSON object inside:

if (typeof addMoreNewsResults == 'function') { 
addMoreNewsResults( {
    blob: 'National+Health+Investors%2C+Inc.',
    sortBy: 'relevance',
    dateRange: 'all',
    totalResultNumber: 1970,
    totalResultNumberStr: "1,970",
    news: [ 
        {
        id: "-pbm-push-idUSKBN1DG2CP",
        headline: "Diplomat Pharmacy plunges as <b>investors<\/b> fret over rapid PBM push",
        date: "November 16, 2017 11:22am EST",
        href: "/article/us-diplomat-stocks/diplomat-pharmacy-plunges-as-investors-fret-over-rapid-pbm-push-idUSKBN1DG2CP",
        blurb: "...(Reuters) - Shares of Diplomat Pharmacy <b>Inc<\/b> &lt;DPLO.N&gt; tumbled 20... <b>National<\/b> Pharmaceutical Services.\nSome analysts were not excited...",
        mainPicUrl: ""
        }, 
        {....

So you need to use a different parsing mechanism to get the information you want (``import json``, ``json.loads()``, etc.).

There is a much easier way. You can get everything in one request (just change the `numResultsToShow` parameter to retrieve everything): https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=2000&pn=1&callback=addMoreNewsResults

UPDATE

# -*- coding: utf-8 -*-

import scrapy
import re
import json

class ReutersSpider(scrapy.Spider):
    """Fetch all Reuters search results in a single JSONP request.

    The ``searchArticleLoadMoreJson`` endpoint wraps a JavaScript object
    literal in an ``addMoreNewsResults(...)`` call; ``parse`` unwraps it,
    normalizes it into valid JSON, and yields one ``{href, date}`` dict per
    news entry.
    """

    name = "reuters"
    start_urls = [
        'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=2000&pn=1&callback=addMoreNewsResults',
    ]

    def parse(self, response):
        """Yield an item per news entry from the JSONP search response.

        Bug fix: the original matched a ``str`` pattern against
        ``response.body``, which is ``bytes`` in Python 3 and raises
        ``TypeError: cannot use a string pattern on a bytes-like object``;
        ``response.text`` is the decoded string form.
        """
        # Pull the object literal out of the addMoreNewsResults(...) wrapper.
        json_string = re.search(r'addMoreNewsResults\((.+?) \);',
                                response.text, re.DOTALL).group(1)

        # Transform the JavaScript-ish object literal into valid JSON:
        # quote bare keys, quote bare values before trailing commas, and
        # convert single-quoted string values to double-quoted ones.
        json_string = re.sub(r'^\s*(\w+):', r'"\1":', json_string, flags=re.MULTILINE)
        json_string = re.sub(r'(\w+),\s*$', r'"\1",', json_string, flags=re.MULTILINE)
        json_string = re.sub(r':\s*\'(.+?)\',\s*$', r': "\1",', json_string, flags=re.MULTILINE)

        results = json.loads(json_string)

        for result in results["news"]:
            yield {
                "href": result["href"],
                "date": result["date"],
            }
View more
Log in to post an answer