Scrapy - Scraping different web pages in one scrap

2019-05-07 02:27发布

I'm creating a web app that scrapes a long list of shoes from different websites. Here are my two individual scrapy scripts:

http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3

from scrapy import Spider
from scrapy.http import Request
class ShoesSpider(Spider):
    """Scrape men's clearance soccer shoes from the Nike store.

    Starts at the listing page in ``start_urls``, follows every product
    link found there and yields one dict per product page.
    """

    name = "shoes"
    allowed_domains = ["store.nike.com"]
    start_urls = ['http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3']

    def parse(self, response):
        """Yield a ``Request`` for each product link on the listing page."""
        hrefs = response.xpath('//*[@class="grid-item-image-wrapper sprite-sheet sprite-index-0"]/a/@href').extract()
        for href in hrefs:
            # urljoin() leaves absolute URLs untouched and resolves relative
            # ones against the page URL; Request() rejects scheme-less URLs.
            yield Request(response.urljoin(href), callback=self.parse_shoes)

    def parse_shoes(self, response):
        """Yield a dict describing the shoe on a product page.

        Keys: ``url``, ``name``, ``price`` (with any ``$`` removed),
        ``sizes`` (stripped option texts, excluding options marked as not
        in stock) and ``shoe_type``. ``name``, ``price`` and ``shoe_type``
        are ``None`` when the corresponding element is absent.
        """
        price = response.xpath('//*[@itemprop="price"]/text()').extract_first()
        if price is not None:
            # extract_first() returns None when nothing matches; only
            # clean the value when a price was actually found.
            price = price.replace('$', '')

        size_options = response.xpath('//*[@class="nsg-form--drop-down exp-pdp-size-dropdown exp-pdp-dropdown two-column-dropdown"]/option')
        in_stock = size_options.xpath('text()[not(parent::option/@class="exp-pdp-size-not-in-stock selectBox-disabled")]').extract()

        yield {
            'url': response.url,
            'name': response.xpath('//*[@itemprop="name"]/text()').extract_first(),
            'price': price,
            'sizes': [size.strip() for size in in_stock],
            'shoe_type': response.css('.exp-product-subtitle::text').extract_first(),
        }

http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp

    from scrapy import Spider
    from scrapy.http import Request
    class ShoesSpider(Spider):
        """Scrape clearance soccer cleats from Dick's Sporting Goods.

        Starts at the listing page in ``start_urls``, follows every product
        link found there and yields one dict per product page.
        """

        name = "shoes"
        allowed_domains = ["dickssportinggoods.com"]
        start_urls = ['http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp']

        def parse(self, response):
            """Yield a ``Request`` for each product link on the listing page."""
            hrefs = response.xpath('//*[@class="fplpTitle header4"]/a/@href').extract()
            for href in hrefs:
                # urljoin() leaves absolute URLs untouched and resolves
                # relative ones; Request() rejects scheme-less URLs.
                yield Request(response.urljoin(href), callback=self.parse_shoes)

        def parse_shoes(self, response):
            """Yield a dict describing the shoe on a product page.

            Keys: ``url``, ``name``, ``price``, ``sizes`` (swatch input
            values, possibly empty) and ``shoe_type`` (always ``''``).
            ``name`` and ``price`` are ``None`` when no element matches.
            """
            # NOTE(review): the PageHeading id below looks page-specific;
            # confirm it is stable across product pages.
            yield {
                'url': response.url,
                'name': response.xpath('.//*[@id="PageHeading_3074457345618261107"]/h1/text()').extract_first(),
                'price': response.xpath('.//*[@itemprop="price"]/text()').extract_first(),
                'sizes': response.xpath('//*[@class="swatches clearfix"]/input/@value').extract(),
                'shoe_type': '',
            }

How can I manage to put both of them together? I already went through the scrapy documentation and I haven't seen them mentioning this, it just mentions how to scrape two addresses from a root address. Thanks

标签: python scrapy
2条回答
Explosion°爆炸
2楼-- · 2019-05-07 02:56

Put your both domains in allowed_domains and put your both URLs in start_urls and then use simple if-else to determine what part of code to execute.

from scrapy import Spider
from scrapy.http import Request
class ShoesSpider(Spider):
    """Scrape clearance soccer shoes from Nike and Dick's Sporting Goods.

    Both listing pages are crawled by the same spider; the site-specific
    selectors are chosen by looking for the site's domain in the response
    URL. One dict with the keys ``url``, ``name``, ``price``, ``sizes``
    and ``shoe_type`` is yielded per product page.
    """

    name = "shoes"
    allowed_domains = ["store.nike.com", "dickssportinggoods.com"]
    start_urls = [
        'http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3',
        'http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp',
    ]

    def parse(self, response):
        """Yield a ``Request`` for each product link on a listing page."""
        if "store.nike.com" in response.url:
            link_xpath = '//*[@class="grid-item-image-wrapper sprite-sheet sprite-index-0"]/a/@href'
        elif "dickssportinggoods.com" in response.url:
            link_xpath = '//*[@class="fplpTitle header4"]/a/@href'
        else:
            # Previously this path hit an unbound ``shoes`` variable; an
            # unrecognised URL now just produces no requests.
            self.logger.warning("No listing selectors for %s", response.url)
            return

        for href in response.xpath(link_xpath).extract():
            # urljoin() leaves absolute URLs untouched and resolves relative
            # ones against the page URL; Request() rejects scheme-less URLs.
            yield Request(response.urljoin(href), callback=self.parse_shoes)

    def parse_shoes(self, response):
        """Yield the item dict for a product page of either site.

        Nothing is yielded when the response URL belongs to neither site.
        """
        if "store.nike.com" in response.url:
            yield self._parse_nike_shoe(response)
        elif "dickssportinggoods.com" in response.url:
            yield self._parse_dicks_shoe(response)
        else:
            self.logger.warning("No product selectors for %s", response.url)

    def _parse_nike_shoe(self, response):
        """Return the item dict for a Nike product page."""
        price = response.xpath('//*[@itemprop="price"]/text()').extract_first()
        if price is not None:
            # extract_first() returns None when nothing matches; only
            # clean the value when a price was actually found.
            price = price.replace('$', '')

        size_options = response.xpath('//*[@class="nsg-form--drop-down exp-pdp-size-dropdown exp-pdp-dropdown two-column-dropdown"]/option')
        in_stock = size_options.xpath('text()[not(parent::option/@class="exp-pdp-size-not-in-stock selectBox-disabled")]').extract()

        return {
            'url': response.url,
            'name': response.xpath('//*[@itemprop="name"]/text()').extract_first(),
            'price': price,
            'sizes': [size.strip() for size in in_stock],
            'shoe_type': response.css('.exp-product-subtitle::text').extract_first(),
        }

    def _parse_dicks_shoe(self, response):
        """Return the item dict for a Dick's Sporting Goods product page."""
        # NOTE(review): the PageHeading id below looks page-specific;
        # confirm it is stable across product pages.
        return {
            'url': response.url,
            'name': response.xpath('.//*[@id="PageHeading_3074457345618261107"]/h1/text()').extract_first(),
            'price': response.xpath('.//*[@itemprop="price"]/text()').extract_first(),
            'sizes': response.xpath('//*[@class="swatches clearfix"]/input/@value').extract(),
            'shoe_type': '',
        }
查看更多
老娘就宠你
3楼-- · 2019-05-07 03:00

You don't have to specify allowed_domains variable. You can ignore allowed_domains variable so you don't have domain limitation.

查看更多
登录 后发表回答