How to split output from a list of URLs in Scrapy

Posted 2019-07-28 02:24

I am trying to generate a separate CSV file for each scraped URL from a list of URLs in Scrapy. I understand that I need to modify pipeline.py, but all my attempts have failed so far. I do not understand how to pass the URL being scraped to the pipeline, use it as the name of the output file, and split the output accordingly.

Any help?

Thanks

Here are the spider and the pipeline:

from scrapy import Spider
from scrapy.selector import Selector 
from vApp.items import fItem


class VappSpider(Spider):

    name = "vApp"
    allowed_domains = ["google.co.uk"]
    start_urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]


    def parse(self, response):
        trs = Selector(response).xpath('//*[@id="incdiv"]/table/tbody/tr')
        for tr in trs:
            item = fItem()

            try:
                item['item'] = tr.xpath('td/text()').extract()[0]
            except IndexError:
                item['item'] = 'null'

            yield item

Pipeline:

from scrapy import signals
from scrapy.exporters import CsvItemExporter

class VappPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('results/%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['item']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

Tags: python scrapy

1 Answer
爷的心禁止访问 · 2019-07-28 03:03

I think you should do all of this in batch, as a post-processing step when your crawl finishes, rather than per item, but here is a draft of how you could do what you want (a rough sketch of the post-processing alternative is at the end of this answer):

from scrapy import Spider
from scrapy.selector import Selector 
from vApp.items import fItem


class VappSpider(Spider):

    name = "vApp"
    allowed_domains = ["google.co.uk"]
    start_urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]


    def parse(self, response):
        trs = Selector(response).xpath('//*[@id="incdiv"]/table/tbody/tr')
        for tr in trs:
            item = fItem()

            try:
                item['item'] = tr.xpath('td/text()').extract()[0]
            except IndexError:
                item['item'] = 'null'
            # carry the source URL on the item so the pipeline can pick a file per domain
            item['url'] = response.url
            yield item
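
For this to work, the fItem definition also needs a url field alongside item. The original item class is not shown in the question, so this is only a minimal sketch of vApp/items.py, assuming fItem is a plain scrapy.Item:

import scrapy

class fItem(scrapy.Item):
    # the field actually written to the CSV
    item = scrapy.Field()
    # only used by the pipeline to decide which file the item goes to
    url = scrapy.Field()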


from scrapy import signals
from scrapy.exporters import CsvItemExporter
from urllib.parse import urlparse

class VappPipeline(object):
    def __init__(self):
        self.files = {}
        self.exporter = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def process_item(self, item, spider):
        url = item['url']
        parsed_uri = urlparse(url)
        domain = parsed_uri.netloc
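        # lazily open one CSV file and exporter per domain the first time it appears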
        if domain not in self.exporter:
            file = open('results/%s.csv' % domain, 'w+b')
            self.files[domain] = file
            self.exporter[domain] = CsvItemExporter(file)
            self.exporter[domain].fields_to_export = ['item']
            self.exporter[domain].start_exporting()

        assert domain in self.exporter

        self.exporter[domain].export_item(item)

        return item

    def spider_closed(self, spider):
        for domain, exporter in self.exporter.items():
            exporter.finish_exporting()
            self.files[domain].close()
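
The pipeline only runs if it is enabled in the project settings. A minimal sketch, assuming the class above lives in vApp/pipelines.py (adjust the dotted path to match your project layout):

# settings.py
ITEM_PIPELINES = {
    'vApp.pipelines.VappPipeline': 300,
}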
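
As for the batch alternative mentioned at the top of this answer: here is a rough post-processing sketch, assuming the crawl first writes one combined CSV with url and item columns (for example via scrapy crawl vApp -o results/all_items.csv, a hypothetical filename) and you split it by domain afterwards:

# split_by_domain.py - run after the crawl has finished
import csv
import os
from urllib.parse import urlparse

os.makedirs('results', exist_ok=True)
writers = {}  # domain -> (file handle, csv.DictWriter)

with open('results/all_items.csv', newline='') as src:
    for row in csv.DictReader(src):
        domain = urlparse(row['url']).netloc
        if domain not in writers:
            f = open('results/%s.csv' % domain, 'w', newline='')
            w = csv.DictWriter(f, fieldnames=['item'])
            w.writeheader()
            writers[domain] = (f, w)
        writers[domain][1].writerow({'item': row['item']})

for f, _ in writers.values():
    f.close()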