How to split output from a list of URLs in Scrapy

Posted 2019-07-28 02:24

I am trying to generate a separate CSV file for each scraped URL from a list of URLs in Scrapy. I understand that I need to modify pipeline.py, but all my attempts have failed so far. I do not understand how to pass the URL being scraped to the pipeline, use it as the name of the output file, and split the output accordingly.

Any help?

Thanks

Here are the spider and the pipeline:

from scrapy import Spider
from scrapy.selector import Selector 
from vApp.items import fItem


class VappSpider(Spider):

    name = "vApp"
    allowed_domains = ["google.co.uk"]
    start_urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]


    def parse(self, response):
        trs = Selector(response).xpath('//*[@id="incdiv"]/table/tbody/tr')
        for tr in trs:
            item = fItem()

            try:
                item['item'] = tr.xpath('td/text()').extract()[0]
            except IndexError:
                item['item'] = 'null'

            yield item

Pipeline:

from scrapy import signals
from scrapy.exporters import CsvItemExporter

class VappPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('results/%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['item']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

Tags: python scrapy

1 Answer
爷的心禁止访问 · 2019-07-28 03:03

I think you should do all of this in batch, as a post-processing step when your crawl finishes, rather than per item, but here is a draft of how you could do what you want (a rough sketch of the post-processing alternative is at the end of this answer):

from scrapy import Spider
from scrapy.selector import Selector 
from vApp.items import fItem


class VappSpider(Spider):

    name = "vApp"
    allowed_domains = ["google.co.uk"]
    start_urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]


    def parse(self, response):
        trs = Selector(response).xpath('//*[@id="incdiv"]/table/tbody/tr')
        for tr in trs:
            item = fItem()

            try:
                item['item'] = tr.xpath('td/text()').extract()[0]
            except IndexError:
                item['item'] = 'null'
            # carry the source URL on the item so the pipeline can pick a file per domain
            item['url'] = response.url
            yield item
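
For this to work, the fItem definition also needs a url field alongside item. The original item class is not shown in the question, so this is only a minimal sketch of vApp/items.py, assuming fItem is a plain scrapy.Item:

import scrapy

class fItem(scrapy.Item):
    # the field actually written to the CSV
    item = scrapy.Field()
    # only used by the pipeline to decide which file the item goes to
    url = scrapy.Field()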


from scrapy import signals
from scrapy.exporters import CsvItemExporter
from urllib.parse import urlparse

class VappPipeline(object):
    def __init__(self):
        self.files = {}
        self.exporter = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def process_item(self, item, spider):
        url = item['url']
        parsed_uri = urlparse(url)
        domain = parsed_uri.netloc
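        # lazily open one CSV file and exporter per domain the first time it appears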
        if domain not in self.exporter:
            file = open('results/%s.csv' % domain, 'w+b')
            self.files[domain] = file
            self.exporter[domain] = CsvItemExporter(file)
            self.exporter[domain].fields_to_export = ['item']
            self.exporter[domain].start_exporting()

        assert domain in self.exporter

        self.exporter[domain].export_item(item)

        return item

    def spider_closed(self, spider):
        for domain, exporter in self.exporter.items():
            exporter.finish_exporting()
            self.files[domain].close()
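
The pipeline only runs if it is enabled in the project settings. A minimal sketch, assuming the class above lives in vApp/pipelines.py (adjust the dotted path to match your project layout):

# settings.py
ITEM_PIPELINES = {
    'vApp.pipelines.VappPipeline': 300,
}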
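
As for the batch alternative mentioned at the top of this answer: here is a rough post-processing sketch, assuming the crawl first writes one combined CSV with url and item columns (for example via scrapy crawl vApp -o results/all_items.csv, a hypothetical filename) and you split it by domain afterwards:

# split_by_domain.py - run after the crawl has finished
import csv
import os
from urllib.parse import urlparse

os.makedirs('results', exist_ok=True)
writers = {}  # domain -> (file handle, csv.DictWriter)

with open('results/all_items.csv', newline='') as src:
    for row in csv.DictReader(src):
        domain = urlparse(row['url']).netloc
        if domain not in writers:
            f = open('results/%s.csv' % domain, 'w', newline='')
            w = csv.DictWriter(f, fieldnames=['item'])
            w.writeheader()
            writers[domain] = (f, w)
        writers[domain][1].writerow({'item': row['item']})

for f, _ in writers.values():
    f.close()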