I am trying to generate a CSV file for each scraped URL from a list of URLs in Scrapy. I understand I need to modify pipeline.py, but all my attempts have failed so far. I do not understand how to pass the URL being scraped to the pipeline, use it as the name of the output file, and split the output accordingly.
Any help?
Thanks
Here are the spider and the pipeline:
from scrapy import Spider
from scrapy.selector import Selector
from vApp.items import fItem


class VappSpider(Spider):
    name = "vApp"
    allowed_domains = ["google.co.uk"]
    start_urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]

    def parse(self, response):
        trs = Selector(response).xpath('//*[@id="incdiv"]/table/tbody/tr')
        for tr in trs:
            item = fItem()
            try:
                item['item'] = tr.xpath('td/text()').extract()[0]
            except IndexError:
                item['item'] = 'null'
            yield item
Pipeline:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter


class VappPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('results/%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['item']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
I think you should be doing all of this in batch as a post-processing step when your crawl finishes, rather than per item, but here is a draft of how you could do what you want.
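A minimal sketch, assuming you add a url field to fItem and set item['url'] = response.url in parse() (neither is in your current code). The pipeline then lazily creates one CsvItemExporter per URL and routes each item to the right one; the filename scheme (replacing slashes with underscores) is only an illustration:

from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter


class VappPipeline(object):
    def __init__(self):
        self.files = {}      # url -> open file handle
        self.exporters = {}  # url -> CsvItemExporter bound to that file

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def process_item(self, item, spider):
        # Assumes the spider stored the source URL on the item:
        #     item['url'] = response.url
        url = item['url']
        if url not in self.exporters:
            # Illustrative scheme: turn the URL into a filesystem-safe name.
            safe_name = url.replace('://', '_').replace('/', '_')
            f = open('results/%s.csv' % safe_name, 'w+b')
            exporter = CsvItemExporter(f)
            exporter.fields_to_export = ['item']
            exporter.start_exporting()
            self.files[url] = f
            self.exporters[url] = exporter
        self.exporters[url].export_item(item)
        return item

    def spider_closed(self, spider):
        # Finish every per-URL export and close its file when the crawl ends.
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for f in self.files.values():
            f.close()

Note that this keeps one file handle open per start URL for the whole crawl, which is why the batch alternative (export everything to a single CSV with the url column included, then split it by URL after the crawl) may be simpler if your URL list is long.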