I am trying to use Scrapy to export scraped items into a CSV field with each field enclosed in double quotes. Currently, the CSV exports correctly, but when I try to modify the item fields and add double quotes manually, the CSV ends up with each field enclosed in triple double quotes. Here is an example of what I'm trying to do:
Scrapy code
import scrapy
from tutorial.items import StoreItem
class SecilSpider(scrapy.Spider):
name = "secil"
allowed_domains = ["secilstore.com"]
def start_requests(self):
start_urls = reversed(["http://www.secilstore.com/yeni_liste/Sayfa/{0}".format(page) for page in xrange(1,2)] + \
["http://www.secilstore.com/yeni_liste/Magaza/Aksesuar_32/Sayfa/{0}".format(page) for page in xrange(1,2)] + \
["http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33/Sayfa/{0}".format(page) for page in xrange(1,2)])
return [ scrapy.Request(url = start_url) for start_url in start_urls ]
def parse(self, response):
item = StoreItem()
for url in response.xpath('//div[@class="image"]/a/@href').extract():
yield scrapy.Request("http://www.secilstore.com" + url, callback = self.parse)
baseUrl = response.request.headers.get('Referer', None)
if baseUrl is not None:
baseUrl = baseUrl.split('Sayfa')[0]
color = response.xpath('//a[@class="renk"]/text()').extract()
for c in color:
item['url'] = baseUrl
item['productUrl'] = response.url
item['imageUrl'] = "http://www.secilstore.com" + response.xpath('//img[@id="productMainImage"]/@src').extract()[0]
item['color'] = c
item['price'] = response.xpath('//span[@class="price cufonHover"]/text()').extract()[0] + "TL"
item['title'] = response.xpath('//h2[@class="cufon"]/text()').extract()
item['brand'] = response.xpath('//h3[@class="slogan cufonSemi"]/text()').extract()[0]
size = '|'.join(s.strip() for s in response.xpath('//a[@class="inStock"]/text()').extract())
item['size'] = size if size else -1
oldPrice = response.xpath('//div[@class="indirimFiyat"]/text()').extract()
item['oldPrice'] = oldPrice[0] + "TL" if oldPrice else -1
items.append(item)
yield item
My CSV Item Pipeline
class CSVPipeline(object):
def __init__(self):
self.files = {}
@classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
return pipeline
def spider_opened(self, spider):
file = open('/home/ali/%s_items.csv' % spider.name, 'w+b')
self.files[spider] = file
self.exporter = CsvItemExporter(file, False,'"')
self.exporter.fields_to_export = ['url','productUrl','title','brand','imageUrl','price','oldPrice','color','size']
self.exporter.start_exporting()
def spider_closed(self, spider):
self.exporter.finish_exporting()
file = self.files.pop(spider)
file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
So when, I try to modify a field in the spider and add double quotes manually like this (fpr example, for item['url']):
item['url'] = '"%s"' % baseUrl
the resulting CSV prints out the following:
"""http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33""",http://www.secilstore.com/urun/5905b5c6b858458df3f4851d477eec1b/Secil-Kilit-Aksesuarli-Kisa-Sapli-Canta,Kilit Aksesuarlı Kısa Saplı Çanta,Seçil,http://www.secilstore.com/_docs/i400x500/a/a1894cadeb_Kilit-Aksesuarli-Kisa-Sapli-canta.jpg,"69,90TL","159,90TL",Ekru,-1
As you can see, the first field is surrounded by triple double quotes instead of only one. Also what is interesting is that the prices are printed in double quotes. How can I surround each field with only one pair of double quotes?
Thanks!
I found it by modifying the CSVItemPipeline:
This allowed me to generate a CSV file with the fields in double quotes.