Scrapy :: Issues with CSV exporting

2019-06-02 02:04发布

I am trying to use Scrapy to export scraped items into a CSV file with each field enclosed in double quotes. Currently, the CSV exports correctly, but when I try to modify the item fields and add double quotes manually, the CSV ends up with each field enclosed in triple double quotes. Here is an example of what I'm trying to do:

Scrapy code

import scrapy
from tutorial.items import StoreItem

class SecilSpider(scrapy.Spider):
    """Spider for secilstore.com that yields one ``StoreItem`` per colour link.

    ``start_requests`` seeds the crawl with three listing URLs; ``parse``
    handles every response, both following links and extracting item data.
    """

    name = "secil"
    allowed_domains = ["secilstore.com"]

    def start_requests(self):
        """Return the initial requests for the listing pages, in reversed order."""
        start_urls = reversed(
            ["http://www.secilstore.com/yeni_liste/Sayfa/{0}".format(page) for page in xrange(1, 2)] +
            ["http://www.secilstore.com/yeni_liste/Magaza/Aksesuar_32/Sayfa/{0}".format(page) for page in xrange(1, 2)] +
            ["http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33/Sayfa/{0}".format(page) for page in xrange(1, 2)]
        )
        return [scrapy.Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        """Follow links under ``div.image`` and yield an item per colour found.

        Every href found under ``div.image`` is requested again with this same
        callback.  For each ``a.renk`` text node on the page a separate
        ``StoreItem`` is populated and yielded; a page without such nodes
        yields no items.

        Missing ``size`` / ``oldPrice`` values are reported as ``-1``.
        """
        for url in response.xpath('//div[@class="image"]/a/@href').extract():
            yield scrapy.Request("http://www.secilstore.com" + url, callback=self.parse)

        # The Referer header (when present) is used as the item's source URL,
        # truncated at the 'Sayfa' path component.
        baseUrl = response.request.headers.get('Referer', None)
        if baseUrl is not None:
            baseUrl = baseUrl.split('Sayfa')[0]

        color = response.xpath('//a[@class="renk"]/text()').extract()
        for c in color:
            # Build a fresh item on every iteration: re-using one instance
            # would make all yielded items alias the same mutable object.
            # The field lookups stay inside the loop so that pages without
            # colours never evaluate the ``extract()[0]`` expressions.
            item = StoreItem()
            item['url'] = baseUrl
            item['productUrl'] = response.url
            item['imageUrl'] = "http://www.secilstore.com" + response.xpath('//img[@id="productMainImage"]/@src').extract()[0]
            item['color'] = c
            item['price'] = response.xpath('//span[@class="price cufonHover"]/text()').extract()[0] + "TL"
            item['title'] = response.xpath('//h2[@class="cufon"]/text()').extract()
            item['brand'] = response.xpath('//h3[@class="slogan cufonSemi"]/text()').extract()[0]
            size = '|'.join(s.strip() for s in response.xpath('//a[@class="inStock"]/text()').extract())
            item['size'] = size if size else -1
            oldPrice = response.xpath('//div[@class="indirimFiyat"]/text()').extract()
            item['oldPrice'] = oldPrice[0] + "TL" if oldPrice else -1
            # The former ``items.append(item)`` was removed: ``items`` is not
            # defined anywhere, so it raised NameError before the yield.
            yield item

My CSV Item Pipeline

class CSVPipeline(object):
  """Item pipeline that writes every scraped item to a per-spider CSV file.

  The output file is opened when the spider starts and closed when it
  finishes; open file handles are tracked in ``self.files`` keyed by spider.
  """

  def __init__(self):
    # Maps each running spider to its open output file.
    self.files = {}

  @classmethod
  def from_crawler(cls, crawler):
    """Create the pipeline and subscribe it to spider open/close signals."""
    instance = cls()
    crawler.signals.connect(instance.spider_opened, signals.spider_opened)
    crawler.signals.connect(instance.spider_closed, signals.spider_closed)
    return instance

  def spider_opened(self, spider):
    """Open the CSV file for *spider* and start the exporter."""
    output_path = '/home/ali/%s_items.csv' % spider.name
    handle = open(output_path, 'w+b')
    self.files[spider] = handle
    # NOTE(review): per Scrapy's CsvItemExporter signature the 2nd and 3rd
    # positional arguments look like include_headers_line and
    # join_multivalued, i.e. '"' is not a quote-character setting -- confirm.
    exporter = CsvItemExporter(handle, False, '"')
    self.exporter = exporter
    # Fixes both the set of columns and their order in the output.
    exporter.fields_to_export = [
      'url', 'productUrl', 'title', 'brand', 'imageUrl',
      'price', 'oldPrice', 'color', 'size',
    ]
    exporter.start_exporting()

  def spider_closed(self, spider):
    """Finish exporting and close the file that belongs to *spider*."""
    self.exporter.finish_exporting()
    self.files.pop(spider).close()

  def process_item(self, item, spider):
    """Export *item* and hand it back unchanged for later pipelines."""
    self.exporter.export_item(item)
    return item

So when I try to modify a field in the spider and add double quotes manually like this (for example, for item['url']):

# Wrapping the value in literal quote characters makes them part of the field
# data; in the output shown below this field comes out with three quote
# characters on each side.
item['url'] = '"%s"' % baseUrl

the resulting CSV prints out the following:

"""http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33""",http://www.secilstore.com/urun/5905b5c6b858458df3f4851d477eec1b/Secil-Kilit-Aksesuarli-Kisa-Sapli-Canta,Kilit Aksesuarlı Kısa Saplı Çanta,Seçil,http://www.secilstore.com/_docs/i400x500/a/a1894cadeb_Kilit-Aksesuarli-Kisa-Sapli-canta.jpg,"69,90TL","159,90TL",Ekru,-1

As you can see, the first field is surrounded by three pairs of double quotes instead of a single pair. Also, interestingly, the prices are printed in double quotes. How can I surround each field with only one pair of double quotes?

Thanks!

1条回答
来,给爷笑一个
2楼-- · 2019-06-02 02:41

I found the solution by modifying the exporter setup in the CSV item pipeline (CSVPipeline):

 # quoting=csv.QUOTE_ALL requests quotes around every exported field, so no
 # quotes have to be added to the item values by hand (needs `import csv`).
 # NOTE(review): self.fields_to_export is not defined in the pipeline shown
 # in the question -- presumably it is set elsewhere; confirm.
 self.exporter = CsvItemExporter(open(spider.name+".csv", "w"), False, 
                                        fields_to_export=self.fields_to_export, quoting=csv.QUOTE_ALL)

This allowed me to generate a CSV file with the fields in double quotes.

查看更多
登录 后发表回答