I have this object I'm trying to populate with an itemLoader:
{
"domains": "string",
"date_insert": "2016-12-23T11:25:00.213Z",
"title": "string",
"url": "string",
"body": "string",
"date": "2016-12-23T11:25:00.213Z",
"authors": [
"string"
],
"categories": [
"string"
],
"tags": [
"string"
],
"stats": {
"views_count": 0,
"comments_count": 0
}
}
Here's my items.py
class StatsItem(scrapy.Item):
    """Counters intended to be nested under an article's ``stats`` key."""

    views_count = scrapy.Field()
    comments_count = scrapy.Field()
class ArticleItem(scrapy.Item):
    """Article record that the spider fills through an item loader."""

    domain = scrapy.Field()
    date_insert = scrapy.Field()
    date_update = scrapy.Field()
    date = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    # Join() merges the collected body fragments into a single string.
    body = scrapy.Field(output_processor=Join())
    # Identity() leaves the collected values untouched, i.e. as a list.
    authors = scrapy.Field(output_processor=Identity())
    categories = scrapy.Field(output_processor=Identity())
    tags = scrapy.Field()
    # Meant to carry the nested StatsItem data (views_count, comments_count).
    stats = scrapy.Field()
Part of my spider:
def parse(self, response):
    """Collect article fields from ``response`` into an ArticleItemLoader.

    This excerpt only shows the loader being populated; it stops before
    the item is loaded and handed back.
    """
    loader = ArticleItemLoader(response=response)

    # Site root of the crawled page: scheme plus network location.
    url_parts = urlparse(response.url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=url_parts)

    loader.add_css('authors', 'span.meta-author')
    loader.add_css('title', 'h1.title-article')
    loader.add_value('url', response.url)
    loader.add_xpath(
        'date_insert',
        "//div[@class='meta']/time[@itemprop='datePublished']/@datetime",
    )
    loader.add_xpath(
        'date_update',
        "//div[@class='meta']/time[@itemprop='dateModified']/@datetime",
    )
    loader.add_value('domain', domain)
    # Every breadcrumb entry except the "home" one.
    loader.add_xpath(
        'categories',
        "//ul[@class='breadcrumbs']//li[not(contains(@class, 'home'))]",
    )
So far I have successfully populated every field except "stats". I've checked the page "correct way to nest Item data in scrapy", but that approach no longer seems to work (I can't make it work; my error is TypeError: to_unicode must receive a bytes, str or unicode object, got StatsItem).
I'd like to use the ItemLoader, but I don't see how I could populate my "stats" field with my StatsItem.
Thanks for the help.
Edit: I am close, but it still doesn't work:
# Inside parse(): add the result of getStats() to the article loader's 'stats' field.
loader.add_value('stats', self.getStats(response))
def getStats(self, response):
    """Load the stats for ``response`` and return them as a JSON string.

    NOTE: json.dumps() produces a str, so the value stored under ``stats``
    ends up as an escaped JSON string rather than a nested object.
    """
    stats_loader = StatsItemLoader(response=response)
    stats_loader.add_xpath(
        'comments_count',
        "//div[@class='btn-count']//a/text()",
    )
    stats_loader.add_value('views_count', '42')
    stats_item = stats_loader.load_item()
    return json.dumps(dict(stats_item))
but my output looks like this: { [...] "stats": "{\"comments_count\": \"0\", \"views_count\": \"42\"}" }
Thanks to @eLRuLL I managed to find a decent solution:
items.py :
spider.py:
Originally it was not working because my input_processor for the stats field was
MapCompose(remove_tags)
(remove_tags expects text, hence the to_unicode TypeError quoted earlier). In order to serialize the object you have to return dict(loader.load_item())
and not just return loader.load_item()
Thanks !