I am trying to programmatically call a spider from a script. I am unable to override the settings through the constructor using CrawlerProcess. Let me illustrate this with the default spider for scraping quotes from the official Scrapy site (last code snippet at the official Scrapy quotes example spider).
class QuotesSpider(Spider):
    """Scrape quote text, author and tags from quotes.toscrape.com.

    NOTE(review): assigning ``self.custom_settings`` inside ``__init__`` does
    not affect the crawl settings — Scrapy reads ``custom_settings`` from the
    class before the spider is instantiated (which is what the question is
    about).
    """

    name = "quotes"

    def __init__(self, somestring, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.somestring = somestring
        self.custom_settings = kwargs

    def start_requests(self):
        # Two hard-coded listing pages, each handled by self.parse.
        for page_url in ('http://quotes.toscrape.com/page/1/',
                         'http://quotes.toscrape.com/page/2/'):
            yield Request(url=page_url, callback=self.parse)

    def parse(self, response):
        # Emit one item per quote box found on the page.
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
Here is the script through which I try to run the quotes spider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
def main():
    """Run the 'quotes' spider, attempting to pass settings as spider kwargs."""
    process = CrawlerProcess(get_project_settings())
    spider_kwargs = {
        'FEED_URI': 'quotes.csv',
        'LOG_FILE': 'quotes.log'
    }
    # 'dummyinput' is bound to the spider's ``somestring`` positional argument.
    process.crawl('quotes', 'dummyinput', **spider_kwargs)
    process.start()
Scrapy Settings are a bit like Python dicts, so you can update the settings object before passing it to CrawlerProcess:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
def main():
    """Run the 'quotes' spider with per-run settings applied up front.

    Scrapy ``Settings`` behave much like a dict, so the overrides are merged
    into the project settings *before* they are handed to ``CrawlerProcess``.
    """
    settings = get_project_settings()
    settings.update({
        'FEED_URI': 'quotes.csv',
        'LOG_FILE': 'quotes.log'
    })
    proc = CrawlerProcess(settings)
    # Bug fix: the original line referenced an undefined name
    # ``custom_settings_spider`` (a NameError at runtime). The overrides now
    # live in ``settings``, so only the spider's own positional argument
    # ('dummyinput' -> somestring) is passed to crawl().
    proc.crawl('quotes', 'dummyinput')
    proc.start()
Edit following OP's comments:
Here's a variation using CrawlerRunner
, with a new CrawlerRunner
for each crawl and re-configuring logging at each iteration to write to different files each time:
import logging
from twisted.internet import reactor, defer
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging, _get_handler
from scrapy.utils.project import get_project_settings
class QuotesSpider(scrapy.Spider):
    """Scrape one listing page; the page number arrives as a spider kwarg."""

    name = "quotes"

    def start_requests(self):
        # Default to page 1 unless a ``page`` argument was passed to crawl().
        page_number = getattr(self, 'page', 1)
        url = 'http://quotes.toscrape.com/page/{}/'.format(page_number)
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        # One item per quote box on the page.
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
@defer.inlineCallbacks
def crawl():
    """Run three sequential crawls, each writing its own feed and log file."""
    settings = get_project_settings()
    for run_index in range(1, 4):
        settings.update({
            'FEED_URI': 'quotes%03d.csv' % run_index,
            'LOG_FILE': 'quotes%03d.log' % run_index
        })
        # Manually install a fresh root handler pointing at this run's
        # LOG_FILE (the root handler is deliberately not installed by
        # configure_logging so we can swap it per iteration).
        configure_logging(settings=settings, install_root_handler=False)
        logging.root.setLevel(logging.NOTSET)
        file_handler = _get_handler(settings)
        logging.root.addHandler(file_handler)

        runner = CrawlerRunner(settings)
        yield runner.crawl(QuotesSpider, page=run_index)

        # Detach this run's handler so the next iteration starts clean.
        logging.root.removeHandler(file_handler)
    reactor.stop()


crawl()
reactor.run()  # the script will block here until the last crawl call is finished
I think you can't override the custom_settings
variable of a Spider Class when calling it as a script, basically because the settings are being loaded before the spider is instantiated.
Now, I don't really see a point on changing the custom_settings
variable specifically, as it is only a way to override your default settings, and that's exactly what the CrawlerProcess
offers too, this works as expected:
import scrapy
from scrapy.crawler import CrawlerProcess
class MySpider(scrapy.Spider):
    """Fetch one page and print the effective settings alongside the item."""

    name = 'simple'
    start_urls = ['http://httpbin.org/headers']

    def parse(self, response):
        # Print every effective setting so the overrides passed to
        # CrawlerProcess are visible in the output.
        for key, value in self.settings.items():
            print('{}: {}'.format(key, value))
        yield {
            'headers': response.body
        }
# Settings handed straight to CrawlerProcess override the defaults,
# exactly as custom_settings would.
overrides = {
    'USER_AGENT': 'my custom user anget',
    'ANYKEY': 'any value',
}
process = CrawlerProcess(overrides)
process.crawl(MySpider)
process.start()
You can override a setting from the command line
https://doc.scrapy.org/en/latest/topics/settings.html#command-line-options
For example: scrapy crawl myspider -s LOG_FILE=scrapy.log
It seems you want a custom log for each spider. You need to configure logging like this:
from scrapy.utils.log import configure_logging
class MySpider(scrapy.Spider):
    """Spider that routes its log output to its own file.

    Only the logging-related part is shown; the rest of the spider
    (name, start_urls, parse, ...) is omitted.
    """

    def __init__(self, *args, **kwargs):
        # Bug fix: the original __init__ neither accepted spider kwargs nor
        # called the base initializer, which breaks Scrapy's spider
        # construction (from_crawler passes arguments through __init__).
        super(MySpider, self).__init__(*args, **kwargs)
        # Send this spider's log output to a dedicated file.
        configure_logging({'LOG_FILE': "logs/mylog.log"})