I am using the API to run Scrapy from a script (Python 3.5, Scrapy 1.5).
The main script calls a function to deal with its logging:
import datetime
import os
import scraping
import utils

def main(target_year):
    # Name the project after this script's file name, minus the extension.
    project = os.path.splitext(os.path.basename(os.path.abspath(__file__)))[0]
    iso_run_date = datetime.date.today().isoformat()
    logger = utils.get_logger(project, iso_run_date)
    scraping.run(project, iso_run_date, target_year)
Here is the function in the file "utils.py" that creates a logger with Python's logging library, along with a small formatter class for UTC timestamps:
import logging
import os
import socket
import time


class UTCFormatter(logging.Formatter):
    # Render %(asctime)s in UTC rather than local time.
    converter = time.gmtime


def get_logger(project, iso_run_date):
    ip_address_param = 'ip'
    logger = logging.getLogger(project)
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(os.path.abspath(os.path.join(
        'log', '{}_{}.log'.format(project, iso_run_date))))
    file_handler.setLevel(logging.DEBUG)
    formatter = UTCFormatter(
        fmt=('[%(asctime)s.%(msecs)03dZ] %({})s %(name)s %(levelname)s: '
             '%(message)s').format(ip_address_param),
        datefmt='%Y-%m-%dT%H:%M:%S')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    # Wrap the logger in an adapter that adds the machine's IP address
    # to every record logged through it.
    logger = logging.LoggerAdapter(
        logger, {ip_address_param: socket.gethostbyname(socket.gethostname())})
    return logger
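With this formatter, a line in the log file looks like this (illustrative timestamp and IP address, assuming the main script is named "project.py"):

[2018-07-16T08:14:03.123Z] 10.0.0.5 project DEBUG: Some message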
And here is the file "__init__.py" in the "scraping" package's directory:
import os

import scrapy.crawler
import scrapy.utils.project
import twisted.internet.defer


@twisted.internet.defer.inlineCallbacks
def crawl(crawler_process, project, iso_run_date, target_year):
    yield crawler_process.crawl(project, iso_run_date, target_year)


def run(project, iso_run_date, target_year):
    os.environ.setdefault(
        'SCRAPY_SETTINGS_MODULE', 'scraping.scraping.settings')
    crawler_process = scrapy.crawler.CrawlerProcess(
        scrapy.utils.project.get_project_settings())
    crawl(crawler_process, project, iso_run_date, target_year)
    # start() runs the Twisted reactor and blocks until the crawl finishes.
    crawler_process.start()
When I execute the script, I get logs from my main script in the output log file, but nothing from Scrapy.
When I add this inside my spider:
self.logger.debug('Test')
I get this error:
--- Logging error ---
Traceback (most recent call last):
File "/usr/lib/python3.5/logging/__init__.py", line 980, in emit
msg = self.format(record)
File "/usr/lib/python3.5/logging/__init__.py", line 830, in format
return fmt.format(record)
File "/usr/lib/python3.5/logging/__init__.py", line 570, in format
s = self.formatMessage(record)
File "/usr/lib/python3.5/logging/__init__.py", line 539, in formatMessage
return self._style.format(record)
File "/usr/lib/python3.5/logging/__init__.py", line 383, in format
return self._fmt % record.__dict__
KeyError: 'ip'
Call stack:
File "XXXXX.py", line 105, in <module>
main(target_year)
File "XXXXX.py", line 23, in main
scraping.run(project, iso_run_date, target_year)
File "/home/XYZ/virtualenvs/scraping/project/scraping/__init__.py", line 27, in run
crawler_process.start()
File "/home/XYZ/virtualenvs/scraping/lib/python3.5/site-packages/scrapy/crawler.py", line 291, in start
reactor.run(installSignalHandlers=False) # blocking call
File "/home/XYZ/virtualenvs/scraping/lib/python3.5/site-packages/twisted/internet/base.py", line 1261, in run
self.mainLoop()
File "/home/XYZ/virtualenvs/scraping/lib/python3.5/site-packages/twisted/internet/base.py", line 1270, in mainLoop
self.runUntilCurrent()
File "/home/XYZ/virtualenvs/scraping/lib/python3.5/site-packages/twisted/internet/base.py", line 896, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/home/XYZ/virtualenvs/scraping/lib/python3.5/site-packages/scrapy/utils/reactor.py", line 41, in __call__
return self._func(*self._a, **self._kw)
File "/home/XYZ/virtualenvs/scraping/lib/python3.5/site-packages/scrapy/core/engine.py", line 127, in _next_request
request = next(slot.start_requests)
File "/home/XYZ/virtualenvs/scraping/project/scraping/scraping/spiders/XXXXX.py", line 47, in start_requests
self.logger.debug('Test')
Message: 'Test'
Arguments: ()
Things were fine when I used logging.basicConfig() in my main script; Scrapy seemed to just pick up that basic configuration. But because of the additional formatting, it looks like I need more advanced logging code.
From the traceback, I think I see what happens: since my spider is named after the project, its records go to the very logger my file handler is attached to, but they never pass through the LoggerAdapter, so they lack the ip attribute and the formatter raises KeyError. The rest of Scrapy's messages propagate to the root logger, which has no handler, which would explain why nothing from Scrapy reaches the file.
I would like to be able to define a custom logger from my main script, as shown in the code above, and have Scrapy use the same formatting and the same output file, without having to redefine all of it a second time. Is this possible?
I found a way that seems to work.
The main script is unchanged.
The file "utils.py" now only uses logging.basicConfig(), like I was doing before.
The file "__init__.py" is also unchanged:
Now all logging messages are output in the same custom format to the same log file.
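For example, the file now contains my own records and Scrapy's side by side (illustrative values):

[2018-07-16T08:14:03.123Z] 10.0.0.5 project DEBUG: Test
[2018-07-16T08:14:04.456Z] 10.0.0.5 scrapy.core.engine INFO: Spider opened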