Force spider to stop in Scrapy

Posted 2020-02-11 15:49

I have 20 spiders in one project. Each spider has a different task and URL to crawl (but the data are similar, so I'm using a shared items.py and pipelines.py for all of them). In my pipeline class I want the spider to stop crawling when certain conditions are satisfied. I've tested

  raise DropItem("terminated by me")

and

 raise CloseSpider('terminate by me')

but both of them only abort the handling of the current item; the next_page URL is still crawled!

Part of my pipelines.py:

import pymongo

from scrapy.conf import settings
from scrapy.exceptions import CloseSpider, DropItem
from scrapy import log


class MongoDBPipeline(object):

    def __init__(self):
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        # both raises below were just tests; neither one stops the whole
        # crawl, and the second raise is never reached anyway
        raise CloseSpider('terminateby')
        raise DropItem("terminateby")

        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("Items added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        return item

And my spider:

import scrapy
import json
from Maio.items import MaioItem



class ZhilevanSpider(scrapy.Spider):
    name = 'tehran'
    allowed_domains = []
    start_urls = ['https://search.Maio.io/json/']
    place_code = str(1)

    def start_requests(self):

        request_body = {
                "id": 2,
                "jsonrpc": "2.0",
                "method": "getlist",
                "params": [[["myitem", 0, [self.place_code]]], next_pdate]
        }
        # for body in request_body:
        #     request_body = body

        request_body = json.dumps(request_body)
        print(request_body)
        yield scrapy.Request(
            url='https://search.Maio.io/json/',
            method="POST",
            body=request_body,
            callback=self.parse,
            headers={'Content-type' : 'application/json;charset=UTF-8'}
        )

    def parse(self, response):

        print(response)
        # print(response.body.decode('utf-8'))
        input = (response.body.decode('utf-8'))
        result = json.loads(input)
        # for key,item in result["result"]:
        #     print(key)
        next_pdate = result["result"]["last_post_date"]
        print(result["result"]["last_post_date"])
        for item in result["result"]["post_list"]:
            print("title : {0}".format(item["title"]))
            ads = MaioItem()
            ads['title'] = item["title"]
            ads['desc'] = item["desc"]
        yield ads
        if next_pdate:
            request_body = {
                "id": 2,
                "jsonrpc": "2.0",
                "method": "getlist",
                "params": [[["myitem", 0, [self.place_code]]], next_pdate]
            }

            request_body = json.dumps(request_body)
            yield scrapy.Request(
                url='https://search.Maio.io/json/',
                method="POST",
                body=request_body,
                callback=self.parse,
                headers={'Content-type': 'application/json; charset=UTF-8'}
            )

**Update**

Even when I put sys.exit("SHUT DOWN EVERYTHING!") in the pipeline, the next page is still crawled.

I see the following in the log for every page that is crawled:

sys.exit("SHUT DOWN EVERYTHING!")
SystemExit: SHUT DOWN EVERYTHING!

Tags: python scrapy
1 answer
倾城 Initia
#2 · 2020-02-11 16:18

If you want to stop a spider from a pipeline, you can call the close_spider() method of the engine:

class MongoDBPipeline(object):

    def process_item(self, item, spider):
        spider.crawler.engine.close_spider(spider, reason='finished')
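
For context, here is a minimal sketch of how that call could sit inside the shared pipeline described in the question; the should_stop() helper and its condition are hypothetical placeholders for whatever check should terminate a given spider:

from scrapy.exceptions import DropItem


class MongoDBPipeline(object):

    def should_stop(self, item, spider):
        # Hypothetical condition: stop the spider named 'tehran' once an
        # item arrives without a title. Replace with the real check.
        return spider.name == 'tehran' and not item.get('title')

    def process_item(self, item, spider):
        if self.should_stop(item, spider):
            # Ask the engine to close this spider. Requests already in
            # flight may still finish, but nothing new gets scheduled.
            spider.crawler.engine.close_spider(spider, reason='terminated by pipeline')
            # Drop the current item so it is not stored.
            raise DropItem('spider closed by pipeline condition')
        return item

Note that close_spider() only stops further scheduling; the item currently in process_item would still flow through the rest of the pipeline if it were returned, which is why the sketch drops it instead.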