我试图用Scrapy登录到在init一个网站,然后确认登录我要初始化,并通过start_urls启动标准抓取后。 林不知道是怎么了,但我得到清楚的登录名和每一件事确认,但从来没有开始parse_item。 任何帮助将十分赞赏。
我可以把它上升到“=====成功登录=================”
但
我不能去“===== PARSE项目================ ==========”
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule
from selenium import webdriver
class ProductDetailsSpider(InitSpider):
name = 'product_details_spider'
allowed_domains = ['my_domain.com']
login_page = 'http://www.my_domain.com/'
start_urls = ['http://www.my_domain.com/nextpage1/',
'http://www.my_domain.com/nextpage2/',
'http://www.my_domain.com/nextpage3/']
rules = (
Rule(SgmlLinkExtractor(allow=()),
callback='parse_item',
follow=True),
)
def get_cookies(self):
driver = webdriver.Firefox()
driver.implicitly_wait(30)
base_url = "http://www.my_domain.com"
driver.get(base_url + "/")
driver.find_element_by_name("USR").clear()
driver.find_element_by_name("USR").send_keys("my_user")
driver.find_element_by_name("PASSWRD").clear()
driver.find_element_by_name("PASSWRD").send_keys("my_pass")
driver.find_element_by_name("submit").click()
cookies = driver.get_cookies()
driver.close()
cookie_dic = {}
for c in cookies:
cookie_dic[c['name']] = c['value']
return cookie_dic
def init_request(self):
print '=======================INIT======================='
"""This function is called before crawling starts."""
return Request(url=self.login_page, callback=self.login)
def login(self, response):
print '=======================LOGIN======================='
"""Generate a login request."""
return [FormRequest.from_response(response,formname='login_form',
formdata={'USR': 'my_user', 'PASSWRD': 'my_pass'},
callback=self.login_cookies)]
def login_cookies(self, response):
print '=======================COOKIES======================='
return Request(url='http://www.my_domain.com/home',
cookies=self.get_cookies(),
callback=self.check_login_response)
def check_login_response(self, response):
print '=======================CHECK LOGIN======================='
"""Check the response returned by a login request to see if we are
successfully logged in.
"""
if "Logoff" in response.body:
print "=========Successfully logged in.========="
self.initialized()
# Now the crawling can begin..
else:
print "==============Bad times :(==============="
# Something went wrong, we couldn't log in, so nothing happens.
def parse_item(self, response):
print "==============PARSE ITEM=========================="
# Scrape data from page