I wrote a Scrapy CrawlSpider that reads the list of ads on the first page, takes some information such as the thumbnail and the URL of each ad, and then yields a request for each of those ad URLs to get their details.
It works, and the pagination apparently works fine, in the test environment. But trying a full run today, I noticed this in the log:

    Crawled 3852 pages (at 228 pages/min), scraped 256 items (at 15 items/min)
I don't understand the reason for such a big difference between pages crawled and items scraped. Can anybody help me figure out where the items get lost?
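In case it helps with the diagnosis, this is a minimal sketch of the only check I know of for whether the built-in duplicate filter is silently dropping the detail-page requests (DUPEFILTER_DEBUG is a standard Scrapy setting and dupefilter/filtered is the counter kept by the default RFPDupeFilter, assuming a Scrapy version that has both):

    # settings.py
    # Log every request dropped by the default RFPDupeFilter,
    # not only the first occurrence.
    DUPEFILTER_DEBUG = True

The stats dump at the end of the run then includes a 'dupefilter/filtered' count, which could be compared against the gap between pages crawled and items scraped.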
My spider code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

# Item and loader classes; the module path is anonymized like the rest
from myproject.items import MyItem, MyItemLoader


class MySpider(CrawlSpider):
    name = "myspider"
    allowed_domains = ["myspider.com", "myspider.co"]
    start_urls = [
        "http://www.myspider.com/offers/myCity/typeOfAd/?search=fast",
    ]

    # Pagination: follow every extracted link, parsing each page as a list page
    rules = (
        Rule(SgmlLinkExtractor(), callback='parse_start_url', follow=True),
    )

    # 1st page: grab the URL and thumbnail of every offer in the list,
    # then request each offer page to complete the item
    def parse_start_url(self, response):
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//a[@class='pagNext']/@href").extract()
        offers = hxs.select("//div[@class='hlist']")
        for offer in offers:
            item = MyItem()
            item['url'] = offer.select('.//span[@class="location"]/a/@href').extract()[0]
            item['thumb'] = offer.select('.//div[@class="itemFoto"]/div/a/img/@src').extract()[0]
            request = Request(item['url'], callback=self.second_page)
            request.meta['item'] = item
            yield request
        if next_page:
            yield Request(next_page[0], callback=self.parse_start_url)

    # 2nd page: add the address to the item started on the list page
    def second_page(self, response):
        item = response.meta['item']
        loader = MyItemLoader(item=item, response=response)
        loader.add_xpath('address', '//span[@itemprop="streetAddress"]/text()')
        return loader.load_item()
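For what it's worth, a narrower rule that only follows the pager links would look like the sketch below (it reuses the pagNext class from my own XPath above; the real pattern would depend on the site), so parse_start_url would run on list pages only rather than on every page the extractor reaches:

    # Sketch: restrict the link extractor to the pagination links, so the
    # rule no longer follows every link on every crawled page.
    rules = (
        Rule(
            SgmlLinkExtractor(restrict_xpaths="//a[@class='pagNext']"),
            callback='parse_start_url',
            follow=True,
        ),
    )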