Scrapy shell can't crawl information while the XPath works in the browser

Published 2019-08-21 01:25

Question:

I'm working on a project to collect the university's professors' contact information (so it is not malicious). The professor page is dynamic, so I found the underlying request via the Chrome Network tab. However, the XPath that works in the Chrome console returns nothing in the Scrapy shell, even after adding headers.

(Screenshot: Scrapy shell result)

(Screenshot: Chrome console result)
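
For reference, the check in the Scrapy shell goes roughly like this (the staff URL is elided because it comes from the directory search results; the XPath is the same one used in parse_item below):

scrapy shell "http://www.uh.edu/directory/index.php?..."
# then, inside the shell:
response.xpath('//h2[@class="single_title"]/text()').extract_first()

In the Chrome console the equivalent $x('//h2[@class="single_title"]/text()') does return the name.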

import scrapy
from universities.items import UniversitiesItem


class UniversityOfHouston(scrapy.Spider):
    name = 'University_of_Houston'
    allowed_domains = ['uh.edu']
    start_urls = ['http://www.uh.edu/directory/']

    def __init__(self):
        self.lastName = ''

    def parse(self, response):
        self.lastName = 'An'
        query = "http://www.uh.edu/directory/proxy.php?q=" + self.lastName + \
                "&submit=Search&limit=250&loc=HR730&pos=faculty%7Cstaff&faculty=faculty&staff=staff&student=student"

        yield scrapy.Request(query, callback=self.parse_staff)

    def parse_staff(self, response):
        results = response.xpath('//dt/a/@href').extract()
        for result in results:
            query = 'http://www.uh.edu/directory/' + result
            yield scrapy.Request(query, callback=self.parse_item)

    def parse_item(self, response):

        item = UniversitiesItem()

        item['full_name'] = response.xpath('//h2[@class="single_title"]/text()').extract_first()
        item['university'] = 'University of Houston'
        item['discipline'] = response.xpath('//td/a[@class="org"]/text()').extract_first()
        item['title'] = response.xpath('//tr/td[@class="title"]/text()').extract_first()
        item['email'] = response.xpath('//td/a[@title="email address"]/text()').extract_first()[7:]
        item['phone'] = response.xpath('//td[@class="tel"]/a/text()').extract_first()

        yield item
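
(UniversitiesItem itself is not shown in the question; judging from the fields the spiders assign, universities/items.py would look roughly like this sketch:)

import scrapy


class UniversitiesItem(scrapy.Item):
    # Fields referenced by the two spiders; the real items.py may differ.
    full_name = scrapy.Field()
    fullname = scrapy.Field()    # the test version below uses this key
    university = scrapy.Field()
    discipline = scrapy.Field()  # used by the test version
    department = scrapy.Field()
    title = scrapy.Field()
    email = scrapy.Field()
    phone = scrapy.Field()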

Test version:

import scrapy
from universities.items import UniversitiesItem


class UniversityOfHouston(scrapy.Spider):
    #name = 'University_of_Houston'
    name = 'uh2'
    allowed_domains = ['uh.edu']
    start_urls = ['http://www.uh.edu/directory/']

    def __init__(self):
        self.last_name = ''

    def parse(self, response):
        with open('kw.txt') as file_object:
            last_names = file_object.readlines()

        for ln in ['Lee', 'Zhao']:
            self.last_name = ln.strip()
            print('-----------------------------------------------------')
            print("scraping last name: ", self.last_name)
            query = "http://www.uh.edu/directory/proxy.php?q=" + self.last_name + \
                    "&submit=Search&limit=250&loc=HR730&pos=faculty%7Cstaff&faculty=faculty&staff=staff&student=student"

            yield scrapy.Request(query, callback=self.parse_staff)

    def parse_staff(self, response):
        results = response.xpath('//dt/a/@href').extract()
        for result in results:
            query_proxy = 'http://www.uh.edu/directory/' + result.replace("index.php", "proxy.php")
            yield scrapy.Request(query_proxy, callback=self.parse_item)

    def parse_item(self, response):
        full_name = response.xpath('//h2[@class="single_title"]/text()').extract_first()
        if full_name:
            if self.last_name in full_name.split():
                item = UniversitiesItem()
                item['fullname'] = full_name
                # last_name = full_name.split()[-1]
                # item['lastname'] = last_name
                # item['firstname'] = full_name[:-len(last_name)].strip()
                item['university'] = 'University of Houston'
                try:
                    item['department'] = response.xpath('//td/a[@class="org"]/text()').extract_first()
                    item['title'] = response.xpath('//tr/td[@class="title"]/text()').extract_first()
                    item['email'] = response.xpath('//td/a[@title="email address"]/text()').extract_first()
                    item['phone'] = response.xpath('//td[@class="tel"]/a/text()').extract_first()
                except ValueError:
                    pass

                yield item
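
Assuming a standard Scrapy project layout, the test spider is run with (the output file name is arbitrary):

scrapy crawl uh2 -o uh_test.json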

Answer 1:

The issue is that the data is fetched via an AJAX call on the web page, so it is not available in the HTML you get when you fetch the main page.

Change your parse_staff function to the one below and it should work:

def parse_staff(self, response):
    results = response.xpath('//dt/a/@href').extract()
    for result in results:
        query_proxy = "https://ssl.uh.edu/directory/" + result.replace("index.php", "proxy.php")
        yield response.follow(query_proxy, callback=self.parse_item)
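
You can verify it in the Scrapy shell the same way: take one of the hrefs matched by //dt/a/@href, swap index.php for proxy.php as above, and the selectors that returned nothing before should now match (the query string below is a placeholder for whatever the href contains):

scrapy shell "https://ssl.uh.edu/directory/proxy.php?..."
# inside the shell:
response.xpath('//h2[@class="single_title"]/text()').extract_first()
response.xpath('//td/a[@class="org"]/text()').extract_first()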