Scrapy shell can't crawl information while XPath works in the browser

2019-08-21 00:53发布

I'm working on a project to collect university professors' contact information (so it is not malicious). The professor page is dynamic, and I found the underlying request via the Chrome Network tab. However, the XPath doesn't work in the Scrapy shell, although it works in the browser. I even tried adding headers. Scrapy shell result

Chrome console result

import scrapy
from universities.items import UniversitiesItem


class UniversityOfHouston(scrapy.Spider):
    """Collect faculty and staff contact details from the UH online directory.

    The crawl runs in three steps: ``parse`` issues a directory search for a
    last name, ``parse_staff`` requests every result link, and ``parse_item``
    turns one detail page into a ``UniversitiesItem``.
    """

    name = 'University_of_Houston'
    allowed_domains = ['uh.edu']
    start_urls = ['http://www.uh.edu/directory/']

    def __init__(self, *args, **kwargs):
        # Forward everything to scrapy.Spider so its own initialisation
        # (including spider arguments passed with ``-a``) still runs.
        super().__init__(*args, **kwargs)
        self.lastName = ''

    def parse(self, response):
        """Issue the directory search request for the configured last name."""
        self.lastName = 'An'
        query = (
            "http://www.uh.edu/directory/proxy.php?q=" + self.lastName
            + "&submit=Search&limit=250&loc=HR730&pos=faculty%7Cstaff"
              "&faculty=faculty&staff=staff&student=student"
        )

        yield scrapy.Request(query, callback=self.parse_staff)

    def parse_staff(self, response):
        """Request the detail page behind every link in the search results."""
        for href in response.xpath('//dt/a/@href').extract():
            # The hrefs are relative to the directory root.
            yield scrapy.Request('http://www.uh.edu/directory/' + href,
                                 callback=self.parse_item)

    def parse_item(self, response):
        """Build one ``UniversitiesItem`` from a person's detail page.

        Each scraped field holds the first matching text node, or ``None``
        when the page has no such node.

        NOTE(review): the detail page reportedly loads its data via AJAX, so
        these selectors may match nothing on the statically fetched page --
        confirm against the live site.
        """
        item = UniversitiesItem()

        item['full_name'] = response.xpath('//h2[@class="single_title"]/text()').extract_first()
        item['university'] = 'University of Houston'
        item['discipline'] = response.xpath('//td/a[@class="org"]/text()').extract_first()
        # extract_first() so the item stores text rather than a SelectorList.
        item['title'] = response.xpath('//tr/td[@class="title"]/text()').extract_first()

        # The first 7 characters are dropped (presumably a fixed prefix --
        # TODO confirm). Guard the slice so a missing e-mail node yields None
        # instead of raising TypeError.
        email = response.xpath('//td/a[@title="email address"]/text()').extract_first()
        item['email'] = email[7:] if email is not None else None

        item['phone'] = response.xpath('//td[@class="tel"]/a/text()').extract_first()

        yield item

Test version:

import scrapy
from universities.items import UniversitiesItem


class UniversityOfHouston(scrapy.Spider):
    """Test spider: scrape UH directory entries for a fixed set of last names.

    The searched last name travels with each request in ``Request.meta``.
    Scrapy schedules requests asynchronously, so a shared attribute such as
    ``self.last_name`` would already hold the *last* name looped over by the
    time any detail page is parsed, and results for earlier names would be
    filtered out.
    """

    name = 'uh2'
    allowed_domains = ['uh.edu']
    start_urls = ['http://www.uh.edu/directory/']

    def __init__(self, *args, **kwargs):
        # Forward everything to scrapy.Spider so its own initialisation
        # (including spider arguments passed with ``-a``) still runs.
        super().__init__(*args, **kwargs)
        self.last_name = ''

    def parse(self, response):
        """Issue one directory search request per test last name."""
        # The previous version also read kw.txt here but never used its
        # contents; that read is dropped so a missing file cannot abort
        # the crawl.
        for last_name in ('Lee', 'Zhao'):
            last_name = last_name.strip()
            # Kept for backward compatibility; callbacks rely on meta instead.
            self.last_name = last_name
            print('-----------------------------------------------------')
            print("scraping last name: ", last_name)
            query = (
                "http://www.uh.edu/directory/proxy.php?q=" + last_name
                + "&submit=Search&limit=250&loc=HR730&pos=faculty%7Cstaff"
                  "&faculty=faculty&staff=staff&student=student"
            )

            yield scrapy.Request(query, callback=self.parse_staff,
                                 meta={'last_name': last_name})

    def parse_staff(self, response):
        """Request the proxy.php detail page for every search result link."""
        last_name = response.meta.get('last_name', self.last_name)
        for href in response.xpath('//dt/a/@href').extract():
            # Result links point at index.php; proxy.php is requested instead.
            query_proxy = 'http://www.uh.edu/directory/' + href.replace("index.php", "proxy.php")
            yield scrapy.Request(query_proxy, callback=self.parse_item,
                                 meta={'last_name': last_name})

    def parse_item(self, response):
        """Yield an item when the page's full name contains the searched last name.

        Pages without a name heading, or whose name does not include the
        searched last name as a whole word, produce nothing.
        """
        full_name = response.xpath('//h2[@class="single_title"]/text()').extract_first()
        if not full_name:
            return

        last_name = response.meta.get('last_name', self.last_name)
        if last_name not in full_name.split():
            return

        item = UniversitiesItem()
        item['fullname'] = full_name
        item['university'] = 'University of Houston'
        # extract_first() returns None for a missing node, so no exception
        # handling is needed around these lookups.
        item['department'] = response.xpath('//td/a[@class="org"]/text()').extract_first()
        item['title'] = response.xpath('//tr/td[@class="title"]/text()').extract_first()
        item['email'] = response.xpath('//td/a[@title="email address"]/text()').extract_first()
        item['phone'] = response.xpath('//td[@class="tel"]/a/text()').extract_first()

        yield item

1条回答
迷人小祖宗
2楼-- · 2019-08-21 01:33

The issue is that the data is fetched by an AJAX call on the web page, so it is not available when you fetch the main page.

AJAX Call

Change your parse_staff function to the version below and it should work:

def parse_staff(self, response):
    """Follow every search result to its AJAX (proxy.php) detail endpoint.

    Each result href has ``index.php`` swapped for ``proxy.php`` and is
    requested from the ``ssl.uh.edu`` host, with ``self.parse_item`` as the
    callback.
    """
    for href in response.xpath('//dt/a/@href').extract():
        query_proxy = "https://ssl.uh.edu/directory/" + href.replace("index.php", "proxy.php")
        yield response.follow(query_proxy, callback=self.parse_item)
查看更多
登录 后发表回答