I'm working on a project to collect contact information for a university's professors (so it is not malicious). The professor directory page is dynamic, and I found the underlying request using the Network tab in Chrome's developer tools. However, my XPath expressions return nothing in the Scrapy shell, even though they work in the browser. I even tried adding request headers. Scrapy shell result:
import scrapy
from universities.items import UniversitiesItem
class UniversityOfHouston(scrapy.Spider):
    """Scrape contact details from the University of Houston directory.

    Crawl flow:
      1. ``parse`` issues a directory search request for a last name.
      2. ``parse_staff`` follows every result link found on the search page.
      3. ``parse_item`` extracts one ``UniversitiesItem`` per detail page.
    """

    name = 'University_of_Houston'
    allowed_domains = ['uh.edu']
    start_urls = ['http://www.uh.edu/directory/']

    def __init__(self, *args, **kwargs):
        # Forward everything to scrapy.Spider so spider arguments and the
        # base-class initialisation keep working.
        super(UniversityOfHouston, self).__init__(*args, **kwargs)
        self.lastName = ''

    def parse(self, response):
        """Request the directory search results for the configured last name."""
        self.lastName = 'An'
        query = (
            "http://www.uh.edu/directory/proxy.php?q=" + self.lastName +
            "&submit=Search&limit=250&loc=HR730&pos=faculty%7Cstaff"
            "&faculty=faculty&staff=staff&student=student"
        )
        yield scrapy.Request(query, callback=self.parse_staff)

    def parse_staff(self, response):
        """Yield one request per person linked from a search-result page."""
        for href in response.xpath('//dt/a/@href').extract():
            # The result links point at index.php; the detail markup is
            # requested from proxy.php instead, the same endpoint family the
            # search request in parse() uses.
            detail_url = ('http://www.uh.edu/directory/'
                          + href.replace("index.php", "proxy.php"))
            yield scrapy.Request(detail_url, callback=self.parse_item)

    def parse_item(self, response):
        """Build a ``UniversitiesItem`` from a single person's detail page.

        Fields whose node is missing from the page are stored as ``None``.
        """
        item = UniversitiesItem()
        item['full_name'] = response.xpath(
            '//h2[@class="single_title"]/text()').extract_first()
        item['university'] = 'University of Houston'
        item['discipline'] = response.xpath(
            '//td/a[@class="org"]/text()').extract_first()
        # Extract the text itself rather than storing the SelectorList.
        item['title'] = response.xpath(
            '//tr/td[@class="title"]/text()').extract_first()
        email = response.xpath(
            '//td/a[@title="email address"]/text()').extract_first()
        # Drop the 7-character prefix only when an address was actually found;
        # slicing None would raise TypeError.
        item['email'] = email[7:] if email else email
        item['phone'] = response.xpath(
            '//td[@class="tel"]/a/text()').extract_first()
        yield item
Test version:
import scrapy
from universities.items import UniversitiesItem
class UniversityOfHouston(scrapy.Spider):
    """Test spider for the University of Houston directory.

    Searches the directory for a fixed list of last names and yields one
    ``UniversitiesItem`` for each detail page whose displayed name contains
    the searched last name as a whole word.
    """

    #name = 'University_of_Houston'
    name = 'uh2'
    allowed_domains = ['uh.edu']
    start_urls = ['http://www.uh.edu/directory/']

    def __init__(self, *args, **kwargs):
        # Forward everything to scrapy.Spider so spider arguments and the
        # base-class initialisation keep working.
        super(UniversityOfHouston, self).__init__(*args, **kwargs)
        self.last_name = ''

    def parse(self, response):
        """Issue one directory search request per last name."""
        # NOTE: the previous revision also opened 'kw.txt' here but never used
        # its contents; that dead read was removed so a missing file cannot
        # abort the crawl.
        for ln in ['Lee', 'Zhao']:
            last_name = ln.strip()
            # Kept for backward compatibility only. Callbacks run after this
            # loop has moved on, so the attribute cannot be trusted to
            # identify which search a response belongs to.
            self.last_name = last_name
            print('-----------------------------------------------------')
            print("scraping last name: ", last_name)
            query = (
                "http://www.uh.edu/directory/proxy.php?q=" + last_name +
                "&submit=Search&limit=250&loc=HR730&pos=faculty%7Cstaff"
                "&faculty=faculty&staff=staff&student=student"
            )
            # Carry the searched name with the request so every response is
            # matched against its own search term.
            yield scrapy.Request(query, callback=self.parse_staff,
                                 meta={'last_name': last_name})

    def parse_staff(self, response):
        """Yield one detail-page request per person on a search-result page."""
        last_name = response.meta.get('last_name', self.last_name)
        for href in response.xpath('//dt/a/@href').extract():
            query_proxy = ('http://www.uh.edu/directory/'
                           + href.replace("index.php", "proxy.php"))
            yield scrapy.Request(query_proxy, callback=self.parse_item,
                                 meta={'last_name': last_name})

    def parse_item(self, response):
        """Yield an item when the page's full name contains the searched name.

        Fields whose node is missing from the page are stored as ``None``.
        """
        full_name = response.xpath(
            '//h2[@class="single_title"]/text()').extract_first()
        if not full_name:
            return

        last_name = response.meta.get('last_name', self.last_name)
        if last_name not in full_name.split():
            return

        item = UniversitiesItem()
        item['fullname'] = full_name
        item['university'] = 'University of Houston'
        item['department'] = response.xpath(
            '//td/a[@class="org"]/text()').extract_first()
        item['title'] = response.xpath(
            '//tr/td[@class="title"]/text()').extract_first()
        item['email'] = response.xpath(
            '//td/a[@title="email address"]/text()').extract_first()
        item['phone'] = response.xpath(
            '//td[@class="tel"]/a/text()').extract_first()
        yield item
The issue is that the data is loaded by an AJAX call made from the web page, so it is not present in the HTML you get when you fetch the main page directly.
Change your `parse_staff` function to the version below and it should work: