I am trying to learn Scrapy and i'm learning on yelp website this LINK But when scrapy runs, it scrapes the same phone, addresses, over and over again instead of scraping different parts. The selector i have used is all the "li" tags that belong to a specific class for each restaurant of the page each li tag contain each restaurant information i have used appropriate selectors but scrapy gives me results repeated form only 2 or 3 restaurants. For some reason Scrapy is using same parts over and over again when it should skip over them as soon as they are completed in the for loop. Here is the code
try:
import scrapy
from urlparse import urljoin
except ImportError:
print "\nERROR IMPORTING THE NESSASARY LIBRARIES\n"
#scrapy.optional_features.remove('boto')
url = raw_input('ENTER THE SITE URL : ')
class YelpSpider(scrapy.Spider):
name = 'yelp spider'
start_urls = [url]
def parse(self, response):
SET_SELECTOR = '.regular-search-result'
#Going over each li tags containg each resturant belonging to this class
for yelp in response.css(SET_SELECTOR):
#getting a slector to get a link to scrape website info from another page
selector = '.indexed-biz-name a ::attr(href)'
#getting the complete url joining the extracted part
momo = urljoin(response.url, yelp.css(selector).extract_first())
#All the selectors
name = '.indexed-biz-name a span ::text'
services = '.category-str-list a ::text'
address1 = '.neighborhood-str-list ::text'
address2 = 'address ::text'
phone = '.biz-phone ::text'
# extracting them and adding them in a dict
try:
add1 = response.css(address1).extract_first().replace('\n','').replace('\n','')
add2 = response.css(address2).extract_first().replace('\n','').replace('\n','')
ADDRESS = add1 + ' ' + add2
pookiebanana = {
"PHONE": response.css(phone).extract_first().replace('\n','').replace('\t',''),
"NAME": response.css(name).extract_first().replace('\n','').replace('\t',''),
"SERVICES": response.css(services).extract_first().replace('\n','').replace('\t',''),
"ADDRESS": ADDRESS,
}
except:
pass
#Opening another page passing the old dict
Post = scrapy.Request(momo, callback=self.parse_yelp, meta={'item': pookiebanana})
#yielding the dict with the website scraped
yield Post
#Clicking the next button and recursively calling the same function with the same link
NEXT_PAGE_SELECTOR = '.u-decoration-none.next.pagination-links_anchor ::attr(href)'
next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
if next_page:
yield scrapy.Request(
response.urljoin(next_page),
callback=self.parse
)
def parse_yelp(self, response):
#Website selector opening a new page from the link we extracted
WEBSITE_SELECTOR = '.biz-website.js-add-url-tagging a ::text'
item = response.meta['item']
#inside the try block extracting the website info and returning the modified dict
try:
item['WEBSITE'] = ' '.join(response.css(WEBSITE_SELECTOR).extract_first().split(' '))
except:
pass
return item
i have commented in the code extensively about where i did what. What am i doing wrong ?
here is the output csv screenshot this shows the repetitions
HERE is the scrapy scraping output as you can see it scrapes the same thing over and over What is happening and what am i doing wrong ?
I can't test it but inside
for yelp
loop you should useyelp.css()
but you useresponse.css()