I am using the following code to scrape this website (http://profiles.ehs.state.ma.us/Profiles/Pages/ChooseAPhysician.aspx?Page=1); however, I get the following TypeError:
"File "C:\Users\Anaconda2\lib\site-packages\scrapy\contrib\spiders\crawl.py", line 83, in _compile_rules self._rules = [copy.copy(r) for r in self.rules] TypeError: 'Rule' object is not iterable"
Line 83 is inside Scrapy's own crawl.py, not my script, so I'm wondering if anyone has ideas on how to resolve the issue. I'm using Python 2.7 on Windows.
Thanks!
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector

class MdiMassSpider(CrawlSpider):
    name = "MdiMass"
    allowed_domains = ["http://profiles.ehs.state.ma.us/Profiles/Pages/FindAPhysician.aspx"]
    start_urls = ["http://profiles.ehs.state.ma.us/Profiles/Pages/ChooseAPhysician.aspx?Page=1"]
    driver = webdriver.Chrome()
    rules = (Rule(LinkExtractor(allow=(".*http://profiles.ehs.state.ma.us/Profiles/Pages/PhysicianProfile.aspx?PhysicianID=.*,"))))
    # all pages to scrape follow the same pattern:
    # http://profiles.ehs.state.ma.us/Profiles/Pages/PhysicianProfile.aspx?PhysicianID=XXXX

    def __init__(self):
        CrawlSpider.__init__(self)
        self.driver = webdriver.Chrome()

    #def __del__(self):
    #    self.selenium.quit()
    #    print self.verificationErrors

    def parse(self, response):
        self.driver.get('http://profiles.ehs.state.ma.us/Profiles/Pages/ChooseAPhysician.aspx?Page=1')

    def parse(self):
        select = Select(driver.find_element_by_xpath("//select[@id=\"ctl00_ContentPlaceHolder1_cmbDistance\"]"))
        print select.options
        print [o.text for o in select.options]
        select.select_by_visible_text("15")
        zip = driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_txtZip\"]")
        zip.send_keys("02109")
        prim_care_chekbox = driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_SpecialtyGroupsCheckbox_6\"]")
        prim_care_chekbox.click()
        find_phy_button = driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_btnSearch\"]")
        find_phy_button.click()
        for sel in response.xpath("//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr[2]/td[1]/a"):
            item = MdiMassItem()
            item["phy_name"] = sel.xpaths("//*[@id=\"content\"]/center/p[1]").extract()
            item["lic_status"] = driver.find_elements_by_xpaths("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[2]/td[2]/a[1]").extract()
            item["lic_issue_date"] = driver.find.elements_by_xpaths("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[3]/td[2]").extract()
            item["prim_worksetting"] = driver.find.elements_by_xpaths("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[5]/td[2]").extract()
            item["npi"] = driver.find_elements_by_xpaths("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[2]/table/tbody/tr[6]/td[2]").extract()
            item["Med_sch_grad_date"] = driver.find_elements_by_xpaths("//*[@id=\"content\"]/center/table[3]/tbody/tr[3]/td/table/tbody/tr[2]/td[2]").extract()
            item["Area_of_speciality"] = driver.find_elements_by_xpaths("//*[@id=\"content\"]/center/table[4]/tbody/tr[3]/td/table/tbody/tr/td[2]").extract()
            item["link"] = driver.find_element_by_xpath("//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr[2]/td[1]/a").extract()
            print item
Edited Code:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector

class MdiMassSpider(CrawlSpider):
    name = "MdiMass"
    allowed_domains = ["http://profiles.ehs.state.ma.us/Profiles/Pages/FindAPhysician.aspx"]
    start_urls = ["http://profiles.ehs.state.ma.us/Profiles/Pages/ChooseAPhysician.aspx?Page=1"]
    rules = (Rule(LinkExtractor(allow=(".*http://profiles.ehs.state.ma.us/Profiles/Pages/PhysicianProfile.aspx?PhysicianID=.*"))),)
    # all pages to scrape follow the same pattern:
    # http://profiles.ehs.state.ma.us/Profiles/Pages/PhysicianProfile.aspx?PhysicianID=XXXX

    def __init__(self):
        CrawlSpider.__init__(self)
        self.driver = webdriver.Chrome()
        self.driver.get('http://profiles.ehs.state.ma.us/Profiles/Pages/ChooseAPhysician.aspx?Page=1')

    def parse(self, response):
        driver = self.driver
        select = Select(self.driver.find_element_by_xpath("//select[@id=\"ctl00_ContentPlaceHolder1_cmbDistance\"]"))
        print select.options
        print [o.text for o in select.options]
        select.select_by_visible_text("15")
        zip = self.driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_txtZip\"]")
        zip.send_keys("02109")
        prim_care_chekbox = self.driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_SpecialtyGroupsCheckbox_6\"]")
        prim_care_chekbox.click()
        find_phy_button = self.driver.find_element_by_xpath("//*[@id=\"ctl00_ContentPlaceHolder1_btnSearch\"]")
        find_phy_button.click()
        for sel in response.xpath("//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr[2]/td[1]/a"):
            item = MdiMassItem()
            item["phy_name"] = sel.xpaths("//*[@id=\"content\"]/center/p[1]").extract()
            item["lic_status"] = driver.find_elements_by_xpaths("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[2]/td[2]/a[1]").extract()
            item["lic_issue_date"] = driver.find.elements_by_xpaths("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[3]/td[2]").extract()
            item["prim_worksetting"] = driver.find.elements_by_xpaths("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[1]/table/tbody/tr[5]/td[2]").extract()
            item["npi"] = driver.find_elements_by_xpaths("//*[@id=\"content\"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr/td[2]/table/tbody/tr[6]/td[2]").extract()
            item["Med_sch_grad_date"] = driver.find_elements_by_xpaths("//*[@id=\"content\"]/center/table[3]/tbody/tr[3]/td/table/tbody/tr[2]/td[2]").extract()
            item["Area_of_speciality"] = driver.find_elements_by_xpaths("//*[@id=\"content\"]/center/table[4]/tbody/tr[3]/td/table/tbody/tr/td[2]").extract()
            item["link"] = driver.find_element_by_xpath("//*[@id=\"PhysicianSearchResultGrid\"]/tbody/tr[2]/td[1]/a").extract()
            print item
Python log output:
C:\Users\Anaconda2\MdiMass>scrapy crawl MdiMass -o items.csv
2015-02-26 01:11:47-0500 [scrapy] INFO: Scrapy 0.24.4 started (bot: MdiMass)
2015-02-26 01:11:47-0500 [scrapy] INFO: Optional features available: ssl, http11, boto
2015-02-26 01:11:47-0500 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'MdiMass.spiders', 'FEED_FORMAT': 'csv', 'SPIDER_MODULES': ['MdiMass.spiders'], 'FEED_URI': 'items.csv', 'BOT_NAME': 'MdiMass'}
2015-02-26 01:11:47-0500 [scrapy] INFO: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2015-02-26 01:11:49-0500 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-02-26 01:11:49-0500 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-02-26 01:11:49-0500 [scrapy] INFO: Enabled item pipelines: CsvWriterPipeline
2015-02-26 01:11:49-0500 [MdiMass] INFO: Spider opened
2015-02-26 01:11:49-0500 [MdiMass] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-02-26 01:11:49-0500 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2015-02-26 01:11:49-0500 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
2015-02-26 01:11:49-0500 [MdiMass] DEBUG: Redirecting (302) to <GET http://profiles.ehs.state.ma.us/Profiles/Pages/FindAPhysician.aspx> from <GET http://profiles.ehs.state.ma.us/Profiles/Pages/ChooseAPhysician.aspx?Page=1>
2015-02-26 01:11:49-0500 [MdiMass] DEBUG: Crawled (200) <GET http://profiles.ehs.state.ma.us/Profiles/Pages/FindAPhysician.aspx> (referer: None)
[<selenium.webdriver.remote.webelement.WebElement object at 0x0493B210>, <selenium.webdriver.remote.webelement.WebElement object at 0x0493B0B0>, <selenium.webdriver.remote.webelement.WebElement object at 0x0493B1F0>, <selenium.webdriver.remote.webelement.WebElement object at 0x0493B110>, <selenium.webdriver.remote.webelement.WebElement object at 0x0493B150>, <selenium.webdriver.remote.webelement.WebElement object at 0x0493B170>]
[u'', u'5', u'10', u'15', u'30', u'50']
2015-02-26 01:11:50-0500 [MdiMass] INFO: Closing spider (finished)
2015-02-26 01:11:50-0500 [MdiMass] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 575,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 19312,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/302': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 2, 26, 6, 11, 50, 150000),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2015, 2, 26, 6, 11, 49, 134000)}
2015-02-26 01:11:50-0500 [MdiMass] INFO: Spider closed (finished)
The problem is in the following line:

rules = (Rule(LinkExtractor(allow=(".*http://profiles.ehs.state.ma.us/Profiles/Pages/PhysicianProfile.aspx?PhysicianID=.*,"))))

You mispositioned a comma. The correct code is:

rules = (Rule(LinkExtractor(allow=(".*http://profiles.ehs.state.ma.us/Profiles/Pages/PhysicianProfile.aspx?PhysicianID=.*"))),)

With the comma inside the string, the outer parentheses just wrap a single Rule object. Moving the comma outside makes rules a one-element tuple, and therefore iterable.
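To see the difference, here is a minimal sketch in plain Python (the Rule class below is just a stand-in for Scrapy's, for illustration only):

class Rule(object):      # stand-in for scrapy's Rule, not the real class
    pass

rules = (Rule())         # parentheses alone do NOT make a tuple; this is a bare Rule
#for r in rules:         # would raise TypeError: 'Rule' object is not iterable
#    pass

rules = (Rule(),)        # the trailing comma is what creates a one-element tuple
for r in rules:          # iterating, as CrawlSpider._compile_rules does, now works
    print r

This is exactly what CrawlSpider trips over at crawl.py line 83: it loops over self.rules, which fails when rules is a single Rule rather than a tuple or list of Rule objects.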
A good definition of iterators is given here: (Build a Basic Python Iterator)
Iterator objects in Python conform to the iterator protocol, which basically means they provide two methods: __iter__() and next(). The __iter__() method returns the iterator object and is implicitly called at the start of loops. The next() method returns the next value and is implicitly called at each loop increment. next() raises a StopIteration exception when there are no more values to return, which is implicitly captured by looping constructs to stop iterating.
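For example, here is a minimal iterator following that protocol (a sketch in Python 2 syntax, to match the next() naming above; the Countdown class is hypothetical):

class Countdown(object):
    """Iterates n, n-1, ..., 1, then stops."""
    def __init__(self, n):
        self.n = n
    def __iter__(self):
        return self              # implicitly called at the start of a for loop
    def next(self):              # implicitly called at each loop increment
        if self.n <= 0:
            raise StopIteration  # captured by the for loop to stop iterating
        value = self.n
        self.n -= 1
        return value

for i in Countdown(3):
    print i                      # prints 3, then 2, then 1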