I created a spider in Scrapy. items.py:
from scrapy.item import Item, Field
class dns_shopItem(Item):
    """Item holding the values collected by the spider.

    Declares two fields, ``id`` and ``idd``.
    """

    # The field names are referenced as strings by the item loader, so they
    # are kept unchanged even though ``id`` shadows a builtin in this scope.
    id = Field()
    idd = Field()
dns_shop_spider.py:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader.processor import TakeFirst
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from dns_shop.items import dns_shopItem
class dns_shopLoader(XPathItemLoader):
    """Item loader that uses ``TakeFirst`` as its default output processor."""

    default_output_processor = TakeFirst()
class dns_shopSpider(CrawlSpider):
    """Crawl the file listing and load one item per matching page.

    Links whose URL matches the ``allow`` pattern are both followed and
    handed to :meth:`parse_item`.
    """

    name = "dns_shop_spider"
    allowed_domains = ["www.playground.ru"]
    start_urls = ["http://www.playground.ru/files/stalker_clear_sky/"]

    # One rule does both jobs. CrawlSpider applies only the first rule that
    # matches a link, so a follow-only rule placed ahead of a callback rule
    # with the same pattern keeps the callback from ever being invoked (and
    # therefore nothing is exported). The pattern is a regex matched against
    # URLs, so it must not contain spaces.
    rules = (
        Rule(
            SgmlLinkExtractor(allow='/files/s_t_a_l_k_e_r_chistoe_nebo'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build a ``dns_shopItem`` from the page's ``h1`` text.

        Args:
            response: The response delivered by the crawl rule.

        Returns:
            The item produced by the loader.
        """
        hxs = HtmlXPathSelector(response)
        loader = dns_shopLoader(dns_shopItem(), hxs)

        # NOTE(review): the ``tbody`` steps look like they were copied from a
        # browser inspector; browsers often insert ``tbody`` elements that are
        # absent from the raw HTML. Confirm these paths against the actual
        # response body -- if they match nothing, the item will be empty.
        heading_path = (
            "/html/body/table[2]/tbody/tr[5]/td[2]"
            "/table/tbody/tr/td/div[6]/h1/text()"
        )
        loader.add_xpath('id', heading_path)
        # "//" must be a single token; "/ /" is not valid XPath.
        loader.add_xpath('idd', "/" + heading_path)
        return loader.load_item()
I run it with the following command:
scrapy crawl dns_shop_spider -o scarped_data_utf8.csv -t csv
The log shows that Scrapy visits all the necessary URLs, but nothing is written to the specified file when the spider runs. What could be the problem?
Assuming you want to follow all links on the page http://www.playground.ru/files/stalker_clear_sky/ and get titles, urls and links for downloading:
Save it to
spider.py
and run via:
Then check
output.json.
Hope that helps.