Scrapy Image Pipeline: How to rename images?

2019-03-06 14:16发布

I have a spider that fetches both data and images. I want to rename each image using the corresponding 'title' that I'm scraping.

Following is my code:

spider1.py

from imageToFileSystemCheck.items import ImagetofilesystemcheckItem
import scrapy

class TestSpider(scrapy.Spider):
    """Search example.com for a fixed list of keywords and scrape detail pages.

    For every keyword a search request is issued; the first two result links
    of each search page are followed, and each detail page yields a dict with
    the page URL, its title and the URL of its main image (``image_urls`` is
    the key read by Scrapy's images pipeline).
    """

    name = 'imagecheck'

    def start_requests(self):
        """Yield one search request per keyword.

        The keyword travels with the request in ``meta['item']``.
        """
        searchterms = ['keyword1', 'keyword2']
        for term in searchterms:
            yield scrapy.Request(
                'http://www.example.com/s?=%s' % term,
                callback=self.parse,
                meta={'item': term},
            )

    def parse(self, response):
        """Follow at most the first two result links of a search page.

        Yields one request per link, handled by ``parse_info``; the search
        keyword is forwarded unchanged in ``meta['item']``.
        """
        item = response.meta.get('item')
        # Slice instead of indexing [0] and [1] so a page with fewer than two
        # matching links no longer raises IndexError.
        links = response.css("div.tt a.chek::attr(href)").extract()[:2]

        for link in links:
            # Resolve relative hrefs against the page URL; scrapy.Request
            # rejects URLs without a scheme.
            url = response.urljoin(link)
            print(url)
            yield scrapy.Request(
                url=url,
                callback=self.parse_info,
                meta={'item': item},
            )

    def parse_info(self, response):
        """Extract URL, title and image URL from a detail page.

        Yields a single dict with the keys ``url``, ``title`` and
        ``image_urls``. A missing title becomes an empty string (not the
        literal text "None"), and a missing image yields an empty
        ``image_urls`` list rather than ``[None]``.
        """
        title = response.xpath('//*[@id="Title"]/text()').extract_first(default='')
        img_src = response.xpath("//img[@id='images']/@src").extract_first()

        yield {
            'url': response.url,
            'title': title,
            # Only pass real, absolute URLs on to the images pipeline.
            'image_urls': [response.urljoin(img_src)] if img_src else [],
        }

items.py

import scrapy


class ImagetofilesystemcheckItem(scrapy.Item):
    """Item describing one scraped page together with its image references."""

    # Page URL and title, matching the keys emitted by the spider's
    # ``parse_info`` so the item can hold everything the spider scrapes.
    url = scrapy.Field()
    title = scrapy.Field()
    # URLs of the images to download (input of the images pipeline).
    image_urls = scrapy.Field()
    # Download results, filled in by the images pipeline.
    images = scrapy.Field()

pipelines.py

class ImagetofilesystemcheckPipeline(object):
    """No-op item pipeline: every item passes through untouched."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item

settings.py

# Project identity.
BOT_NAME = 'imageToFileSystemCheck'

# Where Scrapy looks for spiders, and where new ones are generated.
SPIDER_MODULES = ['imageToFileSystemCheck.spiders']
NEWSPIDER_MODULE = 'imageToFileSystemCheck.spiders'

# Enable the stock images pipeline and tell it where to store downloads.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/home/imageToFileSystemCheck/images/'

# Honour robots.txt rules.
ROBOTSTXT_OBEY = True

Could you please help me with the changes required so that Scrapy saves each scraped image as '<title>.jpg', where the title is the one scraped by the spider?

1条回答
Lonely孤独者°
2楼-- · 2019-03-06 14:27

Create a spider like this:

class ShopeeSpider(scrapy.Spider):
    """Example spider whose items carry an image-URL -> file-name mapping.

    ``custom_settings`` enables the custom image pipeline for this spider
    only and points ``IMAGES_STORE`` at ``_TEMP_IMAGES_STORE``.
    """

    _TEMP_IMAGES_STORE = "/home/crawler/scrapers/images"

    custom_settings = {
        'ITEM_PIPELINES': {
            'coszi.pipelines.CustomImagePipeline': 400,
        },  # this comma was missing, which made the dict a syntax error
        "IMAGES_STORE": _TEMP_IMAGES_STORE,
    }

    def parse(self, response):
        """Build an item with an ``images`` mapping and hand it to the pipelines.

        The mapping's keys are image URLs and its values are the file names
        the images should be saved under. The values below are placeholders
        and must be replaced with data extracted from ``response``.
        """
        data = {}

        data['images'] = {"image_link_here": "image_name_here"}

        # Without yielding, the item would never reach the image pipeline.
        yield data

Then your pipelines.py should look like this:

class CustomImagePipeline(ImagesPipeline):
    """Images pipeline that saves each download under a caller-chosen name.

    Items are expected to carry an ``images`` mapping of
    ``{image_url: file_name}``. Two further item keys are optional:

    * ``images_path`` -- a directory; an image whose file name already
      exists there is not requested again.
    * ``img_name_here`` -- a sub-folder the item's images are stored in.
    """

    def get_media_requests(self, item, info):
        """Yield a download request for every image not already on disk.

        The target file name and folder travel with each request in
        ``request.meta`` so that ``file_path`` can build the storage path.
        """
        if 'images' not in item:
            return

        # Both keys are optional: reading them with [] raised KeyError for
        # items that only provide the ``images`` mapping.
        existing_dir = item.get('images_path')
        folder = item.get('img_name_here') or ''

        # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only.
        for image_url, img_name in item['images'].items():
            if existing_dir and os.path.exists(os.path.join(existing_dir, img_name)):
                continue
            yield scrapy.Request(
                url=image_url,
                meta={
                    'img_name': img_name,
                    'this_prod_img_folder': folder,
                },
                dont_filter=True,
            )

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path for the image fetched by ``request``.

        The path is ``<folder>/<img_name>`` taken from ``request.meta``. If
        the spider defines ``CRAWLER_IMAGES_STORE`` it is used as a prefix, as
        before; otherwise the path is relative and Scrapy resolves it against
        the ``IMAGES_STORE`` setting. ``item`` is accepted (and ignored)
        because newer Scrapy releases pass it to ``file_path``.
        """
        # The spider attribute is optional; reading it directly raised
        # AttributeError for spiders that only configure IMAGES_STORE.
        spider = getattr(info, 'spider', None)
        base_dir = getattr(spider, 'CRAWLER_IMAGES_STORE', '')
        return os.path.join(
            base_dir,
            request.meta.get('this_prod_img_folder') or '',
            request.meta['img_name'],
        )
查看更多
登录 后发表回答