Can't get desired results using try/except clause

Posted 2020-04-16 05:55

I've written a script in scrapy to make proxied requests using newly generated proxies from the get_proxies() method. I used the requests module to fetch the proxies in order to reuse them in the script. What I'm trying to do is parse all the movie links from its landing page and then fetch the name of each movie from its target page. The following script can rotate proxies.

I know there is an easier way to change proxies, like the one described here: HttpProxyMiddleware, but I would still like to stick to the approach I'm trying here.

website link

This is my current attempt (it keeps using new proxies to fetch a valid response, but every time it gets 503 Service Unavailable):

import scrapy
import random
import requests
from itertools import cycle
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess

def get_proxies():   
    response = requests.get("https://www.us-proxy.org/")
    soup = BeautifulSoup(response.text,"lxml")
    proxy = [':'.join([item.select_one("td").text,item.select_one("td:nth-of-type(2)").text]) for item in soup.select("table.table tbody tr") if "yes" in item.text]
    return proxy

class ProxySpider(scrapy.Spider):
    name = "proxiedscript"
    handle_httpstatus_list = [503]
    proxy_vault = get_proxies()
    check_url = "https://yts.am/browse-movies"

    def start_requests(self):
        random.shuffle(self.proxy_vault)
        proxy_url = next(cycle(self.proxy_vault))
        request = scrapy.Request(self.check_url,callback=self.parse,dont_filter=True)
        request.meta['https_proxy'] = f'http://{proxy_url}'
        yield request

    def parse(self,response):
        print(response.meta)
        if "DDoS protection by Cloudflare" in response.css(".attribution > a::text").get():
            random.shuffle(self.proxy_vault)
            proxy_url = next(cycle(self.proxy_vault))
            request = scrapy.Request(self.check_url,callback=self.parse,dont_filter=True)
            request.meta['https_proxy'] = f'http://{proxy_url}'
            yield request

        else:
            for item in response.css(".browse-movie-wrap a.browse-movie-title::attr(href)").getall():
                nlink = response.urljoin(item)
                yield scrapy.Request(nlink,callback=self.parse_details)

    def parse_details(self,response):
        name = response.css("#movie-info h1::text").get()
        yield {"Name":name}

if __name__ == "__main__":
    c = CrawlerProcess({'USER_AGENT':'Mozilla/5.0'})
    c.crawl(ProxySpider)
    c.start()

To check whether the request is being proxied, I printed response.meta and got results like this: {'https_proxy': 'http://142.93.127.126:3128', 'download_timeout': 180.0, 'download_slot': 'yts.am', 'download_latency': 0.237013578414917, 'retry_times': 2, 'depth': 0}.

Because I've overused the link while checking how proxied requests work within scrapy, I'm getting the 503 Service Unavailable error at this moment, and I can see the keyword DDoS protection by Cloudflare within the response. However, I get a valid response when I try the requests module, applying the same logic I implemented here.
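For reference, a minimal sketch of the requests-based rotation logic I'm referring to (the proxy handling and retry details here are illustrative, not my exact script):

import random
import requests

def fetch_with_rotation(url, proxies, max_tries=10):
    # Try proxies one by one until a request succeeds or we run out of attempts.
    random.shuffle(proxies)
    for proxy in proxies[:max_tries]:
        try:
            response = requests.get(
                url,
                proxies={"http": f"http://{proxy}", "https": f"http://{proxy}"},
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=10,
            )
            if response.status_code == 200:
                return response
        except requests.exceptions.RequestException:
            # Connection error, timeout, etc. -> move on to the next proxy
            continue
    return None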

My earlier question: why can't I get a valid response when (I suppose) I'm using proxies in the right way? [solved]

Bounty question: how can I define a try/except clause within my script so that it tries a different proxy once a certain proxy throws a connection error?

2 Answers
Bombasti
Answered 2020-04-16 06:19

The start_requests() function is just the entry point. On subsequent requests, you would need to resupply this metadata to the Request object.

Also, errors can occur on two levels: the proxy and the target server.

We need to handle bad response codes from both the proxy and the target server. Proxy errors are returned by the middleware to the errback function. The target server's response can be handled during parsing via response.status.

import scrapy
import random
import requests
from itertools import cycle
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess


def get_proxies():
    response = requests.get("https://www.us-proxy.org/")
    soup = BeautifulSoup(response.text, "lxml")
    proxy = [':'.join([item.select_one("td").text, item.select_one("td:nth-of-type(2)").text]) for item in
             soup.select("table.table tbody tr") if "yes" in item.text]
    # proxy = ['https://52.0.0.1:8090', 'https://52.0.0.2:8090']
    return proxy


def get_random_proxy(proxy_vault):
    random.shuffle(proxy_vault)
    proxy_url = next(cycle(proxy_vault))
    return proxy_url


class ProxySpider(scrapy.Spider):
    name = "proxiedscript"
    handle_httpstatus_list = [503, 502, 401, 403]
    check_url = "https://yts.am/browse-movies"
    proxy_vault = get_proxies()

    def handle_middleware_errors(self, failure):
        # download-level errors (e.g. the proxy refused the connection) arrive here as a Failure
        print('Middleware Error')
        # retry the original request with a different proxy
        yield self.make_request(url=failure.request.url, callback=failure.request.meta['callback'])

    def start_requests(self):
        yield self.make_request(url=self.check_url, callback=self.parse)

    def make_request(self, url, callback, dont_filter=True):
        return scrapy.Request(url,
                              meta={'proxy': f'https://{get_random_proxy(self.proxy_vault)}', 'callback': callback},
                              callback=callback,
                              dont_filter=dont_filter,
                              errback=self.handle_middleware_errors)

    def parse(self, response):
        print(response.meta)
        try:
            if response.status != 200:
                # implement server status code handling here - this loops forever
                print(f'Status code: {response.status}')
                raise Exception(f'Bad status code: {response.status}')
            else:
                for item in response.css(".browse-movie-wrap a.browse-movie-title::attr(href)").getall():
                    nlink = response.urljoin(item)
                    yield self.make_request(url=nlink, callback=self.parse_details)
        except Exception:
            # if anything goes wrong fetching the lister page, try again
            yield self.make_request(url=self.check_url, callback=self.parse)

    def parse_details(self, response):
        print(response.meta)
        try:
            if response.status != 200:
                # implement server status code handling here - this loops forever
                print(f'Status code: {response.status}')
                raise Exception(f'Bad status code: {response.status}')
            name = response.css("#movie-info h1::text").get()
            yield {"Name": name}
        except Exception:
            # if anything goes wrong fetching the detail page, try again
            yield self.make_request(url=response.request.url, callback=self.parse_details)


if __name__ == "__main__":
    c = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    c.crawl(ProxySpider)
    c.start()
看我几分像从前
Answered 2020-04-16 06:20

According to the scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware docs (and source), the proxy meta key is expected to be used (not https_proxy):

#request.meta['https_proxy'] = f'http://{proxy_url}'  
request.meta['proxy'] = f'http://{proxy_url}'

Since Scrapy didn't receive a valid meta key, your scrapy application didn't actually use the proxies.
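Applied to your spider, a sketch of start_requests with just that one key changed (everything else as in your question):

    def start_requests(self):
        random.shuffle(self.proxy_vault)
        proxy_url = next(cycle(self.proxy_vault))
        request = scrapy.Request(self.check_url, callback=self.parse, dont_filter=True)
        request.meta['proxy'] = f'http://{proxy_url}'  # 'proxy' instead of 'https_proxy'
        yield request

The same substitution applies to the retry request you build inside parse().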
