PHP cURL 跟随重定向(PHP Curl following redirects)

2019-06-23 21:57发布

我想耍点小聪明(sneaky),并把它作为学习过程的一部分,尝试提高我的网页抓取技能。

我遇到了一个至今还没能解决的问题:某些网站会先使用内部链接,然后再重定向到外部链接。

我想做的是修改一些代码,让 cURL 一直跟随重定向直到不再跳转,然后取得最终落脚的 URL。

谁能推荐一些代码给我吗?

我目前的代码如下,但它现在无法正确跟随重定向。

        // Request $url and let cURL follow any redirects on its own.
        $curl = curl_init();

        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);  // hand the response back as a string
        curl_setopt($curl, CURLOPT_HEADER, true);          // keep the response headers in that string
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);  // follow Location: redirects

        // $str holds the raw response (headers + body), or false on failure.
        $str = curl_exec($curl);
        curl_close($curl);

Answer 1:

http://php.net/manual/en/ref.curl.php

  /**
   * Resolve the URL a link finally lands on.
   *
   * Follows HTTP redirects through cURL, then falls back to reading the
   * Location header via get_headers() if cURL stopped on a 301/302, and
   * finally follows simple JavaScript redirects
   * (window.location.replace('...') / window.location="...") found in the body.
   *
   * @param string $url      URL to resolve; may contain HTML-escaped "&amp;".
   * @param int    $timeout  Connect and total timeout in seconds for each request.
   * @param int    $max_hops Maximum number of additional lookups (recursive calls)
   *                         allowed; guards against redirect loops.
   * @return string The last URL that could be determined.
   */
  function get_final_url( $url, $timeout = 5, $max_hops = 10 ) {
      $user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1";

      // Links scraped from HTML carry "&amp;" where the real URL has "&".
      $url = str_replace( "&amp;", "&", urldecode( trim( $url ) ) );

      // Hop budget exhausted: stop here instead of recursing forever on a loop.
      if ( $max_hops <= 0 ) {
          return $url;
      }

      $ch = curl_init();
      curl_setopt( $ch, CURLOPT_USERAGENT, $user_agent );
      curl_setopt( $ch, CURLOPT_URL, $url );
      // An empty cookie file name turns on in-memory cookie handling for this
      // handle without leaving a temporary file behind on every call.
      curl_setopt( $ch, CURLOPT_COOKIEFILE, "" );
      curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
      curl_setopt( $ch, CURLOPT_ENCODING, "" );
      curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
      curl_setopt( $ch, CURLOPT_AUTOREFERER, true );
      curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $timeout );
      curl_setopt( $ch, CURLOPT_TIMEOUT, $timeout );
      curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );
      $content  = curl_exec( $ch );
      $response = curl_getinfo( $ch );
      curl_close( $ch );

      // cURL can still end on a redirect status (e.g. redirect limit reached);
      // in that case read the Location header ourselves and continue from there.
      if ( $response['http_code'] == 301 || $response['http_code'] == 302 ) {
          // NOTE: this changes the user agent for all later stream requests in the process.
          ini_set( "user_agent", $user_agent );
          $headers = get_headers( $response['url'] );
          if ( is_array( $headers ) ) {
              foreach ( $headers as $value ) {
                  if ( stripos( $value, "location:" ) === 0 ) {
                      return get_final_url( trim( substr( $value, 9 ) ), $timeout, $max_hops - 1 );
                  }
              }
          }
      }

      // JavaScript redirects: capture only up to the closing quote so trailing
      // script on the same line is not swallowed into the URL.
      if ( is_string( $content )
          && ( preg_match( "/window\.location\.replace\('([^']+)'\)/i", $content, $value )
            || preg_match( "/window\.location\=\"([^\"]+)\"/i", $content, $value ) ) ) {
          return get_final_url( $value[1], $timeout, $max_hops - 1 );
      }

      return $response['url'];
  }


Answer 2:

如果您无法使用CURLOPT_FOLLOWLOCATION ,我建议你使用像这样的递归方法:

/**
 * Fetch a URL, following HTTP redirects by hand (for setups where
 * CURLOPT_FOLLOWLOCATION cannot be used).
 *
 * @param string $url   URL to request.
 * @param int    $count Number of redirects already followed; outside callers
 *                      start at 0 (the default).
 * @return string|false Body of the first non-redirect response, or false when
 *                      the request fails, the response cannot be split into
 *                      headers and body, a redirect carries no usable Location
 *                      header, or more than 5 redirects were followed.
 */
function getUrl($url, $count = 0) {

    // max number of redirects
    if ($count > 5) {
        return false;
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);          // headers are needed to read Location
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    $data = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    curl_close($ch);

    if (!$data) {
        return false;
    }

    // Headers and body are separated by the first blank line.
    $dataArray = explode("\r\n\r\n", $data, 2);

    if (count($dataArray) != 2) {
        return false;
    }

    list($header, $body) = $dataArray;

    // Anything that is not a redirect is the final answer.
    if (!in_array($httpCode, array(301, 302, 303, 307, 308))) {
        return $body;
    }

    // Header names are case-insensitive, and Location may be the last header
    // line (no trailing newline after the split), so anchor on line boundaries
    // rather than on a following "\n".
    $matches = array();
    if (preg_match('/^Location:[ \t]*(.*?)[ \t\r]*$/mi', $header, $matches)
        && $matches[1] !== '') {
        return getUrl($matches[1], $count + 1);
    }

    // Redirect status without a usable Location header.
    return false;
}


文章来源: PHP Curl following redirects
标签: php curl scrape