How to download images from a site with phantomjs

2019-02-01 02:34发布

问题:

I want to save some images from a site. At the moment I can get the paths to the images, but I have no clue how to fetch and save the images with PhantomJS.

# Collects the `src` of every image matched by `.rotate img` and logs each one.
# Saving the images is the open part of the question and is left as a placeholder.
findRotationTeaserImages = ->
  # The callback runs inside the web page, so `jQuery` must already be loaded
  # there (assumption; the page itself is not shown here).
  paths = page.evaluate ->
    jQuery('.rotate img').map(-> return this.src).get()

  for path, i in paths
    console.log(path)
    # save the image

回答1:

I know this is an old question, but you can do this pretty simply by storing the dimensions and location of each image on the page in an object, then altering the PhantomJS page.clipRect so that the page.render() method renders only the area where the image is. Here is an example, scraping multiple images from http://dribbble.com/ :

// Renders each image matched by `.dribbble-img img` on the Dribbble front page
// into its own PNG by clipping page.render() to that image's bounding box.
var page = require('webpage').create();

page.open('http://dribbble.com/', function(status) {

    // Nothing can be measured or rendered when the page failed to load,
    // so report the failure and stop with a non-zero exit code.
    if (status !== 'success') {
        console.log('Unable to load http://dribbble.com/ (status: ' + status + ')');
        phantom.exit(1);
        return;
    }

    // jQuery is injected into the page because the evaluate() callback below relies on `$`.
    page.includeJs('//ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js', function() {

        // Runs inside the page: returns one {top, left, width, height} object
        // per matched image, which is the shape page.clipRect expects.
        var images = page.evaluate(function() {
            var images = [];
            function getImgDimensions($i) {
                // Read the offset once instead of recomputing it per coordinate.
                var offset = $i.offset();
                return {
                    top : offset.top,
                    left : offset.left,
                    width : $i.width(),
                    height : $i.height()
                };
            }
            $('.dribbble-img img').each(function() {
                images.push(getImgDimensions($(this)));
            });

            return images;
        });

        // Move the clip rectangle over each image in turn and render only that area.
        images.forEach(function(imageObj, index) {
            page.clipRect = imageObj;
            page.render('images/' + index + '.png');
        });

        phantom.exit();
    });
});


回答2:

There is now another way to do this.

var fs = require("fs");

// Draws an image onto an off-screen canvas inside the page and returns the
// base64 payload of the resulting PNG data URL.
// NOTE(review): `img` is not defined in this snippet; it presumably has to
// exist as a global in the page context - confirm before use.
var imageBase64 = page.evaluate(function () {
  var surface = document.createElement("canvas");
  surface.width = img.width;
  surface.height = img.height;
  surface.getContext("2d").drawImage(img, 0, 0);
  var dataUrl = surface.toDataURL("image/png");
  // Keep only the part after the "data:image/png;base64," prefix.
  return dataUrl.split(",")[1];
});

// Decode the base64 text and write it out in binary mode.
fs.write("file.png", atob(imageBase64), 'wb');


回答3:

Solved this by starting a child process running a Node script that downloads the images:

phantomJs script:

# Collects the rotation teaser image URLs from the page and passes them to the
# Node.js download script, one URL per command-line argument.
findRotationTeaserImages = ->
  # Runs inside the web page; `jQuery` must be available there.
  paths = page.evaluate ->
    jQuery('.rotate img').map(-> return this.src).get()

  # Build the argument list directly. Joining and re-splitting on spaces
  # produced a stray empty argument when no image matched and would break any
  # path that contains a space.
  args = ['loadRotationTeaser.js'].concat(paths)

  child_process.execFile("node", args, null, (err, stdout, stderr) ->
    # Surface a failure of the download script instead of ignoring it.
    console.log('loadRotationTeaser.js failed: ' + stderr) if err
    phantom.exit(if err then 1 else 0)
  )

nodeJs script

# Downloads every URL given on the command line to
# public/images/rotationTeaser/img<index>.jpeg.
http = require('http-get')

# slice (not splice) returns the same arguments without mutating process.argv.
args = process.argv.slice(2)

for path, i in args
  http.get path, 'public/images/rotationTeaser/img' + i + '.jpeg', (error, result) ->
    # Report failed downloads instead of swallowing them silently.
    console.error(error) if error


回答4:

In case image dimensions are known:



    var webPage = require('webpage');

    /**
     * Download image with known dimension.
     *
     * Opens the image URL in a fresh page, clips the page to the given size
     * and renders that area to `dest`. The page is always closed, even when
     * the callback throws.
     *
     * @param src   Image source
     * @param dest  Destination full path
     * @param width Image width
     * @param height    Image height
     * @param timeout   Operation timeout
     * @param cbk   Callback (optional), called as cbk(success, cbkParam)
     * @param cbkParam  Parameter to pass back to the callback (optional)
     */
    function downloadImg(src, dest, width, height, timeout, cbk, cbkParam) {
        var page = webPage.create();

        page.settings.resourceTimeout = timeout; //resources loading timeout(ms)
        page.settings.webSecurityEnabled = false; //Disable web security
        page.settings.XSSAuditingEnabled = false; //Disable web security

        page.open(src, function(status) {
            try {
                // missing images sometime receive text from server
                var success = status === 'success' && !page.plainText;

                if (success) {
                    page.clipRect = {
                        top: 0,
                        left: 0,
                        width: width,
                        height: height
                    };
                    page.render(dest);
                }

                if (cbk) {
                    cbk(success, cbkParam);
                }
            } finally {
                // Release the page even if rendering or the callback threw.
                page.close();
            }
        });
    }




回答5:

I've run into a lot of trouble when using the render method. Luckily, I finally came up with two better solutions. Here is the code I used in my project. The first solution has some trouble updating the cookies, so it does not work well when fetching a captcha image. Both methods cause a new HTTP request, but with a few modifications the second one can omit that request.

The first one fetches the cookie from phantomJs and makes a new http request using request. The second one uses base64 to pass the image.

 async download(download_url, stream) {
    logger.profile(`download(download_url='${download_url}')`);
    let orig_url = await this.page.property('url');
    download_url = url.resolve(orig_url, download_url);
    let cookies = await this.page.property('cookies');
    let jar = request.jar();
    for (let cookie of cookies) {
        if (cookie.name !== undefined) {
            cookie.key = cookie.name;
            delete cookie.name;
        }
        if (cookie.httponly !== undefined) {
            cookie.httpOnly = cookie.httponly;
            delete cookie.httponly;
        }
        if (cookie.expires !== undefined)
            cookie.expires = new Date(cookie.expires);
        jar.setCookie(new Cookie(cookie), download_url, {ignoreError: true});
    }
    let req = request({
        url: download_url,
        jar: jar,
        headers: {
            'User-Agent': this.user_agent,
            'Referer': orig_url
        }
    });
    await new Promise((resolve, reject) => {
        req.pipe(stream)
            .on('close', resolve)
            .on('error', reject);
    });
    // Due to this issue https://github.com/ariya/phantomjs/issues/13409, we cannot set cookies back
    // to browser. It is said to be redesigned, but till now (Mar 31 2017), no change has been made.
    /*await Promise.all([
        new Promise((resolve, reject) => {
            req.on('response', () => {
                jar._jar.store.getAllCookies((err, cookies) => {
                    if (err) {
                        reject(err);
                        return;
                    }
                    cookies = cookies.map(x => x.toJSON());
                    for (let cookie of cookies) {
                        if (cookie.key !== undefined) {
                            cookie.name = cookie.key;
                            delete cookie.key;
                        }
                        if (cookie.httpOnly !== undefined) {
                            cookie.httponly = cookie.httpOnly;
                            delete cookie.httpOnly;
                        }
                        if (cookie.expires instanceof Date) {
                            cookie.expires = cookie.expires.toGMTString();
                            cookie.expiry = cookie.expires.toTime();
                        }
                        else if (cookie.expires == Infinity)
                            delete cookie.expires;
                        delete cookie.lastAccessed;
                        delete cookie.creation;
                        delete cookie.hostOnly;
                    }
                    this.page.property('cookies', cookies).then(resolve).catch(reject);
                });
            }).on('error', reject);
        }),
        new Promise((resolve, reject) => {
            req.pipe(fs.createWriteStream(save_path))
                .on('close', resolve)
                .on('error', reject);
        })
    ]);*/
    logger.profile(`download(download_url='${download_url}')`);
}
async download_image(download_url, stream) {
    logger.profile(`download_image(download_url='${download_url}')`);
    await Promise.all([
        new Promise((resolve, reject) => {
            this.client.once('donwload image', data => {
                if (data.err)
                    reject(err);
                else
                    stream.write(Buffer.from(data.data, 'base64'), resolve);

            });
        }),
        this.page.evaluate(function (url) {
            var img = new Image(), callback = function (err, data) {
                callPhantom({
                    event: 'donwload image',
                    data: {
                        err: err && err.message,
                        data: data
                    }
                });
            };
            img.onload = function () {
                var canvas = document.createElement("canvas");
                canvas.width = img.width;
                canvas.height = img.height;
                canvas.getContext("2d").drawImage(img, 0, 0);
                callback(null, canvas.toDataURL("image/png").replace(/^data:image\/(png|jpg);base64,/, ""));
            };
            img.onerror = function () {
                callback(new Error('Failed to fetch image.'));
            };
            img.src = url;
        }, download_url)
    ]);
    logger.profile(`download_image(download_url='${download_url}')`);
}