I want to save some images from a site. At the moment I can get the paths to the images, but I have no clue how to fetch and save the images with PhantomJS.
# Collects the `src` of every `<img>` under `.rotate` in the loaded page and
# logs each one. Uses the surrounding script's `page` object and the jQuery
# instance available inside the page.
findRotationTeaserImages = ->
  # The callback runs inside the page; only its return value comes back here.
  paths = page.evaluate ->
    jQuery('.rotate img').map(-> return this.src).get()

  for path, i in paths
    console.log(path);
    # save the image
I know this is an old question, but you can do this pretty simply by storing the dimensions and location of each image on the page in an object, then altering the PhantomJS page.clipRect so that the page.render() method renders only the area where the image is. Here is an example, scraping multiple images from http://dribbble.com/ :
// Renders each matching image on the page to its own PNG by clipping the
// page render to that image's bounding box.
var page = require('webpage').create();
page.open('http://dribbble.com/', function(status) {
    // Stop early when the page failed to load; there is nothing to measure.
    if (status !== 'success') {
        console.log('Unable to load the page: ' + status);
        phantom.exit(1);
        return;
    }
    // jQuery is injected so the in-page code below can use offset()/width()/height().
    page.includeJs('//ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js', function() {
        // Runs inside the page: collect position and size for every target image.
        var images = page.evaluate(function() {
            var images = [];
            function getImgDimensions($i) {
                var offset = $i.offset();
                return {
                    top : offset.top,
                    left : offset.left,
                    width : $i.width(),
                    height : $i.height()
                };
            }
            $('.dribbble-img img').each(function() {
                images.push(getImgDimensions($(this)));
            });
            return images;
        });
        // Each rectangle has exactly the shape clipRect expects, so it is
        // assigned directly before rendering.
        images.forEach(function(imageObj, index) {
            page.clipRect = imageObj;
            page.render('images/' + index + '.png');
        });
        phantom.exit();
    });
});
There is now another way to do this.
// Saves one image from the loaded page by drawing it onto a canvas inside the
// page, returning the PNG data as base64, and writing the decoded bytes to disk.
var fs = require("fs");
// CSS selector of the image to save; adjust it to the target page.
var imageSelector = "img";
var imageBase64 = page.evaluate(function(selector) {
    // This function runs inside the page and cannot see variables of the
    // outer script, so the image element is looked up here.
    var img = document.querySelector(selector);
    if (!img) {
        return null;
    }
    var canvas = document.createElement("canvas");
    // drawImage(img, 0, 0) paints at the intrinsic size, so the canvas is
    // sized from naturalWidth/naturalHeight rather than the layout size.
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    var ctx = canvas.getContext("2d");
    ctx.drawImage(img, 0, 0);
    // NOTE(review): toDataURL presumably throws for cross-origin images unless
    // web security is disabled -- confirm for the target site.
    return canvas.toDataURL("image/png").split(",")[1];
}, imageSelector);
if (imageBase64) {
    // 'wb' writes the decoded string as raw bytes.
    fs.write("file.png", atob(imageBase64), 'wb');
} else {
    console.log('No image matched selector: ' + imageSelector);
}
I solved this by starting a child process that runs a Node script which downloads the images:
phantomJs script:
# Collects the `src` of every `<img>` under `.rotate` and hands the list to
# `loadRotationTeaser.js`, run with `node` in a child process. PhantomJS exits
# once the child process has finished.
findRotationTeaserImages = ->
  paths = page.evaluate ->
    jQuery('.rotate img').map(-> return this.src).get()

  # Build the argument list directly: joining and re-splitting on spaces adds
  # an empty argument when `paths` is empty and breaks any path with a space.
  args = ['loadRotationTeaser.js'].concat(paths)
  child_process.execFile "node", args, null, (err, stdout, stderr) ->
    # Report a failed download run instead of exiting silently.
    console.log('loadRotationTeaser.js failed: ' + err) if err
    phantom.exit()
nodeJs script
# Downloads every URL given on the command line to
# public/images/rotationTeaser/img<index>.jpeg.
http = require('http-get');
# slice (not splice) so process.argv itself is left untouched.
args = process.argv.slice(2)
for path, i in args
  http.get path, 'public/images/rotationTeaser/img' + i + '.jpeg', (error, result) ->
    # Report failed downloads instead of dropping the error.
    console.error(error) if error
In case image dimensions are known:
var webPage = require('webpage');
/**
 * Download image with known dimension.
 *
 * Opens `src` in a fresh page, clips the page to `width` x `height` at the
 * top-left corner and renders that area to `dest`. The page is always closed,
 * even if rendering or the callback throws.
 *
 * @param src Image source
 * @param dest Destination full path
 * @param width Image width
 * @param height Image height
 * @param timeout Operation timeout (ms), used as the resource timeout
 * @param cbk Callback (optional), called as cbk(success, cbkParam)
 * @param cbkParam Parameter to pass back to the callback (optional)
 */
function downloadImg(src, dest, width, height, timeout, cbk, cbkParam) {
    var page = webPage.create();
    page.settings.resourceTimeout = timeout; // resources loading timeout (ms)
    page.settings.webSecurityEnabled = false; // disable web security
    page.settings.XSSAuditingEnabled = false; // disable XSS auditing
    page.open(src, function(status) {
        try {
            // missing images sometimes receive text from the server, so a page
            // with plain text content is treated as a failure
            var success = status === 'success' && !page.plainText;
            if (success) {
                page.clipRect = {
                    top: 0,
                    left: 0,
                    width: width,
                    height: height
                };
                page.render(dest);
            }
            if (typeof cbk === 'function') {
                cbk(success, cbkParam);
            }
        } finally {
            // release the page even when render() or the callback throws
            page.close();
        }
    });
}
I've run into a lot of trouble when using the render
method. Luckily, I finally came up with two better solutions. Here is the code I used in my project. The first solution has trouble updating the cookies, so it does not work well when fetching a captcha image. Both methods cause a new HTTP request, but with a few modifications the second one can omit that request.
The first one fetches the cookies from PhantomJS
and makes a new HTTP request using the request
library. The second one uses base64
to pass the image.
async download(download_url, stream) {
logger.profile(`download(download_url='${download_url}')`);
let orig_url = await this.page.property('url');
download_url = url.resolve(orig_url, download_url);
let cookies = await this.page.property('cookies');
let jar = request.jar();
for (let cookie of cookies) {
if (cookie.name !== undefined) {
cookie.key = cookie.name;
delete cookie.name;
}
if (cookie.httponly !== undefined) {
cookie.httpOnly = cookie.httponly;
delete cookie.httponly;
}
if (cookie.expires !== undefined)
cookie.expires = new Date(cookie.expires);
jar.setCookie(new Cookie(cookie), download_url, {ignoreError: true});
}
let req = request({
url: download_url,
jar: jar,
headers: {
'User-Agent': this.user_agent,
'Referer': orig_url
}
});
await new Promise((resolve, reject) => {
req.pipe(stream)
.on('close', resolve)
.on('error', reject);
});
// Due to this issue https://github.com/ariya/phantomjs/issues/13409, we cannot set cookies back
// to browser. It is said to be redesigned, but till now (Mar 31 2017), no change has been made.
/*await Promise.all([
new Promise((resolve, reject) => {
req.on('response', () => {
jar._jar.store.getAllCookies((err, cookies) => {
if (err) {
reject(err);
return;
}
cookies = cookies.map(x => x.toJSON());
for (let cookie of cookies) {
if (cookie.key !== undefined) {
cookie.name = cookie.key;
delete cookie.key;
}
if (cookie.httpOnly !== undefined) {
cookie.httponly = cookie.httpOnly;
delete cookie.httpOnly;
}
if (cookie.expires instanceof Date) {
cookie.expires = cookie.expires.toGMTString();
cookie.expiry = cookie.expires.toTime();
}
else if (cookie.expires == Infinity)
delete cookie.expires;
delete cookie.lastAccessed;
delete cookie.creation;
delete cookie.hostOnly;
}
this.page.property('cookies', cookies).then(resolve).catch(reject);
});
}).on('error', reject);
}),
new Promise((resolve, reject) => {
req.pipe(fs.createWriteStream(save_path))
.on('close', resolve)
.on('error', reject);
})
]);*/
logger.profile(`download(download_url='${download_url}')`);
}
async download_image(download_url, stream) {
logger.profile(`download_image(download_url='${download_url}')`);
await Promise.all([
new Promise((resolve, reject) => {
this.client.once('donwload image', data => {
if (data.err)
reject(err);
else
stream.write(Buffer.from(data.data, 'base64'), resolve);
});
}),
this.page.evaluate(function (url) {
var img = new Image(), callback = function (err, data) {
callPhantom({
event: 'donwload image',
data: {
err: err && err.message,
data: data
}
});
};
img.onload = function () {
var canvas = document.createElement("canvas");
canvas.width = img.width;
canvas.height = img.height;
canvas.getContext("2d").drawImage(img, 0, 0);
callback(null, canvas.toDataURL("image/png").replace(/^data:image\/(png|jpg);base64,/, ""));
};
img.onerror = function () {
callback(new Error('Failed to fetch image.'));
};
img.src = url;
}, download_url)
]);
logger.profile(`download_image(download_url='${download_url}')`);
}