I am trying to get the original source for a particular web page.
The page executes some scripts that modify the DOM as soon as it loads. I would like to get the source before any script or user changes any object in the document.
With Chrome or Firefox (and probably most browsers) I can either look at the DOM (debug utility F12) or look at the original source (right-click, view source). The latter is what I want to accomplish.
Is it possible to do this with phantomjs/casperjs?
Before getting to the page I have to log in. This is working fine with casperjs. If I browse to the page and render the results I know I am on the right page.
// Open the target page and try several ways of reading its source.
// The note above each call records what that attempt was observed to produce.
casper.thenOpen('http://' + customUrl, function (response) {
    var webPage = this.page;

    // Screenshot: renders the correct page (current DOM).
    webPage.render('example.png');

    // Underlying page content: gets the current DOM.
    console.log(webPage.content);

    // view-source: scheme: results in a blank page.
    this.download('view-source:' + customUrl, 'b.html', 'GET');

    // getHTML(): gets the current DOM.
    console.log(this.getHTML());

    // debugPage(): gets the current DOM.
    this.debugPage();

    // Response object passed to the step: no BODY.
    utils.dump(response);

    // Plain download of the same URL: comes back as if not logged in.
    this.download('http://' + customUrl, 'a.html', 'GET');
});
I've tried this.download(url, 'a.html')
but it doesn't seem to share the same context, since it returns HTML as if I were not logged in, even if I run with a cookies file: casperjs test.casper.js --cookies-file=cookies.txt
I believe I should keep analyzing this option.
I have also tried casper.open('view-source:url')
instead of casper.open('http://url')
but it seems it doesn't recognize the url since I just get a blank page.
I have looked at the raw HTTP Response I get from the server with a utility I have and the body of this message (which is HTML) is what I need but when the page loads in the browser the DOM has already been modified.
I tried:
// Open the page and inspect the `response` object handed to the step callback.
casper.thenOpen('http://' + url, function (response) {
    // ...
});
But the response object only contains the headers and some other information, not the body.
I also tried with the event onResourceRequested.
The idea is to abort the download of any resource needed by a specific web page (the referer).
onResourceRequested: function(casperObj, requestData, networkRequest) {
for (var i=0; i < requestData.headers.length; i++) {
var obj = requestData.headers[i];
if (obj.name === "Referer" && obj.value === 'http://'+customUrl) {
networkRequest.abort();
break;
}
}
Unfortunately, the script that initially modifies the DOM seems to be inline in the main HTML page (or this code is not doing what I would like it to do).
Any ideas?
Here is the full code:
// Flags set on the global `phantom` object before the casper instance is created.
phantom.casperTest = true;
phantom.cookiesEnabled = true;

var utils = require('utils');

// Settings applied to the underlying page: images and plugins are not loaded,
// JavaScript stays enabled, and web security is switched off.
var pageSettings = {
    loadImages: false,
    loadPlugins: false,
    javascriptEnabled: true,
    webSecurityEnabled: false
};

// The casper instance shared by every step below.
var casper = require('casper').create({
    clientScripts: [],
    pageSettings: pageSettings,
    logLevel: "error",
    verbose: true
});

// Set the user-agent string, then start the run on the login page.
casper.userAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X)');
casper.start('http://www.xxxxxxx.xxx/login');
// Wait up to 15000 ms for the login field, then fill in the credentials and
// submit the form. If the field never appears, report the failure.
casper.waitForSelector('input#login',
    function () {
        // The inner function runs in the page context, so the credentials are
        // passed in explicitly; they arrive as its two parameters.
        this.evaluate(function (customLogin, customPassword) {
            document.getElementById("login").value = customLogin;
            document.getElementById("password").value = customPassword;
            document.getElementById("button").click();
        }, {
            "customLogin": customLogin,
            "customPassword": customPassword
        });
    },
    function () {
        // Double-quoted because the message contains an apostrophe.
        console.log("Can't login.");
    },
    15000
);
// Called once the home container is present after submitting the login form.
function onHomeFound() {
    console.log('Login successfull.');
}

// Called when the home container has not appeared within the timeout.
function onHomeTimeout() {
    console.log('Login failed.');
}

// Confirm the login by waiting up to 15000 ms for the home page marker.
casper.waitForSelector('div#home', onHomeFound, onHomeTimeout, 15000);
// Open the target page and try several ways of reading its source.
// The note above each call records what that attempt was observed to produce.
casper.thenOpen('http://' + customUrl, function (response) {
    var webPage = this.page;

    // Screenshot: renders the correct page (current DOM).
    webPage.render('example.png');

    // Underlying page content: gets the current DOM.
    console.log(webPage.content);

    // view-source: scheme: results in a blank page.
    this.download('view-source:' + customUrl, 'b.html', 'GET');

    // getHTML(): gets the current DOM.
    console.log(this.getHTML());

    // debugPage(): gets the current DOM.
    this.debugPage();

    // Response object passed to the step: no BODY.
    utils.dump(response);

    // Plain download of the same URL: comes back as if not logged in.
    this.download('http://' + customUrl, 'a.html', 'GET');
});