First of all, I've successfully installed both PhantomJS and its npm interface, phantom. I've set up the code to load my page with the new syntax (all the other questions posted here were based on the old code syntax, or I'm missing something). This is the source I'm trying to scrape.
Now, the right sidebar (the one with the fake select near "Comune") and the other one are generated dynamically, and I can't understand why PhantomJS isn't picking them up. My code follows:
// Loads `sito` in a PhantomJS page through the `phantom` npm wrapper, logs the
// load status and the page's `content` property, then closes the page and
// shuts the PhantomJS process down.
//
// NOTE(review): `content` is read as soon as `page.open` resolves. Markup that
// the page's own scripts add afterwards may not be present yet; confirm
// whether an explicit wait is needed for the dynamically generated sidebar.
const sito = "http://bicincitta.tobike.it/";
const phantom = require('phantom');

// Kept at the outer scope so later steps of the chain (and the error handler)
// can reach the page and the PhantomJS instance created in earlier steps.
let sitepage = null;
let phInstance = null;

phantom.create()
  .then((instance) => {
    phInstance = instance;
    return instance.createPage();
  })
  .then((page) => {
    sitepage = page;
    return page.open(sito);
  })
  .then((status) => {
    console.log(status);
    return sitepage.property('content');
  })
  .then((content) => {
    console.log(content);
    // Return the result of close() so the instance is only shut down once the
    // page has been closed.
    return sitepage.close();
  })
  .then(() => phInstance.exit())
  .catch((error) => {
    console.log(error);
    // phantom.create() itself may have failed, in which case there is no
    // instance to shut down; calling exit() on null would throw inside this
    // handler and hide the original error behind an unhandled rejection.
    if (phInstance !== null) {
      phInstance.exit();
    }
  });
I'm hitting my head hard against a wall right now. Am I supposed to somehow fetch the site's scripts and execute them? Am I missing an instruction?
Also, as a side note: it's not really clear how I should chain additional methods on page, given that page is scoped inside the second ".then".
I've spent the past week working with PhantomJS, trying to get it to snapshot a page with data that is rendered with Angular. The easiest thing I found to do was to use page.injectJs('../script.js')
for any local scripts, and page.includeJs('http://jquery.com...')
for any external scripts. Since Phantom is sandboxed, it won't execute the javascript on the page it's capturing unless you give it the JS to execute. This will allow you to screenshot a page that has data rendered with javascript.
There is a CDATA script at the bottom of the HTML that cannot be parsed by Phantom. This is where the items are being populated from.
<script type="text/javascript">
//<![CDATA[
// Inline start-up script quoted from the scraped page. It calls
// Sys.Application.initialize() and then registers six init callbacks through
// Sys.Application.add_init(); each callback makes one $create(...) call for a
// Telerik.Web.UI component, passing $get("<id>") as the final argument.
// NOTE(review): Sys, $create, $get, Telerik and OnClientClosePortal are not
// defined in this excerpt; presumably they come from scripts the page loads
// earlier. Confirm against the full page source.
Sys.Application.initialize();
// RadAjaxManager with uniqueID "RadAjaxManager1".
Sys.Application.add_init(function() {
$create(Telerik.Web.UI.RadAjaxManager, {"_updatePanels":"","ajaxSettings":[],"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"defaultLoadingPanelID":"","enableAJAX":true,"enableHistory":false,"links":[],"styles":[],"uniqueID":"RadAjaxManager1","updatePanelsRenderMode":0}, null, null, $get("RadAjaxManager1"));
});
// RadAjaxPanel with uniqueID "ajCheckLoginUser".
Sys.Application.add_init(function() {
$create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajCheckLoginUser"}, null, null, $get("ajCheckLoginUser"));
});
// RadAjaxPanel with uniqueID "ajCheckLoginAdmin".
Sys.Application.add_init(function() {
$create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajCheckLoginAdmin"}, null, null, $get("ajCheckLoginAdmin"));
});
// RadAjaxPanel with uniqueID "ajLogoutUser".
Sys.Application.add_init(function() {
$create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajLogoutUser"}, null, null, $get("ajLogoutUser"));
});
// RadWindow named "radPortal"; its "close" event is mapped to
// OnClientClosePortal.
Sys.Application.add_init(function() {
$create(Telerik.Web.UI.RadWindow, {"_dockMode":false,"behaviors":0,"clientStateFieldID":"radPortal_ClientState","destroyOnClose":true,"formID":"form1","height":"180px","iconUrl":"","left":"","minimizeIconUrl":"","modal":true,"name":"radPortal","reloadOnShow":true,"showContentDuringLoad":false,"skin":"Office2007","top":"","visibleStatusbar":false,"width":"450px"}, {"close":OnClientClosePortal}, null, $get("radPortal"));
});
// RadWindowManager named "windowManagerPortal"; it lists 'radPortal' in
// windowControls and passes {"child":"radPortal"} as the fourth argument.
Sys.Application.add_init(function() {
$create(Telerik.Web.UI.RadWindowManager, {"behaviors":4,"clientStateFieldID":"windowManagerPortal_ClientState","destroyOnClose":true,"formID":"form1","iconUrl":"","left":"","minimizeIconUrl":"","modal":true,"name":"windowManagerPortal","reloadOnShow":true,"showContentDuringLoad":false,"skin":"Office2007","top":"","visibleStatusbar":false,"windowControls":"['radPortal']"}, null, {"child":"radPortal"}, $get("windowManagerPortal"));
});
//]]>
</script>
These items will also be destroyed as soon as you leave communication with this site's server. There are methods to get around this, but I think you are better off trying something else. I used the npm package cheerio to load the CDATA HTML.