There is a website that contains a page with a list of 25 entries, where each entry is a link to a page containing some information that I need. I want to get to the listing page and then:
1) click on link to first entry
2) retrieve all the html
3) click back to the listing page (there is a button for this)
4) repeat for every other listing
I would also like to do this as efficiently as possible which I've been told means leveraging promises. Here's my code sketch, which doesn't work:
// Asker's non-working sketch: drive a Nightmare browser instance through a search
// form to the listing page, open one entry, read its HTML, go back to the listing,
// and hand the collected HTML to x-ray to write out as results.json.
// `hidTestURL` is not defined anywhere in this snippet.
var Nightmare = require('nightmare');
var nightmare = Nightmare({ openDevTools: true, show: true })
var Xray = require('x-ray');
var x = Xray();
var resultArr = [];
// First chain: fill in and submit the search form, pausing 2500 ms after every step.
nightmare
.goto(hidTestURL)
.wait(2500)
.click('input[name="propertySearchOptions:advanced"]') //start navigating to listing page
.wait(2500)
.type('input[name="propertySearchOptions:streetName"]', 'Main')
.wait(2500)
.select('select[name="propertySearchOptions:recordsPerPage"]', '25')
.wait(2500)
.click('input[name="propertySearchOptions:search"]') //at listing page
.wait(2500)
.then(function(){
// A new chain is started on the same instance here, but it is neither returned
// from this callback nor followed by its own `.then()`.
nightmare
.click('a[href^="Property.aspx?prop_id=228645"]') //first entry
.evaluate(function(){ //retrieve info
// This `var` declares a new local array that shadows the outer `resultArr`,
// and the callback returns nothing.
var resultArr = [];
resultArr.push(document.querySelector('html').innerHTML);
})
})
// Second top-level chain on the same instance. It is a separate statement, so it is
// not sequenced after the `.then()` callback above.
// NOTE(review): presumably its actions are queued directly behind the first chain - confirm.
nightmare
.click('a[id="propertyHeading_searchResults"]') //return to listing page
.evaluate(function(){
// Refers to `resultArr`, which is not declared inside this callback.
// NOTE(review): the trailing text on the next line has no `//` comment marker in this snippet.
return resultArr.push(document.querySelector('html').innerHTML); retrieve listing page info to show that it returned.
})
.then(function (resultArr) {
// This parameter shadows the outer `resultArr`; it receives whatever the preceding
// `.evaluate()` step resolves with.
console.log('resultArr', resultArr);
x(resultArr[1], 'body@html') //output listing page html
.write('results.json');
})
This gets as far as the listing page, and then does not proceed any further. I also tried the same code, but with `return nightmare` for every use of `nightmare` except the first one. I'd seen some examples that used `return`, but when I did this, the code threw an error.
I also tried not including the third `nightmare` (the one after the blank space), and instead trying to continue the old nightmare instance by going straight to the `.click()`, but this also threw an error.
I clearly need some help with the syntax and semantics of nightmare, but there is not much documentation online besides an API listing. Does anyone know how I can make this work?
First, calling Nightmare like you have it - broken into two chains - is probably not going to do what you want. (This comment thread is a good - albeit long - primer.) If memory serves, actions from the second chain will be queued immediately after the first, resulting in (probably) undesirable behavior. You said you had it written slightly differently - I'd be curious to see it, it sounds like it may have been a little closer.
Second, you're trying to lift `resultArr` in `.evaluate()`, which isn't possible. The function passed to `.evaluate()` is stringified and reconstituted inside of Electron - meaning that you'll lose the ambient context around the function. This example in `nightmare-examples` goes into a little more depth, if you're curious.
Third, and maybe this is a typo or me misunderstanding intent: your `href` selector uses the starts-with (`^=`) operator, is that intentional? Should that be an ends-with (`$=`)?
Fourth, looping over asynchronous operations is tricky. I get the impression that may also be a stumbling block?
With all of that in mind, let's take a look at modifying your original script. Admittedly untested, as I don't have access to your testing URL, so this is a bit from the hip:
// Drive Nightmare through the search form to the listing page, then visit every
// property link one at a time: click it, capture the page HTML, click back to the
// listing. The collected HTML is logged and one entry is written out with x-ray.
// `hidTestURL` must be defined by the surrounding script.
var Nightmare = require('nightmare');
var nightmare = Nightmare({ openDevTools: true, show: true });
var Xray = require('x-ray');
var x = Xray();

// Escape a value so it can sit inside a double-quoted CSS attribute selector.
function quoteForSelector(value) {
  return value.replace(/["\\]/g, '\\$&');
}

nightmare
  .goto(hidTestURL)
  .wait(2500)
  .click('input[name="propertySearchOptions:advanced"]') //start navigating to listing page
  .wait(2500)
  .type('input[name="propertySearchOptions:streetName"]', 'Main')
  .wait(2500)
  .select('select[name="propertySearchOptions:recordsPerPage"]', '25')
  .wait(2500)
  .click('input[name="propertySearchOptions:search"]') //at listing page
  .wait(2500)
  .evaluate(function () {
    // This function is stringified and run inside the page, so it cannot see any
    // variables from this script.
    // `querySelectorAll` returns an array-like NodeList; `Array.from` makes `.map()` available.
    return Array.from(document.querySelectorAll('a[href*="Property.aspx"]'))
      // Collect the literal attribute text, not `a.href`: `a.href` is the resolved
      // absolute URL, which would not match the `a[href="..."]` selector used below.
      .map(a => a.getAttribute('href'));
  })
  .then(function (hrefs) {
    // Two options here:
    //   1. click each link, grab the HTML, then click back to the listing, or
    //   2. `.goto()` each link directly.
    // This follows #1, as in the original script.
    // The reduce chains one promise per href so the visits run strictly in sequence.
    return hrefs.reduce(function (accumulator, href) {
      return accumulator.then(function (results) {
        return nightmare
          // open the entry
          .click('a[href="' + quoteForSelector(href) + '"]')
          // give the property page time to load, same pause as the rest of the script
          .wait(2500)
          .evaluate(function () {
            return document.querySelector('html').innerHTML;
          })
          .then(function (html) {
            results.push(html);
            // go back to the search results and let the listing reload
            // before the next iteration clicks on it
            return nightmare
              .click('a[id="propertyHeading_searchResults"]')
              .wait(2500)
              .then(function () {
                // hand the accumulated results to the next iteration
                return results;
              });
          });
      });
    }, Promise.resolve([])); // seed the reduce with a promise of an empty array
  })
  .then(function (resultArr) {
    // `resultArr` holds one HTML string per property link, in listing order.
    console.log('resultArr', resultArr);
    // Writes the body of the second property page (index 1) to results.json.
    x(resultArr[1], 'body@html')
      .write('results.json');
  })
  .catch(function (error) {
    // Surface failures (missing selector, navigation error) instead of leaving
    // an unhandled rejection.
    console.error('Scrape failed:', error);
  })
  .then(function () {
    // Shut down the Electron process whether or not the scrape succeeded.
    return nightmare.end();
  });
Hopefully that's enough to get you started.