How to follow all links in CasperJS?

2019-01-18 15:32发布

I'm having trouble clicking all JavaScript-based links in a DOM and saving the output. The links have the form:

<a id="html" href="javascript:void(0);" onclick="goToHtml();">HTML</a>

The following code works great for a single link:

// Single-link version: click one JavaScript-driven link, wait until the
// browser has navigated away from the start page, then save the new page.
var fs = require('fs');
var casper = require('casper').create();

var firstUrl = 'http://www.testurl.com/test.html';
var css_selector = '#jan_html';

// waitFor() condition: true once the current URL differs from the start URL.
function hasLeftFirstPage() {
    return this.getCurrentUrl() !== firstUrl;
}

// Logs the URL that was reached and writes the page's HTML to a file named
// after the page title, with spaces replaced by underscores.
function savePage() {
    console.log(this.getCurrentUrl());
    var file_title = this.getTitle().replace(/ /g, '_') + '.html';
    fs.write(file_title, this.getPageContent());
}

casper.start(firstUrl);

casper.thenClick(css_selector, function afterClick() {
    console.log("whoop");
});

casper.waitFor(hasLeftFirstPage, savePage);

casper.run();

However, how can I get this to work with a selector of "a", clicking all available links and saving content? I'm not sure how to get the clickWhileSelector to remove nodes from the selector as is done here: Click on all links matching a selector

2条回答
太酷不给撩
2楼-- · 2019-01-18 16:09

I have this script that first collects the href of every link on the page into an array, then iterates over that array, opening each link one by one and echoing its URL:

// Crawl script: collect the target URL of every link on the start page,
// then open each one in turn and echo the URL that was actually reached.
var casper = require('casper').create({
    // `verbose` is the option that makes CasperJS print its log messages;
    // "debug" is the most detailed of the documented log levels.
    verbose: true,
    logLevel: "debug"
});
var links = [];

casper.start('http://localhost:8000');

casper.then(function getLinks(){
    var found = this.evaluate(function(){
        var anchors = document.getElementsByTagName('a');
        var urls = Array.prototype.map.call(anchors, function(anchor){
            // The `href` property is resolved against the document, so
            // relative links become absolute URLs that thenOpen() can load.
            return anchor.href;
        });
        // Drop anchors without a target and non-navigable schemes such as
        // "javascript:" or "mailto:".
        return urls.filter(function(url){
            return /^https?:\/\//i.test(url);
        });
    });
    // evaluate() yields null when the page script fails; each() needs an array.
    links = found || [];
});
casper.then(function(){
    this.each(links,function(self,link){
        self.thenOpen(link,function(){
            this.echo(this.getCurrentUrl());
        });
    });
});
casper.run(function(){
    this.exit();
});
查看更多
【Aperson】
3楼-- · 2019-01-18 16:09

rusln's answer works great if all the links have a meaningful href attribute (actual URL). If you want to click every a that also triggers a javascript function, you may need to iterate some other way over the elements.

I propose using the XPath generator from stijn de ryck for an element.

  1. You can then sample all XPaths that are on the page.
  2. Then you open the page for every a that you have the XPath for and click it by XPath.
  3. Wait a little if it is a single page application
  4. Do something
// Click every <a> on the start page, including links that only run
// JavaScript: record an XPath for each link, then for each XPath reload the
// start page, click the link and report where the browser ended up.
var casper = require('casper').create(),
    x = require('casper').selectXPath,
    startURL = 'http://localhost:8000',
    xPaths = [];

casper.start(startURL);

casper.then(function getLinks(){
    var found = this.evaluate(function(){
        // adapted from https://stackoverflow.com/a/5178132/1816580
        //
        // Builds an XPath for `elm` by walking up its ancestors. The walk
        // stops early at the first ancestor whose id is unique in the
        // document. Every other step carries a positional predicate, so two
        // sibling links sharing a class still get distinct paths.
        // Returns null when `elm` is not an element.
        function createXPathFromElement(elm) {
            var allNodes = document.getElementsByTagName('*');
            var segs = [];

            // Number of elements carrying the given id; stops counting at 2
            // because only "exactly one" matters.
            function countId(id) {
                var count = 0;
                for (var n = 0; n < allNodes.length && count < 2; n++) {
                    if (allNodes[n].hasAttribute('id') && allNodes[n].id === id) count++;
                }
                return count;
            }

            // A value containing a double quote cannot be embedded in the
            // "..." XPath literals built below.
            function isQuotable(value) {
                return value.indexOf('"') === -1;
            }

            // True when `sib` would be selected by the same step as the
            // current element (same tag and, if used, same attribute value).
            function sameStep(sib, tag, attr, value) {
                if (sib.nodeType !== 1 || sib.localName.toLowerCase() !== tag) return false;
                return attr === null || sib.getAttribute(attr) === value;
            }

            for (; elm && elm.nodeType === 1; elm = elm.parentNode) {
                var tag = elm.localName.toLowerCase();
                var attr = null;

                if (elm.hasAttribute('id') && isQuotable(elm.getAttribute('id'))) {
                    if (countId(elm.id) === 1) {
                        segs.unshift('id("' + elm.getAttribute('id') + '")');
                        return segs.join('/');
                    }
                    attr = 'id';
                } else if (elm.hasAttribute('class') && isQuotable(elm.getAttribute('class'))) {
                    attr = 'class';
                }

                var value = attr === null ? null : elm.getAttribute(attr);

                // 1-based position among the preceding siblings that the
                // same step would also match.
                var position = 1;
                for (var sib = elm.previousSibling; sib; sib = sib.previousSibling) {
                    if (sameStep(sib, tag, attr, value)) position++;
                }

                var step = attr === null ? tag : tag + '[@' + attr + '="' + value + '"]';
                segs.unshift(step + '[' + position + ']');
            }
            return segs.length ? '/' + segs.join('/') : null;
        }

        var links = document.getElementsByTagName('a');
        return Array.prototype.map.call(links, createXPathFromElement);
    });
    // evaluate() yields null when the page script fails; each() needs an array.
    xPaths = found || [];
});
casper.then(function(){
    this.each(xPaths, function(self, xpath){
        // Start from a fresh copy of the page so the recorded XPath still applies.
        self.thenOpen(startURL);
        self.thenClick(x(xpath));
        // waiting some time may be necessary for single page applications
        self.wait(1000);
        self.then(function(){
            // do something meaningful here
            this.echo(this.getCurrentUrl());
        });

        // Uncomment the following line in case each click opens a new page instead of staying at the same page
        //self.back()
    });
});
casper.run(function(){
    this.exit();
});
查看更多
登录 后发表回答