Blacklist and whitelist URLs in HtmlUnitDriver

2019-09-02 08:53发布

Blacklisting URLs in PhantomJS and GhostDriver is pretty straightforward. First initialize the driver with a handler:

// Start a PhantomJS-backed driver, then run the handler script inside PhantomJS.
// loadFile(...) is the asker's own helper (not shown here) — presumably it returns
// the contents of the given resource as a String; confirm against its definition.
PhantomJSDriver driver = new PhantomJSDriver();
driver.executePhantomJS(loadFile("/phantomjs/handlers.js"));

And configure the handler:

// Resource-request hook: aborts every request whose URL is not permitted.
// A URL is permitted when it matches at least one pattern in allowedUrls and
// no pattern in disallowedUrls.
//
// requestData    - request metadata; this handler reads its `url` and `id`.
// networkRequest - request handle; `abort()` is called on it to cancel the request.
this.onResourceRequested = function (requestData, networkRequest) {
    // RegExp.test() succeeds on a match anywhere in the string, so the allow-list
    // patterns are anchored with ^; otherwise any URL that merely contains
    // "http://localhost" (e.g. inside a query string) would be let through.
    // Literal dots are escaped so that "." does not match an arbitrary character.
    var allowedUrls = [
        /^https?:\/\/localhost.*/,
        /^https?:\/\/.*\.example\.com\/?.*/
    ];
    // Deliberately left unanchored: a block-list should err on the side of blocking.
    var disallowedUrls = [
        /https?:\/\/nonono\.com.*/
    ];

    // Returns a predicate that tests a single RegExp against the given URL.
    function matcherFor(url) {
        return function (re) {
            return re.test(url);
        };
    }

    // True when the URL is on the allow-list and not on the block-list.
    function isUrlAllowed(url) {
        var matchesUrl = matcherFor(url);
        return allowedUrls.some(matchesUrl) && !disallowedUrls.some(matchesUrl);
    }

    if (!isUrlAllowed(requestData.url)) {
        console.log("Aborting disallowed request (# " + requestData.id + ") to url: '" + requestData.url + "'");
        networkRequest.abort();
    }
};

I haven't found a good way to do this with HtmlUnitDriver. There's the ScriptPreProcessor mentioned in "How to filter JavaScript from specific URLs in HtmlUnit", but it uses WebClient, not HtmlUnitDriver. Any ideas?

1条回答
Bombasti
2楼-- · 2019-09-02 09:43

Extend HtmlUnitDriver and implement a ScriptPreProcessor (for editing content) and an HttpWebConnection (for allowing/blocking URLs):

/**
 * An {@link HtmlUnitDriver} that filters outgoing requests by URL and rewrites script source
 * through a {@link ScriptPreProcessor}.
 *
 * <p>A request is forwarded to the real connection only when its URL fully matches at least one
 * of {@code ALLOWED_URLS} and none of {@code DISALLOWED_URLS}. Every other request is answered
 * with an empty {@code SC_OK} response and is never passed on to the underlying connection.
 */
public class FilteringHtmlUnitDriver extends HtmlUnitDriver {

    // Patterns are compiled once here instead of on every request (String.matches recompiles
    // its regex on each call). Literal dots are escaped so "." does not match any character.
    // Pattern is fully qualified so this snippet needs no additional import.
    private static final java.util.regex.Pattern[] ALLOWED_URLS = compileAll(
            "https?://localhost.*",
            "https?://.*\\.yes\\.yes/?.*");
    private static final java.util.regex.Pattern[] DISALLOWED_URLS = compileAll(
            "https?://spam\\.nono.*");

    /**
     * Creates a filtering driver.
     *
     * @param capabilities capabilities handed unchanged to {@link HtmlUnitDriver}
     */
    public FilteringHtmlUnitDriver(DesiredCapabilities capabilities) {
        super(capabilities);
    }

    /**
     * Installs the URL-filtering connection and the script pre-processor on the client.
     *
     * @param client the web client being configured
     * @return the same {@code client} instance, now configured
     */
    @Override
    protected WebClient modifyWebClient(WebClient client) {
        client.setWebConnection(filteringWebConnection(client));
        client.setScriptPreProcessor(filteringPreProcessor());
        return client;
    }

    /** Builds a pre-processor that passes each script's source through {@link #editContent}. */
    private ScriptPreProcessor filteringPreProcessor() {
        return (htmlPage, sourceCode, sourceName, lineNumber, htmlElement) -> editContent(sourceCode);
    }

    /**
     * Rewrites script source before execution; as an example, every {@code foo} becomes
     * {@code bar}.
     *
     * @param sourceCode the original script source
     * @return the rewritten script source
     */
    private String editContent(String sourceCode) {
        // replace() substitutes literally; replaceAll() would needlessly treat "foo" as a regex.
        return sourceCode.replace("foo", "bar");
    }

    /**
     * Builds a connection that forwards allowed requests and short-circuits all others.
     *
     * @param client the web client the connection belongs to
     * @return a connection that applies the allow/deny lists to every request
     */
    private WebConnection filteringWebConnection(WebClient client) {
        return new HttpWebConnection(client) {
            @Override
            public WebResponse getResponse(WebRequest request) throws IOException {
                String url = request.getUrl().toString();
                if (FilteringHtmlUnitDriver.isUrlAllowed(url)) {
                    return super.getResponse(request);
                }
                // Build the stub only when a request is actually blocked.
                return FilteringHtmlUnitDriver.blockedResponse(request);
            }
        };
    }

    /** Returns true when the URL matches an allowed pattern and no disallowed pattern. */
    private static boolean isUrlAllowed(String url) {
        // The deny-list is checked first, so it overrides the allow-list.
        return !matchesAny(DISALLOWED_URLS, url) && matchesAny(ALLOWED_URLS, url);
    }

    /** Returns true when the whole URL matches at least one of the given patterns. */
    private static boolean matchesAny(java.util.regex.Pattern[] patterns, String url) {
        for (java.util.regex.Pattern pattern : patterns) {
            // matches() requires the entire URL to match, like String.matches.
            if (pattern.matcher(url).matches()) {
                return true;
            }
        }
        return false;
    }

    /** Creates the empty {@code SC_OK} response returned in place of a blocked request. */
    private static WebResponse blockedResponse(WebRequest request) {
        // new byte[0] avoids the platform-charset dependency of "".getBytes().
        WebResponseData emptyBody = new WebResponseData(new byte[0], SC_OK, "", new ArrayList<>());
        return new WebResponse(emptyBody, request, 0);
    }

    /** Compiles each regular expression once, preserving order. */
    private static java.util.regex.Pattern[] compileAll(String... regexes) {
        java.util.regex.Pattern[] compiled = new java.util.regex.Pattern[regexes.length];
        for (int i = 0; i < regexes.length; i++) {
            compiled[i] = java.util.regex.Pattern.compile(regexes[i]);
        }
        return compiled;
    }
}

This enables both editing of content and blocking of URLs.

查看更多
登录 后发表回答