Web客户端的HtmlUnit超时(HtmlUnit WebClient Timeout)

2019-08-08 13:17发布

在我以前的有关问题的HtmlUnit 跳到特定的JavaScript执行在HTML单元和使用的HtmlUnit抓取页面来源:URL卡住了

我曾提到URL被卡住。 我还发现,它卡住由于在库的HtmlUnit方法(解析)一个不出来执行。

我没有在这个进一步的工作。 我写代码来获得该方法,如果它花费的时间比规定的超时秒就完成了。

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HandleHtmlUnitTimeout {

public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException, TimeoutException 
    {   
        Date start = new Date();
        String url = "http://ericaweiner.com/collections/";
        doWorkWithTimeout(url, 60);
    }

public static void doWorkWithTimeout(final String url, long timeoutSecs) throws InterruptedException, TimeoutException {
    //maintains a thread for executing the doWork method
    ExecutorService executor = Executors.newFixedThreadPool(1);
    //logger.info("Starting method with "+timeoutSecs+" seconds as timeout");
    //set the executor thread working

    final Future<?> future = executor.submit(new Runnable() {
        public void run() 
            {
            try 
                {
                getPageSource(url);
                }
            catch (Exception e) 
                {
                throw new RuntimeException(e);
                }
        }
    });

    //check the outcome of the executor thread and limit the time allowed for it to complete
    try {
        future.get(timeoutSecs, TimeUnit.SECONDS);
    } catch (Exception e) {
        //ExecutionException: deliverer threw exception
        //TimeoutException: didn't complete within downloadTimeoutSecs
        //InterruptedException: the executor thread was interrupted

        //interrupts the worker thread if necessary
        future.cancel(true);

        //logger.warn("encountered problem while doing some work", e);
        throw new TimeoutException();
    }finally{ 
    executor.shutdownNow();
    }
}

public static void getPageSource(String productPageUrl)
    {
    try {
    if(productPageUrl == null)
        {
        productPageUrl = "http://ericaweiner.com/collections/";
        }   

        WebClient wb = new WebClient(BrowserVersion.FIREFOX_3_6);
        wb.getOptions().setTimeout(120000);
        wb.getOptions().setJavaScriptEnabled(true);
        wb.getOptions().setThrowExceptionOnScriptError(true);
        wb.getOptions().setThrowExceptionOnFailingStatusCode(false);
        HtmlPage page = wb.getPage(productPageUrl);
        wb.waitForBackgroundJavaScript(4000);
        wb.closeAllWindows();
} 
catch (FailingHttpStatusCodeException e) 
    {
    e.printStackTrace();
    } 
catch (MalformedURLException e) 
    {
    e.printStackTrace();
    } 
catch (IOException e) 
    {
    e.printStackTrace();
    }
    }

}

此代码不来doWorkWithTimeout出来(URL,60); 方法。 但是,这并不终止。

当我尝试调用与下面的代码similiar实现:

import java.util.Date;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.apache.log4j.Logger;


public class HandleScraperTimeOut {

private static Logger logger = Logger.getLogger(HandleScraperTimeOut .class);


public void doWork() throws InterruptedException {
    logger.info(new Date()+ "Starting worker method ");
    Thread.sleep(20000);
    logger.info(new Date()+ "Ending worker method ");
    //perform some long running task here...
}

public void doWorkWithTimeout(int timeoutSecs) {
    //maintains a thread for executing the doWork method
    ExecutorService executor = Executors.newFixedThreadPool(1);
    logger.info("Starting method with "+timeoutSecs+" seconds as timeout");
    //set the executor thread working

    final Future<?> future = executor.submit(new Runnable() {
        public void run() 
            {
            try 
                {
                doWork();
                }
            catch (Exception e) 
                {
                throw new RuntimeException(e);
                }
        }
    });

    //check the outcome of the executor thread and limit the time allowed for it to complete
    try {
        future.get(timeoutSecs, TimeUnit.SECONDS);
    } catch (Exception e) {
        //ExecutionException: deliverer threw exception
        //TimeoutException: didn't complete within downloadTimeoutSecs
        //InterruptedException: the executor thread was interrupted

        //interrupts the worker thread if necessary
        future.cancel(true);

        logger.warn("encountered problem while doing some work", e);
    }
    executor.shutdown();
}

public static void main(String a[])
    {
        HandleScraperTimeOut hcto = new HandleScraperTimeOut ();
        hcto.doWorkWithTimeout(30);

    }

}

如果任何人都可以去看看,并告诉我是什么问题,这将是很有益的。

有关问题的更多信息,你可以看看跳过特定的JavaScript执行在HTML单元和使用的HtmlUnit抓取页面来源:URL卡住了


更新1名奇怪的是:future.cancel(真); 将返回在这两种情况下TRUE。 我多么希望它是是:

  • 随着它的HtmlUnit应该返回FALSE,因为进程仍然挂。
  • 与正常的Thread.sleep(); 因为该进程得到了成功取消它应该返回TRUE。

更新2只挂带http://ericaweiner.com/collections/ URL。 如果我给任何其他URL即http://www.google.comhttp://www.yahoo.com ,它没有手。 在这些情况下,抛出IntruptedException和出来的过程。

看来, http://ericaweiner.com/collections/页面源都有导致问题的某些元素。

Answer 1:

Future.cancel(布尔)的回报:

  • 假如果任务不能被取消,通常是由于它已经正常完成
  • 否则真

取消方式表示线程没有完成之前取消,被取消的标志设置为true,如果请求的线程被中断。

中断线程menans叫了Thread.interrupt,仅此而已。 Future.cancel(布尔)不检查线程实际停止。

因此,它是正确的,取消在该情况下返回true。

中断线程意味着它应该尽快停止,但不会强制执行。 你可以尝试使它停止/失败关闭它需要的资源或东西。 我通常做的是用一个线程读取从插座(等待输入数据)。 我关闭套接字所以停止等待。



文章来源: HtmlUnit WebClient Timeout