I am using WebBrowser
to render javascript on webpages to scrape the rendered source code, but after several page loads, the CPU usage spikes to 100% as well as the number of threads.
I'm assuming that the threads are not closing properly once the webpage has been rendered. I am trying to open the browser, extract the source code, and then close the browser and move to the next page.
I am able to get the rendered page, but this program doesn't make it very far before getting bogged down. I tried adding wb.Stop()
but that didn't help. The memory doesn't seem to be the problem (stays at a constant 70% or so).
Here is my source code. using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; using System.Windows.Forms; using System.Threading;
namespace Abot.Demo
{
// Threaded version
public class HeadlessBrowser
{
private static string GeneratedSource { get; set; }
private static string URL { get; set; }
public static string GetGeneratedHTML(string url)
{
URL = url;
Thread t = new Thread(new ThreadStart(WebBrowserThread));
t.SetApartmentState(ApartmentState.STA);
t.Start();
t.Join();
return GeneratedSource;
}
private static void WebBrowserThread()
{
WebBrowser wb = new WebBrowser();
wb.Navigate(URL);
wb.DocumentCompleted +=
new WebBrowserDocumentCompletedEventHandler(
wb_DocumentCompleted);
while (wb.ReadyState != WebBrowserReadyState.Complete);
//Application.DoEvents();
//Added this line, because the final HTML takes a while to show up
GeneratedSource = wb.Document.Body.InnerHtml;
wb.Dispose();
wb.Stop();
}
private static void wb_DocumentCompleted(object sender,
WebBrowserDocumentCompletedEventArgs e)
{
WebBrowser wb = (WebBrowser)sender;
GeneratedSource = wb.Document.Body.InnerHtml;
}
}
}
Any suggestions would be appreciated.
Thanks.