I am creating a C# 4.0 application that downloads web page content using WebClient.
WebClient function
/// <summary>
/// Downloads the document at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">
/// Address of the page. When it does not already start with <c>http://</c> or
/// <c>https://</c> (compared case-insensitively), <c>http://</c> is prepended.
/// </param>
/// <returns>
/// The response body decoded as UTF-8, or an empty string when the URL is blank,
/// cannot be parsed as an absolute URI, or the download fails for any reason.
/// </returns>
public static string GetDocText(string url)
{
    const int TimeoutMilliseconds = 600000;

    if (string.IsNullOrWhiteSpace(url))
    {
        return string.Empty;
    }

    url = url.Trim();

    // Only add a scheme when none is present; blindly prepending "http://" would
    // turn "https://host" into "http://https://host" and break every HTTPS address.
    bool hasHttpScheme =
        url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasHttpScheme)
    {
        url = "http://" + url;
    }

    // Fail fast on an unparsable address instead of passing a null Uri to the client.
    Uri target;
    if (!Uri.TryCreate(url, UriKind.Absolute, out target))
    {
        return string.Empty;
    }

    try
    {
        // The using block disposes the client; no explicit Dispose call is needed.
        using (ConfigurableWebClient client = new ConfigurableWebClient())
        {
            client.Timeout = TimeoutMilliseconds;
            client.Headers.Add(
                "User-Agent",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR " +
                "3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.2; AskTbFXTV5/5.15.4.23821; BRI/2)");
            // NOTE(review): "Vary" is normally a response header; it is kept here so the
            // outgoing request stays the same as before - confirm whether it is needed.
            client.Headers.Add("Vary", "Accept-Encoding");
            client.Encoding = Encoding.UTF8;

            string html = client.DownloadString(target);
            return string.IsNullOrEmpty(html) ? string.Empty : html;
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: callers receive an empty string on any failure.
        return string.Empty;
    }
}
/// <summary>
/// A <see cref="WebClient"/> that can apply an optional timeout and connection
/// limit to the HTTP requests it creates.
/// </summary>
public class ConfigurableWebClient : WebClient
{
    /// <summary>
    /// Value copied to <see cref="HttpWebRequest.Timeout"/> for each HTTP request;
    /// when null the request's own setting is left untouched.
    /// </summary>
    public int? Timeout { get; set; }

    /// <summary>
    /// Value copied to the request's <see cref="ServicePoint.ConnectionLimit"/>;
    /// when null the service point's own setting is left untouched.
    /// </summary>
    public int? ConnectionLimit { get; set; }

    /// <summary>
    /// Creates the request through the base class and, if it is an
    /// <see cref="HttpWebRequest"/>, applies whichever options have been set.
    /// </summary>
    /// <param name="address">The resource the request targets.</param>
    /// <returns>The request produced by the base class, configured when applicable.</returns>
    protected override WebRequest GetWebRequest(Uri address)
    {
        WebRequest request = base.GetWebRequest(address);
        HttpWebRequest httpRequest = request as HttpWebRequest;

        // Non-HTTP requests (or a null request) are returned exactly as the base built them.
        if (httpRequest != null)
        {
            if (Timeout != null)
            {
                httpRequest.Timeout = Timeout.Value;
            }

            if (ConnectionLimit != null)
            {
                httpRequest.ServicePoint.ConnectionLimit = ConnectionLimit.Value;
            }
        }

        return request;
    }
}
When I examine the content downloaded by the C# WebClient, it is different from what the
browser shows. I give the same URL to the browser (Mozilla Firefox) and to my WebClient function:
the browser displays the page content correctly, but WebClient.DownloadString returns different
HTML. Please see the WebClient response below.
HTML downloaded by WebClient:
<!DOCTYPE html>
<head>
<META NAME="ROBOTS" CONTENT="NOINDEX, NOFOLLOW">
<meta http-equiv="cache-control" content="max-age=0" />
<meta http-equiv="cache-control" content="no-cache" />
<meta http-equiv="expires" content="0" />
<meta http-equiv="expires" content="Tue, 01 Jan 1980 1:00:00 GMT" />
<meta http-equiv="pragma" content="no-cache" />
<meta http-equiv="refresh" content="10; url=/distil_r_captcha.html?Ref=/pgol/4-abbigliamento/3-Roma%20%28RM%29/p-7&distil_RID=A8D2F8B6-B314-11E3-A5E9-E04C5DBA1712" />
<script type="text/javascript" src="/ga.280243267228712.js?PID=6D4E4D1D-7094-375D-A439-0568A6A70836" defer></script><style type="text/css">#d__fFH{position:absolute;top:-5000px;left:-5000px}#d__fF{font-family:serif;font-size:200px;visibility:hidden}#glance7ca96c1b,#hiredf795fe70,#target01a7c05a,#hiredf795fe70{display:none!important}</style></head>
<body>
<div id="distil_ident_block"> </div>
<div id="d__fFH"><OBJECT id="d_dlg" CLASSID="clsid:3050f819-98b5-11cf-bb82-00aa00bdce0b" width="0px" height="0px"></OBJECT><span id="d__fF"></span></div></body>
</html>
My problem is that my WebClient function does not return the actual web page content.
Please help.