I have checked a couple of posts on Stack Overflow about getting all the words between the HTML tags of a page, and all of them left me confused. Some people recommend a regular expression written specifically for a single tag, while others mention parsing techniques. I am basically trying to make a web crawler. So far I have fetched the HTML of a link into a string in my program, and I have also extracted the links from the HTML stored in my `data` string. Now I want to crawl in depth and extract the words on the pages of all the links I extracted from that string. I have two questions. First, how can I fetch the words on each of the web pages while ignoring tags and JavaScript? Second, how would I recursively crawl through the links?
This is how I am getting the HTML into the string:
/// <summary>
/// Downloads the body of a hard-coded URL, stores the text in <c>data</c>,
/// and writes it to the console.
/// </summary>
/// <remarks>
/// Nothing is stored or printed when the response status is not
/// <see cref="HttpStatusCode.OK"/>. <c>data</c> is declared outside this method;
/// it is assumed to be a string field of the enclosing class.
/// </remarks>
public void getting_html_code_of_link()
{
    string urlAddress = "http://google.com";
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);

    // The using blocks release the connection, stream and reader on every path,
    // including a non-OK status and exceptions thrown while reading.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return;
        }

        // Use the charset advertised by the server when it is usable; a null
        // encoding means "let StreamReader use its default".
        Encoding encoding = null;
        if (!string.IsNullOrEmpty(response.CharacterSet))
        {
            try
            {
                encoding = Encoding.GetEncoding(response.CharacterSet);
            }
            catch (ArgumentException)
            {
                // The server sent a charset name .NET does not recognise;
                // fall back to the reader's default instead of failing the download.
                encoding = null;
            }
        }

        using (Stream receiveStream = response.GetResponseStream())
        using (StreamReader readStream = encoding == null
            ? new StreamReader(receiveStream)
            : new StreamReader(receiveStream, encoding))
        {
            data = readStream.ReadToEnd();
        }
    }

    Console.WriteLine(data);
}
And this is how I am extracting the link references from the URL I give:
/// <summary>
/// Finds every absolute http/https URL in <c>data</c> and writes them to the
/// console on one line, each followed by a single space.
/// </summary>
/// <remarks>
/// <c>data</c> is declared outside this method; a null value is treated as
/// empty text, so only an empty line is printed.
/// </remarks>
public void regex_ka_kaam()
{
    // Stop at whitespace, quotes or angle brackets so the attribute's closing
    // quote and the tag's '>' are not captured as part of the URL.
    Regex http = new Regex(@"https?://[^\s""'<>]+");
    StringBuilder sb = new StringBuilder();

    string html = data ?? string.Empty;
    foreach (Match m in http.Matches(html))
    {
        // Append each URL exactly once. Re-testing the match against the same
        // pattern always succeeds, which is what made every link print twice.
        sb.Append(m.Value);
        sb.Append(" ");
    }

    Console.WriteLine(sb);
}