I need to parse HTML code. More specifically, I need to parse each cell of every row in all tables. Each row represents a single object and each cell represents a different property. I want to parse these so that I can write an XML file containing all of the data (without the useless HTML markup). I have successfully been able to parse each column from the HTML file, but now I don't know what my options are for writing this to an XML file. I am baffled.
HTML:
<tr><tr>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF">
1
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="left">
<a href="/ice/player.htm?id=8471675">Sidney Crosby</a>
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center">
PIT
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="center">
C
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
39
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
32
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
33
</td>
<td class="statBox sorted" style="border-width:0px 1px 1px 0px; background-color: #E0E0E0" align="right">
<font color="#000000">
65
</font>
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
20
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
29
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
10
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
1
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
3
</td>
<td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right">
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
0
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
154
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
20.8
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
21:54
</td>
<td class="statBox" style="border-width:0px 1px 1px 0px; background-color: #FFFFFF" align="right">
22.6
</td>
<td class="statBox" style="border-width:0px 0px 1px 0px; background-color: #FFFFFF" align="right">
55.7
</td>
</tr></tr>
C#:
using HtmlAgilityPack;
namespace Stats
{
class StatsParser
{
// Raw HTML text supplied to the constructor and read by ParseHtml.
private string htmlCode;

// Presumably the name of the XML output file (it is not used in the visible code).
// The date is formatted with the invariant "yyyy-MM-dd" pattern because
// ToShortDateString() is culture-dependent and can produce values such as "1/1/2011";
// '/' is not a valid file-name character, so the resulting name could not be created.
private static string fileName =
    "[" + DateTime.Now.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture) + " NHL Stats].xml";
/// <summary>
/// Creates a parser for the given HTML text and parses it right away
/// by calling <see cref="ParseHtml"/>.
/// </summary>
/// <param name="htmlCode">The HTML markup to parse.</param>
public StatsParser(string htmlCode)
{
    // The field must be assigned first: ParseHtml reads it.
    this.htmlCode = htmlCode;
    ParseHtml();
}
/// <summary>
/// Loads <c>htmlCode</c>, walks every row of the first table in the document and
/// shows the trimmed text of each non-empty stat cell in a message box.
/// </summary>
/// <remarks>
/// If the document contains no table, or the first table contains no rows, the
/// "Exception!!" message box is shown and the method returns. Rows that contain
/// no stat cells (for example header rows) are skipped.
/// </remarks>
public void ParseHtml()
{
    // Matches any <td> whose class list contains the token "statBox". An exact
    // @class='statBox' comparison would skip cells such as class="statBox sorted".
    // The child axis (td, not .//td) keeps cells of nested rows/tables from being
    // visited once per ancestor row.
    const string statCellXPath =
        "td[contains(concat(' ', normalize-space(@class), ' '), ' statBox ')]";

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(htmlCode);

    // SelectNodes returns null (not an empty collection) when nothing matches,
    // so every result is checked explicitly instead of catching NullReferenceException.
    HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
    HtmlNodeCollection rows = tables == null ? null : tables[0].SelectNodes(".//tr");
    if (rows == null)
    {
        System.Windows.Forms.MessageBox.Show("Exception!!");
        return;
    }

    foreach (HtmlNode row in rows)
    {
        HtmlNodeCollection cells = row.SelectNodes(statCellXPath);
        if (cells == null)
        {
            // No stat cells in this row; move on instead of aborting the whole parse.
            continue;
        }

        foreach (HtmlNode cell in cells)
        {
            // InnerText keeps the surrounding line breaks and any HTML entities,
            // so decode and trim before testing for an empty cell.
            string value = HtmlEntity.DeEntitize(cell.InnerText).Trim();
            if (value.Length != 0)
            {
                System.Windows.MessageBox.Show(value);
            }
        }
    }
}
XML:
<?xml version="1.0" encoding="utf-8" ?>
<Stats Date="2011-01-01">
<Player Rank="1">
<Name>Sidney Crosby</Name>
<Team>PIT</Team>
<Position>C</Position>
<GamesPlayed>39</GamesPlayed>
<Goals>32</Goals>
<Assists>33</Assists>
</Player>
</Stats>