The HTML and XML samples below do not contain the same text; they are given only for illustration.
For an input HTML file such as:
<p class=MsoNormal style='tab-stops:.5in'><b><span style='mso-tab-count:3'> </span></b><b><span
lang=AR-SY dir=RTL style='mso-bidi-language:AR-SY'>عزت
ابراهيم
الدوري</span><o:p></o:p></b></p>
I want to receive XML with UTF-8 encoding, such as:
<Name Type="Script"> صدام حسين التكريتي</Name>
Basically, I need an ASCII sequence of numeric character references such as &#xxxx;&#yyyy;&#zzzz; to be saved as UTF-8 text.
I am not quite sure if you want to convert the HTML decimal encoding in place or do a transformation from HTML to an XML document. Converting the decimal (or hexadecimal) encoded characters to UTF-8/16 is not too difficult. However, properly parsing HTML in the wild can be a challenge (see this thread).
Here is a naive class for converting decimal and hexadecimal encoded characters in place and returning a .NET string. I make no guarantees about its correctness or robustness, particularly if you are trying to use it against malformed HTML or have surrogate pair characters:
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
namespace HtmlEncodingConverter
{
/// <summary>
/// Decodes numeric HTML character references (<c>&amp;#NNNN;</c> and
/// <c>&amp;#xHHHH;</c>) that appear in the text between tags of an HTML
/// fragment, leaving the markup itself untouched.
/// </summary>
internal static class HtmlEncoding
{
    /// <summary>Highest code point defined by Unicode.</summary>
    private const int MaxCodePoint = 0x10FFFF;

    // Decimal references accept digits only; hexadecimal references require the
    // 'x' prefix. The 1-12 digit window is kept from the original pattern so the
    // same (well-formed) inputs are recognised; values that do not fit are
    // rejected later by TryParse rather than throwing.
    private static readonly Regex EncodedCharRegex = new Regex(
        @"&#(?:x(?<hex>[0-9a-f]{1,12})|(?<dec>[0-9]{1,12}));",
        RegexOptions.Compiled |
        RegexOptions.IgnoreCase |
        RegexOptions.CultureInvariant);

    /// <summary>
    /// Replaces numeric character references found in the text between tags
    /// with the characters they denote.
    /// </summary>
    /// <param name="input">HTML fragment to process.</param>
    /// <returns>
    /// The fragment with references in inter-tag text decoded. Text inside
    /// <c>&lt;...&gt;</c> and text preceding the first <c>&lt;</c> is copied
    /// verbatim.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="input"/> is <c>null</c>.
    /// </exception>
    public static string ToUtfCharacters(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        return ConvertInnerText(input, ReplaceWithCharacter);
    }

    /// <summary>Decodes every numeric character reference in <paramref name="original"/>.</summary>
    private static string ReplaceWithCharacter(string original)
    {
        return EncodedCharRegex.Replace(original, new MatchEvaluator(DecodeCharacter));
    }

    /// <summary>
    /// Converts one matched reference to its character(s). A reference whose
    /// value cannot be parsed or lies outside the Unicode range is returned
    /// unchanged instead of raising an exception.
    /// </summary>
    private static string DecodeCharacter(Match match)
    {
        Group hexGroup = match.Groups["hex"];
        int codePoint;
        bool parsed;

        if (hexGroup.Success)
        {
            parsed = int.TryParse(
                hexGroup.Value,
                NumberStyles.AllowHexSpecifier,
                CultureInfo.InvariantCulture,
                out codePoint);
        }
        else
        {
            parsed = int.TryParse(
                match.Groups["dec"].Value,
                NumberStyles.None,
                CultureInfo.InvariantCulture,
                out codePoint);
        }

        // Hex parsing can yield a negative int (e.g. FFFFFFFF), hence the lower bound.
        if (!parsed || codePoint < 0 || codePoint > MaxCodePoint)
        {
            return match.Value;
        }

        if (codePoint <= char.MaxValue)
        {
            // Single UTF-16 code unit. Surrogate values are passed through as-is
            // so that a pair written as two separate references still combines.
            return ((char) codePoint).ToString();
        }

        // Supplementary-plane code point: needs a surrogate pair.
        return char.ConvertFromUtf32(codePoint);
    }

    /// <summary>
    /// Walks <paramref name="original"/> character by character, copying tags
    /// verbatim and passing each run of text found between tags through
    /// <paramref name="converter"/>.
    /// </summary>
    /// <param name="original">HTML fragment to scan.</param>
    /// <param name="converter">Transformation applied to each inter-tag text run.</param>
    /// <returns>The reassembled fragment.</returns>
    private static string ConvertInnerText(string original, Func<string, string> converter)
    {
        var output = new StringBuilder(original.Length);
        var pendingText = new StringBuilder();
        int tagDepth = 0;
        bool hasFoundHtml = false;

        foreach (char character in original)
        {
            if (character == '<')
            {
                hasFoundHtml = true;
                if (tagDepth == 0)
                {
                    FlushText(pendingText, output, converter);
                }

                tagDepth += 1;
                output.Append(character);
            }
            else if (character == '>' && tagDepth > 0)
            {
                tagDepth -= 1;
                output.Append(character);
            }
            else if (tagDepth == 0 && hasFoundHtml)
            {
                // Text between tags. A stray '>' outside any tag also lands
                // here so it stays in order with the surrounding text.
                pendingText.Append(character);
            }
            else
            {
                // Inside a tag, or before the first tag: copy verbatim.
                output.Append(character);
            }
        }

        // Text following the last tag must be emitted as well.
        FlushText(pendingText, output, converter);
        return output.ToString();
    }

    /// <summary>
    /// Appends the converted form of <paramref name="pendingText"/> to
    /// <paramref name="output"/> and empties the pending buffer.
    /// </summary>
    private static void FlushText(
        StringBuilder pendingText,
        StringBuilder output,
        Func<string, string> converter)
    {
        if (pendingText.Length == 0)
        {
            return;
        }

        output.Append(converter.Invoke(pendingText.ToString()));
        pendingText.Length = 0;
    }
}
}