-->

XML - System.Xml.XmlException - hexadecimal value

2019-09-01 15:54发布

问题:

I get this error. I searched and found that the cause is illegal characters in my XML, along with the usual solution. But I don't have access to edit any of these files; my job is only to read them and fetch tag values, attribute values and similar data. So I can't replace binary characters such as '\x01' with escapes such as &#01;. I also tried setting CheckCharacters = false in the XmlReaderSettings, but it has no effect: the same error is still thrown.

Is it not possible to fix this with XmlReader? I read that XmlTextReader can skip the exception, but I have already coded all my features using XmlReader. It would be good to find a solution that keeps it; otherwise I would have to change all my code.

My code:

  /// <summary>
  /// Scans every "*.xml" file below the directory named in <c>textBox1.Text</c>
  /// (all sub-directories included). For each <c>ap</c> element that carries
  /// attributes, the file's full path is appended to NAPP.txt when the element's
  /// <c>ap</c> attribute equals "no", and to APP.txt otherwise. A message box is
  /// shown once all files have been processed.
  /// </summary>
  /// <param name="sender">Event source; not used.</param>
  /// <param name="e">Event data; not used.</param>
  private void button1_Click(object sender, EventArgs e)
  {
      var filenames = System.IO.Directory
                  .EnumerateFiles(textBox1.Text, "*.xml", System.IO.SearchOption.AllDirectories)
                  .Select(System.IO.Path.GetFullPath);

      foreach (var f in filenames)
      {
          // Map the DTD names referenced by the documents to their real locations.
          var resolver = new XmlUrlOverrideResolver();
          resolver.DtdFileMap[@"X1.DTD"] = @"\\location\X1.DTD";
          resolver.DtdFileMap[@"R2.DTD"] = @"\\location\X2.DTD";
          resolver.DtdFileMap[@"R5.DTD"] = @"\\location\R5.DTD";

          XmlReaderSettings settings = new XmlReaderSettings();
          settings.DtdProcessing = DtdProcessing.Parse;
          settings.XmlResolver = resolver;

          // Dispose the reader so the underlying file handle is released even
          // when parsing throws part-way through a file.
          using (XmlReader doc = XmlReader.Create(f, settings))
          {
              while (doc.Read())
              {
                  bool isApElement = doc.NodeType == XmlNodeType.Element && doc.Name == "ap";
                  if (!isApElement || !doc.HasAttributes)
                  {
                      continue;
                  }

                  String fin = doc.GetAttribute("ap");

                  // The path is written directly; no intermediate array is needed,
                  // which also removes the 10000-entry limit on matches.
                  string target = fin == "no"
                      ? @"\\location\NAPP.txt"
                      : @"\\location\APP.txt";
                  File.AppendAllText(target, f + Environment.NewLine);
              }
          }
      }

      MessageBox.Show("Done");
  }

回答1:

This is a very simple example of a character "filter" that will replace the 0x06 character with a space:

/// <summary>
/// A <see cref="StreamReader"/> whose buffered <c>Read</c> replaces every 0x06
/// character with a space before handing the data to the caller.
/// Only <see cref="Read(char[], int, int)"/> is filtered.
/// </summary>
public class MyStreamReader : StreamReader {
    /// <summary>Opens the file at <paramref name="path"/> for reading.</summary>
    public MyStreamReader(string path)
        : base(path) {
    }

    /// <summary>
    /// Reads up to <paramref name="count"/> characters into
    /// <paramref name="buffer"/> starting at <paramref name="index"/>, replacing
    /// each 0x06 character that was read with a space.
    /// </summary>
    /// <returns>The number of characters read, as reported by the base reader.</returns>
    public override int Read(char[] buffer, int index, int count) {
        int res = base.Read(buffer, index, count);

        // The characters just read occupy buffer[index .. index + res - 1];
        // only that window may be inspected or modified.
        int end = index + res;
        for (int i = index; i < end; i++) {
            if (buffer[i] == 0x06) {
                buffer[i] = ' ';
            }
        }

        return res;
    }
}

You use it this way:

using (var sr = new MyStreamReader(f)) {
    var doc = XmlReader.Create(sr, settings);

Note that it's very simple because it replaces a character (the 0x06) with another character of the same "length" (the space). If you wanted to replace a character with a "sequence" of characters (to escape it), it would get more complex (not impossible, just "30 minutes of work" difficult).

(I have checked and it seems the XmlTextReader only uses that method and not the Read() method)

As always, when a programmer tells you 30 minutes, it means 0 minutes or 2 hours :-)

This is the "more complex" ReplacingStreamReader:

/// <summary>
/// A <see cref="StreamReader"/> that passes every character it reads through
/// <see cref="ReplaceWith"/> and substitutes the returned string for it.
/// Only the Read methods are supported!
/// </summary>
public class ReplacingStreamReader : StreamReader
{
    /// <summary>Opens the file at <paramref name="path"/> for reading.</summary>
    public ReplacingStreamReader(string path)
        : base(path)
    {
    }

    /// <summary>
    /// Maps a character to its replacement. Returning <c>null</c> keeps the
    /// character, returning an empty string removes it, and any other string is
    /// emitted in its place. When the delegate itself is <c>null</c>, characters
    /// pass through unchanged.
    /// </summary>
    public Func<char, string> ReplaceWith { get; set; }

    /// <summary>
    /// Replacement characters already produced but not yet returned to the
    /// caller; <c>null</c> when nothing is pending.
    /// </summary>
    protected char[] RemainingChars { get; set; }

    /// <summary>Index of the next pending character in <see cref="RemainingChars"/>.</summary>
    protected int RemainingCharsIndex { get; set; }

    /// <summary>
    /// Reads the next (possibly replaced) character.
    /// </summary>
    /// <returns>The character, or -1 at end of stream.</returns>
    public override int Read()
    {
        // Serve characters left over from an earlier multi-character replacement first.
        if (RemainingChars != null)
        {
            int pending = RemainingChars[RemainingCharsIndex];
            RemainingCharsIndex++;

            if (RemainingCharsIndex == RemainingChars.Length)
            {
                RemainingCharsIndex = 0;
                RemainingChars = null;
            }

            return pending;
        }

        while (true)
        {
            int ch = base.Read();

            // End of stream, or no filter installed: pass through.
            if (ch == -1 || ReplaceWith == null)
            {
                return ch;
            }

            string replace = ReplaceWith((char)ch);

            if (replace == null)
            {
                return ch;
            }

            if (replace.Length == 0)
            {
                // The character is removed; move on to the next one.
                continue;
            }

            if (replace.Length > 1)
            {
                // Keep the tail of the replacement for subsequent reads.
                RemainingChars = replace.ToCharArray(1, replace.Length - 1);
                RemainingCharsIndex = 0;
            }

            return replace[0];
        }
    }

    /// <summary>
    /// Reads up to <paramref name="count"/> (possibly replaced) characters into
    /// <paramref name="buffer"/> starting at <paramref name="index"/>.
    /// </summary>
    /// <returns>The number of characters stored; 0 at end of stream.</returns>
    public override int Read(char[] buffer, int index, int count)
    {
        int res = 0;

        // We leave error handling to the StreamReader :-)
        // We handle only "working" parameters
        if (RemainingChars != null && buffer != null && index >= 0 && count > 0 && index + count <= buffer.Length)
        {
            int remainingCharsCount = RemainingChars.Length - RemainingCharsIndex;
            res = Math.Min(remainingCharsCount, count);

            Array.Copy(RemainingChars, RemainingCharsIndex, buffer, index, res);

            RemainingCharsIndex += res;

            if (RemainingCharsIndex == RemainingChars.Length)
            {
                RemainingCharsIndex = 0;
                RemainingChars = null;
            }

            if (res == count)
            {
                return res;
            }

            index += res;
            count -= res;
        }

        while (true)
        {
            int res2 = base.Read(buffer, index, count);

            if (res2 == 0)
            {
                return res;
            }

            if (ReplaceWith == null)
            {
                // No filter: the characters just read are returned as they are.
                return res + res2;
            }

            // Collects output that no longer fits "in place", i.e. everything
            // from the first multi-character replacement onwards.
            List<char> sb = null;

            // Number of characters kept in place, written at buffer[index + j].
            int j = 0;

            for (int i = 0; i < res2; i++)
            {
                char ch = buffer[index + i];
                string replace = ReplaceWith(ch);

                if (sb != null)
                {
                    if (replace == null)
                    {
                        sb.Add(ch);
                    }
                    else
                    {
                        sb.AddRange(replace);
                    }
                }
                else if (replace == null)
                {
                    buffer[index + j] = ch;
                    j++;
                }
                else if (replace.Length == 1)
                {
                    buffer[index + j] = replace[0];
                    j++;
                }
                else if (replace.Length == 0)
                {
                    // We do not advance
                }
                else
                {
                    sb = new List<char>();
                    sb.AddRange(replace);
                }
            }

            if (sb != null)
            {
                // Copy as much overflow as still fits; park the rest for later reads.
                int res3 = Math.Min(sb.Count, count - j);
                sb.CopyTo(0, buffer, index + j, res3);

                if (res3 < sb.Count)
                {
                    RemainingChars = new char[sb.Count - res3];
                    RemainingCharsIndex = 0;
                    sb.CopyTo(res3, RemainingChars, 0, RemainingChars.Length);
                }

                return res + j + res3;
            }

            if (j == 0)
            {
                // Every character read was removed; read again rather than
                // returning 0, which callers interpret as end of stream.
                continue;
            }

            return res + j;
        }
    }
}

Use it like:

using (var sr = new ReplacingStreamReader(f))
{
    sr.ReplaceWith = x =>
    {
        return x == 0x6 ? " " : null;
        // return x == '.' ? "&#160;" : null; // Replace all . with &nbsp;
    };

    var doc = XmlReader.Create(sr, settings);

Be aware that the ReplacingStreamReader doesn't "know" which part of the XML it is modifying, so a "blind" replace is rarely OK :-) Other than this limitation, you can replace any character with any string (returning null from ReplaceWith means "keep the current character", equivalent to returning x.ToString() in the example given; returning string.Empty is valid and means "remove the current character").

The class is quite interesting: it keeps a char[] RemainingChars with the chars that have been read (and filtered by ReplaceWith) but that haven't yet been returned by a Read() method because the passed buffer was too small (the ReplaceWith method could "enlarge" the read string, making it too big for the buffer!). Note that sb is a List<char> instead of a StringBuilder. Using one or the other would probably be nearly equivalent, code-wise.



回答2:

You could first read the content into a string, replace (escape) the offending characters, and then load the result into an XmlReader:

// For each file: load the whole text, escape the illegal control character,
// then parse the corrected text from memory instead of from the file.
foreach (var f in filenames) {
    string text;
    using (StreamReader s = new StreamReader(f,Encoding.UTF8)) {
        text = s.ReadToEnd();
    }
    // A numeric character reference must be terminated by ';'.
    text = text.Replace("\x01", "&#01;"); //replace the content

    //load some settings
    var resolver = new XmlUrlOverrideResolver();
    resolver.DtdFileMap[@"X1.DTD"] = @"\\location\X1.DTD";
    resolver.DtdFileMap[@"R2.DTD"] = @"\\location\X2.DTD";
    resolver.DtdFileMap[@"R5.DTD"] = @"\\location\R5.DTD";
    XmlReaderSettings settings = new XmlReaderSettings();

    settings.DtdProcessing = DtdProcessing.Parse;
    settings.XmlResolver = resolver;
    // References to control characters such as &#01; are rejected by default.
    // NOTE(review): the CheckCharacters documentation says that disabling it
    // turns off checking of character entity references - confirm on the
    // target framework.
    settings.CheckCharacters = false;

    // XmlReader.Create(string, ...) treats the string as a URI, so the XML
    // text has to be wrapped in a TextReader. The original path is passed as
    // the base URI for resolving relative references (e.g. the DTD).
    using (XmlReader doc = XmlReader.Create(new StringReader(text), settings, f)) {
        //perform processing task
        //...
    }
}