Java - convert named html entities to numbered xml

2020-03-02 03:27发布

问题:

I'm looking to convert an html block that contains html named entities to an xml compliant block that uses numbered xml entities while leaving all html tag elements in place.

This is the basic idea illustrated via test:

/**
 * Illustrates the desired conversion: the named entity {@code &nbsp;} becomes the
 * numeric reference {@code &#160;} while the surrounding anchor markup is untouched.
 */
@Test
public void testEvalHtmlEntitiesToXmlEntities() {
    final String html = "<a href=\"test.html\">link&nbsp;</a>";
    final String converted = SomeUtil.eval(html);
    Assert.assertEquals("<a href=\"test.html\">link&#160;</a>", converted);
}

Is anyone aware of a Class that provides this functionality? I can write a regex to iterate through non element matches and do:

xlmString += StringEscapeUtils.escapeXml(StringEscapeUtils.unescapeHtml(htmlString));

but hoped there is an easier way or a Class that already provides this.

回答1:

Have you tried with JTidy?

/**
 * Runs the given markup through JTidy and returns the tidied body content as XML text.
 *
 * @param data markup to clean; it is handed to JTidy as UTF-8 bytes
 * @return the text JTidy wrote to its output stream, decoded as UTF-8
 * @throws UnsupportedEncodingException if the UTF-8 charset name is not supported
 */
private String cleanData(String data) throws UnsupportedEncodingException {
    final String charsetName = "UTF-8";

    Tidy cleaner = new Tidy();
    cleaner.setInputEncoding(charsetName);
    cleaner.setOutputEncoding(charsetName);
    // Emit only the body content, serialized as XML.
    cleaner.setPrintBodyOnly(true);
    cleaner.setXmlOut(true);
    cleaner.setSmartIndent(true);

    ByteArrayInputStream source = new ByteArrayInputStream(data.getBytes(charsetName));
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    cleaner.parseDOM(source, sink);

    return sink.toString(charsetName);
}

Be aware, though, that JTidy will also repair any malformed HTML it finds in the input.



回答2:

This might be useful to you.

private static Map<String, String> entityMap = new HashMap<String, String>();
static{
    entityMap.put("nbsp", "&#160;");
    entityMap.put("iexcl", "&#161;");
    entityMap.put("cent", "&#162;");
    entityMap.put("pound", "&#163;");
    entityMap.put("curren", "&#164;");
    entityMap.put("yen", "&#165;");
    entityMap.put("brvbar", "&#166;");
    entityMap.put("sect", "&#167;");
    entityMap.put("uml", "&#168;");
    entityMap.put("copy", "&#169;");
    entityMap.put("ordf", "&#170;");
    entityMap.put("laquo", "&#171;");
    entityMap.put("not", "&#172;");
    entityMap.put("shy", "&#173;");
    entityMap.put("reg", "&#174;");
    entityMap.put("macr", "&#175;");
    entityMap.put("deg", "&#176;");
    entityMap.put("plusmn", "&#177;");
    entityMap.put("sup2", "&#178;");
    entityMap.put("sup3", "&#179;");
    entityMap.put("acute", "&#180;");
    entityMap.put("micro", "&#181;");
    entityMap.put("para", "&#182;");
    entityMap.put("middot", "&#183;");
    entityMap.put("cedil", "&#184;");
    entityMap.put("sup1", "&#185;");
    entityMap.put("ordm", "&#186;");
    entityMap.put("raquo", "&#187;");
    entityMap.put("frac14", "&#188;");
    entityMap.put("frac12", "&#189;");
    entityMap.put("frac34", "&#190;");
    entityMap.put("iquest", "&#191;");
    entityMap.put("Agrave", "&#192;");
    entityMap.put("Aacute", "&#193;");
    entityMap.put("Acirc", "&#194;");
    entityMap.put("Atilde", "&#195;");
    entityMap.put("Auml", "&#196;");
    entityMap.put("Aring", "&#197;");
    entityMap.put("AElig", "&#198;");
    entityMap.put("Ccedil", "&#199;");
    entityMap.put("Egrave", "&#200;");
    entityMap.put("Eacute", "&#201;");
    entityMap.put("Ecirc", "&#202;");
    entityMap.put("Euml", "&#203;");
    entityMap.put("Igrave", "&#204;");
    entityMap.put("Iacute", "&#205;");
    entityMap.put("Icirc", "&#206;");
    entityMap.put("Iuml", "&#207;");
    entityMap.put("ETH", "&#208;");
    entityMap.put("Ntilde", "&#209;");
    entityMap.put("Ograve", "&#210;");
    entityMap.put("Oacute", "&#211;");
    entityMap.put("Ocirc", "&#212;");
    entityMap.put("Otilde", "&#213;");
    entityMap.put("Ouml", "&#214;");
    entityMap.put("times", "&#215;");
    entityMap.put("Oslash", "&#216;");
    entityMap.put("Ugrave", "&#217;");
    entityMap.put("Uacute", "&#218;");
    entityMap.put("Ucirc", "&#219;");
    entityMap.put("Uuml", "&#220;");
    entityMap.put("Yacute", "&#221;");
    entityMap.put("THORN", "&#222;");
    entityMap.put("szlig", "&#223;");
    entityMap.put("agrave", "&#224;");
    entityMap.put("aacute", "&#225;");
    entityMap.put("acirc", "&#226;");
    entityMap.put("atilde", "&#227;");
    entityMap.put("auml", "&#228;");
    entityMap.put("aring", "&#229;");
    entityMap.put("aelig", "&#230;");
    entityMap.put("ccedil", "&#231;");
    entityMap.put("egrave", "&#232;");
    entityMap.put("eacute", "&#233;");
    entityMap.put("ecirc", "&#234;");
    entityMap.put("euml", "&#235;");
    entityMap.put("igrave", "&#236;");
    entityMap.put("iacute", "&#237;");
    entityMap.put("icirc", "&#238;");
    entityMap.put("iuml", "&#239;");
    entityMap.put("eth", "&#240;");
    entityMap.put("ntilde", "&#241;");
    entityMap.put("ograve", "&#242;");
    entityMap.put("oacute", "&#243;");
    entityMap.put("ocirc", "&#244;");
    entityMap.put("otilde", "&#245;");
    entityMap.put("ouml", "&#246;");
    entityMap.put("divide", "&#247;");
    entityMap.put("oslash", "&#248;");
    entityMap.put("ugrave", "&#249;");
    entityMap.put("uacute", "&#250;");
    entityMap.put("ucirc", "&#251;");
    entityMap.put("uuml", "&#252;");
    entityMap.put("yacute", "&#253;");
    entityMap.put("thorn", "&#254;");
    entityMap.put("yuml", "&#255;");
    entityMap.put("fnof", "&#192;");
    entityMap.put("Alpha", "&#913;");
    entityMap.put("Beta", "&#914;");
    entityMap.put("Gamma", "&#915;");
    entityMap.put("Delta", "&#916;");
    entityMap.put("Epsilon", "&#917;");
    entityMap.put("Zeta", "&#918;");
    entityMap.put("Eta", "&#919;");
    entityMap.put("Theta", "&#920;");
    entityMap.put("Iota", "&#921;");
    entityMap.put("Kappa", "&#922;");
    entityMap.put("Lambda", "&#923;");
    entityMap.put("Mu", "&#924;");
    entityMap.put("Nu", "&#925;");
    entityMap.put("Xi", "&#926;");
    entityMap.put("Omicron", "&#927;");
    entityMap.put("Pi", "&#928;");
    entityMap.put("Rho", "&#929;");
    entityMap.put("Sigma", "&#931;");
    entityMap.put("Tau", "&#932;");
    entityMap.put("Upsi", "&#933;");
    entityMap.put("Phi", "&#934;");
    entityMap.put("Chi", "&#935;");
    entityMap.put("Psi", "&#936;");
    entityMap.put("Omega", "&#937;");
    entityMap.put("alpha", "&#945;");
    entityMap.put("beta", "&#946;");
    entityMap.put("gamma", "&#947;");
    entityMap.put("delta", "&#948;");
    entityMap.put("epsi", "&#949;");
    entityMap.put("zeta", "&#950;");
    entityMap.put("eta", "&#951;");
    entityMap.put("theta", "&#952;");
    entityMap.put("iota", "&#953;");
    entityMap.put("kappa", "&#954;");
    entityMap.put("lambda", "&#955;");
    entityMap.put("mu", "&#956;");
    entityMap.put("nu", "&#957;");
    entityMap.put("xi", "&#958;");
    entityMap.put("omicron", "&#959;");
    entityMap.put("pi", "&#960;");
    entityMap.put("rho", "&#961;");
    entityMap.put("sigmaf", "&#962;");
    entityMap.put("sigma", "&#963;");
    entityMap.put("tau", "&#964;");
    entityMap.put("upsi", "&#965;");
    entityMap.put("phi", "&#966;");
    entityMap.put("chi", "&#967;");
    entityMap.put("psi", "&#968;");
    entityMap.put("omega", "&#969;");
    entityMap.put("theta", "&#977;");
    entityMap.put("upsih", "&#978;");
    entityMap.put("piv", "&#982;");
    entityMap.put("bull", "&#8226;");
    entityMap.put("hellip", "&#8230;");
    entityMap.put("prime", "&#8242;");
    entityMap.put("Prime", "&#8243;");
    entityMap.put("oline", "&#8254;");
    entityMap.put("frasl", "&#8260;");
    entityMap.put("weierp", "&#8472;");
    entityMap.put("image", "&#8465;");
    entityMap.put("real", "&#8476;");
    entityMap.put("trade", "&#8482;");
    entityMap.put("alefsym", "&#8501;");
    entityMap.put("larr", "&#8592;");
    entityMap.put("uarr", "&#8593;");
    entityMap.put("rarr", "&#8594;");
    entityMap.put("darr", "&#8595;");
    entityMap.put("harr", "&#8596;");
    entityMap.put("crarr", "&#8629;");
    entityMap.put("lArr", "&#8656;");
    entityMap.put("uArr", "&#8657;");
    entityMap.put("rArr", "&#8658;");
    entityMap.put("dArr", "&#8659;");
    entityMap.put("hArr", "&#8660;");
    entityMap.put("forall", "&#8704;");
    entityMap.put("part", "&#8706;");
    entityMap.put("exist", "&#8707;");
    entityMap.put("empty", "&#8709;");
    entityMap.put("nabla", "&#8711;");
    entityMap.put("isin", "&#8712;");
    entityMap.put("notin", "&#8713;");
    entityMap.put("ni", "&#8715;");
    entityMap.put("prod", "&#8719;");
    entityMap.put("sum", "&#8722;");
    entityMap.put("minus", "&#8722;");
    entityMap.put("lowast", "&#8727;");
    entityMap.put("radic", "&#8730;");
    entityMap.put("prop", "&#8733;");
    entityMap.put("infin", "&#8734;");
    entityMap.put("ang", "&#8736;");
    entityMap.put("and", "&#8869;");
    entityMap.put("or", "&#8870;");
    entityMap.put("cap", "&#8745;");
    entityMap.put("cup", "&#8746;");
    entityMap.put("int", "&#8747;");
    entityMap.put("there4", "&#8756;");
    entityMap.put("sim", "&#8764;");
    entityMap.put("cong", "&#8773;");
    entityMap.put("asymp", "&#8773;");
    entityMap.put("ne", "&#8800;");
    entityMap.put("equiv", "&#8801;");
    entityMap.put("le", "&#8804;");
    entityMap.put("ge", "&#8805;");
    entityMap.put("sub", "&#8834;");
    entityMap.put("sup", "&#8835;");

    entityMap.put("nsub", "&#8836;");
    entityMap.put("sube", "&#8838;");
    entityMap.put("supe", "&#8839;");
    entityMap.put("oplus", "&#8853;");
    entityMap.put("otimes", "&#8855;");
    entityMap.put("perp", "&#8869;");
    entityMap.put("sdot", "&#8901;");

    entityMap.put("lceil", "&#8968;");
    entityMap.put("rceil", "&#8969;");
    entityMap.put("lfloor", "&#8970;");
    entityMap.put("rfloor", "&#8971;");
    entityMap.put("lang", "&#9001;");


    entityMap.put("loz", "&#9674;");

    entityMap.put("spades", "&#9824;");
    entityMap.put("clubs", "&#9827;");
    entityMap.put("hearts", "&#9829;");
    entityMap.put("diams", "&#9830;");


    entityMap.put("quot", "&#34;");
    entityMap.put("amp", "&#38;");
    entityMap.put("lt", "&#60;");
    entityMap.put("gt", "&#62;");

    entityMap.put("OElig", "&#338;");
    entityMap.put("oelig", "&#339;");
    entityMap.put("Scaron", "&#352;");
    entityMap.put("scaron", "&#353;");
    entityMap.put("Yuml", "&#376;");

    entityMap.put("circ", "&#710;");
    entityMap.put("tilde", "&#732;");

    entityMap.put("ensp", "&#8194;");
    entityMap.put("emsp", "&#8195;");
    entityMap.put("thinsp", "&#8201;");
    entityMap.put("zwnj", "&#8204;");
    entityMap.put("zwj", "&#8205;");
    entityMap.put("lrm", "&#8206;");
    entityMap.put("rlm", "&#8207;");
    entityMap.put("ndash", "&#8211;");
    entityMap.put("mdash", "&#8212;");
    entityMap.put("lsquo", "&#8216;");
    entityMap.put("rsquo", "&#8217;");
    entityMap.put("sbquo", "&#8218;");
    entityMap.put("ldquo", "&#8220;");
    entityMap.put("rdquo", "&#8221;");
    entityMap.put("bdquo", "&#8222;");
    entityMap.put("dagger", "&#8224;");
    entityMap.put("Dagger", "&#8225;");
    entityMap.put("permil", "&#8240;");
    entityMap.put("lsaquo", "&#8249;");
    entityMap.put("rsaquo", "&#8250;");
}

Then I simply prepend these entity declarations to the document as an internal DOCTYPE subset:

  // Build an XML prolog whose internal DTD subset declares every entry of entityMap
  // as an entity, then put that prolog in front of the converted markup.
  StringBuffer buffer = new StringBuffer();
  buffer.append("<?xml version=\"1.0\"?> ").append(" <!DOCTYPE some_name [ ");
  for (Entry<String, String> declaration : entityMap.entrySet()) {
      buffer.append("<!ENTITY ")
            .append(declaration.getKey())
            .append(" \"")
            .append(declaration.getValue())
            .append("\">");
  }
  buffer.append(" ]>");

  convertedData = buffer.toString() + convertedData;


回答3:

If you already have commons-lang on your classpath, look into the arrays in EntityArrays; they contain the mapping for all the entities.

To get the numeric value, just use codePointAt(0) on the first element (the Unicode character).

Now you need a regex-based loop to search for &[^;]+;. This is pretty safe since & is a special character which needs to be escaped. If you need to be 100% sure, look for CDATA elements and ignore them.



回答4:

This is what I wound up using. Seems to work fine:

/**
 * Some helper methods for HTML => XHTML manipulation: rewrites HTML character
 * entities so the markup can be consumed as XML.
 * 
 * @author David Maple<d@davemaple.com>
 *
 */
public class XhtmlUtil {

    /**
     * Matches one well-formed entity reference: a named entity ({@code &nbsp;}),
     * a decimal reference ({@code &#160;}) or a hexadecimal reference ({@code &#xA0;}).
     *
     * The pattern is deliberately strict. A looser "anything between & and ;" pattern
     * also matches stray ampersands followed by markup (for example the
     * {@code &b=2">text;} part of an unescaped query string), and escaping such a
     * match would corrupt the surrounding tags.
     */
    private static final Pattern ENTITY_PATTERN =
            Pattern.compile("&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);");

    /**
     * Don't instantiate me
     */
    private XhtmlUtil() { } 

    /**
     * Convert a String of HTML with named HTML entities to the 
     * same String with entities converted to numbered XML entities.
     * Text outside of entity references, including all tags, is copied unchanged.
     * 
     * @param html the HTML text to convert; may be {@code null}
     * @return the text with each entity reference re-encoded, or {@code null}
     *         if {@code html} was {@code null}
     */
    public static String htmlToXmlEntities(String html) {
        if (html == null) {
            return null;
        }

        Matcher matcher = ENTITY_PATTERN.matcher(html);
        StringBuilder result = new StringBuilder(html.length());
        int copiedUpTo = 0;

        while (matcher.find()) {
            // Copy the untouched text before the entity, then the converted entity.
            // Appending directly (rather than via Matcher.appendReplacement) avoids
            // any special treatment of '$' and '\' in the replacement text.
            result.append(html, copiedUpTo, matcher.start());
            result.append(htmlEntityToXmlEntity(matcher.group()));
            copiedUpTo = matcher.end();
        }

        result.append(html, copiedUpTo, html.length());
        return result.toString();
    }

    /**
     * Replace an HTML entity with an XML entity by unescaping it as HTML
     * and escaping the result as XML.
     * 
     * @param htmlEntity a single entity reference such as {@code &nbsp;}
     * @return xmlEntity the result of the unescape/escape round trip
     */
    private static String htmlEntityToXmlEntity(String htmlEntity) {
        return StringEscapeUtils.escapeXml(StringEscapeUtils.unescapeHtml(htmlEntity));
    }

}

and the corresponding tests:

/**
 * Tests for {@code XhtmlUtil.htmlToXmlEntities}.
 */
public class XhtmlUtilTest {

    /** An anchor whose text ends with a named non-breaking space. */
    private static final String LINK_HTML = "<a href=\"test.html\">link&nbsp;</a>";

    /** The same anchor with the entity written as a numeric reference. */
    private static final String LINK_XML = "<a href=\"test.html\">link&#160;</a>";

    /** Entities in plain text are converted; a bare ampersand is left alone. */
    @Test
    public void testEvalXmlEscape() {
        assertConverted(
                "link 1 &#160;|&#160; link2 &amp; & dkdk;",
                "link 1 &nbsp;|&nbsp; link2 &amp; & dkdk;");
    }

    /** Tags around the entity are preserved. */
    @Test
    public void testEvalXmlEscape2() {
        assertConverted(LINK_XML, LINK_HTML);
    }

    /** Entities on every line of a multi-line input are converted. */
    @Test
    public void testEvalXmlEscapeMultiLine() {
        assertConverted(LINK_XML + "\n" + LINK_XML, LINK_HTML + "\n" + LINK_HTML);
    }

    /**
     * Converts {@code input}, prints the result, and checks it against {@code expected}.
     */
    private static void assertConverted(String expected, String input) {
        String actual = XhtmlUtil.htmlToXmlEntities(input);
        System.out.println(actual);
        Assert.assertEquals(expected, actual);
    }

}


回答5:

Here is another solution that I use

 /**
     * Converts the specified string to text that is safe to embed in XML.
     * Inspired by XMLWriter by http://www.megginson.com/Software/
     *
     * <ul>
     * <li>{@code & < > "} are replaced by {@code &amp; &lt; &gt; &quot;}.</li>
     * <li>Every character above U+007F is written as a decimal numeric character
     *     reference. Supplementary characters (surrogate pairs) produce a single
     *     reference for the full code point.</li>
     * <li>A tab is expanded to four spaces.</li>
     * <li>Characters that XML 1.0 does not allow are dropped: control characters
     *     below U+0020 other than line feed and carriage return, unpaired
     *     surrogates, U+FFFE and U+FFFF.</li>
     * </ul>
     *
     * @param string the text to convert; may be {@code null}
     * @return the converted text, or the empty string if {@code string} is
     *         {@code null} or empty
     */
    public static String convertAsciiToXml(String string) {
        if (string == null || string.isEmpty()) {
            return "";
        }

        final int length = string.length();
        StringBuilder sbuf = new StringBuilder(length + 16);
        int i = 0;
        while (i < length) {
            // Read whole code points so a surrogate pair yields one reference
            // instead of two references to (illegal) surrogate values.
            final int codePoint = string.codePointAt(i);
            i += Character.charCount(codePoint);

            switch (codePoint) {
                case '&':
                    sbuf.append("&amp;");
                    break;
                case '<':
                    sbuf.append("&lt;");
                    break;
                case '>':
                    sbuf.append("&gt;");
                    break;
                case '\"':
                    sbuf.append("&quot;");
                    break;
                case '\t':
                    sbuf.append("    ");
                    break;
                case '\n':
                case '\r':
                    sbuf.append((char) codePoint);
                    break;
                default:
                    if (codePoint > 0x7f) {
                        // codePointAt returns a lone surrogate as-is; neither it nor
                        // the U+FFFE/U+FFFF non-characters may appear in XML.
                        boolean illegalInXml =
                                (codePoint >= 0xD800 && codePoint <= 0xDFFF)
                                || codePoint == 0xFFFE
                                || codePoint == 0xFFFF;
                        if (!illegalInXml) {
                            sbuf.append("&#").append(codePoint).append(';');
                        }
                    }
                    else if (codePoint >= 32) {
                        sbuf.append((char) codePoint);
                    }
                    // Remaining control characters below 32 are intentionally dropped.
            }
        }
        return sbuf.toString();
    }