I have a string from an RSS file after parsing.
String htmlString=
<p><img border="1" align="left" width="200" vspace="2" hspace="2" height="133" alt="Prime Minister Manmohan Singh will leave for Iran on August 28, 2012 to attend the Non-Aligned Movement summit, which will be preceded by crucial bilateral talks with Iran’s supreme leader Ayotollah Ali Khamenei and Iranian President Mahmoud Ahmadinejad." src="/tmdbuserfiles/manmohan ahmadi(3).jpg" />Prime Minister summit, which will be preceded by crucial bilateral talks with Iran’s supreme leader place at a time when the U.S. is pushing India to reduce engagement with Iran and implement sanctions imposed by some countries over its controversial nuclear programme.<br />
<br />
</p>
I need to display the text from the htmlString above on my LWUIT Form, without any HTML tags or HTML special characters (entities), like this:
Prime Minister ManmohanSingh will leave for Iran on August 28, 2012 to attend the Non-Aligned Movement summit, which will
be preceded by crucial bilateral talks with Iran supreme leader Ayotollah Ali Khamenei and Iranian etc...........?
It also helps to open the HttpConnection input stream with UTF-8 encoding, like this:
// Charset name passed to the reader below.
String encoding = "UTF-8";
// Wrap the raw byte stream in a Reader that decodes it with the charset above.
// NOTE(review): `in` is not declared in this snippet; presumably it is the
// InputStream obtained from the HttpConnection - confirm against the caller.
Reader reader = new InputStreamReader(in, encoding);
Use this suite of string utilities to get clean, well-formatted text:
/**
 * Removes HTML tags from the given string and decodes the entities handled by
 * {@code decodeEntities}.
 *
 * <p>Closing paragraph tags and line-break tags ({@code </p>}, {@code <br>},
 * {@code <br/>}, {@code <br />}) are turned into {@code "\r\n"} before the
 * remaining tags are stripped. Text following the last tag is kept. A
 * {@code <} that is never closed by {@code >} is treated as literal text.
 * When tags were stripped, the result is trimmed.
 *
 * @param text Input parameter containing HTML tags (eg. <b>cat</b>); may be null
 * @return String without HTML tags (eg. cat); null if {@code text} is null;
 *         the unmodified input if an unexpected error occurs
 */
public static String removeHtml(String text) {
    if (text == null) {
        return null;
    }
    try {
        // Fast path: no markup at all, only entities may need decoding.
        if (text.indexOf('<') == -1) {
            return decodeEntities(text);
        }

        // Preserve paragraph and line breaks before the tags are discarded.
        String htmlText = StringUtils.replace(text, "</p>", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br/>", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br />", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br>", "\r\n");

        StringBuffer plainText = new StringBuffer(htmlText.length());
        int pos = 0;
        int tagStart = htmlText.indexOf('<');
        while (tagStart != -1) {
            plainText.append(htmlText.substring(pos, tagStart));
            int tagEnd = htmlText.indexOf('>', tagStart);
            if (tagEnd == -1) {
                // Unterminated "<": keep it and everything after it as text
                // instead of looping forever on the same index.
                pos = tagStart;
                break;
            }
            pos = tagEnd + 1;
            tagStart = htmlText.indexOf('<', pos);
        }
        // Keep whatever follows the last tag.
        plainText.append(htmlText.substring(pos));

        return decodeEntities(plainText.toString().trim());
    } catch (Exception e) {
        // Best effort: report the problem and hand back the input untouched.
        System.err.println("Error while removing HTML: " + e.toString());
        return text;
    }
}
public static String decodeEntities(String html) {
String result = StringUtils.replace(html, "<", "<");
result = StringUtils.replace(result, ">", ">");
result = StringUtils.replace(result, " ", " ");
result = StringUtils.replace(result, "&", "&");
result = StringUtils.replace(result, "ä", "ä");
result = StringUtils.replace(result, "ö", "ö");
result = StringUtils.replace(result, """, "'");
result = StringUtils.replace(result, "&lquot;", "'");
result = StringUtils.replace(result, "&rquot;", "'");
result = StringUtils.replace(result, "
", "\r");
return result;
}
/**
 * Replaces all instances of a String in a String.
 *
 * <p>Matches are found left to right and do not overlap; inserted
 * replacement text is never re-scanned.
 *
 * @param s String to alter.
 * @param f String to look for.
 * @param r String to replace it with, or null to just remove it.
 * @return the altered string; {@code s} itself when {@code s} or {@code f}
 *         is null, {@code f} is empty, or {@code f} does not occur in {@code s}
 */
public static String replace(String s, String f, String r) {
    // An empty search string matches at every index and would never terminate.
    if (s == null || f == null || f.length() == 0) {
        return s;
    }
    int match = s.indexOf(f);
    if (match == -1) {
        return s;
    }
    if (r == null) {
        r = "";
    }
    // Build the result in one pass instead of re-creating the whole string per match.
    StringBuffer out = new StringBuffer(s.length());
    int pos = 0;
    while (match != -1) {
        out.append(s.substring(pos, match));
        out.append(r);
        pos = match + f.length();
        match = s.indexOf(f, pos);
    }
    out.append(s.substring(pos));
    return out.toString();
}
/**
 * Re-reads the bytes of {@code str} as UTF-8 text.
 *
 * <p>The string is converted to bytes and those bytes are decoded again with
 * a UTF-8 reader. Every decoded character is kept as a full {@code char}, so
 * characters above U+00FF are not truncated.
 *
 * @param str the string to re-decode; may be null
 * @return the re-decoded string; null if {@code str} is null; {@code str}
 *         itself if decoding fails
 */
public static String cleanEncodedString(String str) {
    if (str == null) {
        return null;
    }
    String encoding = "UTF-8";
    // NOTE(review): getBytes() uses the platform default charset. Presumably the
    // input was mis-decoded with that charset and this call recovers the
    // original bytes - confirm, and name the charset explicitly if so.
    InputStream in = new ByteArrayInputStream(str.getBytes());
    try {
        InputStreamReader isr = new InputStreamReader(in, encoding);
        StringBuffer decoded = new StringBuffer(str.length());
        int ch = isr.read();
        while (ch != -1) {
            // read() yields a full UTF-16 code unit; narrowing it to a byte would corrupt it.
            decoded.append((char) ch);
            ch = isr.read();
        }
        return decoded.toString();
    } catch (java.io.IOException ioe) {
        // Best effort: report the failure and fall back to the input.
        ioe.printStackTrace();
        return str;
    }
}
// Extract the value of the first alt="..." attribute in htmlString and decode
// the entities it may contain. resultantString stays empty when no complete
// alt attribute is present.
int startIndex = htmlString.indexOf("alt=\"");
// Only look for the closing quote when the attribute was actually found;
// 5 is the length of the prefix alt=" that is skipped.
int endIndex = (startIndex == -1) ? -1 : htmlString.indexOf("\"", startIndex + 5);
String resultantString = "";
if (endIndex != -1) {
    resultantString = htmlString.substring(startIndex + 5, endIndex);
    resultantString = replaceAll(resultantString, "&quot;", "\"");
    // NOTE(review): entity name assumed to be &rsquo; (right single quotation mark, U+2019).
    resultantString = replaceAll(resultantString, "&rsquo;", "\u2019");
    // Decode &amp; last so that text such as "&amp;quot;" is not decoded twice.
    resultantString = replaceAll(resultantString, "&amp;", "&");
}
/**
 * Replaces every occurrence of {@code pattern} in {@code source} with
 * {@code replacement}. The pattern is matched literally, left to right,
 * without overlaps.
 *
 * @param source      text to search; null yields an empty string
 * @param pattern     literal text to look for; when null or empty the source
 *                    is returned unchanged
 * @param replacement text to insert for each match; null removes the match
 * @return the text with all matches replaced, never null
 */
private String replaceAll(String source, String pattern, String replacement) {
    if (source == null) {
        return "";
    }
    // A null pattern cannot be searched for, and an empty one matches at
    // every index and would never terminate.
    if (pattern == null || pattern.length() == 0) {
        return source;
    }
    // Appending a null String would insert the literal text "null".
    String insert = (replacement == null) ? "" : replacement;

    StringBuffer sb = new StringBuffer(source.length());
    int from = 0;
    int hit = source.indexOf(pattern, from);
    while (hit != -1) {
        sb.append(source.substring(from, hit));
        sb.append(insert);
        from = hit + pattern.length();
        hit = source.indexOf(pattern, from);
    }
    sb.append(source.substring(from));
    return sb.toString();
}