Strip HTML from Text JavaScript

2018-12-31 00:39发布

Is there an easy way to take a string of html in JavaScript and strip out the html?

30条回答
栀子花@的思念
2楼-- · 2018-12-31 00:59

A lot of people have answered this already, but I thought it might be useful to share the function I wrote that strips HTML tags from a string but allows you to include an array of tags that you do not want stripped. It's pretty short and has been working nicely for me.

function removeTags(string, array){
  return array ? string.split("<").filter(function(val){ return f(array, val); }).map(function(val){ return f(array, val); }).join("") : string.split("<").map(function(d){ return d.split(">").pop(); }).join("");
  function f(array, value){
    return array.map(function(d){ return value.includes(d + ">"); }).indexOf(true) != -1 ? "<" + value : value.split(">")[1];
  }
}

var x = "<span><i>Hello</i> <b>world</b>!</span>";
console.log(removeTags(x)); // Hello world!
console.log(removeTags(x, ["span", "i"])); // <span><i>Hello</i> world!</span>
查看更多
时光乱了年华
3楼-- · 2018-12-31 00:59

Here's a version which sorta addresses @MikeSamuel's security concern:

function strip(html)
{
   try {
       var doc = document.implementation.createDocument('http://www.w3.org/1999/xhtml', 'html', null);
       doc.documentElement.innerHTML = html;
       return doc.documentElement.textContent||doc.documentElement.innerText;
   } catch(e) {
       return "";
   }
}

Note, it will return an empty string if the HTML markup isn't valid XML (aka, tags must be closed and attributes must be quoted). This isn't ideal, but does avoid the issue of having the security exploit potential.

If not having valid XML markup is a requirement for you, you could try using:

var doc = document.implementation.createHTMLDocument("");

but that isn't a perfect solution either for other reasons.

查看更多
初与友歌
4楼-- · 2018-12-31 01:00

input element support only one line text:

The text state represents a one line plain text edit control for the element's value.

function stripHtml(str) {
  var tmp = document.createElement('input');
  tmp.value = str;
  return tmp.value;
}

Update: this works as expected

function stripHtml(str) {
  // Remove some tags
  str = str.replace(/<[^>]+>/gim, '');

  // Remove BB code
  str = str.replace(/\[(\w+)[^\]]*](.*?)\[\/\1]/g, '$2 ');

  // Remove html and line breaks
  const div = document.createElement('div');
  div.innerHTML = str;

  const input = document.createElement('input');
  input.value = div.textContent || div.innerText || '';

  return input.value;
}
查看更多
听够珍惜
5楼-- · 2018-12-31 01:02
var text = html.replace(/<\/?("[^"]*"|'[^']*'|[^>])*(>|$)/g, "");

This is a regex version, which is more resilient to malformed HTML, like:

Unclosed tags

Some text <img

"<", ">" inside tag attributes

Some text <img alt="x > y">

Newlines

Some <a href="http://google.com">

The code

var html = '<br>This <img alt="a>b" \r\n src="a_b.gif" />is > \nmy<>< > <a>"text"</a'
var text = html.replace(/<\/?("[^"]*"|'[^']*'|[^>])*(>|$)/g, "");
查看更多
无与为乐者.
6楼-- · 2018-12-31 01:04
function stripHTML(my_string){
    var charArr   = my_string.split(''),
        resultArr = [],
        htmlZone  = 0,
        quoteZone = 0;
    for( x=0; x < charArr.length; x++ ){
     switch( charArr[x] + htmlZone + quoteZone ){
       case "<00" : htmlZone  = 1;break;
       case ">10" : htmlZone  = 0;resultArr.push(' ');break;
       case '"10' : quoteZone = 1;break;
       case "'10" : quoteZone = 2;break;
       case '"11' : 
       case "'12" : quoteZone = 0;break;
       default    : if(!htmlZone){ resultArr.push(charArr[x]); }
     }
    }
    return resultArr.join('');
}

Accounts for > inside attributes and <img onerror="javascript"> in newly created dom elements.

usage:

clean_string = stripHTML("string with <html> in it")

demo:

https://jsfiddle.net/gaby_de_wilde/pqayphzd/

demo of top answer doing the terrible things:

https://jsfiddle.net/gaby_de_wilde/6f0jymL6/1/

查看更多
何处买醉
7楼-- · 2018-12-31 01:04

It is also possible to use the fantastic htmlparser2 pure JS HTML parser. Here is a working demo:

var htmlparser = require('htmlparser2');

var body = '<p><div>This is </div>a <span>simple </span> <img src="test"></img>example.</p>';

var result = [];

var parser = new htmlparser.Parser({
    ontext: function(text){
        result.push(text);
    }
}, {decodeEntities: true});

parser.write(body);
parser.end();

result.join('');

The output will be This is a simple example.

See it in action here: https://tonicdev.com/jfahrenkrug/extract-text-from-html

This works in both node and the browser if you pack you web application using a tool like webpack.

查看更多
登录 后发表回答