Strip HTML from strings in Python

2018-12-31 05:22发布

from mechanize import Browser
br = Browser()
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
  print line

When printing a line in an HTML file, I'm trying to find a way to only show the contents of each HTML element and not the formatting itself. If it finds '<a href="whatever.com">some text</a>', it will only print 'some text', '<b>hello</b>' prints 'hello', etc. How would one go about doing this?

标签: python html
21条回答
何处买醉
2楼-- · 2018-12-31 05:35

A python 3 adaption of søren-løvborg's answer

from html.parser import HTMLParser
from html.entities import html5

class HTMLTextExtractor(HTMLParser):
    """ Adaption of http://stackoverflow.com/a/7778368/196732 """
    def __init__(self):
        super().__init__()
        self.result = []

    def handle_data(self, d):
        self.result.append(d)

    def handle_charref(self, number):
        codepoint = int(number[1:], 16) if number[0] in (u'x', u'X') else int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        if name in html5:
            self.result.append(unichr(html5[name]))

    def get_text(self):
        return u''.join(self.result)

def html_to_text(html):
    s = HTMLTextExtractor()
    s.feed(html)
    return s.get_text()
查看更多
怪性笑人.
3楼-- · 2018-12-31 05:36

The Beautiful Soup package does this immediately for you.

from bs4 import BeautifulSoup

soup = BeautifulSoup(html)
text = soup.get_text()
print(text)
查看更多
柔情千种
4楼-- · 2018-12-31 05:38

I have used Eloff's answer successfully for Python 3.1 [many thanks!].

I upgraded to Python 3.2.3, and ran into errors.

The solution, provided here thanks to the responder Thomas K, is to insert super().__init__() into the following code:

def __init__(self):
    self.reset()
    self.fed = []

... in order to make it look like this:

def __init__(self):
    super().__init__()
    self.reset()
    self.fed = []

... and it will work for Python 3.2.3.

Again, thanks to Thomas K for the fix and for Eloff's original code provided above!

查看更多
还给你的自由
5楼-- · 2018-12-31 05:38

This method works flawlessly for me and requires no additional installations:

import re
import htmlentitydefs

def convertentity(m):
    if m.group(1)=='#':
        try:
            return unichr(int(m.group(2)))
        except ValueError:
            return '&#%s;' % m.group(2)
        try:
            return htmlentitydefs.entitydefs[m.group(2)]
        except KeyError:
            return '&%s;' % m.group(2)

def converthtml(s):
    return re.sub(r'&(#?)(.+?);',convertentity,s)

html =  converthtml(html)
html.replace("&nbsp;", " ") ## Get rid of the remnants of certain formatting(subscript,superscript,etc).
查看更多
听够珍惜
6楼-- · 2018-12-31 05:42

I'm parsing Github readmes and I find that the following really works well:

import re
import lxml.html

def strip_markdown(x):
    links_sub = re.sub(r'\[(.+)\]\([^\)]+\)', r'\1', x)
    bold_sub = re.sub(r'\*\*([^*]+)\*\*', r'\1', links_sub)
    emph_sub = re.sub(r'\*([^*]+)\*', r'\1', bold_sub)
    return emph_sub

def strip_html(x):
    return lxml.html.fromstring(x).text_content() if x else ''

And then

readme = """<img src="https://raw.githubusercontent.com/kootenpv/sky/master/resources/skylogo.png" />

            sky is a web scraping framework, implemented with the latest python versions in mind (3.4+). 
            It uses the asynchronous `asyncio` framework, as well as many popular modules 
            and extensions.

            Most importantly, it aims for **next generation** web crawling where machine intelligence 
            is used to speed up the development/maintainance/reliability of crawling.

            It mainly does this by considering the user to be interested in content 
            from *domains*, not just a collection of *single pages*
            ([templating approach](#templating-approach))."""

strip_markdown(strip_html(readme))

Removes all markdown and html correctly.

查看更多
姐姐魅力值爆表
7楼-- · 2018-12-31 05:43

I always used this function to strip HTML tags, as it requires only the Python stdlib:

On Python 2

from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

For Python 3

from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

Note: this works only for 3.1. For 3.2 or above, you need to call the parent class's init function. See Using HTMLParser in Python 3.2

查看更多
登录 后发表回答