I have this HTML:
<td class="0">
<b>Bold Text</b>&nbsp;
<a href=""></a>
</td>
<td class="0">
Regular Text&nbsp;
<a href=""></a>
</td>
Which, when formatted with xpath...
new_html = tree.xpath('//td[@class="0"]/text() | //td[@class="0"]/b/text()')
Prints:
['Bold Text', '', 'Regular Text']
As you can see, the `&nbsp;` (no-break space) character hasn't been ignored and is actually read as an extra entry in the td. How can I get cleaner output?
Note: I'm posting this not so much as an answer, but as an interesting thing (that I did not know) about XPath's `normalize-space()`. This might help other users.
It looks like `normalize-space()`, which I would have suggested here, does not remove 'NO-BREAK SPACE' (U+00A0):
>>> text = '''<html>
... <table>
... <tr>
... <td class="0">
... <b>Bold Text</b>&nbsp;
... <a href=""></a>
... </td>
...
... <td class="0">
... Regular Text&nbsp;
... <a href=""></a>
... </td>
... </tr>
... </table>
... </html>'''
>>> doc = lxml.html.fromstring(text)
>>>
>>> # ouch, &nbsp; is not stripped...
>>> [td.xpath('normalize-space(.)') for td in doc.xpath('.//td[@class="0"]')]
[u'Bold Text\xa0', u'Regular Text\xa0']
>>>
>>> # one needs to strip() like in @alecxe's answer
>>> [td.xpath('normalize-space(.)').strip() for td in doc.xpath('.//td[@class="0"]')]
[u'Bold Text', u'Regular Text']
>>>
Edit:
So I continued looking into whitespace characters and whether or not they are stripped by Python's `strip()` or XPath's `normalize-space()`.
The following is a bit longer than I first wanted, but here's the whole script to test Unicode whitespace codepoints:
>>> import lxml.html
>>> import requests
>>>
>>> whitespace_chars_wikipedia = 'https://en.wikipedia.org/wiki/Whitespace_character#Unicode'
>>> r = requests.get(whitespace_chars_wikipedia)
>>>
>>> doc = lxml.html.fromstring(r.text)
>>>
>>>
>>> import collections
>>> import re
>>>
>>> WhitespaceChar = collections.namedtuple('WhitespaceChar', ['codepoint', 'name', 'decimal', 'named_entity'])
>>> r = re.compile('')
>>> wchars = {}
>>> for table in doc.xpath('''
... .//div[@class="NavHead"][.//strong="Whitespace"]
... /following-sibling::div[@class="NavContent"]
... //table[1]
... |
... .//table[caption="Related characters"]
... '''):
... for row in table.xpath('.//tr[position()>1]'):
... codepoint = row.xpath('string(./td[1]/text()[last()])')
... name = row.xpath('normalize-space(./td[2])').upper()
... decimal = int(row.xpath('string(./td[3])'))
... named_entity = row.xpath('''string(
... ./td[last()]/text()[contains(., "HTML/XML named entity: ")]
... /following-sibling::code
... )''')
... wchars[decimal] = WhitespaceChar(codepoint, name, decimal, named_entity or None)
...
>>>
>>> listitems = "\n".join(
... '<li><i>&#x{wchar.decimal:04X};</i> <b data-decimal="{wchar.decimal}">{wchar.codepoint}</b> <i>&#x{wchar.decimal:04X};</i></li>'.format(wchar=c)
... for c in sorted(wchars.values(), key=lambda c: c.decimal)
... )
>>> text = '''
... <html>
... <body>
... <ul>
... {}
... </ul>
... </body>
... </html>
... '''.format(listitems)
>>> print text
<html>
<body>
<ul>
<li><i>&#x0009;</i> <b data-decimal="9">U+0009</b> <i>&#x0009;</i></li>
<li><i>&#x000A;</i> <b data-decimal="10">U+000A</b> <i>&#x000A;</i></li>
<li><i>&#x000B;</i> <b data-decimal="11">U+000B</b> <i>&#x000B;</i></li>
<li><i>&#x000C;</i> <b data-decimal="12">U+000C</b> <i>&#x000C;</i></li>
<li><i>&#x000D;</i> <b data-decimal="13">U+000D</b> <i>&#x000D;</i></li>
<li><i>&#x0020;</i> <b data-decimal="32">U+0020</b> <i>&#x0020;</i></li>
<li><i>&#x0085;</i> <b data-decimal="133">U+0085</b> <i>&#x0085;</i></li>
<li><i>&#x00A0;</i> <b data-decimal="160">U+00A0</b> <i>&#x00A0;</i></li>
<li><i>&#x1680;</i> <b data-decimal="5760">U+1680</b> <i>&#x1680;</i></li>
<li><i>&#x180E;</i> <b data-decimal="6158">U+180E</b> <i>&#x180E;</i></li>
<li><i>&#x2000;</i> <b data-decimal="8192">U+2000</b> <i>&#x2000;</i></li>
<li><i>&#x2001;</i> <b data-decimal="8193">U+2001</b> <i>&#x2001;</i></li>
<li><i>&#x2002;</i> <b data-decimal="8194">U+2002</b> <i>&#x2002;</i></li>
<li><i>&#x2003;</i> <b data-decimal="8195">U+2003</b> <i>&#x2003;</i></li>
<li><i>&#x2004;</i> <b data-decimal="8196">U+2004</b> <i>&#x2004;</i></li>
<li><i>&#x2005;</i> <b data-decimal="8197">U+2005</b> <i>&#x2005;</i></li>
<li><i>&#x2006;</i> <b data-decimal="8198">U+2006</b> <i>&#x2006;</i></li>
<li><i>&#x2007;</i> <b data-decimal="8199">U+2007</b> <i>&#x2007;</i></li>
<li><i>&#x2008;</i> <b data-decimal="8200">U+2008</b> <i>&#x2008;</i></li>
<li><i>&#x2009;</i> <b data-decimal="8201">U+2009</b> <i>&#x2009;</i></li>
<li><i>&#x200A;</i> <b data-decimal="8202">U+200A</b> <i>&#x200A;</i></li>
<li><i>&#x200B;</i> <b data-decimal="8203">U+200B</b> <i>&#x200B;</i></li>
<li><i>&#x200C;</i> <b data-decimal="8204">U+200C</b> <i>&#x200C;</i></li>
<li><i>&#x200D;</i> <b data-decimal="8205">U+200D</b> <i>&#x200D;</i></li>
<li><i>&#x2028;</i> <b data-decimal="8232">U+2028</b> <i>&#x2028;</i></li>
<li><i>&#x2029;</i> <b data-decimal="8233">U+2029</b> <i>&#x2029;</i></li>
<li><i>&#x202F;</i> <b data-decimal="8239">U+202F</b> <i>&#x202F;</i></li>
<li><i>&#x205F;</i> <b data-decimal="8287">U+205F</b> <i>&#x205F;</i></li>
<li><i>&#x2060;</i> <b data-decimal="8288">U+2060</b> <i>&#x2060;</i></li>
<li><i>&#x3000;</i> <b data-decimal="12288">U+3000</b> <i>&#x3000;</i></li>
<li><i>&#xFEFF;</i> <b data-decimal="65279">U+FEFF</b> <i>&#xFEFF;</i></li>
</ul>
</body>
</html>
>>>
>>>
>>> doc2 = lxml.html.fromstring(text)
>>>
>>> from prettytable import PrettyTable
>>>
>>> x = PrettyTable([
... #"#",
... #"Code point",
... "Name",
... #"Char Python repr",
... "Test string",
... "strip()",
... "normalize-space()"
... ])
>>>
>>> for cnt, li in enumerate(doc2.xpath('.//ul/li'), start=1):
... codepoint = li.xpath('string(b)')
... wc = wchars[li.xpath('number(b/@data-decimal)')]
... tstring = li.xpath('string(.)')
... x.add_row([
... #cnt,
... #wc.codepoint,
... wc.name,
... #repr([unichr(wc.decimal)]).strip('[]'),
... repr([tstring]).strip('[]'),
... tstring.strip() == codepoint,
... li.xpath('normalize-space(.)') == codepoint
... ])
...
Do `strip()` and `normalize-space()` strip these whitespace characters?
>>> print x
+-------------------------------+-------------------------+---------+-------------------+
| Name | Test string | strip() | normalize-space() |
+-------------------------------+-------------------------+---------+-------------------+
| CHARACTER TABULATION | '\t U+0009 \t' | True | True |
| LINE FEED | '\n U+000A \n' | True | True |
| LINE TABULATION | ' U+000B ' | True | True |
| FORM FEED | ' U+000C ' | True | True |
| CARRIAGE RETURN | '\r U+000D \r' | True | True |
| SPACE | ' U+0020 ' | True | True |
| NEXT LINE | u'\x85 U+0085 \x85' | True | False |
| NO-BREAK SPACE | u'\xa0 U+00A0 \xa0' | True | False |
| OGHAM SPACE MARK | u'\u1680 U+1680 \u1680' | True | False |
| MONGOLIAN VOWEL SEPARATOR | u'\u180e U+180E \u180e' | True | False |
| EN QUAD | u'\u2000 U+2000 \u2000' | True | False |
| EM QUAD | u'\u2001 U+2001 \u2001' | True | False |
| EN SPACE | u'\u2002 U+2002 \u2002' | True | False |
| EM SPACE | u'\u2003 U+2003 \u2003' | True | False |
| THREE-PER-EM SPACE | u'\u2004 U+2004 \u2004' | True | False |
| FOUR-PER-EM SPACE | u'\u2005 U+2005 \u2005' | True | False |
| SIX-PER-EM SPACE | u'\u2006 U+2006 \u2006' | True | False |
| FIGURE SPACE | u'\u2007 U+2007 \u2007' | True | False |
| PUNCTUATION SPACE | u'\u2008 U+2008 \u2008' | True | False |
| THIN SPACE | u'\u2009 U+2009 \u2009' | True | False |
| HAIR SPACE | u'\u200a U+200A \u200a' | True | False |
| ZERO WIDTH SPACE | u'\u200b U+200B \u200b' | False | False |
| ZERO WIDTH NON-JOINER | u'\u200c U+200C \u200c' | False | False |
| ZERO WIDTH JOINER | u'\u200d U+200D \u200d' | False | False |
| LINE SEPARATOR | u'\u2028 U+2028 \u2028' | True | False |
| PARAGRAPH SEPARATOR | u'\u2029 U+2029 \u2029' | True | False |
| NARROW NO-BREAK SPACE | u'\u202f U+202F \u202f' | True | False |
| MEDIUM MATHEMATICAL SPACE | u'\u205f U+205F \u205f' | True | False |
| WORD JOINER | u'\u2060 U+2060 \u2060' | False | False |
| IDEOGRAPHIC SPACE | u'\u3000 U+3000 \u3000' | True | False |
| ZERO WIDTH NON-BREAKING SPACE | u'\ufeff U+FEFF \ufeff' | False | False |
+-------------------------------+-------------------------+---------+-------------------+
>>>
Whitespace chars:
>>> pprint.pprint(wchars)
{9: WhitespaceChar(codepoint='U+0009', name='CHARACTER TABULATION', decimal=9, named_entity=None),
10: WhitespaceChar(codepoint='U+000A', name='LINE FEED', decimal=10, named_entity='&NewLine;'),
11: WhitespaceChar(codepoint='U+000B', name='LINE TABULATION', decimal=11, named_entity=None),
12: WhitespaceChar(codepoint='U+000C', name='FORM FEED', decimal=12, named_entity=None),
13: WhitespaceChar(codepoint='U+000D', name='CARRIAGE RETURN', decimal=13, named_entity=None),
32: WhitespaceChar(codepoint='U+0020', name='SPACE', decimal=32, named_entity=None),
133: WhitespaceChar(codepoint='U+0085', name='NEXT LINE', decimal=133, named_entity=None),
160: WhitespaceChar(codepoint='U+00A0', name='NO-BREAK SPACE', decimal=160, named_entity='&nbsp;'),
5760: WhitespaceChar(codepoint='U+1680', name='OGHAM SPACE MARK', decimal=5760, named_entity=None),
6158: WhitespaceChar(codepoint='U+180E', name='MONGOLIAN VOWEL SEPARATOR', decimal=6158, named_entity=None),
8192: WhitespaceChar(codepoint='U+2000', name='EN QUAD', decimal=8192, named_entity=None),
8193: WhitespaceChar(codepoint='U+2001', name='EM QUAD', decimal=8193, named_entity=None),
8194: WhitespaceChar(codepoint='U+2002', name='EN SPACE', decimal=8194, named_entity='&ensp;'),
8195: WhitespaceChar(codepoint='U+2003', name='EM SPACE', decimal=8195, named_entity='&emsp;'),
8196: WhitespaceChar(codepoint='U+2004', name='THREE-PER-EM SPACE', decimal=8196, named_entity='&emsp13;'),
8197: WhitespaceChar(codepoint='U+2005', name='FOUR-PER-EM SPACE', decimal=8197, named_entity='&emsp14;'),
8198: WhitespaceChar(codepoint='U+2006', name='SIX-PER-EM SPACE', decimal=8198, named_entity=None),
8199: WhitespaceChar(codepoint='U+2007', name='FIGURE SPACE', decimal=8199, named_entity='&numsp;'),
8200: WhitespaceChar(codepoint='U+2008', name='PUNCTUATION SPACE', decimal=8200, named_entity='&puncsp;'),
8201: WhitespaceChar(codepoint='U+2009', name='THIN SPACE', decimal=8201, named_entity='&thinsp;'),
8202: WhitespaceChar(codepoint='U+200A', name='HAIR SPACE', decimal=8202, named_entity='&hairsp;'),
8203: WhitespaceChar(codepoint='U+200B', name='ZERO WIDTH SPACE', decimal=8203, named_entity=None),
8204: WhitespaceChar(codepoint='U+200C', name='ZERO WIDTH NON-JOINER', decimal=8204, named_entity='&zwnj;'),
8205: WhitespaceChar(codepoint='U+200D', name='ZERO WIDTH JOINER', decimal=8205, named_entity='&zwj;'),
8232: WhitespaceChar(codepoint='U+2028', name='LINE SEPARATOR', decimal=8232, named_entity=None),
8233: WhitespaceChar(codepoint='U+2029', name='PARAGRAPH SEPARATOR', decimal=8233, named_entity=None),
8239: WhitespaceChar(codepoint='U+202F', name='NARROW NO-BREAK SPACE', decimal=8239, named_entity=None),
8287: WhitespaceChar(codepoint='U+205F', name='MEDIUM MATHEMATICAL SPACE', decimal=8287, named_entity='&MediumSpace;'),
8288: WhitespaceChar(codepoint='U+2060', name='WORD JOINER', decimal=8288, named_entity='&NoBreak;'),
12288: WhitespaceChar(codepoint='U+3000', name='IDEOGRAPHIC SPACE', decimal=12288, named_entity=None),
65279: WhitespaceChar(codepoint='U+FEFF', name='ZERO WIDTH NON-BREAKING SPACE', decimal=65279, named_entity=None)}
>>>