htmlToText: Don't replace inline element tags with spaces

Inline elements are not meant to be rendered with whitespace around them, so
replacing their tags with spaces sometimes makes the output look weird.
For example, Mastodon splits long links across several spans, so the Fediverse
plugin always displayed them broken up by spaces.
This commit is contained in:
Valentin Lorentz 2022-02-19 21:00:15 +01:00
parent bc76f7ead4
commit 78bdc469e4
2 changed files with 22 additions and 4 deletions

View File

@ -222,12 +222,16 @@ def getEncoding(s):
except:
return None
# Block-level HTML element names, taken from beautifulsoup
# (version 4.10.0, bs4/builder/__init__.py, line 391).
# HtmlToText.getTagReplace substitutes a space for these tags when no
# explicit replacement string was configured; any other tag is replaced
# with the empty string.
_block_elements = {
    "address", "article", "aside", "blockquote", "canvas",
    "dd", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form",
    "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr",
    "li", "main", "nav", "noscript",
    "ol", "output", "p", "pre",
    "section", "table", "tfoot", "ul", "video",
}
class HtmlToText(HTMLParser, object):
"""Taken from some eff-bot code on c.l.p."""
entitydefs = entitydefs.copy()
entitydefs['nbsp'] = ' '
entitydefs['apos'] = '\''
def __init__(self, tagReplace=' '):
def __init__(self, tagReplace=None):
self.data = []
self.tagReplace = tagReplace
super(HtmlToText, self).__init__()
@ -235,11 +239,20 @@ class HtmlToText(HTMLParser, object):
def append(self, data):
self.data.append(data)
def getTagReplace(self, tag):
    """Return the text that stands in for *tag* in the converted output.

    When a replacement string was configured (``self.tagReplace`` is not
    None), it is used for every tag.  Otherwise tags listed in
    ``_block_elements`` become a single space and all other tags become
    the empty string.
    """
    if self.tagReplace is not None:
        return self.tagReplace
    return ' ' if tag in _block_elements else ''
def handle_starttag(self, tag, attr):
self.append(self.tagReplace)
self.append(self.getTagReplace(tag))
def handle_endtag(self, tag):
self.append(self.tagReplace)
self.append(self.getTagReplace(tag))
def handle_data(self, data):
self.append(data)
@ -267,7 +280,7 @@ class HtmlToText(HTMLParser, object):
def handle_charref(self, name):
self.append(self.unescape('&#%s;' % name))
def htmlToText(s, tagReplace=' '):
def htmlToText(s, tagReplace=None):
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
"""
encoding = getEncoding(s)

View File

@ -541,6 +541,11 @@ class NetTest(SupyTestCase):
self.assertTrue(f('2001:888:0:1::666'))
class WebTest(SupyTestCase):
def testHtmlToText(self):
    # With the default tagReplace, the <p> tags are expected to turn into
    # spaces while the <span> tags vanish without leaving whitespace.
    html = 'foo<p>bar<span>baz</span>qux</p>quux'
    expected = 'foo barbazqux quux'
    self.assertEqual(utils.web.htmlToText(html), expected)
def testGetDomain(self):
url = 'http://slashdot.org/foo/bar.exe'
self.assertEqual(utils.web.getDomain(url), 'slashdot.org')