From 78bdc469e4a42f0a91206a23f3dced9f66ce8c9c Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Sat, 19 Feb 2022 21:00:15 +0100 Subject: [PATCH] htmlToText: Don't replace inline element tags with spaces They are not meant to be displayed like this, so they look weird sometimes. For example, Mastodon splits long links between spans, so the Fediverse plugin always displayed them broken. --- src/utils/web.py | 21 +++++++++++++++++---- test/test_utils.py | 5 +++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/utils/web.py b/src/utils/web.py index 39c33a0a8..901ac4dc7 100644 --- a/src/utils/web.py +++ b/src/utils/web.py @@ -222,12 +222,16 @@ def getEncoding(s): except: return None + +# From beautifulsoup (version 4.10.0, bs4/builder/__init__.py, line 391) +_block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) + class HtmlToText(HTMLParser, object): """Taken from some eff-bot code on c.l.p.""" entitydefs = entitydefs.copy() entitydefs['nbsp'] = ' ' entitydefs['apos'] = '\'' - def __init__(self, tagReplace=' '): + def __init__(self, tagReplace=None): self.data = [] self.tagReplace = tagReplace super(HtmlToText, self).__init__() @@ -235,11 +239,20 @@ class HtmlToText(HTMLParser, object): def append(self, data): self.data.append(data) + def getTagReplace(self, tag): + if self.tagReplace is None: + if tag in _block_elements: + return ' ' + else: + return '' + else: + return self.tagReplace + def handle_starttag(self, tag, attr): - self.append(self.tagReplace) + self.append(self.getTagReplace(tag)) def handle_endtag(self, tag): - self.append(self.tagReplace) + self.append(self.getTagReplace(tag)) def handle_data(self, data): self.append(data) @@ -267,7 +280,7 @@ class HtmlToText(HTMLParser, 
object): def handle_charref(self, name): self.append(self.unescape('&#%s;' % name)) -def htmlToText(s, tagReplace=' '): +def htmlToText(s, tagReplace=None): """Turns HTML into text. tagReplace is a string to replace HTML tags with. """ encoding = getEncoding(s) diff --git a/test/test_utils.py b/test/test_utils.py index b0a5dc2fb..e8ca9581d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -541,6 +541,11 @@ class NetTest(SupyTestCase): self.assertTrue(f('2001:888:0:1::666')) class WebTest(SupyTestCase): + def testHtmlToText(self): + self.assertEqual( + utils.web.htmlToText('foo<p>bar<span>baz</span>qux</p>quux'), + 'foo barbazqux quux') + def testGetDomain(self): url = 'http://slashdot.org/foo/bar.exe' self.assertEqual(utils.web.getDomain(url), 'slashdot.org')