mirror of
https://github.com/Mikaela/Limnoria.git
synced 2025-01-26 12:04:07 +01:00
htmlToText: Don't replace inline element tags with spaces
Inline elements are not meant to be rendered with spaces around them, so the resulting text sometimes looked odd. For example, Mastodon splits long links across several spans, so the Fediverse plugin always displayed those links broken up by spaces.
This commit is contained in:
parent
bc76f7ead4
commit
78bdc469e4
@ -222,12 +222,16 @@ def getEncoding(s):
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
# From beautifulsoup (version 4.10.0, bs4/builder/__init__.py, line 391)
|
||||
# Names of the HTML tags treated as block-level elements.
_block_elements = {
    "address", "article", "aside", "blockquote", "canvas",
    "dd", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "header", "hr", "li", "main", "nav", "noscript",
    "ol", "output", "p", "pre",
    "section", "table", "tfoot", "ul", "video",
}
|
||||
|
||||
class HtmlToText(HTMLParser, object):
|
||||
"""Taken from some eff-bot code on c.l.p."""
|
||||
entitydefs = entitydefs.copy()
|
||||
entitydefs['nbsp'] = ' '
|
||||
entitydefs['apos'] = '\''
|
||||
def __init__(self, tagReplace=None):
    """Create a parser with an empty output buffer.

    tagReplace is stored unchanged in ``self.tagReplace``.  When it is a
    string, ``getTagReplace`` returns it for every tag; when it is ``None``
    (the default), ``getTagReplace`` picks the replacement per tag.
    """
    # Text fragments collected while parsing.
    self.data = []
    self.tagReplace = tagReplace
    super(HtmlToText, self).__init__()
|
||||
@ -235,11 +239,20 @@ class HtmlToText(HTMLParser, object):
|
||||
def append(self, data):
|
||||
self.data.append(data)
|
||||
|
||||
def getTagReplace(self, tag):
    """Return the text that stands in for an opening or closing *tag*.

    An explicit ``self.tagReplace`` is returned for every tag.  When it is
    ``None``, block-level tags (members of ``_block_elements``) and ``br``
    yield a single space, and every other tag yields the empty string so
    the text on either side of an inline element stays joined.
    """
    if self.tagReplace is not None:
        return self.tagReplace
    # <br> is not block-level, but it is a line break: without a separator
    # "foo<br>bar" would collapse into "foobar".
    if tag == 'br' or tag in _block_elements:
        return ' '
    return ''
|
||||
|
||||
def handle_starttag(self, tag, attr):
    """Append the replacement text for an opening *tag*.

    The text is obtained from ``getTagReplace`` and appended exactly once.
    ``self.tagReplace`` is not appended directly because it may be ``None``.
    *attr* is not used.
    """
    self.append(self.getTagReplace(tag))
|
||||
|
||||
def handle_endtag(self, tag):
    """Append the replacement text for a closing *tag*.

    The text is obtained from ``getTagReplace`` and appended exactly once.
    ``self.tagReplace`` is not appended directly because it may be ``None``.
    """
    self.append(self.getTagReplace(tag))
|
||||
|
||||
def handle_data(self, data):
    """Append a run of text to the output unchanged."""
    self.append(data)
|
||||
@ -267,7 +280,7 @@ class HtmlToText(HTMLParser, object):
|
||||
def handle_charref(self, name):
    """Decode a numeric character reference and append the result.

    *name* is wrapped back into ``&#...;`` form and passed to
    ``self.unescape``; whatever that returns is appended to the output.
    """
    reference = '&#%s;' % name
    # NOTE(review): HTMLParser.unescape was removed in Python 3.9 -- confirm
    # that this class (or a base) still provides ``unescape``.
    decoded = self.unescape(reference)
    self.append(decoded)
|
||||
|
||||
def htmlToText(s, tagReplace=' '):
|
||||
def htmlToText(s, tagReplace=None):
|
||||
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
|
||||
"""
|
||||
encoding = getEncoding(s)
|
||||
|
@ -541,6 +541,11 @@ class NetTest(SupyTestCase):
|
||||
self.assertTrue(f('2001:888:0:1::666'))
|
||||
|
||||
class WebTest(SupyTestCase):
|
||||
def testHtmlToText(self):
    # The block-level <p> tags turn into spaces, while the inline <span>
    # tags vanish so "bar", "baz" and "qux" stay joined.
    html = 'foo<p>bar<span>baz</span>qux</p>quux'
    expected = 'foo barbazqux quux'
    self.assertEqual(utils.web.htmlToText(html), expected)
|
||||
|
||||
def testGetDomain(self):
|
||||
url = 'http://slashdot.org/foo/bar.exe'
|
||||
self.assertEqual(utils.web.getDomain(url), 'slashdot.org')
|
||||
|
Loading…
Reference in New Issue
Block a user