From e3ff413734f71c3179e2c20682abac89f26e85ae Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Fri, 23 Oct 2015 07:41:36 +0200 Subject: [PATCH] Web & core: Merge features of Web's title parser and utils.web.HtmlToText + don't unescape HTML twice. Closes GH-1176. --- plugins/Web/plugin.py | 24 ++++++------------------ plugins/Web/test.py | 6 ++++++ src/utils/web.py | 25 ++++++++++++++++--------- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/plugins/Web/plugin.py b/plugins/Web/plugin.py index 1117d0208..f3199fefc 100644 --- a/plugins/Web/plugin.py +++ b/plugins/Web/plugin.py @@ -50,15 +50,13 @@ else: from HTMLParser import HTMLParser from htmlentitydefs import entitydefs -class Title(HTMLParser): +class Title(utils.web.HtmlToText): entitydefs = entitydefs.copy() entitydefs['nbsp'] = ' ' - entitydefs['apos'] = '\'' def __init__(self): self.inTitle = False self.inSvg = False - self.title = '' - HTMLParser.__init__(self) + utils.web.HtmlToText.__init__(self) @property def inHtmlTitle(self): @@ -76,18 +74,9 @@ class Title(HTMLParser): elif tag == 'svg': self.inSvg = False - def handle_data(self, data): + def append(self, data): if self.inHtmlTitle: - self.title += data - - def handle_entityref(self, name): - if self.inHtmlTitle: - if name in self.entitydefs: - self.title += self.entitydefs[name] - - def handle_charref(self, name): - if self.inHtmlTitle: - self.title += (unichr if minisix.PY2 else chr)(int(name)) + super(Title, self).append(data) class DelayedIrc: def __init__(self, irc): @@ -156,16 +145,15 @@ class Web(callbacks.PluginRegexp): return None parser.feed(text) parser.close() - title = parser.title + title = ''.join(parser.data).strip() if title: - title = utils.web.htmlToText(title.strip()) + return title elif raiseErrors: if len(text) < size: irc.reply(_('That URL appears to have no HTML title.')) else: irc.reply(format(_('That URL appears to have no HTML title ' 'within the first %S.'), size)) - return title @fetch_sandbox def 
titleSnarfer(self, irc, msg, match): diff --git a/plugins/Web/test.py b/plugins/Web/test.py index 1de573410..bba44eec4 100644 --- a/plugins/Web/test.py +++ b/plugins/Web/test.py @@ -68,6 +68,12 @@ class WebTestCase(ChannelPluginTestCase): self.assertResponse( 'title http://www.thefreedictionary.com/don%27t', "Don't - definition of don't by The Free Dictionary") + self.assertRegexp( + 'title ' + 'https://twitter.com/rlbarnes/status/656554266744586240', + '"PSA: In Firefox 44 Nightly, "http:" pages with ' + '<input type="password"> are now marked insecure. ' + 'https://t.co/qS9LxuRPdm"$') def testTitleSnarfer(self): try: diff --git a/src/utils/web.py b/src/utils/web.py index 31adac7e1..2ef565b38 100644 --- a/src/utils/web.py +++ b/src/utils/web.py @@ -206,40 +206,47 @@ class HtmlToText(HTMLParser, object): """Taken from some eff-bot code on c.l.p.""" entitydefs = entitydefs.copy() entitydefs['nbsp'] = ' ' + entitydefs['apos'] = '\'' def __init__(self, tagReplace=' '): self.data = [] self.tagReplace = tagReplace super(HtmlToText, self).__init__() + def append(self, data): + self.data.append(data) + def handle_starttag(self, tag, attr): - self.data.append(self.tagReplace) + self.append(self.tagReplace) def handle_endtag(self, tag): - self.data.append(self.tagReplace) + self.append(self.tagReplace) def handle_data(self, data): - self.data.append(data) + self.append(data) def handle_entityref(self, data): if minisix.PY3: if data in name2codepoint: - self.data.append(chr(name2codepoint[data])) + self.append(chr(name2codepoint[data])) elif isinstance(data, bytes): - self.data.append(data.decode()) + self.append(data.decode()) else: - self.data.append(data) + self.append(data) else: if data in name2codepoint: - self.data.append(unichr(name2codepoint[data])) + self.append(unichr(name2codepoint[data])) elif isinstance(data, str): - self.data.append(data.decode('utf8', errors='replace')) + self.append(data.decode('utf8', errors='replace')) else: - self.data.append(data) + self.append(data) def 
getText(self): text = ''.join(self.data).strip() return normalizeWhitespace(text) + def handle_charref(self, name): + self.append((unichr if minisix.PY2 else chr)(int(name))) + def htmlToText(s, tagReplace=' '): """Turns HTML into text. tagReplace is a string to replace HTML tags with. """