Web & core: Merge features of Web's title parser and utils.web.HtmlToText + don't unescape HTML twice. Closes GH-1176.

2025-12-28 05:57:56 +01:00 · 2015-10-23 07:41:36 +02:00 · 2015-10-23 07:41:36 +02:00 · e3ff413734
commit e3ff413734
parent 9f10f08b2e
3 changed files with 28 additions and 27 deletions
--- a/plugins/Web/plugin.py
+++ b/plugins/Web/plugin.py
@@ -50,15 +50,13 @@ else:
    from HTMLParser import HTMLParser
    from htmlentitydefs import entitydefs

-class Title(HTMLParser):
+class Title(utils.web.HtmlToText):
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '
-    entitydefs['apos'] = '\''
    def __init__(self):
        self.inTitle = False
        self.inSvg = False
-        self.title = ''
-        HTMLParser.__init__(self)
+        utils.web.HtmlToText.__init__(self)

    @property
    def inHtmlTitle(self):
@@ -76,18 +74,9 @@ class Title(HTMLParser):
        elif tag == 'svg':
            self.inSvg = False

-    def handle_data(self, data):
+    def append(self, data):
        if self.inHtmlTitle:
-            self.title += data
-
-    def handle_entityref(self, name):
-        if self.inHtmlTitle:
-            if name in self.entitydefs:
-                self.title += self.entitydefs[name]
-
-    def handle_charref(self, name):
-        if self.inHtmlTitle:
-            self.title += (unichr if minisix.PY2 else chr)(int(name))
+            super(Title, self).append(data)

 class DelayedIrc:
    def __init__(self, irc):
@@ -156,16 +145,15 @@ class Web(callbacks.PluginRegexp):
                return None
        parser.feed(text)
        parser.close()
-        title = parser.title
+        title = ''.join(parser.data).strip()
        if title:
-            title = utils.web.htmlToText(title.strip())
+            return title
        elif raiseErrors:
            if len(text) < size:
                irc.reply(_('That URL appears to have no HTML title.'))
            else:
                irc.reply(format(_('That URL appears to have no HTML title '
                                 'within the first %S.'), size))
-        return title

    @fetch_sandbox
    def titleSnarfer(self, irc, msg, match):
--- a/plugins/Web/test.py
+++ b/plugins/Web/test.py
@@ -68,6 +68,12 @@ class WebTestCase(ChannelPluginTestCase):
            self.assertResponse(
                    'title http://www.thefreedictionary.com/don%27t',
                    "Don't - definition of don't by The Free Dictionary")
+            self.assertRegexp(
+                    'title '
+                    'https://twitter.com/rlbarnes/status/656554266744586240',
+                    '"PSA: In Firefox 44 Nightly, "http:" pages with '
+                    '<input type="password"> are now marked insecure. '
+                    'https://t.co/qS9LxuRPdm"$')

        def testTitleSnarfer(self):
            try:
--- a/src/utils/web.py
+++ b/src/utils/web.py
@@ -206,40 +206,47 @@ class HtmlToText(HTMLParser, object):
    """Taken from some eff-bot code on c.l.p."""
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '
+    entitydefs['apos'] = '\''
    def __init__(self, tagReplace=' '):
        self.data = []
        self.tagReplace = tagReplace
        super(HtmlToText, self).__init__()

+    def append(self, data):
+        self.data.append(data)
+
    def handle_starttag(self, tag, attr):
-        self.data.append(self.tagReplace)
+        self.append(self.tagReplace)

    def handle_endtag(self, tag):
-        self.data.append(self.tagReplace)
+        self.append(self.tagReplace)

    def handle_data(self, data):
-        self.data.append(data)
+        self.append(data)

    def handle_entityref(self, data):
        if minisix.PY3:
            if data in name2codepoint:
-                self.data.append(chr(name2codepoint[data]))
+                self.append(chr(name2codepoint[data]))
            elif isinstance(data, bytes):
-                self.data.append(data.decode())
+                self.append(data.decode())
            else:
-                self.data.append(data)
+                self.append(data)
        else:
            if data in name2codepoint:
-                self.data.append(unichr(name2codepoint[data]))
+                self.append(unichr(name2codepoint[data]))
            elif isinstance(data, str):
-                self.data.append(data.decode('utf8', errors='replace'))
+                self.append(data.decode('utf8', errors='replace'))
            else:
-                self.data.append(data)
+                self.append(data)

    def getText(self):
        text = ''.join(self.data).strip()
        return normalizeWhitespace(text)

+    def handle_charref(self, name):
+        self.append((unichr if minisix.PY2 else chr)(int(name)))
+
 def htmlToText(s, tagReplace=' '):
    """Turns HTML into text.  tagReplace is a string to replace HTML tags with.
    """