Bug #1190350, Don't grab fake title.

2026-02-01 23:47:57 +01:00 · 2005-04-30 12:53:42 +00:00 · 2005-04-30 12:53:42 +00:00 · fcfda73f64
commit fcfda73f64
parent d9ce747fef
2 changed files with 37 additions and 10 deletions
--- a/plugins/Web/plugin.py
+++ b/plugins/Web/plugin.py
@ -29,6 +29,8 @@

 import re

+from HTMLParser import HTMLParser
+
 import supybot.conf as conf
 import supybot.utils as utils
 from supybot.commands import *
@ -36,6 +38,24 @@ import supybot.plugins as plugins
 import supybot.ircutils as ircutils
 import supybot.callbacks as callbacks

+class Title(HTMLParser):
+    """Extract the text of a document's <title> element.
+
+    Feed HTML with feed(); afterwards ``title`` holds the raw text found
+    between <title> and </title>, or None if no title text was seen.
+    Entity and character references are kept in markup form (for example
+    '&amp;'), because the callers in this plugin pass the result through
+    utils.web.htmlToText afterwards.
+    """
+    def __init__(self, *args, **kwargs):
+        # These must exist before HTMLParser.__init__ runs, hence the order.
+        self.inTitle = False
+        self.title = None
+        HTMLParser.__init__(self, *args, **kwargs)
+
+    def _append(self, text):
+        # HTMLParser may deliver an element's text in several callbacks
+        # (around references and at feed() boundaries), so accumulate the
+        # pieces instead of letting each one overwrite the previous.
+        if self.inTitle:
+            if self.title is None:
+                self.title = text
+            else:
+                self.title += text
+
+    def handle_starttag(self, tag, attrs):
+        # Once some title text has been captured, ignore any later <title>
+        # elements (e.g. inside inline SVG) so they cannot replace it.
+        if tag == 'title' and self.title is None:
+            self.inTitle = True
+
+    def handle_data(self, data):
+        self._append(data)
+
+    def handle_entityref(self, name):
+        # Put the reference back as markup; it is not decoded here.
+        self._append('&%s;' % name)
+
+    def handle_charref(self, name):
+        # Same as handle_entityref, for numeric references like &#38;.
+        self._append('&#%s;' % name)
+
+    def handle_endtag(self, tag):
+        if tag == 'title':
+            self.inTitle = False
+
 class Web(callbacks.PluginRegexp):
    """Add the help for "@help Web" here."""
    threaded = True
@ -46,7 +66,6 @@ class Web(callbacks.PluginRegexp):
        except utils.web.Error, e:
            irc.reply(str(e))

-    _titleRe = re.compile(r'<title>(.*?)</title>', re.I | re.S)
    def titleSnarfer(self, irc, msg, match):
        r"https?://[^\])>\s]+"
        channel = msg.args[0]
@ -66,10 +85,11 @@ class Web(callbacks.PluginRegexp):
            except utils.web.Error, e:
                self.log.info('Couldn\'t snarf title of %u: %s.', url, e)
                return
-            m = self._titleRe.search(text)
-            if m is not None:
+            parser = Title()
+            parser.feed(text)
+            if parser.title is not None:
                domain = utils.web.getDomain(url)
-                title = utils.web.htmlToText(m.group(1).strip())
+                title = utils.web.htmlToText(parser.title.strip())
                s = format('Title: %s (at %s)', title, domain)
                irc.reply(s, prefixName=False)
    titleSnarfer = urlSnarfer(titleSnarfer)
@ -137,9 +157,10 @@ class Web(callbacks.PluginRegexp):
        """
        size = conf.supybot.protocols.http.peekSize()
        text = utils.web.getUrl(url, size=size)
-        m = self._titleRe.search(text)
-        if m is not None:
-            irc.reply(utils.web.htmlToText(m.group(1).strip()))
+        parser = Title()
+        parser.feed(text)
+        if parser.title is not None:
+            irc.reply(utils.web.htmlToText(parser.title.strip()))
        else:
            irc.reply(format('That URL appears to have no HTML title '
                             'within the first %i bytes.', size))
--- a/plugins/Web/test.py
+++ b/plugins/Web/test.py
@ -29,7 +29,7 @@

 from supybot.test import *

-class WebTestCase(PluginTestCase):
+class WebTestCase(ChannelPluginTestCase):
    plugins = ('Web',)
    if network:
        def testHeaders(self):
@ -68,9 +68,15 @@ class WebTestCase(PluginTestCase):
                                'jupiter_dark_spot_031023.html',
                                'Mystery Spot on Jupiter Baffles Astronomers')
            # Checks for @title not-working correctly
-            self.assertResponse('title '\
+            self.assertResponse('title '
                'http://www.catb.org/~esr/jargon/html/F/foo.html',
                'foo')
+            # Checks for only grabbing the real title tags instead of title
+            # tags inside, for example, script tags. Bug #1190350
+            self.assertNotRegexp('title '
+                'http://www.irinnews.org/report.asp?ReportID=45910&'
+                'SelectRegion=West_Africa&SelectCountry=CHAD',
+                r'document\.write\(')

        def testNetcraft(self):
            self.assertNotError('netcraft slashdot.org')