plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title.

2025-08-22 12:57:22 +02:00 · 2005-07-19 13:55:37 +00:00 · 2005-07-19 13:55:37 +00:00 · b375ea9792
commit b375ea9792
parent 77330d496a
2 changed files with 29 additions and 16 deletions
--- a/plugins/Web/plugin.py
+++ b/plugins/Web/plugin.py
@@ -28,7 +28,8 @@
 ###

 import re
-import HTMLParser
+import sgmllib
+import htmlentitydefs

 import supybot.conf as conf
 import supybot.utils as utils
@@ -37,23 +38,31 @@ import supybot.plugins as plugins
 import supybot.ircutils as ircutils
 import supybot.callbacks as callbacks

-class Title(HTMLParser.HTMLParser):
-    def __init__(self, *args, **kwargs):
+class Title(sgmllib.SGMLParser):
+    entitydefs = htmlentitydefs.entitydefs.copy()
+    entitydefs['nbsp'] = ' '
+    def __init__(self):
        self.inTitle = False
-        self.title = None
-        HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
+        self.title = ''
+        sgmllib.SGMLParser.__init__(self)

-    def handle_starttag(self, tag, attrs):
-        if tag == 'title':
-            self.inTitle = True
+    def start_title(self, attrs):
+        self.inTitle = True
+
+    def end_title(self):
+        self.inTitle = False
+
+    def unknown_entityref(self, name):
+        if self.inTitle:
+            self.title += ' '
+
+    def unknown_charref(self, name):
+        if self.inTitle:
+            self.title += ' '

    def handle_data(self, data):
        if self.inTitle:
-            self.title = data
-
-    def handle_endtag(self, tag):
-        if tag == 'title':
-            self.inTitle = False
+            self.title += data

 class Web(callbacks.PluginRegexp):
    """Add the help for "@help Web" here."""
@@ -90,7 +99,7 @@ class Web(callbacks.PluginRegexp):
            except HTMLParser.HTMLParseError:
                self.log.debug('Encountered a problem parsing %u.  Title may '
                               'already be set, though', url)
-            if parser.title is not None:
+            if parser.title:
                domain = utils.web.getDomain(url)
                title = utils.web.htmlToText(parser.title.strip())
                s = format('Title: %s (at %s)', title, domain)
@@ -166,7 +175,7 @@ class Web(callbacks.PluginRegexp):
        except HTMLParser.HTMLParseError:
            self.log.debug('Encountered a problem parsing %u.  Title may '
                           'already be set, though', url)
-        if parser.title is not None:
+        if parser.title:
            irc.reply(utils.web.htmlToText(parser.title.strip()))
        else:
            irc.reply(format('That URL appears to have no HTML title '
--- a/plugins/Web/test.py
+++ b/plugins/Web/test.py
@@ -66,7 +66,8 @@ class WebTestCase(ChannelPluginTestCase):
            self.assertResponse('title '
                                'http://www.space.com/scienceastronomy/'
                                'jupiter_dark_spot_031023.html',
-                                'Mystery Spot on Jupiter Baffles Astronomers')
+                                'SPACE.com -- Mystery Spot on Jupiter Baffles '
+                                'Astronomers')
            # Checks for @title not-working correctly
            self.assertResponse('title '
                'http://www.catb.org/~esr/jargon/html/F/foo.html',
@@ -77,6 +78,9 @@ class WebTestCase(ChannelPluginTestCase):
                'http://www.irinnews.org/report.asp?ReportID=45910&'
                'SelectRegion=West_Africa&SelectCountry=CHAD',
                r'document\.write\(')
+            # Checks that title parser grabs the full title instead of just
+            # part of it.
+            self.assertRegexp('title http://www.n-e-r-d.com/', 'N.*E.*R.*D')

        def testNetcraft(self):
            self.assertNotError('netcraft slashdot.org')