diff --git a/plugins/Web/plugin.py b/plugins/Web/plugin.py index 2bc8355ab..c5257e50e 100644 --- a/plugins/Web/plugin.py +++ b/plugins/Web/plugin.py @@ -28,7 +28,8 @@ ### import re -import HTMLParser +import sgmllib +import htmlentitydefs import supybot.conf as conf import supybot.utils as utils @@ -37,23 +38,31 @@ import supybot.plugins as plugins import supybot.ircutils as ircutils import supybot.callbacks as callbacks -class Title(HTMLParser.HTMLParser): - def __init__(self, *args, **kwargs): +class Title(sgmllib.SGMLParser): + entitydefs = htmlentitydefs.entitydefs.copy() + entitydefs['nbsp'] = ' ' + def __init__(self): self.inTitle = False - self.title = None - HTMLParser.HTMLParser.__init__(self, *args, **kwargs) + self.title = '' + sgmllib.SGMLParser.__init__(self) - def handle_starttag(self, tag, attrs): - if tag == 'title': - self.inTitle = True + def start_title(self, attrs): + self.inTitle = True + + def end_title(self): + self.inTitle = False + + def unknown_entityref(self, name): + if self.inTitle: + self.title += ' ' + + def unknown_charref(self, name): + if self.inTitle: + self.title += ' ' def handle_data(self, data): if self.inTitle: - self.title = data - - def handle_endtag(self, tag): - if tag == 'title': - self.inTitle = False + self.title += data class Web(callbacks.PluginRegexp): """Add the help for "@help Web" here.""" @@ -90,7 +99,7 @@ class Web(callbacks.PluginRegexp): except HTMLParser.HTMLParseError: self.log.debug('Encountered a problem parsing %u. Title may ' 'already be set, though', url) - if parser.title is not None: + if parser.title: domain = utils.web.getDomain(url) title = utils.web.htmlToText(parser.title.strip()) s = format('Title: %s (at %s)', title, domain) @@ -166,7 +175,7 @@ class Web(callbacks.PluginRegexp): except HTMLParser.HTMLParseError: self.log.debug('Encountered a problem parsing %u. Title may ' 'already be set, though', url) - if parser.title is not None: + if parser.title: irc.reply(utils.web.htmlToText(parser.title.strip())) else: irc.reply(format('That URL appears to have no HTML title ' diff --git a/plugins/Web/test.py b/plugins/Web/test.py index 006b02051..2982b34f1 100644 --- a/plugins/Web/test.py +++ b/plugins/Web/test.py @@ -66,7 +66,8 @@ class WebTestCase(ChannelPluginTestCase): self.assertResponse('title ' 'http://www.space.com/scienceastronomy/' 'jupiter_dark_spot_031023.html', - 'Mystery Spot on Jupiter Baffles Astronomers') + 'SPACE.com -- Mystery Spot on Jupiter Baffles ' + 'Astronomers') # Checks for @title not-working correctly self.assertResponse('title ' 'http://www.catb.org/~esr/jargon/html/F/foo.html', @@ -77,6 +78,9 @@ class WebTestCase(ChannelPluginTestCase): 'http://www.irinnews.org/report.asp?ReportID=45910&' 'SelectRegion=West_Africa&SelectCountry=CHAD', r'document\.write\(') + # Checks that title parser grabs the full title instead of just + # part of it. + self.assertRegexp('title http://www.n-e-r-d.com/', 'N.*E.*R.*D') def testNetcraft(self): self.assertNotError('netcraft slashdot.org')