class Title(HTMLParser):
    """Collect the text of the first <title> element fed to the parser.

    After feeding a document with ``feed()``, ``title`` holds the raw text
    found inside the first ``<title>`` element, or ``None`` when no title
    text was seen.  ``inTitle`` is True while the parser is positioned
    inside that element.

    Text is accumulated rather than overwritten, because HTMLParser may
    report the content of a single element through several callbacks
    (for instance when the document arrives in more than one ``feed()``
    call, or when the text contains entity/character references).
    """
    def __init__(self, *args, **kwargs):
        # State is set up before the base initializer runs, as in the
        # original ordering of this constructor.
        self.inTitle = False
        self.title = None
        # Set once the first <title> element has been closed, so later
        # <title> tags elsewhere in the document cannot replace it.
        self._titleDone = False
        HTMLParser.__init__(self, *args, **kwargs)

    def _appendToTitle(self, text):
        """Append text to the title if currently inside the title tag."""
        if self.inTitle:
            if self.title is None:
                self.title = text
            else:
                self.title += text

    def handle_starttag(self, tag, attrs):
        if tag == 'title' and not self._titleDone:
            self.inTitle = True

    def handle_data(self, data):
        self._appendToTitle(data)

    def handle_entityref(self, name):
        # Re-emit the reference verbatim so that whoever consumes
        # ``title`` sees the same raw markup that was in the document.
        self._appendToTitle('&%s;' % name)

    def handle_charref(self, name):
        # Same as handle_entityref, for numeric references like &#38;.
        self._appendToTitle('&#%s;' % name)

    def handle_endtag(self, tag):
        if tag == 'title' and self.inTitle:
            self.inTitle = False
            self._titleDone = True
domain) irc.reply(s, prefixName=False) titleSnarfer = urlSnarfer(titleSnarfer) @@ -137,9 +157,10 @@ class Web(callbacks.PluginRegexp): """ size = conf.supybot.protocols.http.peekSize() text = utils.web.getUrl(url, size=size) - m = self._titleRe.search(text) - if m is not None: - irc.reply(utils.web.htmlToText(m.group(1).strip())) + parser = Title() + parser.feed(text) + if parser.title is not None: + irc.reply(utils.web.htmlToText(parser.title.strip())) else: irc.reply(format('That URL appears to have no HTML title ' 'within the first %i bytes.', size)) diff --git a/plugins/Web/test.py b/plugins/Web/test.py index be198e844..006b02051 100644 --- a/plugins/Web/test.py +++ b/plugins/Web/test.py @@ -29,7 +29,7 @@ from supybot.test import * -class WebTestCase(PluginTestCase): +class WebTestCase(ChannelPluginTestCase): plugins = ('Web',) if network: def testHeaders(self): @@ -68,9 +68,15 @@ class WebTestCase(PluginTestCase): 'jupiter_dark_spot_031023.html', 'Mystery Spot on Jupiter Baffles Astronomers') # Checks for @title not-working correctly - self.assertResponse('title '\ + self.assertResponse('title ' 'http://www.catb.org/~esr/jargon/html/F/foo.html', 'foo') + # Checks for only grabbing the real title tags instead of title + # tags inside, for example, script tags. Bug #1190350 + self.assertNotRegexp('title ' + 'http://www.irinnews.org/report.asp?ReportID=45910&' + 'SelectRegion=West_Africa&SelectCountry=CHAD', + r'document\.write\(') def testNetcraft(self): self.assertNotError('netcraft slashdot.org')