Bug #1190350, Don't grab fake title.

This commit is contained in:
James Vega 2005-04-30 12:53:42 +00:00
parent d9ce747fef
commit fcfda73f64
2 changed files with 37 additions and 10 deletions

View File

@ -29,6 +29,8 @@
import re import re
from HTMLParser import HTMLParser
import supybot.conf as conf import supybot.conf as conf
import supybot.utils as utils import supybot.utils as utils
from supybot.commands import * from supybot.commands import *
@ -36,6 +38,24 @@ import supybot.plugins as plugins
import supybot.ircutils as ircutils import supybot.ircutils as ircutils
import supybot.callbacks as callbacks import supybot.callbacks as callbacks
class Title(HTMLParser):
    """HTML parser that extracts the text of a document's <title> element.

    After feeding markup to the parser, ``title`` holds the text found
    inside the first non-empty <title> element, or None if no title text
    was seen.  Entity and character references inside the title are kept
    as raw markup (e.g. ``&amp;``) so that the caller can decode them when
    converting the title to plain text.
    """

    def __init__(self, *args, **kwargs):
        # State is set up before the base initializer runs, matching the
        # original ordering.
        self.inTitle = False      # True while inside a <title> element
        self.title = None         # Collected title text; None until found
        self._titleDone = False   # True once a title has been captured
        HTMLParser.__init__(self, *args, **kwargs)

    def _appendToTitle(self, text):
        """Append a piece of text to the title if we are inside <title>."""
        if not self.inTitle:
            return
        if self.title is None:
            self.title = text
        else:
            self.title += text

    def handle_starttag(self, tag, attrs):
        # Only the first title counts; a later <title> must not replace or
        # extend the one already captured.
        if tag == 'title' and not self._titleDone:
            self.inTitle = True

    def handle_data(self, data):
        # The parser may deliver the text of one element in several pieces
        # (e.g. around references, or when the input is fed in chunks), so
        # the pieces are accumulated rather than overwritten.
        self._appendToTitle(data)

    def handle_entityref(self, name):
        # Re-emit the reference unchanged instead of silently dropping it.
        self._appendToTitle('&%s;' % name)

    def handle_charref(self, name):
        # Same as handle_entityref, for numeric references such as &#169;.
        self._appendToTitle('&#%s;' % name)

    def handle_endtag(self, tag):
        if tag == 'title' and self.inTitle:
            self.inTitle = False
            # An empty <title></title> does not count as a captured title,
            # so a later non-empty one can still be picked up.
            self._titleDone = self.title is not None
class Web(callbacks.PluginRegexp): class Web(callbacks.PluginRegexp):
"""Add the help for "@help Web" here.""" """Add the help for "@help Web" here."""
threaded = True threaded = True
@ -46,7 +66,6 @@ class Web(callbacks.PluginRegexp):
except utils.web.Error, e: except utils.web.Error, e:
irc.reply(str(e)) irc.reply(str(e))
_titleRe = re.compile(r'<title>(.*?)</title>', re.I | re.S)
def titleSnarfer(self, irc, msg, match): def titleSnarfer(self, irc, msg, match):
r"https?://[^\])>\s]+" r"https?://[^\])>\s]+"
channel = msg.args[0] channel = msg.args[0]
@ -66,10 +85,11 @@ class Web(callbacks.PluginRegexp):
except utils.web.Error, e: except utils.web.Error, e:
self.log.info('Couldn\'t snarf title of %u: %s.', url, e) self.log.info('Couldn\'t snarf title of %u: %s.', url, e)
return return
m = self._titleRe.search(text) parser = Title()
if m is not None: parser.feed(text)
if parser.title is not None:
domain = utils.web.getDomain(url) domain = utils.web.getDomain(url)
title = utils.web.htmlToText(m.group(1).strip()) title = utils.web.htmlToText(parser.title.strip())
s = format('Title: %s (at %s)', title, domain) s = format('Title: %s (at %s)', title, domain)
irc.reply(s, prefixName=False) irc.reply(s, prefixName=False)
titleSnarfer = urlSnarfer(titleSnarfer) titleSnarfer = urlSnarfer(titleSnarfer)
@ -137,9 +157,10 @@ class Web(callbacks.PluginRegexp):
""" """
size = conf.supybot.protocols.http.peekSize() size = conf.supybot.protocols.http.peekSize()
text = utils.web.getUrl(url, size=size) text = utils.web.getUrl(url, size=size)
m = self._titleRe.search(text) parser = Title()
if m is not None: parser.feed(text)
irc.reply(utils.web.htmlToText(m.group(1).strip())) if parser.title is not None:
irc.reply(utils.web.htmlToText(parser.title.strip()))
else: else:
irc.reply(format('That URL appears to have no HTML title ' irc.reply(format('That URL appears to have no HTML title '
'within the first %i bytes.', size)) 'within the first %i bytes.', size))

View File

@ -29,7 +29,7 @@
from supybot.test import * from supybot.test import *
class WebTestCase(PluginTestCase): class WebTestCase(ChannelPluginTestCase):
plugins = ('Web',) plugins = ('Web',)
if network: if network:
def testHeaders(self): def testHeaders(self):
@ -68,9 +68,15 @@ class WebTestCase(PluginTestCase):
'jupiter_dark_spot_031023.html', 'jupiter_dark_spot_031023.html',
'Mystery Spot on Jupiter Baffles Astronomers') 'Mystery Spot on Jupiter Baffles Astronomers')
# Checks for @title not-working correctly # Checks for @title not-working correctly
self.assertResponse('title '\ self.assertResponse('title '
'http://www.catb.org/~esr/jargon/html/F/foo.html', 'http://www.catb.org/~esr/jargon/html/F/foo.html',
'foo') 'foo')
# Checks for only grabbing the real title tags instead of title
# tags inside, for example, script tags. Bug #1190350
self.assertNotRegexp('title '
'http://www.irinnews.org/report.asp?ReportID=45910&'
'SelectRegion=West_Africa&SelectCountry=CHAD',
r'document\.write\(')
def testNetcraft(self): def testNetcraft(self):
self.assertNotError('netcraft slashdot.org') self.assertNotError('netcraft slashdot.org')