Bug #1190350, Don't grab fake title.

This commit is contained in:
James Vega 2005-04-30 12:53:42 +00:00
parent d9ce747fef
commit fcfda73f64
2 changed files with 37 additions and 10 deletions

View File

@ -29,6 +29,8 @@
import re
from HTMLParser import HTMLParser
import supybot.conf as conf
import supybot.utils as utils
from supybot.commands import *
@ -36,6 +38,24 @@ import supybot.plugins as plugins
import supybot.ircutils as ircutils
import supybot.callbacks as callbacks
class Title(HTMLParser):
    """HTML parser that extracts the text of a document's first <title>.

    After feeding markup with ``feed()``, the ``title`` attribute holds the
    text found inside the first ``<title>...</title>`` element, or ``None``
    when no title text was seen.

    Character and entity references that the underlying parser reports
    separately are kept in their raw ``&name;`` / ``&#num;`` form, so the
    caller sees the same markup a plain regexp match would have returned.
    """

    def __init__(self, *args, **kwargs):
        # Set our state before calling the base initializer, which resets
        # the parser.
        self.inTitle = False
        self.title = None
        # True once the first <title> element has been closed; any later
        # <title> (e.g. an SVG <title> in the body) is then ignored instead
        # of overwriting the real document title.
        self._titleDone = False
        HTMLParser.__init__(self, *args, **kwargs)

    def handle_starttag(self, tag, attrs):
        if tag == 'title' and not self._titleDone:
            self.inTitle = True

    def handle_data(self, data):
        if not self.inTitle:
            return
        # The parser may deliver one title in several chunks (for instance
        # around entity references), so accumulate rather than overwrite.
        if self.title is None:
            self.title = data
        else:
            self.title += data

    def handle_entityref(self, name):
        # Re-emit the reference verbatim so it is not silently dropped.
        if self.inTitle:
            self.handle_data('&%s;' % name)

    def handle_charref(self, name):
        # Same as handle_entityref, for numeric references such as &#38;.
        if self.inTitle:
            self.handle_data('&#%s;' % name)

    def handle_endtag(self, tag):
        if tag == 'title' and self.inTitle:
            self.inTitle = False
            self._titleDone = True
class Web(callbacks.PluginRegexp):
"""Add the help for "@help Web" here."""
threaded = True
@ -45,8 +65,7 @@ class Web(callbacks.PluginRegexp):
super(Web, self).callCommand(command, irc, msg, *args, **kwargs)
except utils.web.Error, e:
irc.reply(str(e))
_titleRe = re.compile(r'<title>(.*?)</title>', re.I | re.S)
def titleSnarfer(self, irc, msg, match):
r"https?://[^\])>\s]+"
channel = msg.args[0]
@ -66,10 +85,11 @@ class Web(callbacks.PluginRegexp):
except utils.web.Error, e:
self.log.info('Couldn\'t snarf title of %u: %s.', url, e)
return
m = self._titleRe.search(text)
if m is not None:
parser = Title()
parser.feed(text)
if parser.title is not None:
domain = utils.web.getDomain(url)
title = utils.web.htmlToText(m.group(1).strip())
title = utils.web.htmlToText(parser.title.strip())
s = format('Title: %s (at %s)', title, domain)
irc.reply(s, prefixName=False)
titleSnarfer = urlSnarfer(titleSnarfer)
@ -137,9 +157,10 @@ class Web(callbacks.PluginRegexp):
"""
size = conf.supybot.protocols.http.peekSize()
text = utils.web.getUrl(url, size=size)
m = self._titleRe.search(text)
if m is not None:
irc.reply(utils.web.htmlToText(m.group(1).strip()))
parser = Title()
parser.feed(text)
if parser.title is not None:
irc.reply(utils.web.htmlToText(parser.title.strip()))
else:
irc.reply(format('That URL appears to have no HTML title '
'within the first %i bytes.', size))

View File

@ -29,7 +29,7 @@
from supybot.test import *
class WebTestCase(PluginTestCase):
class WebTestCase(ChannelPluginTestCase):
plugins = ('Web',)
if network:
def testHeaders(self):
@ -68,9 +68,15 @@ class WebTestCase(PluginTestCase):
'jupiter_dark_spot_031023.html',
'Mystery Spot on Jupiter Baffles Astronomers')
# Checks for @title not-working correctly
self.assertResponse('title '\
self.assertResponse('title '
'http://www.catb.org/~esr/jargon/html/F/foo.html',
'foo')
# Checks for only grabbing the real title tags instead of title
# tags inside, for example, script tags. Bug #1190350
self.assertNotRegexp('title '
'http://www.irinnews.org/report.asp?ReportID=45910&'
'SelectRegion=West_Africa&SelectCountry=CHAD',
r'document\.write\(')
def testNetcraft(self):
self.assertNotError('netcraft slashdot.org')