plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title.

This commit is contained in:
James Vega 2005-07-19 13:55:37 +00:00
parent 77330d496a
commit b375ea9792
2 changed files with 29 additions and 16 deletions

View File

@ -28,7 +28,8 @@
###
import re
import HTMLParser
import sgmllib
import htmlentitydefs
import supybot.conf as conf
import supybot.utils as utils
@ -37,23 +38,31 @@ import supybot.plugins as plugins
import supybot.ircutils as ircutils
import supybot.callbacks as callbacks
class Title(HTMLParser.HTMLParser):
def __init__(self, *args, **kwargs):
class Title(sgmllib.SGMLParser):
entitydefs = htmlentitydefs.entitydefs.copy()
entitydefs['nbsp'] = ' '
def __init__(self):
self.inTitle = False
self.title = None
HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
self.title = ''
sgmllib.SGMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
if tag == 'title':
def start_title(self, attrs):
self.inTitle = True
def end_title(self):
self.inTitle = False
def unknown_entityref(self, name):
if self.inTitle:
self.title += ' '
def unknown_charref(self, name):
if self.inTitle:
self.title += ' '
def handle_data(self, data):
if self.inTitle:
self.title = data
def handle_endtag(self, tag):
if tag == 'title':
self.inTitle = False
self.title += data
class Web(callbacks.PluginRegexp):
"""Add the help for "@help Web" here."""
@ -90,7 +99,7 @@ class Web(callbacks.PluginRegexp):
except HTMLParser.HTMLParseError:
self.log.debug('Encountered a problem parsing %u. Title may '
'already be set, though', url)
if parser.title is not None:
if parser.title:
domain = utils.web.getDomain(url)
title = utils.web.htmlToText(parser.title.strip())
s = format('Title: %s (at %s)', title, domain)
@ -166,7 +175,7 @@ class Web(callbacks.PluginRegexp):
except HTMLParser.HTMLParseError:
self.log.debug('Encountered a problem parsing %u. Title may '
'already be set, though', url)
if parser.title is not None:
if parser.title:
irc.reply(utils.web.htmlToText(parser.title.strip()))
else:
irc.reply(format('That URL appears to have no HTML title '

View File

@ -66,7 +66,8 @@ class WebTestCase(ChannelPluginTestCase):
self.assertResponse('title '
'http://www.space.com/scienceastronomy/'
'jupiter_dark_spot_031023.html',
'Mystery Spot on Jupiter Baffles Astronomers')
'SPACE.com -- Mystery Spot on Jupiter Baffles '
'Astronomers')
# Checks for @title not working correctly
self.assertResponse('title '
'http://www.catb.org/~esr/jargon/html/F/foo.html',
@ -77,6 +78,9 @@ class WebTestCase(ChannelPluginTestCase):
'http://www.irinnews.org/report.asp?ReportID=45910&'
'SelectRegion=West_Africa&SelectCountry=CHAD',
r'document\.write\(')
# Checks that title parser grabs the full title instead of just
# part of it.
self.assertRegexp('title http://www.n-e-r-d.com/', 'N.*E.*R.*D')
def testNetcraft(self):
self.assertNotError('netcraft slashdot.org')