plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title.

This commit is contained in:
James Vega 2005-07-19 13:55:37 +00:00
parent 77330d496a
commit b375ea9792
2 changed files with 29 additions and 16 deletions

View File

@ -28,7 +28,8 @@
### ###
import re import re
import HTMLParser import sgmllib
import htmlentitydefs
import supybot.conf as conf import supybot.conf as conf
import supybot.utils as utils import supybot.utils as utils
@ -37,23 +38,31 @@ import supybot.plugins as plugins
import supybot.ircutils as ircutils import supybot.ircutils as ircutils
import supybot.callbacks as callbacks import supybot.callbacks as callbacks
class Title(HTMLParser.HTMLParser): class Title(sgmllib.SGMLParser):
def __init__(self, *args, **kwargs): entitydefs = htmlentitydefs.entitydefs.copy()
entitydefs['nbsp'] = ' '
def __init__(self):
self.inTitle = False self.inTitle = False
self.title = None self.title = ''
HTMLParser.HTMLParser.__init__(self, *args, **kwargs) sgmllib.SGMLParser.__init__(self)
def handle_starttag(self, tag, attrs): def start_title(self, attrs):
if tag == 'title':
self.inTitle = True self.inTitle = True
def end_title(self):
self.inTitle = False
def unknown_entityref(self, name):
if self.inTitle:
self.title += ' '
def unknown_charref(self, name):
if self.inTitle:
self.title += ' '
def handle_data(self, data): def handle_data(self, data):
if self.inTitle: if self.inTitle:
self.title = data self.title += data
def handle_endtag(self, tag):
if tag == 'title':
self.inTitle = False
class Web(callbacks.PluginRegexp): class Web(callbacks.PluginRegexp):
"""Add the help for "@help Web" here.""" """Add the help for "@help Web" here."""
@ -90,7 +99,7 @@ class Web(callbacks.PluginRegexp):
except HTMLParser.HTMLParseError: except HTMLParser.HTMLParseError:
self.log.debug('Encountered a problem parsing %u. Title may ' self.log.debug('Encountered a problem parsing %u. Title may '
'already be set, though', url) 'already be set, though', url)
if parser.title is not None: if parser.title:
domain = utils.web.getDomain(url) domain = utils.web.getDomain(url)
title = utils.web.htmlToText(parser.title.strip()) title = utils.web.htmlToText(parser.title.strip())
s = format('Title: %s (at %s)', title, domain) s = format('Title: %s (at %s)', title, domain)
@ -166,7 +175,7 @@ class Web(callbacks.PluginRegexp):
except HTMLParser.HTMLParseError: except HTMLParser.HTMLParseError:
self.log.debug('Encountered a problem parsing %u. Title may ' self.log.debug('Encountered a problem parsing %u. Title may '
'already be set, though', url) 'already be set, though', url)
if parser.title is not None: if parser.title:
irc.reply(utils.web.htmlToText(parser.title.strip())) irc.reply(utils.web.htmlToText(parser.title.strip()))
else: else:
irc.reply(format('That URL appears to have no HTML title ' irc.reply(format('That URL appears to have no HTML title '

View File

@ -66,7 +66,8 @@ class WebTestCase(ChannelPluginTestCase):
self.assertResponse('title ' self.assertResponse('title '
'http://www.space.com/scienceastronomy/' 'http://www.space.com/scienceastronomy/'
'jupiter_dark_spot_031023.html', 'jupiter_dark_spot_031023.html',
'Mystery Spot on Jupiter Baffles Astronomers') 'SPACE.com -- Mystery Spot on Jupiter Baffles '
'Astronomers')
# Checks for @title not working correctly # Checks for @title not working correctly
self.assertResponse('title ' self.assertResponse('title '
'http://www.catb.org/~esr/jargon/html/F/foo.html', 'http://www.catb.org/~esr/jargon/html/F/foo.html',
@ -77,6 +78,9 @@ class WebTestCase(ChannelPluginTestCase):
'http://www.irinnews.org/report.asp?ReportID=45910&' 'http://www.irinnews.org/report.asp?ReportID=45910&'
'SelectRegion=West_Africa&SelectCountry=CHAD', 'SelectRegion=West_Africa&SelectCountry=CHAD',
r'document\.write\(') r'document\.write\(')
# Checks that title parser grabs the full title instead of just
# part of it.
self.assertRegexp('title http://www.n-e-r-d.com/', 'N.*E.*R.*D')
def testNetcraft(self): def testNetcraft(self):
self.assertNotError('netcraft slashdot.org') self.assertNotError('netcraft slashdot.org')