plugins/Web: Switch the title parser back to HTMLParser since sgmllib's parser spins on invalid input.

This commit is contained in:
James Vega 2006-09-13 19:40:51 +00:00
parent deb6dbc1e1
commit ee9aaa89d6
2 changed files with 34 additions and 26 deletions

View File

@ -28,7 +28,7 @@
###
import re
import sgmllib
import HTMLParser
import htmlentitydefs
import supybot.conf as conf
@ -38,32 +38,32 @@ import supybot.plugins as plugins
import supybot.ircutils as ircutils
import supybot.callbacks as callbacks
class Title(sgmllib.SGMLParser):
class Title(HTMLParser.HTMLParser):
entitydefs = htmlentitydefs.entitydefs.copy()
entitydefs['nbsp'] = ' '
entitydefs['apos'] = '\''
def __init__(self):
self.inTitle = False
self.title = ''
sgmllib.SGMLParser.__init__(self)
HTMLParser.HTMLParser.__init__(self)
def start_title(self, attrs):
self.inTitle = True
def handle_starttag(self, tag, attrs):
if tag == 'title':
self.inTitle = True
def end_title(self):
self.inTitle = False
def unknown_entityref(self, name):
if self.inTitle:
self.title += ' '
def unknown_charref(self, name):
if self.inTitle:
self.title += ' '
def handle_endtag(self, tag):
if tag == 'title':
self.inTitle = False
def handle_data(self, data):
if self.inTitle:
self.title += data
def handle_entityref(self, name):
if self.inTitle:
if name in self.entitydefs:
self.title += self.entitydefs[name]
class Web(callbacks.PluginRegexp):
"""Add the help for "@help Web" here."""
threaded = True
@ -172,7 +172,7 @@ class Web(callbacks.PluginRegexp):
parser = Title()
try:
parser.feed(text)
except sgmllib.SGMLParseError:
except HTMLParser.HTMLParseError:
self.log.debug('Encountered a problem parsing %u. Title may '
'already be set, though', url)
if parser.title:

View File

@ -53,16 +53,18 @@ class WebTestCase(ChannelPluginTestCase):
'Slashdot: News for nerds, stuff that matters')
# Amazon add a bunch of scripting stuff to the top of their page,
# so we need to allow for a larger peekSize
try:
orig = conf.supybot.protocols.http.peekSize()
conf.supybot.protocols.http.peekSize.setValue(8192)
self.assertNotRegexp('title '
'http://www.amazon.com/exec/obidos/tg/detail/-/'
'1884822312/qid=1063140754/sr=8-1/ref=sr_8_1/'
'002-9802970-2308826?v=glance&s=books&n=507846',
'no HTML title')
finally:
conf.supybot.protocols.http.peekSize.setValue(orig)
# Actually, screw Amazon. Even bumping this up to 10k doesn't give us enough
# info.
# try:
# orig = conf.supybot.protocols.http.peekSize()
# conf.supybot.protocols.http.peekSize.setValue(8192)
# self.assertNotRegexp('title '
# 'http://www.amazon.com/exec/obidos/tg/detail/-/'
# '1884822312/qid=1063140754/sr=8-1/ref=sr_8_1/'
# '002-9802970-2308826?v=glance&s=books&n=507846',
# 'no HTML title')
# finally:
# conf.supybot.protocols.http.peekSize.setValue(orig)
# Checks the non-greediness of the regexp
self.assertResponse('title '
'http://www.space.com/scienceastronomy/'
@ -82,6 +84,12 @@ class WebTestCase(ChannelPluginTestCase):
# Checks that title parser grabs the full title instead of just
# part of it.
self.assertRegexp('title http://www.n-e-r-d.com/', 'N.*E.*R.*D')
# Checks that the parser doesn't hang on invalid tags
print
print "If we have not fixed a bug with the parser, the following",
print "test will hang the test-suite."
self.assertNotError(
'title http://www.youtube.com/watch?v=x4BtiqPN4u8')
def testNetcraft(self):
self.assertNotError('netcraft slashdot.org')