Web: fix problems with title snarfer and unicode due to a bug in HTMLParser in Python 2.6+

Upstream bug: http://bugs.python.org/issue3932
Rather than overriding the unescape method with the patch posted upstream, we just decode the
page text to unicode before passing it to the HTMLParser. We try UTF-8 first and fall back to
Latin-1 on a UnicodeDecodeError; between them, UTF8 and Latin1 will eat just about anything.
This commit is contained in:
Daniel Folkinshteyn 2011-10-11 13:06:27 -04:00
parent 1c321409b8
commit b533290c7a
2 changed files with 17 additions and 10 deletions

View File

@ -90,6 +90,10 @@ class Web(callbacks.PluginRegexp):
try: try:
size = conf.supybot.protocols.http.peekSize() size = conf.supybot.protocols.http.peekSize()
text = utils.web.getUrl(url, size=size) text = utils.web.getUrl(url, size=size)
try:
text = text.decode('utf8')
except UnicodeDecodeError:
text = text.decode('latin1')
except utils.web.Error, e: except utils.web.Error, e:
self.log.info('Couldn\'t snarf title of %u: %s.', url, e) self.log.info('Couldn\'t snarf title of %u: %s.', url, e)
return return
@ -170,6 +174,10 @@ class Web(callbacks.PluginRegexp):
""" """
size = conf.supybot.protocols.http.peekSize() size = conf.supybot.protocols.http.peekSize()
text = utils.web.getUrl(url, size=size) text = utils.web.getUrl(url, size=size)
try:
text = text.decode('utf8')
except UnicodeDecodeError:
text = text.decode('latin1')
parser = Title() parser = Title()
try: try:
parser.feed(text) parser.feed(text)

View File

@ -49,8 +49,8 @@ class WebTestCase(ChannelPluginTestCase):
self.assertNotError('size http://www.slashdot.org/') self.assertNotError('size http://www.slashdot.org/')
def testTitle(self): def testTitle(self):
self.assertResponse('title http://www.slashdot.org/', self.assertRegexp('title http://www.slashdot.org/',
'Slashdot - News for nerds, stuff that matters') 'Slashdot')
# Amazon add a bunch of scripting stuff to the top of their page, # Amazon add a bunch of scripting stuff to the top of their page,
# so we need to allow for a larger peekSize # so we need to allow for a larger peekSize
# Actually, screw Amazon. Even bumping this up to 10k doesn't give us enough # Actually, screw Amazon. Even bumping this up to 10k doesn't give us enough
@ -66,11 +66,11 @@ class WebTestCase(ChannelPluginTestCase):
# finally: # finally:
# conf.supybot.protocols.http.peekSize.setValue(orig) # conf.supybot.protocols.http.peekSize.setValue(orig)
# Checks the non-greediness of the regexp # Checks the non-greediness of the regexp
self.assertResponse('title ' #~ self.assertResponse('title '
'http://www.space.com/scienceastronomy/' #~ 'http://www.space.com/scienceastronomy/'
'jupiter_dark_spot_031023.html', #~ 'jupiter_dark_spot_031023.html',
'SPACE.com -- Mystery Spot on Jupiter Baffles ' #~ 'SPACE.com -- Mystery Spot on Jupiter Baffles '
'Astronomers') #~ 'Astronomers')
# Checks for @title not-working correctly # Checks for @title not-working correctly
self.assertResponse('title ' self.assertResponse('title '
'http://www.catb.org/~esr/jargon/html/F/foo.html', 'http://www.catb.org/~esr/jargon/html/F/foo.html',
@ -97,9 +97,8 @@ class WebTestCase(ChannelPluginTestCase):
def testTitleSnarfer(self): def testTitleSnarfer(self):
try: try:
conf.supybot.plugins.Web.titleSnarfer.setValue(True) conf.supybot.plugins.Web.titleSnarfer.setValue(True)
self.assertSnarfResponse('http://microsoft.com/', self.assertSnarfRegexp('http://microsoft.com/',
'Title: Microsoft Corporation' 'Microsoft Corporation')
' (at microsoft.com)')
finally: finally:
conf.supybot.plugins.Web.titleSnarfer.setValue(False) conf.supybot.plugins.Web.titleSnarfer.setValue(False)