mirror of
https://github.com/Mikaela/Limnoria.git
synced 2024-11-20 01:19:26 +01:00
plugins/Web: Fixed the title-retrieval parser to actually retrieve the entire title.
This commit is contained in:
parent
77330d496a
commit
b375ea9792
@ -28,7 +28,8 @@
|
|||||||
###
|
###
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import HTMLParser
|
import sgmllib
|
||||||
|
import htmlentitydefs
|
||||||
|
|
||||||
import supybot.conf as conf
|
import supybot.conf as conf
|
||||||
import supybot.utils as utils
|
import supybot.utils as utils
|
||||||
@ -37,23 +38,31 @@ import supybot.plugins as plugins
|
|||||||
import supybot.ircutils as ircutils
|
import supybot.ircutils as ircutils
|
||||||
import supybot.callbacks as callbacks
|
import supybot.callbacks as callbacks
|
||||||
|
|
||||||
class Title(HTMLParser.HTMLParser):
|
class Title(sgmllib.SGMLParser):
|
||||||
def __init__(self, *args, **kwargs):
|
entitydefs = htmlentitydefs.entitydefs.copy()
|
||||||
|
entitydefs['nbsp'] = ' '
|
||||||
|
def __init__(self):
|
||||||
self.inTitle = False
|
self.inTitle = False
|
||||||
self.title = None
|
self.title = ''
|
||||||
HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
|
sgmllib.SGMLParser.__init__(self)
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def start_title(self, attrs):
|
||||||
if tag == 'title':
|
|
||||||
self.inTitle = True
|
self.inTitle = True
|
||||||
|
|
||||||
|
def end_title(self):
|
||||||
|
self.inTitle = False
|
||||||
|
|
||||||
|
def unknown_entityref(self, name):
|
||||||
|
if self.inTitle:
|
||||||
|
self.title += ' '
|
||||||
|
|
||||||
|
def unknown_charref(self, name):
|
||||||
|
if self.inTitle:
|
||||||
|
self.title += ' '
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
if self.inTitle:
|
if self.inTitle:
|
||||||
self.title = data
|
self.title += data
|
||||||
|
|
||||||
def handle_endtag(self, tag):
|
|
||||||
if tag == 'title':
|
|
||||||
self.inTitle = False
|
|
||||||
|
|
||||||
class Web(callbacks.PluginRegexp):
|
class Web(callbacks.PluginRegexp):
|
||||||
"""Add the help for "@help Web" here."""
|
"""Add the help for "@help Web" here."""
|
||||||
@ -90,7 +99,7 @@ class Web(callbacks.PluginRegexp):
|
|||||||
except HTMLParser.HTMLParseError:
|
except HTMLParser.HTMLParseError:
|
||||||
self.log.debug('Encountered a problem parsing %u. Title may '
|
self.log.debug('Encountered a problem parsing %u. Title may '
|
||||||
'already be set, though', url)
|
'already be set, though', url)
|
||||||
if parser.title is not None:
|
if parser.title:
|
||||||
domain = utils.web.getDomain(url)
|
domain = utils.web.getDomain(url)
|
||||||
title = utils.web.htmlToText(parser.title.strip())
|
title = utils.web.htmlToText(parser.title.strip())
|
||||||
s = format('Title: %s (at %s)', title, domain)
|
s = format('Title: %s (at %s)', title, domain)
|
||||||
@ -166,7 +175,7 @@ class Web(callbacks.PluginRegexp):
|
|||||||
except HTMLParser.HTMLParseError:
|
except HTMLParser.HTMLParseError:
|
||||||
self.log.debug('Encountered a problem parsing %u. Title may '
|
self.log.debug('Encountered a problem parsing %u. Title may '
|
||||||
'already be set, though', url)
|
'already be set, though', url)
|
||||||
if parser.title is not None:
|
if parser.title:
|
||||||
irc.reply(utils.web.htmlToText(parser.title.strip()))
|
irc.reply(utils.web.htmlToText(parser.title.strip()))
|
||||||
else:
|
else:
|
||||||
irc.reply(format('That URL appears to have no HTML title '
|
irc.reply(format('That URL appears to have no HTML title '
|
||||||
|
@ -66,7 +66,8 @@ class WebTestCase(ChannelPluginTestCase):
|
|||||||
self.assertResponse('title '
|
self.assertResponse('title '
|
||||||
'http://www.space.com/scienceastronomy/'
|
'http://www.space.com/scienceastronomy/'
|
||||||
'jupiter_dark_spot_031023.html',
|
'jupiter_dark_spot_031023.html',
|
||||||
'Mystery Spot on Jupiter Baffles Astronomers')
|
'SPACE.com -- Mystery Spot on Jupiter Baffles '
|
||||||
|
'Astronomers')
|
||||||
# Checks for @title not-working correctly
|
# Checks for @title not-working correctly
|
||||||
self.assertResponse('title '
|
self.assertResponse('title '
|
||||||
'http://www.catb.org/~esr/jargon/html/F/foo.html',
|
'http://www.catb.org/~esr/jargon/html/F/foo.html',
|
||||||
@ -77,6 +78,9 @@ class WebTestCase(ChannelPluginTestCase):
|
|||||||
'http://www.irinnews.org/report.asp?ReportID=45910&'
|
'http://www.irinnews.org/report.asp?ReportID=45910&'
|
||||||
'SelectRegion=West_Africa&SelectCountry=CHAD',
|
'SelectRegion=West_Africa&SelectCountry=CHAD',
|
||||||
r'document\.write\(')
|
r'document\.write\(')
|
||||||
|
# Checks that title parser grabs the full title instead of just
|
||||||
|
# part of it.
|
||||||
|
self.assertRegexp('title http://www.n-e-r-d.com/', 'N.*E.*R.*D')
|
||||||
|
|
||||||
def testNetcraft(self):
|
def testNetcraft(self):
|
||||||
self.assertNotError('netcraft slashdot.org')
|
self.assertNotError('netcraft slashdot.org')
|
||||||
|
Loading…
Reference in New Issue
Block a user