Web & core: Merge features of Web's title parser and utils.web.HtmlToText + don't unescape HTML twice. Closes GH-1176.

This commit is contained in:
Valentin Lorentz 2015-10-23 07:41:36 +02:00
parent 9f10f08b2e
commit e3ff413734
3 changed files with 28 additions and 27 deletions

View File

@@ -50,15 +50,13 @@ else:
from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs
class Title(HTMLParser):
class Title(utils.web.HtmlToText):
entitydefs = entitydefs.copy()
entitydefs['nbsp'] = ' '
entitydefs['apos'] = '\''
def __init__(self):
self.inTitle = False
self.inSvg = False
self.title = ''
HTMLParser.__init__(self)
utils.web.HtmlToText.__init__(self)
@property
def inHtmlTitle(self):
@@ -76,18 +74,9 @@ class Title(HTMLParser):
elif tag == 'svg':
self.inSvg = False
def handle_data(self, data):
def append(self, data):
if self.inHtmlTitle:
self.title += data
def handle_entityref(self, name):
if self.inHtmlTitle:
if name in self.entitydefs:
self.title += self.entitydefs[name]
def handle_charref(self, name):
if self.inHtmlTitle:
self.title += (unichr if minisix.PY2 else chr)(int(name))
super(Title, self).append(data)
class DelayedIrc:
def __init__(self, irc):
@@ -156,16 +145,15 @@ class Web(callbacks.PluginRegexp):
return None
parser.feed(text)
parser.close()
title = parser.title
title = ''.join(parser.data).strip()
if title:
title = utils.web.htmlToText(title.strip())
return title
elif raiseErrors:
if len(text) < size:
irc.reply(_('That URL appears to have no HTML title.'))
else:
irc.reply(format(_('That URL appears to have no HTML title '
'within the first %S.'), size))
return title
@fetch_sandbox
def titleSnarfer(self, irc, msg, match):

View File

@@ -68,6 +68,12 @@ class WebTestCase(ChannelPluginTestCase):
self.assertResponse(
'title http://www.thefreedictionary.com/don%27t',
"Don't - definition of don't by The Free Dictionary")
self.assertRegexp(
'title '
'https://twitter.com/rlbarnes/status/656554266744586240',
'"PSA: In Firefox 44 Nightly, "http:" pages with '
'<input type="password"> are now marked insecure. '
'https://t.co/qS9LxuRPdm"$')
def testTitleSnarfer(self):
try:

View File

@@ -206,40 +206,47 @@ class HtmlToText(HTMLParser, object):
"""Taken from some eff-bot code on c.l.p."""
entitydefs = entitydefs.copy()
entitydefs['nbsp'] = ' '
entitydefs['apos'] = '\''
def __init__(self, tagReplace=' '):
self.data = []
self.tagReplace = tagReplace
super(HtmlToText, self).__init__()
def append(self, data):
self.data.append(data)
def handle_starttag(self, tag, attr):
self.data.append(self.tagReplace)
self.append(self.tagReplace)
def handle_endtag(self, tag):
self.data.append(self.tagReplace)
self.append(self.tagReplace)
def handle_data(self, data):
self.data.append(data)
self.append(data)
def handle_entityref(self, data):
if minisix.PY3:
if data in name2codepoint:
self.data.append(chr(name2codepoint[data]))
self.append(chr(name2codepoint[data]))
elif isinstance(data, bytes):
self.data.append(data.decode())
self.append(data.decode())
else:
self.data.append(data)
self.append(data)
else:
if data in name2codepoint:
self.data.append(unichr(name2codepoint[data]))
self.append(unichr(name2codepoint[data]))
elif isinstance(data, str):
self.data.append(data.decode('utf8', errors='replace'))
self.append(data.decode('utf8', errors='replace'))
else:
self.data.append(data)
self.append(data)
def getText(self):
text = ''.join(self.data).strip()
return normalizeWhitespace(text)
def handle_charref(self, name):
self.append((unichr if minisix.PY2 else chr)(int(name)))
def htmlToText(s, tagReplace=' '):
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
"""