mirror of
https://github.com/Mikaela/Limnoria.git
synced 2025-01-11 12:42:34 +01:00
Web & core: Merge features of Web's title parser and utils.web.HtmlToText + don't unescape HTML twice. Closes GH-1176.
This commit is contained in:
parent
9f10f08b2e
commit
e3ff413734
@ -50,15 +50,13 @@ else:
|
||||
from HTMLParser import HTMLParser
|
||||
from htmlentitydefs import entitydefs
|
||||
|
||||
class Title(HTMLParser):
|
||||
class Title(utils.web.HtmlToText):
|
||||
entitydefs = entitydefs.copy()
|
||||
entitydefs['nbsp'] = ' '
|
||||
entitydefs['apos'] = '\''
|
||||
def __init__(self):
|
||||
self.inTitle = False
|
||||
self.inSvg = False
|
||||
self.title = ''
|
||||
HTMLParser.__init__(self)
|
||||
utils.web.HtmlToText.__init__(self)
|
||||
|
||||
@property
|
||||
def inHtmlTitle(self):
|
||||
@ -76,18 +74,9 @@ class Title(HTMLParser):
|
||||
elif tag == 'svg':
|
||||
self.inSvg = False
|
||||
|
||||
def handle_data(self, data):
|
||||
def append(self, data):
|
||||
if self.inHtmlTitle:
|
||||
self.title += data
|
||||
|
||||
def handle_entityref(self, name):
|
||||
if self.inHtmlTitle:
|
||||
if name in self.entitydefs:
|
||||
self.title += self.entitydefs[name]
|
||||
|
||||
def handle_charref(self, name):
|
||||
if self.inHtmlTitle:
|
||||
self.title += (unichr if minisix.PY2 else chr)(int(name))
|
||||
super(Title, self).append(data)
|
||||
|
||||
class DelayedIrc:
|
||||
def __init__(self, irc):
|
||||
@ -156,16 +145,15 @@ class Web(callbacks.PluginRegexp):
|
||||
return None
|
||||
parser.feed(text)
|
||||
parser.close()
|
||||
title = parser.title
|
||||
title = ''.join(parser.data).strip()
|
||||
if title:
|
||||
title = utils.web.htmlToText(title.strip())
|
||||
return title
|
||||
elif raiseErrors:
|
||||
if len(text) < size:
|
||||
irc.reply(_('That URL appears to have no HTML title.'))
|
||||
else:
|
||||
irc.reply(format(_('That URL appears to have no HTML title '
|
||||
'within the first %S.'), size))
|
||||
return title
|
||||
|
||||
@fetch_sandbox
|
||||
def titleSnarfer(self, irc, msg, match):
|
||||
|
@ -68,6 +68,12 @@ class WebTestCase(ChannelPluginTestCase):
|
||||
self.assertResponse(
|
||||
'title http://www.thefreedictionary.com/don%27t',
|
||||
"Don't - definition of don't by The Free Dictionary")
|
||||
self.assertRegexp(
|
||||
'title '
|
||||
'https://twitter.com/rlbarnes/status/656554266744586240',
|
||||
'"PSA: In Firefox 44 Nightly, "http:" pages with '
|
||||
'<input type="password"> are now marked insecure. '
|
||||
'https://t.co/qS9LxuRPdm"$')
|
||||
|
||||
def testTitleSnarfer(self):
|
||||
try:
|
||||
|
@ -206,40 +206,47 @@ class HtmlToText(HTMLParser, object):
|
||||
"""Taken from some eff-bot code on c.l.p."""
|
||||
entitydefs = entitydefs.copy()
|
||||
entitydefs['nbsp'] = ' '
|
||||
entitydefs['apos'] = '\''
|
||||
def __init__(self, tagReplace=' '):
    """Prepare an empty output buffer and remember the tag replacement.

    tagReplace is the string that the tag handlers of this class emit in
    place of every start or end tag.
    """
    self.tagReplace = tagReplace
    # Collected text fragments; they are joined when the text is requested.
    self.data = []
    super(HtmlToText, self).__init__()
|
||||
|
||||
def append(self, data):
    """Add one text fragment to the output buffer.

    Every handler of this class funnels its output through this method,
    so a subclass can override it to filter what gets collected.
    """
    fragments = self.data
    fragments.append(data)
|
||||
|
||||
def handle_starttag(self, tag, attr):
|
||||
self.data.append(self.tagReplace)
|
||||
self.append(self.tagReplace)
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
self.data.append(self.tagReplace)
|
||||
self.append(self.tagReplace)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.data.append(data)
|
||||
self.append(data)
|
||||
|
||||
def handle_entityref(self, data):
|
||||
if minisix.PY3:
|
||||
if data in name2codepoint:
|
||||
self.data.append(chr(name2codepoint[data]))
|
||||
self.append(chr(name2codepoint[data]))
|
||||
elif isinstance(data, bytes):
|
||||
self.data.append(data.decode())
|
||||
self.append(data.decode())
|
||||
else:
|
||||
self.data.append(data)
|
||||
self.append(data)
|
||||
else:
|
||||
if data in name2codepoint:
|
||||
self.data.append(unichr(name2codepoint[data]))
|
||||
self.append(unichr(name2codepoint[data]))
|
||||
elif isinstance(data, str):
|
||||
self.data.append(data.decode('utf8', errors='replace'))
|
||||
self.append(data.decode('utf8', errors='replace'))
|
||||
else:
|
||||
self.data.append(data)
|
||||
self.append(data)
|
||||
|
||||
def getText(self):
    """Return the collected fragments as one string.

    The fragments are concatenated, stripped of leading and trailing
    whitespace, and then passed through normalizeWhitespace.
    """
    joined = ''.join(self.data)
    return normalizeWhitespace(joined.strip())
|
||||
|
||||
def handle_charref(self, name):
    """Append the character designated by a numeric character reference.

    name is the reference without the leading '&#' and trailing ';'.
    It is either decimal digits ('39') or, for hexadecimal references,
    'x' or 'X' followed by hex digits ('x27').

    References that cannot be turned into a character (malformed digits
    or a code point outside the supported range) are replaced with
    U+FFFD instead of raising, so one bad reference cannot abort parsing
    of the whole document.
    """
    to_char = unichr if minisix.PY2 else chr
    try:
        if name[:1] in ('x', 'X'):
            # Hexadecimal form: int() needs the marker removed and base 16.
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        char = to_char(codepoint)
    except (ValueError, OverflowError):
        char = u'\ufffd'
    self.append(char)
|
||||
|
||||
def htmlToText(s, tagReplace=' '):
|
||||
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
|
||||
"""
|
||||
|
Loading…
Reference in New Issue
Block a user