mirror of
https://github.com/Mikaela/Limnoria.git
synced 2025-01-11 20:52:42 +01:00
Web & core: Merge features of Web's title parser and utils.web.HtmlToText + don't unescape HTML twice. Closes GH-1176.
This commit is contained in:
parent
9f10f08b2e
commit
e3ff413734
@ -50,15 +50,13 @@ else:
|
|||||||
from HTMLParser import HTMLParser
|
from HTMLParser import HTMLParser
|
||||||
from htmlentitydefs import entitydefs
|
from htmlentitydefs import entitydefs
|
||||||
|
|
||||||
class Title(HTMLParser):
|
class Title(utils.web.HtmlToText):
|
||||||
entitydefs = entitydefs.copy()
|
entitydefs = entitydefs.copy()
|
||||||
entitydefs['nbsp'] = ' '
|
entitydefs['nbsp'] = ' '
|
||||||
entitydefs['apos'] = '\''
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.inTitle = False
|
self.inTitle = False
|
||||||
self.inSvg = False
|
self.inSvg = False
|
||||||
self.title = ''
|
utils.web.HtmlToText.__init__(self)
|
||||||
HTMLParser.__init__(self)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def inHtmlTitle(self):
|
def inHtmlTitle(self):
|
||||||
@ -76,18 +74,9 @@ class Title(HTMLParser):
|
|||||||
elif tag == 'svg':
|
elif tag == 'svg':
|
||||||
self.inSvg = False
|
self.inSvg = False
|
||||||
|
|
||||||
def handle_data(self, data):
|
def append(self, data):
|
||||||
if self.inHtmlTitle:
|
if self.inHtmlTitle:
|
||||||
self.title += data
|
super(Title, self).append(data)
|
||||||
|
|
||||||
def handle_entityref(self, name):
|
|
||||||
if self.inHtmlTitle:
|
|
||||||
if name in self.entitydefs:
|
|
||||||
self.title += self.entitydefs[name]
|
|
||||||
|
|
||||||
def handle_charref(self, name):
|
|
||||||
if self.inHtmlTitle:
|
|
||||||
self.title += (unichr if minisix.PY2 else chr)(int(name))
|
|
||||||
|
|
||||||
class DelayedIrc:
|
class DelayedIrc:
|
||||||
def __init__(self, irc):
|
def __init__(self, irc):
|
||||||
@ -156,16 +145,15 @@ class Web(callbacks.PluginRegexp):
|
|||||||
return None
|
return None
|
||||||
parser.feed(text)
|
parser.feed(text)
|
||||||
parser.close()
|
parser.close()
|
||||||
title = parser.title
|
title = ''.join(parser.data).strip()
|
||||||
if title:
|
if title:
|
||||||
title = utils.web.htmlToText(title.strip())
|
return title
|
||||||
elif raiseErrors:
|
elif raiseErrors:
|
||||||
if len(text) < size:
|
if len(text) < size:
|
||||||
irc.reply(_('That URL appears to have no HTML title.'))
|
irc.reply(_('That URL appears to have no HTML title.'))
|
||||||
else:
|
else:
|
||||||
irc.reply(format(_('That URL appears to have no HTML title '
|
irc.reply(format(_('That URL appears to have no HTML title '
|
||||||
'within the first %S.'), size))
|
'within the first %S.'), size))
|
||||||
return title
|
|
||||||
|
|
||||||
@fetch_sandbox
|
@fetch_sandbox
|
||||||
def titleSnarfer(self, irc, msg, match):
|
def titleSnarfer(self, irc, msg, match):
|
||||||
|
@ -68,6 +68,12 @@ class WebTestCase(ChannelPluginTestCase):
|
|||||||
self.assertResponse(
|
self.assertResponse(
|
||||||
'title http://www.thefreedictionary.com/don%27t',
|
'title http://www.thefreedictionary.com/don%27t',
|
||||||
"Don't - definition of don't by The Free Dictionary")
|
"Don't - definition of don't by The Free Dictionary")
|
||||||
|
self.assertRegexp(
|
||||||
|
'title '
|
||||||
|
'https://twitter.com/rlbarnes/status/656554266744586240',
|
||||||
|
'"PSA: In Firefox 44 Nightly, "http:" pages with '
|
||||||
|
'<input type="password"> are now marked insecure. '
|
||||||
|
'https://t.co/qS9LxuRPdm"$')
|
||||||
|
|
||||||
def testTitleSnarfer(self):
|
def testTitleSnarfer(self):
|
||||||
try:
|
try:
|
||||||
|
@ -206,40 +206,47 @@ class HtmlToText(HTMLParser, object):
|
|||||||
"""Taken from some eff-bot code on c.l.p."""
|
"""Taken from some eff-bot code on c.l.p."""
|
||||||
entitydefs = entitydefs.copy()
|
entitydefs = entitydefs.copy()
|
||||||
entitydefs['nbsp'] = ' '
|
entitydefs['nbsp'] = ' '
|
||||||
|
entitydefs['apos'] = '\''
|
||||||
def __init__(self, tagReplace=' '):
|
def __init__(self, tagReplace=' '):
|
||||||
self.data = []
|
self.data = []
|
||||||
self.tagReplace = tagReplace
|
self.tagReplace = tagReplace
|
||||||
super(HtmlToText, self).__init__()
|
super(HtmlToText, self).__init__()
|
||||||
|
|
||||||
|
def append(self, data):
|
||||||
|
self.data.append(data)
|
||||||
|
|
||||||
def handle_starttag(self, tag, attr):
|
def handle_starttag(self, tag, attr):
|
||||||
self.data.append(self.tagReplace)
|
self.append(self.tagReplace)
|
||||||
|
|
||||||
def handle_endtag(self, tag):
|
def handle_endtag(self, tag):
|
||||||
self.data.append(self.tagReplace)
|
self.append(self.tagReplace)
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
self.data.append(data)
|
self.append(data)
|
||||||
|
|
||||||
def handle_entityref(self, data):
|
def handle_entityref(self, data):
|
||||||
if minisix.PY3:
|
if minisix.PY3:
|
||||||
if data in name2codepoint:
|
if data in name2codepoint:
|
||||||
self.data.append(chr(name2codepoint[data]))
|
self.append(chr(name2codepoint[data]))
|
||||||
elif isinstance(data, bytes):
|
elif isinstance(data, bytes):
|
||||||
self.data.append(data.decode())
|
self.append(data.decode())
|
||||||
else:
|
else:
|
||||||
self.data.append(data)
|
self.append(data)
|
||||||
else:
|
else:
|
||||||
if data in name2codepoint:
|
if data in name2codepoint:
|
||||||
self.data.append(unichr(name2codepoint[data]))
|
self.append(unichr(name2codepoint[data]))
|
||||||
elif isinstance(data, str):
|
elif isinstance(data, str):
|
||||||
self.data.append(data.decode('utf8', errors='replace'))
|
self.append(data.decode('utf8', errors='replace'))
|
||||||
else:
|
else:
|
||||||
self.data.append(data)
|
self.append(data)
|
||||||
|
|
||||||
def getText(self):
|
def getText(self):
|
||||||
text = ''.join(self.data).strip()
|
text = ''.join(self.data).strip()
|
||||||
return normalizeWhitespace(text)
|
return normalizeWhitespace(text)
|
||||||
|
|
||||||
|
def handle_charref(self, name):
|
||||||
|
self.append((unichr if minisix.PY2 else chr)(int(name)))
|
||||||
|
|
||||||
def htmlToText(s, tagReplace=' '):
|
def htmlToText(s, tagReplace=' '):
|
||||||
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
|
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user