mirror of
https://github.com/Mikaela/Limnoria.git
synced 2024-11-30 14:59:34 +01:00
Use HTMLParser instead of deprecated sgmllib in utils.web.
This commit is contained in:
parent
9ae90c3869
commit
c4dfa55d65
@@ -29,13 +29,14 @@
|
|||||||
###
|
###
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
import socket
|
import socket
|
||||||
import urllib
|
import urllib
|
||||||
import urllib2
|
import urllib2
|
||||||
import httplib
|
import httplib
|
||||||
import sgmllib
|
|
||||||
import urlparse
|
import urlparse
|
||||||
import htmlentitydefs
|
import htmlentitydefs
|
||||||
|
from HTMLParser import HTMLParser
|
||||||
|
|
||||||
sockerrors = (socket.error,)
|
sockerrors = (socket.error,)
|
||||||
try:
|
try:
|
||||||
@@ -150,19 +151,19 @@ def getUrl(url, size=None, headers=None, data=None):
|
|||||||
def getDomain(url):
|
def getDomain(url):
|
||||||
return urlparse.urlparse(url)[1]
|
return urlparse.urlparse(url)[1]
|
||||||
|
|
||||||
class HtmlToText(sgmllib.SGMLParser):
|
class HtmlToText(HTMLParser, object):
|
||||||
"""Taken from some eff-bot code on c.l.p."""
|
"""Taken from some eff-bot code on c.l.p."""
|
||||||
entitydefs = htmlentitydefs.entitydefs.copy()
|
entitydefs = htmlentitydefs.entitydefs.copy()
|
||||||
entitydefs['nbsp'] = ' '
|
entitydefs['nbsp'] = ' '
|
||||||
def __init__(self, tagReplace=' '):
|
def __init__(self, tagReplace=' '):
|
||||||
self.data = []
|
self.data = []
|
||||||
self.tagReplace = tagReplace
|
self.tagReplace = tagReplace
|
||||||
sgmllib.SGMLParser.__init__(self)
|
super(HtmlToText, self).__init__()
|
||||||
|
|
||||||
def unknown_starttag(self, tag, attr):
|
def handle_starttag(self, tag, attr):
|
||||||
self.data.append(self.tagReplace)
|
self.data.append(self.tagReplace)
|
||||||
|
|
||||||
def unknown_endtag(self, tag):
|
def handle_endtag(self, tag):
|
||||||
self.data.append(self.tagReplace)
|
self.data.append(self.tagReplace)
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
@@ -175,6 +176,8 @@ class HtmlToText(sgmllib.SGMLParser):
|
|||||||
def htmlToText(s, tagReplace=' '):
|
def htmlToText(s, tagReplace=' '):
|
||||||
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
|
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
|
||||||
"""
|
"""
|
||||||
|
if sys.version_info[0] >= 3 and isinstance(s, bytes):
|
||||||
|
s = s.decode()
|
||||||
x = HtmlToText(tagReplace)
|
x = HtmlToText(tagReplace)
|
||||||
x.feed(s)
|
x.feed(s)
|
||||||
return x.getText()
|
return x.getText()
|
||||||
|
Loading…
Reference in New Issue
Block a user