Use HTMLParser instead of deprecated sgmllib in utils.web.

This commit is contained in:
Valentin Lorentz 2012-08-04 18:02:45 +02:00
parent 9ae90c3869
commit c4dfa55d65

View File

@@ -29,13 +29,14 @@
### ###
import re import re
import sys
import socket import socket
import urllib import urllib
import urllib2 import urllib2
import httplib import httplib
import sgmllib
import urlparse import urlparse
import htmlentitydefs import htmlentitydefs
from HTMLParser import HTMLParser
sockerrors = (socket.error,) sockerrors = (socket.error,)
try: try:
@@ -150,19 +151,19 @@ def getUrl(url, size=None, headers=None, data=None):
def getDomain(url): def getDomain(url):
return urlparse.urlparse(url)[1] return urlparse.urlparse(url)[1]
class HtmlToText(sgmllib.SGMLParser): class HtmlToText(HTMLParser, object):
"""Taken from some eff-bot code on c.l.p.""" """Taken from some eff-bot code on c.l.p."""
entitydefs = htmlentitydefs.entitydefs.copy() entitydefs = htmlentitydefs.entitydefs.copy()
entitydefs['nbsp'] = ' ' entitydefs['nbsp'] = ' '
def __init__(self, tagReplace=' '): def __init__(self, tagReplace=' '):
self.data = [] self.data = []
self.tagReplace = tagReplace self.tagReplace = tagReplace
sgmllib.SGMLParser.__init__(self) super(HtmlToText, self).__init__()
def unknown_starttag(self, tag, attr): def handle_starttag(self, tag, attr):
self.data.append(self.tagReplace) self.data.append(self.tagReplace)
def unknown_endtag(self, tag): def handle_endtag(self, tag):
self.data.append(self.tagReplace) self.data.append(self.tagReplace)
def handle_data(self, data): def handle_data(self, data):
@@ -175,6 +176,8 @@ class HtmlToText(sgmllib.SGMLParser):
def htmlToText(s, tagReplace=' '): def htmlToText(s, tagReplace=' '):
"""Turns HTML into text. tagReplace is a string to replace HTML tags with. """Turns HTML into text. tagReplace is a string to replace HTML tags with.
""" """
if sys.version_info[0] >= 3 and isinstance(s, bytes):
s = s.decode()
x = HtmlToText(tagReplace) x = HtmlToText(tagReplace)
x.feed(s) x.feed(s)
return x.getText() return x.getText()