Added keyword arg to htmlToText to determine how to replace HTML tags.

This commit is contained in:
Jeremy Fincher 2003-04-02 09:20:49 +00:00
parent dcf7fa5924
commit 5ef1d3eaaa

View File

@@ -43,15 +43,16 @@ import htmlentitydefs
class HtmlToText(sgmllib.SGMLParser): class HtmlToText(sgmllib.SGMLParser):
"""Taken from some eff-bot code on c.l.p.""" """Taken from some eff-bot code on c.l.p."""
entitydefs = htmlentitydefs.entitydefs entitydefs = htmlentitydefs.entitydefs
def __init__(self): def __init__(self, tagReplace=' '):
self.data = [] self.data = []
self.tagReplace = tagReplace
sgmllib.SGMLParser.__init__(self) sgmllib.SGMLParser.__init__(self)
def unknown_starttag(self, tag, attrib): def unknown_starttag(self, tag, attrib):
self.data.append(" ") self.data.append(self.tagReplace)
def unknown_endtag(self, tag): def unknown_endtag(self, tag):
self.data.append(" ") self.data.append(self.tagReplace)
def handle_data(self, data): def handle_data(self, data):
self.data.append(data) self.data.append(data)
@@ -60,8 +61,8 @@ class HtmlToText(sgmllib.SGMLParser):
text = ''.join(self.data).strip() text = ''.join(self.data).strip()
return ' '.join(text.split()) # normalize whitespace return ' '.join(text.split()) # normalize whitespace
def htmlToText(s): def htmlToText(s, tagReplace=' '):
x = HtmlToText() x = HtmlToText(tagReplace)
x.feed(s) x.feed(s)
return x.getText() return x.getText()