From 5ef1d3eaaaa7e9870d73bc26950b25a3b40bfd6c Mon Sep 17 00:00:00 2001 From: Jeremy Fincher Date: Wed, 2 Apr 2003 09:20:49 +0000 Subject: [PATCH] Added keyword arg to htmlToText to determine how to replace HTML tags. --- src/utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/utils.py b/src/utils.py index 71fd68e2c..70ef5a61e 100755 --- a/src/utils.py +++ b/src/utils.py @@ -43,15 +43,16 @@ import htmlentitydefs class HtmlToText(sgmllib.SGMLParser): """Taken from some eff-bot code on c.l.p.""" entitydefs = htmlentitydefs.entitydefs - def __init__(self): + def __init__(self, tagReplace=' '): self.data = [] + self.tagReplace = tagReplace sgmllib.SGMLParser.__init__(self) def unknown_starttag(self, tag, attrib): - self.data.append(" ") + self.data.append(self.tagReplace) def unknown_endtag(self, tag): - self.data.append(" ") + self.data.append(self.tagReplace) def handle_data(self, data): self.data.append(data) @@ -60,8 +61,8 @@ class HtmlToText(sgmllib.SGMLParser): text = ''.join(self.data).strip() return ' '.join(text.split()) # normalize whitespace -def htmlToText(s): - x = HtmlToText() +def htmlToText(s, tagReplace=' '): + x = HtmlToText(tagReplace) x.feed(s) return x.getText()