Added normalizeWhitespace.

This commit is contained in:
Jeremy Fincher 2003-09-01 18:39:27 +00:00
parent ebc00fe4d7
commit 83707f81c1
2 changed files with 12 additions and 1 deletions

View File

@ -44,6 +44,10 @@ import sgmllib
import textwrap import textwrap
import htmlentitydefs import htmlentitydefs
def normalizeWhitespace(s):
    r"""Return s with every run of whitespace collapsed to a single space.

    Equivalent to replacing each match of \s+ with one space and then
    stripping the ends: str.split() with no arguments splits on any run of
    whitespace and discards leading/trailing empty fields, so the result
    never starts or ends with a space.  An empty or all-whitespace string
    yields ''.
    """
    # The docstring is raw so that the literal "\s" is not parsed as an
    # (invalid) string escape sequence.
    return ' '.join(s.split())
class HtmlToText(sgmllib.SGMLParser): class HtmlToText(sgmllib.SGMLParser):
"""Taken from some eff-bot code on c.l.p.""" """Taken from some eff-bot code on c.l.p."""
entitydefs = htmlentitydefs.entitydefs entitydefs = htmlentitydefs.entitydefs
@ -63,7 +67,7 @@ class HtmlToText(sgmllib.SGMLParser):
def getText(self): def getText(self):
text = ''.join(self.data).strip() text = ''.join(self.data).strip()
return ' '.join(text.split()) # normalize whitespace return normalizeWhitespace(text)
def htmlToText(s, tagReplace=' '): def htmlToText(s, tagReplace=' '):
"""Turns HTML into text. tagReplace is a string to replace HTML tags with. """Turns HTML into text. tagReplace is a string to replace HTML tags with.
@ -302,4 +306,5 @@ def be(i):
else: else:
return 'are' return 'are'
# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: # vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78:

View File

@ -157,5 +157,11 @@ class UtilsTest(unittest.TestCase):
self.assertEqual(utils.unCommaThe('foo bar, the'), 'the foo bar') self.assertEqual(utils.unCommaThe('foo bar, the'), 'the foo bar')
self.assertEqual(utils.unCommaThe('foo bar, The'), 'The foo bar') self.assertEqual(utils.unCommaThe('foo bar, The'), 'The foo bar')
self.assertEqual(utils.unCommaThe('foo bar,the'), 'the foo bar') self.assertEqual(utils.unCommaThe('foo bar,the'), 'the foo bar')
def testNormalizeWhitespace(self):
    # Already-normalized input must come back unchanged.
    self.assertEqual(utils.normalizeWhitespace('foo bar'), 'foo bar')
    # A single non-space whitespace character becomes one space.
    self.assertEqual(utils.normalizeWhitespace('foo\nbar'), 'foo bar')
    self.assertEqual(utils.normalizeWhitespace('foo\tbar'), 'foo bar')
    # Runs of (mixed) whitespace collapse to exactly one space.
    self.assertEqual(utils.normalizeWhitespace('foo   bar'), 'foo bar')
    self.assertEqual(utils.normalizeWhitespace('foo \t\n bar'), 'foo bar')
    # Leading and trailing whitespace is dropped entirely.
    self.assertEqual(utils.normalizeWhitespace('  foo bar\n'), 'foo bar')
    # Empty and all-whitespace input normalize to the empty string.
    self.assertEqual(utils.normalizeWhitespace(''), '')
    self.assertEqual(utils.normalizeWhitespace(' \t\n'), '')
# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: # vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: