2003-03-27 21:10:10 +01:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
###
|
|
|
|
# Copyright (c) 2002, Jeremiah Fincher
|
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
#
|
|
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer.
|
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer in the
|
|
|
|
# documentation and/or other materials provided with the distribution.
|
|
|
|
# * Neither the name of the author of this software nor the name of
|
|
|
|
# contributors to this software may be used to endorse or promote products
|
|
|
|
# derived from this software without specific prior written consent.
|
|
|
|
#
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
###
|
|
|
|
|
|
|
|
"""
|
|
|
|
Simple utility functions.
|
|
|
|
"""
|
|
|
|
|
|
|
|
from fix import *
|
|
|
|
|
|
|
|
import sgmllib
|
|
|
|
import htmlentitydefs
|
|
|
|
|
|
|
|
class HtmlToText(sgmllib.SGMLParser):
|
|
|
|
"""Taken from some eff-bot code on c.l.p."""
|
|
|
|
entitydefs = htmlentitydefs.entitydefs
|
|
|
|
def __init__(self):
|
|
|
|
self.data = []
|
|
|
|
sgmllib.SGMLParser.__init__(self)
|
|
|
|
|
|
|
|
def unknown_starttag(self, tag, attrib):
|
|
|
|
self.data.append(" ")
|
|
|
|
|
|
|
|
def unknown_endtag(self, tag):
|
|
|
|
self.data.append(" ")
|
|
|
|
|
|
|
|
def handle_data(self, data):
|
|
|
|
self.data.append(data)
|
|
|
|
|
|
|
|
def getText(self):
|
|
|
|
text = ''.join(self.data).strip()
|
2003-03-27 21:14:17 +01:00
|
|
|
return ' '.join(text.split()) # normalize whitespace
|
2003-03-27 21:10:10 +01:00
|
|
|
|
|
|
|
def htmlToText(s):
    """Return the plain text of the HTML string s.

    The string is run through an HtmlToText parser; the result is whatever
    that parser's getText() returns (tags replaced by spaces, whitespace
    normalized).
    """
    x = HtmlToText()
    x.feed(s)
    # feed() may hold back the tail of its input while waiting for more data
    # (for example an unterminated entity or tag at the very end of s).
    # close() tells the parser that no more input is coming so that the
    # remainder is processed instead of being silently dropped.
    x.close()
    return x.getText()
|
|
|
|
|
2003-03-31 07:14:21 +02:00
|
|
|
def eachSubstring(s):
    """Yield each non-empty leading slice of s, shortest first.

    For 'abc' this yields 'a', 'ab' and finally 'abc'.  Nothing is yielded
    when s is empty.
    """
    total = len(s)
    end = 0
    while end < total:
        end += 1
        yield s[:end]
|
|
|
|
|
|
|
|
def abbrev(strings):
    """Return a dict mapping each unambiguous abbreviation to its full string.

    An abbreviation is any non-empty leading portion of one of the given
    strings.  An abbreviation shared by two different strings is ambiguous
    and is left out, with one exception: a complete string always maps to
    itself, even when it is also the beginning of a longer string (so with
    'foo' and 'foobar', 'foo' still resolves to 'foo').

    Repeated entries in strings are treated as a single entry, so listing a
    string twice does not make its own abbreviations ambiguous.  Empty
    strings contribute nothing.
    """
    # Collapse duplicates, keeping first-seen order, so that a string never
    # conflicts with another copy of itself.
    seen = {}
    unique = []
    for s in strings:
        if s not in seen:
            seen[s] = True
            unique.append(s)

    d = {}
    for s in unique:
        for i in range(1, len(s) + 1):
            abbreviation = s[:i]
            if abbreviation not in d:
                d[abbreviation] = s
            else:
                # Claimed by more than one string: mark as ambiguous.
                d[abbreviation] = None

    # Drop the ambiguous entries; collect the keys first because the dict
    # must not change size while it is being iterated.
    removals = [key for key in d if d[key] is None]
    for key in removals:
        del d[key]

    # A complete string is never ambiguous as far as callers are concerned:
    # it always names itself.
    for s in unique:
        if s:
            d[s] = s
    return d
|
|
|
|
|
2003-03-27 21:10:10 +01:00
|
|
|
# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78:
|