2003-03-27 21:10:10 +01:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
###
|
|
|
|
# Copyright (c) 2002, Jeremiah Fincher
|
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
#
|
|
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer.
|
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer in the
|
|
|
|
# documentation and/or other materials provided with the distribution.
|
|
|
|
# * Neither the name of the author of this software nor the name of
|
|
|
|
# contributors to this software may be used to endorse or promote products
|
|
|
|
# derived from this software without specific prior written consent.
|
|
|
|
#
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
###
|
|
|
|
|
|
|
|
"""
|
|
|
|
Simple utility functions.
|
|
|
|
"""
|
|
|
|
|
2003-08-07 04:48:44 +02:00
|
|
|
## from __future__ import generators
|
2003-03-31 08:55:34 +02:00
|
|
|
|
2003-03-27 21:10:10 +01:00
|
|
|
from fix import *
|
|
|
|
|
2003-08-17 22:09:09 +02:00
|
|
|
import os
|
2003-04-16 07:26:24 +02:00
|
|
|
import re
|
2003-04-05 13:23:28 +02:00
|
|
|
import string
|
2003-03-27 21:10:10 +01:00
|
|
|
import sgmllib
|
2003-08-27 18:25:43 +02:00
|
|
|
import textwrap
|
2003-03-27 21:10:10 +01:00
|
|
|
import htmlentitydefs
|
|
|
|
|
2003-09-01 20:39:27 +02:00
|
|
|
def normalizeWhitespace(s):
    """Collapses every run of whitespace in s into a single space.

    Leading and trailing whitespace is dropped entirely.
    """
    words = s.split()
    return ' '.join(words)
|
|
|
|
|
2003-03-27 21:10:10 +01:00
|
|
|
class HtmlToText(sgmllib.SGMLParser):
    """SGML parser that collects character data and drops the markup.

    Every start or end tag is replaced by the tagReplace string; the
    accumulated pieces are joined by getText().  (Taken from some eff-bot
    code on c.l.p.)
    """
    entitydefs = htmlentitydefs.entitydefs

    def __init__(self, tagReplace=' '):
        self.data = []
        self.tagReplace = tagReplace
        sgmllib.SGMLParser.__init__(self)

    def _replaceTag(self):
        # A tag contributes only its replacement string to the output.
        self.data.append(self.tagReplace)

    def unknown_starttag(self, tag, attr):
        self._replaceTag()

    def unknown_endtag(self, tag):
        self._replaceTag()

    def handle_data(self, data):
        self.data.append(data)

    def getText(self):
        """Returns the collected text with its whitespace normalized."""
        collected = ''.join(self.data)
        return normalizeWhitespace(collected.strip())
|
2003-03-27 21:10:10 +01:00
|
|
|
|
2003-04-02 11:20:49 +02:00
|
|
|
def htmlToText(s, tagReplace=' '):
    """Converts the HTML in s to plain text.

    Each HTML tag is replaced by the string tagReplace.
    """
    parser = HtmlToText(tagReplace)
    parser.feed(s)
    return parser.getText()
|
|
|
|
|
2003-03-31 07:14:21 +02:00
|
|
|
def eachSubstring(s):
    """Yields each leading substring of s, shortest first.

    For 'abc' this yields 'a', 'ab' and then 'abc'.
    """
    stop = len(s)
    end = 1
    while end <= stop:
        yield s[:end]
        end += 1
|
|
|
|
|
|
|
|
def abbrev(strings):
    """Returns a dictionary mapping unambiguous abbreviations to full forms.

    An abbreviation is any leading substring of one of the given strings.
    Abbreviations shared by more than one string are left out, except that
    a string which is itself one of the given strings always maps to
    itself, even when it is also a prefix of a longer string.
    """
    strings = list(strings)
    # Dictionary used purely for constant-time "is this a full string?"
    # checks.
    exact = dict.fromkeys(strings)
    d = {}
    for s in strings:
        for i in range(1, len(s) + 1):
            prefix = s[:i]
            if prefix in exact:
                # A full string is never ambiguous, and must not be claimed
                # by a longer string that merely starts with it.
                d[prefix] = prefix
            elif prefix in d:
                # Seen before for another string: mark as ambiguous.
                d[prefix] = None
            else:
                d[prefix] = s
    for key in [k for k in d if d[k] is None]:
        del d[key]
    return d
|
|
|
|
|
2003-05-20 17:37:25 +02:00
|
|
|
def timeElapsed(elapsed, leadingZeroes=False, years=True, weeks=True,
                days=True, hours=True, minutes=True, seconds=True):
    """Given <elapsed> seconds, returns a string with an English description
    of how much time has passed.

    leadingZeroes determines whether 0 days, 0 hours, etc. will be printed
    before the first non-zero period; once a non-zero period has been
    printed, every enabled smaller period is printed too.  The remaining
    flags determine which time periods may be used; a disabled period is
    never reported and its time is carried into the next enabled smaller
    period (so minutes=False reports 125 seconds as '125 seconds').

    Raises ValueError if nothing would be reported with the given flags.
    """
    elapsed = int(elapsed)
    assert years or weeks or days or \
           hours or minutes or seconds, 'One flag must be True'

    def describe(count, unit):
        # Only exactly one of something is singular.
        if count == 1:
            return '1 %s' % unit
        return '%s %ss' % (count, unit)

    periods = [(years, 31536000, 'year'),
               (weeks, 604800, 'week'),
               (days, 86400, 'day'),
               (hours, 3600, 'hour'),
               (minutes, 60, 'minute')]
    ret = []
    for (wanted, size, unit) in periods:
        if not wanted:
            continue
        count, elapsed = divmod(elapsed, size)
        if leadingZeroes or count:
            if count:
                # From here on, zero-valued periods are printed as well.
                leadingZeroes = True
            ret.append(describe(count, unit))
    if seconds:
        # Whatever is left over is reported in seconds, even when zero.
        ret.append(describe(elapsed, 'second'))
    if not ret:
        raise ValueError('Time difference not great enough to be noted.')
    if len(ret) == 1:
        return ret[0]
    return commaAndify(ret)
|
2003-08-20 11:24:57 +02:00
|
|
|
|
2003-04-04 17:49:24 +02:00
|
|
|
def distance(s, t):
    """Computes the Levenshtein edit distance between the strings s and t."""
    rows = len(s)
    cols = len(t)
    if not rows:
        return cols
    if not cols:
        return rows
    # table[i][j] is the distance between the first i characters of s and
    # the first j characters of t.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for j in range(cols + 1):
        table[0][j] = j
    for i in range(rows + 1):
        table[i][0] = i
    for i in range(1, rows + 1):
        previous = table[i - 1]
        current = table[i]
        for j in range(1, cols + 1):
            substitution = previous[j - 1] + int(s[i - 1] != t[j - 1])
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            current[j] = min(deletion, insertion, substitution)
    return table[rows][cols]
|
2003-04-05 13:23:28 +02:00
|
|
|
|
|
|
|
# Soundex digit for every uppercase ASCII letter.  '0' marks the letters
# that carry no code of their own and are dropped from the final hash.
_soundexCodes = dict(zip(string.ascii_uppercase,
                         '01230120022455012623010202'))

def soundex(s, length=4):
    """Returns the soundex hash of a given string.

    Only ASCII letters are considered; everything else in s is ignored.
    The hash is the first letter followed by digits, padded with '0' and
    cut to <length> characters.  If length is 0 the full, unpadded hash is
    returned.

    Raises ValueError if s contains no letters at all.
    """
    # Uppercase everything and delete non-letters *before* picking the
    # first character, so the hash always starts with a letter.
    letters = [c for c in s.upper() if c in _soundexCodes]
    if not letters:
        raise ValueError('Invalid string for soundex: %r' % (s,))
    firstChar = letters[0]
    codes = ''.join([_soundexCodes[c] for c in letters])
    # Remove the first letter's own code and any immediate repeats of it.
    codes = codes.lstrip(codes[0])
    L = [firstChar]
    for c in codes:
        # Collapse runs of the same digit into one.
        if c != L[-1]:
            L.append(c)
    L = [c for c in L if c != '0'] + ['0'] * (length - 1)
    s = ''.join(L)
    if length:
        return s[:length]
    return s.rstrip('0')
|
2003-04-03 11:11:57 +02:00
|
|
|
|
2003-04-12 14:50:20 +02:00
|
|
|
def dqrepr(s):
|
|
|
|
"""Returns a repr() of s guaranteed to be in double quotes."""
|
2003-07-31 08:20:58 +02:00
|
|
|
# The wankers-that-be decided not to use double-quotes anymore in 2.3.
|
|
|
|
# return '"' + repr("'\x00" + s)[6:]
|
2003-08-17 08:28:05 +02:00
|
|
|
return '"%s"' % s.encode('string_escape').replace('"', '\\"')
|
2003-04-12 14:50:20 +02:00
|
|
|
|
2003-04-16 07:26:24 +02:00
|
|
|
# Matches every slash that is not immediately preceded by a backslash.
nonEscapedSlashes = re.compile(r'(?<!\\)/')

def perlReToPythonRe(s):
    """Converts a string representation of a Perl regular expression (i.e.,
    m/^foo$/i or /foo|bar/) to a compiled Python regular expression.

    Escaped slashes in the pattern are unescaped.  Each flag letter is
    looked up, uppercased, as an attribute of the re module.

    Raises ValueError if s does not have exactly the form
    [m]/regexp/flags, or if a flag letter is not known to re.
    """
    parts = nonEscapedSlashes.split(s)
    if len(parts) != 3:
        raise ValueError('Must be of the form m/regexp/flags or '
                         '/regexp/flags')
    (kind, regexp, flags) = parts
    regexp = regexp.replace('\\/', '/')
    if kind not in ('', 'm'):
        raise ValueError('Invalid kind: must be in ("", "m")')
    flag = 0
    for c in flags.upper():
        try:
            flag |= getattr(re, c)
        except AttributeError:
            raise ValueError('Invalid flag: %s' % c)
    return re.compile(regexp, flag)
|
2003-08-20 11:24:57 +02:00
|
|
|
|
2003-04-16 07:26:24 +02:00
|
|
|
def perlReToReplacer(s):
    """Converts a string representation of a Perl regular expression (i.e.,
    s/foo/bar/g or s/foo/bar/i) to a Python function doing the equivalent
    replacement.

    The returned function takes a string and returns it with the first
    match replaced, or with every match replaced when the g flag is given.
    Escaped slashes are unescaped in both the pattern and the replacement.

    Raises ValueError if s is not of the form s/regexp/replacement/flags
    or if the pattern or flags are rejected by perlReToPythonRe.
    """
    parts = nonEscapedSlashes.split(s)
    if len(parts) != 4:
        raise ValueError('Must be of the form s/regexp/replacement/flags')
    (kind, regexp, replace, flags) = parts
    if kind != 's':
        raise ValueError('Invalid kind: must be "s"')
    # The pattern is unescaped by perlReToPythonRe; the replacement text
    # needs the same treatment or it keeps its literal backslashes.
    replace = replace.replace('\\/', '/')
    # 'g' is not a flag the re module knows about; it only selects how many
    # matches get replaced.
    g = 'g' in flags
    flags = flags.replace('g', '')
    r = perlReToPythonRe('/'.join(('', regexp, flags)))
    if g:
        return lambda s: r.sub(replace, s)
    return lambda s: r.sub(replace, s, 1)
|
|
|
|
|
2003-08-17 22:09:09 +02:00
|
|
|
def findBinaryInPath(s):
    """Return full path of a binary if it's in PATH, otherwise return None.

    The directories of the PATH environment variable are searched in
    order and the first existing match wins.  None is also returned when
    PATH is unset or empty.
    """
    searchPath = os.getenv('PATH')
    if not searchPath:
        return None
    # os.pathsep is ':' on POSIX systems and ';' on Windows.
    for directory in searchPath.split(os.pathsep):
        filename = os.path.join(directory, s)
        if os.path.exists(filename):
            return filename
    return None
|
|
|
|
|
2003-08-22 23:31:17 +02:00
|
|
|
def commaAndify(seq):
    """Given a sequence, returns an English clause for that sequence.

    I.e., given [1, 2, 3], returns '1, 2, and 3'.  An empty sequence gives
    '', a single item gives just that item, and two items are joined with
    ' and '.  Items are converted to strings with %s formatting, so they
    need not be strings themselves.
    """
    # The 1-tuples keep %s formatting safe for items that are tuples.
    L = ['%s' % (item,) for item in seq]
    if len(L) == 0:
        return ''
    elif len(L) == 1:
        return L[0]
    elif len(L) == 2:
        return '%s and %s' % (L[0], L[1])
    else:
        L[-1] = 'and %s' % L[-1]
        return ', '.join(L)
|
2003-08-20 11:24:57 +02:00
|
|
|
|
2003-08-23 09:57:04 +02:00
|
|
|
# Captures the text before a trailing ', the' (any case) and the article.
_unCommaTheRe = re.compile(r'(.*),\s*(the)$', re.I)

def unCommaThe(s):
    """Moves a trailing ', the' to the front: 'foo, the' becomes 'the foo'.

    Strings that do not end that way are returned unchanged.
    """
    match = _unCommaTheRe.match(s)
    if match is None:
        return s
    (rest, article) = match.groups()
    return '%s %s' % (article, rest)
|
|
|
|
|
2003-08-27 18:25:43 +02:00
|
|
|
def wrapLines(s):
    """Word wraps each line of s separately and rejoins them with newlines.
    """
    wrapped = [textwrap.fill(paragraph) for paragraph in s.splitlines()]
    return '\n'.join(wrapped)
|
|
|
|
|
2003-09-01 07:42:35 +02:00
|
|
|
# Irregular plurals, keyed by the singular form.
plurals = {}

def pluralize(i, s):
    """Returns s in the form that matches the count i.

    A count of exactly 1 gives s unchanged.  Any other count gives the
    entry for s in the plurals dictionary if there is one, and s with an
    's' appended otherwise.
    """
    if i == 1:
        return s
    if s in plurals:
        return plurals[s]
    return s + 's'
|
|
|
|
|
2003-09-03 11:40:26 +02:00
|
|
|
def nItems(n, item, between=None):
    """Returns a phrase such as '2 cats', pluralizing item to match n.

    If between is given it is placed between the number and the item.
    """
    noun = pluralize(n, item)
    if between is not None:
        return '%s %s %s' % (n, between, noun)
    return '%s %s' % (n, noun)
|
|
|
|
|
2003-09-01 07:42:35 +02:00
|
|
|
def be(i):
    """Returns 'is' when the count i is exactly 1 and 'are' otherwise."""
    if i == 1:
        return 'is'
    return 'are'
|
|
|
|
|
2003-09-03 10:51:45 +02:00
|
|
|
def sortBy(f, L, cmp=None):
    """Uses the decorate-sort-undecorate pattern to sort L by function f.

    L is sorted in place and nothing is returned.  If cmp is given it is
    passed to the list's sort method and is called with the decorated
    (f(elt), elt) pairs; by default the pairs are sorted in their natural
    order.  L is only modified once both f and the sort have succeeded, so
    an exception leaves it exactly as it was.
    """
    # Work on a separate list so a failure cannot leave L half-decorated.
    decorated = [(f(elt), elt) for elt in L]
    if cmp is None:
        decorated.sort()
    else:
        decorated.sort(cmp)
    L[:] = [elt for (_, elt) in decorated]
|
2003-09-01 20:39:27 +02:00
|
|
|
|
2003-09-04 22:42:37 +02:00
|
|
|
def mktemp(suffix=''):
|
|
|
|
"""Gives a decent random string, suitable for a filename."""
|
|
|
|
import sha
|
|
|
|
import md5
|
|
|
|
import time
|
|
|
|
import random
|
|
|
|
r = random.Random()
|
|
|
|
m = md5.md5(suffix)
|
|
|
|
r.seed(time.time())
|
|
|
|
s = str(r.getstate())
|
|
|
|
for x in xrange(0, random.randrange(400), random.randrange(1, 5)):
|
|
|
|
m.update(str(x))
|
|
|
|
m.update(s)
|
|
|
|
m.update(str(time.time()))
|
|
|
|
s = m.hexdigest()
|
|
|
|
return sha.sha(s + str(time.time())).hexdigest() + suffix
|
|
|
|
|
|
|
|
def itersplit(isSeparator, iterable, maxsplit=-1, yieldEmpty=False):
    """Splits an iterable into lists at the elements isSeparator accepts.

    The separator elements themselves are dropped.  At most maxsplit
    separators are honoured (a negative value means no limit); after that
    every element is kept.  Empty lists are only yielded when yieldEmpty
    is true.
    """
    chunk = []
    for item in iterable:
        if maxsplit != 0 and isSeparator(item):
            maxsplit -= 1
            if yieldEmpty or chunk:
                yield chunk
            chunk = []
        else:
            chunk.append(item)
    if yieldEmpty or chunk:
        yield chunk
|
|
|
|
|
|
|
|
# The types flatten() treats as strings.  Interpreters without a separate
# unicode type get (str, bytes) instead.
try:
    _flattenStringTypes = (str, unicode)
except NameError:
    _flattenStringTypes = (str, bytes)

def flatten(seq, strings=False):
    """Flattens arbitrarily nested iterables into a single stream of items.

    Elements that cannot be iterated over are yielded as they are.  Strings
    are yielded whole unless strings is true, in which case they are broken
    up into their individual elements; the flag applies at every level of
    nesting.

    Raises TypeError (on iteration) if seq itself is not iterable.
    """
    for elt in seq:
        if isinstance(elt, _flattenStringTypes):
            if strings:
                # Iterate directly rather than recursing: a one-character
                # string contains itself and would never bottom out.
                for c in elt:
                    yield c
            else:
                yield elt
            continue
        try:
            iter(elt)
        except TypeError:
            # Not iterable, so it is a leaf.
            yield elt
            continue
        for x in flatten(elt, strings):
            yield x
|
|
|
|
|
|
|
|
class IterableMap(object):
    """Base class that derives the usual mapping accessors from iteritems().

    Subclasses define .iteritems() to yield (key, value) pairs; the other
    iterators, the list-returning methods, len() and truth testing are all
    built on top of it.
    """
    def iteritems(self):
        raise NotImplementedError

    def iterkeys(self):
        for (k, unused) in self.iteritems():
            yield k

    def itervalues(self):
        for (unused, v) in self.iteritems():
            yield v

    def items(self):
        return [pair for pair in self.iteritems()]

    def keys(self):
        return [k for k in self.iterkeys()]

    def values(self):
        return [v for v in self.itervalues()]

    def __len__(self):
        # Counting means walking every pair.
        return len([None for unused in self.iteritems()])

    def __nonzero__(self):
        # True as soon as a first pair shows up.
        for unused in self.iteritems():
            return True
        return False
|
|
|
|
|
|
|
|
|
2003-03-27 21:10:10 +01:00
|
|
|
# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78:
|