Limnoria/src/utils/str.py
2012-08-04 17:36:15 +02:00

519 lines
17 KiB
Python

###
# Copyright (c) 2002-2005, Jeremiah Fincher
# Copyright (c) 2008-2009, James Vega
# Copyright (c) 2010, Valentin Lorentz
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions, and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the author of this software nor the name of
# contributors to this software may be used to endorse or promote products
# derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###
"""
Simple utility functions related to strings.
"""
import re
import new
import sys
import string
import textwrap
from iter import all, any
from structures import TwoWayDictionary
from supybot.i18n import PluginInternationalization
internationalizeFunction=PluginInternationalization().internationalizeFunction
chars = string.maketrans('', '')
def rsplit(s, sep=None, maxsplit=-1):
"""Equivalent to str.split, except splitting from the right."""
if sys.version_info < (2, 4, 0):
if sep is not None:
sep = sep[::-1]
L = s[::-1].split(sep, maxsplit)
L.reverse()
return [s[::-1] for s in L]
else:
return s.rsplit(sep, maxsplit)
def normalizeWhitespace(s, removeNewline=True):
"""Normalizes the whitespace in a string; \s+ becomes one space."""
replace_fn = lambda x, y, z: str.replace(x, y, z)
if isinstance(s, unicode):
replace_fn = lambda x, y, z: unicode.replace(x, y, z)
else:
s = str(s)
if removeNewline:
s = replace_fn(s, '\n', '')
s = replace_fn(s, '\t', ' ')
while ' ' in s:
s = replace_fn(s, ' ', ' ')
return s
def distance(s, t):
"""Returns the levenshtein edit distance between two strings."""
n = len(s)
m = len(t)
if n == 0:
return m
elif m == 0:
return n
d = []
for i in xrange(n+1):
d.append([])
for j in xrange(m+1):
d[i].append(0)
d[0][j] = j
d[i][0] = i
for i in xrange(1, n+1):
cs = s[i-1]
for j in xrange(1, m+1):
ct = t[j-1]
cost = int(cs != ct)
d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost)
return d[n][m]
_soundextrans = string.maketrans(string.ascii_uppercase,
'01230120022455012623010202')
_notUpper = chars.translate(chars, string.ascii_uppercase)
def soundex(s, length=4):
"""Returns the soundex hash of a given string."""
s = s.upper() # Make everything uppercase.
s = s.translate(chars, _notUpper) # Delete non-letters.
if not s:
raise ValueError, 'Invalid string for soundex: %s'
firstChar = s[0] # Save the first character.
s = s.translate(_soundextrans) # Convert to soundex numbers.
s = s.lstrip(s[0]) # Remove all repeated first characters.
L = [firstChar]
for c in s:
if c != L[-1]:
L.append(c)
L = [c for c in L if c != '0'] + (['0']*(length-1))
s = ''.join(L)
return length and s[:length] or s.rstrip('0')
def dqrepr(s):
"""Returns a repr() of s guaranteed to be in double quotes."""
# The wankers-that-be decided not to use double-quotes anymore in 2.3.
# return '"' + repr("'\x00" + s)[6:]
encoding = 'string_escape' if sys.version_info[0] < 3 else 'unicode_escape'
return '"%s"' % s.encode(encoding).decode().replace('"', '\\"')
def quoted(s):
"""Returns a quoted s."""
return '"%s"' % s
_openers = '{[(<'
_closers = '}])>'
def _getSep(s, allowBraces=False):
if len(s) < 2:
raise ValueError, 'string given to _getSep is too short: %r' % s
if allowBraces:
braces = _closers
else:
braces = _openers + _closers
if s.startswith('m') or s.startswith('s'):
separator = s[1]
else:
separator = s[0]
if separator.isalnum() or separator in braces:
raise ValueError, \
'Invalid separator: separator must not be alphanumeric or in ' \
'"%s"' % braces
return separator
def perlReToPythonRe(s):
"""Converts a string representation of a Perl regular expression (i.e.,
m/^foo$/i or /foo|bar/) to a Python regular expression.
"""
opener = closer = _getSep(s, True)
if opener in '{[(<':
closer = _closers[_openers.index(opener)]
opener = re.escape(opener)
closer = re.escape(closer)
matcher = re.compile(r'm?%s((?:\\.|[^\\])*)%s(.*)' % (opener, closer))
try:
(regexp, flags) = matcher.match(s).groups()
except AttributeError: # Unpack list of wrong size.
raise ValueError, 'Must be of the form m/.../ or /.../'
regexp = regexp.replace('\\'+opener, opener)
if opener != closer:
regexp = regexp.replace('\\'+closer, closer)
flag = 0
try:
for c in flags.upper():
flag |= getattr(re, c)
except AttributeError:
raise ValueError, 'Invalid flag: %s' % c
try:
return re.compile(regexp, flag)
except re.error, e:
raise ValueError, str(e)
def perlReToReplacer(s):
"""Converts a string representation of a Perl regular expression (i.e.,
s/foo/bar/g or s/foo/bar/i) to a Python function doing the equivalent
replacement.
"""
sep = _getSep(s)
escaped = re.escape(sep)
matcher = re.compile(r's%s((?:\\.|[^\\])*)%s((?:\\.|[^\\])*)%s(.*)'
% (escaped, escaped, escaped))
try:
(regexp, replace, flags) = matcher.match(s).groups()
except AttributeError: # Unpack list of wrong size.
raise ValueError, 'Must be of the form s/.../.../'
regexp = regexp.replace('\x08', r'\b')
replace = replace.replace('\\'+sep, sep)
for i in xrange(10):
replace = replace.replace(chr(i), r'\%s' % i)
g = False
if 'g' in flags:
g = True
flags = filter('g'.__ne__, flags)
if isinstance(flags, list):
flags = ''.join(flags)
r = perlReToPythonRe(sep.join(('', regexp, flags)))
if g:
return lambda s: r.sub(replace, s)
else:
return lambda s: r.sub(replace, s, 1)
_perlVarSubstituteRe = re.compile(r'\$\{([^}]+)\}|\$([a-zA-Z][a-zA-Z0-9]*)')
def perlVariableSubstitute(vars, text):
def replacer(m):
(braced, unbraced) = m.groups()
var = braced or unbraced
try:
x = vars[var]
if callable(x):
return x()
else:
return str(x)
except KeyError:
if braced:
return '${%s}' % braced
else:
return '$' + unbraced
return _perlVarSubstituteRe.sub(replacer, text)
class MultipleReplacer:
"""Return a callable that replaces all dict keys by the associated
value. More efficient than multiple .replace()."""
# We use an object instead of a lambda function because it avoids the
# need for using the staticmethod() on the lambda function if assigning
# it to a class in Python 3.
def __init__(self, dict_):
self._dict = dict_
dict_ = {re.escape(key): val for key,val in dict_.items()}
self._matcher = re.compile('|'.join(dict_.keys()))
def __call__(self, s):
return self._matcher.sub(lambda m: self._dict[m.group(0)], s)
def multipleReplacer(dict_):
return MultipleReplacer(dict_)
class MultipleRemover:
"""Return a callable that removes all words in the list. A bit more
efficient than multipleReplacer"""
# See comment of MultipleReplacer
def __init__(self, list_):
list_ = [re.escape(x) for x in list_]
self._matcher = re.compile('|'.join(list_))
def __call__(self, s):
return self._matcher.sub(lambda m: '', s)
def commaAndify(seq, comma=',', And='and'):
"""Given a a sequence, returns an English clause for that sequence.
I.e., given [1, 2, 3], returns '1, 2, and 3'
"""
L = list(seq)
if len(L) == 0:
return ''
elif len(L) == 1:
return ''.join(L) # We need this because it raises TypeError.
elif len(L) == 2:
L.insert(1, And)
return ' '.join(L)
else:
L[-1] = '%s %s' % (And, L[-1])
sep = '%s ' % comma
return sep.join(L)
_unCommaTheRe = re.compile(r'(.*),\s*(the)$', re.I)
def unCommaThe(s):
"""Takes a string of the form 'foo, the' and turns it into 'the foo'."""
m = _unCommaTheRe.match(s)
if m is not None:
return '%s %s' % (m.group(2), m.group(1))
else:
return s
def ellipsisify(s, n):
"""Returns a shortened version of s. Produces up to the first n chars at
the nearest word boundary.
"""
if len(s) <= n:
return s
else:
return (textwrap.wrap(s, n-3)[0] + '...')
plurals = TwoWayDictionary({})
def matchCase(s1, s2):
"""Matches the case of s1 in s2"""
if s1.isupper():
return s2.upper()
else:
L = list(s2)
for (i, char) in enumerate(s1[:len(s2)]):
if char.isupper():
L[i] = L[i].upper()
return ''.join(L)
@internationalizeFunction('pluralize')
def pluralize(s):
"""Returns the plural of s. Put any exceptions to the general English
rule of appending 's' in the plurals dictionary.
"""
consonants = 'bcdfghjklmnpqrstvwxz'
_pluralizeRegex = re.compile('[%s]y$' % consonants)
lowered = s.lower()
# Exception dictionary
if lowered in plurals:
return matchCase(s, plurals[lowered])
# Words ending with 'ch', 'sh' or 'ss' such as 'punch(es)', 'fish(es)
# and miss(es)
elif any(lowered.endswith, ['x', 'ch', 'sh', 'ss']):
return matchCase(s, s+'es')
# Words ending with a consonant followed by a 'y' such as
# 'try (tries)' or 'spy (spies)'
elif _pluralizeRegex.search(lowered):
return matchCase(s, s[:-1] + 'ies')
# In all other cases, we simply add an 's' to the base word
else:
return matchCase(s, s+'s')
@internationalizeFunction('depluralize')
def depluralize(s):
"""Returns the singular of s."""
consonants = 'bcdfghjklmnpqrstvwxz'
_depluralizeRegex = re.compile('[%s]ies' % consonants)
lowered = s.lower()
if lowered in plurals:
return matchCase(s, plurals[lowered])
elif any(lowered.endswith, ['ches', 'shes', 'sses']):
return s[:-2]
elif re.search(_depluralizeRegex, lowered):
return s[:-3] + 'y'
else:
if lowered.endswith('s'):
return s[:-1] # Chop off 's'.
else:
return s # Don't know what to do.
def nItems(n, item, between=None):
"""Works like this:
>>> nItems(4, '<empty>')
'4'
>>> nItems(1, 'clock')
'1 clock'
>>> nItems(10, 'clock')
'10 clocks'
>>> nItems(4, '<empty>', between='grandfather')
'4 grandfather'
>>> nItems(10, 'clock', between='grandfather')
'10 grandfather clocks'
"""
assert isinstance(n, int) or isinstance(n, long), \
'The order of the arguments to nItems changed again, sorry.'
if item == '<empty>':
if between is None:
return format('%s', n)
else:
return format('%s %s', n, item)
if between is None:
if n != 1:
return format('%s %p', n, item)
else:
return format('%s %s', n, item)
else:
if n != 1:
return format('%s %s %p', n, between, item)
else:
return format('%s %s %s', n, between, item)
@internationalizeFunction('ordinal')
def ordinal(i):
"""Returns i + the ordinal indicator for the number.
Example: ordinal(3) => '3rd'
"""
i = int(i)
if i % 100 in (11,12,13):
return '%sth' % i
ord = 'th'
test = i % 10
if test == 1:
ord = 'st'
elif test == 2:
ord = 'nd'
elif test == 3:
ord = 'rd'
return '%s%s' % (i, ord)
@internationalizeFunction('be')
def be(i):
"""Returns the form of the verb 'to be' based on the number i."""
if i == 1:
return 'is'
else:
return 'are'
@internationalizeFunction('has')
def has(i):
"""Returns the form of the verb 'to have' based on the number i."""
if i == 1:
return 'has'
else:
return 'have'
def toBool(s):
s = s.strip().lower()
if s in ('true', 'on', 'enable', 'enabled', '1'):
return True
elif s in ('false', 'off', 'disable', 'disabled', '0'):
return False
else:
raise ValueError, 'Invalid string for toBool: %s' % quoted(s)
# When used with Supybot, this is overriden when supybot.conf is loaded
def timestamp(t):
if t is None:
t = time.time()
return time.ctime(t)
_formatRe = re.compile('%((?:\d+)?\.\d+f|[bfhiLnpqrsStuv%])')
def format(s, *args, **kwargs):
"""w00t.
%: literal %.
i: integer
s: string
f: float
r: repr
b: form of the verb 'to be' (takes an int)
h: form of the verb 'to have' (takes an int)
L: commaAndify (takes a list of strings or a tuple of ([strings], and))
p: pluralize (takes a string)
q: quoted (takes a string)
n: nItems (takes a 2-tuple of (n, item) or a 3-tuple of (n, between, item))
S: returns a human-readable size (takes an int)
t: time, formatted (takes an int)
u: url, wrapped in braces (this should be configurable at some point)
v: void : takes one or many arguments, but doesn't display it
(useful for translation)
"""
args = list(args)
args.reverse() # For more efficient popping.
def sub(match):
char = match.group(1)
if char == 's':
return str(args.pop())
elif char == 'i':
# XXX Improve me!
return str(args.pop())
elif char.endswith('f'):
return ('%'+char) % args.pop()
elif char == 'b':
return be(args.pop())
elif char == 'h':
return has(args.pop())
elif char == 'L':
t = args.pop()
if isinstance(t, list):
return commaAndify(t)
elif isinstance(t, tuple) and len(t) == 2:
if not isinstance(t[0], list):
raise ValueError, \
'Invalid list for %%L in format: %s' % t
if not isinstance(t[1], basestring):
raise ValueError, \
'Invalid string for %%L in format: %s' % t
return commaAndify(t[0], And=t[1])
else:
raise ValueError, 'Invalid value for %%L in format: %s' % t
elif char == 'p':
return pluralize(args.pop())
elif char == 'q':
return quoted(args.pop())
elif char == 'r':
return repr(args.pop())
elif char == 'n':
t = args.pop()
if not isinstance(t, (tuple, list)):
raise ValueError, 'Invalid value for %%n in format: %s' % t
if len(t) == 2:
return nItems(*t)
elif len(t) == 3:
return nItems(t[0], t[2], between=t[1])
else:
raise ValueError, 'Invalid value for %%n in format: %s' % t
elif char == 'S':
t = args.pop()
if not isinstance(t, (int, long)):
raise ValueError, 'Invalid value for %%S in format: %s' % t
for suffix in ['B','KB','MB','GB','TB']:
if t < 1024:
return "%i%s" % (t, suffix)
t /= 1024
elif char == 't':
return timestamp(args.pop())
elif char == 'u':
import supybot.conf as conf
return conf.supybot.reply.format.url() % args.pop()
elif char == 'v':
args.pop()
return ''
elif char == '%':
return '%'
else:
raise ValueError, 'Invalid char in sub (in format).'
try:
return _formatRe.sub(sub, s)
except IndexError:
raise ValueError, 'Extra format chars in format spec: %r' % s
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: