Limnoria/src/utils/str.py

518 lines
17 KiB
Python
Raw Normal View History

###
# Copyright (c) 2002-2005, Jeremiah Fincher
# Copyright (c) 2008-2009, James Vega
2011-01-22 10:28:27 +01:00
# Copyright (c) 2010, Valentin Lorentz
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions, and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the author of this software nor the name of
# contributors to this software may be used to endorse or promote products
# derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###
"""
Simple utility functions related to strings.
"""
import re
import new
2005-01-31 15:52:27 +01:00
import sys
import string
import textwrap
from iter import all, any
2005-03-12 19:01:47 +01:00
from structures import TwoWayDictionary
2011-01-22 10:28:27 +01:00
from supybot.i18n import PluginInternationalization
# Decorator factory applied below to the English-specific helpers
# (pluralize, depluralize, ordinal, be, has), each registered under a name.
# NOTE(review): presumably this lets a translation replace them -- confirm in
# supybot.i18n.
internationalizeFunction=PluginInternationalization().internationalizeFunction
# Alias used by perlReToReplacer to bind the replacement text as the first
# argument of a compiled pattern's sub().
curry = new.instancemethod
# Translation table built from two empty strings, i.e. one that maps every
# character to itself.  It is passed to str.translate() when only the
# "characters to delete" argument matters (see _notUpper and soundex).
chars = string.maketrans('', '')
def rsplit(s, sep=None, maxsplit=-1):
    """Equivalent to str.split, except splitting from the right.

    s is the string to split, sep the separator (None splits on runs of
    whitespace) and maxsplit the maximum number of splits to perform
    (-1 means no limit).  Returns the list of pieces in left-to-right order.
    """
    # The hand-written fallback for interpreters older than 2.4 could never
    # run: this module already relies on much newer syntax (conditional
    # expressions, dict comprehensions), so str.rsplit is always available.
    return s.rsplit(sep, maxsplit)
2011-01-22 10:28:27 +01:00
def normalizeWhitespace(s, removeNewline=True):
    """Normalizes the whitespace in a string.

    Tabs are turned into spaces and every run of spaces is collapsed into a
    single space.  When removeNewline is true (the default) newline
    characters are deleted outright (not replaced by a space); otherwise
    they are left untouched.

    s may be any object: anything that is not already a string is converted
    with str() first.  Returns the normalized string.
    """
    try:
        stringTypes = basestring  # Python 2: covers both str and unicode.
    except NameError:
        stringTypes = str  # Python 3.
    if not isinstance(s, stringTypes):
        s = str(s)
    if removeNewline:
        s = s.replace('\n', '')
    s = s.replace('\t', ' ')
    # Built with a multiplication so that the two-space literal cannot be
    # silently collapsed to one space by a whitespace-mangling tool; with a
    # single space the loop below would never terminate.
    doubleSpace = ' ' * 2
    while doubleSpace in s:
        s = s.replace(doubleSpace, ' ')
    return s
def distance(s, t):
    """Returns the levenshtein edit distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn s into t.
    """
    rows = len(s)
    cols = len(t)
    if rows == 0:
        return cols
    if cols == 0:
        return rows
    # table[i][j] holds the distance between s[:i] and t[:j].
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for i in range(rows + 1):
        table[i][0] = i
    for j in range(cols + 1):
        table[0][j] = j
    for i in range(1, rows + 1):
        left = s[i - 1]
        for j in range(1, cols + 1):
            substitution = table[i - 1][j - 1] + int(left != t[j - 1])
            deletion = table[i - 1][j] + 1
            insertion = table[i][j - 1] + 1
            table[i][j] = min(deletion, insertion, substitution)
    return table[rows][cols]
# Translation table mapping each uppercase ASCII letter (A..Z, in order) to
# its soundex digit; '0' marks the letters that soundex() drops afterwards.
_soundextrans = string.maketrans(string.ascii_uppercase,
                                 '01230120022455012623010202')
# Every character of the table `chars` except the uppercase ASCII letters;
# soundex() passes this as the set of characters to delete.
_notUpper = chars.translate(chars, string.ascii_uppercase)
def soundex(s, length=4):
    """Returns the soundex hash of a given string.

    The string is uppercased and stripped of everything that is not an
    ASCII letter.  The first letter is kept as-is; the remaining letters are
    turned into soundex digits, adjacent duplicates are collapsed and the
    '0' codes are dropped.

    length is the size of the returned hash: the code is padded with '0'
    and cut to exactly that many characters.  A false length (e.g. 0)
    returns the full, unpadded code instead.

    Raises ValueError if s contains no ASCII letter at all.
    """
    original = s  # Kept for the error message; s is rewritten below.
    s = s.upper()
    s = s.translate(chars, _notUpper)  # Delete non-letters.
    if not s:
        raise ValueError('Invalid string for soundex: %s' % (original,))
    firstChar = s[0]
    digits = s.translate(_soundextrans)  # Convert to soundex numbers.
    # Letters sharing the first letter's code right at the start are part
    # of that first letter and must not be emitted again.
    digits = digits.lstrip(digits[0])
    codes = [firstChar]
    for c in digits:
        if c != codes[-1]:
            codes.append(c)
    codes = [c for c in codes if c != '0']
    codes.extend('0' * (length - 1))  # Padding, in case the code is short.
    result = ''.join(codes)
    if length:
        return result[:length]
    return result.rstrip('0')
def dqrepr(s):
    """Returns a repr() of s guaranteed to be in double quotes."""
    # repr() prefers single quotes, so the escaping is done through the
    # interpreter's escape codec and the double quotes are added by hand.
    if sys.version_info[0] < 3:
        codec = 'string_escape'
    else:
        codec = 'unicode_escape'
    escaped = s.encode(codec).decode()
    return '"%s"' % escaped.replace('"', '\\"')
def quoted(s):
    """Returns s wrapped in double quotes (no escaping is performed)."""
    wrapped = '"%s"' % s
    return wrapped
_openers = '{[(<'
_closers = '}])>'
def _getSep(s, allowBraces=False):
if len(s) < 2:
raise ValueError, 'string given to _getSep is too short: %r' % s
if allowBraces:
braces = _closers
else:
braces = _openers + _closers
if s.startswith('m') or s.startswith('s'):
separator = s[1]
else:
separator = s[0]
if separator.isalnum() or separator in braces:
raise ValueError, \
'Invalid separator: separator must not be alphanumeric or in ' \
'"%s"' % braces
return separator
def perlReToPythonRe(s):
    """Converts a string representation of a Perl regular expression (i.e.,
    m/^foo$/i or /foo|bar/) to a Python regular expression.

    The delimiter may be any character accepted by _getSep(s, True); an
    opening bracket is paired with its matching closing bracket.  Whatever
    follows the closing delimiter is read as flags: each letter is
    uppercased and looked up as an attribute of the re module.

    Returns the compiled pattern.  Raises ValueError if s does not have the
    expected shape, names an unknown flag, or holds an invalid expression.
    """
    sep = _getSep(s, True)
    if sep in _openers:
        closing = _closers[_openers.index(sep)]
    else:
        closing = sep
    opener = re.escape(sep)
    closer = re.escape(closing)
    pattern = r'm?%s((?:\\.|[^\\])*)%s(.*)' % (opener, closer)
    match = re.compile(pattern).match(s)
    if match is None:
        raise ValueError('Must be of the form m/.../ or /.../')
    (regexp, flags) = match.groups()
    # Note that opener/closer are the re.escape()d delimiters here.
    regexp = regexp.replace('\\' + opener, opener)
    if opener != closer:
        regexp = regexp.replace('\\' + closer, closer)
    flag = 0
    for c in flags.upper():
        try:
            flag |= getattr(re, c)
        except AttributeError:
            raise ValueError('Invalid flag: %s' % c)
    try:
        return re.compile(regexp, flag)
    except re.error as e:
        raise ValueError(str(e))
def perlReToReplacer(s):
    """Converts a string representation of a Perl regular expression (i.e.,
    s/foo/bar/g or s/foo/bar/i) to a Python function doing the equivalent
    replacement.

    The returned callable takes the string to work on and returns the
    result.  With the 'g' flag every match is replaced, otherwise only the
    first one.  The remaining flags are handled by perlReToPythonRe.

    Raises ValueError if s is not of the form s/.../.../.
    """
    sep = _getSep(s)
    escapedSep = re.escape(sep)
    pattern = (r's%s((?:\\.|[^\\])*)%s((?:\\.|[^\\])*)%s(.*)'
               % (escapedSep, escapedSep, escapedSep))
    match = re.compile(pattern).match(s)
    if match is None:
        raise ValueError('Must be of the form s/.../.../')
    (regexp, replace, flags) = match.groups()
    # A literal backspace character is turned back into the regexp escape
    # \b; presumably it got there because "\b" was interpreted as a string
    # escape before reaching this function -- TODO confirm.
    regexp = regexp.replace('\x08', r'\b')
    replace = replace.replace('\\' + sep, sep)
    # Likewise, control characters 0..9 become the group references \0..\9.
    for digit in range(10):
        replace = replace.replace(chr(digit), r'\%s' % digit)
    replaceAll = 'g' in flags
    if replaceAll:
        # 'g' is not a flag of the re module, so it is stripped first.
        flags = flags.replace('g', '')
    r = perlReToPythonRe(sep.join(('', regexp, flags)))
    if replaceAll:
        return curry(r.sub, replace)
    return lambda s: r.sub(replace, s, 1)
# Matches ${name} (group 1) or $name (group 2).
_perlVarSubstituteRe = re.compile(r'\$\{([^}]+)\}|\$([a-zA-Z][a-zA-Z0-9]*)')


def perlVariableSubstitute(vars, text):
    """Replaces the $name and ${name} references found in text.

    vars maps variable names to their values.  A callable value is called
    without arguments and its result is used; any other value is passed
    through str().  References whose lookup raises KeyError are left in the
    text exactly as written.  Returns the substituted text.
    """
    def expand(match):
        name = match.group(1) or match.group(2)
        try:
            value = vars[name]
            return value() if callable(value) else str(value)
        except KeyError:
            # Unknown variable: put back the reference as it was written.
            return match.group(0)
    return _perlVarSubstituteRe.sub(expand, text)
class MultipleReplacer:
    """Return a callable that replaces all dict keys by the associated
    value. More efficient than multiple .replace().

    The text is scanned once from left to right; at each position the
    longest key that matches is replaced, so a key that is a prefix of
    another one never hides it.  An empty dict gives a callable that
    returns its argument unchanged.
    """
    # We use an object instead of a lambda function because it avoids the
    # need for using the staticmethod() on the lambda function if assigning
    # it to a class in Python 3.
    def __init__(self, dict_):
        self._dict = dict_
        # The regexp engine takes the first alternative that matches, so
        # the longest keys have to come first.
        keys = sorted(dict_, key=len, reverse=True)
        if keys:
            pattern = '|'.join(re.escape(key) for key in keys)
        else:
            # An empty pattern would match everywhere and look up the key
            # '' in the dict; an empty negative lookahead never matches.
            pattern = '(?!)'
        self._matcher = re.compile(pattern)

    def __call__(self, s):
        """Returns s with every key replaced by its associated value."""
        return self._matcher.sub(lambda m: self._dict[m.group(0)], s)
def multipleReplacer(dict_):
    """Returns a MultipleReplacer built from dict_ (function-style alias)."""
    replacer = MultipleReplacer(dict_)
    return replacer
class MultipleRemover:
    """Return a callable that removes all words in the list. A bit more
    efficient than multipleReplacer.

    At each position the longest word that matches is removed, so a word
    that is a prefix of another one never leaves the tail of the longer
    word behind.
    """
    # An object is used instead of a lambda function because it avoids the
    # need for staticmethod() when assigning it to a class in Python 3.
    def __init__(self, list_):
        # The regexp engine takes the first alternative that matches, so
        # the longest words have to come first.
        words = sorted(list_, key=len, reverse=True)
        self._matcher = re.compile('|'.join(re.escape(x) for x in words))

    def __call__(self, s):
        """Returns s with every occurrence of the words removed."""
        return self._matcher.sub('', s)
2012-08-03 09:46:00 +02:00
def commaAndify(seq, comma=',', And='and'):
    """Given a sequence, returns an English clause for that sequence.

    I.e., given ['1', '2', '3'], returns '1, 2, and 3'.  comma is the
    separator placed between items and And the conjunction placed before
    the last one.  An empty sequence gives '', a single item is returned
    alone and two items are joined by the conjunction only.
    """
    items = list(seq)
    count = len(items)
    if count == 0:
        return ''
    if count == 1:
        # Joining (rather than returning items[0]) makes a non-string item
        # raise TypeError, exactly like the longer cases do.
        return ''.join(items)
    if count == 2:
        return ' '.join([items[0], And, items[1]])
    items[-1] = '%s %s' % (And, items[-1])
    separator = '%s ' % comma
    return separator.join(items)
# Matches "<anything>, the" (case-insensitively) at the end of the string.
_unCommaTheRe = re.compile(r'(.*),\s*(the)$', re.I)


def unCommaThe(s):
    """Takes a string of the form 'foo, the' and turns it into 'the foo'.

    The article keeps the case it had in s.  Strings that do not end in
    ', the' are returned unchanged.
    """
    match = _unCommaTheRe.match(s)
    if match is None:
        return s
    (rest, article) = match.groups()
    return '%s %s' % (article, rest)
def ellipsisify(s, n):
    """Returns a shortened version of s.  Produces up to the first n chars at
    the nearest word boundary.

    If s fits in n characters it is returned unchanged.  Otherwise the
    result is the first line of s wrapped at n-3 characters, followed by
    '...'.  If nothing can be kept (s is only whitespace, or n leaves no
    room for text) the result is the ellipsis alone, cut down to n
    characters if needed.
    """
    if len(s) <= n:
        return s
    ellipsis = '...'
    width = n - len(ellipsis)
    if width < 1:
        # textwrap refuses a non-positive width; there is no room for any
        # text anyway.
        return ellipsis[:max(n, 0)]
    lines = textwrap.wrap(s, width)
    if not lines:
        # Whitespace-only input wraps to an empty list.
        return ellipsis
    return lines[0] + ellipsis
2005-03-12 19:01:47 +01:00
# Exceptions to the regular suffix rules: pluralize() and depluralize() both
# look the lowercased word up here before applying any rule.
# NOTE(review): presumably TwoWayDictionary maps in both directions, so that
# a single entry serves both functions -- confirm in structures.py.
plurals = TwoWayDictionary({})
def matchCase(s1, s2):
    """Matches the case of s1 in s2.

    If s1 is entirely uppercase, s2 is returned uppercased.  Otherwise each
    character of s2 is uppercased wherever the character at the same
    position in s1 is uppercase; the rest of s2 is left as it is.
    """
    if s1.isupper():
        return s2.upper()
    # zip() stops at the shorter string; whatever s2 has beyond the end of
    # s1 is appended untouched.
    head = [mine.upper() if model.isupper() else mine
            for (model, mine) in zip(s1, s2)]
    return ''.join(head) + s2[len(s1):]
2011-01-22 10:28:27 +01:00
@internationalizeFunction('pluralize')
def pluralize(s):
    """Returns the plural of s.  Put any exceptions to the general English
    rule of appending 's' in the plurals dictionary.

    The case of s is carried over to the result with matchCase.
    """
    consonants = 'bcdfghjklmnpqrstvwxz'
    lowered = s.lower()
    if lowered in plurals:
        # Exception dictionary.
        plural = plurals[lowered]
    elif lowered.endswith(('x', 'ch', 'sh', 'ss')):
        # 'box(es)', 'punch(es)', 'fish(es)' and 'miss(es)'.
        plural = s + 'es'
    elif re.search('[%s]y$' % consonants, lowered):
        # A consonant followed by a 'y': 'try (tries)', 'spy (spies)'.
        plural = s[:-1] + 'ies'
    else:
        # In all other cases, simply add an 's' to the base word.
        plural = s + 's'
    return matchCase(s, plural)
2011-01-22 10:28:27 +01:00
@internationalizeFunction('depluralize')
def depluralize(s):
    """Returns the singular of s.

    Words found in the plurals dictionary use the form recorded there.
    Otherwise a final 'ches', 'shes' or 'sses' loses its 'es', a final
    consonant + 'ies' becomes consonant + 'y', and any other final 's' is
    chopped off.  Words that do not end in 's' are returned unchanged.
    """
    consonants = 'bcdfghjklmnpqrstvwxz'
    lowered = s.lower()
    if lowered in plurals:
        return matchCase(s, plurals[lowered])
    if lowered.endswith(('ches', 'shes', 'sses')):
        return s[:-2]
    # The pattern is anchored at the end of the word: an 'ies' in the
    # middle (as in 'diesels') says nothing about the ending.
    if re.search('[%s]ies$' % consonants, lowered):
        # matchCase keeps an all-uppercase word uppercase ('FLIES' ->
        # 'FLY'), as pluralize does in the other direction.
        return matchCase(s, s[:-3] + 'y')
    if lowered.endswith('s'):
        return s[:-1]  # Chop off 's'.
    return s  # Don't know what to do.
def nItems(n, item, between=None):
    """Returns n followed by item, pluralized unless n is 1.

    between, if given, is inserted between the number and the item.  The
    special item '<empty>' means "no item": only the number (and between,
    if given) is returned.

    Works like this:

    >>> nItems(4, '<empty>')
    '4'
    >>> nItems(1, 'clock')
    '1 clock'
    >>> nItems(10, 'clock')
    '10 clocks'

    >>> nItems(4, '<empty>', between='grandfather')
    '4 grandfather'
    >>> nItems(10, 'clock', between='grandfather')
    '10 grandfather clocks'
    """
    try:
        integerTypes = (int, long)  # Python 2.
    except NameError:
        integerTypes = (int,)  # Python 3 has a single integer type.
    assert isinstance(n, integerTypes), \
           'The order of the arguments to nItems changed again, sorry.'
    if item == '<empty>':
        if between is None:
            return format('%s', n)
        # The placeholder itself must never be shown: only the number and
        # the word in between are.
        return format('%s %s', n, between)
    # %p pluralizes its argument, %s leaves it alone.
    itemSpec = '%s' if n == 1 else '%p'
    if between is None:
        return format('%s ' + itemSpec, n, item)
    return format('%s %s ' + itemSpec, n, between, item)
2011-01-22 10:28:27 +01:00
@internationalizeFunction('ordinal')
def ordinal(i):
    """Returns i + the ordinal indicator for the number.

    Example: ordinal(3) => '3rd'

    i is converted with int() first, so numeric strings are accepted.
    """
    number = int(i)
    if number % 100 in (11, 12, 13):
        # 11th, 12th, 13th (and 111th, ...) do not follow the last digit.
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')
    return '%s%s' % (number, suffix)
2005-05-08 03:46:35 +02:00
2011-01-22 10:28:27 +01:00
@internationalizeFunction('be')
def be(i):
    """Returns the form of the verb 'to be' based on the number i:
    'is' when i equals 1, 'are' otherwise."""
    return 'is' if i == 1 else 'are'
2011-01-22 10:28:27 +01:00
@internationalizeFunction('has')
def has(i):
    """Returns the form of the verb 'to have' based on the number i:
    'has' when i equals 1, 'have' otherwise."""
    return 'has' if i == 1 else 'have'
def toBool(s):
    """Converts a string to a boolean.

    Surrounding whitespace and case are ignored.  'true', 'on', 'enable',
    'enabled' and '1' give True; 'false', 'off', 'disable', 'disabled' and
    '0' give False.  Anything else raises ValueError.
    """
    normalized = s.strip().lower()
    if normalized in ('true', 'on', 'enable', 'enabled', '1'):
        return True
    if normalized in ('false', 'off', 'disable', 'disabled', '0'):
        return False
    raise ValueError('Invalid string for toBool: %s' % quoted(normalized))
# When used with Supybot, this is overridden when supybot.conf is loaded.
def timestamp(t):
    """Returns t formatted by time.ctime().

    t is a number of seconds since the epoch; None means the current time.
    """
    # Imported here because this module does not import time at the top,
    # which made this default implementation fail with a NameError.
    import time
    if t is None:
        t = time.time()
    return time.ctime(t)
2011-01-22 10:28:27 +01:00
# A format directive: '%' followed either by a float spec with a precision
# (e.g. '.2f', '5.1f') or by one of the single-letter codes format() knows.
_formatRe = re.compile(r'%((?:\d+)?\.\d+f|[bfhiLnpqrsStuv%])')


def _formatSize(size):
    """Renders a number of bytes with the largest unit (B to TB) that keeps
    the value below 1024; TB is used for everything beyond that."""
    for suffix in ('B', 'KB', 'MB', 'GB'):
        if size < 1024:
            return '%i%s' % (size, suffix)
        size //= 1024
    return '%i%s' % (size, 'TB')


def format(s, *args, **kwargs):
    """Expands the % directives of s using the positional arguments.

    Each directive consumes one argument, in order (except %%, which
    consumes none).  Arguments left over once every directive has been
    expanded are ignored; keyword arguments are accepted but unused.

    %: literal %.
    i: integer
    s: string
    f: float (a width and precision may be given, as in %5.2f)
    r: repr
    b: form of the verb 'to be' (takes an int)
    h: form of the verb 'to have' (takes an int)
    L: commaAndify (takes a list of strings or a tuple of ([strings], and))
    p: pluralize (takes a string)
    q: quoted (takes a string)
    n: nItems (takes a 2-tuple of (n, item) or a 3-tuple of (n, between, item))
    S: returns a human-readable size (takes an int)
    t: time, formatted (takes an int)
    u: url, wrapped in braces (this should be configurable at some point)
    v: void: consumes one argument without displaying it
       (useful for translation)

    Raises ValueError if s has more directives than there are arguments, or
    if an argument does not have the shape its directive requires.
    """
    try:
        integerTypes = (int, long)  # Python 2.
        stringTypes = basestring
    except NameError:
        integerTypes = (int,)  # Python 3.
        stringTypes = str
    args = list(args)
    args.reverse()  # For more efficient popping.

    def sub(match):
        char = match.group(1)
        if char == 's':
            return str(args.pop())
        elif char == 'i':
            # XXX Improve me!
            return str(args.pop())
        elif char.endswith('f'):
            return ('%' + char) % args.pop()
        elif char == 'b':
            return be(args.pop())
        elif char == 'h':
            return has(args.pop())
        elif char == 'L':
            t = args.pop()
            if isinstance(t, list):
                return commaAndify(t)
            elif isinstance(t, tuple) and len(t) == 2:
                # The value goes in a 1-tuple in every message below: with
                # "% t" a tuple would be unpacked as several arguments and
                # raise TypeError instead of the intended ValueError.
                if not isinstance(t[0], list):
                    raise ValueError(
                        'Invalid list for %%L in format: %s' % (t,))
                if not isinstance(t[1], stringTypes):
                    raise ValueError(
                        'Invalid string for %%L in format: %s' % (t,))
                return commaAndify(t[0], And=t[1])
            else:
                raise ValueError('Invalid value for %%L in format: %s' % (t,))
        elif char == 'p':
            return pluralize(args.pop())
        elif char == 'q':
            return quoted(args.pop())
        elif char == 'r':
            return repr(args.pop())
        elif char == 'n':
            t = args.pop()
            if not isinstance(t, (tuple, list)):
                raise ValueError('Invalid value for %%n in format: %s' % (t,))
            if len(t) == 2:
                return nItems(*t)
            elif len(t) == 3:
                return nItems(t[0], t[2], between=t[1])
            else:
                raise ValueError('Invalid value for %%n in format: %s' % (t,))
        elif char == 'S':
            t = args.pop()
            if not isinstance(t, integerTypes):
                raise ValueError('Invalid value for %%S in format: %s' % (t,))
            return _formatSize(t)
        elif char == 't':
            return timestamp(args.pop())
        elif char == 'u':
            import supybot.conf as conf
            return conf.supybot.reply.format.url() % args.pop()
        elif char == 'v':
            args.pop()
            return ''
        elif char == '%':
            return '%'
        else:
            raise ValueError('Invalid char in sub (in format).')
    try:
        return _formatRe.sub(sub, s)
    except IndexError:
        # args.pop() ran out of arguments.
        raise ValueError('Extra format chars in format spec: %r' % s)
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: