### # Copyright (c) 2002-2005, Jeremiah Fincher # Copyright (c) 2008-2009, James McCoy # Copyright (c) 2010, Valentin Lorentz # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions, and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions, and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of the author of this software nor the name of # contributors to this software may be used to endorse or promote products # derived from this software without specific prior written consent. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. ### """ Simple utility functions related to strings. """ import re import sys import string import textwrap from iter import all, any from structures import TwoWayDictionary from supybot.i18n import PluginInternationalization _ = PluginInternationalization() internationalizeFunction = _.internationalizeFunction def rsplit(s, sep=None, maxsplit=-1): """Equivalent to str.split, except splitting from the right.""" return s.rsplit(sep, maxsplit) def normalizeWhitespace(s, removeNewline=True): """Normalizes the whitespace in a string; \s+ becomes one space.""" if not s: return str(s) # not the same reference starts_with_space = (s[0] in ' \n\t') ends_with_space = (s[-1] in ' \n\t') if removeNewline: s = ' '.join(filter(bool, s.split('\n'))) s = ' '.join(filter(bool, s.split('\t'))) s = ' '.join(filter(bool, s.split(' '))) if starts_with_space: s = ' ' + s if ends_with_space: s += ' ' return s def distance(s, t): """Returns the levenshtein edit distance between two strings.""" n = len(s) m = len(t) if n == 0: return m elif m == 0: return n d = [] for i in xrange(n+1): d.append([]) for j in xrange(m+1): d[i].append(0) d[0][j] = j d[i][0] = i for i in xrange(1, n+1): cs = s[i-1] for j in xrange(1, m+1): ct = t[j-1] cost = int(cs != ct) d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost) return d[n][m] class MultipleReplacer: """Return a callable that replaces all dict keys by the associated value. More efficient than multiple .replace().""" # We use an object instead of a lambda function because it avoids the # need for using the staticmethod() on the lambda function if assigning # it to a class in Python 3. def __init__(self, dict_): self._dict = dict_ dict_ = dict([(re.escape(key), val) for key,val in dict_.items()]) self._matcher = re.compile('|'.join(dict_.keys())) def __call__(self, s): return self._matcher.sub(lambda m: self._dict[m.group(0)], s) def multipleReplacer(dict_): return MultipleReplacer(dict_) class MultipleRemover: """Return a callable that removes all words in the list. A bit more efficient than multipleReplacer""" # See comment of MultipleReplacer def __init__(self, list_): list_ = [re.escape(x) for x in list_] self._matcher = re.compile('|'.join(list_)) def __call__(self, s): return self._matcher.sub(lambda m: '', s) _soundextrans = MultipleReplacer(dict(zip(string.ascii_uppercase, '01230120022455012623010202'))) def soundex(s, length=4): """Returns the soundex hash of a given string.""" s = s.upper() # Make everything uppercase. s = ''.join([x for x in s if x in string.ascii_uppercase]) if not s: raise ValueError, 'Invalid string for soundex: %s' firstChar = s[0] # Save the first character. s = _soundextrans(s) # Convert to soundex numbers. s = s.lstrip(s[0]) # Remove all repeated first characters. L = [firstChar] for c in s: if c != L[-1]: L.append(c) L = [c for c in L if c != '0'] + (['0']*(length-1)) s = ''.join(L) return length and s[:length] or s.rstrip('0') def dqrepr(s): """Returns a repr() of s guaranteed to be in double quotes.""" # The wankers-that-be decided not to use double-quotes anymore in 2.3. # return '"' + repr("'\x00" + s)[6:] encoding = 'string_escape' if sys.version_info[0] < 3 else 'unicode_escape' if sys.version_info[0] < 3 and isinstance(s, unicode): s = s.encode('utf8', 'replace') return '"%s"' % s.encode(encoding).decode().replace('"', '\\"') def quoted(s): """Returns a quoted s.""" return '"%s"' % s _openers = '{[(<' _closers = '}])>' def _getSep(s, allowBraces=False): if len(s) < 2: raise ValueError, 'string given to _getSep is too short: %r' % s if allowBraces: braces = _closers else: braces = _openers + _closers if s.startswith('m') or s.startswith('s'): separator = s[1] else: separator = s[0] if separator.isalnum() or separator in braces: raise ValueError, \ 'Invalid separator: separator must not be alphanumeric or in ' \ '"%s"' % braces return separator def perlReToPythonRe(s): """Converts a string representation of a Perl regular expression (i.e., m/^foo$/i or /foo|bar/) to a Python regular expression. """ opener = closer = _getSep(s, True) if opener in '{[(<': closer = _closers[_openers.index(opener)] opener = re.escape(opener) closer = re.escape(closer) matcher = re.compile(r'm?%s((?:\\.|[^\\])*)%s(.*)' % (opener, closer)) try: (regexp, flags) = matcher.match(s).groups() except AttributeError: # Unpack list of wrong size. raise ValueError, 'Must be of the form m/.../ or /.../' regexp = regexp.replace('\\'+opener, opener) if opener != closer: regexp = regexp.replace('\\'+closer, closer) flag = 0 try: for c in flags.upper(): flag |= getattr(re, c) except AttributeError: raise ValueError, 'Invalid flag: %s' % c try: return re.compile(regexp, flag) except re.error, e: raise ValueError, str(e) def perlReToReplacer(s): """Converts a string representation of a Perl regular expression (i.e., s/foo/bar/g or s/foo/bar/i) to a Python function doing the equivalent replacement. """ sep = _getSep(s) escaped = re.escape(sep) matcher = re.compile(r's%s((?:\\.|[^\\])*)%s((?:\\.|[^\\])*)%s(.*)' % (escaped, escaped, escaped)) try: (regexp, replace, flags) = matcher.match(s).groups() except AttributeError: # Unpack list of wrong size. raise ValueError, 'Must be of the form s/.../.../' regexp = regexp.replace('\x08', r'\b') replace = replace.replace('\\'+sep, sep) for i in xrange(10): replace = replace.replace(chr(i), r'\%s' % i) g = False if 'g' in flags: g = True flags = filter('g'.__ne__, flags) if isinstance(flags, list): flags = ''.join(flags) r = perlReToPythonRe(sep.join(('', regexp, flags))) if g: return lambda s: r.sub(replace, s) else: return lambda s: r.sub(replace, s, 1) _perlVarSubstituteRe = re.compile(r'\$\{([^}]+)\}|\$([a-zA-Z][a-zA-Z0-9]*)') def perlVariableSubstitute(vars, text): def replacer(m): (braced, unbraced) = m.groups() var = braced or unbraced try: x = vars[var] if callable(x): return x() else: return str(x) except KeyError: if braced: return '${%s}' % braced else: return '$' + unbraced return _perlVarSubstituteRe.sub(replacer, text) def commaAndify(seq, comma=',', And=None): """Given a a sequence, returns an English clause for that sequence. I.e., given [1, 2, 3], returns '1, 2, and 3' """ if And is None: And = _('and') L = list(seq) if len(L) == 0: return '' elif len(L) == 1: return ''.join(L) # We need this because it raises TypeError. elif len(L) == 2: L.insert(1, And) return ' '.join(L) else: L[-1] = '%s %s' % (And, L[-1]) sep = '%s ' % comma return sep.join(L) _unCommaTheRe = re.compile(r'(.*),\s*(the)$', re.I) def unCommaThe(s): """Takes a string of the form 'foo, the' and turns it into 'the foo'.""" m = _unCommaTheRe.match(s) if m is not None: return '%s %s' % (m.group(2), m.group(1)) else: return s def ellipsisify(s, n): """Returns a shortened version of s. Produces up to the first n chars at the nearest word boundary. """ if len(s) <= n: return s else: return (textwrap.wrap(s, n-3)[0] + '...') plurals = TwoWayDictionary({}) def matchCase(s1, s2): """Matches the case of s1 in s2""" if s1.isupper(): return s2.upper() else: L = list(s2) for (i, char) in enumerate(s1[:len(s2)]): if char.isupper(): L[i] = L[i].upper() return ''.join(L) @internationalizeFunction('pluralize') def pluralize(s): """Returns the plural of s. Put any exceptions to the general English rule of appending 's' in the plurals dictionary. """ consonants = 'bcdfghjklmnpqrstvwxz' _pluralizeRegex = re.compile('[%s]y$' % consonants) lowered = s.lower() # Exception dictionary if lowered in plurals: return matchCase(s, plurals[lowered]) # Words ending with 'ch', 'sh' or 'ss' such as 'punch(es)', 'fish(es) # and miss(es) elif any(lowered.endswith, ['x', 'ch', 'sh', 'ss']): return matchCase(s, s+'es') # Words ending with a consonant followed by a 'y' such as # 'try (tries)' or 'spy (spies)' elif _pluralizeRegex.search(lowered): return matchCase(s, s[:-1] + 'ies') # In all other cases, we simply add an 's' to the base word else: return matchCase(s, s+'s') @internationalizeFunction('depluralize') def depluralize(s): """Returns the singular of s.""" consonants = 'bcdfghjklmnpqrstvwxz' _depluralizeRegex = re.compile('[%s]ies' % consonants) lowered = s.lower() if lowered in plurals: return matchCase(s, plurals[lowered]) elif any(lowered.endswith, ['ches', 'shes', 'sses']): return s[:-2] elif re.search(_depluralizeRegex, lowered): return s[:-3] + 'y' else: if lowered.endswith('s'): return s[:-1] # Chop off 's'. else: return s # Don't know what to do. def nItems(n, item, between=None): """Works like this: >>> nItems(4, '') '4' >>> nItems(1, 'clock') '1 clock' >>> nItems(10, 'clock') '10 clocks' >>> nItems(4, '', between='grandfather') '4 grandfather' >>> nItems(10, 'clock', between='grandfather') '10 grandfather clocks' """ assert isinstance(n, int) or isinstance(n, long), \ 'The order of the arguments to nItems changed again, sorry.' if item == '': if between is None: return format('%s', n) else: return format('%s %s', n, item) if between is None: if n != 1: return format('%s %p', n, item) else: return format('%s %s', n, item) else: if n != 1: return format('%s %s %p', n, between, item) else: return format('%s %s %s', n, between, item) @internationalizeFunction('ordinal') def ordinal(i): """Returns i + the ordinal indicator for the number. Example: ordinal(3) => '3rd' """ i = int(i) if i % 100 in (11,12,13): return '%sth' % i ord = 'th' test = i % 10 if test == 1: ord = 'st' elif test == 2: ord = 'nd' elif test == 3: ord = 'rd' return '%s%s' % (i, ord) @internationalizeFunction('be') def be(i): """Returns the form of the verb 'to be' based on the number i.""" if i == 1: return 'is' else: return 'are' @internationalizeFunction('has') def has(i): """Returns the form of the verb 'to have' based on the number i.""" if i == 1: return 'has' else: return 'have' def toBool(s): s = s.strip().lower() if s in ('true', 'on', 'enable', 'enabled', '1'): return True elif s in ('false', 'off', 'disable', 'disabled', '0'): return False else: raise ValueError, 'Invalid string for toBool: %s' % quoted(s) # When used with Supybot, this is overriden when supybot.conf is loaded def timestamp(t): if t is None: t = time.time() return time.ctime(t) _formatRe = re.compile('%((?:\d+)?\.\d+f|[bfhiLnpqrsStTuv%])') def format(s, *args, **kwargs): """w00t. %: literal %. i: integer s: string f: float r: repr b: form of the verb 'to be' (takes an int) h: form of the verb 'to have' (takes an int) L: commaAndify (takes a list of strings or a tuple of ([strings], and)) p: pluralize (takes a string) q: quoted (takes a string) n: nItems (takes a 2-tuple of (n, item) or a 3-tuple of (n, between, item)) S: returns a human-readable size (takes an int) t: time, formatted (takes an int) T: time delta, formatted (takes an int) u: url, wrapped in braces (this should be configurable at some point) v: void : takes one or many arguments, but doesn't display it (useful for translation) """ # Note to developers: If you want to add an argument type, do not forget # to add the character to the _formatRe regexp or it will be ignored # (and hard to debug if you don't know the trick). # Of course, you should also document it in the docstring above. args = list(args) args.reverse() # For more efficient popping. def sub(match): char = match.group(1) if char == 's': token = args.pop() if isinstance(token, str): return token elif sys.version_info[0] < 3 and isinstance(token, unicode): return token.encode('utf8') else: return str(token) elif char == 'i': # XXX Improve me! return str(args.pop()) elif char.endswith('f'): return ('%'+char) % args.pop() elif char == 'b': return be(args.pop()) elif char == 'h': return has(args.pop()) elif char == 'L': t = args.pop() if isinstance(t, list) or (sys.version_info[0] >= 3 and (isinstance(t, map) or isinstance(t, filter))): return commaAndify(t) elif isinstance(t, tuple) and len(t) == 2: if not isinstance(t[0], list): raise ValueError, \ 'Invalid list for %%L in format: %s' % t if not isinstance(t[1], basestring): raise ValueError, \ 'Invalid string for %%L in format: %s' % t return commaAndify(t[0], And=t[1]) else: raise ValueError, 'Invalid value for %%L in format: %s' % t elif char == 'p': return pluralize(args.pop()) elif char == 'q': return quoted(args.pop()) elif char == 'r': return repr(args.pop()) elif char == 'n': t = args.pop() if not isinstance(t, (tuple, list)): raise ValueError, 'Invalid value for %%n in format: %s' % t if len(t) == 2: return nItems(*t) elif len(t) == 3: return nItems(t[0], t[2], between=t[1]) else: raise ValueError, 'Invalid value for %%n in format: %s' % t elif char == 'S': t = args.pop() if not isinstance(t, (int, long)): raise ValueError, 'Invalid value for %%S in format: %s' % t for suffix in ['B','KB','MB','GB','TB']: if t < 1024: return "%i%s" % (t, suffix) t /= 1024 elif char == 't': return timestamp(args.pop()) elif char == 'T': from gen import timeElapsed return timeElapsed(args.pop()) elif char == 'u': import supybot.conf as conf url = args.pop() if url: return conf.supybot.reply.format.url() % url else: return '' elif char == 'v': args.pop() return '' elif char == '%': return '%' else: raise ValueError, 'Invalid char in sub (in format).' try: return _formatRe.sub(sub, s) except IndexError: raise ValueError, 'Extra format chars in format spec: %r' % s # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: