Limnoria/src/utils.py

#!/usr/bin/env python

###
# Copyright (c) 2002, Jeremiah Fincher
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions, and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions, and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of the author of this software nor the name of
#     contributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###

"""
Simple utility functions.
"""

## from __future__ import generators

from fix import *

import re
import string
import sgmllib
import htmlentitydefs

class HtmlToText(sgmllib.SGMLParser):
    """Taken from some eff-bot code on c.l.p."""
    entitydefs = htmlentitydefs.entitydefs
    def __init__(self, tagReplace=' '):
        self.data = []
        self.tagReplace = tagReplace
        sgmllib.SGMLParser.__init__(self)

    def unknown_starttag(self, tag, attrib):
        self.data.append(self.tagReplace)

    def unknown_endtag(self, tag):
        self.data.append(self.tagReplace)

    def handle_data(self, data):
        self.data.append(data)

    def getText(self):
        text = ''.join(self.data).strip()
        return ' '.join(text.split()) # normalize whitespace

def htmlToText(s, tagReplace=' '):
    """Turns HTML into text.

    s is the HTML to convert; tagReplace is the string each HTML tag is
    replaced with.  Returns the text with runs of whitespace collapsed.
    """
    x = HtmlToText(tagReplace)
    x.feed(s)
    # feed() keeps an incomplete trailing construct (such as a final entity
    # without its terminator, or an unclosed tag) buffered in the parser;
    # close() forces it to be processed so that text is not silently lost.
    x.close()
    return x.getText()

def eachSubstring(s):
    """Yields every prefix of s, from its first element alone up to s itself.
    """
    end = 0
    total = len(s)
    while end < total:
        end += 1
        yield s[:end]

def abbrev(strings):
    """Returns a dictionary mapping unambiguous abbreviations to full forms.

    An abbreviation is any non-empty prefix of one of the given strings.  A
    prefix shared by several different strings is ambiguous and left out,
    unless it is itself one of the given strings, in which case it maps to
    itself.  The result does not depend on the order of <strings>, and
    repeated strings are treated as a single one.
    """
    # A dict gives constant-time membership tests and drops duplicates.
    fullForms = dict.fromkeys(strings)
    d = {}
    for s in fullForms:
        for i in range(1, len(s) + 1):
            abbreviation = s[:i]
            if abbreviation in fullForms:
                # A full form always wins over longer strings it prefixes.
                d[abbreviation] = abbreviation
            elif abbreviation in d:
                # Already claimed by another string: mark it as ambiguous.
                d[abbreviation] = None
            else:
                d[abbreviation] = s
    return dict([(k, v) for (k, v) in d.items() if v is not None])

def _formatTimeUnit(count, name):
    """Returns '<count> <name>', pluralizing <name> unless <count> is 1."""
    if count == 1:
        return '1 %s' % name
    return '%s %ss' % (count, name)

def timeElapsed(elapsed, leadingZeroes=False, years=True, weeks=True,
                days=True, hours=True, minutes=True, seconds=True):
    """Given <elapsed> seconds, returns a string with an English description of
    how much time has passed.  leadingZeroes determines whether 0 days, 0 hours,
    etc. will be printed; the others determine what time periods should be
    used.  Time that would have gone to a disabled period is carried over to
    the next smaller enabled one.

    Once a non-zero period has been printed, every smaller enabled period is
    printed as well, even if it is zero; seconds, when enabled, are always
    printed.  Raises ValueError if nothing at all would be printed.
    """
    elapsed = int(elapsed)
    assert years or weeks or days or \
           hours or minutes or seconds, 'One flag must be True'
    # (enabled, length in seconds, singular name), largest period first.
    periods = [(years, 31536000, 'year'),
               (weeks, 604800, 'week'),
               (days, 86400, 'day'),
               (hours, 3600, 'hour'),
               (minutes, 60, 'minute')]
    ret = []
    for (enabled, size, name) in periods:
        if not enabled:
            continue
        (count, elapsed) = divmod(elapsed, size)
        if leadingZeroes or count:
            if count:
                # From here on, smaller periods are shown even when zero.
                leadingZeroes = True
            ret.append(_formatTimeUnit(count, name))
    if seconds:
        # Whatever is left over at this point is expressed in seconds.
        ret.append(_formatTimeUnit(elapsed, 'second'))
    if not ret:
        raise ValueError('Time difference not great enough to be noted.')
    if len(ret) == 1:
        return ret[0]
    return ' and '.join([', '.join(ret[:-1]), ret[-1]])
        
def distance(s, t):
    """Returns the levenshtein edit distance between two strings."""
    if len(s) == 0:
        return len(t)
    if len(t) == 0:
        return len(s)
    # Only the previously computed row of the edit-distance table is needed
    # to compute the next one.  Row 0 is the cost of building each prefix of
    # t from an empty string.
    previous = list(range(len(t) + 1))
    for (row, sChar) in enumerate(s):
        # Column 0 is the cost of deleting the first row+1 items of s.
        current = [row + 1]
        for (col, tChar) in enumerate(t):
            deletion = previous[col + 1] + 1
            insertion = current[col] + 1
            substitution = previous[col] + int(sChar != tChar)
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[-1]

# Maps each uppercase ASCII letter to its soundex digit; '0' marks the letters
# that are dropped from the hash.
_soundexCodes = dict(zip(string.ascii_uppercase,
                         '01230120022455012623010202'))
def soundex(s, length=4):
    """Returns the soundex hash of a given string.

    Only the ASCII letters of <s> are taken into account (case-insensitively);
    the hash starts with the first of those letters.  The hash is padded with
    '0' and truncated to <length> characters; a <length> of 0 returns the
    whole hash, unpadded.

    Raises ValueError if <s> contains no ASCII letter.
    """
    assert s
    letters = [c for c in s.upper() if c in _soundexCodes]
    if not letters:
        raise ValueError('No letters to compute a soundex hash from: %r' %
                         (s,))
    # The first character is taken after non-letters have been dropped, so a
    # leading digit or punctuation mark never ends up in the hash.
    firstChar = letters[0]
    codes = ''.join([_soundexCodes[c] for c in letters])
    codes = codes.lstrip(codes[0]) # Remove all repeated first characters.
    L = [firstChar]
    for c in codes:
        if c != L[-1]: # Collapse runs of the same code.
            L.append(c)
    L = [c for c in L if c != '0'] + (['0']*(length-1))
    s = ''.join(L)
    if length:
        return s[:length]
    return s.rstrip('0')

def dqrepr(s):
    """Returns a repr() of s guaranteed to be in double quotes."""
    # The wankers-that-be decided not to use double-quotes anymore in 2.3.
    # return '"' + repr("'\x00" + s)[6:]
    return '"%s"' % s.encode('string_escape').replace('"', '\\"')

# Matches a slash that is not preceded by a backslash.
nonEscapedSlashes = re.compile(r'(?<!\\)/')
def perlReToPythonRe(s):
    """Converts a string representation of a Perl regular expression (i.e.,
    m/^foo$/i or /foo|bar/) to a Python regular expression.

    Slashes inside the expression must be escaped with a backslash.  Each
    flag character is looked up (uppercased) as an attribute of the re module,
    so 'i' means re.I, 's' means re.S, and so on.

    Returns the compiled regular expression.  Raises ValueError if <s> does
    not have the form m/regexp/flags or contains an unknown flag.
    """
    try:
        (kind, regexp, flags) = nonEscapedSlashes.split(s)
    except ValueError: # Wrong number of unescaped slashes.
        raise ValueError('Invalid regexp: must be of the form '
                         'm/regexp/flags')
    regexp = regexp.replace('\\/', '/')
    if kind not in ('', 'm'):
        raise ValueError('Invalid kind: must be in ("", "m")')
    flag = 0
    for c in flags.upper():
        try:
            # Flags are bit masks and must be OR-ed together; AND-ing them
            # into 0 would silently discard every flag.
            flag |= getattr(re, c)
        except AttributeError:
            raise ValueError('Invalid flag: %s' % c)
    return re.compile(regexp, flag)
    
def perlReToReplacer(s):
    """Converts a string representation of a Perl regular expression (i.e.,
    s/foo/bar/g or s/foo/bar/i) to a Python function doing the equivalent
    replacement.

    Slashes inside the expression or the replacement must be escaped with a
    backslash.  The 'g' flag makes the returned function replace every match
    instead of only the first one; the remaining flags are handed to
    perlReToPythonRe.

    Returns a function taking a string and returning it with the replacement
    applied.  Raises ValueError if <s> does not have the form
    s/regexp/replacement/flags.
    """
    try:
        (kind, regexp, replace, flags) = nonEscapedSlashes.split(s)
    except ValueError: # Wrong number of unescaped slashes.
        raise ValueError('Invalid regexp: must be of the form '
                         's/regexp/replacement/flags')
    if kind != 's':
        raise ValueError('Invalid kind: must be "s"')
    # The slash had to be escaped to get through the split above; the
    # replacement text itself should contain a plain slash.
    replace = replace.replace('\\/', '/')
    g = 'g' in flags
    # 'g' is not a flag of the re module; it only selects the match count.
    flags = flags.replace('g', '')
    # The regexp keeps its escaped slashes so that it can be split again.
    r = perlReToPythonRe('/'.join(('', regexp, flags)))
    if g:
        return lambda s: r.sub(replace, s)
    else:
        return lambda s: r.sub(replace, s, 1)

# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78:
Added utils.py, with a quality HTML stripper; removed stripHtml from other source files 2003-03-27 21:10:10 +01:00			`#!/usr/bin/env python`

			`###`
			`# Copyright (c) 2002, Jeremiah Fincher`
			`# All rights reserved.`
			`#`
			`# Redistribution and use in source and binary forms, with or without`
			`# modification, are permitted provided that the following conditions are met:`
			`#`
			`# * Redistributions of source code must retain the above copyright notice,`
			`# this list of conditions, and the following disclaimer.`
			`# * Redistributions in binary form must reproduce the above copyright notice,`
			`# this list of conditions, and the following disclaimer in the`
			`# documentation and/or other materials provided with the distribution.`
			`# * Neither the name of the author of this software nor the name of`
			`# contributors to this software may be used to endorse or promote products`
			`# derived from this software without specific prior written consent.`
			`#`
			`# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE`
			`# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`# POSSIBILITY OF SUCH DAMAGE.`
			`###`

			`"""`
			`Simple utility functions.`
			`"""`

Updated stuff for 2.3 to break 2.2 compatibility :) 2003-08-07 04:48:44 +02:00			`## from __future__ import generators`
Added __future__ import 2003-03-31 08:55:34 +02:00
Added utils.py, with a quality HTML stripper; removed stripHtml from other source files 2003-03-27 21:10:10 +01:00			`from fix import *`

Added perlReToPythonRe and perlReToReplacer and associated tests. 2003-04-16 07:26:24 +02:00			`import re`
Added soundex to utils (along with a test) and to FunCommands. 2003-04-05 13:23:28 +02:00			`import string`
Added utils.py, with a quality HTML stripper; removed stripHtml from other source files 2003-03-27 21:10:10 +01:00			`import sgmllib`
			`import htmlentitydefs`

			`class HtmlToText(sgmllib.SGMLParser):`
			`"""Taken from some eff-bot code on c.l.p."""`
			`entitydefs = htmlentitydefs.entitydefs`
Added keyword arg to htmlToText to determine how to replace HTML tags. 2003-04-02 11:20:49 +02:00			`def __init__(self, tagReplace=' '):`
Added utils.py, with a quality HTML stripper; removed stripHtml from other source files 2003-03-27 21:10:10 +01:00			`self.data = []`
Added keyword arg to htmlToText to determine how to replace HTML tags. 2003-04-02 11:20:49 +02:00			`self.tagReplace = tagReplace`
Added utils.py, with a quality HTML stripper; removed stripHtml from other source files 2003-03-27 21:10:10 +01:00			`sgmllib.SGMLParser.__init__(self)`

			`def unknown_starttag(self, tag, attrib):`
Added keyword arg to htmlToText to determine how to replace HTML tags. 2003-04-02 11:20:49 +02:00			`self.data.append(self.tagReplace)`
Added utils.py, with a quality HTML stripper; removed stripHtml from other source files 2003-03-27 21:10:10 +01:00
			`def unknown_endtag(self, tag):`
Added keyword arg to htmlToText to determine how to replace HTML tags. 2003-04-02 11:20:49 +02:00			`self.data.append(self.tagReplace)`
Added utils.py, with a quality HTML stripper; removed stripHtml from other source files 2003-03-27 21:10:10 +01:00
			`def handle_data(self, data):`
			`self.data.append(data)`

			`def getText(self):`
			`text = ''.join(self.data).strip()`
Fixed whitespace bug in HTML stripper. 2003-03-27 21:14:17 +01:00			`return ' '.join(text.split()) # normalize whitespace`
Added utils.py, with a quality HTML stripper; removed stripHtml from other source files 2003-03-27 21:10:10 +01:00
Added keyword arg to htmlToText to determine how to replace HTML tags. 2003-04-02 11:20:49 +02:00			`def htmlToText(s, tagReplace=' '):`
Added more docstrings. 2003-08-10 12:45:44 +02:00			`"""Turns HTML into text. tagReplace is a string to replace HTML tags with.`
			`"""`
Added keyword arg to htmlToText to determine how to replace HTML tags. 2003-04-02 11:20:49 +02:00			`x = HtmlToText(tagReplace)`
Added utils.py, with a quality HTML stripper; removed stripHtml from other source files 2003-03-27 21:10:10 +01:00			`x.feed(s)`
			`return x.getText()`

Added abbrev 2003-03-31 07:14:21 +02:00			`def eachSubstring(s):`
Added more docstrings. 2003-08-10 12:45:44 +02:00			`"""Returns every substring starting at the first index until the last."""`
			`for i in xrange(1, len(s)+1):`
Added abbrev 2003-03-31 07:14:21 +02:00			`yield s[:i]`

			`def abbrev(strings):`
Added more docstrings. 2003-08-10 12:45:44 +02:00			`"""Returns a dictionary mapping unambiguous abbreviations to full forms."""`
Added abbrev 2003-03-31 07:14:21 +02:00			`d = {}`
			`for s in strings:`
			`for abbreviation in eachSubstring(s):`
			`if abbreviation not in d:`
			`d[abbreviation] = s`
			`else:`
Added test for abbrev and fixed a bug it found. 2003-04-04 18:14:58 +02:00			`if abbreviation not in strings:`
			`d[abbreviation] = None`
Added abbrev 2003-03-31 07:14:21 +02:00			`removals = []`
			`for key in d:`
			`if d[key] is None:`
			`removals.append(key)`
			`for key in removals:`
			`del d[key]`
			`return d`

Changed utils.timeElapsed to use just a seconds instead of a now/then argument. 2003-05-20 17:37:25 +02:00			`def timeElapsed(elapsed, leadingZeroes=False, years=True, weeks=True,`
Added timeElapsed and converted FunCommands to use it. 2003-04-03 11:11:57 +02:00			`days=True, hours=True, minutes=True, seconds=True):`
Added more docstrings. 2003-08-10 12:45:44 +02:00			`"""Given <elapsed> seconds, returns a string with an English description of`
			`how much time as passed. leadingZeroes determines whether 0 days, 0 hours,`
			`etc. will be printed; the others determine what larger time periods should`
			`be used.`
			`"""`
Made sure utils.timeElapsed doesn't go all decimal on me. 2003-06-03 05:18:14 +02:00			`elapsed = int(elapsed)`
Changed utils.timeElapsed to use just a seconds instead of a now/then argument. 2003-05-20 17:37:25 +02:00			`assert years or weeks or days or \`
			`hours or minutes or seconds, 'One flag must be True'`
Added timeElapsed and converted FunCommands to use it. 2003-04-03 11:11:57 +02:00			`ret = []`
			`if years:`
			`yrs, elapsed = elapsed // 31536000, elapsed % 31536000`
			`if leadingZeroes or yrs:`
			`if yrs:`
			`leadingZeroes = True`
			`if yrs != 1:`
			`yrs = '%s years' % yrs`
			`else:`
			`yrs = '1 year'`
			`ret.append(yrs)`
			`if weeks:`
			`wks, elapsed = elapsed // 604800, elapsed % 604800`
			`if leadingZeroes or wks:`
			`if wks:`
			`leadingZeroes = True`
			`if wks != 1:`
			`wks = '%s weeks' % wks`
			`else:`
			`wks = '1 week'`
			`ret.append(wks)`
			`if days:`
			`ds, elapsed = elapsed // 86400, elapsed % 86400`
			`if leadingZeroes or ds:`
			`if ds:`
			`leadingZeroes = True`
			`if ds != 1:`
			`ds = '%s days' % ds`
			`else:`
			`ds = '1 day'`
			`ret.append(ds)`
			`if hours:`
			`hrs, elapsed = elapsed // 3600, elapsed % 3600`
			`if leadingZeroes or hrs:`
			`if hrs:`
			`leadingZeroes = True`
			`if hrs != 1:`
			`hrs = '%s hours' % hrs`
			`else:`
			`hrs = '1 hour'`
			`ret.append(hrs)`
			`if minutes or seconds:`
			`mins, secs = elapsed // 60, elapsed % 60`
			`if leadingZeroes or mins:`
			`if mins != 1:`
			`mins = '%s minutes' % mins`
			`else:`
			`mins = '1 minute'`
			`ret.append(mins)`
			`if seconds:`
			`if secs != 1:`
			`secs = '%s seconds' % secs`
			`else:`
			`secs = '1 second'`
			`ret.append(secs)`
			`if len(ret) == 0:`
			`raise ValueError, 'Time difference not great enough to be noted.'`
			`if len(ret) == 1:`
			`return ret[0]`
			`else:`
			`return ' and '.join([', '.join(ret[:-1]), ret[-1]])`

Added levenshtein distance 2003-04-04 17:49:24 +02:00			`def distance(s, t):`
Added more docstrings. 2003-08-10 12:45:44 +02:00			`"""Returns the levenshtein edit distance between two strings."""`
Added levenshtein distance 2003-04-04 17:49:24 +02:00			`n = len(s)`
			`m = len(t)`
			`if n == 0:`
			`return m`
			`elif m == 0:`
			`return n`
Fixed bug (and added test) in distance. 2003-04-06 17:10:14 +02:00			`d = []`
			`for i in range(n+1):`
			`d.append([])`
			`for j in range(m+1):`
			`d[i].append(0)`
			`d[0][j] = j`
			`d[i][0] = i`
Added levenshtein distance 2003-04-04 17:49:24 +02:00			`for i in range(1, n+1):`
			`cs = s[i-1]`
			`for j in range(1, m+1):`
			`ct = t[j-1]`
Fixed bug (and added test) in distance. 2003-04-06 17:10:14 +02:00			`cost = int(cs != ct)`
Added levenshtein distance 2003-04-04 17:49:24 +02:00			`d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost)`
			`return d[n][m]`
Added soundex to utils (along with a test) and to FunCommands. 2003-04-05 13:23:28 +02:00
			`_soundextrans = string.maketrans(string.ascii_uppercase,`
			`'01230120022455012623010202')`
			`_notUpper = string.ascii.translate(string.ascii, string.ascii_uppercase)`
			`def soundex(s, length=4):`
Added more docstrings. 2003-08-10 12:45:44 +02:00			`"""Returns the soundex hash of a given string."""`
Added soundex to utils (along with a test) and to FunCommands. 2003-04-05 13:23:28 +02:00			`assert s`
			`s = s.upper() # Make everything uppercase.`
			`firstChar = s[0] # Save the first character.`
			`s = s.translate(string.ascii, _notUpper) # Delete non-letters.`
			`s = s.translate(_soundextrans) # Convert to soundex numbers.`
			`s = s.lstrip(s[0]) # Remove all repeated first characters.`
			`L = [firstChar]`
			`for c in s:`
			`if c != L[-1]:`
			`L.append(c)`
Added more docstrings. 2003-08-10 12:45:44 +02:00			`L = [c for c in L if c != '0'] + (['0']*(length-1))`
Added soundex to utils (along with a test) and to FunCommands. 2003-04-05 13:23:28 +02:00			`s = ''.join(L)`
Added length=0 handling to soundex to return the everything. 2003-04-05 13:29:29 +02:00			`return length and s[:length] or s.rstrip('0')`
Added timeElapsed and converted FunCommands to use it. 2003-04-03 11:11:57 +02:00
Added dqrepr. 2003-04-12 14:50:20 +02:00			`def dqrepr(s):`
			`"""Returns a repr() of s guaranteed to be in double quotes."""`
Upgraded to 2.3. 2003-07-31 08:20:58 +02:00			`# The wankers-that-be decided not to use double-quotes anymore in 2.3.`
			`# return '"' + repr("'\x00" + s)[6:]`
string_escape is better than unicode_escape. 2003-08-17 08:28:05 +02:00			`return '"%s"' % s.encode('string_escape').replace('"', '\\"')`
Added dqrepr. 2003-04-12 14:50:20 +02:00
Added perlReToPythonRe and perlReToReplacer and associated tests. 2003-04-16 07:26:24 +02:00			`nonEscapedSlashes = re.compile(r'(?<!\\)/')`
			`def perlReToPythonRe(s):`
Added more docstrings. 2003-08-10 12:45:44 +02:00			`"""Converts a string representation of a Perl regular expression (i.e.,`
			`m/^foo$/i or /foo\|bar/) to a Python regular expression.`
			`"""`
Added perlReToPythonRe and perlReToReplacer and associated tests. 2003-04-16 07:26:24 +02:00			`(kind, regexp, flags) = nonEscapedSlashes.split(s)`
			`regexp = regexp.replace('\\/', '/')`
			`if kind not in ('', 'm'):`
			`raise ValueError, 'Invalid kind: must be in ("", "m")'`
			`flag = 0`
			`try:`
			`for c in flags.upper():`
			`flag &= getattr(re, c)`
			`except AttributeError:`
			`raise ValueError, 'Invalid flag: %s' % c`
			`return re.compile(regexp, flag)`

			`def perlReToReplacer(s):`
Added more docstrings. 2003-08-10 12:45:44 +02:00			`"""Converts a string representation of a Perl regular expression (i.e.,`
			`s/foo/bar/g or s/foo/bar/i) to a Python function doing the equivalent`
			`replacement.`
			`"""`
Added perlReToPythonRe and perlReToReplacer and associated tests. 2003-04-16 07:26:24 +02:00			`(kind, regexp, replace, flags) = nonEscapedSlashes.split(s)`
			`if kind != 's':`
			`raise ValueError, 'Invalid kind: must be "s"'`
			`g = False`
			`if 'g' in flags:`
			`g = True`
			`flags = filter('g'.__ne__, flags)`
			`r = perlReToPythonRe('/'.join(('', regexp, flags)))`
			`if g:`
			`return lambda s: r.sub(replace, s)`
			`else:`
			`return lambda s: r.sub(replace, s, 1)`

Added utils.py, with a quality HTML stripper; removed stripHtml from other source files 2003-03-27 21:10:10 +01:00			`# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78:`