# Limnoria/others/reverend/splitter.py

# This module is part of the Pyndex project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.

import re
import string

class Splitter(object):
    """Split plain text into words worth indexing.

    Adapted from David Mertz's article in IBM developerWorks.

    Only ASCII letters and digits count as word characters; every other
    character, including all non-ASCII ones, acts as a separator.

    An instance also acts as a Lupy-style token stream: tokenStream()
    loads a document, after which next() hands out one Token per call and
    returns None when the tokens run out.
    """

    # Common English words.  No method of this class consults this table.
    stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
                 'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
                 'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
                 'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
                 'no': 1, 'these': 1, 'of': 1, 'there': 1,
                 'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
                 'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}

    # Characters accepted inside a word once case has been folded.
    yes = string.ascii_lowercase + string.digits
    # ASCII letters of either case.
    alpha = string.ascii_uppercase + string.ascii_lowercase

    # 256-entry character tables inherited from the original byte-oriented
    # implementation.  split() no longer uses them; they remain so that
    # code reading these attributes keeps working.  They now cover the
    # full 0..255 range (the original stopped at 254).
    ident = ''.join(map(chr, range(256)))
    nondigits = ''.join(c for c in ident if c not in string.digits)
    nonword = ''
    word_only = ''
    # An explicit loop is required here: a comprehension in a class body
    # cannot see the class-level name `yes`.
    for _ch in ident:
        if _ch in yes:
            word_only += _ch
        else:
            nonword += _ch
            word_only += ' '
    del _ch

    # Word matchers for the case-folded and the case-preserving mode.
    _token_re = re.compile(r'[a-z0-9]+')
    _token_cs_re = re.compile(r'[A-Za-z0-9]+')

    # Stream state defaults, so next() is safe before tokenStream().
    tokens = ()
    position = 0

    def close(self):
        """Do nothing; the method exists for Lupy support."""
        pass

    def tokenStream(self, fieldName, file, casesensitive=False):
        """Load the words of a document and return self as the stream.

        fieldName     -- accepted but not used.
        file          -- object whose read() returns the document text.
        casesensitive -- forwarded to split().
        """
        self.tokens = self.split(file.read(), casesensitive)
        self.position = 0
        return self

    def next(self):
        """Return the next Token, or None when no tokens are left."""
        if self.position >= len(self.tokens):
            return None
        token = Token(self.tokens[self.position])
        self.position += 1
        return token

    def split(self, text, casesensitive=0):
        """Return the list of words in text that are worth indexing.

        text          -- the string to split.
        casesensitive -- when false (the default) the text is lowercased
                         first; when true the original case is kept.

        A word is a maximal run of ASCII letters and digits.  Words are
        dropped when they are a single character, longer than 32
        characters, or look like non-word data (see the comments below).
        """
        # On Python 3 a binary file handle yields bytes.  Decode them one
        # byte per character so they can be matched as text.  On Python 2
        # bytes is str, so this branch is never taken there.
        if isinstance(text, bytes) and not isinstance(text, str):
            text = text.decode('latin-1')

        if casesensitive:
            pattern = self._token_cs_re
        else:
            text = text.lower()
            pattern = self._token_re

        digits = string.digits
        words = []
        for word in pattern.findall(text):
            length = len(word)
            if length <= 1 or length > 32:
                continue        # too short, or too long (probably gibberish)

            # A word holds only letters and digits, so its digit count is
            # also its count of non-alphabetic characters.
            numdigits = sum(1 for ch in word if ch in digits)
            if numdigits > length - 2:      # almost all digits
                if numdigits > 5:           # too many digits is gibberish
                    continue                # a moderate number is year/zipcode/etc
            elif numdigits * 2 > length:    # too much scattered nonalpha
                continue

            words.append(word)

        return words

class Token(object):
    """A single term handed out by a token stream."""

    def __init__(self, trmText):
        # Stored under the same name as the constructor argument.
        self.trmText = trmText

    def __repr__(self):
        return '%s(%r)' % (type(self).__name__, self.trmText)

    def termText(self):
        """Return the text this token was created with."""
        return self.trmText