Limnoria/others/reverend/splitter.py

# This module is part of the Pyndex project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.

import re
import string

class Splitter(object):
    """Utility class that splits plain text into indexable words.

    Adapted from David Mertz's article in IBM developerWorks.

    Only the characters listed in ``yes`` (ASCII letters and digits) count
    as word characters; every other character, including all non-ASCII
    ones, acts as a separator.  Needs work to handle international
    characters, etc.

    The class also exposes a tiny token-stream protocol (``tokenStream``,
    ``next``, ``close``); the original source labels this "Lupy support".
    """

    # Common English words.  NOTE: nothing in this class consults this
    # table; split() does not filter stop words.
    stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
                 'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
                 'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
                 'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
                 'no': 1, 'these': 1, 'of': 1, 'there': 1,
                 'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
                 'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}

    # Character classes used by split().  The locale-independent ascii_*
    # constants are used on purpose.  Upper-case letters are word characters
    # too, otherwise case-sensitive splitting would cut words at every
    # capital letter.
    alpha = string.ascii_uppercase + string.ascii_lowercase
    digits = string.digits
    yes = string.ascii_lowercase + string.ascii_uppercase + digits + ''  # throw in any extras

    # One maximal run of word characters == one candidate word.  A regex
    # (rather than a 256-entry translation table) also treats characters
    # above code point 255 as separators.
    _word_re = re.compile('[%s]+' % re.escape(yes))

    # --- Legacy translation tables -------------------------------------
    # split() no longer uses these; they are kept so that code reading
    # these attributes keeps working.  All ranges cover the full 0..255
    # span (the previous range(0, 255) silently left out character 255).
    _chars = []
    for _code in range(256):
        if chr(_code) not in yes:
            _chars.append(chr(_code))
    nonword = ''.join(_chars)
    del _chars, _code

    # str.maketrans on Python 3, string.maketrans on Python 2.
    _maketrans = getattr(str, 'maketrans', None) or string.maketrans
    word_only = _maketrans(nonword, ' ' * len(nonword))
    del _maketrans

    nondigits = ''.join(map(chr, range(0, 48))) + ''.join(map(chr, range(58, 256)))
    ident = ''.join(map(chr, range(256)))

    def close(self):
        """Do nothing; present only to satisfy the token-stream protocol."""
        # Lupy support
        pass

    def tokenStream(self, fieldName, file, casesensitive=False):
        """Read ``file`` completely and prepare this object for ``next()``.

        ``fieldName`` is accepted for protocol compatibility and ignored.
        ``file`` is any object with a ``read()`` method returning text.
        ``casesensitive`` is forwarded to ``split()``.

        Returns ``self`` with ``tokens`` set to the split words and
        ``position`` reset to 0.
        """
        self.tokens = self.split(file.read(), casesensitive)
        self.position = 0
        return self

    def next(self):
        """Return the next word wrapped in a ``Token``.

        Returns ``None`` once every token has been handed out; this method
        does not raise ``StopIteration``.  ``tokenStream()`` must have been
        called first.
        """
        if self.position >= len(self.tokens):
            return None
        res = Token(self.tokens[self.position])
        self.position += 1
        return res

    def split(self, text, casesensitive=0):
        """Return the list of words in ``text`` that are worth indexing.

        ``text`` is lower-cased first unless ``casesensitive`` is true.
        It is then cut into maximal runs of word characters, and a run is
        dropped when it is

        * shorter than 2 or longer than 32 characters,
        * (almost) entirely digits and has more than 5 digits, or
        * not mostly digits, yet more than half non-alphabetic.
        """
        # Python 3 only: accept raw bytes by mapping each byte to the code
        # point of the same value (on Python 2 ``bytes is str``, so this
        # branch never runs there).
        if isinstance(text, bytes) and not isinstance(text, str):
            text = text.decode('latin-1')

        # Let's adjust case if not case-sensitive
        if not casesensitive:
            text = text.lower()

        alpha = self.alpha
        digits = self.digits

        words = []
        for word in self._word_re.findall(text):
            length = len(word)
            if length <= 1 or length > 32:
                continue                        # too short / probably gibberish

            # Identify common patterns in non-word data (binary, UU/MIME, etc)
            numdigits = sum(1 for char in word if char in digits)
            if numdigits > length - 2:          # almost all digits
                if numdigits > 5:               # too many digits is gibberish
                    continue                    # a moderate number is year/zipcode/etc
            else:
                num_nonalpha = sum(1 for char in word if char not in alpha)
                if num_nonalpha * 2 > length:   # too much scattered nonalpha
                    continue

            words.append(word)

        return words

class Token(object):
    """Wrapper around a single word, as returned by ``Splitter.next()``."""

    def __init__(self, trmText):
        """Store ``trmText``, the text of the term this token stands for."""
        self.trmText = trmText

    def termText(self):
        """Return the term text given at construction time."""
        return self.trmText

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.trmText)
Initial checkin. 2004-10-03 11:08:36 +02:00			`# This module is part of the Pyndex project and is Copyright 2003 Amir`
			`# Bakhtiar (amir@divmod.org). This is free software; you can redistribute`
			`# it and/or modify it under the terms of version 2.1 of the GNU Lesser`
			`# General Public License as published by the Free Software Foundation.`

			`import string`

			`class Splitter(object):`
			`"""Split plain text into words" utility class`
			`Adapted from David Mertz's article in IBM developerWorks`
			`Needs work to handle international characters, etc"""`

			`## __slots__ = ['stemmer', 'porter', 'stopwording', 'word_only', 'nonword',`
			`## 'nondigits', 'alpha', 'ident', 'tokens', 'position']`


			`stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,`
			`'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,`
			`'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,`
			`'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,`
			`'no': 1, 'these': 1, 'of': 1, 'there': 1,`
			`'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,`
			`'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}`

			`yes = string.lowercase + string.digits + '' # throw in any extras`
			`nonword = ''`
			`for i in range(0,255):`
			`if chr(i) not in yes:`
			`nonword += chr(i)`

			`word_only = string.maketrans(nonword, " " * len(nonword))`

			`nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,255)), '')`
			`alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '')`
			`ident = string.join(map(chr, range(256)), '')`

			`def close(self):`
			`# Lupy support`
			`pass`

			`def tokenStream(self, fieldName, file, casesensitive=False):`
			`"""Split text/plain string into a list of words`
			`"""`
			`self.tokens = self.split(file.read())`
			`self.position = 0`
			`return self`

			`def next(self):`
			`if self.position >= len(self.tokens):`
			`return None`
			`res = Token(self.tokens[self.position])`
			`self.position += 1`
			`return res`

			`def split(self, text, casesensitive=0):`
			`# Speedup trick: attributes into local scope`
			`word_only = self.word_only`
			`ident = self.ident`
			`alpha = self.alpha`
			`nondigits = self.nondigits`

			`# Let's adjust case if not case-sensitive`
			`if not casesensitive: text = string.lower(text)`

			`# Split the raw text`
			`allwords = text.translate(word_only).split() # Let's strip funny byte values`

			`# Finally, let's skip some words not worth indexing`
			`words = []`
			`for word in allwords:`
			`if len(word) > 32: continue # too long (probably gibberish)`

			`# Identify common patterns in non-word data (binary, UU/MIME, etc)`
			`num_nonalpha = len(word.translate(ident, alpha))`
			`numdigits = len(word.translate(ident, nondigits))`
			`if numdigits > len(word)-2: # almost all digits`
			`if numdigits > 5: # too many digits is gibberish`
			`continue # a moderate number is year/zipcode/etc`
			`elif num_nonalpha*2 > len(word): # too much scattered nonalpha = gibberish`
			`continue`

			`word = word.translate(word_only) # Let's strip funny byte values`
			`subwords = word.split() # maybe embedded non-alphanumeric`
			`for subword in subwords: # ...so we might have subwords`
			`if len(subword) <= 1: continue # too short a subword`
			`words.append(subword)`

			`return words`

			`class Token:`
			`def __init__(self, trmText):`
			`self.trmText = trmText`

			`def termText(self):`
			`return self.trmText`