# Limnoria/others/reverend/splitter.py
#
# 97 lines
# 3.8 KiB
# Python
# Executable File

# This module is part of the Pyndex project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.
import re
import string
class Splitter(object):
    """Split plain text into a list of indexable words.

    Adapted from David Mertz's article in IBM developerWorks.  Only ASCII
    letters and digits count as word characters; every other character is
    treated as a separator, so international text still needs work.

    An instance also acts as a minimal token stream: ``tokenStream`` loads
    the words of a file-like object and ``next`` hands them out one by one.
    """

    # Common English words.  NOTE: nothing in this class consults this
    # table; ``split`` does not remove stop words.
    stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
                 'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
                 'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
                 'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
                 'no': 1, 'these': 1, 'of': 1, 'there': 1,
                 'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
                 'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}

    # Characters kept inside a word once the text has been lower-cased.
    yes = string.ascii_lowercase + string.digits + ''  # throw in any extras
    # ASCII letters, used to count the non-alphabetic characters of a word.
    alpha = string.ascii_uppercase + string.ascii_lowercase

    # Anything matched by these patterns is a separator.  They are built
    # from ``yes`` so that extras added there are honoured automatically.
    # The case-sensitive variant also keeps uppercase letters; otherwise
    # they would all be turned into separators.
    _nonword_re = re.compile('[^%s]' % re.escape(yes))
    _nonword_cs_re = re.compile(
        '[^%s]' % re.escape(yes + string.ascii_uppercase))

    _MAX_WORD_LENGTH = 32   # longer words are probably gibberish
    _MAX_DIGITS = 5         # longer numbers are probably gibberish

    def close(self):
        """Do nothing; present only so the object can be closed."""
        # Lupy support
        pass

    def tokenStream(self, fieldName, file, casesensitive=False):
        """Load the words of ``file`` and return ``self`` as a token stream.

        ``fieldName`` is accepted but not used.  ``file`` must provide a
        ``read()`` method returning the whole text.  ``casesensitive`` is
        forwarded to ``split``.
        """
        self.tokens = self.split(file.read(), casesensitive)
        self.position = 0
        return self

    def next(self):
        """Return the next word wrapped in a ``Token``, or ``None`` at the end."""
        if self.position >= len(self.tokens):
            return None
        res = Token(self.tokens[self.position])
        self.position += 1
        return res

    def split(self, text, casesensitive=0):
        """Return the list of words in ``text`` that are worth indexing.

        Unless ``casesensitive`` is true the text is lower-cased first.
        Every character that is not an ASCII letter or digit separates
        words.  A word is then dropped when it is:

        * longer than 32 characters,
        * (almost) all digits and has more than 5 digits,
        * more than half non-alphabetic, or
        * a single character.
        """
        if casesensitive:
            separators = self._nonword_cs_re
        else:
            text = text.lower()
            separators = self._nonword_re

        # Speedup trick: attributes into local scope
        alpha = self.alpha
        digits = string.digits
        max_length = self._MAX_WORD_LENGTH
        max_digits = self._MAX_DIGITS

        words = []
        for word in separators.sub(' ', text).split():
            length = len(word)
            if length > max_length:
                continue
            # Identify common patterns in non-word data (binary, UU/MIME, etc)
            numdigits = sum(1 for ch in word if ch in digits)
            num_nonalpha = sum(1 for ch in word if ch not in alpha)
            if numdigits > length - 2:
                # Almost all digits: a moderate number is a year/zipcode/etc,
                # too many digits is gibberish.
                if numdigits > max_digits:
                    continue
            elif num_nonalpha * 2 > length:
                # Too much scattered non-alpha is gibberish.
                continue
            # The word contains word characters only at this point, so it
            # cannot hide further sub-words; just reject one-letter words.
            if length <= 1:
                continue
            words.append(word)
        return words
class Token(object):
    """Wrap a single word so it can be handed out by a token stream."""

    def __init__(self, trmText):
        """Store ``trmText`` as the text of this token."""
        self.trmText = trmText

    def __repr__(self):
        return '%s(%r)' % (type(self).__name__, self.trmText)

    def termText(self):
        """Return the text this token was created with."""
        return self.trmText