mirror of
https://github.com/Mikaela/Limnoria.git
synced 2025-01-26 03:54:08 +01:00
97 lines
3.8 KiB
Python
97 lines
3.8 KiB
Python
|
# This module is part of the Pyndex project and is Copyright 2003 Amir
|
||
|
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
|
||
|
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
|
||
|
# General Public License as published by the Free Software Foundation.
|
||
|
|
||
|
import re
import string
|
||
|
|
||
|
class Splitter(object):
    """Split plain text into words.

    Adapted from David Mertz's article in IBM developerWorks.

    Only ASCII letters and digits count as word characters; every other
    character, including non-ASCII letters, acts as a word separator.
    Besides ``split`` the class offers a minimal token-stream interface
    (``tokenStream`` / ``next`` / ``close``).
    """

    # Common English words.  Provided for callers; split() does not filter
    # its result against this table.
    stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
                 'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
                 'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
                 'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
                 'no': 1, 'these': 1, 'of': 1, 'there': 1,
                 'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
                 'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}

    # Word characters once the text has been lower-cased.
    yes = string.ascii_lowercase + string.digits + ''  # throw in any extras
    # ASCII letters of either case.
    alpha = string.ascii_uppercase + string.ascii_lowercase

    # Words longer than this are assumed to be gibberish.
    _MAX_WORD_LENGTH = 32
    # A digit-dominated word with more digits than this is gibberish; a
    # moderate number of digits is kept (year, zipcode, etc).
    _MAX_DIGITS = 5

    # Runs of word characters.  The second pattern also admits uppercase
    # letters so that case-sensitive splitting keeps them inside words.
    _folded_words = re.compile('[%s]+' % re.escape(yes))
    _cased_words = re.compile(
        '[%s]+' % re.escape(yes + string.ascii_uppercase))

    # Stream state; both are replaced by tokenStream().  The defaults make
    # next() report an exhausted stream when nothing has been tokenized yet.
    tokens = ()
    position = 0

    def close(self):
        """Do nothing; present for Lupy support."""
        pass

    def tokenStream(self, fieldName, file, casesensitive=False):
        """Tokenize the contents of *file* and rewind the stream.

        fieldName     -- accepted but not used by this implementation.
        file          -- object whose read() returns the text to split.
        casesensitive -- forwarded to split(); when false the text is
                         lower-cased before splitting.

        Returns self, so the tokens can be fetched one by one with next().
        """
        self.tokens = self.split(file.read(), casesensitive)
        self.position = 0
        return self

    def next(self):
        """Return the next Token of the stream, or None when exhausted."""
        if self.position >= len(self.tokens):
            return None
        res = Token(self.tokens[self.position])
        self.position += 1
        return res

    def split(self, text, casesensitive=0):
        """Return the list of words in *text* that are worth indexing.

        text          -- the plain text to split.
        casesensitive -- when false (the default) the text is lower-cased
                         first; when true, case is preserved.

        A word is a maximal run of ASCII letters and digits.  Words are
        dropped when they are a single character, longer than 32
        characters, made (almost) entirely of more than 5 digits, or more
        than half digits without being almost all digits.
        """
        # On Python 3 a byte string is decoded one byte per character so the
        # text patterns apply; on Python 2 ``bytes`` is ``str`` and the text
        # is used as it is.
        if isinstance(text, bytes) and not isinstance(text, str):
            text = text.decode('latin-1')

        if casesensitive:
            finder = self._cased_words
        else:
            text = text.lower()
            finder = self._folded_words

        alpha = self.alpha
        digits = string.digits
        max_length = self._MAX_WORD_LENGTH
        max_digits = self._MAX_DIGITS

        words = []
        for word in finder.findall(text):
            length = len(word)
            if length <= 1:             # too short a word
                continue
            if length > max_length:     # too long (probably gibberish)
                continue

            # Identify common patterns in non-word data (binary, UU/MIME, etc)
            numdigits = sum(1 for ch in word if ch in digits)
            num_nonalpha = sum(1 for ch in word if ch not in alpha)
            if numdigits > length - 2:          # almost all digits
                if numdigits > max_digits:      # too many digits is gibberish
                    continue                    # a moderate number is kept
            elif num_nonalpha * 2 > length:     # too much scattered nonalpha
                continue

            words.append(word)

        return words
|
||
|
|
||
|
class Token:
    """Hold the text of a single term."""

    def __init__(self, trmText):
        # The text is stored exactly as given.
        self.trmText = trmText

    def termText(self):
        """Return the text this token was created with."""
        return self.trmText
|
||
|
|