# This module is part of the Pyndex project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.

import string

class Splitter(object):
    """'Split plain text into words' utility class.

    Adapted from David Mertz's article in IBM developerWorks.
    Needs work to handle international characters, etc."""

##    __slots__ = ['stemmer', 'porter', 'stopwording', 'word_only', 'nonword',
##                 'nondigits', 'alpha', 'ident', 'tokens', 'position']

    stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
                 'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
                 'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
                 'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
                 'no': 1, 'these': 1, 'of': 1, 'there': 1,
                 'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
                 'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}

    yes = string.lowercase + string.digits + '' # throw in any extras
    nonword = ''
    for i in range(0, 255):
        if chr(i) not in yes:
            nonword += chr(i)

    word_only = string.maketrans(nonword, " " * len(nonword))

    nondigits = string.join(map(chr, range(0, 48)) + map(chr, range(58, 255)), '')
    alpha = string.join(map(chr, range(65, 91)) + map(chr, range(97, 123)), '')
    ident = string.join(map(chr, range(256)), '')

    def close(self):
        # Lupy support
        pass

    def tokenStream(self, fieldName, file, casesensitive=False):
        """Split a text/plain string into a list of words."""
        self.tokens = self.split(file.read())
        self.position = 0
        return self

    def next(self):
        if self.position >= len(self.tokens):
            return None
        res = Token(self.tokens[self.position])
        self.position += 1
        return res

    def split(self, text, casesensitive=0):
        # Speedup trick: pull class attributes into local scope
        word_only = self.word_only
        ident = self.ident
        alpha = self.alpha
        nondigits = self.nondigits

        # Let's adjust case if not case-sensitive
        if not casesensitive: text = string.lower(text)

        # Split the raw text
        allwords = text.translate(word_only).split()  # Let's strip funny byte values

        # Finally, let's skip some words not worth indexing
        words = []
        for word in allwords:
            if len(word) > 32: continue         # too long (probably gibberish)

            # Identify common patterns in non-word data (binary, UU/MIME, etc)
            num_nonalpha = len(word.translate(ident, alpha))
            numdigits    = len(word.translate(ident, nondigits))
            if numdigits > len(word)-2:         # almost all digits
                if numdigits > 5:               # too many digits is gibberish
                    continue                    # a moderate number is year/zipcode/etc
            elif num_nonalpha*2 > len(word):    # too much scattered nonalpha = gibberish
                continue

            word = word.translate(word_only)    # Let's strip funny byte values
            subwords = word.split()             # maybe embedded non-alphanumeric
            for subword in subwords:            # ...so we might have subwords
                if len(subword) <= 1: continue  # too short a subword
                words.append(subword)

        return words

class Token:
    def __init__(self, trmText):
        self.trmText = trmText

    def termText(self):
        return self.trmText
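
# A minimal usage sketch (not part of the original Pyndex module), assuming a
# Python 2 interpreter, to illustrate how Splitter.split() filters tokens:
# words over 32 characters and long runs of digits are treated as gibberish
# and dropped, single-character subwords are skipped, and moderate digit runs
# such as years are kept.  Note that the stopWords table above is defined but
# not consulted by split() itself.
if __name__ == '__main__':
    splitter = Splitter()
    print splitter.split("The quick brown fox, born 20030114, runs 7 km in 2003.")
    # Expected: ['the', 'quick', 'brown', 'fox', 'born', 'runs', 'km', 'in', '2003']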