Commit 41eb6fe8 authored by Christopher Petrilli

Merge of Lexicon cleanup and text index merging.

parent 78f3476f
@@ -81,82 +81,108 @@
 # many individuals on behalf of Digital Creations. Specific
 # attributions are listed in the accompanying credits file.
 #
 ##############################################################################
-__doc__=""" Lexicon object that supports
-"""
+
 from Lexicon import Lexicon
 from Splitter import Splitter
 from intSet import intSet
 from UnTextIndex import Or
-import re, time
+import re, string
 import OIBTree, BTree, IOBTree, IIBTree
+
+# Short cuts for common data containers
 OIBTree = OIBTree.BTree                 # Object -> Integer
 OOBTree = BTree.BTree                   # Object -> Object
 IOBTree = IOBTree.BTree                 # Integer -> Object
 IIBucket = IIBTree.Bucket               # Integer -> Integer
-import pdb
 
 class GlobbingLexicon(Lexicon):
-    """
-    Base class to support globbing lexicon object.
-    """
+    """Lexicon which supports basic globbing function ('*' and '?').
+
+    This lexicon keeps several data structures around that are useful
+    for searching.  They are:
+
+      '_lexicon' -- Contains the mapping from word => word_id
+      '_inverseLex' -- Contains the mapping from word_id => word
+      '_digrams' -- Contains a mapping from digram => word_id
+
+    Before going further, it is necessary to understand what a digram is,
+    as it is a core component of the structure of this lexicon.  A digram
+    is a two-letter sequence in a word.  For example, the word 'zope'
+    would be converted into the digrams::
+
+      ['$z', 'zo', 'op', 'pe', 'e$']
+
+    where the '$' is a word marker.  It is used at the beginning and end
+    of the words.  Those digrams are significant.
+    """
 
     multi_wc = '*'
     single_wc = '?'
     eow = '$'
 
     def __init__(self):
-        self.counter = 0
+        self.counter = 0                # word id counter XXX
         self._lexicon = OIBTree()
         self._inverseLex = IOBTree()
         self._digrams = OOBTree()
 
-    def set(self, word):
-        """ """
+    def createDigrams(self, word):
+        """Returns a list with the set of digrams in the word."""
+        digrams = []
+        digrams.append(self.eow + word[0])        # Mark the beginning
+
+        for i in range(len(word)):
+            digrams.append(word[i:i+2])
+
+        digrams[-1] = digrams[-1] + self.eow      # Mark the end
+
+        return digrams
+
+    def getWordId(self, word):
+        """Provided 'word', return the matching integer word id."""
         if self._lexicon.has_key(word):
             return self._lexicon[word]
         else:
-            word = intern(word)
-            self._lexicon[word] = self.counter
-            self._inverseLex[self.counter] = word
-
-            ## now, split the word into digrams and insert references
-            ## to 'word' into the digram object.  The first and last
-            ## digrams in the list are specially marked with $ to
-            ## indicate the beginning and end of the word
-
-            digrams = []
-            digrams.append(self.eow + word[0])  # mark the beginning
-
-            for i in range(len(word)):
-                digrams.append(word[i:i+2])
-
-            digrams[-1] = digrams[-1] + self.eow  # mark the end
-
-            _digrams = self._digrams
-
-            for digram in digrams:
-                set = _digrams.get(digram)
-                if set is None:
-                    _digrams[digram] = set = intSet()
-                set.insert(self.counter)
-
-            counter = self.counter
-            self.counter = self.counter + 1
-            return counter
+            return self.assignWordId(word)
+
+    set = getWordId                     # Kludge for old code
+
+    def assignWordId(self, word):
+        """Assigns a new word id to the provided word, and returns it."""
+        # Double check it's not in the lexicon already, and if it is,
+        # just return it.
+        if self._lexicon.has_key(word):
+            return self._lexicon[word]
+
+        # First we go ahead and put the forward and reverse maps in.
+        self._lexicon[word] = self.counter
+        self._inverseLex[self.counter] = word
+
+        # Now take all the digrams and insert them into the digram map.
+        for digram in self.createDigrams(word):
+            set = self._digrams.get(digram)
+            if set is None:
+                self._digrams[digram] = set = intSet()
+            set.insert(self.counter)
+
+        self.counter = self.counter + 1
+        return self.counter - 1         # Adjust for the previous increment
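
To make the digram scheme concrete, here is a minimal standalone sketch of the same splitting logic (plain Python with illustrative names; the code above stores word ids in intSets keyed by digram):

    EOW = '$'

    def create_digrams(word):
        # Same walk as createDigrams above: the '$'-marked first
        # digram, every two-character slice, then a '$'-marked tail.
        digrams = [EOW + word[0]]
        for i in range(len(word)):
            digrams.append(word[i:i+2])
        digrams[-1] = digrams[-1] + EOW
        return digrams

    assert create_digrams('zope') == ['$z', 'zo', 'op', 'pe', 'e$']

A glob query later computes the digrams of its literal text, intersects the id sets stored under each digram, and then filters the candidate words with a regular expression (see get() and createRegex below).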
 
     def get(self, pattern):
-        """ Query the lexicon for words matching a pattern.
-        """
+        """ Query the lexicon for words matching a pattern."""
         wc_set = [self.multi_wc, self.single_wc]
 
         digrams = []
@@ -199,22 +225,22 @@ class GlobbingLexicon(Lexicon):
         ## may contain all matching digrams, but in the wrong
         ## order.
 
-        expr = re.compile(self.translate(pattern))
+        expr = re.compile(self.createRegex(pattern))
         words = []
         hits = []
         for x in result.keys():
             if expr.match(self._inverseLex[x]):
                 hits.append(x)
         return hits
 
     def __getitem__(self, word):
         """ """
         return self.get(word)
 
     def query_hook(self, q):
-        """expand wildcards
-        """
+        """expand wildcards"""
         words = []
         wids = []
         for w in q:
@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon):
         return words
 
     def Splitter(self, astring, words=None):
         """ wrap the splitter """
@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon):
         return Splitter(astring)
 
-    def translate(self, pat):
+    def createRegex(self, pat):
         """Translate a PATTERN to a regular expression.
 
         There is no way to quote meta-characters.
         """
-        i, n = 0, len(pat)
-        res = ''
-        while i < n:
-            c = pat[i]
-            i = i+1
-            if c == self.multi_wc:
-                res = res + '.*'
-            elif c == self.single_wc:
-                res = res + '.?'
-            else:
-                res = res + re.escape(c)
-        return res + '$'
+        transTable = string.maketrans("", "")
+
+        # First, deal with multi-character globbing
+        result = string.replace(pat, '*', '.*')
+
+        # Next, we need to deal with single-character globbing
+        result = string.replace(result, '?', '.?')
+
+        # Now, we need to remove all of the characters that
+        # are forbidden.
+        result = string.translate(result, transTable,
+                                  r'()&|!@#$%^{}\<>')
+
+        return "%s$" % result
@@ -83,11 +83,6 @@
 #
 ##############################################################################
-import string, regex, ts_regex
-import regsub
 
 __doc__=""" Module breaks out Zope specific methods and behavior.  In
 addition, provides the Lexicon class which defines a word to integer
 mapping.
@@ -137,23 +132,33 @@ class Lexicon(Persistent, Implicit):
         self.stop_syn = stop_syn
 
-    def set(self, word):
+    def getWordId(self, word):
         """ return the word id of 'word' """
         if self._lexicon.has_key(word):
             return self._lexicon[word]
         else:
-            if not hasattr(self, 'counter'):
-                self.counter = 0
-            self._lexicon[intern(word)] = self.counter
-            self.counter = self.counter + 1
-            return self.counter - 1
+            return self.assignWordId(word)
+
+    set = getWordId
+
+    def assignWordId(self, word):
+        """Assigns a new word id to the provided word and returns it."""
+        # First make sure it's not already in there
+        if self._lexicon.has_key(word):
+            return self._lexicon[word]
+
+        if not hasattr(self, 'counter'):
+            self.counter = 0
+        self._lexicon[intern(word)] = self.counter
+        self.counter = self.counter + 1
+        return self.counter - 1
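
The contract is small but worth stating: word ids are dense integers handed out in first-seen order, and a repeated word keeps its first id. A plain-dict sketch of that contract (illustrative, not the BTree-backed code above):

    lexicon = {}
    counter = 0

    def assign_word_id(word):
        global counter
        if word in lexicon:
            return lexicon[word]        # a repeated word keeps its id
        lexicon[word] = counter
        counter = counter + 1
        return counter - 1

    assert assign_word_id('zope') == assign_word_id('zope')

The hasattr dance above presumably exists because already-persisted Lexicon instances can predate the 'counter' attribute, so it is backfilled on first use.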
 
     def get(self, key, default=None):
-        """ """
+        """Return the matched word against the key."""
         return [self._lexicon.get(key, default)]
 
     def __getitem__(self, key):
...
@@ -85,7 +85,7 @@
 """Simple column indices"""
 
-__version__='$Revision: 1.23 $'[11:-2]
+__version__='$Revision: 1.24 $'[11:-2]
@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit):
                     ('unindex_object could not remove '
                      'integer id %s from index %s.  This '
                      'should not happen.'
-                     % (str(i), str(k))))
+                     % (str(documentId), str(self.id))))
             else:
                 LOG(self.__class__.__name__, ERROR,
                     ('unindex_object tried to retrieve set %s '
                      'from index %s but couldn\'t.  This '
-                     'should not happen.' % (repr(set), str(k))))
+                     'should not happen.' % (repr(entry), str(self.id))))
 
     def insertForwardIndexEntry(self, entry, documentId):
@@ -212,7 +212,7 @@ class UnIndex(Persistent, Implicit):
         This will also deal with creating the entire row if necessary."""
 
         indexRow = self._index.get(entry, MV)
 
         # Make sure there's actually a row there already.  If not, create
         # an IntSet and stuff it in first.
         if indexRow is MV:
@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit):
             datum = getattr(obj, self.id)
             if callable(datum):
                 datum = datum()
-        except:
+        except AttributeError:
             datum = MV
 
         # We don't want to do anything that we don't have to here, so we'll
         # check to see if the new and existing information is the same.
-        if not (datum == self._unindex.get(documentId, MV)):
+        oldDatum = self._unindex.get(documentId, MV)
+        if not datum == oldDatum:
+            if oldDatum is not MV:
+                self.removeForwardIndexEntry(oldDatum, documentId)
             self.insertForwardIndexEntry(datum, documentId)
             self._unindex[documentId] = datum
             returnStatus = 1
 
+        self._p_changed = 1             # Tickle the transaction
         return returnStatus
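
The new oldDatum branch closes a stale-entry hole: when a document's value changes, the old value's row must drop the document before the new value's row gains it. A dict-based sketch of that invariant (illustrative; the real code uses BTrees and MV for missing values):

    forward = {}    # value -> set of document ids
    reverse = {}    # document id -> last indexed value

    def index_object(doc_id, value):
        old = reverse.get(doc_id)
        if old == value:
            return 0                        # nothing changed
        if old is not None:
            forward[old].discard(doc_id)    # drop the stale entry
        forward.setdefault(value, set()).add(doc_id)
        reverse[doc_id] = value
        return 1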
...
@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex):
             newKeywords = getattr(obj, self.id)
             if callable(newKeywords):
                 newKeywords = newKeywords()
-        except:
+        except AttributeError:
             newKeywords = MV
 
         if type(newKeywords) is StringType:
@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex):
         except TypeError:
             return 0
 
-        self._unindex[documentId] = newKeywords
+        self._unindex[documentId] = newKeywords[:]      # Make a copy
 
         return 1
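
The [:] copy matters because a caller may keep mutating the list it handed in; without the copy the index would alias it. A short illustration:

    kw = ['a', 'b']
    stored = kw[:]        # what the index keeps
    kw.append('c')        # caller mutates its own list later
    assert stored == ['a', 'b']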
...
@@ -89,43 +89,58 @@
 excuse.  It is an 'Un' Text index because it stores a little bit of
 undo information so that objects can be unindexed when the old value
 is no longer known.
 """
 
-__version__='$Revision: 1.34 $'[11:-2]
+__version__ = '$Revision: 1.35 $'[11:-2]
 
-import BTree, IIBTree, IOBTree, OIBTree
-import string, regex, regsub, ts_regex
-import operator
-from intSet import intSet
 from Globals import Persistent
+import BTree, IIBTree, IOBTree, OIBTree
 from Acquisition import Implicit
-BTree=BTree.BTree
-IOBTree=IOBTree.BTree
-IIBucket=IIBTree.Bucket
-OIBTree=OIBTree.BTree
+from intSet import intSet
+import operator
 from Splitter import Splitter
-from string import strip
+import string, regex, regsub, ts_regex
 from zLOG import LOG, ERROR
-from types import *
-from Lexicon import Lexicon, stop_word_dict
+from Lexicon import Lexicon
 from ResultList import ResultList
+from types import *
+
+BTree = BTree.BTree                 # Regular generic BTree
+IOBTree = IOBTree.BTree             # Integer -> Object
+IIBucket = IIBTree.Bucket           # Integer -> Integer
+OIBTree = OIBTree.BTree             # Object -> Integer
 
 AndNot = 'andnot'
 And = 'and'
 Or = 'or'
 Near = '...'
-QueryError='TextIndex.QueryError'
+QueryError = 'TextIndex.QueryError'
 
 class UnTextIndex(Persistent, Implicit):
+    """Full-text index.
+
+    There is a ZCatalog UML model that sheds some light on what is
+    going on here.  '_index' is a BTree which maps word ids to a
+    mapping from document id to score.  Something like:
+
+      {'bob' : {1 : 5, 2 : 3, 42 : 9}}
+      {'uncle' : {1 : 1}}
+
+    The '_unindex' attribute is a mapping from document id to word
+    ids.  This mapping allows the catalog to unindex an object:
+
+      {42 : ('bob', 'is', 'your', 'uncle')}
+
+    This isn't exactly how things are represented in memory; many
+    optimizations happen along the way."""
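
A plain-dict sketch of the two structures just described, with word ids in place of the words the docstring uses for readability:

    _index = {7: {1: 5, 2: 3, 42: 9}}    # word id -> {document id: score}
    _unindex = {42: (7, 12, 9, 31)}      # document id -> word ids

    # A search on word id 7 yields documents 1, 2 and 42 with their
    # scores; unindexing document 42 walks _unindex[42] and removes 42
    # from each forward row.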
 
     meta_type = 'Text Index'
 
     def __init__(self, id=None, ignore_ex=None,
                  call_methods=None, lexicon=None):
         """Create an index
@@ -142,49 +157,33 @@ class UnTextIndex(Persistent, Implicit):
         of getattr or getitem to get an attribute.
 
-        'lexicon' is the lexicon object to specify, if None, the
-        index will use a private lexicon.
-
-        There is a ZCatalog UML model that sheds some light on what is
-        going on here.  '_index' is a BTree which maps word ids to
-        mapping from document id to score.  Something like:
-
-         {'bob' : {1 : 5, 2 : 3, 42 : 9}}
-         {'uncle' : {1 : 1}}
-
-        The '_unindex' attribute is a mapping from document id to word
-        ids.  This mapping allows the catalog to unindex an object:
-
-         {42 : ('bob', 'is', 'your', 'uncle')}
-
-        This isn't exactly how things are represented in memory, many
-        optimizations happen along the way.
-        """
-        if not id==ignore_ex==call_methods==None:
-            self.id=id
-            self.ignore_ex=ignore_ex
-            self.call_methods=call_methods
-            self._index=IOBTree()
-            self._unindex=IOBTree()
+        'lexicon' is the lexicon object to specify, if None, the
+        index will use a private lexicon."""
+
+        if not id == ignore_ex == call_methods == None:
+            self.id = id
+            self.ignore_ex = ignore_ex
+            self.call_methods = call_methods
+            self._index = IOBTree()
+            self._unindex = IOBTree()
 
         else:
             pass
 
         if lexicon is None:
             ## if no lexicon is provided, create a default one
-            self._lexicon=Lexicon()
+            self._lexicon = Lexicon()
         else:
             self._lexicon = lexicon
 
     def getLexicon(self, vocab_id):
-        """ bit of a hack, indexes have been made acquirers so that
-        they can acquire a vocabulary object from the object system in
-        Zope.  I don't think indexes were ever intended to participate
-        in this way, but I don't see too much of a problem with it.
-        """
+        """Return the Lexicon in use.
+
+        Bit of a hack, indexes have been made acquirers so that they
+        can acquire a vocabulary object from the object system in
+        Zope.  I don't think indexes were ever intended to participate
+        in this way, but I don't see too much of a problem with it."""
+
         if type(vocab_id) is not StringType:
             vocab = vocab_id
         else:
@@ -193,10 +192,14 @@ class UnTextIndex(Persistent, Implicit):
 
     def __len__(self):
+        """Return the number of objects indexed."""
         return len(self._unindex)
 
     def clear(self):
+        """Reinitialize the text index."""
         self._index = IOBTree()
         self._unindex = IOBTree()
@@ -214,6 +217,10 @@ class UnTextIndex(Persistent, Implicit):
 
     def getEntryForObject(self, rid, default=None):
+        """Get all information contained for a specific object.
+
+        This takes the object's record ID as its main argument."""
+
         wordMap = self.getLexicon(self._lexicon)._lexicon.items()
         results = self._unindex.get(rid, None)
@@ -239,7 +246,7 @@ class UnTextIndex(Persistent, Implicit):
          2-4   dictionary
          5+    bucket.
         """
 
         indexRow = self._index.get(entry, None)
         if indexRow is not None:
@@ -247,12 +254,21 @@ class UnTextIndex(Persistent, Implicit):
                 # Tuples are only used for rows which have only
                 # a single entry.  Since we now need more, we'll
                 # promote it to a mapping object (dictionary).
-                indexRow = { indexRow[0]: indexRow[1] }
-                indexRow[documentId] = score
-                self._index[entry] = indexRow
+
+                # First, make sure we're not already in it, if so
+                # update the score if necessary.
+                if indexRow[0] == documentId:
+                    if indexRow[1] != score:
+                        indexRow = (documentId, score)
+                else:
+                    indexRow = { indexRow[0]: indexRow[1] }
+                    indexRow[documentId] = score
+
+                self._index[entry] = indexRow
 
             elif type(indexRow) is DictType:
-                if len(indexRow) > 4:
+                if indexRow.has_key(documentId):
+                    if indexRow[documentId] == score:
+                        return 1        # No need to update
+                elif len(indexRow) > 4:
                     # We have a mapping (dictionary), but it has
                     # grown too large, so we'll convert it to a
                     # bucket.
@@ -266,6 +282,9 @@ class UnTextIndex(Persistent, Implicit):
                     indexRow[documentId] = score
             else:
                 # We've got a IIBucket already.
+                if indexRow.has_key(documentId):
+                    if indexRow[documentId] == score:
+                        return 1
                 indexRow[documentId] = score
         else:
             # We don't have any information at this point, so we'll
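
A compressed sketch of the tuple -> dictionary -> bucket promotion that insertForwardIndexEntry performs (the dictionary-to-IIBucket promotion at five entries is elided, and a plain dict stands in for the bucket):

    def insert(index, word_id, doc_id, score):
        row = index.get(word_id)
        if row is None:
            index[word_id] = (doc_id, score)        # one entry: tuple
        elif type(row) is tuple:
            if row[0] == doc_id:
                index[word_id] = (doc_id, score)    # same doc: rescore
            else:
                index[word_id] = {row[0]: row[1],   # promote to dict
                                  doc_id: score}
        else:
            row[doc_id] = score                     # dict or bucket

The tuple case keeps rows that reference a single document cheap, which matters when most words appear in only one document.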
@@ -277,13 +296,43 @@ class UnTextIndex(Persistent, Implicit):
 
     def insertReverseIndexEntry(self, entry, documentId):
         """Insert the correct entry into the reverse indexes for future
         unindexing."""
-        newEntry = self._unindex.get(documentId, [])
-        newEntry.append(entry)
-        self._unindex[documentId] = newEntry
+
+        newRow = self._unindex.get(documentId, [])
+        if newRow:
+            # Catch cases where we don't need to modify anything
+            if entry in newRow:
+                return 1
+
+        newRow.append(entry)
+        self._unindex[documentId] = newRow
+
+    def removeReverseEntry(self, entry, documentId):
+        """Removes a single entry from the reverse index."""
+        newRow = self._unindex.get(documentId, [])
+        if newRow:
+            try:
+                newRow.remove(entry)
+            except ValueError:
+                pass                    # We don't have it, this is bad
+
+            self._unindex[documentId] = newRow
+
+    def removeForwardEntry(self, entry, documentId):
+        """Remove a single entry from the forward index."""
+        currentRow = self._index.get(entry, None)
+
+        if type(currentRow) is TupleType:
+            del self._index[entry]
+        elif currentRow is not None:
+            try:
+                del self._index[entry][documentId]
+            except (KeyError, IndexError, TypeError):
+                LOG('UnTextIndex', ERROR,
+                    'unindex_object tried to unindex nonexistent'
+                    ' document %s' % str(documentId))
 
     def index_object(self, documentId, obj, threshold=None):
         """ Index an object:
 
         'documentId' is the integer id of the document
@@ -301,7 +350,7 @@ class UnTextIndex(Persistent, Implicit):
                 source = str(source())
             else:
                 source = str(source)
-        except:
+        except AttributeError:
             return 0
@@ -322,32 +371,36 @@ class UnTextIndex(Persistent, Implicit):
             else:
                 wordList[word] = 1
 
-        index = self._index
-        unindex = self._unindex
         lexicon = self.getLexicon(self._lexicon)
-        unindex[documentId] = []  # XXX
+        currentWordIds = self._unindex.get(documentId, [])
         wordCount = 0
 
+        # First deal with deleted words.  To do this, we start by
+        # converting the incoming words to word ids.
+        wordListAsIds = OIBTree()
        for word, score in wordList.items():
-            if threshold is not None:
-                if ((wordCount % threshold) == 0) and not (wordCount == 0):
-                    # commit a subtransaction hack
-                    get_transaction().commit(1)
-                    # kick the cache
-                    self._p_jar.cacheFullSweep(1)
-            #import pdb; pdb.set_trace()
-            wordId = lexicon.set(word)
+            wordListAsIds[lexicon.getWordId(word)] = score
+
+        for word in currentWordIds:
+            if not wordListAsIds.has_key(word):
+                self.removeForwardEntry(word, documentId)
+
+        # Now we can deal with new/updated entries
+        for wordId, score in wordListAsIds.items():
             self.insertForwardIndexEntry(wordId, documentId, score)
             self.insertReverseIndexEntry(wordId, documentId)
             wordCount = wordCount + 1
 
-        ## return the number of words you indexed
+        # Return the number of words you indexed
        return wordCount
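
The reindex path is now incremental: convert the new word list to ids, remove forward entries for ids that disappeared from the document, then upsert the rest (the per-threshold subtransaction hack drops out of the loop in the process). A self-contained dict sketch:

    forward = {}    # word id -> {document id: score}
    reverse = {}    # document id -> list of word ids

    def reindex(doc_id, new_scores):
        # new_scores: {word id: score} for the document's new text
        for wid in reverse.get(doc_id, []):
            if wid not in new_scores:                   # word vanished
                forward.get(wid, {}).pop(doc_id, None)
        for wid, score in new_scores.items():           # new/updated
            forward.setdefault(wid, {})[doc_id] = score
        reverse[doc_id] = list(new_scores)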
 
     def unindex_object(self, i):
         """ carefully unindex document with integer id 'i' from the text
         index and do not fail if it does not exist """
+
         index = self._index
         unindex = self._unindex
         val = unindex.get(i, None)
@@ -385,7 +438,7 @@ class UnTextIndex(Persistent, Implicit):
         if len(splitSource) == 1:
             splitSource = splitSource[0]
-            if splitSource[:1]=='"' and splitSource[-1:]=='"':
+            if splitSource[:1] == '"' and splitSource[-1:] == '"':
                 return self[splitSource]
 
             r = self._index.get(
@@ -429,13 +482,13 @@ class UnTextIndex(Persistent, Implicit):
             return None
 
         if type(keys) is StringType:
-            if not keys or not strip(keys):
+            if not keys or not string.strip(keys):
                 return None
             keys = [keys]
 
         r = None
 
         for key in keys:
-            key = strip(key)
+            key = string.strip(key)
             if not key:
                 continue
@@ -480,11 +533,11 @@ class UnTextIndex(Persistent, Implicit):
 
     def _subindex(self, isrc, d, old, last):
 
         src = self.getLexicon(self._lexicon).Splitter(isrc)
 
         for s in src:
-            if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
+            if s[0] == '\"':
+                last = self.subindex(s[1:-1], d, old, last)
             else:
                 if old(s):
                     if s != last: d[s] = d[s]+1
@@ -493,15 +546,12 @@ class UnTextIndex(Persistent, Implicit):
         return last
 
-    def query(self, s, default_operator = Or, ws = (string.whitespace,)):
-        """
-
-        This is called by TextIndexes.  A 'query term' which is a string
-        's' is passed in, along with an index object.  s is parsed, then
-        the wildcards are parsed, then something is parsed again, then the
-        whole thing is 'evaluated'
-        """
+    def query(self, s, default_operator=Or, ws=(string.whitespace,)):
+        """ This is called by TextIndexes.  A 'query term' which is a
+        string 's' is passed in, along with an index object.  s is
+        parsed, then the wildcards are parsed, then something is
+        parsed again, then the whole thing is 'evaluated'. """
 
         # First replace any occurrences of " and not " with " andnot "
         s = ts_regex.gsub(
             '[%s]+[aA][nN][dD][%s]*[nN][oO][tT][%s]+' % (ws * 3),
@@ -523,7 +573,8 @@ class UnTextIndex(Persistent, Implicit):
 
     def get_operands(self, q, i):
-        '''Evaluate and return the left and right operands for an operator'''
+        """Evaluate and return the left and right operands for an operator"""
+
         try:
             left = q[i - 1]
             right = q[i + 1]
@@ -550,7 +601,7 @@ class UnTextIndex(Persistent, Implicit):
 
     def evaluate(self, query):
-        '''Evaluate a parsed query'''
+        """Evaluate a parsed query"""
         # There are two options if the query passed in is only one
         # item.  It means either it's an embedded query, in which case
         # we'll recursively evaluate, otherwise it's nothing for us
@@ -602,7 +653,7 @@ class UnTextIndex(Persistent, Implicit):
 
 def parse(s):
-    '''Parse parentheses and quotes'''
+    """Parse parentheses and quotes"""
 
     l = []
     tmp = string.lower(s)
@@ -625,10 +676,10 @@ def parse(s):
     return l
 
 def parse2(q, default_operator,
-           operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near}):
-    '''Find operators and operands'''
+           operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}):
+    """Find operators and operands"""
 
     i = 0
-    isop=operator_dict.has_key
+    isop = operator_dict.has_key
     while (i < len(q)):
         if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator)
@@ -646,9 +697,9 @@ def parse2(q, default_operator,
     return q
 
-def parens(s, parens_re = regex.compile('(\|)').search):
+def parens(s, parens_re=regex.compile('(\|)').search):
 
-    index=open_index=paren_count = 0
+    index = open_index = paren_count = 0
 
     while 1:
         index = parens_re(s, index)
@@ -672,7 +723,7 @@ def parens(s, parens_re = regex.compile('(\|)').search):
 
 def quotes(s, ws = (string.whitespace,)):
     # split up quoted regions
     splitted = ts_regex.split(s, '[%s]*\"[%s]*' % (ws * 2))
     split=string.split
...