Commit 41eb6fe8 authored by Christopher Petrilli

Merge of Lexicon cleanup and text index merging.

parent 78f3476f
@@ -81,82 +81,108 @@
# many individuals on behalf of Digital Creations. Specific
# attributions are listed in the accompanying credits file.
#
##############################################################################
__doc__ = """Lexicon object that supports globbing."""

from Lexicon import Lexicon
from Splitter import Splitter
from intSet import intSet
from UnTextIndex import Or

import re, string
import OIBTree, BTree, IOBTree, IIBTree

# Short cuts for common data containers
OIBTree = OIBTree.BTree      # Object -> Integer
OOBTree = BTree.BTree        # Object -> Object
IOBTree = IOBTree.BTree      # Integer -> Object
IIBucket = IIBTree.Bucket    # Integer -> Integer
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching.  They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => set of word_ids

    Before going further, it is necessary to understand what a digram
    is, as it is a core component of the structure of this lexicon.  A
    digram is a two-letter sequence in a word.  For example, the word
    'zope' would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where '$' is a word marker placed at the beginning and end of the
    word, so that boundary digrams are distinct from interior ones.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'
    def __init__(self):
        self.counter = 0              # word id counter
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()
    def createDigrams(self, word):
        """Return a list of the digrams in the word."""
        digrams = []
        digrams.append(self.eow + word[0])     # Mark the beginning

        for i in range(len(word)):
            digrams.append(word[i:i+2])

        digrams[-1] = digrams[-1] + self.eow   # Mark the end

        return digrams
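
A standalone sketch of the same digram scheme (plain Python, no Zope
dependencies; the function name is illustrative, not from the commit):

    def create_digrams(word, eow='$'):
        digrams = [eow + word[0]]           # leading word marker
        for i in range(len(word)):
            digrams.append(word[i:i+2])     # two-letter windows
        digrams[-1] = digrams[-1] + eow     # trailing word marker
        return digrams

    assert create_digrams('zope') == ['$z', 'zo', 'op', 'pe', 'e$']
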
    def getWordId(self, word):
        """Provided 'word', return the matching integer word id."""
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId                    # Kludge for old code

    def assignWordId(self, word):
        """Assign a new word id to the provided word, and return it."""
        # Double check it's not in the lexicon already, and if it is,
        # just return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # First we go ahead and put the forward and reverse maps in.
        self._lexicon[word] = self.counter
        self._inverseLex[self.counter] = word

        # Now take all the digrams and insert them into the digram map.
        # The first and last digrams in the list are specially marked
        # with '$' to indicate the beginning and end of the word.
        for digram in self.createDigrams(word):
            set = self._digrams.get(digram)
            if set is None:
                self._digrams[digram] = set = intSet()
            set.insert(self.counter)

        self.counter = self.counter + 1
        return self.counter - 1        # Adjust for the previous increment
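
A minimal sketch of the id-assignment flow, with plain dicts and sets
standing in for the OIBTree/IOBTree/intSet containers (it reuses the
create_digrams sketch above; all names here are illustrative):

    class TinyLexicon:
        def __init__(self):
            self.counter = 0
            self._lexicon = {}      # word -> word id
            self._inverseLex = {}   # word id -> word
            self._digrams = {}      # digram -> set of word ids

        def getWordId(self, word):
            if word in self._lexicon:
                return self._lexicon[word]
            # Forward and reverse maps first, then the digram map.
            wid = self.counter
            self._lexicon[word] = wid
            self._inverseLex[wid] = word
            for digram in create_digrams(word):
                self._digrams.setdefault(digram, set()).add(wid)
            self.counter = wid + 1
            return wid

    lex = TinyLexicon()
    assert lex.getWordId('zope') == 0
    assert lex.getWordId('zope') == 0     # stable on repeated lookups
    assert 0 in lex._digrams['$z']        # digram map points back at the word
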
    def get(self, pattern):
        """Query the lexicon for words matching a pattern."""
        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
@@ -199,7 +225,7 @@ class GlobbingLexicon(Lexicon):
        ## may contain all matching digrams, but in the wrong
        ## order.

        expr = re.compile(self.createRegex(pattern))
        words = []
        hits = []
        for x in result.keys():
@@ -207,14 +233,14 @@ class GlobbingLexicon(Lexicon):
            hits.append(x)

        return hits
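
The two-phase lookup the comments describe can be sketched standalone:
intersect the id sets of the pattern's literal digrams to get candidates,
then re-check each candidate against a derived regex, since candidates
may contain all matching digrams but in the wrong order.  The digram
extraction below is simplified and hypothetical, not the commit's code:

    import re

    def glob_lookup(pattern, digram_map, inverse_lex):
        # Phase 1: intersect the id sets of the literal digrams;
        # wildcards ('*', '?') contribute none.
        candidates = None
        for chunk in pattern.replace('*', ' ').replace('?', ' ').split():
            for i in range(len(chunk) - 1):
                ids = digram_map.get(chunk[i:i+2], set())
                if candidates is None:
                    candidates = set(ids)
                else:
                    candidates = candidates & ids
        if candidates is None:          # the pattern was all wildcards
            candidates = set(inverse_lex)

        # Phase 2: verify digram order by matching the actual words.
        expr = re.compile(pattern.replace('*', '.*').replace('?', '.?') + '$')
        return [wid for wid in candidates if expr.match(inverse_lex[wid])]
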
    def __getitem__(self, word):
        """ """
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards"""
        words = []
        wids = []
        for w in q:
@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon):
        return words
    def Splitter(self, astring, words=None):
        """ wrap the splitter """
@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon):
        return Splitter(astring)
    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """
        transTable = string.maketrans("", "")

        # First, deal with multi-character globbing.
        result = string.replace(pat, '*', '.*')

        # Next, deal with single-character globbing.
        result = string.replace(result, '?', '.?')

        # Now remove all of the characters that are forbidden.
        result = string.translate(result, transTable,
                                  r'()&|!@#$%^{}\<>')

        return "%s$" % result
@@ -83,11 +83,6 @@
#
##############################################################################

import string, regex, ts_regex
import regsub

__doc__ = """Module breaks out Zope specific methods and behavior.  In
addition, provides the Lexicon class which defines a word to integer
mapping.
@@ -137,13 +132,23 @@ class Lexicon(Persistent, Implicit):
        self.stop_syn = stop_syn

    def getWordId(self, word):
        """ return the word id of 'word' """
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""
        # First make sure it's not already in there.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        if not hasattr(self, 'counter'):
            self.counter = 0
        self._lexicon[intern(word)] = self.counter
@@ -152,8 +157,8 @@ class Lexicon(Persistent, Implicit):

    def get(self, key, default=None):
        """Return the matched word against the key."""
        return [self._lexicon.get(key, default)]

    def __getitem__(self, key):
...
@@ -85,7 +85,7 @@
"""Simple column indices"""

__version__ = '$Revision: 1.24 $'[11:-2]
@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit):
                    ('unindex_object could not remove '
                     'integer id %s from index %s.  This '
                     'should not happen.'
                     % (str(documentId), str(self.id))))
        else:
            LOG(self.__class__.__name__, ERROR,
                ('unindex_object tried to retrieve set %s '
                 'from index %s but couldn\'t.  This '
                 'should not happen.' % (repr(entry), str(self.id))))

    def insertForwardIndexEntry(self, entry, documentId):
@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit):
            datum = getattr(obj, self.id)
            if callable(datum):
                datum = datum()
        except AttributeError:
            datum = MV

        # We don't want to do anything that we don't have to here, so
        # we'll check to see if the new and existing information is the
        # same.
        oldDatum = self._unindex.get(documentId, MV)
        if not datum == oldDatum:
            if oldDatum is not MV:
                self.removeForwardIndexEntry(oldDatum, documentId)
            self.insertForwardIndexEntry(datum, documentId)
            self._unindex[documentId] = datum

            returnStatus = 1

        self._p_changed = 1             # Tickle the transaction
        return returnStatus
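
The update strategy above -- compare the new value with the stored
reverse-map entry and only touch the indexes on a real change -- in a
standalone sketch (MV stands in for the catalog's missing-value marker;
the container names are illustrative):

    MV = object()    # stand-in for the catalog's missing-value marker

    def reindex(forward, unindex, document_id, datum):
        old = unindex.get(document_id, MV)
        if datum == old:
            return 0                                      # nothing changed
        if old is not MV:
            forward.get(old, set()).discard(document_id)  # drop stale entry
        forward.setdefault(datum, set()).add(document_id)
        unindex[document_id] = datum
        return 1

    forward, unindex = {}, {}
    assert reindex(forward, unindex, 42, 'blue') == 1
    assert reindex(forward, unindex, 42, 'blue') == 0   # unchanged: skipped
    assert reindex(forward, unindex, 42, 'red') == 1    # moved out of 'blue'
    assert 42 not in forward['blue'] and 42 in forward['red']
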
...
@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex):
            newKeywords = getattr(obj, self.id)
            if callable(newKeywords):
                newKeywords = newKeywords()
        except AttributeError:
            newKeywords = MV

        if type(newKeywords) is StringType:
@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex):
        except TypeError:
            return 0

        self._unindex[documentId] = newKeywords[:]   # Make a copy
        return 1
......
......@@ -89,43 +89,58 @@ The UnTextIndex falls under the 'I didnt have a better name for it'
excuse. It is an 'Un' Text index because it stores a little bit of
undo information so that objects can be unindexed when the old value
is no longer known.
"""
__version__ = '$Revision: 1.35 $'[11:-2]
"""
__version__='$Revision: 1.34 $'[11:-2]
from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree
from Acquisition import Implicit
from intSet import intSet
import operator
from Splitter import Splitter
import string, regex, regsub, ts_regex
from zLOG import LOG, ERROR
from Lexicon import Lexicon
from ResultList import ResultList
from types import *

BTree = BTree.BTree          # Regular generic BTree
IOBTree = IOBTree.BTree      # Integer -> Object
IIBucket = IIBTree.Bucket    # Integer -> Integer
OIBTree = OIBTree.BTree      # Object -> Integer

AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'

QueryError = 'TextIndex.QueryError'
class UnTextIndex(Persistent, Implicit):
    """Full-text index.

    There is a ZCatalog UML model that sheds some light on what is
    going on here.  '_index' is a BTree which maps word ids to a
    mapping from document id to score.  Something like:

      {'bob' : {1 : 5, 2 : 3, 42 : 9}}
      {'uncle' : {1 : 1}}

    The '_unindex' attribute is a mapping from document id to word
    ids.  This mapping allows the catalog to unindex an object:

      {42 : ('bob', 'is', 'your', 'uncle')}

    This isn't exactly how things are represented in memory; many
    optimizations happen along the way.
    """

    meta_type = 'Text Index'
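
A toy model of the two structures the docstring describes, with plain
dicts in place of the BTrees (real keys are integer word ids from the
Lexicon; strings are used only to mirror the docstring's example):

    _index = {
        'bob':   {1: 5, 2: 3, 42: 9},        # word -> {document id: score}
        'uncle': {1: 1},
    }
    _unindex = {
        42: ('bob', 'is', 'your', 'uncle'),  # document id -> words
    }

    # Unindexing document 42 walks its reverse entry and prunes the
    # forward map -- no need to still have the original object.
    for word in _unindex.pop(42):
        if word in _index:
            _index[word].pop(42, None)
            if not _index[word]:
                del _index[word]

    assert 42 not in _index.get('bob', {})
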
    def __init__(self, id=None, ignore_ex=None,
                 call_methods=None, lexicon=None):
        """Create an index
@@ -142,49 +157,33 @@ class UnTextIndex(Persistent, Implicit):
        of getattr or getitem to get an attribute.

        'lexicon' is the lexicon object to use; if None, the index
        will use a private lexicon."""

        if not id == ignore_ex == call_methods == None:
            self.id = id
            self.ignore_ex = ignore_ex
            self.call_methods = call_methods
            self._index = IOBTree()
            self._unindex = IOBTree()

        else:
            pass

        if lexicon is None:
            ## if no lexicon is provided, create a default one
            self._lexicon = Lexicon()
        else:
            self._lexicon = lexicon
    def getLexicon(self, vocab_id):
        """Return the Lexicon in use.

        Bit of a hack: indexes have been made acquirers so that they
        can acquire a vocabulary object from the object system in
        Zope.  I don't think indexes were ever intended to participate
        in this way, but I don't see too much of a problem with it."""

        if type(vocab_id) is not StringType:
            vocab = vocab_id
        else:
@@ -193,10 +192,14 @@ class UnTextIndex(Persistent, Implicit):

    def __len__(self):
        """Return the number of objects indexed."""
        return len(self._unindex)

    def clear(self):
        """Reinitialize the text index."""
        self._index = IOBTree()
        self._unindex = IOBTree()
@@ -214,6 +217,10 @@ class UnTextIndex(Persistent, Implicit):

    def getEntryForObject(self, rid, default=None):
        """Get all information contained for a specific object.

        This takes the object's record ID as its main argument."""

        wordMap = self.getLexicon(self._lexicon)._lexicon.items()
        results = self._unindex.get(rid, None)
@@ -247,12 +254,21 @@ class UnTextIndex(Persistent, Implicit):
            # Tuples are only used for rows which have only
            # a single entry.  Since we now need more, we'll
            # promote it to a mapping object (dictionary).

            # First, make sure we're not already in it; if so,
            # update the score if necessary.
            if indexRow[0] == documentId:
                if indexRow[1] != score:
                    indexRow = (documentId, score)
            else:
                indexRow = { indexRow[0]: indexRow[1] }
                indexRow[documentId] = score
            self._index[entry] = indexRow
        elif type(indexRow) is DictType:
            if indexRow.has_key(documentId):
                if indexRow[documentId] == score:
                    return 1            # No need to update
            elif len(indexRow) > 4:
                # We have a mapping (dictionary), but it has
                # grown too large, so we'll convert it to a
                # bucket.
@@ -266,6 +282,9 @@ class UnTextIndex(Persistent, Implicit):
                indexRow[documentId] = score
            else:
                # We've got an IIBucket already.
                if indexRow.has_key(documentId):
                    if indexRow[documentId] == score:
                        return 1
                indexRow[documentId] = score
        else:
            # We don't have any information at this point, so we'll
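
The storage-promotion strategy above -- a bare tuple for a single
posting, a dict for a handful, and an IIBucket beyond that -- in a
standalone sketch (the threshold of 4 matches the code; a dict subclass
stands in for IIBucket):

    class Bucket(dict):
        """Stand-in for IIBucket in this sketch."""

    def insert_posting(index, word_id, doc_id, score):
        row = index.get(word_id)
        if row is None:
            index[word_id] = (doc_id, score)              # single posting
        elif isinstance(row, tuple):
            if row[0] == doc_id:
                index[word_id] = (doc_id, score)          # update in place
            else:                                         # promote to dict
                index[word_id] = {row[0]: row[1], doc_id: score}
        elif not isinstance(row, Bucket) and len(row) > 4:
            index[word_id] = Bucket(row)                  # promote to bucket
            index[word_id][doc_id] = score
        else:
            row[doc_id] = score

    index = {}
    insert_posting(index, 7, 1, 5)
    assert index[7] == (1, 5)
    insert_posting(index, 7, 2, 3)
    assert index[7] == {1: 5, 2: 3}
    for d in range(3, 9):
        insert_posting(index, 7, d, 1)
    assert isinstance(index[7], Bucket)
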
@@ -277,13 +296,43 @@ class UnTextIndex(Persistent, Implicit):

    def insertReverseIndexEntry(self, entry, documentId):
        """Insert the correct entry into the reverse indexes for
        future unindexing."""

        newRow = self._unindex.get(documentId, [])
        if newRow:
            # Catch cases where we don't need to modify anything.
            if entry in newRow:
                return 1
        newRow.append(entry)
        self._unindex[documentId] = newRow
    def removeReverseEntry(self, entry, documentId):
        """Remove a single entry from the reverse index."""
        newRow = self._unindex.get(documentId, [])
        if newRow:
            try:
                newRow.remove(entry)
            except ValueError:
                pass    # We didn't have it; this shouldn't happen
            self._unindex[documentId] = newRow

    def removeForwardEntry(self, entry, documentId):
        """Remove a single entry from the forward index."""
        currentRow = self._index.get(entry, None)
        if type(currentRow) is TupleType:
            del self._index[entry]
        elif currentRow is not None:
            try:
                del self._index[entry][documentId]
            except (KeyError, IndexError, TypeError):
                LOG('UnTextIndex', ERROR,
                    'unindex_object tried to unindex nonexistent'
                    ' document %s' % str(documentId))
    def index_object(self, documentId, obj, threshold=None):
        """ Index an object:

        'documentId' is the integer id of the document
@@ -301,7 +350,7 @@ class UnTextIndex(Persistent, Implicit):
                source = str(source())
            else:
                source = str(source)
        except AttributeError:
            return 0
@@ -322,32 +371,36 @@ class UnTextIndex(Persistent, Implicit):
            else:
                wordList[word] = 1

        lexicon = self.getLexicon(self._lexicon)
        currentWordIds = self._unindex.get(documentId, [])
        wordCount = 0

        # First deal with deleted words.  To do that, convert the new
        # word list from words to word ids, so it can be compared
        # against the existing (already id-based) entries.
        wordListAsIds = OIBTree()
        for word, score in wordList.items():
            if threshold is not None:
                if ((wordCount % threshold) == 0) and not (wordCount == 0):
                    # commit a subtransaction hack
                    get_transaction().commit(1)
                    # kick the cache
                    self._p_jar.cacheFullSweep(1)
            wordListAsIds[lexicon.getWordId(word)] = score

        for word in currentWordIds:
            if not wordListAsIds.has_key(word):
                self.removeForwardEntry(word, documentId)

        # Now we can deal with new/updated entries.
        for wordId, score in wordListAsIds.items():
            self.insertForwardIndexEntry(wordId, documentId, score)
            self.insertReverseIndexEntry(wordId, documentId)
            wordCount = wordCount + 1

        # Return the number of words indexed.
        return wordCount
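
The incremental update above, reduced to a standalone sketch: convert
the new words to ids, drop forward entries for ids that disappeared,
then insert or update the rest ('word_ids_for' plays the lexicon's
role; all names are illustrative):

    def reindex_document(doc_id, word_scores, word_ids_for, forward, unindex):
        new_as_ids = {}
        for word, score in word_scores.items():
            new_as_ids[word_ids_for(word)] = score

        # Remove forward entries for words no longer in the document.
        for wid in unindex.get(doc_id, []):
            if wid not in new_as_ids:
                forward.get(wid, {}).pop(doc_id, None)

        # Insert or update the current words, forward and reverse.
        for wid, score in new_as_ids.items():
            forward.setdefault(wid, {})[doc_id] = score
        unindex[doc_id] = list(new_as_ids)
        return len(new_as_ids)
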
    def unindex_object(self, i):
        """ carefully unindex document with integer id 'i' from the text
        index and do not fail if it does not exist """

        index = self._index
        unindex = self._unindex
        val = unindex.get(i, None)
@@ -385,7 +438,7 @@ class UnTextIndex(Persistent, Implicit):
        if len(splitSource) == 1:
            splitSource = splitSource[0]
            if splitSource[:1] == '"' and splitSource[-1:] == '"':
                return self[splitSource]

            r = self._index.get(
@@ -429,13 +482,13 @@ class UnTextIndex(Persistent, Implicit):
            return None

        if type(keys) is StringType:
            if not keys or not string.strip(keys):
                return None
            keys = [keys]

        r = None
        for key in keys:
            key = string.strip(key)
            if not key:
                continue
@@ -480,11 +533,11 @@ class UnTextIndex(Persistent, Implicit):

    def _subindex(self, isrc, d, old, last):

        src = self.getLexicon(self._lexicon).Splitter(isrc)

        for s in src:
            if s[0] == '\"':
                last = self.subindex(s[1:-1], d, old, last)
            else:
                if old(s):
                    if s != last:
                        d[s] = d[s] + 1
@@ -493,15 +546,12 @@ class UnTextIndex(Persistent, Implicit):

        return last
    def query(self, s, default_operator=Or, ws=(string.whitespace,)):
        """ This is called by TextIndexes.  A 'query term' which is a
        string 's' is passed in, along with an index object.  s is
        parsed, then the wildcards are parsed, then something is
        parsed again, then the whole thing is 'evaluated'. """

        # First replace any occurrences of " and not " with " andnot ".
        s = ts_regex.gsub(
            '[%s]+[aA][nN][dD][%s]*[nN][oO][tT][%s]+' % (ws * 3),
def get_operands(self, q, i):
'''Evaluate and return the left and right operands for an operator'''
"""Evaluate and return the left and right operands for an operator"""
try:
left = q[i - 1]
right = q[i + 1]
@@ -550,7 +601,7 @@ class UnTextIndex(Persistent, Implicit):

    def evaluate(self, query):
        """Evaluate a parsed query"""

        # There are two options if the query passed in is only one
        # item.  Either it's an embedded query, in which case we'll
        # recursively evaluate it, or it's nothing for us
def parse(s):
'''Parse parentheses and quotes'''
"""Parse parentheses and quotes"""
l = []
tmp = string.lower(s)
......@@ -625,10 +676,10 @@ def parse(s):
return l
def parse2(q, default_operator,
           operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}):
    """Find operators and operands"""

    i = 0
    isop = operator_dict.has_key
    while i < len(q):
        if type(q[i]) is ListType:
            q[i] = parse2(q[i], default_operator)
@@ -646,9 +697,9 @@ def parse2(q, default_operator,
    return q
def parens(s, parens_re=regex.compile('(\|)').search):

    index = open_index = paren_count = 0

    while 1:
        index = parens_re(s, index)
@@ -672,7 +723,7 @@ def parens(s, parens_re=regex.compile('(\|)').search):
def quotes(s, ws=(string.whitespace,)):
    # split up quoted regions
    splitted = ts_regex.split(s, '[%s]*\"[%s]*' % (ws * 2))
    split = string.split
...