Commit 41eb6fe8 authored by Christopher Petrilli's avatar Christopher Petrilli

Merge of Lexicon cleanup and text index merging.

parent 78f3476f
...@@ -81,82 +81,108 @@ ...@@ -81,82 +81,108 @@
# many individuals on behalf of Digital Creations. Specific # many individuals on behalf of Digital Creations. Specific
# attributions are listed in the accompanying credits file. # attributions are listed in the accompanying credits file.
# #
############################################################################## #############################################################################
__doc__=""" Lexicon object that supports
"""
from Lexicon import Lexicon from Lexicon import Lexicon
from Splitter import Splitter from Splitter import Splitter
from intSet import intSet from intSet import intSet
from UnTextIndex import Or from UnTextIndex import Or
import re, time import re, string
import OIBTree, BTree, IOBTree, IIBTree import OIBTree, BTree, IOBTree, IIBTree
# Short cuts for common data containers
OIBTree = OIBTree.BTree # Object -> Integer OIBTree = OIBTree.BTree # Object -> Integer
OOBTree = BTree.BTree # Object -> Object OOBTree = BTree.BTree # Object -> Object
IOBTree = IOBTree.BTree # Integer -> Object IOBTree = IOBTree.BTree # Integer -> Object
IIBucket = IIBTree.Bucket # Integer -> Integer IIBucket = IIBTree.Bucket # Integer -> Integer
import pdb
class GlobbingLexicon(Lexicon): class GlobbingLexicon(Lexicon):
""" """Lexicon which supports basic globbing functions ('*' and '?').
This lexicon keeps several data structures around that are useful
for searching. They are:
'_lexicon' -- Contains the mapping from word => word_id
'_inverseLex' -- Contains the mapping from word_id => word
'_digrams' -- Contains a mapping from digram => word_id
Base class to support globbing lexicon object. Before going further, it is necessary to understand what a digram is,
as it is a core component of the structure of this lexicon. A digram
is a two-letter sequence in a word. For example, the word 'zope'
would be converted into the digrams::
['$z', 'zo', 'op', 'pe', 'e$']
where the '$' is a word marker. It is used at the beginning and end
of the words. Those digrams are significant.
""" """
multi_wc = '*' multi_wc = '*'
single_wc = '?' single_wc = '?'
eow = '$' eow = '$'
def __init__(self):
self.counter = 0 def __init__(self):
self.counter = 0 # word id counter XXX
self._lexicon = OIBTree() self._lexicon = OIBTree()
self._inverseLex = IOBTree() self._inverseLex = IOBTree()
self._digrams = OOBTree() self._digrams = OOBTree()
def set(self, word):
""" """ def createDigrams(self, word):
"""Returns a list with the set of digrams in the word."""
digrams = []
digrams.append(self.eow + word[0]) # Mark the beginning
for i in range(len(word)):
digrams.append(word[i:i+2])
digrams[-1] = digrams[-1] + self.eow # Mark the end
return digrams
def getWordId(self, word):
"""Provided 'word', return the matching integer word id."""
if self._lexicon.has_key(word): if self._lexicon.has_key(word):
return self._lexicon[word] return self._lexicon[word]
else: else:
word = intern(word) return self.assignWordId(word)
self._lexicon[word] = self.counter
self._inverseLex[self.counter] = word
## now, split the word into digrams and insert references set = getWordId # Kludge for old code
## to 'word' into the digram object. The first and last
## digrams in the list are specially marked with $ to
## indicate the beginning and end of the word
digrams = []
digrams.append(self.eow + word[0]) # mark the beginning
for i in range(len(word)): def assignWordId(self, word):
digrams.append(word[i:i+2]) """Assigns a new word id to the provided word, and return it."""
digrams[-1] = digrams[-1] + self.eow # mark the end # Double check it's not in the lexicon already, and if it is, just
# return it.
_digrams = self._digrams if self._lexicon.has_key(word):
return self._lexicon[word]
for digram in digrams:
set = _digrams.get(digram) # First we go ahead and put the forward and reverse maps in.
if set is None: self._lexicon[word] = self.counter
_digrams[digram] = set = intSet() self._inverseLex[self.counter] = word
set.insert(self.counter)
counter = self.counter # Now take all the digrams and insert them into the digram map.
self.counter = self.counter + 1 for digram in self.createDigrams(word):
return counter set = self._digrams.get(digram)
if set is None:
self._digrams[digram] = set = intSet()
set.insert(self.counter)
self.counter = self.counter + 1
return self.counter - 1 # Adjust for the previous increment
def get(self, pattern): def get(self, pattern):
""" Query the lexicon for words matching a pattern. """ Query the lexicon for words matching a pattern."""
"""
wc_set = [self.multi_wc, self.single_wc] wc_set = [self.multi_wc, self.single_wc]
digrams = [] digrams = []
...@@ -199,22 +225,22 @@ class GlobbingLexicon(Lexicon): ...@@ -199,22 +225,22 @@ class GlobbingLexicon(Lexicon):
## may contain all matching digrams, but in the wrong ## may contain all matching digrams, but in the wrong
## order. ## order.
expr = re.compile(self.translate(pattern)) expr = re.compile(self.createRegex(pattern))
words = [] words = []
hits = [] hits = []
for x in result.keys(): for x in result.keys():
if expr.match(self._inverseLex[x]): if expr.match(self._inverseLex[x]):
hits.append(x) hits.append(x)
return hits return hits
def __getitem__(self, word): def __getitem__(self, word):
""" """ """ """
return self.get(word) return self.get(word)
def query_hook(self, q):
"""expand wildcards
""" def query_hook(self, q):
"""expand wildcards"""
words = [] words = []
wids = [] wids = []
for w in q: for w in q:
...@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon): ...@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon):
return words return words
def Splitter(self, astring, words=None): def Splitter(self, astring, words=None):
""" wrap the splitter """ """ wrap the splitter """
...@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon): ...@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon):
return Splitter(astring) return Splitter(astring)
def translate(self, pat): def createRegex(self, pat):
"""Translate a PATTERN to a regular expression. """Translate a PATTERN to a regular expression.
There is no way to quote meta-characters. There is no way to quote meta-characters.
""" """
i, n = 0, len(pat) transTable = string.maketrans("", "")
res = ''
    while i < n:                                  # First, deal with multi-character globbing
c = pat[i] result = string.replace(pat, '*', '.*')
i = i+1
if c == self.multi_wc: # Next, we need to deal with single-character globbing
res = res + '.*' result = string.replace(result, '?', '.?')
elif c == self.single_wc:
res = res + '.?' # Now, we need to remove all of the characters that
else: # are forbidden.
res = res + re.escape(c) result = string.translate(result, transTable,
return res + '$' r'()&|!@#$%^{}\<>')
return "%s$" % result
...@@ -83,11 +83,6 @@ ...@@ -83,11 +83,6 @@
# #
############################################################################## ##############################################################################
import string, regex, ts_regex
import regsub
__doc__=""" Module breaks out Zope specific methods and behavior. In __doc__=""" Module breaks out Zope specific methods and behavior. In
addition, provides the Lexicon class which defines a word to integer addition, provides the Lexicon class which defines a word to integer
mapping. mapping.
...@@ -137,23 +132,33 @@ class Lexicon(Persistent, Implicit): ...@@ -137,23 +132,33 @@ class Lexicon(Persistent, Implicit):
self.stop_syn = stop_syn self.stop_syn = stop_syn
def set(self, word): def getWordId(self, word):
""" return the word id of 'word' """ """ return the word id of 'word' """
if self._lexicon.has_key(word): if self._lexicon.has_key(word):
return self._lexicon[word] return self._lexicon[word]
else: else:
if not hasattr(self, 'counter'): return self.assignWordId(word)
self.counter = 0
self._lexicon[intern(word)] = self.counter set = getWordId
self.counter = self.counter + 1
return self.counter - 1
def assignWordId(self, word):
"""Assigns a new word id to the provided word and returns it."""
# First make sure it's not already in there
if self._lexicon.has_key(word):
return self._lexicon[word]
if not hasattr(self, 'counter'):
self.counter = 0
self._lexicon[intern(word)] = self.counter
self.counter = self.counter + 1
return self.counter - 1
def get(self, key, default=None): def get(self, key, default=None):
""" """ """Return the matched word against the key."""
return [self._lexicon.get(key, default)] return [self._lexicon.getWordId(key, default)]
def __getitem__(self, key): def __getitem__(self, key):
......
...@@ -85,7 +85,7 @@ ...@@ -85,7 +85,7 @@
"""Simple column indices""" """Simple column indices"""
__version__='$Revision: 1.23 $'[11:-2] __version__='$Revision: 1.24 $'[11:-2]
...@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit): ...@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit):
('unindex_object could not remove ' ('unindex_object could not remove '
'integer id %s from index %s. This ' 'integer id %s from index %s. This '
'should not happen.' 'should not happen.'
% (str(i), str(k)))) % (str(documentId), str(self.id))))
else: else:
LOG(self.__class__.__name__, ERROR, LOG(self.__class__.__name__, ERROR,
('unindex_object tried to retrieve set %s ' ('unindex_object tried to retrieve set %s '
'from index %s but couldn\'t. This ' 'from index %s but couldn\'t. This '
'should not happen.' % (repr(set),str(k)))) 'should not happen.' % (repr(entry), str(self.id))))
def insertForwardIndexEntry(self, entry, documentId): def insertForwardIndexEntry(self, entry, documentId):
...@@ -212,7 +212,7 @@ class UnIndex(Persistent, Implicit): ...@@ -212,7 +212,7 @@ class UnIndex(Persistent, Implicit):
This will also deal with creating the entire row if necessary.""" This will also deal with creating the entire row if necessary."""
indexRow = self._index.get(entry, MV) indexRow = self._index.get(entry, MV)
# Make sure there's actually a row there already. If not, create # Make sure there's actually a row there already. If not, create
# an IntSet and stuff it in first. # an IntSet and stuff it in first.
if indexRow is MV: if indexRow is MV:
...@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit): ...@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit):
datum = getattr(obj, self.id) datum = getattr(obj, self.id)
if callable(datum): if callable(datum):
datum = datum() datum = datum()
except: except AttributeError:
datum = MV datum = MV
# We don't want to do anything that we don't have to here, so we'll # We don't want to do anything that we don't have to here, so we'll
# check to see if the new and existing information is the same. # check to see if the new and existing information is the same.
if not (datum == self._unindex.get(documentId, MV)): oldDatum = self._unindex.get(documentId, MV)
if not datum == oldDatum:
if oldDatum is not MV:
self.removeForwardIndexEntry(oldDatum, documentId)
self.insertForwardIndexEntry(datum, documentId) self.insertForwardIndexEntry(datum, documentId)
self._unindex[documentId] = datum self._unindex[documentId] = datum
returnStatus = 1 returnStatus = 1
self._p_changed = 1 # Tickle the transaction
return returnStatus return returnStatus
......
...@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex): ...@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex):
newKeywords = getattr(obj, self.id) newKeywords = getattr(obj, self.id)
if callable(newKeywords): if callable(newKeywords):
newKeywords = newKeywords() newKeywords = newKeywords()
except: except Except:
newKeywords = MV newKeywords = MV
if type(newKeywords) is StringType: if type(newKeywords) is StringType:
...@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex): ...@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex):
except TypeError: except TypeError:
return 0 return 0
self._unindex[documentId] = newKeywords self._unindex[documentId] = newKeywords[:] # Make a copy
return 1 return 1
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment