Commit 41eb6fe8 authored by Christopher Petrilli's avatar Christopher Petrilli

Merge of Lexicon cleanup and text index merging.

parent 78f3476f
...@@ -81,82 +81,108 @@ ...@@ -81,82 +81,108 @@
# many individuals on behalf of Digital Creations. Specific # many individuals on behalf of Digital Creations. Specific
# attributions are listed in the accompanying credits file. # attributions are listed in the accompanying credits file.
# #
############################################################################## #############################################################################
__doc__=""" Lexicon object that supports
"""
from Lexicon import Lexicon from Lexicon import Lexicon
from Splitter import Splitter from Splitter import Splitter
from intSet import intSet from intSet import intSet
from UnTextIndex import Or from UnTextIndex import Or
import re, time import re, string
import OIBTree, BTree, IOBTree, IIBTree import OIBTree, BTree, IOBTree, IIBTree
# Short cuts for common data containers
OIBTree = OIBTree.BTree # Object -> Integer OIBTree = OIBTree.BTree # Object -> Integer
OOBTree = BTree.BTree # Object -> Object OOBTree = BTree.BTree # Object -> Object
IOBTree = IOBTree.BTree # Integer -> Object IOBTree = IOBTree.BTree # Integer -> Object
IIBucket = IIBTree.Bucket # Integer -> Integer IIBucket = IIBTree.Bucket # Integer -> Integer
import pdb
class GlobbingLexicon(Lexicon): class GlobbingLexicon(Lexicon):
""" """Lexicon which supports basic globbing functions ('*' and '?').
This lexicon keeps several data structures around that are useful
for searching. They are:
'_lexicon' -- Contains the mapping from word => word_id
'_inverseLex' -- Contains the mapping from word_id => word
'_digrams' -- Contains a mapping from digram => word_id
Base class to support globbing lexicon object. Before going further, it is necessary to understand what a digram is,
as it is a core component of the structure of this lexicon. A digram
is a two-letter sequence in a word. For example, the word 'zope'
would be converted into the digrams::
['$z', 'zo', 'op', 'pe', 'e$']
where the '$' is a word marker. It is used at the beginning and end
of the words. Those digrams are significant.
""" """
multi_wc = '*' multi_wc = '*'
single_wc = '?' single_wc = '?'
eow = '$' eow = '$'
def __init__(self):
self.counter = 0 def __init__(self):
self.counter = 0 # word id counter XXX
self._lexicon = OIBTree() self._lexicon = OIBTree()
self._inverseLex = IOBTree() self._inverseLex = IOBTree()
self._digrams = OOBTree() self._digrams = OOBTree()
def set(self, word):
""" """ def createDigrams(self, word):
"""Returns a list with the set of digrams in the word."""
digrams = []
digrams.append(self.eow + word[0]) # Mark the beginning
for i in range(len(word)):
digrams.append(word[i:i+2])
digrams[-1] = digrams[-1] + self.eow # Mark the end
return digrams
def getWordId(self, word):
"""Provided 'word', return the matching integer word id."""
if self._lexicon.has_key(word): if self._lexicon.has_key(word):
return self._lexicon[word] return self._lexicon[word]
else: else:
word = intern(word) return self.assignWordId(word)
self._lexicon[word] = self.counter
self._inverseLex[self.counter] = word
## now, split the word into digrams and insert references set = getWordId # Kludge for old code
## to 'word' into the digram object. The first and last
## digrams in the list are specially marked with $ to
## indicate the beginning and end of the word
digrams = []
digrams.append(self.eow + word[0]) # mark the beginning
for i in range(len(word)): def assignWordId(self, word):
digrams.append(word[i:i+2]) """Assigns a new word id to the provided word, and return it."""
digrams[-1] = digrams[-1] + self.eow # mark the end # Double check it's not in the lexicon already, and if it is, just
# return it.
_digrams = self._digrams if self._lexicon.has_key(word):
return self._lexicon[word]
for digram in digrams:
set = _digrams.get(digram) # First we go ahead and put the forward and reverse maps in.
if set is None: self._lexicon[word] = self.counter
_digrams[digram] = set = intSet() self._inverseLex[self.counter] = word
set.insert(self.counter)
counter = self.counter # Now take all the digrams and insert them into the digram map.
self.counter = self.counter + 1 for digram in self.createDigrams(word):
return counter set = self._digrams.get(digram)
if set is None:
self._digrams[digram] = set = intSet()
set.insert(self.counter)
self.counter = self.counter + 1
return self.counter - 1 # Adjust for the previous increment
def get(self, pattern): def get(self, pattern):
""" Query the lexicon for words matching a pattern. """ Query the lexicon for words matching a pattern."""
"""
wc_set = [self.multi_wc, self.single_wc] wc_set = [self.multi_wc, self.single_wc]
digrams = [] digrams = []
...@@ -199,22 +225,22 @@ class GlobbingLexicon(Lexicon): ...@@ -199,22 +225,22 @@ class GlobbingLexicon(Lexicon):
## may contain all matching digrams, but in the wrong ## may contain all matching digrams, but in the wrong
## order. ## order.
expr = re.compile(self.translate(pattern)) expr = re.compile(self.createRegex(pattern))
words = [] words = []
hits = [] hits = []
for x in result.keys(): for x in result.keys():
if expr.match(self._inverseLex[x]): if expr.match(self._inverseLex[x]):
hits.append(x) hits.append(x)
return hits return hits
def __getitem__(self, word): def __getitem__(self, word):
""" """ """ """
return self.get(word) return self.get(word)
def query_hook(self, q):
"""expand wildcards
""" def query_hook(self, q):
"""expand wildcards"""
words = [] words = []
wids = [] wids = []
for w in q: for w in q:
...@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon): ...@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon):
return words return words
def Splitter(self, astring, words=None): def Splitter(self, astring, words=None):
""" wrap the splitter """ """ wrap the splitter """
...@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon): ...@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon):
return Splitter(astring) return Splitter(astring)
def translate(self, pat): def createRegex(self, pat):
"""Translate a PATTERN to a regular expression. """Translate a PATTERN to a regular expression.
There is no way to quote meta-characters. There is no way to quote meta-characters.
""" """
i, n = 0, len(pat) transTable = string.maketrans("", "")
res = ''
    while i < n:                                  # First, deal with multi-character globbing
c = pat[i] result = string.replace(pat, '*', '.*')
i = i+1
if c == self.multi_wc: # Next, we need to deal with single-character globbing
res = res + '.*' result = string.replace(result, '?', '.?')
elif c == self.single_wc:
res = res + '.?' # Now, we need to remove all of the characters that
else: # are forbidden.
res = res + re.escape(c) result = string.translate(result, transTable,
return res + '$' r'()&|!@#$%^{}\<>')
return "%s$" % result
...@@ -83,11 +83,6 @@ ...@@ -83,11 +83,6 @@
# #
############################################################################## ##############################################################################
import string, regex, ts_regex
import regsub
__doc__=""" Module breaks out Zope specific methods and behavior. In __doc__=""" Module breaks out Zope specific methods and behavior. In
addition, provides the Lexicon class which defines a word to integer addition, provides the Lexicon class which defines a word to integer
mapping. mapping.
...@@ -137,23 +132,33 @@ class Lexicon(Persistent, Implicit): ...@@ -137,23 +132,33 @@ class Lexicon(Persistent, Implicit):
self.stop_syn = stop_syn self.stop_syn = stop_syn
def set(self, word): def getWordId(self, word):
""" return the word id of 'word' """ """ return the word id of 'word' """
if self._lexicon.has_key(word): if self._lexicon.has_key(word):
return self._lexicon[word] return self._lexicon[word]
else: else:
if not hasattr(self, 'counter'): return self.assignWordId(word)
self.counter = 0
self._lexicon[intern(word)] = self.counter set = getWordId
self.counter = self.counter + 1
return self.counter - 1
def assignWordId(self, word):
"""Assigns a new word id to the provided word and returns it."""
# First make sure it's not already in there
if self._lexicon.has_key(word):
return self._lexicon[word]
if not hasattr(self, 'counter'):
self.counter = 0
self._lexicon[intern(word)] = self.counter
self.counter = self.counter + 1
return self.counter - 1
def get(self, key, default=None): def get(self, key, default=None):
""" """ """Return the matched word against the key."""
return [self._lexicon.get(key, default)] return [self._lexicon.getWordId(key, default)]
def __getitem__(self, key): def __getitem__(self, key):
......
...@@ -85,7 +85,7 @@ ...@@ -85,7 +85,7 @@
"""Simple column indices""" """Simple column indices"""
__version__='$Revision: 1.23 $'[11:-2] __version__='$Revision: 1.24 $'[11:-2]
...@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit): ...@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit):
('unindex_object could not remove ' ('unindex_object could not remove '
'integer id %s from index %s. This ' 'integer id %s from index %s. This '
'should not happen.' 'should not happen.'
% (str(i), str(k)))) % (str(documentId), str(self.id))))
else: else:
LOG(self.__class__.__name__, ERROR, LOG(self.__class__.__name__, ERROR,
('unindex_object tried to retrieve set %s ' ('unindex_object tried to retrieve set %s '
'from index %s but couldn\'t. This ' 'from index %s but couldn\'t. This '
'should not happen.' % (repr(set),str(k)))) 'should not happen.' % (repr(entry), str(self.id))))
def insertForwardIndexEntry(self, entry, documentId): def insertForwardIndexEntry(self, entry, documentId):
...@@ -212,7 +212,7 @@ class UnIndex(Persistent, Implicit): ...@@ -212,7 +212,7 @@ class UnIndex(Persistent, Implicit):
This will also deal with creating the entire row if necessary.""" This will also deal with creating the entire row if necessary."""
indexRow = self._index.get(entry, MV) indexRow = self._index.get(entry, MV)
# Make sure there's actually a row there already. If not, create # Make sure there's actually a row there already. If not, create
# an IntSet and stuff it in first. # an IntSet and stuff it in first.
if indexRow is MV: if indexRow is MV:
...@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit): ...@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit):
datum = getattr(obj, self.id) datum = getattr(obj, self.id)
if callable(datum): if callable(datum):
datum = datum() datum = datum()
except: except AttributeError:
datum = MV datum = MV
# We don't want to do anything that we don't have to here, so we'll # We don't want to do anything that we don't have to here, so we'll
# check to see if the new and existing information is the same. # check to see if the new and existing information is the same.
if not (datum == self._unindex.get(documentId, MV)): oldDatum = self._unindex.get(documentId, MV)
if not datum == oldDatum:
if oldDatum is not MV:
self.removeForwardIndexEntry(oldDatum, documentId)
self.insertForwardIndexEntry(datum, documentId) self.insertForwardIndexEntry(datum, documentId)
self._unindex[documentId] = datum self._unindex[documentId] = datum
returnStatus = 1 returnStatus = 1
self._p_changed = 1 # Tickle the transaction
return returnStatus return returnStatus
......
...@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex): ...@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex):
newKeywords = getattr(obj, self.id) newKeywords = getattr(obj, self.id)
if callable(newKeywords): if callable(newKeywords):
newKeywords = newKeywords() newKeywords = newKeywords()
except: except Except:
newKeywords = MV newKeywords = MV
if type(newKeywords) is StringType: if type(newKeywords) is StringType:
...@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex): ...@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex):
except TypeError: except TypeError:
return 0 return 0
self._unindex[documentId] = newKeywords self._unindex[documentId] = newKeywords[:] # Make a copy
return 1 return 1
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment