Merge of Lexicon cleanup and text index merging.

41eb6fe8 · Christopher Petrilli · 78f3476f · 41eb6fe8 · 41eb6fe8 · 41eb6fe8
Commit 41eb6fe8 authored Jan 26, 2001 by Christopher Petrilli
5 changed files
--- a/lib/python/SearchIndex/GlobbingLexicon.py
+++ b/lib/python/SearchIndex/GlobbingLexicon.py
@@ -81,82 +81,108 @@
 # many individuals on behalf of Digital Creations.  Specific
 # attributions are listed in the accompanying credits file.
 # 
-##############################################################################
-
-__doc__=""" Lexicon object that supports 
+#############################################################################

-"""
 from Lexicon import Lexicon
 from Splitter import Splitter
 from intSet import intSet
 from UnTextIndex import Or

-import re, time
+import re, string
 import OIBTree, BTree, IOBTree, IIBTree
+
+# Short cuts for common data containers
 OIBTree = OIBTree.BTree                 # Object -> Integer
 OOBTree = BTree.BTree                   # Object -> Object
 IOBTree = IOBTree.BTree                 # Integer -> Object
 IIBucket = IIBTree.Bucket               # Integer -> Integer

-import pdb
+
 class GlobbingLexicon(Lexicon):
-    """
+    """Lexicon which supports basic globbing function ('*' and '?').
+
+    This lexicon keeps several data structures around that are useful
+    for searching. They are:
+
+      '_lexicon' -- Contains the mapping from word => word_id
+
+      '_inverseLex' -- Contains the mapping from word_id => word
+
+      '_digrams' -- Contains a mapping from digram => word_id

-    Base class to support globbing lexicon object.
+    Before going further, it is necessary to understand what a digram is,
+    as it is a core component of the structure of this lexicon.  A digram
+    is a two-letter sequence in a word.  For example, the word 'zope'
+    would be converted into the digrams::
+
+      ['$z', 'zo', 'op', 'pe', 'e$']
+
+    where the '$' is a word marker.  It is used at the beginning and end
+    of the words.  Those digrams are significant.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'

-    def __init__(self):

-        self.counter = 0
+    def __init__(self):
+        self.counter = 0                # word id counter XXX
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

-    def set(self, word):
-        """  """
+
+    def createDigrams(self, word):
+        """Returns a list with the set of digrams in the word."""
+        digrams = []
+
+        digrams.append(self.eow + word[0])    # Mark the beginning
+
+        for i in range(len(word)):
+            digrams.append(word[i:i+2])
+
+        digrams[-1] = digrams[-1] + self.eow  # Mark the end
+
+        return digrams
+
+    
+    def getWordId(self, word):
+        """Provided 'word', return the matching integer word id."""

        if self._lexicon.has_key(word):
            return self._lexicon[word]
-
        else:
-            word = intern(word)
-            self._lexicon[word] = self.counter
-            self._inverseLex[self.counter] = word
+            return self.assignWordId(word)

-            ## now, split the word into digrams and insert references
-            ## to 'word' into the digram object.  The first and last
-            ## digrams in the list are specially marked with $ to
-            ## indicate the beginning and end of the word
+    set = getWordId                     # Kludge for old code

-            digrams = []
-            digrams.append(self.eow + word[0]) # mark the beginning

-            for i in range(len(word)):
-                digrams.append(word[i:i+2])
+    def assignWordId(self, word):
+        """Assigns a new word id to the provided word, and returns it."""

-            digrams[-1] = digrams[-1] + self.eow  # mark the end
-
-            _digrams = self._digrams
-            
-            for digram in digrams:
-                set = _digrams.get(digram)
-                if set is None:
-                    _digrams[digram] = set = intSet()
-                    
-                set.insert(self.counter)
+        # Double check it's not in the lexicon already, and if it is, just
+        # return it.
+        if self._lexicon.has_key(word):
+            return self._lexicon[word]
+        
+        # First we go ahead and put the forward and reverse maps in.
+        self._lexicon[word] = self.counter
+        self._inverseLex[self.counter] = word

-            counter = self.counter
-            self.counter = self.counter + 1
-            return counter
+        # Now take all the digrams and insert them into the digram map.
+        for digram in self.createDigrams(word):
+            set = self._digrams.get(digram)
+            if set is None:
+                self._digrams[digram] = set = intSet()
+            set.insert(self.counter)

+        self.counter = self.counter + 1
+        return self.counter - 1         # Adjust for the previous increment

+    
    def get(self, pattern):
-        """ Query the lexicon for words matching a pattern.
-        """
+        """ Query the lexicon for words matching a pattern."""
        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
@@ -199,22 +225,22 @@ class GlobbingLexicon(Lexicon):
            ## may contain all matching digrams, but in the wrong
            ## order.

-            expr = re.compile(self.translate(pattern))
+            expr = re.compile(self.createRegex(pattern))
            words = []
            hits = []
            for x in result.keys():
                if expr.match(self._inverseLex[x]):
                    hits.append(x)
            return hits
+
                
    def __getitem__(self, word):
        """ """
        return self.get(word)

-    def query_hook(self, q):
-        """expand wildcards

-        """
+    def query_hook(self, q):
+        """expand wildcards"""
        words = []
        wids = []
        for w in q:
@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon):

        return words

+
    def Splitter(self, astring, words=None):
        """ wrap the splitter """

@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon):
        return Splitter(astring)


-    def translate(self, pat):
+    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """

-        i, n = 0, len(pat)
-        res = ''
-        while i < n:
-            c = pat[i]
-            i = i+1
-            if c == self.multi_wc:
-                res = res + '.*'
-            elif c == self.single_wc:
-                res = res + '.?'
-            else:
-                res = res + re.escape(c)
-        return res + '$'
+        transTable = string.maketrans("", "")
+        
+        # First, deal with multi-character globbing
+        result = string.replace(pat, '*', '.*')
+
+        # Next, we need to deal with single-character globbing
+        result = string.replace(result, '?', '.?')
+
+        # Now, we need to remove all of the characters that
+        # are forbidden.
+        result = string.translate(result, transTable,
+                                  r'()&|!@#$%^{}\<>')
+
+        return "%s$" % result 
--- a/lib/python/SearchIndex/Lexicon.py
+++ b/lib/python/SearchIndex/Lexicon.py
@@ -83,11 +83,6 @@
 # 
 ##############################################################################

-import string, regex, ts_regex
-import regsub
-
-
-
 __doc__=""" Module breaks out Zope specific methods and behavior.  In
 addition, provides the Lexicon class which defines a word to integer
 mapping.
@@ -137,23 +132,33 @@ class Lexicon(Persistent, Implicit):
        self.stop_syn = stop_syn
        

-    def set(self, word):
+    def getWordId(self, word):
        """ return the word id of 'word' """

        if self._lexicon.has_key(word):
            return self._lexicon[word]
-
        else:
-            if not hasattr(self, 'counter'):
-                self.counter = 0
-            self._lexicon[intern(word)] = self.counter
-            self.counter = self.counter + 1
-            return self.counter - 1 
+            return self.assignWordId(word)
+
+    set = getWordId

    
+    def assignWordId(self, word):
+        """Assigns a new word id to the provided word and returns it."""
+        # First make sure it's not already in there
+        if self._lexicon.has_key(word):
+            return self._lexicon[word]
+        
+        if not hasattr(self, 'counter'):
+            self.counter = 0
+        self._lexicon[intern(word)] = self.counter
+        self.counter = self.counter + 1
+        return self.counter - 1 
+
+
    def get(self, key, default=None):
-        """  """
-        return [self._lexicon.get(key, default)]
+        """Return the matched word against the key."""
+        return [self._lexicon.getWordId(key, default)]


    def __getitem__(self, key):

--- a/lib/python/SearchIndex/UnIndex.py
+++ b/lib/python/SearchIndex/UnIndex.py
@@ -85,7 +85,7 @@

 """Simple column indices"""

-__version__='$Revision: 1.23 $'[11:-2]
+__version__='$Revision: 1.24 $'[11:-2]



@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit):
                    ('unindex_object could not remove '
                     'integer id %s from index %s.  This '
                     'should not happen.'
-                     % (str(i), str(k)))) 
+                     % (str(documentId), str(self.id)))) 
        else:
            LOG(self.__class__.__name__, ERROR,
                ('unindex_object tried to retrieve set %s '
                 'from index %s but couldn\'t.  This '
-                 'should not happen.' % (repr(set),str(k))))
+                 'should not happen.' % (repr(entry), str(self.id))))

        
    def insertForwardIndexEntry(self, entry, documentId):
@@ -212,7 +212,7 @@ class UnIndex(Persistent, Implicit):
        This will also deal with creating the entire row if necessary."""

        indexRow = self._index.get(entry, MV)
-
+        
        # Make sure there's actually a row there already.  If not, create
        # an IntSet and stuff it in first.
        if indexRow is MV:
@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit):
            datum = getattr(obj, self.id)
            if callable(datum):
                datum = datum()
-        except:
+        except AttributeError:
            datum = MV
 
        # We don't want to do anything that we don't have to here, so we'll
        # check to see if the new and existing information is the same.
-        if not (datum == self._unindex.get(documentId, MV)):
+        oldDatum = self._unindex.get(documentId, MV)
+        if not datum == oldDatum:
+            if oldDatum is not MV:
+                self.removeForwardIndexEntry(oldDatum, documentId)
            self.insertForwardIndexEntry(datum, documentId)
            self._unindex[documentId] = datum

            returnStatus = 1
-            self._p_changed = 1         # Tickle the transaction

        return returnStatus
    

--- a/lib/python/SearchIndex/UnKeywordIndex.py
+++ b/lib/python/SearchIndex/UnKeywordIndex.py
@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex):
            newKeywords = getattr(obj, self.id)
            if callable(newKeywords):
                newKeywords = newKeywords()
-        except:
+        except Except:
            newKeywords = MV

        if type(newKeywords) is StringType:
@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex):
            except TypeError:
                return 0
        
-        self._unindex[documentId] = newKeywords
+        self._unindex[documentId] = newKeywords[:] # Make a copy

        return 1
    

--- a/lib/python/SearchIndex/UnTextIndex.py
+++ b/lib/python/SearchIndex/UnTextIndex.py