Commit 41eb6fe8 authored by Christopher Petrilli

Merge of Lexicon cleanup and text index merging.

parent 78f3476f
@@ -81,82 +81,108 @@
# many individuals on behalf of Digital Creations. Specific
# attributions are listed in the accompanying credits file.
#
##############################################################################
__doc__ = """Lexicon object that supports globbing."""

from Lexicon import Lexicon
from Splitter import Splitter
from intSet import intSet
from UnTextIndex import Or

import re, string
import OIBTree, BTree, IOBTree, IIBTree

# Short cuts for common data containers
OIBTree = OIBTree.BTree      # Object -> Integer
OOBTree = BTree.BTree        # Object -> Object
IOBTree = IOBTree.BTree      # Integer -> Object
IIBucket = IIBTree.Bucket    # Integer -> Integer
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching.  They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => set of word_ids

    Before going further, it is necessary to understand what a digram
    is, as it is a core component of the structure of this lexicon.  A
    digram is a two-letter sequence in a word.  For example, the word
    'zope' would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where '$' is a word marker placed at the beginning and end of the
    word, so that boundary digrams are distinct from interior ones.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'
    def __init__(self):
        self.counter = 0              # word id counter
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()
    def createDigrams(self, word):
        """Return a list of the digrams in the word."""
        digrams = []
        digrams.append(self.eow + word[0])     # Mark the beginning

        for i in range(len(word)):
            digrams.append(word[i:i+2])

        digrams[-1] = digrams[-1] + self.eow   # Mark the end

        return digrams
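
A standalone sketch of the same digram scheme (plain Python, no Zope
dependencies; the function name is illustrative, not from the commit):

    def create_digrams(word, eow='$'):
        digrams = [eow + word[0]]           # leading word marker
        for i in range(len(word)):
            digrams.append(word[i:i+2])     # two-letter windows
        digrams[-1] = digrams[-1] + eow     # trailing word marker
        return digrams

    assert create_digrams('zope') == ['$z', 'zo', 'op', 'pe', 'e$']
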
    def getWordId(self, word):
        """Provided 'word', return the matching integer word id."""
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId                    # Kludge for old code

    def assignWordId(self, word):
        """Assign a new word id to the provided word, and return it."""
        # Double check it's not in the lexicon already, and if it is,
        # just return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # First we go ahead and put the forward and reverse maps in.
        self._lexicon[word] = self.counter
        self._inverseLex[self.counter] = word

        # Now take all the digrams and insert them into the digram map.
        # The first and last digrams in the list are specially marked
        # with '$' to indicate the beginning and end of the word.
        for digram in self.createDigrams(word):
            set = self._digrams.get(digram)
            if set is None:
                self._digrams[digram] = set = intSet()
            set.insert(self.counter)

        self.counter = self.counter + 1
        return self.counter - 1        # Adjust for the previous increment
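
A minimal sketch of the id-assignment flow, with plain dicts and sets
standing in for the OIBTree/IOBTree/intSet containers (it reuses the
create_digrams sketch above; all names here are illustrative):

    class TinyLexicon:
        def __init__(self):
            self.counter = 0
            self._lexicon = {}      # word -> word id
            self._inverseLex = {}   # word id -> word
            self._digrams = {}      # digram -> set of word ids

        def getWordId(self, word):
            if word in self._lexicon:
                return self._lexicon[word]
            # Forward and reverse maps first, then the digram map.
            wid = self.counter
            self._lexicon[word] = wid
            self._inverseLex[wid] = word
            for digram in create_digrams(word):
                self._digrams.setdefault(digram, set()).add(wid)
            self.counter = wid + 1
            return wid

    lex = TinyLexicon()
    assert lex.getWordId('zope') == 0
    assert lex.getWordId('zope') == 0     # stable on repeated lookups
    assert 0 in lex._digrams['$z']        # digram map points back at the word
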
    def get(self, pattern):
        """Query the lexicon for words matching a pattern."""
        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
@@ -199,7 +225,7 @@ class GlobbingLexicon(Lexicon):
        ## may contain all matching digrams, but in the wrong
        ## order.

        expr = re.compile(self.createRegex(pattern))
        words = []
        hits = []
        for x in result.keys():
@@ -207,14 +233,14 @@ class GlobbingLexicon(Lexicon):
            hits.append(x)

        return hits
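
The two-phase lookup the comments describe can be sketched standalone:
intersect the id sets of the pattern's literal digrams to get candidates,
then re-check each candidate against a derived regex, since candidates
may contain all matching digrams but in the wrong order.  The digram
extraction below is simplified and hypothetical, not the commit's code:

    import re

    def glob_lookup(pattern, digram_map, inverse_lex):
        # Phase 1: intersect the id sets of the literal digrams;
        # wildcards ('*', '?') contribute none.
        candidates = None
        for chunk in pattern.replace('*', ' ').replace('?', ' ').split():
            for i in range(len(chunk) - 1):
                ids = digram_map.get(chunk[i:i+2], set())
                if candidates is None:
                    candidates = set(ids)
                else:
                    candidates = candidates & ids
        if candidates is None:          # the pattern was all wildcards
            candidates = set(inverse_lex)

        # Phase 2: verify digram order by matching the actual words.
        expr = re.compile(pattern.replace('*', '.*').replace('?', '.?') + '$')
        return [wid for wid in candidates if expr.match(inverse_lex[wid])]
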
    def __getitem__(self, word):
        """ """
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards"""
        words = []
        wids = []
        for w in q:
@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon):
        return words
    def Splitter(self, astring, words=None):
        """ wrap the splitter """
@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon):
        return Splitter(astring)
    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """
        transTable = string.maketrans("", "")

        # First, deal with multi-character globbing.
        result = string.replace(pat, '*', '.*')

        # Next, deal with single-character globbing.
        result = string.replace(result, '?', '.?')

        # Now remove all of the characters that are forbidden.
        result = string.translate(result, transTable,
                                  r'()&|!@#$%^{}\<>')

        return "%s$" % result
@@ -83,11 +83,6 @@
#
##############################################################################

import string, regex, ts_regex
import regsub

__doc__ = """Module breaks out Zope specific methods and behavior.  In
addition, provides the Lexicon class which defines a word to integer
mapping.
@@ -137,13 +132,23 @@ class Lexicon(Persistent, Implicit):
        self.stop_syn = stop_syn

    def getWordId(self, word):
        """ return the word id of 'word' """
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""
        # First make sure it's not already in there.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        if not hasattr(self, 'counter'):
            self.counter = 0
        self._lexicon[intern(word)] = self.counter
@@ -152,8 +157,8 @@ class Lexicon(Persistent, Implicit):

    def get(self, key, default=None):
        """Return the matched word against the key."""
        return [self._lexicon.get(key, default)]

    def __getitem__(self, key):
...
@@ -85,7 +85,7 @@
"""Simple column indices"""

__version__ = '$Revision: 1.24 $'[11:-2]
@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit):
                    ('unindex_object could not remove '
                     'integer id %s from index %s.  This '
                     'should not happen.'
                     % (str(documentId), str(self.id))))
        else:
            LOG(self.__class__.__name__, ERROR,
                ('unindex_object tried to retrieve set %s '
                 'from index %s but couldn\'t.  This '
                 'should not happen.' % (repr(entry), str(self.id))))

    def insertForwardIndexEntry(self, entry, documentId):
@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit):
            datum = getattr(obj, self.id)
            if callable(datum):
                datum = datum()
        except AttributeError:
            datum = MV

        # We don't want to do anything that we don't have to here, so
        # we'll check to see if the new and existing information is the
        # same.
        oldDatum = self._unindex.get(documentId, MV)
        if not datum == oldDatum:
            if oldDatum is not MV:
                self.removeForwardIndexEntry(oldDatum, documentId)
            self.insertForwardIndexEntry(datum, documentId)
            self._unindex[documentId] = datum

            returnStatus = 1

        self._p_changed = 1             # Tickle the transaction
        return returnStatus
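
The update strategy above -- compare the new value with the stored
reverse-map entry and only touch the indexes on a real change -- in a
standalone sketch (MV stands in for the catalog's missing-value marker;
the container names are illustrative):

    MV = object()    # stand-in for the catalog's missing-value marker

    def reindex(forward, unindex, document_id, datum):
        old = unindex.get(document_id, MV)
        if datum == old:
            return 0                                      # nothing changed
        if old is not MV:
            forward.get(old, set()).discard(document_id)  # drop stale entry
        forward.setdefault(datum, set()).add(document_id)
        unindex[document_id] = datum
        return 1

    forward, unindex = {}, {}
    assert reindex(forward, unindex, 42, 'blue') == 1
    assert reindex(forward, unindex, 42, 'blue') == 0   # unchanged: skipped
    assert reindex(forward, unindex, 42, 'red') == 1    # moved out of 'blue'
    assert 42 not in forward['blue'] and 42 in forward['red']
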
...
@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex):
            newKeywords = getattr(obj, self.id)
            if callable(newKeywords):
                newKeywords = newKeywords()
        except AttributeError:
            newKeywords = MV

        if type(newKeywords) is StringType:
@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex):
        except TypeError:
            return 0

        self._unindex[documentId] = newKeywords[:]   # Make a copy
        return 1
......
......@@ -89,43 +89,58 @@ The UnTextIndex falls under the 'I didnt have a better name for it'
excuse. It is an 'Un' Text index because it stores a little bit of
undo information so that objects can be unindexed when the old value
is no longer known.
"""
__version__ = '$Revision: 1.35 $'[11:-2]
"""
__version__='$Revision: 1.34 $'[11:-2]
from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree
from Acquisition import Implicit
from intSet import intSet
import operator
from Splitter import Splitter
import string, regex, regsub, ts_regex
from zLOG import LOG, ERROR
from Lexicon import Lexicon
from ResultList import ResultList
from types import *

BTree = BTree.BTree          # Regular generic BTree
IOBTree = IOBTree.BTree      # Integer -> Object
IIBucket = IIBTree.Bucket    # Integer -> Integer
OIBTree = OIBTree.BTree      # Object -> Integer

AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'

QueryError = 'TextIndex.QueryError'
class UnTextIndex(Persistent, Implicit):
    """Full-text index.

    There is a ZCatalog UML model that sheds some light on what is
    going on here.  '_index' is a BTree which maps word ids to a
    mapping from document id to score.  Something like:

      {'bob' : {1 : 5, 2 : 3, 42 : 9}}
      {'uncle' : {1 : 1}}

    The '_unindex' attribute is a mapping from document id to word
    ids.  This mapping allows the catalog to unindex an object:

      {42 : ('bob', 'is', 'your', 'uncle')}

    This isn't exactly how things are represented in memory; many
    optimizations happen along the way.
    """

    meta_type = 'Text Index'
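
A toy model of the two structures the docstring describes, with plain
dicts in place of the BTrees (real keys are integer word ids from the
Lexicon; strings are used only to mirror the docstring's example):

    _index = {
        'bob':   {1: 5, 2: 3, 42: 9},        # word -> {document id: score}
        'uncle': {1: 1},
    }
    _unindex = {
        42: ('bob', 'is', 'your', 'uncle'),  # document id -> words
    }

    # Unindexing document 42 walks its reverse entry and prunes the
    # forward map -- no need to still have the original object.
    for word in _unindex.pop(42):
        if word in _index:
            _index[word].pop(42, None)
            if not _index[word]:
                del _index[word]

    assert 42 not in _index.get('bob', {})
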
    def __init__(self, id=None, ignore_ex=None,
                 call_methods=None, lexicon=None):
        """Create an index
@@ -142,49 +157,33 @@ class UnTextIndex(Persistent, Implicit):
        of getattr or getitem to get an attribute.

        'lexicon' is the lexicon object to use; if None, the index
        will use a private lexicon."""

        if not id == ignore_ex == call_methods == None:
            self.id = id
            self.ignore_ex = ignore_ex
            self.call_methods = call_methods
            self._index = IOBTree()
            self._unindex = IOBTree()

        else:
            pass

        if lexicon is None:
            ## if no lexicon is provided, create a default one
            self._lexicon = Lexicon()
        else:
            self._lexicon = lexicon
    def getLexicon(self, vocab_id):
        """Return the Lexicon in use.

        Bit of a hack: indexes have been made acquirers so that they
        can acquire a vocabulary object from the object system in
        Zope.  I don't think indexes were ever intended to participate
        in this way, but I don't see too much of a problem with it."""

        if type(vocab_id) is not StringType:
            vocab = vocab_id
        else:
@@ -193,10 +192,14 @@ class UnTextIndex(Persistent, Implicit):

    def __len__(self):
        """Return the number of objects indexed."""
        return len(self._unindex)

    def clear(self):
        """Reinitialize the text index."""
        self._index = IOBTree()
        self._unindex = IOBTree()
@@ -214,6 +217,10 @@ class UnTextIndex(Persistent, Implicit):

    def getEntryForObject(self, rid, default=None):
        """Get all information contained for a specific object.

        This takes the object's record ID as its main argument."""

        wordMap = self.getLexicon(self._lexicon)._lexicon.items()
        results = self._unindex.get(rid, None)
@@ -247,12 +254,21 @@ class UnTextIndex(Persistent, Implicit):
            # Tuples are only used for rows which have only
            # a single entry.  Since we now need more, we'll
            # promote it to a mapping object (dictionary).

            # First, make sure we're not already in it; if so,
            # update the score if necessary.
            if indexRow[0] == documentId:
                if indexRow[1] != score:
                    indexRow = (documentId, score)
            else:
                indexRow = { indexRow[0]: indexRow[1] }
                indexRow[documentId] = score
            self._index[entry] = indexRow
        elif type(indexRow) is DictType:
            if indexRow.has_key(documentId):
                if indexRow[documentId] == score:
                    return 1            # No need to update
            elif len(indexRow) > 4:
                # We have a mapping (dictionary), but it has
                # grown too large, so we'll convert it to a
                # bucket.
@@ -266,6 +282,9 @@ class UnTextIndex(Persistent, Implicit):
                indexRow[documentId] = score
            else:
                # We've got an IIBucket already.
                if indexRow.has_key(documentId):
                    if indexRow[documentId] == score:
                        return 1
                indexRow[documentId] = score
        else:
            # We don't have any information at this point, so we'll
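
The storage-promotion strategy above -- a bare tuple for a single
posting, a dict for a handful, and an IIBucket beyond that -- in a
standalone sketch (the threshold of 4 matches the code; a dict subclass
stands in for IIBucket):

    class Bucket(dict):
        """Stand-in for IIBucket in this sketch."""

    def insert_posting(index, word_id, doc_id, score):
        row = index.get(word_id)
        if row is None:
            index[word_id] = (doc_id, score)              # single posting
        elif isinstance(row, tuple):
            if row[0] == doc_id:
                index[word_id] = (doc_id, score)          # update in place
            else:                                         # promote to dict
                index[word_id] = {row[0]: row[1], doc_id: score}
        elif not isinstance(row, Bucket) and len(row) > 4:
            index[word_id] = Bucket(row)                  # promote to bucket
            index[word_id][doc_id] = score
        else:
            row[doc_id] = score

    index = {}
    insert_posting(index, 7, 1, 5)
    assert index[7] == (1, 5)
    insert_posting(index, 7, 2, 3)
    assert index[7] == {1: 5, 2: 3}
    for d in range(3, 9):
        insert_posting(index, 7, d, 1)
    assert isinstance(index[7], Bucket)
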
@@ -277,13 +296,43 @@ class UnTextIndex(Persistent, Implicit):

    def insertReverseIndexEntry(self, entry, documentId):
        """Insert the correct entry into the reverse indexes for
        future unindexing."""

        newRow = self._unindex.get(documentId, [])
        if newRow:
            # Catch cases where we don't need to modify anything.
            if entry in newRow:
                return 1
        newRow.append(entry)
        self._unindex[documentId] = newRow
    def removeReverseEntry(self, entry, documentId):
        """Remove a single entry from the reverse index."""
        newRow = self._unindex.get(documentId, [])
        if newRow:
            try:
                newRow.remove(entry)
            except ValueError:
                pass    # We didn't have it; this shouldn't happen
            self._unindex[documentId] = newRow

    def removeForwardEntry(self, entry, documentId):
        """Remove a single entry from the forward index."""
        currentRow = self._index.get(entry, None)
        if type(currentRow) is TupleType:
            del self._index[entry]
        elif currentRow is not None:
            try:
                del self._index[entry][documentId]
            except (KeyError, IndexError, TypeError):
                LOG('UnTextIndex', ERROR,
                    'unindex_object tried to unindex nonexistent'
                    ' document %s' % str(documentId))
    def index_object(self, documentId, obj, threshold=None):
        """ Index an object:

        'documentId' is the integer id of the document
@@ -301,7 +350,7 @@ class UnTextIndex(Persistent, Implicit):
                source = str(source())
            else:
                source = str(source)
        except AttributeError:
            return 0
@@ -322,32 +371,36 @@ class UnTextIndex(Persistent, Implicit):
            else:
                wordList[word] = 1

        lexicon = self.getLexicon(self._lexicon)
        currentWordIds = self._unindex.get(documentId, [])
        wordCount = 0

        # First deal with deleted words.  To do that, convert the new
        # word list from words to word ids, so it can be compared
        # against the existing (already id-based) entries.
        wordListAsIds = OIBTree()
        for word, score in wordList.items():
            if threshold is not None:
                if ((wordCount % threshold) == 0) and not (wordCount == 0):
                    # commit a subtransaction hack
                    get_transaction().commit(1)
                    # kick the cache
                    self._p_jar.cacheFullSweep(1)
            wordListAsIds[lexicon.getWordId(word)] = score

        for word in currentWordIds:
            if not wordListAsIds.has_key(word):
                self.removeForwardEntry(word, documentId)

        # Now we can deal with new/updated entries.
        for wordId, score in wordListAsIds.items():
            self.insertForwardIndexEntry(wordId, documentId, score)
            self.insertReverseIndexEntry(wordId, documentId)
            wordCount = wordCount + 1

        # Return the number of words indexed.
        return wordCount
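
The incremental update above, reduced to a standalone sketch: convert
the new words to ids, drop forward entries for ids that disappeared,
then insert or update the rest ('word_ids_for' plays the lexicon's
role; all names are illustrative):

    def reindex_document(doc_id, word_scores, word_ids_for, forward, unindex):
        new_as_ids = {}
        for word, score in word_scores.items():
            new_as_ids[word_ids_for(word)] = score

        # Remove forward entries for words no longer in the document.
        for wid in unindex.get(doc_id, []):
            if wid not in new_as_ids:
                forward.get(wid, {}).pop(doc_id, None)

        # Insert or update the current words, forward and reverse.
        for wid, score in new_as_ids.items():
            forward.setdefault(wid, {})[doc_id] = score
        unindex[doc_id] = list(new_as_ids)
        return len(new_as_ids)
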
    def unindex_object(self, i):
        """ carefully unindex document with integer id 'i' from the text
        index and do not fail if it does not exist """

        index = self._index
        unindex = self._unindex
        val = unindex.get(i, None)
@@ -385,7 +438,7 @@ class UnTextIndex(Persistent, Implicit):
        if len(splitSource) == 1:
            splitSource = splitSource[0]
            if splitSource[:1] == '"' and splitSource[-1:] == '"':
                return self[splitSource]

            r = self._index.get(
@@ -429,13 +482,13 @@ class UnTextIndex(Persistent, Implicit):
            return None

        if type(keys) is StringType:
            if not keys or not string.strip(keys):
                return None
            keys = [keys]

        r = None
        for key in keys:
            key = string.strip(key)
            if not key:
                continue
@@ -480,11 +533,11 @@ class UnTextIndex(Persistent, Implicit):

    def _subindex(self, isrc, d, old, last):

        src = self.getLexicon(self._lexicon).Splitter(isrc)

        for s in src:
            if s[0] == '\"':
                last = self.subindex(s[1:-1], d, old, last)
            else:
                if old(s):
                    if s != last:
                        d[s] = d[s] + 1
@@ -493,15 +546,12 @@ class UnTextIndex(Persistent, Implicit):

        return last
    def query(self, s, default_operator=Or, ws=(string.whitespace,)):
        """ This is called by TextIndexes.  A 'query term' which is a
        string 's' is passed in, along with an index object.  s is
        parsed, then the wildcards are parsed, then something is
        parsed again, then the whole thing is 'evaluated'. """

        # First replace any occurrences of " and not " with " andnot ".
        s = ts_regex.gsub(
            '[%s]+[aA][nN][dD][%s]*[nN][oO][tT][%s]+' % (ws * 3),
def get_operands(self, q, i):
'''Evaluate and return the left and right operands for an operator'''
"""Evaluate and return the left and right operands for an operator"""
try:
left = q[i - 1]
right = q[i + 1]
@@ -550,7 +601,7 @@ class UnTextIndex(Persistent, Implicit):

    def evaluate(self, query):
        """Evaluate a parsed query"""

        # There are two options if the query passed in is only one
        # item.  Either it's an embedded query, in which case we'll
        # recursively evaluate it, or it's nothing for us
def parse(s):
'''Parse parentheses and quotes'''
"""Parse parentheses and quotes"""
l = []
tmp = string.lower(s)
......@@ -625,10 +676,10 @@ def parse(s):
return l
def parse2(q, default_operator,
           operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}):
    """Find operators and operands"""

    i = 0
    isop = operator_dict.has_key
    while i < len(q):
        if type(q[i]) is ListType:
            q[i] = parse2(q[i], default_operator)
@@ -646,9 +697,9 @@ def parse2(q, default_operator,
    return q
def parens(s, parens_re=regex.compile('(\|)').search):

    index = open_index = paren_count = 0

    while 1:
        index = parens_re(s, index)
@@ -672,7 +723,7 @@ def parens(s, parens_re=regex.compile('(\|)').search):
def quotes(s, ws=(string.whitespace,)):
    # split up quoted regions
    splitted = ts_regex.split(s, '[%s]*\"[%s]*' % (ws * 2))
    split = string.split
...