Commit b8c3a39b authored by Michel Pelletier's avatar Michel Pelletier

Added stop word interface to Lexicon.

parent 8e6e5acb
...@@ -91,7 +91,7 @@ from Persistence import Persistent ...@@ -91,7 +91,7 @@ from Persistence import Persistent
from OFS.SimpleItem import Item from OFS.SimpleItem import Item
from SearchIndex import Lexicon, GlobbingLexicon from SearchIndex import Lexicon, GlobbingLexicon
from VocabularyInterface import VocabularyInterface from SearchIndex.Lexicon import stop_word_dict
manage_addVocabularyForm=HTMLFile('addVocabulary',globals()) manage_addVocabularyForm=HTMLFile('addVocabulary',globals())
...@@ -113,7 +113,6 @@ class Vocabulary(Item, Persistent, Implicit): ...@@ -113,7 +113,6 @@ class Vocabulary(Item, Persistent, Implicit):
meta_type = "Vocabulary" meta_type = "Vocabulary"
_isAVocabulary = 1 _isAVocabulary = 1
__extends__=(VocabularyInterface,)
manage_options=( manage_options=(
...@@ -137,6 +136,7 @@ class Vocabulary(Item, Persistent, Implicit): ...@@ -137,6 +136,7 @@ class Vocabulary(Item, Persistent, Implicit):
['Anonymous', 'Manager']), ['Anonymous', 'Manager']),
) )
## manage_main = HTMLFile('vocab_manage_main', globals()) ## manage_main = HTMLFile('vocab_manage_main', globals())
manage_vocabulary = HTMLFile('manage_vocab', globals()) manage_vocabulary = HTMLFile('manage_vocab', globals())
...@@ -151,7 +151,7 @@ class Vocabulary(Item, Persistent, Implicit): ...@@ -151,7 +151,7 @@ class Vocabulary(Item, Persistent, Implicit):
if globbing: if globbing:
self.lexicon = GlobbingLexicon.GlobbingLexicon() self.lexicon = GlobbingLexicon.GlobbingLexicon()
else: else:
self.lexicon = Lexicon.Lexicon() self.lexicon = Lexicon.Lexicon(stop_word_dict)
def query(self, pattern): def query(self, pattern):
""" """ """ """
...@@ -171,6 +171,11 @@ class Vocabulary(Item, Persistent, Implicit): ...@@ -171,6 +171,11 @@ class Vocabulary(Item, Persistent, Implicit):
if RESPONSE: if RESPONSE:
RESPONSE.redirect(URL1 + '/manage_vocabulary') RESPONSE.redirect(URL1 + '/manage_vocabulary')
def manage_stop_syn(self, stop_syn, REQUEST=None):
    """Accept the stop-word/synonym list posted by the manage_vocab form.

    stop_syn -- list of lines from the 'stop_syn:lines' textarea
    REQUEST  -- the Zope request object, when invoked from the web
    """
    # TODO(review): stub — the manage_vocab form posts 'stop_syn:lines'
    # here, but nothing is stored yet; presumably this should hand the
    # mapping to the lexicon via set_stop_syn. Confirm intent.
    pass
def insert(self, word=''): def insert(self, word=''):
self.lexicon.set(word) self.lexicon.set(word)
......
<html>
<head>
<title>Edit <dtml-var title_or_id></title>
</head>
<body bgcolor="#ffffff" link="#000099" vlink="#555555" alink="#77003b">
<dtml-var manage_tabs>
<!-- Stop-word / synonym editing form: posts one entry per line to
     manage_stop_syn via Zope's ':lines' form-marshalling converter. -->
<form action="manage_stop_syn" method="POST">
<textarea name="stop_syn:lines" rows="20" cols="60">
</textarea>
<br>
<!-- Bug fix: the form had no submit control, so it could never be
     submitted from a browser. -->
<input type="submit" value="Save Changes">
</form>
<br>
</body>
</html>
...@@ -239,6 +239,14 @@ class GlobbingLexicon(Lexicon): ...@@ -239,6 +239,14 @@ class GlobbingLexicon(Lexicon):
return words return words
def Splitter(self, astring, words=None):
    """ wrap the splitter

    Note: 'words' (a stop-word/synonym mapping) is accepted for
    signature compatibility with Lexicon.Splitter but deliberately
    ignored here.
    """
    ## don't do anything, less efficient but there's not much
    ## sense in stemming a globbing lexicon.
    return Splitter(astring)
def translate(self, pat): def translate(self, pat):
"""Translate a PATTERN to a regular expression. """Translate a PATTERN to a regular expression.
......
...@@ -113,11 +113,27 @@ class Lexicon(Persistent, Implicit): ...@@ -113,11 +113,27 @@ class Lexicon(Persistent, Implicit):
""" """
counter = 0
def __init__(self): def __init__(self, stop_syn=None):
self._lexicon = OIBTree() self._lexicon = OIBTree()
self.counter = 0 self.counter = 0
if stop_syn is None:
self.stop_syn = {}
else:
self.stop_syn = {}
def set_stop_syn(self, stop_syn):
    """ pass in a mapping of stop words and synonyms.  Format is:

    {'word' : [syn1, syn2, ..., synx]}

    Vocabularies do not necessarily need to implement this if their
    splitters do not support stemming or stopping.
    """
    # Bug fix: the first parameter was misspelled 'selfb', so the
    # body's reference to 'self' raised NameError on every call.
    self.stop_syn = stop_syn
def set(self, word): def set(self, word):
""" return the word id of 'word' """ """ return the word id of 'word' """
...@@ -142,8 +158,11 @@ class Lexicon(Persistent, Implicit): ...@@ -142,8 +158,11 @@ class Lexicon(Persistent, Implicit):
def __len__(self): def __len__(self):
return len(self._lexicon) return len(self._lexicon)
def Splitter(self, astring, words=None):
    """ wrap the splitter

    astring -- the text to split
    words   -- optional stop-word/synonym mapping; defaults to this
    lexicon's own stop_syn mapping.
    """
    # Bug fix: the original assigned to 'word' (typo), so the default
    # stop_syn mapping was never actually passed to the splitter and
    # stop words were not stripped.
    if words is None:
        words = self.stop_syn
    return Splitter(astring, words)
def grep(self, query): def grep(self, query):
......
...@@ -92,7 +92,7 @@ is no longer known. ...@@ -92,7 +92,7 @@ is no longer known.
""" """
__version__='$Revision: 1.21 $'[11:-2] __version__='$Revision: 1.22 $'[11:-2]
from Globals import Persistent from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree import BTree, IIBTree, IOBTree, OIBTree
...@@ -164,7 +164,6 @@ class UnTextIndex(Persistent, Implicit): ...@@ -164,7 +164,6 @@ class UnTextIndex(Persistent, Implicit):
self.call_methods=call_methods self.call_methods=call_methods
self._index=IOBTree() self._index=IOBTree()
self._unindex=IOBTree() self._unindex=IOBTree()
self._syn=stop_word_dict
else: else:
pass pass
...@@ -177,6 +176,11 @@ class UnTextIndex(Persistent, Implicit): ...@@ -177,6 +176,11 @@ class UnTextIndex(Persistent, Implicit):
self._lexicon = lexicon self._lexicon = lexicon
def __setstate__(self, state):
    """Restore pickled state and drop the obsolete '_syn' attribute
    left over from instances created before stop words moved into the
    Lexicon.

    Bug fix: the method was named '__setstate' (missing trailing
    underscores, and subject to class-private name mangling), so the
    persistence machinery never invoked it and stale '_syn'
    attributes were never cleaned up.
    """
    Persistent.__setstate__(self, state)
    if hasattr(self, '_syn'):
        del self._syn
def getLexicon(self, vocab_id): def getLexicon(self, vocab_id):
""" bit of a hack, indexes have been made acquirers so that """ bit of a hack, indexes have been made acquirers so that
...@@ -194,10 +198,10 @@ class UnTextIndex(Persistent, Implicit): ...@@ -194,10 +198,10 @@ class UnTextIndex(Persistent, Implicit):
def __len__(self): def __len__(self):
return len(self._unindex) return len(self._unindex)
def __setstate__(self, state): ## def __setstate__(self, state):
Persistent.__setstate__(self, state) ## Persistent.__setstate__(self, state)
if not hasattr(self, '_lexicon'): ## if not hasattr(self, '_lexicon'):
self._lexicon = Lexicon() ## self._lexicon = Lexicon()
def clear(self): def clear(self):
...@@ -240,7 +244,11 @@ class UnTextIndex(Persistent, Implicit): ...@@ -240,7 +244,11 @@ class UnTextIndex(Persistent, Implicit):
## The Splitter should now be european compliant at least. ## The Splitter should now be european compliant at least.
## Someone should test this. ## Someone should test this.
src = self.getLexicon(self._lexicon).Splitter(k, self._syn)
## import pdb
## pdb.set_trace()
src = self.getLexicon(self._lexicon).Splitter(k)
## This returns a tuple of stemmed words. Stopwords have been ## This returns a tuple of stemmed words. Stopwords have been
## stripped. ## stripped.
...@@ -324,7 +332,7 @@ class UnTextIndex(Persistent, Implicit): ...@@ -324,7 +332,7 @@ class UnTextIndex(Persistent, Implicit):
def __getitem__(self, word): def __getitem__(self, word):
"""Return an InvertedIndex-style result "list" """Return an InvertedIndex-style result "list"
""" """
src = tuple(self.getLexicon(self._lexicon).Splitter(word, self._syn)) src = tuple(self.getLexicon(self._lexicon).Splitter(word))
if not src: return ResultList({}, (word,), self) if not src: return ResultList({}, (word,), self)
if len(src) == 1: if len(src) == 1:
src=src[0] src=src[0]
...@@ -412,13 +420,13 @@ class UnTextIndex(Persistent, Implicit): ...@@ -412,13 +420,13 @@ class UnTextIndex(Persistent, Implicit):
r = [] r = []
for word in words: for word in words:
r = r+self.getLexicon(self._lexicon).Splitter(doc, self._syn).indexes(word) r = r+self.getLexicon(self._lexicon).Splitter(doc).indexes(word)
return r return r
def _subindex(self, isrc, d, old, last): def _subindex(self, isrc, d, old, last):
src = self.getLexicon(self._lexicon).Splitter(isrc, self._syn) src = self.getLexicon(self._lexicon).Splitter(isrc)
for s in src: for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last) if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment