A tiny start at introducing a base class for the cosine and Okapi
indexers. CAUTION: I'm sure I don't understand how persistency needs to be spelled. Is it enough to say just that the base class derives from Persistent, or does that need to be duplicated (or done instead exclusively) in the derived classes? Is there a point to keeping "import ZODB" in the derived-class files? Is there a point to keeping it anywhere <wink>?

A tiny start at introducing a base class for the cosine and Okapi
indexers. CAUTION: I'm sure I don't understand how persistency needs to be spelled. Is it enough to say just that the base class derives from Persistent, or does that need to be duplicated (or done instead exclusively) in the derived classes? Is there a point to keeping "import ZODB" in the derived-class files? Is there a point to keeping it anywhere <wink>?
597b6934 · Tim Peters · 763e8d56 · 597b6934 · 597b6934 · 597b6934
Commit 597b6934 authored May 17, 2002 by Tim Peters
3 changed files
--- a/lib/python/Products/ZCTextIndex/BaseIndex.py
+++ b/lib/python/Products/ZCTextIndex/BaseIndex.py
--- a/lib/python/Products/ZCTextIndex/CosineIndex.py
+++ b/lib/python/Products/ZCTextIndex/CosineIndex.py
@@ -21,6 +21,7 @@ from BTrees.IIBTree import IIBTree, IIBucket
 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex import WidCode
+from Products.ZCTextIndex.BaseIndex import BaseIndex
 from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
                                        mass_weightedUnion
@@ -43,12 +44,12 @@ def scaled_int(f, scale=SCALE_FACTOR):
    # expensive.
    return int(f * scale + 0.5)
-class CosineIndex(Persistent):
+class CosineIndex(BaseIndex):
    __implements__ = IIndex
    def __init__(self, lexicon):
-        self._lexicon = lexicon
+        BaseIndex.__init__(self, lexicon)
        # wid -> { docid -> frequency }
        self._wordinfo = IOBTree()
@@ -56,18 +57,6 @@ class CosineIndex(Persistent):
        # docid -> W(docid)
        self._docweight = IIBTree()
-        # docid -> [ wid ]
-        # used for un-indexing
-        self._docwords = IOBTree()
-    def length(self):
-        """Return the number of documents in the index."""
-        return len(self._docwords)
-    def get_words(self, docid):
-        """Returns the wordids for a given docid"""
-        return WidCode.decode(self._docwords[docid])
    # Most of the computation for computing a relevance score for the
    # document occurs in the search() method.  The code currently
    # implements the cosine similarity function described in Managing

--- a/lib/python/Products/ZCTextIndex/OkapiIndex.py
+++ b/lib/python/Products/ZCTextIndex/OkapiIndex.py
@@ -24,6 +24,7 @@ from BTrees.IIBTree import IIBTree, IIBucket
 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex import WidCode
+from Products.ZCTextIndex.BaseIndex import BaseIndex
 from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
                                        mass_weightedUnion
@@ -46,7 +47,7 @@ def scaled_int(f, scale=SCALE_FACTOR):
    # expensive.
    return int(f * scale + 0.5)
-class OkapiIndex(Persistent):
+class OkapiIndex(BaseIndex):
    __implements__ = IIndex
@@ -57,7 +58,7 @@ class OkapiIndex(Persistent):
    assert 0.0 <= B <= 1.0
    def __init__(self, lexicon):
-        self._lexicon = lexicon
+        BaseIndex.__init__(self, lexicon)
        # wid -> {docid -> frequency}; t -> D -> f(D, t)
        # There are two kinds of OOV words:  wid 0 is explicitly OOV,
@@ -78,18 +79,6 @@ class OkapiIndex(Persistent):
        # used often enough that speed should matter.
        self._totaldoclen = 0L
-        # docid -> WidCode'd list of wids
-        # Used for un-indexing, and for phrase search.
-        self._docwords = IOBTree()
-    def length(self):
-        """Return the number of documents in the index."""
-        return len(self._docwords)
-    def get_words(self, docid):
-        """Returns the wordids for a given docid"""
-        return WidCode.decode(self._docwords[docid])
    def index_doc(self, docid, text):
        wids = self._lexicon.sourceToWordIds(text)
        self._doclen[docid] = len(wids)