Commit 597b6934 authored by Tim Peters

A tiny start at introducing a base class for the cosine and Okapi
indexers.

CAUTION:  I'm sure I don't understand how persistency needs to be spelled.
Is it enough to say just that the base class derives from Persistent, or
does that need to be duplicated (or done instead exclusively) in the
derived classes?

Is there a point to keeping "import ZODB" in the derived-class files?  Is
there a point to keeping it anywhere <wink>?
parent 763e8d56
This diff is collapsed.
......@@ -21,6 +21,7 @@ from BTrees.IIBTree import IIBTree, IIBucket
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion
......@@ -43,12 +44,12 @@ def scaled_int(f, scale=SCALE_FACTOR):
# expensive.
return int(f * scale + 0.5)
class CosineIndex(Persistent):
class CosineIndex(BaseIndex):
__implements__ = IIndex
def __init__(self, lexicon):
self._lexicon = lexicon
BaseIndex.__init__(self, lexicon)
# wid -> { docid -> frequency }
self._wordinfo = IOBTree()
......@@ -56,18 +57,6 @@ class CosineIndex(Persistent):
# docid -> W(docid)
self._docweight = IIBTree()
# docid -> [ wid ]
# used for un-indexing
self._docwords = IOBTree()
def length(self):
"""Return the number of documents in the index."""
return len(self._docwords)
def get_words(self, docid):
"""Returns the wordids for a given docid"""
return WidCode.decode(self._docwords[docid])
# Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently
# implements the cosine similarity function described in Managing
......
......@@ -24,6 +24,7 @@ from BTrees.IIBTree import IIBTree, IIBucket
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion
......@@ -46,7 +47,7 @@ def scaled_int(f, scale=SCALE_FACTOR):
# expensive.
return int(f * scale + 0.5)
class OkapiIndex(Persistent):
class OkapiIndex(BaseIndex):
__implements__ = IIndex
......@@ -57,7 +58,7 @@ class OkapiIndex(Persistent):
assert 0.0 <= B <= 1.0
def __init__(self, lexicon):
self._lexicon = lexicon
BaseIndex.__init__(self, lexicon)
# wid -> {docid -> frequency}; t -> D -> f(D, t)
# There are two kinds of OOV words: wid 0 is explicitly OOV,
......@@ -78,18 +79,6 @@ class OkapiIndex(Persistent):
# used often enough that speed should matter.
self._totaldoclen = 0L
# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
self._docwords = IOBTree()
def length(self):
"""Return the number of documents in the index."""
return len(self._docwords)
def get_words(self, docid):
"""Returns the wordids for a given docid"""
return WidCode.decode(self._docwords[docid])
def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text)
self._doclen[docid] = len(wids)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment