Commit 597b6934 authored by Tim Peters

A tiny start at introducing a base class for the cosine and Okapi indexers.

CAUTION:  I'm sure I don't understand how persistency needs to be spelled.
Is it enough to say just that the base class derives from Persistent, or
does that need to be duplicated (or done instead exclusively) in the
derived classes?

Is there a point to keeping "import ZODB" in the derived-class files?  Is
there a point to keeping it anywhere <wink>?
parent 763e8d56
This diff is collapsed.
...@@ -21,6 +21,7 @@ from BTrees.IIBTree import IIBTree, IIBucket ...@@ -21,6 +21,7 @@ from BTrees.IIBTree import IIBTree, IIBucket
from Products.ZCTextIndex.IIndex import IIndex from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \ from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion mass_weightedUnion
...@@ -43,12 +44,12 @@ def scaled_int(f, scale=SCALE_FACTOR): ...@@ -43,12 +44,12 @@ def scaled_int(f, scale=SCALE_FACTOR):
# expensive. # expensive.
return int(f * scale + 0.5) return int(f * scale + 0.5)
class CosineIndex(Persistent): class CosineIndex(BaseIndex):
__implements__ = IIndex __implements__ = IIndex
def __init__(self, lexicon): def __init__(self, lexicon):
self._lexicon = lexicon BaseIndex.__init__(self, lexicon)
# wid -> { docid -> frequency } # wid -> { docid -> frequency }
self._wordinfo = IOBTree() self._wordinfo = IOBTree()
...@@ -56,18 +57,6 @@ class CosineIndex(Persistent): ...@@ -56,18 +57,6 @@ class CosineIndex(Persistent):
# docid -> W(docid) # docid -> W(docid)
self._docweight = IIBTree() self._docweight = IIBTree()
# docid -> [ wid ]
# used for un-indexing
self._docwords = IOBTree()
def length(self):
"""Return the number of documents in the index."""
return len(self._docwords)
def get_words(self, docid):
"""Returns the wordids for a given docid"""
return WidCode.decode(self._docwords[docid])
# Most of the computation for computing a relevance score for the # Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently # document occurs in the search() method. The code currently
# implements the cosine similarity function described in Managing # implements the cosine similarity function described in Managing
......
...@@ -24,6 +24,7 @@ from BTrees.IIBTree import IIBTree, IIBucket ...@@ -24,6 +24,7 @@ from BTrees.IIBTree import IIBTree, IIBucket
from Products.ZCTextIndex.IIndex import IIndex from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \ from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion mass_weightedUnion
...@@ -46,7 +47,7 @@ def scaled_int(f, scale=SCALE_FACTOR): ...@@ -46,7 +47,7 @@ def scaled_int(f, scale=SCALE_FACTOR):
# expensive. # expensive.
return int(f * scale + 0.5) return int(f * scale + 0.5)
class OkapiIndex(Persistent): class OkapiIndex(BaseIndex):
__implements__ = IIndex __implements__ = IIndex
...@@ -57,7 +58,7 @@ class OkapiIndex(Persistent): ...@@ -57,7 +58,7 @@ class OkapiIndex(Persistent):
assert 0.0 <= B <= 1.0 assert 0.0 <= B <= 1.0
def __init__(self, lexicon): def __init__(self, lexicon):
self._lexicon = lexicon BaseIndex.__init__(self, lexicon)
# wid -> {docid -> frequency}; t -> D -> f(D, t) # wid -> {docid -> frequency}; t -> D -> f(D, t)
# There are two kinds of OOV words: wid 0 is explicitly OOV, # There are two kinds of OOV words: wid 0 is explicitly OOV,
...@@ -78,18 +79,6 @@ class OkapiIndex(Persistent): ...@@ -78,18 +79,6 @@ class OkapiIndex(Persistent):
# used often enough that speed should matter. # used often enough that speed should matter.
self._totaldoclen = 0L self._totaldoclen = 0L
# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
self._docwords = IOBTree()
def length(self):
"""Return the number of documents in the index."""
return len(self._docwords)
def get_words(self, docid):
"""Returns the wordids for a given docid"""
return WidCode.decode(self._docwords[docid])
def index_doc(self, docid, text): def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text) wids = self._lexicon.sourceToWordIds(text)
self._doclen[docid] = len(wids) self._doclen[docid] = len(wids)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment