Commit 72ed10fe authored by Tim Peters

Pushed the subclassing far enough to be useful. More is needed, but
I need a break.
parent 597b6934
@@ -51,8 +51,8 @@ class CosineIndex(BaseIndex):
     def __init__(self, lexicon):
         BaseIndex.__init__(self, lexicon)
 
-        # wid -> { docid -> frequency }
-
+        # ._wordinfo for cosine is wid -> {docid -> weight};
+        # t -> D -> w(d, t)/W(d)
         self._wordinfo = IOBTree()
         # docid -> W(docid)
         self._docweight = IIBTree()
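
For context, here is a rough sketch of the weighting the new comment describes: each stored value is w(d, t)/W(d), where W(d) is the Euclidean length of the document's weight vector, so every document vector has unit length. The helper name and the 1 + log(f) term weight are assumptions for illustration; the exact w() used by CosineIndex is not shown in this hunk.

import math

def cosine_weights(freqs):
    # freqs: {wid -> f(d, t)} for a single document.
    # Assumed term weight: w(d, t) = 1 + ln(f(d, t)).
    raw = dict((wid, 1.0 + math.log(f)) for wid, f in freqs.items())
    # W(d) = sqrt(sum over t of w(d, t)**2); the index keeps this
    # per-document normalizer in _docweight.
    W = math.sqrt(sum(w * w for w in raw.values()))
    # Stored per-wid value: w(d, t)/W(d), kept in _wordinfo[wid][docid].
    return dict((wid, w / W) for wid, w in raw.items()), W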
@@ -102,33 +102,6 @@ class CosineIndex(BaseIndex):
         del self._docwords[docid]
         del self._docweight[docid]
 
-    def search(self, term):
-        wids = self._lexicon.termToWordIds(term)
-        if not wids:
-            return None  # All docs match
-        if 0 in wids:
-            wids = filter(None, wids)
-        return mass_weightedUnion(self._search_wids(wids))
-
-    def search_glob(self, pattern):
-        wids = self._lexicon.globToWordIds(pattern)
-        return mass_weightedUnion(self._search_wids(wids))
-
-    def search_phrase(self, phrase):
-        wids = self._lexicon.termToWordIds(phrase)
-        if 0 in wids:
-            return IIBTree()
-        hits = mass_weightedIntersection(self._search_wids(wids))
-        if not hits:
-            return hits
-        code = WidCode.encode(wids)
-        result = IIBTree()
-        for docid, weight in hits.items():
-            docwords = self._docwords[docid]
-            if docwords.find(code) >= 0:
-                result[docid] = weight
-        return result
-
     def _search_wids(self, wids):
         if not wids:
             return []
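
The deleted search_phrase shows the phrase trick both indexes share: a document's word sequence is stored as an encoded string (_docwords[docid]), and a phrase matches only if the encoding of its wids appears as a contiguous substring. Below is a toy sketch of that final filtering step, with a hypothetical fixed-width stand-in for WidCode (the real encoding is more compact); the weighted intersection has already narrowed hits to docs containing all the words somewhere.

def _toy_encode(wids):
    # Stand-in for WidCode.encode: fixed-width tokens guarantee the
    # phrase encoding can only match at word boundaries.
    return ''.join('<%08x>' % wid for wid in wids)

def phrase_filter(hits, wids, docwords):
    # hits: {docid -> weight} from the weighted intersection;
    # docwords: {docid -> encoded word sequence for that doc}.
    # Keep only the docs where the words occur contiguously.
    code = _toy_encode(wids)
    result = {}
    for docid, weight in hits.items():
        if docwords[docid].find(code) >= 0:
            result[docid] = weight
    return result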
@@ -60,13 +60,8 @@ class OkapiIndex(BaseIndex):
     def __init__(self, lexicon):
         BaseIndex.__init__(self, lexicon)
 
+        # ._wordinfo for Okapi is
         # wid -> {docid -> frequency}; t -> D -> f(D, t)
-        # There are two kinds of OOV words:  wid 0 is explicitly OOV,
-        # and it's possible that the lexicon will return a non-zero wid
-        # for a word *we've* never seen (e.g., lexicons can be shared
-        # across indices, and a query can contain a word some other
-        # index knows about but we don't).
-
         self._wordinfo = IOBTree()
         # docid -> # of words in the doc
         # This is just len(self._docwords[docid]), but _docwords is stored
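
A minimal sketch of the two-kinds-of-OOV rule from the deleted comment (a hypothetical standalone function; the index's own one-liner appears as _remove_oov_wids in the next hunk):

def remove_oov_wids(wids, wordinfo):
    # Drop wid 0 (the lexicon's explicit OOV marker) and any wid this
    # index has never indexed, which can happen when a lexicon is
    # shared across indices.  Since wid 0 is never a key of wordinfo,
    # a single membership test covers both cases.
    return [wid for wid in wids if wid in wordinfo]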
@@ -101,38 +96,6 @@ class OkapiIndex(BaseIndex):
         del self._doclen[docid]
         self._totaldoclen -= count
 
-    def search(self, term):
-        wids = self._lexicon.termToWordIds(term)
-        if not wids:
-            return None  # All docs match
-        wids = self._remove_oov_wids(wids)
-        return mass_weightedUnion(self._search_wids(wids))
-
-    def search_glob(self, pattern):
-        wids = self._lexicon.globToWordIds(pattern)
-        return mass_weightedUnion(self._search_wids(wids))
-
-    def search_phrase(self, phrase):
-        wids = self._lexicon.termToWordIds(phrase)
-        cleaned_wids = self._remove_oov_wids(wids)
-        if len(wids) != len(cleaned_wids):
-            # At least one wid was OOV: can't possibly find it.
-            return IIBTree()
-        scores = self._search_wids(cleaned_wids)
-        hits = mass_weightedIntersection(scores)
-        if not hits:
-            return hits
-        code = WidCode.encode(wids)
-        result = IIBTree()
-        for docid, weight in hits.items():
-            docwords = self._docwords[docid]
-            if docwords.find(code) >= 0:
-                result[docid] = weight
-        return result
-
-    def _remove_oov_wids(self, wids):
-        return filter(self._wordinfo.has_key, wids)
-
     # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
     # for each wid t in wids.  The IIBucket, times the weight, maps D to
     # TF(D,t) * IDF(t) for every docid D containing t.
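
For reference, the TF(D,t) * IDF(t) quantity the workhorse comment mentions has the usual Okapi BM25 shape. A sketch under assumed parameter values (k1, b) and an assumed IDF variant; the constants actually used by OkapiIndex are not shown in this diff.

import math

def okapi_score(f, doclen, avgdoclen, N, n, k1=1.2, b=0.75):
    # f: f(D, t), the term's frequency in doc D (from _wordinfo);
    # doclen: number of words in D (_doclen);
    # avgdoclen: _totaldoclen / N; N: total docs; n: docs containing t.
    tf = f * (k1 + 1.0) / (f + k1 * (1.0 - b + b * doclen / avgdoclen))
    idf = math.log(1.0 + (N - n + 0.5) / (n + 0.5))
    return tf * idf

_search_wids would return one IIBucket per wid holding the per-document TF part, with IDF(t) carried as the pair's weight so mass_weightedUnion and mass_weightedIntersection can combine the buckets.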