Commit 763e8d56 authored by Tim Peters

wid 0 isn't the only kind of OOV word possible, so change the search
logic to deal with all cases.  All the tests pass again.
parent fd3a9291
...@@ -60,6 +60,11 @@ class OkapiIndex(Persistent): ...@@ -60,6 +60,11 @@ class OkapiIndex(Persistent):
self._lexicon = lexicon self._lexicon = lexicon
# wid -> {docid -> frequency}; t -> D -> f(D, t) # wid -> {docid -> frequency}; t -> D -> f(D, t)
# There are two kinds of OOV words: wid 0 is explicitly OOV,
# and it's possible that the lexicon will return a non-zero wid
# for a word *we've* never seen (e.g., lexicons can be shared
# across indices, and a query can contain a word some other
# index knows about but we don't).
self._wordinfo = IOBTree() self._wordinfo = IOBTree()
# docid -> # of words in the doc # docid -> # of words in the doc
...@@ -111,8 +116,7 @@ class OkapiIndex(Persistent): ...@@ -111,8 +116,7 @@ class OkapiIndex(Persistent):
wids = self._lexicon.termToWordIds(term) wids = self._lexicon.termToWordIds(term)
if not wids: if not wids:
return None # All docs match return None # All docs match
if 0 in wids: wids = self._remove_oov_wids(wids)
wids = filter(None, wids)
return mass_weightedUnion(self._search_wids(wids)) return mass_weightedUnion(self._search_wids(wids))
def search_glob(self, pattern): def search_glob(self, pattern):
...@@ -121,9 +125,12 @@ class OkapiIndex(Persistent): ...@@ -121,9 +125,12 @@ class OkapiIndex(Persistent):
def search_phrase(self, phrase): def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase) wids = self._lexicon.termToWordIds(phrase)
if 0 in wids: cleaned_wids = self._remove_oov_wids(wids)
if len(wids) != len(cleaned_wids):
# At least one wid was OOV: can't possibly find it.
return IIBTree() return IIBTree()
hits = mass_weightedIntersection(self._search_wids(wids)) scores = self._search_wids(cleaned_wids)
hits = mass_weightedIntersection(scores)
if not hits: if not hits:
return hits return hits
code = WidCode.encode(wids) code = WidCode.encode(wids)
...@@ -134,6 +141,9 @@ class OkapiIndex(Persistent): ...@@ -134,6 +141,9 @@ class OkapiIndex(Persistent):
result[docid] = weight result[docid] = weight
return result return result
# Keep only the wids that have an entry in self._wordinfo; any other wid
# is out-of-vocabulary (OOV) for this index and is filtered out.
def _remove_oov_wids(self, wids):
return filter(self._wordinfo.has_key, wids)
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair # The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to # for each wid t in wids. The IIBucket, times the weight, maps D to
# TF(D,t) * IDF(t) for every docid D containing t. # TF(D,t) * IDF(t) for every docid D containing t.
...@@ -157,6 +167,7 @@ class OkapiIndex(Persistent): ...@@ -157,6 +167,7 @@ class OkapiIndex(Persistent):
L = [] L = []
docid2len = self._doclen docid2len = self._doclen
for t in wids: for t in wids:
assert self._wordinfo.has_key(t) # caller responsible for OOV
d2f = self._wordinfo[t] # map {docid -> f(docid, t)} d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket() result = IIBucket()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment