Commit ea795e5a authored by Guido van Rossum

Fix queries of the form 'extension module C'.

parent 9319c8e0
@@ -115,6 +115,10 @@ class CosineIndex(Persistent):
 
     def search(self, term):
         wids = self._lexicon.termToWordIds(term)
+        if not wids:
+            return None  # All docs match
+        if 0 in wids:
+            wids = filter(None, wids)
         return mass_weightedUnion(self._search_wids(wids))
 
     def search_glob(self, pattern):
@@ -123,6 +127,8 @@ class CosineIndex(Persistent):
 
     def search_phrase(self, phrase):
         wids = self._lexicon.termToWordIds(phrase)
+        if 0 in wids:
+            return IIBTree()
         hits = mass_weightedIntersection(self._search_wids(wids))
         if not hits:
             return hits
@@ -157,6 +163,8 @@ class CosineIndex(Persistent):
         N = float(len(self._docweight))
         sum = 0.0
         for wid in wids:
+            if wid == 0:
+                continue
            wt = math.log(1.0 + N / len(self._wordinfo[wid]))
            sum += wt ** 2.0
        return scaled_int(math.sqrt(sum))
...
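Taken together, the CosineIndex hunks give each entry point one rule for the new wid conventions: an empty wid list means the term carried no usable words (no restriction, so all docs match), a 0 wid marks a word the lexicon has never seen, and a phrase containing such a word cannot match at all. A standalone restatement of that logic, as a rough sketch only (plain Python values stand in for the IIBTree results; the helper names are illustrative):

def usable_wids(wids):
    # Mirrors search() above: None means "no restriction", else the wids to score.
    if not wids:
        return None                   # all docs match
    return [w for w in wids if w]     # drop the unknown-word sentinel 0

def phrase_is_searchable(wids):
    # Mirrors search_phrase() above: an unknown word makes the phrase unmatchable.
    return bool(wids) and 0 not in wids

usable_wids([0, 2, 3])        # -> [2, 3]
usable_wids([])               # -> None
phrase_is_searchable([1, 0])  # -> False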
@@ -62,9 +62,7 @@ class Lexicon:
             last = element.process(last)
         wids = []
         for word in last:
-            wid = self._wids.get(word)
-            if wid is not None:
-                wids.append(wid)
+            wids.append(self._wids.get(word, 0))
         return wids
 
     def get_word(self, wid):
...
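The new default of 0 is the core of the change: a word the lexicon has never seen now yields the sentinel wid 0 instead of silently disappearing, which lets the indexes distinguish an unknown word from a term the pipeline reduced to nothing. A minimal sketch of the new contract, with a plain dict standing in for the lexicon's _wids mapping and word ids borrowed from the tests further down:

_wids = {'CATS': 1, 'and': 2, 'dogs': 3}    # as built from the source 'CATS and dogs'

def term_to_wids(words):
    # Unknown words map to the sentinel 0 instead of being dropped.
    return [_wids.get(word, 0) for word in words]

term_to_wids(['cats', 'and', 'dogs'])   # -> [0, 2, 3]  ('cats' was never indexed)
term_to_wids(['boxes'])                 # -> [0]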
@@ -109,6 +109,10 @@ class OkapiIndex(Persistent):
 
     def search(self, term):
         wids = self._lexicon.termToWordIds(term)
+        if not wids:
+            return None  # All docs match
+        if 0 in wids:
+            wids = filter(None, wids)
         return mass_weightedUnion(self._search_wids(wids))
 
     def search_glob(self, pattern):
@@ -117,6 +121,8 @@ class OkapiIndex(Persistent):
 
     def search_phrase(self, phrase):
         wids = self._lexicon.termToWordIds(phrase)
+        if 0 in wids:
+            return IIBTree()
         hits = mass_weightedIntersection(self._search_wids(wids))
         if not hits:
             return hits
...
@@ -20,10 +20,10 @@ from Products.ZCTextIndex.NBest import NBest
 
 def mass_weightedIntersection(L):
     "A list of (mapping, weight) pairs -> their weightedIntersection IIBTree."
+    L = [(map, weight) for (map, weight) in L if map is not None]
     if not L:
         return IIBTree()
     # Intersect with smallest first.
-    L = L[:]    # don't mutate the caller's L
     L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
     x, w = L[0]
     dummy, result = weightedUnion(IIBTree(), x, 1, w)
...
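The added comprehension lets callers pass None entries, the "All docs match" results that search() can now return; they are dropped before intersecting instead of breaking the intersection. Since the comprehension already builds a new list, the explicit copy (L = L[:]) becomes redundant. A rough illustration with made-up document ids, scores, and weights, using plain dicts in place of the IIBTree mappings:

extension_hits = ({10: 3, 11: 1}, 1)   # (mapping, weight) for one indexed term
module_hits    = ({10: 2, 12: 4}, 1)
unrestricted   = (None, 1)             # a term whose search() returned None

L = [extension_hits, module_hits, unrestricted]
L = [(map, weight) for (map, weight) in L if map is not None]
# Only the two real restrictions remain to be intersected; the comprehension
# already produced a fresh list, so no separate copy is needed.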
@@ -72,6 +72,8 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
         """
         tree = QueryParser().parseQuery(query)
         results = tree.executeQuery(self.index)
+        if results is None:
+            return [], 0
         chooser = NBest(nbest)
         chooser.addmany(results.items())
         return chooser.getbest(), len(results)
...
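With executeQuery() now able to return None for a query that imposes no restriction, the caller maps that case to an empty result rather than calling .items() on it; the method keeps returning a (best hits, total) pair. A hedged usage sketch (the index instance and the query string are made up):

# zc_index is an existing ZCTextIndex instance; 10 is passed as nbest.
best, total = zc_index.query('extension module C', 10)
# best holds the top-scoring hits chosen by NBest, total the full hit count;
# a parse tree that yields no usable restriction now comes back as ([], 0).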
@@ -143,7 +143,7 @@ class Indexer:
             if not text:
                 continue
             try:
-                n, results = self.timequery(text, top + nbest)
+                results, n = self.timequery(text, top + nbest)
             except:
                 reportexc()
                 text = ""
@@ -163,7 +163,7 @@ class Indexer:
             top += nbest
 
     def query(self, text, nbest=NBEST, maxlines=MAXLINES):
-        n, results = self.timequery(text, nbest)
+        results, n = self.timequery(text, nbest)
         if not n:
             print "No hits for %r." % text
             return
@@ -173,11 +173,11 @@ class Indexer:
     def timequery(self, text, nbest):
         t0 = time.time()
         c0 = time.clock()
-        n, results = self.index.query(text, nbest)
+        results, n = self.index.query(text, nbest)
         t1 = time.time()
         c1 = time.clock()
         print "[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0)
-        return n, results
+        return results, n
 
     def formatresults(self, text, results, maxlines=MAXLINES,
                       lo=0, hi=sys.maxint):
@@ -397,9 +397,11 @@ class TextIndex(Persistent):
         parser = QueryParser()
         tree = parser.parseQuery(query)
         results = tree.executeQuery(self.index)
+        if results is None:
+            return [], 0
         chooser = NBest(nbest)
         chooser.addmany(results.items())
-        return len(results), chooser.getbest()
+        return chooser.getbest(), len(results)
 
     def query_weight(self, query):
         parser = QueryParser()
...
@@ -76,7 +76,7 @@ class Test(TestCase):
         lexicon = Lexicon(Splitter())
         wids = lexicon.sourceToWordIds('cats and dogs')
         wids = lexicon.termToWordIds('boxes')
-        self.assertEqual(wids, [])
+        self.assertEqual(wids, [0])
 
     def testOnePipelineElement(self):
         lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
@@ -94,7 +94,7 @@ class Test(TestCase):
         lexicon = Lexicon(Splitter())
         wids = lexicon.sourceToWordIds('CATS and dogs')
         wids = lexicon.termToWordIds('cats and dogs')
-        self.assertEqual(wids, [2, 3])
+        self.assertEqual(wids, [0, 2, 3])
 
     def testTwoElementPipeline(self):
         lexicon = Lexicon(Splitter(),
...