Use the new SetOps for mass union/intersection.

f4c2c29b · Tim Peters · 08fe38f4 · f4c2c29b · f4c2c29b
Commit f4c2c29b authored May 15, 2002 by Tim Peters
Showing 2 changed files with 14 additions and 56 deletions

lib/python/Products/ZCTextIndex/Index.py +7 -22

lib/python/Products/ZCTextIndex/OkapiIndex.py +7 -34

No files found.
--- a/lib/python/Products/ZCTextIndex/Index.py
+++ b/lib/python/Products/ZCTextIndex/Index.py
@@ -17,11 +17,12 @@
 import math

 from BTrees.IOBTree import IOBTree
-from BTrees.IIBTree import IIBTree, IIBucket, IISet
-from BTrees.IIBTree import weightedIntersection, weightedUnion
+from BTrees.IIBTree import IIBTree, IIBucket

 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex import WidCode
+from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
+                                        mass_weightedUnion

 import ZODB
 from Persistence import Persistent
@@ -62,7 +63,7 @@ class Index(Persistent):
    def length(self):
        """Return the number of documents in the index."""
        return len(self._docwords)
-        
+
    def get_words(self, docid):
        """Returns the wordids for a given docid"""
        return WidCode.decode(self._docwords[docid])
@@ -114,15 +115,15 @@ class Index(Persistent):

    def search(self, term):
        wids = self._lexicon.termToWordIds(term)
-        return self._union(self._search_wids(wids))
+        return mass_weightedUnion(self._search_wids(wids))

    def search_glob(self, pattern):
        wids = self._lexicon.globToWordIds(pattern)
-        return self._union(self._search_wids(wids))
+        return mass_weightedUnion(self._search_wids(wids))

    def search_phrase(self, phrase):
        wids = self._lexicon.termToWordIds(phrase)
-        hits = self._intersection(self._search_wids(wids))
+        hits = mass_weightedIntersection(self._search_wids(wids))
        if not hits:
            return hits
        code = WidCode.encode(wids)
@@ -149,22 +150,6 @@ class Index(Persistent):
        L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
        return L

-    def _intersection(self, L):
-        if not L:
-            return IIBTree()
-        d2w, weight = L[0]
-        dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
-        for d2w, weight in L[1:]:
-            dummy, result = weightedIntersection(result, d2w, 1, weight)
-        return result
-
-    def _union(self, L):
-        # XXX This can be optimized, see OkapiIndex
-        result = IIBTree()
-        for d2w, weight in L:
-            dummy, result = weightedUnion(result, d2w, 1, weight)
-        return result
-
    def query_weight(self, terms):
        wids = []
        for term in terms:

--- a/lib/python/Products/ZCTextIndex/OkapiIndex.py
+++ b/lib/python/Products/ZCTextIndex/OkapiIndex.py
@@ -20,12 +20,13 @@
 import math

 from BTrees.IOBTree import IOBTree
-from BTrees.IIBTree import IIBTree, IIBucket, IISet
-from BTrees.IIBTree import weightedIntersection, weightedUnion
+from BTrees.IIBTree import IIBTree, IIBucket

 from Products.ZCTextIndex.IIndex import IIndex
-from Products.ZCTextIndex import WidCode
 from Products.ZCTextIndex.NBest import NBest
+from Products.ZCTextIndex import WidCode
+from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
+                                        mass_weightedUnion

 # Instead of storing floats, we generally store scaled ints.  Binary pickles
 # can store those more efficiently.  The default SCALE_FACTOR of 1024
@@ -98,15 +99,15 @@ class Index:

    def search(self, term):
        wids = self._lexicon.termToWordIds(term)
-        return self._union(self._search_wids(wids))
+        return mass_weightedUnion(self._search_wids(wids))

    def search_glob(self, pattern):
        wids = self._lexicon.globToWordIds(pattern)
-        return self._union(self._search_wids(wids))
+        return mass_weightedUnion(self._search_wids(wids))

    def search_phrase(self, phrase):
        wids = self._lexicon.termToWordIds(phrase)
-        hits = self._intersection(self._search_wids(wids))
+        hits = mass_weightedIntersection(self._search_wids(wids))
        if not hits:
            return hits
        code = WidCode.encode(wids)
@@ -156,34 +157,6 @@ class Index:
        # of tf would still done at Python speed, and it's a lot more
        # work than just multiplying by idf.

-    def _intersection(self, L):
-        if not L:
-            return IIBTree()
-        # Intersect with smallest first.
-        L = L[:]    # don't mutate the caller's L
-        L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
-        d2w, weight = L[0]
-        dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
-        for d2w, weight in L[1:]:
-            dummy, result = weightedIntersection(result, d2w, 1, weight)
-        return result
-
-    def _union(self, L):
-        if not L:
-            return IIBTree()
-        # Balance unions as closely as possible, smallest to largest.
-        merge = NBest(len(L))
-        for x, weight in L:
-            merge.add((x, weight), len(x))
-        while len(merge) > 1:
-            # Merge the two smallest so far, and add back to the queue.
-            (x, wx), dummy = merge.pop_smallest()
-            (y, wy), dummy = merge.pop_smallest()
-            dummy, z = weightedUnion(x, y, wx, wy)
-            merge.add((z, 1), len(z))
-        (result, weight), score = merge.pop_smallest()
-        return result
-
    def query_weight(self, terms):
        # XXX I have no idea what to put here
        return 10