Commit e45028f3 authored by Casey Duncan's avatar Casey Duncan

More sorting improvements:

* Changed logic for activating first sort algorithm to eliminate bad performance with large result sets (20k+). The full sort is now faster for a larger proportion of cases. This algorithm is also skipped now if a sort limit value is passed.

* Full sort now handles sort limits where the limit is 25% or greater of the total result where N-Best performance degrades. This allows the application to always apply a sort limit up to and beyond the result set length.

* Added an "N-worst" sort handler to deal with forward sort limits (previously only reverse limits worked properly).

* Small optimizations to N-best/worst to wring out a few more CPU cycles.
parent d4ce5d0c
......@@ -22,6 +22,7 @@ from zLOG import LOG, ERROR
from Lazy import LazyMap, LazyFilter, LazyCat, LazyValues
from CatalogBrains import AbstractCatalogBrain, NoBrainer
from sorters import buildSortableResults
from BTrees.IIBTree import intersection, weightedIntersection, IISet
from BTrees.OIBTree import OIBTree
......@@ -524,24 +525,29 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1):
# Sort a result set using a sort index. Return a lazy
# result set in sorted order if merge is true otherwise
# returns a list of (sortkey, results) tuples
# returns a list of (sortkey, uid, getter_function) tuples
#
# The two 'for' loops in here contribute a significant
# proportion of the time to perform an indexed search.
# Try to avoid all non-local attribute lookup inside
# those loops.
assert limit is None or limit > 0, 'Limit value must be 1 or greater'
_lazymap = LazyMap
_intersection = intersection
_self__getitem__ = self.__getitem__
index_key_map = sort_index.documentToKeyMap()
_None = None
_keyerror = KeyError
result = []
append = result.append
if hasattr(rs, 'keys'):
rs = rs.keys()
rlen = len(rs)
if (len(rs) > (len(sort_index) * 4)):
if limit is None and (rlen > (len(sort_index) * (rlen / 100 + 1))):
# The result set is much larger than the sorted index,
# so iterate over the sorted index for speed.
# This is almost never exercised in practice...
# This is rarely exercised in practice...
length = 0
......@@ -550,8 +556,6 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
except TypeError:
# rs is not an object in the IIBTree family.
# Try to turn rs into an IISet.
if hasattr(rs, 'keys'):
rs = rs.keys()
rs = IISet(rs)
for k, intset in sort_index.items():
......@@ -575,18 +579,13 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
result = LazyCat(LazyValues(result), length)
else:
return result
else:
elif limit is None or (limit * 4 > rlen):
# Iterate over the result set getting sort keys from the index
if hasattr(rs, 'keys'):
rs = rs.keys()
_keyerror = KeyError
if limit is None:
for did in rs:
try:
key = index_key_map[did]
except _keyerror:
# This document is not in the sort key index.
# skip it.
# This document is not in the sort key index, skip it.
pass
else:
append((key, did, _self__getitem__))
......@@ -594,29 +593,30 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
# we do not merge now and need to intermingle the
# results with those of other catalogs while avoiding
# the cost of instantiating a LazyMap per result
if merge:
result.sort()
if reverse:
result.reverse()
if limit is not None:
result = result[:limit]
result = LazyValues(result)
else:
return result
else:
elif reverse:
# Limit/sort results using N-Best algorithm
# This is faster for large sets then a full sort
# And uses far less memory
keys = []
n = 0
worst = None
for did in rs:
try:
key = index_key_map[did]
except _keyerror:
# This document is not in the sort key index.
# skip it.
# This document is not in the sort key index, skip it.
pass
else:
if n >= limit and key <= keys[0]:
if n >= limit and key <= worst:
continue
i = bisect(keys, key)
keys.insert(i, key)
......@@ -625,14 +625,41 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
del keys[0], result[0]
else:
n += 1
worst = keys[0]
result.reverse()
if merge:
result = LazyValues(result)
else:
return result
elif not reverse:
# Limit/sort results using N-Best algorithm in reverse (N-Worst?)
keys = []
n = 0
best = None
for did in rs:
try:
key = index_key_map[did]
except _keyerror:
# This document is not in the sort key index, skip it.
pass
else:
if n >= limit and key >= best:
continue
i = bisect(keys, key)
keys.insert(i, key)
result.insert(i, (key, did, _self__getitem__))
if n == limit:
del keys[-1], result[-1]
else:
n += 1
best = keys[-1]
if merge:
result = LazyValues(result)
else:
return result
result = LazyMap(self.__getitem__, result, len(result))
result.actual_result_count = len(rs)
result.actual_result_count = rlen
return result
def _get_sort_attr(self, attr, kw):
......@@ -743,9 +770,14 @@ def mergeResults(results, has_sort_keys, reverse):
# Concatenate the catalog results into one list and sort it
# Each result record consists of a list of tuples with three values:
# (sortkey, docid, catalog__getitem__)
if len(results) > 1:
all = []
for r in results:
all.extend(r)
elif len(results) == 1:
all = results[0]
else:
return []
all.sort()
if reverse:
all.reverse()
......
......@@ -17,7 +17,7 @@ from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
from Products.PluginIndexes.KeywordIndex.KeywordIndex import KeywordIndex
import whrandom,string, unittest
import whrandom,string, unittest, random
def createDatabase():
......@@ -166,6 +166,16 @@ class TestZCatalog(unittest.TestCase):
self.assertEqual(len(sr), 3)
class TestCatalogObject(unittest.TestCase):
upper = 1000
# Shared, class-level fixture: a shuffled permutation of 0..upper-1 so
# the sort tests run against unordered input rather than presorted ids.
nums = range(upper)
for i in range(upper):
    # Swap each position with a randomly chosen partner index.
    # NOTE(review): j spans the whole range, so this is a naive shuffle,
    # not a uniform Fisher-Yates — adequate for test data.
    j = random.randint(0, upper-1)
    tmp = nums[i]
    nums[i] = nums[j]
    nums[j] = tmp
def setUp(self):
self._vocabulary = Vocabulary.Vocabulary('Vocabulary','Vocabulary',
globbing=1)
......@@ -196,7 +206,6 @@ class TestCatalogObject(unittest.TestCase):
self._catalog.addColumn('att3')
self._catalog.addColumn('num')
self.upper = 1000
class dummy(ExtensionClass.Base):
att1 = 'att1'
att2 = 'att2'
......@@ -215,7 +224,7 @@ class TestCatalogObject(unittest.TestCase):
for x in range(0, self.upper):
self._catalog.catalogObject(dummy(x), `x`)
self._catalog.catalogObject(dummy(self.nums[x]), `x`)
self._catalog.aq_parent = dummy('foo') # fake out acquisition
def tearDown(self):
......@@ -354,11 +363,33 @@ class TestCatalogObject(unittest.TestCase):
a = self._catalog(sort_on='att1')
self.assertEqual(len(a), self.upper)
def testBadSortLimits(self):
    # sortResults asserts that any sort_limit passed is >= 1, so both
    # zero and negative limits must be rejected with AssertionError.
    for bad_limit in (0, -10):
        self.assertRaises(
            AssertionError, self._catalog, sort_on='num',
            sort_limit=bad_limit)
def testSortLimit(self):
    # A forward sort_limit must return exactly the head of the full
    # sorted result, and still report the unlimited result count.
    full = self._catalog(sort_on='num')
    head = self._catalog(sort_on='num', sort_limit=10)
    self.assertEqual(head[0].num, self.upper - 1)
    self.assertEqual([rec.num for rec in head],
                     [rec.num for rec in full[:10]])
    self.assertEqual(head.actual_result_count, self.upper)
    # A reverse sort_limit must match the tail of the full sort,
    # read back to front.
    tail = self._catalog(sort_on='num', sort_limit=10,
                         sort_order='reverse')
    expected = [rec.num for rec in full[-10:]]
    expected.reverse()
    self.assertEqual([rec.num for rec in tail], expected)
    self.assertEqual(tail.actual_result_count, self.upper)
def testBigSortLimit(self):
    # A sort_limit larger than the result set must still work: the
    # catalog falls back to a full sort and reports the true count.
    oversized = self.upper * 3
    res = self._catalog(sort_on='num', sort_limit=oversized)
    self.assertEqual(res.actual_result_count, self.upper)
    self.assertEqual(res[0].num, 0)
    res = self._catalog(sort_on='num', sort_limit=oversized,
                        sort_order='reverse')
    self.assertEqual(res.actual_result_count, self.upper)
    self.assertEqual(res[0].num, self.upper - 1)
class objRS(ExtensionClass.Base):
def __init__(self,num):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment