Commit e45028f3 authored by Casey Duncan

More sorting improvements:

* Changed the logic for activating the first sort algorithm to eliminate bad performance with large result sets (20k+). The full sort is now faster for a larger proportion of cases. This algorithm is also skipped now if a sort limit value is passed.

* Full sort now handles sort limits that are 25% or more of the total result size, which is the range where N-Best performance degrades. This allows the application to always apply a sort limit up to and beyond the result set length.

* Added an "N-worst" sort handler to deal with forward sort limits (previously only reverse limits worked properly).

* Small optimizations to N-best/worst to wring out a few more CPU cycles.
parent d4ce5d0c
......@@ -22,6 +22,7 @@ from zLOG import LOG, ERROR
from Lazy import LazyMap, LazyFilter, LazyCat, LazyValues
from CatalogBrains import AbstractCatalogBrain, NoBrainer
from sorters import buildSortableResults
from BTrees.IIBTree import intersection, weightedIntersection, IISet
from BTrees.OIBTree import OIBTree
......@@ -524,24 +525,29 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1):
# Sort a result set using a sort index. Return a lazy
# result set in sorted order if merge is true otherwise
# returns a list of (sortkey, results) tuples
# returns a list of (sortkey, uid, getter_function) tuples
#
# The two 'for' loops in here contribute a significant
# proportion of the time to perform an indexed search.
# Try to avoid all non-local attribute lookup inside
# those loops.
assert limit is None or limit > 0, 'Limit value must be 1 or greater'
_lazymap = LazyMap
_intersection = intersection
_self__getitem__ = self.__getitem__
index_key_map = sort_index.documentToKeyMap()
_None = None
_keyerror = KeyError
result = []
append = result.append
if hasattr(rs, 'keys'):
rs = rs.keys()
rlen = len(rs)
if (len(rs) > (len(sort_index) * 4)):
if limit is None and (rlen > (len(sort_index) * (rlen / 100 + 1))):
# The result set is much larger than the sorted index,
# so iterate over the sorted index for speed.
# This is almost never exercised in practice...
# This is rarely exercised in practice...
length = 0
......@@ -550,8 +556,6 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
except TypeError:
# rs is not an object in the IIBTree family.
# Try to turn rs into an IISet.
if hasattr(rs, 'keys'):
rs = rs.keys()
rs = IISet(rs)
for k, intset in sort_index.items():
......@@ -575,64 +579,87 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
result = LazyCat(LazyValues(result), length)
else:
return result
else:
elif limit is None or (limit * 4 > rlen):
# Iterate over the result set getting sort keys from the index
if hasattr(rs, 'keys'):
rs = rs.keys()
_keyerror = KeyError
if limit is None:
for did in rs:
try:
key = index_key_map[did]
except _keyerror:
# This document is not in the sort key index.
# skip it.
pass
else:
append((key, did, _self__getitem__))
# The reference back to __getitem__ is used in case
# we do not merge now and need to intermingle the
# results with those of other catalogs while avoiding
# the cost of instantiating a LazyMap per result
if merge:
result.sort()
if reverse:
result.reverse()
result = LazyValues(result)
for did in rs:
try:
key = index_key_map[did]
except _keyerror:
# This document is not in the sort key index, skip it.
pass
else:
append((key, did, _self__getitem__))
# The reference back to __getitem__ is used in case
# we do not merge now and need to intermingle the
# results with those of other catalogs while avoiding
# the cost of instantiating a LazyMap per result
if merge:
result.sort()
if reverse:
result.reverse()
if limit is not None:
result = result[:limit]
result = LazyValues(result)
else:
return result
elif reverse:
# Limit/sort results using N-Best algorithm
# This is faster for large sets then a full sort
# And uses far less memory
keys = []
n = 0
worst = None
for did in rs:
try:
key = index_key_map[did]
except _keyerror:
# This document is not in the sort key index, skip it.
pass
else:
return result
else:
# Limit/sort results using N-Best algorithm
# This is faster for large sets then a full sort
# And uses far less memory
keys = []
n = 0
for did in rs:
try:
key = index_key_map[did]
except _keyerror:
# This document is not in the sort key index.
# skip it.
pass
if n >= limit and key <= worst:
continue
i = bisect(keys, key)
keys.insert(i, key)
result.insert(i, (key, did, _self__getitem__))
if n == limit:
del keys[0], result[0]
else:
if n >= limit and key <= keys[0]:
continue
i = bisect(keys, key)
keys.insert(i, key)
result.insert(i, (key, did, _self__getitem__))
if n == limit:
del keys[0], result[0]
else:
n += 1
result.reverse()
if merge:
result = LazyValues(result)
n += 1
worst = keys[0]
result.reverse()
if merge:
result = LazyValues(result)
else:
return result
elif not reverse:
# Limit/sort results using N-Best algorithm in reverse (N-Worst?)
keys = []
n = 0
best = None
for did in rs:
try:
key = index_key_map[did]
except _keyerror:
# This document is not in the sort key index, skip it.
pass
else:
return result
if n >= limit and key >= best:
continue
i = bisect(keys, key)
keys.insert(i, key)
result.insert(i, (key, did, _self__getitem__))
if n == limit:
del keys[-1], result[-1]
else:
n += 1
best = keys[-1]
if merge:
result = LazyValues(result)
else:
return result
result = LazyMap(self.__getitem__, result, len(result))
result.actual_result_count = len(rs)
result.actual_result_count = rlen
return result
def _get_sort_attr(self, attr, kw):
......@@ -743,9 +770,14 @@ def mergeResults(results, has_sort_keys, reverse):
# Concatenate the catalog results into one list and sort it
# Each result record consists of a list of tuples with three values:
# (sortkey, docid, catalog__getitem__)
all = []
for r in results:
all.extend(r)
if len(results) > 1:
all = []
for r in results:
all.extend(r)
elif len(results) == 1:
all = results[0]
else:
return []
all.sort()
if reverse:
all.reverse()
......
......@@ -17,7 +17,7 @@ from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
from Products.PluginIndexes.KeywordIndex.KeywordIndex import KeywordIndex
import whrandom,string, unittest
import whrandom,string, unittest, random
def createDatabase():
......@@ -166,6 +166,16 @@ class TestZCatalog(unittest.TestCase):
self.assertEqual(len(sr), 3)
class TestCatalogObject(unittest.TestCase):
upper = 1000
nums = range(upper)
for i in range(upper):
j = random.randint(0, upper-1)
tmp = nums[i]
nums[i] = nums[j]
nums[j] = tmp
def setUp(self):
self._vocabulary = Vocabulary.Vocabulary('Vocabulary','Vocabulary',
globbing=1)
......@@ -196,7 +206,6 @@ class TestCatalogObject(unittest.TestCase):
self._catalog.addColumn('att3')
self._catalog.addColumn('num')
self.upper = 1000
class dummy(ExtensionClass.Base):
att1 = 'att1'
att2 = 'att2'
......@@ -213,9 +222,9 @@ class TestCatalogObject(unittest.TestCase):
def col3(self):
return ['col3']
for x in range(0, self.upper):
self._catalog.catalogObject(dummy(x), `x`)
self._catalog.catalogObject(dummy(self.nums[x]), `x`)
self._catalog.aq_parent = dummy('foo') # fake out acquisition
def tearDown(self):
......@@ -353,11 +362,33 @@ class TestCatalogObject(unittest.TestCase):
# set is much larger than the sort index.
a = self._catalog(sort_on='att1')
self.assertEqual(len(a), self.upper)
# Verify that non-positive sort limits are rejected: calling the catalog
# with sort_limit=0 or sort_limit=-10 is expected to raise AssertionError.
def testBadSortLimits(self):
# A limit of zero must be refused.
self.assertRaises(
AssertionError, self._catalog, sort_on='num', sort_limit=0)
# A negative limit must be refused as well.
self.assertRaises(
AssertionError, self._catalog, sort_on='num', sort_limit=-10)
# Check that sorting on 'num' with sort_limit=10 returns the same leading
# results as slicing the unlimited sorted result, in forward and reverse
# order, and that actual_result_count still equals self.upper.
def testSortLimit(self):
# Unlimited sort used as the reference ordering.
full = self._catalog(sort_on='num')
a = self._catalog(sort_on='num', sort_limit=10)
# NOTE(review): this text is a diff rendered without +/- markers; the
# next assertion may be a removed line rather than current code -- confirm.
self.assertEqual(a[0].num, self.upper - 1)
# Forward limit: must match the first ten entries of the full sort.
self.assertEqual([r.num for r in a], [r.num for r in full[:10]])
self.assertEqual(a.actual_result_count, self.upper)
a = self._catalog(sort_on='num', sort_limit=10, sort_order='reverse')
# Reverse limit: expected values are the last ten entries of the full
# forward sort, reversed.
rev = [r.num for r in full[-10:]]
rev.reverse()
self.assertEqual([r.num for r in a], rev)
self.assertEqual(a.actual_result_count, self.upper)
# Check sorting with a sort limit of three times self.upper: the reported
# actual_result_count must equal self.upper and the first result must be
# the smallest 'num' (0) in forward order and the largest (self.upper - 1)
# in reverse order.
def testBigSortLimit(self):
# Forward sort with the oversized limit.
a = self._catalog(sort_on='num', sort_limit=self.upper*3)
self.assertEqual(a.actual_result_count, self.upper)
self.assertEqual(a[0].num, 0)
# Reverse sort with the same oversized limit.
a = self._catalog(
sort_on='num', sort_limit=self.upper*3, sort_order='reverse')
self.assertEqual(a.actual_result_count, self.upper)
self.assertEqual(a[0].num, self.upper - 1)
class objRS(ExtensionClass.Base):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment