Commit db2e6ca1 authored by Tim Peters

A new multiunion() function for integer sets computes the union of many

input sets quickly, and will become much faster again soon (see TODO
in C function multiunion_m).
parent f9448ed0
...@@ -81,6 +81,7 @@ typedef struct Bucket_s { ...@@ -81,6 +81,7 @@ typedef struct Bucket_s {
} Bucket; } Bucket;
#define BUCKET(O) ((Bucket*)(O)) #define BUCKET(O) ((Bucket*)(O))
#define SIZED(O) ((Bucket*)(O))
static void PyVar_AssignB(Bucket **v, Bucket *e) { Py_XDECREF(*v); *v=e;} static void PyVar_AssignB(Bucket **v, Bucket *e) { Py_XDECREF(*v); *v=e;}
#define ASSIGNB(V,E) PyVar_AssignB(&(V),(E)) #define ASSIGNB(V,E) PyVar_AssignB(&(V),(E))
...@@ -262,6 +263,14 @@ static struct PyMethodDef module_methods[] = { ...@@ -262,6 +263,14 @@ static struct PyMethodDef module_methods[] = {
"compute the intersection of o1 and o2\n" "compute the intersection of o1 and o2\n"
"\nw1 and w2 are weights." "\nw1 and w2 are weights."
}, },
#endif
#ifdef MULTI_INT_UNION
{"multiunion", (PyCFunction) multiunion_m, METH_VARARGS,
"multiunion(seq) -- compute union of a sequence of integer sets.\n"
"\n"
"Each element of seq must be an integer set, or convertible to one\n"
"via the set iteration protocol. The union returned is an IISet."
},
#endif #endif
{NULL, NULL} /* sentinel */ {NULL, NULL} /* sentinel */
}; };
...@@ -270,7 +279,7 @@ static char BTree_module_documentation[] = ...@@ -270,7 +279,7 @@ static char BTree_module_documentation[] =
"\n" "\n"
MASTER_ID MASTER_ID
BTREEITEMSTEMPLATE_C BTREEITEMSTEMPLATE_C
"$Id: BTreeModuleTemplate.c,v 1.21 2002/03/08 18:33:01 jeremy Exp $\n" "$Id: BTreeModuleTemplate.c,v 1.22 2002/05/30 21:00:30 tim_one Exp $\n"
BTREETEMPLATE_C BTREETEMPLATE_C
BUCKETTEMPLATE_C BUCKETTEMPLATE_C
KEYMACROS_H KEYMACROS_H
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
Set operations Set operations
****************************************************************************/ ****************************************************************************/
#define SETOPTEMPLATE_C "$Id: SetOpTemplate.c,v 1.12 2002/02/21 21:41:17 jeremy Exp $\n" #define SETOPTEMPLATE_C "$Id: SetOpTemplate.c,v 1.13 2002/05/30 21:00:30 tim_one Exp $\n"
#ifdef INTSET_H #ifdef INTSET_H
static int static int
...@@ -401,3 +401,80 @@ wintersection_m(PyObject *ignored, PyObject *args) ...@@ -401,3 +401,80 @@ wintersection_m(PyObject *ignored, PyObject *args)
} }
#endif #endif
#ifdef MULTI_INT_UNION
#include "sorters.c"
/* multiunion(seq) -- union of many integer sets in one pass.

   args must hold a single object: a sequence whose elements are integer
   sets, or objects that the set iteration protocol can walk as sets.
   Rather than unioning the inputs pairwise, every key of every input is
   appended to one result bucket, and the bucket is then sorted and
   stripped of duplicates in a single step.

   Returns the result set, or NULL if any step reported an error.
 */
static PyObject *
multiunion_m(PyObject *ignored, PyObject *args)
{
    PyObject *seq;              /* the sequence of input sets */
    PyObject *set = NULL;       /* current input set; NULL while none is held */
    Bucket *result;             /* bucket collecting every key seen */
    int n;                      /* number of input sets */
    int i;

    UNLESS(PyArg_ParseTuple(args, "O", &seq))
        return NULL;

    n = PyObject_Length(seq);
    if (n < 0)
        return NULL;

    /* Start from a freshly constructed, empty set. */
    result = BUCKET(PyObject_CallObject(OBJECT(&SetType), NULL));
    if (result == NULL)
        return NULL;

    /* Pass 1: dump the keys of each input set onto the end of result.
       Duplicates are deliberately left in; they are squeezed out below. */
    for (i = 0; i < n; ++i) {
        SetIteration setiter = {0, 0, 0};
        int merge;              /* output slot initSetIteration insists on; unused */

        set = PySequence_GetItem(seq, i);
        if (set == NULL)
            goto Error;

        /* XXX TODO: If set is a bucket, do a straight resize+memcpy instead.
         */
        /* NOTE(review): nothing in this function releases whatever state
           initSetIteration may store inside setiter -- confirm that it
           holds no owned reference, otherwise it leaks here. */
        if (initSetIteration(&setiter, set, 1, &merge) < 0)
            goto Error;

        /* Advance the iterator, stop once it reports a negative position,
           otherwise append the key it landed on. */
        for (;;) {
            if (setiter.next(&setiter) < 0)
                goto Error;
            if (setiter.position < 0)
                break;

            /* Make room for one more key when the bucket is full. */
            if (result->len >= result->size && Bucket_grow(result, 1) < 0)
                goto Error;

            /* Keys are plain ints, so copying needs no incref. */
            COPY_KEY(result->keys[result->len], setiter.key);
            ++result->len;
        }

        Py_DECREF(set);
        set = NULL;
    }

    /* Pass 2: sort the collected keys and drop duplicates, then record
       the new length.  When duplicates were present the bucket ends up
       with spare capacity; it is not realloc'ed smaller because the
       result set is expected to be short-lived. */
    if (result->len > 0) {
        size_t unique_count;    /* keys remaining after duplicate removal */

        unique_count = sort_int4_nodups(result->keys, (size_t)result->len);
        result->len = (int)unique_count;
    }
    return (PyObject *)result;

Error:
    /* Release the partial result and the input set still held, if any. */
    Py_DECREF(result);
    Py_XDECREF(set);
    return NULL;
}
#endif
/* Setup template macros */ /* Setup template macros */
#define MASTER_ID "$Id: _IIBTree.c,v 1.5 2002/02/21 21:41:17 jeremy Exp $\n" #define MASTER_ID "$Id: _IIBTree.c,v 1.6 2002/05/30 21:00:30 tim_one Exp $\n"
#define PERSISTENT #define PERSISTENT
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#define INITMODULE init_IIBTree #define INITMODULE init_IIBTree
#define DEFAULT_MAX_BUCKET_SIZE 120 #define DEFAULT_MAX_BUCKET_SIZE 120
#define DEFAULT_MAX_BTREE_SIZE 500 #define DEFAULT_MAX_BTREE_SIZE 500
#define MULTI_INT_UNION 1
#include "intkeymacros.h" #include "intkeymacros.h"
#include "intvaluemacros.h" #include "intvaluemacros.h"
......
This diff is collapsed.
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
import sys, os, time, random
from unittest import TestCase, TestSuite, TextTestRunner, makeSuite
from BTrees.IIBTree import IIBTree, IIBucket, IISet, IITreeSet, \
union, intersection, difference, weightedUnion, weightedIntersection, \
multiunion
# XXX TODO Needs more tests.
# This file was created when multiunion was added. The other set operations
# don't appear to be tested anywhere yet.
class TestMultiUnion(TestCase):
def testEmpty(self):
self.assertEqual(len(multiunion([])), 0)
def testOne(self):
for sequence in [3], range(20), range(-10, 0, 2) + range(1, 10, 2):
seq1 = sequence[:]
seq2 = sequence[:]
seq2.reverse()
seqsorted = sequence[:]
seqsorted.sort()
for seq in seq1, seq2, seqsorted:
for builder in IISet, IITreeSet:
input = builder(seq)
output = multiunion([input])
self.assertEqual(len(seq), len(output))
self.assertEqual(seqsorted, list(output))
def testValuesIgnored(self):
for builder in IIBucket, IIBTree:
input = builder([(1, 2), (3, 4), (5, 6)])
output = multiunion([input])
self.assertEqual([1, 3, 5], list(output))
def testBigInput(self):
input = IISet(range(50000))
reversed = range(50000)
reversed.reverse()
reversed = IISet(reversed)
output = multiunion([input, reversed] * 5)
self.assertEqual(len(output), 50000)
self.assertEqual(list(output), range(50000))
def testLotsOfLittleOnes:
from random import shuffle
N = 5000
inputs = []
for i in range(N):
base = i * 4 - N
inputs.append(IISet([base, base+1]))
inputs.append(IITreeSet([base+2, base+3]))
inputs.shuffle()
output = multiunion(inputs)
self.assertEqual(len(output), N*4)
self.assertEqual(list(output), range(-N, 3*N))
def test_suite():
    """Build the suite holding every 'test*' method of TestMultiUnion."""
    multiunion_tests = makeSuite(TestMultiUnion, 'test')
    alltests = TestSuite()
    alltests.addTests((multiunion_tests,))
    return alltests
def main():
    """Run the multiunion test suite with a text-mode runner."""
    runner = TextTestRunner()
    suite = test_suite()
    runner.run(suite)
# Run the tests only when this file is executed directly as a script.
if '__main__' == __name__:
    main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment