Commit 315bcde9 authored by Tim Peters's avatar Tim Peters

Since I did the work to write the inner Okapi scoring loop in C, may as

well check it in.  This yields an overall 133% speedup on a "hot" search
for 'python' in my python-dev archive (a word that appears in all but
2 documents).  For those who read the email, turned out it was a
significant speedup to iterate over an IIBTree's items rather than to
materialize the items into an explicit list first.

This is now within 20% of simply doing "IIBucket(the_IIBTree)" (i.e.,
no arithmetic at all), so there's no significant possibility remaining
for speeding the inner score loop.
parent 53b46dc9
...@@ -70,6 +70,8 @@ class OkapiIndex(BaseIndex): ...@@ -70,6 +70,8 @@ class OkapiIndex(BaseIndex):
# As currently written, the weights are always 1, and the IIBucket maps # As currently written, the weights are always 1, and the IIBucket maps
# D to TF(D,t)*IDF(t) directly, where the product is computed as a float # D to TF(D,t)*IDF(t) directly, where the product is computed as a float
# but stored as a scaled_int. # but stored as a scaled_int.
# NOTE: This is overridden below, by a function that computes the
# same thing but with the inner scoring loop in C.
def _search_wids(self, wids): def _search_wids(self, wids):
if not wids: if not wids:
return [] return []
...@@ -87,7 +89,6 @@ class OkapiIndex(BaseIndex): ...@@ -87,7 +89,6 @@ class OkapiIndex(BaseIndex):
L = [] L = []
docid2len = self._docweight docid2len = self._docweight
for t in wids: for t in wids:
assert self._wordinfo.has_key(t) # caller responsible for OOV
d2f = self._wordinfo[t] # map {docid -> f(docid, t)} d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket() result = IIBucket()
...@@ -111,6 +112,35 @@ class OkapiIndex(BaseIndex): ...@@ -111,6 +112,35 @@ class OkapiIndex(BaseIndex):
# of tf would still be done at Python speed, and it's a lot more # of tf would still be done at Python speed, and it's a lot more
# work than just multiplying by idf. # work than just multiplying by idf.
# The same function as _search_wids above, but with the inner scoring
# loop written in C (module okascore, function score()).
# Cautions: okascore hardcodes the values of K, B1, and the scaled_int
# function.
def _search_wids(self, wids):
from Products.ZCTextIndex.okascore import score
if not wids:
return []
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
#K1 = self.K1
#B = self.B
#K1_plus1 = K1 + 1.0
#B_from1 = 1.0 - B
# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
L = []
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket()
score(result, d2f.items(), docid2len, idf, meandoclen)
L.append((result, 1))
return L
def query_weight(self, terms): def query_weight(self, terms):
# This method was inherited from the cosine measure, and doesn't # This method was inherited from the cosine measure, and doesn't
# make sense for Okapi measures in the way the cosine measure uses # make sense for Okapi measures in the way the cosine measure uses
......
*shared* *shared*
stopper stopper.c stopper stopper.c
okascore okascore.c
/*****************************************************************************
Copyright (c) 2002 Zope Corporation and Contributors.
All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
/* okascore.c
*
* The inner scoring loop of OkapiIndex._search_wids() coded in C.
*
* With the Python scoring loop,
*
* query: python
* # results: 10 of 19056 in 534.77 ms
* query: python
* # results: 10 of 19056 in 277.52 ms
*
* The first timing is cold, the second timing from an immediate repeat of
* the same query. With the timing loop here in C:
*
* query: python
* # results: 10 of 19056 in 380.74 ms -- 40% speedup
* query: python
* # results: 10 of 19056 in 118.96 ms -- 133% speedup
*/
#include "Python.h"
#define K1 1.2
#define B 0.75
static PyObject *
score(PyObject *self, PyObject *args)
{
/* Believe it or not, floating these common subexpressions "by hand"
gets better code out of MSVC 6. */
const double B_FROM1 = 1.0 - B;
const double K1_PLUS1 = K1 + 1.0;
/* Inputs */
PyObject *result; /* IIBucket result, maps d to score */
PyObject *d2fitems; /* ._wordinfo[t].items(), maps d to f(d, t) */
PyObject *d2len; /* ._docweight, maps d to # words in d */
double idf; /* inverse doc frequency of t */
double meandoclen; /* average number of words in a doc */
int n, i;
if (!PyArg_ParseTuple(args, "OOOdd:score", &result, &d2fitems, &d2len,
&idf, &meandoclen))
return NULL;
n = PyObject_Length(d2fitems);
for (i = 0; i < n; ++i) {
PyObject *d_and_f; /* d2f[i], a (d, f) pair */
PyObject *d;
double f;
PyObject *doclen; /* ._docweight[d] */
double lenweight;
double tf;
double score;
PyObject *scaled_int;
int status;
d_and_f = PySequence_GetItem(d2fitems, i);
if (d_and_f == NULL)
return NULL;
if (!(PyTuple_Check(d_and_f) &&
PyTuple_Size(d_and_f) == 2)) {
PyErr_SetString(PyExc_TypeError,
"d2fitems must produce 2-item tuples");
Py_DECREF(d_and_f);
return NULL;
}
d = PyTuple_GET_ITEM(d_and_f, 0);
f = (double)PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));
doclen = PyObject_GetItem(d2len, d);
if (doclen == NULL) {
Py_DECREF(d_and_f);
return NULL;
}
lenweight = B_FROM1 + B * PyInt_AsLong(doclen) / meandoclen;
tf = f * K1_PLUS1 / (f + K1 * lenweight);
score = tf * idf;
scaled_int = PyInt_FromLong((long)(score * 1024.0 + 0.5));
status = PyObject_SetItem(result, d, scaled_int);
Py_DECREF(d_and_f);
Py_DECREF(doclen);
Py_DECREF(scaled_int);
if (status < 0)
return NULL;
}
Py_INCREF(Py_None);
return Py_None;
}
static char score__doc__[] =
"score(result, d2fitems, d2len, idf, meandoclen)\n"
"\n"
"Do the inner scoring loop for an Okapi index.\n";
static PyMethodDef okascore_functions[] = {
{"score", score, METH_VARARGS, score__doc__},
{NULL}
};
void
initokascore(void)
{
PyObject *m;
m = Py_InitModule3("okascore", okascore_functions,
"inner scoring loop for Okapi rank");
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment