Commit 1c157170 authored by Casey Duncan

Reimplemented Index.length to use a BTree.Length. Previous dynamic computation...

Reimplemented Index.length to use a BTrees.Length.Length object. The previous dynamic computation was way too slow for big indexes.
Updated the tests to include length value checks.
parent 8743e46f
...@@ -20,6 +20,7 @@ import math ...@@ -20,6 +20,7 @@ import math
from BTrees.IOBTree import IOBTree from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
from BTrees.IIBTree import intersection, difference from BTrees.IIBTree import intersection, difference
import BTrees.Length
from Products.ZCTextIndex.IIndex import IIndex from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode from Products.ZCTextIndex import WidCode
...@@ -53,6 +54,8 @@ class BaseIndex(Persistent): ...@@ -53,6 +54,8 @@ class BaseIndex(Persistent):
__implements__ = IIndex __implements__ = IIndex
word_count = 0
def __init__(self, lexicon): def __init__(self, lexicon):
self._lexicon = lexicon self._lexicon = lexicon
...@@ -81,12 +84,17 @@ class BaseIndex(Persistent): ...@@ -81,12 +84,17 @@ class BaseIndex(Persistent):
# Used for un-indexing, and for phrase search. # Used for un-indexing, and for phrase search.
self._docwords = IOBTree() self._docwords = IOBTree()
# Use a BTree length for efficient length computation w/o conflicts
self.length = BTrees.Length.Length()
def length(self): def length(self):
"""Return the number of words in the index.""" """Return the number of words in the index."""
# This is overridden per instance
return len(self._wordinfo) return len(self._wordinfo)
def get_words(self, docid): def get_words(self, docid):
"""Return a list of the wordids for a given docid.""" """Return a list of the wordids for a given docid."""
# Note this is overridden in the instance
return WidCode.decode(self._docwords[docid]) return WidCode.decode(self._docwords[docid])
# A subclass may wish to extend or override this. # A subclass may wish to extend or override this.
...@@ -239,6 +247,7 @@ class BaseIndex(Persistent): ...@@ -239,6 +247,7 @@ class BaseIndex(Persistent):
doc2score = self._wordinfo.get(wid) doc2score = self._wordinfo.get(wid)
if doc2score is None: if doc2score is None:
doc2score = {} doc2score = {}
self.length.change(1)
else: else:
# _add_wordinfo() is called for each update. If the map # _add_wordinfo() is called for each update. If the map
# size exceeds the DICT_CUTOFF, convert to an IIBTree. # size exceeds the DICT_CUTOFF, convert to an IIBTree.
...@@ -262,15 +271,19 @@ class BaseIndex(Persistent): ...@@ -262,15 +271,19 @@ class BaseIndex(Persistent):
def _mass_add_wordinfo(self, wid2weight, docid): def _mass_add_wordinfo(self, wid2weight, docid):
dicttype = type({}) dicttype = type({})
get_doc2score = self._wordinfo.get get_doc2score = self._wordinfo.get
new_word_count = 0
for wid, weight in wid2weight.items(): for wid, weight in wid2weight.items():
doc2score = get_doc2score(wid) doc2score = get_doc2score(wid)
if doc2score is None: if doc2score is None:
doc2score = {} doc2score = {}
new_word_count += 1
elif (isinstance(doc2score, dicttype) and elif (isinstance(doc2score, dicttype) and
len(doc2score) == self.DICT_CUTOFF): len(doc2score) == self.DICT_CUTOFF):
doc2score = IIBTree(doc2score) doc2score = IIBTree(doc2score)
doc2score[docid] = weight doc2score[docid] = weight
self._wordinfo[wid] = doc2score # not redundant: Persistency! self._wordinfo[wid] = doc2score # not redundant: Persistency!
self.length.change(new_word_count)
def _del_wordinfo(self, wid, docid): def _del_wordinfo(self, wid, docid):
doc2score = self._wordinfo[wid] doc2score = self._wordinfo[wid]
...@@ -278,6 +291,7 @@ class BaseIndex(Persistent): ...@@ -278,6 +291,7 @@ class BaseIndex(Persistent):
numdocs = len(doc2score) numdocs = len(doc2score)
if numdocs == 0: if numdocs == 0:
del self._wordinfo[wid] del self._wordinfo[wid]
self.length.change(-1)
return return
if numdocs == self.DICT_CUTOFF: if numdocs == self.DICT_CUTOFF:
new = {} new = {}
......
...@@ -37,6 +37,8 @@ class IndexTest(TestCase): ...@@ -37,6 +37,8 @@ class IndexTest(TestCase):
self.assertEqual(len(self.index._wordinfo), 5) self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 5) self.assertEqual(len(self.index.get_words(DOCID)), 5)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
for map in self.index._wordinfo.values(): for map in self.index._wordinfo.values():
self.assertEqual(len(map), 1) self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID)) self.assert_(map.has_key(DOCID))
...@@ -48,6 +50,8 @@ class IndexTest(TestCase): ...@@ -48,6 +50,8 @@ class IndexTest(TestCase):
self.assertEqual(len(self.index._docweight), 0) self.assertEqual(len(self.index._docweight), 0)
self.assertEqual(len(self.index._wordinfo), 0) self.assertEqual(len(self.index._wordinfo), 0)
self.assertEqual(len(self.index._docwords), 0) self.assertEqual(len(self.index._docwords), 0)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
def test_index_two_documents(self): def test_index_two_documents(self):
self.test_index_document() self.test_index_document()
...@@ -59,6 +63,8 @@ class IndexTest(TestCase): ...@@ -59,6 +63,8 @@ class IndexTest(TestCase):
self.assertEqual(len(self.index._wordinfo), 8) self.assertEqual(len(self.index._wordinfo), 8)
self.assertEqual(len(self.index._docwords), 2) self.assertEqual(len(self.index._docwords), 2)
self.assertEqual(len(self.index.get_words(DOCID)), 4) self.assertEqual(len(self.index.get_words(DOCID)), 4)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
wids = self.lexicon.termToWordIds("document") wids = self.lexicon.termToWordIds("document")
self.assertEqual(len(wids), 1) self.assertEqual(len(wids), 1)
document_wid = wids[0] document_wid = wids[0]
...@@ -80,6 +86,8 @@ class IndexTest(TestCase): ...@@ -80,6 +86,8 @@ class IndexTest(TestCase):
self.assertEqual(len(self.index._wordinfo), 4) self.assertEqual(len(self.index._wordinfo), 4)
self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 4) self.assertEqual(len(self.index.get_words(DOCID)), 4)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
for map in self.index._wordinfo.values(): for map in self.index._wordinfo.values():
self.assertEqual(len(map), 1) self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID)) self.assert_(map.has_key(DOCID))
...@@ -91,6 +99,8 @@ class IndexTest(TestCase): ...@@ -91,6 +99,8 @@ class IndexTest(TestCase):
self.assertEqual(len(self.index._wordinfo), 5) self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 7) self.assertEqual(len(self.index.get_words(DOCID)), 7)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
wids = self.lexicon.termToWordIds("repeat") wids = self.lexicon.termToWordIds("repeat")
self.assertEqual(len(wids), 1) self.assertEqual(len(wids), 1)
repititive_wid = wids[0] repititive_wid = wids[0]
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment