Commit 86fc53ee authored by Tim Peters

Reindex docs touching as few docid->w(docid, w) maps as possible.

parent bad257b8
@@ -19,6 +19,7 @@ import math
 from BTrees.IOBTree import IOBTree
 from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
+from BTrees.IIBTree import intersection, difference
 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex import WidCode
@@ -91,8 +92,7 @@ class BaseIndex(Persistent):
     # A subclass may wish to extend or override this.
     def index_doc(self, docid, text):
         if self._docwords.has_key(docid):
-            # XXX Do something smarter than this.
-            self.unindex_doc(docid)
+            return self._reindex_doc(docid, text)
         wids = self._lexicon.sourceToWordIds(text)
         wid2weight, docweight = self._get_frequencies(wids)
         for wid, weight in wid2weight.items():
@@ -101,6 +101,45 @@ class BaseIndex(Persistent):
         self._docwords[docid] = WidCode.encode(wids)
         return len(wids)
 
+    # A subclass may wish to extend or override this.  This is for adjusting
+    # to a new version of a doc that already exists.  The goal is to be
+    # faster than simply unindexing the old version in its entirety and then
+    # adding the new version in its entirety.
+    def _reindex_doc(self, docid, text):
+        # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
+        old_wids = self.get_words(docid)
+        old_wid2w, old_docw = self._get_frequencies(old_wids)
+
+        new_wids = self._lexicon.sourceToWordIds(text)
+        new_wid2w, new_docw = self._get_frequencies(new_wids)
+
+        old_widset = IITreeSet(old_wid2w.keys())
+        new_widset = IITreeSet(new_wid2w.keys())
+
+        in_both_widset = intersection(old_widset, new_widset)
+        only_old_widset = difference(old_widset, in_both_widset)
+        only_new_widset = difference(new_widset, in_both_widset)
+        del old_widset, new_widset
+
+        for wid in only_old_widset.keys():
+            self._del_wordinfo(wid, docid)
+
+        for wid in only_new_widset.keys():
+            self._add_wordinfo(wid, new_wid2w[wid], docid)
+
+        for wid in in_both_widset.keys():
+            # For the Okapi indexer, the "if" will trigger only for words
+            # whose counts have changed.  For the cosine indexer, the "if"
+            # may trigger for every wid, since W(d) probably changed and
+            # W(d) is divided into every score.
+            newscore = new_wid2w[wid]
+            if old_wid2w[wid] != newscore:
+                self._add_wordinfo(wid, newscore, docid)
+
+        self._docweight[docid] = new_docw
+        self._docwords[docid] = WidCode.encode(new_wids)
+        return len(new_wids)
+
     # Subclass must override.
     def _get_frequencies(self, wids):
         # Compute term frequencies and a doc weight, whatever those mean
...
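For reference, the three groups that _reindex_doc walks (wids to delete, wids to add, wids whose score may need updating) come from the BTrees.IIBTree set operations imported at the top of the file. A small standalone sketch, using made-up wid values, of how intersection and difference partition an old and a new wid set:

    from BTrees.IIBTree import IITreeSet, intersection, difference

    # Made-up word ids for the old and new versions of one document.
    old_widset = IITreeSet([10, 11, 12, 13])
    new_widset = IITreeSet([12, 13, 14])

    in_both_widset = intersection(old_widset, new_widset)     # scores may need updating
    only_old_widset = difference(old_widset, in_both_widset)  # remove from ._wordinfo
    only_new_widset = difference(new_widset, in_both_widset)  # add to ._wordinfo

    print(list(in_both_widset.keys()))    # [12, 13]
    print(list(only_old_widset.keys()))   # [10, 11]
    print(list(only_new_widset.keys()))   # [14]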
@@ -54,6 +54,11 @@ class OkapiIndex(BaseIndex):
     def index_doc(self, docid, text):
         count = BaseIndex.index_doc(self, docid, text)
         self._totaldoclen += count
+        return count
+
+    def _reindex_doc(self, docid, text):
+        self._totaldoclen -= self._docweight[docid]
+        return BaseIndex._reindex_doc(self, docid, text)
 
     def unindex_doc(self, docid):
         self._totaldoclen -= self._docweight[docid]
...
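The Okapi override above works together with index_doc: _docweight[docid] holds the document length for this index (unindex_doc subtracts it from _totaldoclen the same way), BaseIndex.index_doc dispatches an existing docid to _reindex_doc, and OkapiIndex.index_doc then adds the returned count back to _totaldoclen. A toy sketch of that bookkeeping, using made-up classes rather than the real ZCTextIndex ones:

    # Toy classes, not the ZCTextIndex API, mirroring the reindex control flow.
    class ToyBase:
        def __init__(self):
            self.docweight = {}          # docid -> doc length

        def index_doc(self, docid, words):
            if docid in self.docweight:
                return self._reindex_doc(docid, words)
            self.docweight[docid] = len(words)
            return len(words)

        def _reindex_doc(self, docid, words):
            self.docweight[docid] = len(words)
            return len(words)

    class ToyOkapi(ToyBase):
        def __init__(self):
            ToyBase.__init__(self)
            self.totaldoclen = 0

        def index_doc(self, docid, words):
            count = ToyBase.index_doc(self, docid, words)
            self.totaldoclen += count    # new length added here, reindex or not
            return count

        def _reindex_doc(self, docid, words):
            # Drop the old length; index_doc adds the new one when this returns.
            self.totaldoclen -= self.docweight[docid]
            return ToyBase._reindex_doc(self, docid, words)

    index = ToyOkapi()
    index.index_doc(1, "one two three".split())   # totaldoclen == 3
    index.index_doc(1, "four five".split())       # reindex: totaldoclen == 2
    assert index.totaldoclen == sum(index.docweight.values())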
@@ -142,18 +142,29 @@ class CosineIndexTests(ZCIndexTestsBase, testIndex.CosineIndexTest):
     def testRanking(self):
         self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
                       "pease", "porridge", "pot"]
+        self.docs = ["Pease porridge hot, pease porridge cold,",
+                     "Pease porridge in the pot,",
+                     "Nine days old.",
+                     "In the pot cold, in the pot hot,",
+                     "Pease porridge, pease porridge,",
+                     "Eat the lot."]
         self._ranking_index()
         self._ranking_tf()
         self._ranking_idf()
         self._ranking_queries()
 
+        # A digression to exercise re-indexing.  This should leave
+        # things exactly as they were.
+        docs = self.docs
+        for variant in ("hot cold porridge python", "pease hot pithy ",
+                        docs[-1]):
+            self.zc_index.index_object(len(docs), Indexable(variant))
+        self._ranking_tf()
+        self._ranking_idf()
+        self._ranking_queries()
+
     def _ranking_index(self):
-        docs = ["Pease porridge hot, pease porridge cold,",
-                "Pease porridge in the pot,",
-                "Nine days old.",
-                "In the pot cold, in the pot hot,",
-                "Pease porridge, pease porridge,",
-                "Eat the lot."]
+        docs = self.docs
         for i in range(len(docs)):
             self.zc_index.index_object(i + 1, Indexable(docs[i]))
@@ -220,6 +231,12 @@ class OkapiIndexTests(ZCIndexTestsBase, testIndex.OkapiIndexTest):
                 "one two three"]
         for i in range(len(docs)):
             self.zc_index.index_object(i + 1, Indexable(docs[i]))
+
+        # A brief digression to exercise re-indexing.  This should leave
+        # things exactly as they were.
+        for variant in "one xyz", "xyz two three", "abc def", docs[-1]:
+            self.zc_index.index_object(len(docs), Indexable(variant))
+
         self.assertEqual(self.index._totaldoclen, 6)
         # So the mean doc length is 2.  We use that later.
...