Commit 79b99fbb
Authored May 15, 2002 by Tim Peters
Parent: aee22894

    More comments and small cleanups & rearrangements.  No semantic changes
    (unless I erred ...).

Showing 1 changed file with 72 additions and 51 deletions:

    lib/python/Products/ZCTextIndex/OkapiIndex.py  (+72, -51)
@@ -56,42 +56,45 @@ class Index:
     def __init__(self, lexicon):
         self._lexicon = lexicon
 
         # wid -> {docid -> frequency}; t -> D -> f(D, t)
         self._wordinfo = IOBTree()
 
         # docid -> # of words in the doc
-        # XXX this is just len(self._docwords[docid]), but if _docwords
-        # XXX is stored in compressed form then uncompressing just to count
-        # XXX the list length would be ridiculously expensive.
+        # This is just len(self._docwords[docid]), but _docwords is stored
+        # in compressed form, so uncompressing it just to count the list
+        # length would be ridiculously expensive.
         self._doclen = IIBTree()
 
-        # docid -> [ wid ]
-        # used for un-indexing
-        self._docwords = IOBTree()
-
         # sum(self._doclen.values()), the total # of words in all docs
+        # This is a long for "better safe than sorry" reasons.  It isn't
+        # used often enough that speed should matter.
         self._totaldoclen = 0L
 
+        # docid -> WidCode'd list of wids
+        # Used for un-indexing, and for phrase search.
+        self._docwords = IOBTree()
+
     def length(self):
         """Return the number of documents in the index."""
         return len(self._docwords)
 
+    # Most of the computation for computing a relevance score for the
+    # document occurs in the search() method.
+
     def index_doc(self, docid, text):
         wids = self._lexicon.sourceToWordIds(text)
         self._doclen[docid] = len(wids)
         self._totaldoclen += len(wids)
         wid2count = self._get_frequencies(wids)
         for wid, count in wid2count.items():
             self._add_wordinfo(wid, count, docid)
-        self._add_undoinfo(docid, wids)
+        self._docwords[docid] = WidCode.encode(wids)
 
     def unindex_doc(self, docid):
-        for wid in self._get_undoinfo(docid):
+        for wid in WidCode.decode(self._docwords[docid]):
             self._del_wordinfo(wid, docid)
         del self._docwords[docid]
         count = self._doclen[docid]
         del self._doclen[docid]
         self._totaldoclen -= count
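With this hunk, index_doc() writes the WidCode'd wid list straight into
_docwords, and unindex_doc() decodes it in place of the old
_add_undoinfo()/_get_undoinfo() indirection (those helpers are deleted in a
later hunk).  The change only preserves behavior because encode/decode
round-trip exactly.  A minimal runnable sketch of that invariant, using a
stand-in 7-bits-per-byte codec (not the real WidCode byte format):

    def encode(wids):
        # stand-in for WidCode.encode: one varint per word id; a set high
        # bit on a byte means "more bytes of this wid follow"
        out = bytearray()
        for w in wids:
            chunks = [w & 0x7F]                 # low chunk, no continuation bit
            w >>= 7
            while w:
                chunks.append((w & 0x7F) | 0x80)
                w >>= 7
            out.extend(reversed(chunks))
        return bytes(out)

    def decode(code):
        # stand-in for WidCode.decode: exact inverse of encode()
        wids, w = [], 0
        for byte in bytearray(code):
            w = (w << 7) | (byte & 0x7F)
            if not byte & 0x80:                 # no continuation bit: wid complete
                wids.append(w)
                w = 0
        return wids

    wids = [10, 1024, 3, 70000]
    assert decode(encode(wids)) == wids         # what unindex_doc() relies on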
@@ -117,27 +120,34 @@ class Index:
             result[docid] = weight
         return result
 
+    # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
+    # for each wid t in wids.  The IIBucket, times the weight, maps D to
+    # TF(D,t) * IDF(t) for every docid D containing t.
+    # As currently written, the weights are always 1, and the IIBucket maps
+    # D to TF(D,t)*IDF(t) directly, where the product is computed as a float
+    # but stored as a scaled_int.
     def _search_wids(self, wids):
         if not wids:
             return []
-        N = float(len(self._doclen))
-        L = []
-        meandoclen = self._totaldoclen / N
+        N = float(len(self._doclen))  # total # of docs
         K1 = self.K1
         B = self.B
         K1_plus1 = K1 + 1.0
         B_from1 = 1.0 - B
+        meandoclen = self._totaldoclen / N
 
         #                           f(D, t) * (k1 + 1)
         #   TF(D, t) =  -------------------------------------------
         #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
 
-        for wid in wids:
-            d2f = self._wordinfo[wid]  # map {docid -> f(docid, wid)}
-            idf = inverse_doc_frequency(len(d2f), N)  # this is an unscaled float
+        L = []
+        docid2len = self._doclen
+        for t in wids:
+            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
+            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
             result = IIBucket()
             for docid, f in d2f.items():
-                lenweight = B_from1 + B * self._doclen[docid] / meandoclen
+                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                 tf = f * K1_plus1 / (f + K1 * lenweight)
                 result[docid] = scaled_int(tf * idf)
             L.append((result, 1))
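To make the TF loop concrete, here is a self-contained sketch of one
iteration, with plain dicts standing in for the BTrees and hypothetical
stand-ins for scaled_int() and inverse_doc_frequency() (neither body is shown
in this hunk, so the SCALE_FACTOR value and the exact IDF formula below are
assumptions); k1=1.2 and b=0.75 are the constants the module docstring names:

    import math

    K1, B = 1.2, 0.75
    SCALE_FACTOR = 1024.0                       # assumed value, for illustration

    def scaled_int(f):
        return int(f * SCALE_FACTOR + 0.5)      # store a float as a scaled int

    def inverse_doc_frequency(term_count, num_items):
        # assumed IDF shape; "an unscaled float", per the comment above
        return math.log(1.0 + num_items / float(term_count))

    d2f = {1: 3, 2: 1}                          # one wid: docid -> f(D, t)
    doclen = {1: 100, 2: 40}                    # docid -> len(D)
    N = float(len(doclen))                      # total # of docs
    meandoclen = sum(doclen.values()) / N       # E(len(D))

    K1_plus1, B_from1 = K1 + 1.0, 1.0 - B
    idf = inverse_doc_frequency(len(d2f), N)
    result = {}
    for docid, f in d2f.items():
        lenweight = B_from1 + B * doclen[docid] / meandoclen
        tf = f * K1_plus1 / (f + K1 * lenweight)
        result[docid] = scaled_int(tf * idf)
    print(result)                               # {1: 1022, 2: 861}

Note how the longer document (100 words vs. a 70-word mean) gets its raw
frequency of 3 damped by the length normalization.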
@@ -153,18 +163,21 @@ class Index:
     # just store scaled_int(tf) in result[docid], and use scaled_int(idf)
     # as an invariant weight across the whole result.  But besides
     # skating near the edge, it's not a speed cure, since the computation
-    # of tf would still done at Python speed, and it's a lot more
+    # of tf would still be done at Python speed, and it's a lot more
     # work than just multiplying by idf.
 
     def query_weight(self, terms):
-        # XXX I have no idea what to put here
-        return 10
+        # This method was inherited from the cosine measure, and doesn't
+        # make sense for Okapi measures in the way the cosine measure uses
+        # it.  See the long comment at the end of the file for how full
+        # Okapi BM25 deals with weighting query terms.
+        return 10   # arbitrary
 
     def _get_frequencies(self, wids):
         """Return individual term frequencies."""
         # Computes f(d, t) for each term.
-        # Returns a dict mapping wid to the number of times wid appeared
-        # in wids, {t: f(d, t)}
+        # Returns a dict mapping wid to the number of times wid appears
+        # in wids, {t -> f(d, t)}
         d = {}
         dget = d.get
         for wid in wids:
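The rest of _get_frequencies falls outside this hunk, but the comment and the
dget = d.get local pin down what it computes; a minimal runnable sketch,
assuming the loop body is the usual d.get counting idiom:

    def get_frequencies(wids):
        # {t -> f(d, t)}: how many times each word id occurs in one document
        d = {}
        dget = d.get                  # local alias, same micro-optimization
        for wid in wids:
            d[wid] = dget(wid, 0) + 1
        return d

    assert get_frequencies([5, 9, 5, 5, 2]) == {5: 3, 9: 1, 2: 1}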
@@ -223,32 +236,6 @@ class Index:
             map = new
         self._wordinfo[wid] = map  # Not redundant, because of Persistency!
 
-    def _add_undoinfo(self, docid, wids):
-        self._docwords[docid] = WidCode.encode(wids)
-
-    def _get_undoinfo(self, docid):
-        return WidCode.decode(self._docwords[docid])
-
-    # The rest are helper methods to support unit tests
-    # XXX These don't work for Okapi, I assume
-
-    def _get_wdt(self, d, t):
-        wid, = self._lexicon.termToWordIds(t)
-        map = self._wordinfo[wid]
-        return map.get(d, 0) * self._doclen[d] / SCALE_FACTOR
-
-    def _get_Wd(self, d):
-        return self._doclen[d]
-
-    def _get_ft(self, t):
-        wid, = self._lexicon.termToWordIds(t)
-        return len(self._wordinfo[wid])
-
-    def _get_wt(self, t):
-        wid, = self._lexicon.termToWordIds(t)
-        map = self._wordinfo[wid]
-        return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
-
 def inverse_doc_frequency(term_count, num_items):
     """Return the inverse doc frequency for a term,
@@ -400,4 +387,38 @@ Putting it all together, the final TF function is
                f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
 
 with k1=1.2 and b=0.75.
+
+Query Term Weighting
+--------------------
+
+I'm ignoring the query adjustment part of Okapi BM25 because I expect our
+queries are very short.  Full BM25 takes them into account by adding the
+following to every score(D, Q); it depends on the lengths of D and Q, but
+not on the specific words in Q, or even on whether they appear in D(!):
+
+                   E(len(D)) - len(D)
+    k2 * len(Q) * -------------------
+                   E(len(D)) + len(D)
+
+Here k2 is another "tuning constant", len(Q) is the number of words in Q, and
+len(D) & E(len(D)) were defined above.  The Okapi group set k2 to 0 in TREC-9,
+so it apparently doesn't do much good (or may even hurt).
+
+Full BM25 *also* multiplies the following factor into IDF(Q, t):
+
+    f(Q, t) * (k3 + 1)
+    ------------------
+       f(Q, t) + k3
+
+where k3 is yet another free parameter, and f(Q,t) is the number of times t
+appears in Q.  Since we're using short "web style" queries, I expect f(Q,t)
+to always be 1, and then that quotient is
+
+    1 * (k3 + 1)
+    ------------ = 1
+       1 + k3
+
+regardless of k3's value.  So, in a trivial sense, we are incorporating
+this measure (and optimizing it by not bothering to multiply by 1 <wink>).
+
 """