Kirill Smelkov / Zope · Commits

Commit 1c1a53d1, authored Jun 19, 2010 by Hanno Schlichting
Parent: 48f67574

Products.ZCTextIndex was moved to its own distribution

Showing 59 changed files with 4 additions and 7278 deletions (+4 / -7278)
buildout.cfg                                           +1    -0
setup.py                                               +2   -13
src/Products/ZCTextIndex/BaseIndex.py                  +0  -321
src/Products/ZCTextIndex/CosineIndex.py                +0  -138
src/Products/ZCTextIndex/HTMLSplitter.py               +0   -55
src/Products/ZCTextIndex/IIndex.py                     +0   -17
src/Products/ZCTextIndex/INBest.py                     +0   -15
src/Products/ZCTextIndex/IPipelineElement.py           +0   -15
src/Products/ZCTextIndex/IPipelineElementFactory.py    +0   -15
src/Products/ZCTextIndex/IQueryParseTree.py            +0   -15
src/Products/ZCTextIndex/IQueryParser.py               +0   -15
src/Products/ZCTextIndex/ISplitter.py                  +0   -15
src/Products/ZCTextIndex/Lexicon.py                    +0  -226
src/Products/ZCTextIndex/NBest.py                      +0   -77
src/Products/ZCTextIndex/OkapiIndex.py                 +0  -366
src/Products/ZCTextIndex/ParseTree.py                  +0  -132
src/Products/ZCTextIndex/PipelineFactory.py            +0   -52
src/Products/ZCTextIndex/QueryParser.py                +0  -255
src/Products/ZCTextIndex/README.txt                    +0  -123
src/Products/ZCTextIndex/RiceCode.py                   +0  -208
src/Products/ZCTextIndex/SETUP.cfg                     +0    -7
src/Products/ZCTextIndex/SetOps.py                     +0   -64
src/Products/ZCTextIndex/Setup                         +0    -3
src/Products/ZCTextIndex/StopDict.py                   +0   -36
src/Products/ZCTextIndex/WidCode.py                    +0  -131
src/Products/ZCTextIndex/ZCTextIndex.py                +0  -405
src/Products/ZCTextIndex/__init__.py                   +0   -62
src/Products/ZCTextIndex/dtml/addLexicon.dtml          +0   -77
src/Products/ZCTextIndex/dtml/addZCTextIndex.dtml      +0   -93
src/Products/ZCTextIndex/dtml/manageLexicon.dtml       +0   -24
src/Products/ZCTextIndex/dtml/manageZCTextIndex.dtml   +0   -26
src/Products/ZCTextIndex/dtml/queryLexicon.dtml        +0   -71
src/Products/ZCTextIndex/help/Lexicon_Add.stx          +0   -37
src/Products/ZCTextIndex/help/ZCTextIndex_Add.stx      +0   -39
src/Products/ZCTextIndex/interfaces.py                 +0  -329
src/Products/ZCTextIndex/okascore.c                    +0  -131
src/Products/ZCTextIndex/stopper.c                     +0   -78
src/Products/ZCTextIndex/tests/__init__.py             +0   -14
src/Products/ZCTextIndex/tests/hs-tool.py              +0  -129
src/Products/ZCTextIndex/tests/indexhtml.py            +0  -156
src/Products/ZCTextIndex/tests/mailtest.py             +0  -288
src/Products/ZCTextIndex/tests/mhindex.py              +0  -601
src/Products/ZCTextIndex/tests/python.txt              +0  -114
src/Products/ZCTextIndex/tests/queryhtml.py            +0  -117
src/Products/ZCTextIndex/tests/testHTMLSplitter.py     +0   -77
src/Products/ZCTextIndex/tests/testIndex.py            +0  -290
src/Products/ZCTextIndex/tests/testLexicon.py          +0  -231
src/Products/ZCTextIndex/tests/testNBest.py            +0   -89
src/Products/ZCTextIndex/tests/testParseTree.py        +0   -59
src/Products/ZCTextIndex/tests/testPipelineFactory.py  +0   -51
src/Products/ZCTextIndex/tests/testQueryEngine.py      +0   -72
src/Products/ZCTextIndex/tests/testQueryParser.py      +0  -359
src/Products/ZCTextIndex/tests/testSetOps.py           +0  -135
src/Products/ZCTextIndex/tests/testStopper.py          +0   -47
src/Products/ZCTextIndex/tests/testZCTextIndex.py      +0  -718
src/Products/ZCTextIndex/tests/wordstats.py            +0   -45
src/Products/ZCTextIndex/www/index.gif                 +0    -0
src/Products/ZCTextIndex/www/lexicon.gif               +0    -0
versions.cfg                                           +1    -0
buildout.cfg
...
@@ -44,6 +44,7 @@ eggs =
     Missing
     MultiMapping
     Persistence
+    Products.ZCTextIndex
     Record
     RestrictedPython
     initgroups
...
setup.py
...
@@ -13,7 +13,7 @@
 ##############################################################################
 import os
-from setuptools import setup, find_packages, Extension
+from setuptools import setup, find_packages
 setup(name='Zope2',
...
@@ -29,18 +29,6 @@ setup(name='Zope2',
       packages=find_packages('src'),
       namespace_packages=['Products'],
       package_dir={'': 'src'},
-      ext_modules=[
-          # indexes
-          Extension(name='Products.ZCTextIndex.stopper',
-                    sources=['src/Products/ZCTextIndex/stopper.c']),
-          Extension(name='Products.ZCTextIndex.okascore',
-                    sources=['src/Products/ZCTextIndex/okascore.c']),
-      ],
       install_requires=[
         'AccessControl',
         'Acquisition',
...
@@ -50,6 +38,7 @@ setup(name='Zope2',
         'Missing',
         'MultiMapping',
         'Persistence',
+        'Products.ZCTextIndex',
         'Record',
         'RestrictedPython',
         'ZConfig',
...
src/Products/ZCTextIndex/BaseIndex.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Abstract base class for full text index with relevance ranking."""

import math

from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree
from BTrees.IIBTree import IIBucket
from BTrees.IIBTree import IITreeSet
from BTrees.IIBTree import difference
from BTrees.IIBTree import intersection
from BTrees.Length import Length
from Persistence import Persistent
from zope.interface import implements

from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.interfaces import IIndex
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion

# Instead of storing floats, we generally store scaled ints.  Binary pickles
# can store those more efficiently.  The default SCALE_FACTOR of 1024
# is large enough to get about 3 decimal digits of fractional info, and
# small enough so that scaled values should almost always fit in a signed
# 16-bit int (we're generally storing logs, so a few bits before the radix
# point goes a long way; on the flip side, for reasonably small numbers x
# most of the info in log(x) is in the fractional bits, so we do want to
# save a lot of those).
SCALE_FACTOR = 1024.0

def scaled_int(f, scale=SCALE_FACTOR):
    # We expect only positive inputs, so "add a half and chop" is the
    # same as round().  Surprisingly, calling round() is significantly more
    # expensive.
    return int(f * scale + 0.5)

def unique(L):
    """Return a list of the unique elements in L."""
    return IITreeSet(L).keys()

class BaseIndex(Persistent):

    implements(IIndex)

    def __init__(self, lexicon):
        self._lexicon = lexicon

        # wid -> {docid -> weight}; t -> D -> w(D, t)
        # Different indexers have different notions of term weight, but we
        # expect each indexer to use ._wordinfo to map wids to its notion
        # of a docid-to-weight map.
        # There are two kinds of OOV words:  wid 0 is explicitly OOV,
        # and it's possible that the lexicon will return a non-zero wid
        # for a word we don't currently know about.  For example, if we
        # unindex the last doc containing a particular word, that wid
        # remains in the lexicon, but is no longer in our _wordinfo map;
        # lexicons can also be shared across indices, and some other index
        # may introduce a lexicon word we've never seen.
        # A word is in-vocabulary for this index if and only if
        # _wordinfo.has_key(wid).  Note that wid 0 must not be a key.
        self._wordinfo = IOBTree()

        # docid -> weight
        # Different indexers have different notions of doc weight, but we
        # expect each indexer to use ._docweight to map docids to its
        # notion of what a doc weight is.
        self._docweight = IIBTree()

        # docid -> WidCode'd list of wids
        # Used for un-indexing, and for phrase search.
        self._docwords = IOBTree()

        # Use a BTree length for efficient length computation w/o conflicts
        self.length = Length()
        self.document_count = Length()

    def length(self):
        """Return the number of words in the index."""
        # This is overridden per instance
        return len(self._wordinfo)

    def document_count(self):
        """Return the number of documents in the index"""
        # This is overridden per instance
        return len(self._docweight)

    def get_words(self, docid):
        """Return a list of the wordids for a given docid."""
        # Note this is overridden in the instance
        return WidCode.decode(self._docwords[docid])

    # A subclass may wish to extend or override this.
    def index_doc(self, docid, text):
        if self._docwords.has_key(docid):
            return self._reindex_doc(docid, text)
        wids = self._lexicon.sourceToWordIds(text)
        wid2weight, docweight = self._get_frequencies(wids)
        self._mass_add_wordinfo(wid2weight, docid)
        self._docweight[docid] = docweight
        self._docwords[docid] = WidCode.encode(wids)
        try:
            self.document_count.change(1)
        except AttributeError:
            # Upgrade document_count to Length object
            self.document_count = Length(self.document_count())
        return len(wids)

    # A subclass may wish to extend or override this.  This is for adjusting
    # to a new version of a doc that already exists.  The goal is to be
    # faster than simply unindexing the old version in its entirety and then
    # adding the new version in its entirety.
    def _reindex_doc(self, docid, text):
        # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
        old_wids = self.get_words(docid)
        old_wid2w, old_docw = self._get_frequencies(old_wids)

        new_wids = self._lexicon.sourceToWordIds(text)
        new_wid2w, new_docw = self._get_frequencies(new_wids)

        old_widset = IITreeSet(old_wid2w.keys())
        new_widset = IITreeSet(new_wid2w.keys())

        in_both_widset = intersection(old_widset, new_widset)
        only_old_widset = difference(old_widset, in_both_widset)
        only_new_widset = difference(new_widset, in_both_widset)
        del old_widset, new_widset

        for wid in only_old_widset.keys():
            self._del_wordinfo(wid, docid)

        for wid in only_new_widset.keys():
            self._add_wordinfo(wid, new_wid2w[wid], docid)

        for wid in in_both_widset.keys():
            # For the Okapi indexer, the "if" will trigger only for words
            # whose counts have changed.  For the cosine indexer, the "if"
            # may trigger for every wid, since W(d) probably changed and
            # W(d) is divided into every score.
            newscore = new_wid2w[wid]
            if old_wid2w[wid] != newscore:
                self._add_wordinfo(wid, newscore, docid)

        self._docweight[docid] = new_docw
        self._docwords[docid] = WidCode.encode(new_wids)
        return len(new_wids)

    # Subclass must override.
    def _get_frequencies(self, wids):
        # Compute term frequencies and a doc weight, whatever those mean
        # to an indexer.
        # Return pair:
        #    {wid0: w(d, wid0), wid1: w(d, wid1), ...],
        #    docweight
        # The wid->weight mappings are fed into _add_wordinfo, and docweight
        # becomes the value of _docweight[docid].
        raise NotImplementedError

    def has_doc(self, docid):
        return self._docwords.has_key(docid)

    # A subclass may wish to extend or override this.
    def unindex_doc(self, docid):
        for wid in unique(self.get_words(docid)):
            self._del_wordinfo(wid, docid)
        del self._docwords[docid]
        del self._docweight[docid]
        try:
            self.document_count.change(-1)
        except AttributeError:
            # Upgrade document_count to Length object
            self.document_count = Length(self.document_count())

    def search(self, term):
        wids = self._lexicon.termToWordIds(term)
        if not wids:
            return None  # All docs match
        wids = self._remove_oov_wids(wids)
        return mass_weightedUnion(self._search_wids(wids))

    def search_glob(self, pattern):
        wids = self._lexicon.globToWordIds(pattern)
        wids = self._remove_oov_wids(wids)
        return mass_weightedUnion(self._search_wids(wids))

    def search_phrase(self, phrase):
        wids = self._lexicon.termToWordIds(phrase)
        cleaned_wids = self._remove_oov_wids(wids)
        if len(wids) != len(cleaned_wids):
            # At least one wid was OOV:  can't possibly find it.
            return IIBTree()
        scores = self._search_wids(wids)
        hits = mass_weightedIntersection(scores)
        if not hits:
            return hits
        code = WidCode.encode(wids)
        result = IIBTree()
        for docid, weight in hits.items():
            docwords = self._docwords[docid]
            if docwords.find(code) >= 0:
                result[docid] = weight
        return result

    def _remove_oov_wids(self, wids):
        return filter(self._wordinfo.has_key, wids)

    # Subclass must override.
    # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
    # for each wid t in wids.  The IIBucket, times the weight, maps D to
    # TF(D,t) * IDF(t) for every docid D containing t.  wids must not
    # contain any OOV words.
    def _search_wids(self, wids):
        raise NotImplementedError

    # Subclass must override.
    # It's not clear what it should do.  It must return an upper bound on
    # document scores for the query.  It would be nice if a document score
    # divided by the query's query_weight gave the probability that a
    # document was relevant, but nobody knows how to do that.  For
    # CosineIndex, the ratio is the cosine of the angle between the document
    # and query vectors.  For OkapiIndex, the ratio is a (probably
    # unachievable) upper bound with no "intuitive meaning" beyond that.
    def query_weight(self, terms):
        raise NotImplementedError

    DICT_CUTOFF = 10

    def _add_wordinfo(self, wid, f, docid):
        # Store a wordinfo in a dict as long as there are less than
        # DICT_CUTOFF docids in the dict.  Otherwise use an IIBTree.

        # The pickle of a dict is smaller than the pickle of an
        # IIBTree, substantially so for small mappings.  Thus, we use
        # a dictionary until the mapping reaches DICT_CUTOFF elements.

        # The cutoff is chosen based on the implementation
        # characteristics of Python dictionaries.  The dict hashtable
        # always has 2**N slots and is resized whenever it is 2/3s
        # full.  A pickled dict with 10 elts is half the size of an
        # IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4.  So
        # choose 10 as the cutoff for now.

        # The IIBTree has a smaller in-memory representation than a
        # dictionary, so pickle size isn't the only consideration when
        # choosing the threshold.  The pickle of a 500-elt dict is 92%
        # of the size of the same IIBTree, but the dict uses more
        # space when it is live in memory.  An IIBTree stores two C
        # arrays of ints, one for the keys and one for the values.  It
        # holds up to 120 key-value pairs in a single bucket.
        doc2score = self._wordinfo.get(wid)
        if doc2score is None:
            doc2score = {}
            self.length.change(1)
        else:
            # _add_wordinfo() is called for each update.  If the map
            # size exceeds the DICT_CUTOFF, convert to an IIBTree.
            # Obscure:  First check the type.  If it's not a dict, it
            # can't need conversion, and then we can avoid an expensive
            # len(IIBTree).
            if (isinstance(doc2score, type({}))
                and len(doc2score) == self.DICT_CUTOFF):
                doc2score = IIBTree(doc2score)
        doc2score[docid] = f
        self._wordinfo[wid] = doc2score  # not redundant:  Persistency!

    #    self._mass_add_wordinfo(wid2weight, docid)
    #
    # is the same as
    #
    #    for wid, weight in wid2weight.items():
    #        self._add_wordinfo(wid, weight, docid)
    #
    # except that _mass_add_wordinfo doesn't require so many function calls.
    def _mass_add_wordinfo(self, wid2weight, docid):
        dicttype = type({})
        get_doc2score = self._wordinfo.get
        new_word_count = 0
        for wid, weight in wid2weight.items():
            doc2score = get_doc2score(wid)
            if doc2score is None:
                doc2score = {}
                new_word_count += 1
            elif (isinstance(doc2score, dicttype)
                  and len(doc2score) == self.DICT_CUTOFF):
                doc2score = IIBTree(doc2score)
            doc2score[docid] = weight
            self._wordinfo[wid] = doc2score  # not redundant:  Persistency!
        self.length.change(new_word_count)

    def _del_wordinfo(self, wid, docid):
        doc2score = self._wordinfo[wid]
        del doc2score[docid]
        if doc2score:
            self._wordinfo[wid] = doc2score  # not redundant:  Persistency!
        else:
            del self._wordinfo[wid]
            self.length.change(-1)

def inverse_doc_frequency(term_count, num_items):
    """Return the inverse doc frequency for a term,

    that appears in term_count items in a collection with num_items
    total items.
    """
    # implements IDF(q, t) = log(1 + N/f(t))
    return math.log(1.0 + float(num_items) / term_count)
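Aside (illustrative, not part of the commit): the scaled-int convention and the IDF formula above can be checked by hand with a minimal sketch that repeats only the two module-level helpers just shown.

    import math

    SCALE_FACTOR = 1024.0

    def scaled_int(f, scale=SCALE_FACTOR):
        # positive inputs only: "add a half and chop" == round()
        return int(f * scale + 0.5)

    def inverse_doc_frequency(term_count, num_items):
        # IDF(q, t) = log(1 + N/f(t))
        return math.log(1.0 + float(num_items) / term_count)

    # a term appearing in 10 of 1000 documents:
    idf = inverse_doc_frequency(10, 1000)  # log(101) ~ 4.615
    print(scaled_int(idf))                 # 4726 == int(4.615 * 1024 + 0.5)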
src/Products/ZCTextIndex/CosineIndex.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using a cosine measure."""

import math

from BTrees.IIBTree import IIBucket
from zope.interface import implements

from Products.ZCTextIndex.interfaces import IIndex
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.BaseIndex import inverse_doc_frequency
from Products.ZCTextIndex.BaseIndex import scaled_int
from Products.ZCTextIndex.BaseIndex import SCALE_FACTOR

class CosineIndex(BaseIndex):

    implements(IIndex)

    def __init__(self, lexicon):
        BaseIndex.__init__(self, lexicon)

        # ._wordinfo for cosine is wid -> {docid -> weight};
        # t -> D -> w(d, t)/W(d)

        # ._docweight for cosine is
        # docid -> W(docid)

    # Most of the computation for computing a relevance score for the
    # document occurs in the _search_wids() method.  The code currently
    # implements the cosine similarity function described in Managing
    # Gigabytes, eq. 4.3, p. 187.  The index_object() method
    # precomputes some values that are independent of the particular
    # query.
    #
    # The equation is
    #
    #                 sum(for t in I(d,q): w(d,t) * w(q,t))
    # cosine(d, q) =  -------------------------------------
    #                             W(d) * W(q)
    #
    # where
    #    I(d, q) = the intersection of the terms in d and q.
    #
    #    w(d, t) = 1 + log f(d, t)
    #        computed by doc_term_weight(); for a given word t,
    #        self._wordinfo[t] is a map from d to w(d, t).
    #
    #    w(q, t) = log(1 + N/f(t))
    #        computed by inverse_doc_frequency()
    #
    #    W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
    #        computed by _get_frequencies(), and remembered in
    #        self._docweight[d]
    #
    #    W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
    #        computed by self.query_weight()

    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())
        L = []
        DictType = type({})
        for wid in wids:
            assert self._wordinfo.has_key(wid)  # caller responsible for OOV
            d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
            idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
            #print "idf = %.3f" % idf
            if isinstance(d2w, DictType):
                d2w = IIBucket(d2w)
            L.append((d2w, scaled_int(idf)))
        return L

    def query_weight(self, terms):
        wids = []
        for term in terms:
            wids += self._lexicon.termToWordIds(term)
        N = float(self.document_count())
        sum = 0.0
        for wid in self._remove_oov_wids(wids):
            wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
            sum += wt ** 2.0
        return scaled_int(math.sqrt(sum))

    def _get_frequencies(self, wids):
        d = {}
        dget = d.get
        for wid in wids:
            d[wid] = dget(wid, 0) + 1
        Wsquares = 0.0
        for wid, count in d.items():
            w = doc_term_weight(count)
            Wsquares += w * w
            d[wid] = w
        W = math.sqrt(Wsquares)
        #print "W = %.3f" % W
        for wid, weight in d.items():
            #print i, ":", "%.3f" % weight,
            d[wid] = scaled_int(weight / W)
            #print "->", d[wid]
        return d, scaled_int(W)

    # The rest are helper methods to support unit tests

    def _get_wdt(self, d, t):
        wid, = self._lexicon.termToWordIds(t)
        map = self._wordinfo[wid]
        return map.get(d, 0) * self._docweight[d] / SCALE_FACTOR

    def _get_Wd(self, d):
        return self._docweight[d]

    def _get_ft(self, t):
        wid, = self._lexicon.termToWordIds(t)
        return len(self._wordinfo[wid])

    def _get_wt(self, t):
        wid, = self._lexicon.termToWordIds(t)
        map = self._wordinfo[wid]
        return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))

def doc_term_weight(count):
    """Return the doc-term weight for a term that appears count times."""
    # implements w(d, t) = 1 + log f(d, t)
    return 1.0 + math.log(count)
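Worked example (not from the commit): tracing _get_frequencies() by hand for a toy document in which a hypothetical wid 1 occurs three times and wid 2 once.

    import math

    w1 = 1.0 + math.log(3)        # doc_term_weight(3) ~ 2.099
    w2 = 1.0 + math.log(1)        # doc_term_weight(1) == 1.0
    W = math.sqrt(w1*w1 + w2*w2)  # W(d) ~ 2.325
    # stored per-wid weights are scaled_int(w/W):
    #   wid 1 -> int(w1/W * 1024 + 0.5) == 924
    #   wid 2 -> int(w2/W * 1024 + 0.5) == 440
    # and the returned docweight is scaled_int(W) == 2380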
src/Products/ZCTextIndex/HTMLSplitter.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
import re

from zope.interface import implements

from Products.ZCTextIndex.interfaces import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

class HTMLWordSplitter:

    implements(ISplitter)

    def process(self, text, wordpat=r"(?L)\w+"):
        splat = []
        for t in text:
            splat += self._split(t, wordpat)
        return splat

    def processGlob(self, text):
        # see Lexicon.globToWordIds()
        return self.process(text, r"(?L)\w+[\w*?]*")

    def _split(self, text, wordpat):
        text = text.lower()
        remove = [r"<[^<>]*>",
                  r"&[A-Za-z]+;"]
        for pat in remove:
            text = re.sub(pat, " ", text)
        return re.findall(wordpat, text)

element_factory.registerFactory('Word Splitter',
                                'HTML aware splitter',
                                HTMLWordSplitter)

if __name__ == "__main__":
    import sys
    splitter = HTMLWordSplitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        buf = f.read()
        f.close()
        print path
        print splitter.process([buf])
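Illustration (not part of the commit): the effect of _split() can be reproduced standalone; the regular expressions below are copied from the class above (Python 2, like the rest of this code).

    import re

    def split_html(text, wordpat=r"(?L)\w+"):
        # mirror of HTMLWordSplitter._split(): lowercase, blank out
        # tags and character entities, then pick off the words
        text = text.lower()
        for pat in (r"<[^<>]*>", r"&[A-Za-z]+;"):
            text = re.sub(pat, " ", text)
        return re.findall(wordpat, text)

    print(split_html("<p>Hello&nbsp;World</p>"))  # ['hello', 'world']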
src/Products/ZCTextIndex/IIndex.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index Interface."""

from Products.ZCTextIndex.interfaces import IIndex # BBB
src/Products/ZCTextIndex/INBest.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import INBest # BBB
src/Products/ZCTextIndex/IPipelineElement.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElement # BBB
src/Products/ZCTextIndex/IPipelineElementFactory.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElementFactory # BBB
src/Products/ZCTextIndex/IQueryParseTree.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElementFactory # BBB
src/Products/ZCTextIndex/IQueryParser.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import IQueryParser # BBB
src/Products/ZCTextIndex/ISplitter.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import ISplitter # BBB
src/Products/ZCTextIndex/Lexicon.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon.

$Id$
"""
import re

from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.Length import Length
from Persistence import Persistent
from zope.interface import implements

from Products.ZCTextIndex.interfaces import ILexicon
from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import QueryError
from Products.ZCTextIndex.PipelineFactory import element_factory

class Lexicon(Persistent):

    implements(ILexicon)

    def __init__(self, *pipeline):
        self._wids = OIBTree()   # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self.length = Length()
        self._pipeline = pipeline

    def length(self):
        """Return the number of unique terms in the lexicon."""
        # Overridden in instances
        return len(self._wids)

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not hasattr(self.length, 'change'):
            # Make sure length is overridden with a BTrees.Length.Length
            self.length = Length(self.length())
        # Strategically unload the length value so that we get the most
        # recent value written to the database to minimize conflicting wids
        # Because length is independent, this will load the most
        # recent value stored, regardless of whether MVCC is enabled
        self.length._p_deactivate()
        return map(self._getWordIdCreate, last)

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "process_post_glob", element.process)
            last = process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too efficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        self.length.change(1)
        while self._words.has_key(self.length()):
            # just to be safe
            self.length.change(1)
        return self.length()

def _text2list(text):
    # Helper: splitter input may be a string or a list of strings
    try:
        text + ""
    except:
        return text
    else:
        return [text]

# Sample pipeline elements

class Splitter:

    import re
    rx = re.compile(r"(?L)\w+")
    rxGlob = re.compile(r"(?L)\w+[\w*?]*")  # See globToWordIds() above

    def process(self, lst):
        result = []
        for s in lst:
            result += self.rx.findall(s)
        return result

    def processGlob(self, lst):
        result = []
        for s in lst:
            result += self.rxGlob.findall(s)
        return result

element_factory.registerFactory('Word Splitter',
                                'Whitespace splitter',
                                Splitter)

class CaseNormalizer:

    def process(self, lst):
        return [w.lower() for w in lst]

element_factory.registerFactory('Case Normalizer',
                                'Case Normalizer',
                                CaseNormalizer)

element_factory.registerFactory('Stop Words',
                                ' Don\'t remove stop words',
                                None)

class StopWordRemover:

    dict = get_stopdict().copy()

    try:
        from Products.ZCTextIndex.stopper import process as _process
    except ImportError:
        def process(self, lst):
            has_key = self.dict.has_key
            return [w for w in lst if not has_key(w)]
    else:
        def process(self, lst):
            return self._process(self.dict, lst)

element_factory.registerFactory('Stop Words',
                                'Remove listed stop words only',
                                StopWordRemover)

class StopWordAndSingleCharRemover(StopWordRemover):

    dict = get_stopdict().copy()
    for c in range(255):
        dict[chr(c)] = None

element_factory.registerFactory('Stop Words',
                                'Remove listed and single char words',
                                StopWordAndSingleCharRemover)
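Aside (illustrative, not from the commit): globToWordIds() compiles the glob into an anchored regex and then scans lexicon words sharing the literal prefix. The translation loop can be mirrored standalone.

    import re

    def glob_to_regex(pattern):
        # mirror of the translation loop in globToWordIds()
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        return pat + "$"

    print(glob_to_regex("foo*baz?"))  # foo.*baz.$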
src/Products/ZCTextIndex/NBest.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""NBest

An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method.  If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""

from bisect import bisect

from zope.interface import implements

from Products.ZCTextIndex.interfaces import INBest

class NBest:

    implements(INBest)

    def __init__(self, N):
        "Build an NBest object to remember the N best-scoring objects."
        if N < 1:
            raise ValueError("NBest() argument must be at least 1")
        self._capacity = N

        # This does a very simple thing with sorted lists.  For large
        # N, a min-heap can be unboundedly better in terms of data
        # movement time.
        self._scores = []
        self._items = []

    def __len__(self):
        return len(self._scores)

    def capacity(self):
        return self._capacity

    def add(self, item, score):
        self.addmany([(item, score)])

    def addmany(self, sequence):
        scores, items, capacity = self._scores, self._items, self._capacity
        n = len(scores)
        for item, score in sequence:
            # When we're in steady-state, the usual case is that we're filled
            # to capacity, and that an incoming item is worse than any of
            # the best-seen so far.
            if n >= capacity and score <= scores[0]:
                continue
            i = bisect(scores, score)
            scores.insert(i, score)
            items.insert(i, item)
            if n == capacity:
                del items[0], scores[0]
            else:
                n += 1
        assert n == len(scores)

    def getbest(self):
        result = zip(self._items, self._scores)
        result.reverse()
        return result

    def pop_smallest(self):
        if self._scores:
            return self._items.pop(0), self._scores.pop(0)
        raise IndexError("pop_smallest() called on empty NBest object")
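Usage sketch (not part of the commit), exercising only the API shown above as it stood before this commit:

    from Products.ZCTextIndex.NBest import NBest

    nb = NBest(2)        # remember the 2 best-scoring items
    nb.add('a', 10)
    nb.add('b', 30)
    nb.add('c', 20)      # evicts 'a', the current worst
    print(nb.getbest())  # [('b', 30), ('c', 20)] -- best first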
src/Products/ZCTextIndex/OkapiIndex.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using an Okapi BM25 rank."""

# Lots of comments are at the bottom of this file.  Read them to
# understand what's going on.

from BTrees.IIBTree import IIBucket
from BTrees.Length import Length
from zope.interface import implements

from Products.ZCTextIndex.interfaces import IIndex
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.BaseIndex import inverse_doc_frequency
from Products.ZCTextIndex.BaseIndex import scaled_int
from Products.ZCTextIndex.okascore import score

class OkapiIndex(BaseIndex):

    implements(IIndex)

    # BM25 free parameters.
    K1 = 1.2
    B = 0.75
    assert K1 >= 0.0
    assert 0.0 <= B <= 1.0

    def __init__(self, lexicon):
        BaseIndex.__init__(self, lexicon)

        # ._wordinfo for Okapi is
        # wid -> {docid -> frequency}; t -> D -> f(D, t)

        # ._docweight for Okapi is
        # docid -> # of words in the doc
        # This is just len(self._docwords[docid]), but _docwords is stored
        # in compressed form, so uncompressing it just to count the list
        # length would be ridiculously expensive.

        # sum(self._docweight.values()), the total # of words in all docs
        # This is a long for "better safe than sorry" reasons.  It isn't
        # used often enough that speed should matter.
        # Use a BTree.Length.Length object to avoid concurrent write conflicts
        self._totaldoclen = Length(0L)

    def index_doc(self, docid, text):
        count = BaseIndex.index_doc(self, docid, text)
        self._change_doc_len(count)
        return count

    def _reindex_doc(self, docid, text):
        self._change_doc_len(-self._docweight[docid])
        return BaseIndex._reindex_doc(self, docid, text)

    def unindex_doc(self, docid):
        self._change_doc_len(-self._docweight[docid])
        BaseIndex.unindex_doc(self, docid)

    def _change_doc_len(self, delta):
        # Change total doc length used for scoring
        try:
            self._totaldoclen.change(delta)
        except AttributeError:
            # Opportunistically upgrade _totaldoclen attribute to Length object
            self._totaldoclen = Length(long(self._totaldoclen + delta))

    # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
    # for each wid t in wids.  The IIBucket, times the weight, maps D to
    # TF(D,t) * IDF(t) for every docid D containing t.
    # As currently written, the weights are always 1, and the IIBucket maps
    # D to TF(D,t)*IDF(t) directly, where the product is computed as a float
    # but stored as a scaled_int.
    # NOTE:  This is overridden below, by a function that computes the
    # same thing but with the inner scoring loop in C.
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = scaled_int(tf * idf)
            L.append((result, 1))
        return L

    # Note about the above:  the result is tf * idf.  tf is small -- it
    # can't be larger than k1+1 = 2.2.  idf is formally unbounded, but
    # is less than 14 for a term that appears in only 1 of a million
    # documents.  So the product is probably less than 32, or 5 bits
    # before the radix point.  If we did the scaled-int business on
    # both of them, we'd be up to 25 bits.  Add 64 of those and we'd
    # be in overflow territory.  That's pretty unlikely, so we *could*
    # just store scaled_int(tf) in result[docid], and use scaled_int(idf)
    # as an invariant weight across the whole result.  But besides
    # skating near the edge, it's not a speed cure, since the computation
    # of tf would still be done at Python speed, and it's a lot more
    # work than just multiplying by idf.

    # The same function as _search_wids above, but with the inner scoring
    # loop written in C (module okascore, function score()).
    # Cautions:  okascore hardcodes the values of K, B1, and the scaled_int
    # function.
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        #K1 = self.K1
        #B = self.B
        #K1_plus1 = K1 + 1.0
        #B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            score(result, d2f.items(), docid2len, idf, meandoclen)
            L.append((result, 1))
        return L

    def query_weight(self, terms):
        # Get the wids.
        wids = []
        for term in terms:
            termwids = self._lexicon.termToWordIds(term)
            wids.extend(termwids)
        # The max score for term t is the maximum value of
        #     TF(D, t) * IDF(Q, t)
        # We can compute IDF directly, and as noted in the comments below
        # TF(D, t) is bounded above by 1+K1.
        N = float(len(self._docweight))
        tfmax = 1.0 + self.K1
        sum = 0
        for t in self._remove_oov_wids(wids):
            idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
            sum += scaled_int(idf * tfmax)
        return sum

    def _get_frequencies(self, wids):
        d = {}
        dget = d.get
        for wid in wids:
            d[wid] = dget(wid, 0) + 1
        return d, len(wids)

"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
It's based on probability arguments about how words are distributed in
documents, not on an abstract vector space model.  A long paper by its
principal inventors gives an excellent overview of how it was derived:

    A probabilistic model of information retrieval:  development and status
    K. Sparck Jones, S. Walker, S.E. Robertson
    http://citeseer.nj.nec.com/jones98probabilistic.html

Spellings that ignore relevance information (which we don't have) are of this
high-level form:

    score(D, Q) = sum(for t in D&Q: TF(D, t) * IDF(Q, t))

where

    D         a specific document
    Q         a specific query
    t         a term (word, atomic phrase, whatever)
    D&Q       the terms common to D and Q
    TF(D, t)  a measure of t's importance in D -- a kind of term frequency
              weight
    IDF(Q, t) a measure of t's importance in the query and in the set of
              documents as a whole -- a kind of inverse document frequency
              weight

The IDF(Q, t) here is identical to the one used for our cosine measure.
Since queries are expected to be short, it ignores Q entirely:

    IDF(Q, t) = log(1.0 + N / f(t))

where

    N     the total number of documents
    f(t)  the number of documents in which t appears

Most Okapi literature seems to use log(N/f(t)) instead.  We don't, because
that becomes 0 for a term that's in every document, and, e.g., if someone
is searching for "documentation" on python.org (a term that may well show
up on every page, due to the top navigation bar), we still want to find the
pages that use the word a lot (which is TF's job to find, not IDF's -- we
just want to stop IDF from considering this t to be irrelevant).

The TF(D, t) spellings are more interesting.  With lots of variations, the
most basic spelling is of the form

                   f(D, t)
    TF(D, t) = ---------------
                f(D, t) + K(D)

where

    f(D, t)  the number of times t appears in D
    K(D)     a measure of the length of D, normalized to mean doc length

The functional *form* f/(f+K) is clever.  It's a gross approximation to a
mixture of two distinct Poisson distributions, based on the idea that t
probably appears in D for one of two reasons:

1. More or less at random.

2. Because it's important to D's purpose in life ("eliteness" in papers).

Note that f/(f+K) is always between 0 and 1.  If f is very large compared to
K, it approaches 1.  If K is very large compared to f, it approaches 0.  If
t appears in D more or less "for random reasons", f is likely to be small,
and so K will dominate unless it's a very small doc, and the ratio will be
small.  OTOH, if t appears a lot in D, f will dominate unless it's a very
large doc, and the ratio will be close to 1.

We use a variation on that simple theme, a simplification of what's called
BM25 in the literature (it was the 25th stab at a Best Match function from
the Okapi group; "a simplification" means we're setting some of BM25's more
esoteric free parameters to 0):

                f(D, t) * (k1 + 1)
    TF(D, t) = --------------------
                f(D, t) + k1 * K(D)

where

    k1  a "tuning factor", typically between 1.0 and 2.0.  We use 1.2,
        the usual default value.  This constant adjusts the curve to
        look more like a theoretical 2-Poisson curve.

Note that as f(D, t) increases, TF(D, t) increases monotonically, approaching
an asymptote of k1+1 from below.

Finally, we use

    K(D) = (1-b) + b * len(D)/E(len(D))

where

    b          is another free parameter, discussed below.  We use 0.75.
    len(D)     the length of D in words
    E(len(D))  the expected value of len(D) across the whole document set;
               or, IOW, the average document length

b is a free parameter between 0.0 and 1.0, and adjusts for the expected effect
of the "Verbosity Hypothesis".  Suppose b is 1, and some word t appears
10 times as often in document d2 as in document d1.  If document d2 is
also 10 times as long as d1, TF(d1, t) and TF(d2, t) are identical:

                     f(d2, t) * (k1 + 1)
   TF(d2, t) = --------------------------------- =
                f(d2, t) + k1 * len(d2)/E(len(D))

                      10 * f(d1, t) * (k1 + 1)
   ----------------------------------------------- = TF(d1, t)
    10 * f(d1, t) + k1 * (10 * len(d1))/E(len(D))

because the 10's cancel out.  This is appropriate if we believe that a word
appearing 10x more often in a doc 10x as long is simply due to that the
longer doc is more verbose.  If we do believe that, the longer doc and the
shorter doc are probably equally relevant.  OTOH, it *could* be that the
longer doc is talking about t in greater depth too, in which case it's
probably more relevant than the shorter doc.

At the other extreme, if we set b to 0, the len(D)/E(len(D)) term vanishes
completely, and a doc scores higher for having more occurrences of a word
regardless of the doc's length.

Reality is between these extremes, and probably varies by document and word
too.  Reports in the literature suggest that b=0.75 is a good compromise "in
general", favoring the "verbosity hypothesis" end of the scale.

Putting it all together, the final TF function is

                           f(D, t) * (k1 + 1)
    TF(D, t) = --------------------------------------------
                f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

with k1=1.2 and b=0.75.


Query Term Weighting
--------------------

I'm ignoring the query adjustment part of Okapi BM25 because I expect our
queries are very short.  Full BM25 takes them into account by adding the
following to every score(D, Q); it depends on the lengths of D and Q, but
not on the specific words in Q, or even on whether they appear in D(!):

                   E(len(D)) - len(D)
    k2 * len(Q) * --------------------
                   E(len(D)) + len(D)

Here k2 is another "tuning constant", len(Q) is the number of words in Q, and
len(D) & E(len(D)) were defined above.  The Okapi group set k2 to 0 in TREC-9,
so it apparently doesn't do much good (or may even hurt).

Full BM25 *also* multiplies the following factor into IDF(Q, t):

    f(Q, t) * (k3 + 1)
    ------------------
       f(Q, t) + k3

where k3 is yet another free parameter, and f(Q,t) is the number of times t
appears in Q.  Since we're using short "web style" queries, I expect f(Q,t)
to always be 1, and then that quotient is

     1 * (k3 + 1)
     ------------ = 1
        1 + k3

regardless of k3's value.  So, in a trivial sense, we are incorporating
this measure (and optimizing it by not bothering to multiply by 1 <wink>).
"""
src/Products/ZCTextIndex/ParseTree.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Generic parser support: exception and parse tree nodes."""

from BTrees.IIBTree import difference
from zope.interface import implements

from Products.ZCTextIndex.interfaces import IQueryParseTree
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion

class QueryError(Exception):
    pass

class ParseError(Exception):
    pass

class ParseTreeNode:

    implements(IQueryParseTree)

    _nodeType = None

    def __init__(self, value):
        self._value = value

    def nodeType(self):
        return self._nodeType

    def getValue(self):
        return self._value

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.getValue())

    def terms(self):
        t = []
        for v in self.getValue():
            t.extend(v.terms())
        return t

    def executeQuery(self, index):
        raise NotImplementedError

class NotNode(ParseTreeNode):

    _nodeType = "NOT"

    def terms(self):
        return []

    def executeQuery(self, index):
        raise QueryError, "NOT parse tree node cannot be executed directly"

class AndNode(ParseTreeNode):

    _nodeType = "AND"

    def executeQuery(self, index):
        L = []
        Nots = []
        for subnode in self.getValue():
            if subnode.nodeType() == "NOT":
                r = subnode.getValue().executeQuery(index)
                # If None, technically it matches every doc, but we treat
                # it as if it matched none (we want
                #     real_word AND NOT stop_word
                # to act like plain real_word).
                if r is not None:
                    Nots.append((r, 1))
            else:
                r = subnode.executeQuery(index)
                # If None, technically it matches every doc, so needn't be
                # included.
                if r is not None:
                    L.append((r, 1))
        set = mass_weightedIntersection(L)
        if Nots:
            notset = mass_weightedUnion(Nots)
            set = difference(set, notset)
        return set

class OrNode(ParseTreeNode):

    _nodeType = "OR"

    def executeQuery(self, index):
        weighted = []
        for node in self.getValue():
            r = node.executeQuery(index)
            # If None, technically it matches every doc, but we treat
            # it as if it matched none (we want
            #     real_word OR stop_word
            # to act like plain real_word).
            if r is not None:
                weighted.append((r, 1))
        return mass_weightedUnion(weighted)

class AtomNode(ParseTreeNode):

    _nodeType = "ATOM"

    def terms(self):
        return [self.getValue()]

    def executeQuery(self, index):
        return index.search(self.getValue())

class PhraseNode(AtomNode):

    _nodeType = "PHRASE"

    def executeQuery(self, index):
        return index.search_phrase(self.getValue())

class GlobNode(AtomNode):

    _nodeType = "GLOB"

    def executeQuery(self, index):
        return index.search_glob(self.getValue())
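Sketch (not part of the commit): a query such as ``foo AND NOT bar'' corresponds to the node structure below; terms() collects only the positive terms, since NotNode.terms() returns [].

    from Products.ZCTextIndex.ParseTree import AndNode, AtomNode, NotNode

    tree = AndNode([AtomNode('foo'), NotNode(AtomNode('bar'))])
    print(tree.nodeType())  # AND
    print(tree.terms())     # ['foo']
    # tree.executeQuery(index) intersects the 'foo' hits and then
    # subtracts the 'bar' hits, per AndNode.executeQuery() above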
src/Products/ZCTextIndex/PipelineFactory.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from zope.interface import implements

from Products.ZCTextIndex.interfaces import IPipelineElementFactory

class PipelineElementFactory:

    implements(IPipelineElementFactory)

    def __init__(self):
        self._groups = {}

    def registerFactory(self, group, name, factory):
        if self._groups.has_key(group) and \
           self._groups[group].has_key(name):
            raise ValueError('ZCTextIndex lexicon element "%s" '
                             'already registered in group "%s"'
                             % (name, group))

        elements = self._groups.get(group)
        if elements is None:
            elements = self._groups[group] = {}
        elements[name] = factory

    def getFactoryGroups(self):
        groups = self._groups.keys()
        groups.sort()
        return groups

    def getFactoryNames(self, group):
        names = self._groups[group].keys()
        names.sort()
        return names

    def instantiate(self, group, name):
        factory = self._groups[group][name]
        if factory is not None:
            return factory()

element_factory = PipelineElementFactory()
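Usage sketch (not from the commit); the registrations referenced here appear in Lexicon.py and HTMLSplitter.py above, and are present once those modules have been imported.

    from Products.ZCTextIndex.PipelineFactory import element_factory

    print(element_factory.getFactoryNames('Word Splitter'))
    # e.g. ['HTML aware splitter', 'Whitespace splitter']
    splitter = element_factory.instantiate('Word Splitter',
                                           'Whitespace splitter')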
src/Products/ZCTextIndex/QueryParser.py
deleted
100644 → 0
View file @
48f67574
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser.
This particular parser recognizes the following syntax:
Start = OrExpr
OrExpr = AndExpr ('OR' AndExpr)*
AndExpr = Term ('AND' NotExpr)*
NotExpr = ['NOT'] Term
Term = '(' OrExpr ')' | ATOM+
The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal (ignoring case) to one of the key words
'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the
string can contain whitespace, parentheses and key words, but not
quotes.
+ A hyphen followed by one of the two forms above, meaning that it
must not be present.
An unquoted ATOM may also contain globbing characters. Globbing
syntax is defined by the lexicon; for example "foo*" could mean any
word starting with "foo".
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
is interpreted as a NOT operator.
Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- * and ? are used for globbing (i.e. prefix search), e.g. ``foo*''
"""
import re

from zope.interface import implements

from Products.ZCTextIndex.interfaces import IQueryParser
from Products.ZCTextIndex import ParseTree

# Create unique symbols for token types.
_AND    = intern("AND")
_OR     = intern("OR")
_NOT    = intern("NOT")
_LPAREN = intern("(")
_RPAREN = intern(")")
_ATOM   = intern("ATOM")
_EOF    = intern("EOF")

# Map keyword string to token type.
_keywords = {
    _AND:       _AND,
    _OR:        _OR,
    _NOT:       _NOT,
    _LPAREN:    _LPAREN,
    _RPAREN:    _RPAREN,
}

# Regular expression to tokenize.
_tokenizer_regex = re.compile(r"""
    # a paren
    [()]
    # or an optional hyphen
|   -?
    # followed by
    (?:
        # a string inside double quotes (and not containing these)
        " [^"]* "
        # or a non-empty stretch w/o whitespace, parens or double quotes
    |   [^()\s"]+
    )
""", re.VERBOSE)

# Use unicode regex to treat fullwidth space characters defined in Unicode
# as valid whitespace.
_tokenizer_unicode_regex = re.compile(
    _tokenizer_regex.pattern, _tokenizer_regex.flags | re.UNICODE)

class QueryParser:

    implements(IQueryParser)

    # This class is not thread-safe;
    # each thread should have its own instance

    def __init__(self, lexicon):
        self._lexicon = lexicon
        self._ignored = None

    # Public API methods

    def parseQuery(self, query):
        # Lexical analysis.
        try:
            # Try to use unicode and treat fullwidth whitespace as valid one.
            if not isinstance(query, unicode):
                query = query.decode('utf-8')
            tokens = _tokenizer_unicode_regex.findall(query)
        except UnicodeDecodeError:
            tokens = _tokenizer_regex.findall(query)
        self._tokens = tokens
        # classify tokens
        self._tokentypes = [_keywords.get(token.upper(), _ATOM)
                            for token in tokens]
        # add _EOF
        self._tokens.append(_EOF)
        self._tokentypes.append(_EOF)
        self._index = 0

        # Syntactical analysis.
        self._ignored = []  # Ignored words in the query, for parseQueryEx
        tree = self._parseOrExpr()
        self._require(_EOF)
        if tree is None:
            raise ParseTree.ParseError(
                "Query contains only common words: %s" % repr(query))
        return tree

    def getIgnored(self):
        return self._ignored

    def parseQueryEx(self, query):
        tree = self.parseQuery(query)
        ignored = self.getIgnored()
        return tree, ignored

    # Recursive descent parser

    def _require(self, tokentype):
        if not self._check(tokentype):
            t = self._tokens[self._index]
            msg = "Token %r required, %r found" % (tokentype, t)
            raise ParseTree.ParseError, msg

    def _check(self, tokentype):
        if self._tokentypes[self._index] is tokentype:
            self._index += 1
            return 1
        else:
            return 0

    def _peek(self, tokentype):
        return self._tokentypes[self._index] is tokentype

    def _get(self, tokentype):
        t = self._tokens[self._index]
        self._require(tokentype)
        return t

    def _parseOrExpr(self):
        L = []
        L.append(self._parseAndExpr())
        while self._check(_OR):
            L.append(self._parseAndExpr())
        L = filter(None, L)
        if not L:
            return None  # Only stopwords
        elif len(L) == 1:
            return L[0]
        else:
            return ParseTree.OrNode(L)

    def _parseAndExpr(self):
        L = []
        t = self._parseTerm()
        if t is not None:
            L.append(t)
        Nots = []
        while self._check(_AND):
            t = self._parseNotExpr()
            if t is None:
                continue
            if isinstance(t, ParseTree.NotNode):
                Nots.append(t)
            else:
                L.append(t)
        if not L:
            return None  # Only stopwords
        L.extend(Nots)
        if len(L) == 1:
            return L[0]
        else:
            return ParseTree.AndNode(L)

    def _parseNotExpr(self):
        if self._check(_NOT):
            t = self._parseTerm()
            if t is None:
                return None  # Only stopwords
            return ParseTree.NotNode(t)
        else:
            return self._parseTerm()

    def _parseTerm(self):
        if self._check(_LPAREN):
            tree = self._parseOrExpr()
            self._require(_RPAREN)
        else:
            nodes = [self._parseAtom()]
            while self._peek(_ATOM):
                nodes.append(self._parseAtom())
            nodes = filter(None, nodes)
            if not nodes:
                return None  # Only stopwords
            structure = [(isinstance(nodes[i], ParseTree.NotNode), i, nodes[i])
                         for i in range(len(nodes))]
            structure.sort()
            nodes = [node for (bit, index, node) in structure]
            if isinstance(nodes[0], ParseTree.NotNode):
                raise ParseTree.ParseError(
                    "a term must have at least one positive word")
            if len(nodes) == 1:
                return nodes[0]
            tree = ParseTree.AndNode(nodes)
        return tree

    def _parseAtom(self):
        term = self._get(_ATOM)
        words = self._lexicon.parseTerms(term)
        if not words:
            self._ignored.append(term)
            return None
        if len(words) > 1:
            tree = ParseTree.PhraseNode(words)
        elif self._lexicon.isGlob(words[0]):
            tree = ParseTree.GlobNode(words[0])
        else:
            tree = ParseTree.AtomNode(words[0])
        if term[0] == "-":
            tree = ParseTree.NotNode(tree)
        return tree
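A quick parsing sketch (assumes the package is importable; the query string is arbitrary). A bare Splitter-only lexicon is enough to exercise the grammar described in the module docstring:

from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.QueryParser import QueryParser

parser = QueryParser(Lexicon(Splitter()))
tree = parser.parseQuery('foo bar OR "ham eggs"')
print tree.nodeType()     # OR
for sub in tree.getValue():
    print sub.nodeType()  # AND (the implicit operator in "foo bar"), then PHRASE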
src/Products/ZCTextIndex/README.txt deleted 100644 → 0
ZCTextIndex
===========
This product is a replacement for the full text indexing facility of
ZCatalog. Specifically, it is an alternative to
PluginIndexes/TextIndex.
Advantages of using ZCTextIndex over TextIndex:
- A new query language, supporting both explicit and implicit Boolean
operators, parentheses, globbing, and phrase searching. Apart from
explicit operators and globbing, the syntax is roughly the same as
that popularized by Google.
- A more refined scoring algorithm, resulting in better selectiveness:
it's much more likely that you'll find the document you are looking
for among the first few highest-ranked results.
- Actually, ZCTextIndex gives you a choice of two scoring algorithms
from recent literature: the Cosine ranking from the Managing
Gigabytes book, and Okapi from more recent research papers. Okapi
usually does better, so it is the default (but your mileage may
vary).
- A redesigned Lexicon, using a pipeline architecture to split the
input text into words. This makes it possible to mix and match
pipeline components, e.g. you can choose between an HTML-aware
splitter and a plain text splitter, and additional components can be
added to the pipeline for case folding, stopword removal, and other
features. Enough example pipeline components are provided to get
you started, and it is very easy to write new components.
Performance is roughly the same as for TextIndex, and we're expecting
to make tweaks to the code that will make it faster.
This code can be used outside of Zope too; all you need is a
standalone ZODB installation to make your index persistent. Several
functional test programs in the tests subdirectory show how to do
this, for example mhindex.py, mailtest.py, indexhtml.py, and
queryhtml.py.
See the online help for how to use ZCTextIndex within Zope. (Included
in the subdirectory "help".)
Code overview
-------------
ZMI interface:
__init__.py ZMI publishing code
ZCTextIndex.py pluggable index class
PipelineFactory.py ZMI helper to configure the pipeline
Indexing:
BaseIndex.py common code for Cosine and Okapi index
CosineIndex.py Cosine index implementation
OkapiIndex.py Okapi index implementation
okascore.c C implementation of scoring loop
Lexicon:
Lexicon.py lexicon and sample pipeline elements
HTMLSplitter.py HTML-aware splitter
StopDict.py list of English stopwords
stopper.c C implementation of stop word remover
Query parser:
QueryParser.py parse a query into a parse tree
ParseTree.py parse tree node classes and exceptions
Utilities:
NBest.py find N best items in a list without sorting
SetOps.py efficient weighted set operations
WidCode.py list compression allowing phrase searches
RiceCode.py list compression code (as yet unused)
Interfaces (these speak for themselves):
IIndex.py
ILexicon.py
INBest.py
IPipelineElement.py
IPipelineElementFactory.py
IQueryParseTree.py
IQueryParser.py
ISplitter.py
Subdirectories:
dtml ZMI templates
help ZMI help files
tests unittests and some functional tests/examples
www images used in the ZMI
Tests
-----
Functional tests and helpers:
hs-tool.py helper to interpret hotshot profiler logs
indexhtml.py index a collection of HTML files
mailtest.py index and query a Unix mailbox file
mhindex.py index and query a set of MH folders
python.txt output from benchmark queries
queryhtml.py query an index created by indexhtml.py
wordstats.py dump statistics about each indexed word
Unit tests (these speak for themselves):
testIndex.py
testLexicon.py
testNBest.py
testPipelineFactory.py
testQueryEngine.py
testQueryParser.py
testSetOps.py
testStopper.py
testZCTextIndex.py
src/Products/ZCTextIndex/RiceCode.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Rice coding (a variation of Golomb coding)
Based on a Java implementation by Glen McCluskey described in a Usenix
;login: article at
http://www.usenix.org/publications/login/2000-4/features/java.html
McCluskey's article explains the approach as follows. The encoding
for a value x is represented as a unary part and a binary part. The
unary part is a sequence of 1 bits followed by a 0 bit. The binary
part encodes some of the lower bits of x-1.
The encoding is parameterized by a value m that describes how many
bits to store in the binary part. If most of the values are smaller
than 2**m then they can be stored in only m+1 bits.
Compute the length of the unary part, q, where
q = math.floor((x-1)/ 2 ** m)
Emit q 1 bits followed by a 0 bit.
Emit the lower m bits of x-1, treating x-1 as a binary value.
"""
import array

class BitArray:

    def __init__(self, buf=None):
        self.bytes = array.array('B')
        self.nbits = 0
        self.bitsleft = 0
        self.tostring = self.bytes.tostring

    def __getitem__(self, i):
        byte, offset = divmod(i, 8)
        mask = 2 ** offset
        if self.bytes[byte] & mask:
            return 1
        else:
            return 0

    def __setitem__(self, i, val):
        byte, offset = divmod(i, 8)
        mask = 2 ** offset
        if val:
            self.bytes[byte] |= mask
        else:
            self.bytes[byte] &= ~mask

    def __len__(self):
        return self.nbits

    def append(self, bit):
        """Append a 1 if bit is true or a 0 if it is false."""
        if self.bitsleft == 0:
            self.bytes.append(0)
            self.bitsleft = 8
        self.__setitem__(self.nbits, bit)
        self.nbits += 1
        self.bitsleft -= 1

    def __getstate__(self):
        return self.nbits, self.bitsleft, self.tostring()

    def __setstate__(self, (nbits, bitsleft, s)):
        self.bytes = array.array('B', s)
        self.nbits = nbits
        self.bitsleft = bitsleft

class RiceCode:

    def __init__(self, m):
        """Construct a RiceCode for m-bit values."""
        if not (0 <= m <= 16):
            raise ValueError, "m must be between 0 and 16"
        self.init(m)
        self.bits = BitArray()
        self.len = 0

    def init(self, m):
        self.m = m
        self.lower = (1 << m) - 1
        self.mask = 1 << (m - 1)

    def append(self, val):
        """Append an item to the list."""
        if val < 1:
            raise ValueError, "value >= 1 expected, got %s" % `val`
        val -= 1
        # emit the unary part of the code
        q = val >> self.m
        for i in range(q):
            self.bits.append(1)
        self.bits.append(0)
        # emit the binary part
        r = val & self.lower
        mask = self.mask
        while mask:
            self.bits.append(r & mask)
            mask >>= 1
        self.len += 1

    def __len__(self):
        return self.len

    def tolist(self):
        """Return the items as a list."""
        l = []
        i = 0  # bit offset
        binary_range = range(self.m)
        for j in range(self.len):
            unary = 0
            while self.bits[i] == 1:
                unary += 1
                i += 1
            assert self.bits[i] == 0
            i += 1
            binary = 0
            for k in binary_range:
                binary = (binary << 1) | self.bits[i]
                i += 1
            l.append((unary << self.m) + (binary + 1))
        return l

    def tostring(self):
        """Return a binary string containing the encoded data.

        The binary string may contain some extra zeros at the end.
        """
        return self.bits.tostring()

    def __getstate__(self):
        return self.m, self.bits

    def __setstate__(self, (m, bits)):
        self.init(m)
        self.bits = bits

def encode(m, l):
    c = RiceCode(m)
    for elt in l:
        c.append(elt)
    assert c.tolist() == l
    return c

def encode_deltas(l):
    if len(l) == 1:
        return l[0], []
    deltas = RiceCode(6)
    deltas.append(l[1] - l[0])
    for i in range(2, len(l)):
        deltas.append(l[i] - l[i-1])
    return l[0], deltas

def decode_deltas(start, enc_deltas):
    deltas = enc_deltas.tolist()
    l = [start]
    # each delta is relative to the previously reconstructed value
    for d in deltas:
        l.append(l[-1] + d)
    return l

def test():
    import random
    for size in [10, 20, 50, 100, 200]:
        l = [random.randint(1, size) for i in range(50)]
        c = encode(random.randint(1, 16), l)
        assert c.tolist() == l
    for size in [10, 20, 50, 100, 200]:
        l = range(random.randint(1, size), size + random.randint(1, size))
        t = encode_deltas(l)
        l2 = decode_deltas(*t)
        assert l == l2
        if l != l2:
            print l
            print l2

def pickle_efficiency():
    import pickle
    import random
    for m in [4, 8, 12]:
        for size in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
            for elt_range in [10, 20, 50, 100, 200, 500, 1000]:
                l = [random.randint(1, elt_range) for i in range(size)]
                raw = pickle.dumps(l, 1)
                enc = pickle.dumps(encode(m, l), 1)
                print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
                print "%5d %5d" % (len(raw), len(enc)),
                if len(raw) > len(enc):
                    print "win"
                else:
                    print "lose"

if __name__ == "__main__":
    test()
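A worked instance of the encoding rule from the module docstring, as a hypothetical helper (not part of the module): for x = 13 and m = 3, q = floor(12 / 8) = 1, so the unary part is "10"; the low 3 bits of 12 give "100"; the whole code is "10100", using q + 1 + m = 5 bits.

def rice_encode_value(x, m):
    # unary part: q one-bits followed by a zero, where q = (x-1) >> m
    q = (x - 1) >> m
    bits = [1] * q + [0]
    # binary part: the low m bits of x-1, most significant bit first
    for k in range(m - 1, -1, -1):
        bits.append(((x - 1) >> k) & 1)
    return bits

print rice_encode_value(13, 3)  # [1, 0, 1, 0, 0]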
src/Products/ZCTextIndex/SETUP.cfg deleted 100644 → 0
<extension okascore>
source okascore.c
</extension>
<extension stopper>
source stopper.c
</extension>
src/Products/ZCTextIndex/SetOps.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""SetOps -- Weighted intersections and unions applied to many inputs."""
from BTrees.IIBTree import IIBucket
from BTrees.IIBTree import weightedIntersection
from BTrees.IIBTree import weightedUnion

from Products.ZCTextIndex.NBest import NBest

def mass_weightedIntersection(L):
    "A list of (mapping, weight) pairs -> their weightedIntersection IIBucket."
    L = [(x, wx) for (x, wx) in L if x is not None]
    if len(L) < 2:
        return _trivial(L)
    # Intersect with smallest first.  We expect the input maps to be
    # IIBuckets, so it doesn't hurt to get their lengths repeatedly
    # (len(Bucket) is fast; len(BTree) is slow).
    L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
    (x, wx), (y, wy) = L[:2]
    dummy, result = weightedIntersection(x, y, wx, wy)
    for x, wx in L[2:]:
        dummy, result = weightedIntersection(result, x, 1, wx)
    return result

def mass_weightedUnion(L):
    "A list of (mapping, weight) pairs -> their weightedUnion IIBucket."
    if len(L) < 2:
        return _trivial(L)
    # Balance unions as closely as possible, smallest to largest.
    merge = NBest(len(L))
    for x, weight in L:
        merge.add((x, weight), len(x))
    while len(merge) > 1:
        # Merge the two smallest so far, and add back to the queue.
        (x, wx), dummy = merge.pop_smallest()
        (y, wy), dummy = merge.pop_smallest()
        dummy, z = weightedUnion(x, y, wx, wy)
        merge.add((z, 1), len(z))
    (result, weight), dummy = merge.pop_smallest()
    return result

def _trivial(L):
    # L is empty or has only one (mapping, weight) pair.  If there is a
    # pair, we may still need to multiply the mapping by its weight.
    assert len(L) <= 1
    if len(L) == 0:
        return IIBucket()
    [(result, weight)] = L
    if weight != 1:
        dummy, result = weightedUnion(IIBucket(), result, 0, weight)
    return result
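A small sketch of the union helper (assumes the BTrees package is available; the doc ids and weights are made up). Each input mapping is multiplied by its weight before merging:

from BTrees.IIBTree import IIBucket
from Products.ZCTextIndex.SetOps import mass_weightedUnion

a = IIBucket({1: 10, 2: 20})
b = IIBucket({2: 5})
# doc 2 scores 20*1 + 5*2 = 30; doc 1 keeps its weighted score of 10
print list(mass_weightedUnion([(a, 1), (b, 2)]).items())  # [(1, 10), (2, 30)]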
src/Products/ZCTextIndex/Setup deleted 100644 → 0
*shared*
stopper stopper.c
okascore okascore.c
src/Products/ZCTextIndex/StopDict.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Provide a default list of stop words for the index.
The specific splitter and lexicon are customizable, but the default
ZCTextIndex should do something useful.
"""
def get_stopdict():
    """Return a dictionary of stopwords."""
    return _dict

# This list of English stopwords comes from Lucene
_words = [
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with",
]

_dict = {}
for w in _words:
    _dict[w] = None
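Usage is a plain dictionary membership test (Python 2 style); the values are all None, only the keys matter:

from Products.ZCTextIndex.StopDict import get_stopdict

print get_stopdict().has_key("the")   # True
print get_stopdict().has_key("zope")  # False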
src/Products/ZCTextIndex/WidCode.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
# for smaller ints. This is intended for lists of word ids (wids). The
# ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding. Unlike
# UTF-8, the initial byte does not tell you how many continuation bytes
# follow; and there's no ASCII superset property.
# Details:
#
# + Only the first byte of an encoding has the sign bit set.
#
# + The first byte has 7 bits of data.
#
# + Bytes beyond the first in an encoding have the sign bit clear, followed
# by 7 bits of data.
#
# + The first byte doesn't tell you how many continuation bytes are
# following. You can tell by searching for the next byte with the
# high bit set (or the end of the string).
#
# The int to be encoded can contain no more than 28 bits.
#
# If it contains no more than 7 bits, 0abcdefg, the encoding is
# 1abcdefg
#
# If it contains 8 thru 14 bits,
# 00abcdef ghijkLmn
# the encoding is
# 1abcdefg 0hijkLmn
#
# Static tables _encoding and _decoding capture all encodes and decodes for
# 14 or fewer bits.
#
# If it contains 15 thru 21 bits,
# 000abcde fghijkLm nopqrstu
# the encoding is
# 1abcdefg 0hijkLmn 0opqrstu
#
# If it contains 22 thru 28 bits,
# 0000abcd efghijkL mnopqrst uvwxyzAB
# the encoding is
# 1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB
assert 0x80**2 == 0x4000
assert 0x80**4 == 0x10000000

import re

def encode(wids):
    # Encode a list of wids as a string.
    wid2enc = _encoding
    n = len(wid2enc)
    return "".join([w < n and wid2enc[w] or _encode(w)
                    for w in wids])

_encoding = [None] * 0x4000  # Filled later, and converted to a tuple

def _encode(w):
    assert 0x4000 <= w < 0x10000000
    b, c = divmod(w, 0x80)
    a, b = divmod(b, 0x80)
    s = chr(b) + chr(c)
    if a < 0x80:  # no more than 21 data bits
        return chr(a + 0x80) + s
    a, b = divmod(a, 0x80)
    assert a < 0x80, (w, a, b, s)  # else more than 28 data bits
    return (chr(a + 0x80) + chr(b)) + s

_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")

def decode(code):
    # Decode a string into a list of wids.
    get = _decoding.get
    # Obscure:  while _decoding does have the key '\x80', its value is 0,
    # so the "or" here calls _decode('\x80') anyway.
    return [get(p) or _decode(p)
            for p in _prog.findall(code)]

_decoding = {}  # Filled later

def _decode(s):
    if s == '\x80':
        # See comment in decode().  This is here to allow a trick to work.
        return 0
    if len(s) == 3:
        a, b, c = map(ord, s)
        assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80
        return ((a & 0x7F) << 14) | (b << 7) | c
    assert len(s) == 4, `s`
    a, b, c, d = map(ord, s)
    assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80
    return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d

def _fill():
    global _encoding
    for i in range(0x80):
        s = chr(i + 0x80)
        _encoding[i] = s
        _decoding[s] = i
    for i in range(0x80, 0x4000):
        hi, lo = divmod(i, 0x80)
        s = chr(hi + 0x80) + chr(lo)
        _encoding[i] = s
        _decoding[s] = i
    _encoding = tuple(_encoding)

_fill()

def test():
    for i in range(2**20):
        if i % 1000 == 0:
            print i
        wids = [i]
        code = encode(wids)
        assert decode(code) == wids, (wids, code, decode(code))

if __name__ == "__main__":
    test()
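A round-trip sketch (the wids are arbitrary). Because only the first byte of each code has the high bit set, the encoding of one wid sequence can be located inside another with the ordinary string .find(), which is what phrase search exploits:

from Products.ZCTextIndex import WidCode

s = WidCode.encode([1, 130, 100000])
print WidCode.decode(s)                           # [1, 130, 100000]
print s.find(WidCode.encode([130, 100000])) >= 0  # True: contiguous subsequence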
src/Products/ZCTextIndex/ZCTextIndex.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Plug in text index for ZCatalog with relevance ranking.
$Id$
"""
from cgi import escape

from AccessControl.class_init import InitializeClass
from AccessControl.Permissions import manage_vocabulary
from AccessControl.Permissions import manage_zcatalog_indexes
from AccessControl.Permissions import query_vocabulary
from AccessControl.Permissions import search_zcatalog
from AccessControl.SecurityInfo import ClassSecurityInfo
from Acquisition import aq_base
from Acquisition import aq_inner
from Acquisition import aq_parent
from Acquisition import Implicit
from App.special_dtml import DTMLFile
from OFS.SimpleItem import SimpleItem
from Persistence import Persistent
from zope.interface import implements

from Products.PluginIndexes.common.util import parseIndexRequest
from Products.PluginIndexes.common import safe_callable
from Products.PluginIndexes.interfaces import IPluggableIndex

from Products.ZCTextIndex.Lexicon import CaseNormalizer
from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter
from Products.ZCTextIndex.Lexicon import StopWordRemover
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.interfaces import ILexicon
from Products.ZCTextIndex.interfaces import IZCLexicon
from Products.ZCTextIndex.interfaces import IZCTextIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.PipelineFactory import element_factory

index_types = {'Okapi BM25 Rank': OkapiIndex,
               'Cosine Measure': CosineIndex}

class ZCTextIndex(Persistent, Implicit, SimpleItem):

    """Persistent text index.
    """

    implements(IZCTextIndex, IPluggableIndex)

    ## Magic class attributes ##

    meta_type = 'ZCTextIndex'
    query_options = ('query',)

    manage_options = (
        {'label': 'Overview', 'action': 'manage_main'},
    )

    security = ClassSecurityInfo()
    security.declareObjectProtected(manage_zcatalog_indexes)

    ## Constructor ##

    def __init__(self, id, extra=None, caller=None, index_factory=None,
                 field_name=None, lexicon_id=None):
        self.id = id

        # Arguments can be passed directly to the constructor or
        # via the silly "extra" record.
        self._fieldname = field_name or getattr(extra, 'doc_attr', '') or id
        self._indexed_attrs = self._fieldname.split(',')
        self._indexed_attrs = [attr.strip()
                               for attr in self._indexed_attrs if attr]

        lexicon_id = lexicon_id or getattr(extra, 'lexicon_id', '')
        lexicon = getattr(caller, lexicon_id, None)

        if lexicon is None:
            raise LookupError, 'Lexicon "%s" not found' % escape(lexicon_id)

        if not ILexicon.providedBy(lexicon):
            raise ValueError('Object "%s" does not implement '
                             'ZCTextIndex Lexicon interface'
                             % lexicon.getId())

        self.lexicon_id = lexicon.getId()
        self._v_lexicon = lexicon

        if index_factory is None:
            if extra.index_type not in index_types.keys():
                raise ValueError, 'Invalid index type "%s"' % escape(
                    extra.index_type)
            self._index_factory = index_types[extra.index_type]
            self._index_type = extra.index_type
        else:
            self._index_factory = index_factory

        self.index = self._index_factory(aq_base(self.getLexicon()))

    ## Private Methods ##

    security.declarePrivate('getLexicon')

    def getLexicon(self):
        """Get the lexicon for this index
        """
        if hasattr(aq_base(self), 'lexicon'):
            # Fix up old ZCTextIndexes by removing direct lexicon ref
            # and changing it to an ID
            lexicon = getattr(aq_parent(aq_inner(self)),
                              self.lexicon.getId())
            self.lexicon_id = lexicon.getId()
            del self.lexicon

        if getattr(aq_base(self), 'lexicon_path', None):
            # Fix up slightly less old ZCTextIndexes by removing
            # the physical path and changing it to an ID.
            # There's no need to use a physical path, which otherwise
            # makes it difficult to move or rename ZCatalogs.
            self.lexicon_id = self.lexicon_path[-1]
            del self.lexicon_path

        try:
            return self._v_lexicon
        except AttributeError:
            lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon_id)
            if not ILexicon.providedBy(lexicon):
                raise TypeError('Object "%s" is not a ZCTextIndex Lexicon'
                                % repr(lexicon))
            self._v_lexicon = lexicon
            return lexicon

    ## External methods not in the Pluggable Index API ##

    security.declareProtected(search_zcatalog, 'query')

    def query(self, query, nbest=10):
        """Return pair (mapping from docids to scores, num results).

        The num results is the total number of results before trimming
        to the nbest results.
        """
        tree = QueryParser(self.getLexicon()).parseQuery(query)
        results = tree.executeQuery(self.index)
        if results is None:
            return [], 0
        chooser = NBest(nbest)
        chooser.addmany(results.items())
        return chooser.getbest(), len(results)

    ## Pluggable Index APIs ##

    def index_object(self, documentId, obj, threshold=None):
        """Wrapper for index_doc() handling indexing of multiple attributes.

        Enter the document with the specified documentId in the index
        under the terms extracted from the indexed text attributes,
        each of which should yield either a string or a list of
        strings (Unicode or otherwise) to be passed to index_doc().
        """
        # XXX We currently ignore subtransaction threshold

        # needed for backward compatibility
        try:
            fields = self._indexed_attrs
        except:
            fields = [self._fieldname]

        res = 0
        all_texts = []
        for attr in fields:
            text = getattr(obj, attr, None)
            if text is None:
                continue
            if safe_callable(text):
                text = text()
            if text is None:
                continue
            if text:
                if isinstance(text, (list, tuple, )):
                    all_texts.extend(text)
                else:
                    all_texts.append(text)

        # Check that we're sending only strings
        all_texts = filter(lambda text: isinstance(text, basestring),
                           all_texts)
        if all_texts:
            return self.index.index_doc(documentId, all_texts)

        return res

    def unindex_object(self, docid):
        if self.index.has_doc(docid):
            self.index.unindex_doc(docid)

    def _apply_index(self, request):
        """Apply query specified by request, a mapping containing the query.

        Returns two objects on success: the resultSet containing the
        matching record numbers, and a tuple containing the names of
        the fields used.

        Returns None if request is not valid for this index.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        query_str = ' '.join(record.keys)
        if not query_str:
            return None
        tree = QueryParser(self.getLexicon()).parseQuery(query_str)
        results = tree.executeQuery(self.index)
        return results, (self.id,)

    def getEntryForObject(self, documentId, default=None):
        """Return the list of words indexed for documentId"""
        try:
            word_ids = self.index.get_words(documentId)
        except KeyError:
            return default
        get_word = self.getLexicon().get_word
        return [get_word(wid) for wid in word_ids]

    def uniqueValues(self, name=None, withLengths=0):
        raise NotImplementedError

    ## The ZCatalog Index management screen uses these methods ##

    def numObjects(self):
        """Return number of unique words in the index"""
        return self.index.length()

    def indexSize(self):
        """Return the number of indexed objects"""
        return self.index.document_count()

    def clear(self):
        """reinitialize the index (but not the lexicon)"""
        try:
            # Remove the cached reference to the lexicon
            # So that it is refreshed
            del self._v_lexicon
        except (AttributeError, KeyError):
            pass
        self.index = self._index_factory(aq_base(self.getLexicon()))

    ## User Interface Methods ##

    manage_main = DTMLFile('dtml/manageZCTextIndex', globals())

    def getIndexSourceNames(self):
        """Return sequence of names of indexed attributes"""
        try:
            return self._indexed_attrs
        except:
            return [self._fieldname]

    def getIndexType(self):
        """Return index type string"""
        return getattr(self, '_index_type', self._index_factory.__name__)

    def getLexiconURL(self):
        """Return the url of the lexicon used by the index"""
        try:
            lex = self.getLexicon()
        except (KeyError, AttributeError):
            return None
        else:
            return lex.absolute_url()

InitializeClass(ZCTextIndex)

def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
                          RESPONSE=None):
    """Add a text index"""
    if REQUEST is None:
        URL3 = None
    else:
        URL3 = REQUEST.URL3
    return self.manage_addIndex(id, 'ZCTextIndex', extra,
                                REQUEST, RESPONSE, URL3)

manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())

manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())

def manage_addLexicon(self, id, title='', elements=[], REQUEST=None):
    """Add ZCTextIndex Lexicon"""
    pipeline = []
    for el_record in elements:
        if not hasattr(el_record, 'name'):
            continue  # Skip over records that only specify element group
        element = element_factory.instantiate(el_record.group, el_record.name)
        if element is not None:
            if el_record.group == 'Word Splitter':
                # I don't like hardcoding this, but it's a simple solution
                # to get the splitter element first in the pipeline
                pipeline.insert(0, element)
            else:
                pipeline.append(element)

    lexicon = PLexicon(id, title, *pipeline)
    self._setObject(id, lexicon)
    if REQUEST is not None:
        return self.manage_main(self, REQUEST, update_menu=1)

# I am borrowing the existing vocabulary permissions for now to avoid
# adding new permissions. This may change when old style Vocabs go away
LexiconQueryPerm = query_vocabulary
LexiconMgmtPerm = manage_vocabulary

class PLexicon(Lexicon, Implicit, SimpleItem):

    """Lexicon for ZCTextIndex.
    """

    implements(IZCLexicon)

    meta_type = 'ZCTextIndex Lexicon'

    manage_options = ({'label': 'Overview', 'action': 'manage_main'},
                      {'label': 'Query', 'action': 'queryLexicon'},
                     ) + SimpleItem.manage_options

    security = ClassSecurityInfo()
    security.declareObjectProtected(LexiconQueryPerm)

    def __init__(self, id, title='', *pipeline):
        self.id = str(id)
        self.title = str(title)
        PLexicon.inheritedAttribute('__init__')(self, *pipeline)

    ## User Interface Methods ##

    def getPipelineNames(self):
        """Return list of names of pipeline element classes"""
        return [element.__class__.__name__ for element in self._pipeline]

    _queryLexicon = DTMLFile('dtml/queryLexicon', globals())

    security.declareProtected(LexiconQueryPerm, 'queryLexicon')

    def queryLexicon(self, REQUEST, words=None, page=0, rows=20, cols=4):
        """Lexicon browser/query user interface
        """
        if words:
            wids = []
            for word in self.parseTerms(words):
                wids.extend(self.globToWordIds(word))
            words = [self.get_word(wid) for wid in wids]
        else:
            words = self.words()

        word_count = len(words)
        rows = max(min(rows, 500), 1)
        cols = max(min(cols, 12), 1)
        page_count = word_count / (rows * cols) + \
                     (word_count % (rows * cols) > 0)
        page = max(min(page, page_count - 1), 0)
        start = rows * cols * page
        end = min(rows * cols * (page + 1), word_count)

        if word_count:
            words = list(words[start:end])
        else:
            words = []

        columns = []
        i = 0
        while i < len(words):
            columns.append(words[i:i + rows])
            i += rows

        info = dict(page=page,
                    rows=rows,
                    cols=cols,
                    start_word=start + 1,
                    end_word=end,
                    word_count=word_count,
                    page_count=page_count,
                    page_range=xrange(page_count),
                    page_columns=columns)

        if REQUEST is not None:
            return self._queryLexicon(self, REQUEST, **info)

        return info

    security.declareProtected(LexiconMgmtPerm, 'manage_main')
    manage_main = DTMLFile('dtml/manageLexicon', globals())

InitializeClass(PLexicon)
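A minimal sketch of the same parse/execute/trim flow that ZCTextIndex.query() performs, wired up by hand outside Zope (assumes the package and its BTrees dependency are importable; the doc ids and texts are invented):

from Products.ZCTextIndex.Lexicon import Lexicon, Splitter, CaseNormalizer
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.NBest import NBest

lexicon = Lexicon(Splitter(), CaseNormalizer())
index = OkapiIndex(lexicon)
index.index_doc(1, "the quick brown fox")
index.index_doc(2, "lazy dogs sleep")

results = QueryParser(lexicon).parseQuery("quick OR lazy").executeQuery(index)
chooser = NBest(10)
chooser.addmany(results.items())
print chooser.getbest()  # [(docid, score), ...], best score first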
src/Products/ZCTextIndex/__init__.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""ZCatalog Text Index
Plugin text index for ZCatalog.
"""
from PipelineFactory import element_factory
from Products.ZCTextIndex import ZCTextIndex, HTMLSplitter

def initialize(context):
    context.registerClass(
        ZCTextIndex.ZCTextIndex,
        permission='Add Pluggable Index',
        constructors=(ZCTextIndex.manage_addZCTextIndexForm,
                      ZCTextIndex.manage_addZCTextIndex,
                      getIndexTypes),
        icon='www/index.gif',
        visibility=None,
    )

    context.registerClass(
        ZCTextIndex.PLexicon,
        permission='Add Vocabularies',
        constructors=(ZCTextIndex.manage_addLexiconForm,
                      ZCTextIndex.manage_addLexicon,
                      getElementGroups, getElementNames),
        icon='www/lexicon.gif',
    )

    context.registerHelp()
    context.registerHelpTitle("Zope Help")

## Functions below are for use in the ZMI constructor forms ##

def getElementGroups(self):
    return element_factory.getFactoryGroups()

def getElementNames(self, group):
    return element_factory.getFactoryNames(group)

def getIndexTypes(self):
    return ZCTextIndex.index_types.keys()

## Allow relevant exceptions to be caught in untrusted code
from AccessControl import ModuleSecurityInfo
ModuleSecurityInfo('Products').declarePublic('ZCTextIndex')
ModuleSecurityInfo('Products.ZCTextIndex').declarePublic('ParseTree')
ModuleSecurityInfo('Products.ZCTextIndex.ParseTree').declarePublic('QueryError')
ModuleSecurityInfo('Products.ZCTextIndex.ParseTree').declarePublic('ParseError')
src/Products/ZCTextIndex/dtml/addLexicon.dtml deleted 100644 → 0
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex Lexicon',
help_product='ZCTextIndex',
help_topic='Lexicon_Add.stx'
)">
<p class="form-help">
A ZCTextIndex Lexicon processes and stores the words of documents indexed
with a ZCTextIndex. Multiple ZCTextIndexes can share the same lexicon.
</p>
<form action="manage_addLexicon" method="POST">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Title
</div>
</td>
<td align="left" valign="top">
<input type="text" name="title" size="40" />
</td>
</tr>
<dtml-in name="getElementGroups" prefix="group">
<dtml-let elements="getElementNames(group_item)">
<tr>
<td align="left" valign="top">
<div class="form-label">&dtml-group_item;</div>
</td>
<td align="left" valign="top">
<input type="hidden" name="elements.group:records"
value="&dtml-group_item;" />
<dtml-if expr="_.len(elements) > 1">
<select name="elements.name:records">
<dtml-in name="elements">
<option value="&dtml-sequence-item;"
>&dtml-sequence-item;</option>
</dtml-in>
</select>
<dtml-else>
<input type="checkbox" name="elements.name:records"
value="<dtml-var expr="elements[0]" html_quote>" checked />
</dtml-if>
</td>
</tr>
</dtml-let>
</dtml-in>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
src/Products/ZCTextIndex/dtml/addZCTextIndex.dtml deleted 100644 → 0
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex',
help_product='ZCTextIndex',
help_topic='ZCTextIndex_Add.stx'
)">
<p class="form-help">
<strong>Text Indexes</strong> break text up into individual words, and
are often referred to as full-text indexes. Text indexes
sort results by score, meaning they return hits in order
from the most relevant to the least relevant.
</p>
<form action="manage_addZCTextIndex" method="post"
enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Indexed attributes
</div></td>
<td align="left" valign="top">
<input type="text" name="extra.doc_attr:record" size="40" />
<em>attribute1,attribute2,...</em> or leave empty
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Ranking Strategy
</div>
</td>
<td align="left" valign="top">
<select name="extra.index_type:record">
<dtml-in name="getIndexTypes">
<option value="&dtml-sequence-item;">&dtml-sequence-item;</option>
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Lexicon
</div></td>
<td>
<dtml-in expr="superValues('ZCTextIndex Lexicon')">
<dtml-if sequence-start>
<select name="extra.lexicon_id:record">
</dtml-if>
<option value="&dtml-id;">
&dtml-id; <dtml-var name="title" fmt="(%s)" null html_quote>
</option>
<dtml-if sequence-end>
</select>
</dtml-if>
<dtml-else>
<em>You must create a ZCTextIndex Lexicon first.</em>
</dtml-in>
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
src/Products/ZCTextIndex/dtml/manageLexicon.dtml deleted 100644 → 0
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
The lexicon processes and stores the words found in objects indexed by one
or more ZCTextIndexes.
</p>
<p class="section-bar">
<span class="form-label">Input Pipeline Stages</span>
</p>
<p class="form-help">
Text indexed through this lexicon is processed by the following pipeline
stages:
</p>
<ol class="form-help">
<dtml-in name="getPipelineNames">
<li>&dtml-sequence-item;</li>
</dtml-in>
</ol>
<dtml-var manage_page_footer>
src/Products/ZCTextIndex/dtml/manageZCTextIndex.dtml deleted 100644 → 0
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
Name(s) of attribute(s) indexed:
<em><dtml-var "', '.join(getIndexSourceNames())"></em>
</p>
<p class="form-help">
Index type:
<em>&dtml-getIndexType;</em>
</p>
<p class="form-help">
ZCTextIndex Lexicon used:
<dtml-if getLexiconURL>
<a href="&dtml-getLexiconURL;/manage_main"
>&dtml-getLexiconURL;</a>
<dtml-else>
<em>(Lexicon Not Found)</em>
</dtml-if>
</p>
<p class="form-help">
<em>Note:</em> The lexicon assigned to the index cannot be changed. To replace
the existing lexicon, create a new lexicon in the same place and clear the
index. This will make the index use the replacement lexicon.
</p>
<dtml-var manage_page_footer>
src/Products/ZCTextIndex/dtml/queryLexicon.dtml deleted 100644 → 0
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
Browse the words in the lexicon or enter the word(s) you are interested in
below. Globbing characters (*, ?) are supported.
</p>
<dtml-let words_str="' '.join(REQUEST.get('words',[]))">
<form action="&dtml-URL;">
<p class="form-element">
<span class="form-label">Word(s)</span>
<input name="words:tokens" size="20" value="&dtml-words_str;" />
<input type="submit" value="Query" />
<span class="form-label"> Output Columns:</span>
<input name="cols:int" size="2" value="&dtml-cols;" />
<span class="form-label"> Rows:</span>
<input name="rows:int" size="2" value="&dtml-rows;" />
</p>
</form>
<hr />
<form action="&dtml-URL;">
<table width="100%" cellpadding="2" cellspacing="0" border="0">
<tr class="section-bar">
<td><span class="form-label">
&dtml-word_count; Words Found<dtml-if word_count>,
Displaying &dtml-start_word;-&dtml-end_word;
</dtml-if>
<dtml-if expr="page_count > 0">
</span></td>
<td align="right"><span class="form-label">
Page:
<select name="page:int" onchange="this.form.submit()">
<dtml-in name="page_range" prefix="page">
<option value="&dtml-page_item;"
<dtml-if expr="page == page_item">
selected
</dtml-if>
>
<dtml-var expr="page_item+1">
</option>
</dtml-in>
</select>
of &dtml-page_count;
<input type="submit" value="Go" />
<input type="hidden" name="cols:int" value="&dtml-cols;" />
<input type="hidden" name="rows:int" value="&dtml-rows;" />
<input type="hidden" name="words:tokens" value="&dtml-words_str;" />
</dtml-if>
</span></td>
</tr>
</table>
</form>
</dtml-let>
<dtml-if name="page_columns">
<table width="100%" cellpadding="0" cellspacing="10" border="0">
<tr>
<dtml-in name="page_columns" prefix="column">
<td align="left" valign="top">
<dtml-var expr="'<br />'.join(column_item)">
</td>
</dtml-in>
</tr>
</table>
</dtml-if>
<dtml-var manage_page_footer>
src/Products/ZCTextIndex/help/Lexicon_Add.stx deleted 100644 → 0
ZCTextIndex Lexicon - Add: Create a new ZCTextIndex Lexicon
Description
This view allows you to create a new ZCTextIndex Lexicon object.
ZCTextIndex Lexicons store the words indexed by ZCTextIndexes in a
ZCatalog.
Controls
'Id' -- Allows you to specify the id of the ZCTextIndex Lexicon.
'Title' -- Allows you to specify the title of the ZCTextIndex Lexicon.
Pipeline Stages
The remaining controls allow you to select the desired processing
of text to index by selecting pipeline stages.
The default available stages are:
- **Word Splitter** This is the only mandatory stage. The word
splitter breaks the text up into a list of words. Included is a
simple whitespace splitter, and a splitter that removes HTML
tags. The HTML aware splitter gives best results when all of
the incoming content to index is HTML.
- **Stop Words** To conserve space in the vocabulary, and possibly
increase performance, you can select a stop word remover which
subtracts very common or single letter words from the Lexicon.
Bear in mind that you will not be able to search on removed stop
words, and they will also be removed from queries passed to
search ZCTextIndexes using the Lexicon.
- **Case Normalizer** The case normalizer removes case information
from the words in the Lexicon. If case-sensitive searching is
desired, then omit this element from the pipeline.
src/Products/ZCTextIndex/help/ZCTextIndex_Add.stx deleted 100644 → 0
ZCTextIndex Add: Create a new ZCTextIndex
Description
A ZCTextIndex is an index for performing full text searches over
bodies of text. It includes the following features:
- Boolean query operators with parenthetical grouping
- Globbing (partial word) and phrase matching
- Two selectable relevance scoring algorithms
ZCTextIndex is designed as a replacement for standard TextIndex, and
has several advantages over it.
Controls
'Id' -- The id of the ZCTextIndex, must be unique for this ZCatalog.
'Field Name' -- The name of the field (object attribute) to be indexed.
'Ranking Strategy'
- **Okapi BM25 Rank** A relevance scoring technique that seems to
work well when the document text is considerably longer than the
query string, which is often the case with user specified query
strings.
- **Cosine Measure** A relevance scoring technique derived from the
"*Managing Gigabytes*":http://www.cs.mu.oz.au/mg/ book. It seems
to work best when the queries are similar in size and content to
the text they are searching.
'Lexicon' -- The ZCTextIndex Lexicon to be used by this ZCTextIndex.
Lexicons process and store the words from the text and
help in processing queries. You must define a ZCTextIndex
Lexicon before you can create a ZCTextIndex. Several
ZCTextIndexes can share the same Lexicon if desired.
src/Products/ZCTextIndex/interfaces.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2005 Zope Foundation and Contributors.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""ZCTextIndex z3 interfaces.
$Id$
"""
from zope.interface import Interface

class IZCTextIndex(Interface):
    """Persistent text index.
    """

class ILexicon(Interface):
    """Object responsible for converting text to word identifiers.
    """

    def termToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parse the text as if it consists of search terms, and skip
        words that aren't in the lexicon.
        """

    def sourceToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parse the text as if it comes from a source document, and
        create new word ids for words that aren't (yet) in the
        lexicon.
        """

    def globToWordIds(pattern):
        """Return a sequence of ids of words matching the pattern.

        The argument should be a single word using globbing syntax,
        e.g. 'foo*' meaning anything starting with 'foo'.

        Return the wids for all words in the lexicon that match the
        pattern.
        """

    def length():
        """Return the number of unique terms in the lexicon.
        """

    def get_word(wid):
        """Return the word for the given word id.

        Raise KeyError if the word id is not in the lexicon.
        """

    def get_wid(word):
        """Return the word id for the given word.

        Return 0 if the word is not in the lexicon.
        """

    def parseTerms(text):
        """Pass the text through the pipeline.

        Return a list of words, normalized by the pipeline
        (e.g. stopwords removed, case normalized etc.).
        """

    def isGlob(word):
        """Return true if the word is a globbing pattern.

        The word should be one of the words returned by parseTerms().
        """

class IZCLexicon(Interface):
    """Lexicon for ZCTextIndex.
    """

class ISplitter(Interface):
    """A splitter."""

    def process(text):
        """Run the splitter over the input text, returning a list of terms.
        """

class IPipelineElement(Interface):

    def process(source):
        """Provide a text processing step.

        Process a source sequence of words into a result sequence.
        """

    def processGlob(source):
        """Process, passing through globbing metacharacters.

        This is an optional method; if it is not used, process() is used.
        """

class IPipelineElementFactory(Interface):
    """Class for creating pipeline elements by name"""

    def registerFactory(group, name, factory):
        """Registers a pipeline factory by name and element group.

        Each name can be registered only once for a given group.
        Duplicate registrations will raise a ValueError.
        """

    def getFactoryGroups():
        """Returns a sorted list of element group names.
        """

    def getFactoryNames(group):
        """Returns a sorted list of registered pipeline factory names
        in the specified element group.
        """

    def instantiate(group, name):
        """Instantiates a pipeline element by group and name. If name is
        not registered, raise a KeyError.
        """

class IQueryParseTree(Interface):
    """Interface for parse trees returned by parseQuery()."""

    def nodeType():
        """Return the node type.

        This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
        """

    def getValue():
        """Return a node-type specific value.

        For node type:    Return:
        'AND'             a list of parse trees
        'OR'              a list of parse trees
        'NOT'             a parse tree
        'ATOM'            a string (representing a single search term)
        'PHRASE'          a string (representing a search phrase)
        'GLOB'            a string (representing a pattern, e.g. "foo*")
        """

    def terms():
        """Return a list of all terms in this node, excluding NOT subtrees."""

    def executeQuery(index):
        """Execute the query represented by this node against the index.

        The index argument must implement the IIndex interface.

        Return an IIBucket or IIBTree mapping document ids to scores
        (higher scores mean better results).

        May raise ParseTree.QueryError.
        """

class IQueryParser(Interface):
    """Interface for Query Parsers."""

    def parseQuery(query):
        """Parse a query string.

        Return a parse tree (which implements IQueryParseTree).

        Some of the query terms may be ignored because they are
        stopwords; use getIgnored() to find out which terms were
        ignored.  But if the entire query consists only of stop words,
        or of stopwords and one or more negated terms, an exception is
        raised.

        May raise ParseTree.ParseError.
        """

    def getIgnored():
        """Return the list of ignored terms.

        Return the list of terms that were ignored by the most recent
        call to parseQuery() because they were stopwords.

        If parseQuery() was never called this returns None.
        """

    def parseQueryEx(query):
        """Parse a query string.

        Return a tuple (tree, ignored) where 'tree' is the parse tree
        as returned by parseQuery(), and 'ignored' is a list of
        ignored terms as returned by getIgnored().

        May raise ParseTree.ParseError.
        """

class IIndex(Interface):
    """Interface for an Index."""

    def length():
        """Return the number of words in the index."""

    def document_count():
        """Return the number of documents in the index."""

    def get_words(docid):
        """Return a list of wordids for the given docid."""

    def search(term):
        """Execute a search on a single term given as a string.

        Return an IIBTree mapping docid to score, or None if all docs
        match due to the lexicon returning no wids for the term (e.g.,
        if the term is entirely composed of stopwords).
        """

    def search_phrase(phrase):
        """Execute a search on a phrase given as a string.

        Return an IIBTree mapping docid to score.
        """

    def search_glob(pattern):
        """Execute a pattern search.

        The pattern represents a set of words by using * and ?.  For
        example, "foo*" represents the set of all words in the lexicon
        starting with "foo".

        Return an IIBTree mapping docid to score.
        """

    def query_weight(terms):
        """Return the weight for a set of query terms.

        'terms' is a sequence of all terms included in the query,
        although not terms with a not.  If a term appears more than
        once in a query, it should appear more than once in terms.

        Nothing is defined about what "weight" means, beyond that the
        result is an upper bound on document scores returned for the
        query.
        """

    def index_doc(docid, text):
        """Add a document with the specified id and text to the index.  If a
        document by that id already exists, replace its text with the new
        text provided.

        text may be either a string (Unicode or otherwise) or a list
        of strings from which to extract the terms under which to
        index the source document.
        """

    def unindex_doc(docid):
        """Remove the document with the specified id from the index"""

    def has_doc(docid):
        """Returns true if docid is an id of a document in the index"""

class INBest(Interface):
    """NBest chooser Interface.

    An NBest object remembers the N best-scoring items ever passed to its
    .add(item, score) method.  If .add() is called M times, the worst-case
    number of comparisons performed overall is M * log2(N).
    """

    def add(item, score):
        """Record that item 'item' has score 'score'.  No return value.

        The N best-scoring items are remembered, where N was passed to
        the constructor.  'item' can be anything.  'score' should be
        a number, and larger numbers are considered better.
        """

    def addmany(sequence):
        """Like "for item, score in sequence: self.add(item, score)".

        This is simply faster than calling add() len(seq) times.
        """

    def getbest():
        """Return the (at most) N best-scoring items as a sequence.

        The return value is a sequence of 2-tuples, (item, score), with
        the largest score first.  If .add() has been called fewer than
        N times, this sequence will contain fewer than N pairs.
        """

    def pop_smallest():
        """Return and remove the (item, score) pair with lowest score.

        If len(self) is 0, raise IndexError.

        To be clearer, this is the lowest score among the N best-scoring
        seen so far.  This is most useful if the capacity of the NBest
        object is never exceeded, in which case pop_smallest() allows
        using the object as an ordinary smallest-in-first-out priority
        queue.
        """

    def __len__():
        """Return the number of (item, score) pairs currently known.

        This is N (the value passed to the constructor), unless .add()
        has been called fewer than N times.
        """

    def capacity():
        """Return the maximum number of (item, score) pairs.

        This is N (the value passed to the constructor).
        """
src/Products/ZCTextIndex/okascore.c deleted 100644 → 0
/*****************************************************************************
  Copyright (c) 2002 Zope Foundation and Contributors.
  All Rights Reserved.

  This software is subject to the provisions of the Zope Public License,
  Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
  FOR A PARTICULAR PURPOSE
 ****************************************************************************/

/* okascore.c
 *
 * The inner scoring loop of OkapiIndex._search_wids() coded in C.
 *
 * Example from an indexed Python-Dev archive, where "python" shows up in all
 * but 2 of the 19,058 messages.  With the Python scoring loop,
 *
 *     query: python
 *     # results: 10 of 19056 in 534.77 ms
 *     query: python
 *     # results: 10 of 19056 in 277.52 ms
 *
 * The first timing is cold, the second timing from an immediate repeat of
 * the same query.  With the scoring loop here in C:
 *
 *     query: python
 *     # results: 10 of 19056 in 380.74 ms -- 40% speedup
 *     query: python
 *     # results: 10 of 19056 in 118.96 ms -- 133% speedup
 */

#include "Python.h"

#define K1 1.2
#define B 0.75

#ifndef PyTuple_CheckExact
#define PyTuple_CheckExact PyTuple_Check
#endif

static PyObject *
score(PyObject *self, PyObject *args)
{
    /* Believe it or not, floating these common subexpressions "by hand"
       gets better code out of MSVC 6. */
    const double B_FROM1 = 1.0 - B;
    const double K1_PLUS1 = K1 + 1.0;

    /* Inputs */
    PyObject *result;   /* IIBucket result, maps d to score */
    PyObject *d2fitems; /* ._wordinfo[t].items(), maps d to f(d, t) */
    PyObject *d2len;    /* ._docweight, maps d to # words in d */
    double idf;         /* inverse doc frequency of t */
    double meandoclen;  /* average number of words in a doc */

    int n, i;

    if (!PyArg_ParseTuple(args, "OOOdd:score",
                          &result, &d2fitems, &d2len, &idf, &meandoclen))
        return NULL;

    idf *= 1024.0;  /* float out part of the scaled_int computation */
    n = PyObject_Length(d2fitems);
    for (i = 0; i < n; ++i) {
        PyObject *d_and_f;  /* d2f[i], a (d, f) pair */
        PyObject *d;
        double f;
        PyObject *doclen;   /* ._docweight[d] */
        double lenweight;
        double tf;
        PyObject *scaled_int;
        int status;

        d_and_f = PySequence_GetItem(d2fitems, i);
        if (d_and_f == NULL)
            return NULL;
        if (!(PyTuple_CheckExact(d_and_f) &&
              PyTuple_GET_SIZE(d_and_f) == 2)) {
            PyErr_SetString(PyExc_TypeError,
                            "d2fitems must produce 2-item tuples");
            Py_DECREF(d_and_f);
            return NULL;
        }
        d = PyTuple_GET_ITEM(d_and_f, 0);
        f = (double)PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));

        doclen = PyObject_GetItem(d2len, d);
        if (doclen == NULL) {
            Py_DECREF(d_and_f);
            return NULL;
        }

        lenweight = B_FROM1 + B * PyInt_AS_LONG(doclen) / meandoclen;

        tf = f * K1_PLUS1 / (f + K1 * lenweight);
        scaled_int = PyInt_FromLong((long)(tf * idf + 0.5));
        if (scaled_int == NULL)
            status = -1;
        else
            status = PyObject_SetItem(result, d, scaled_int);
        Py_DECREF(d_and_f);
        Py_DECREF(doclen);
        Py_XDECREF(scaled_int);
        if (status < 0)
            return NULL;
    }
    Py_INCREF(Py_None);
    return Py_None;
}

static char score__doc__[] =
"score(result, d2fitems, d2len, idf, meandoclen)\n"
"\n"
"Do the inner scoring loop for an Okapi index.\n";

static PyMethodDef okascore_functions[] = {
    {"score", score, METH_VARARGS, score__doc__},
    {NULL}
};

void
initokascore(void)
{
    PyObject *m;

    m = Py_InitModule3("okascore", okascore_functions,
                       "inner scoring loop for Okapi rank");
}
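In Python terms, the loop above computes the classic Okapi BM25 term weight. Here is a hedged sketch of the same arithmetic; variable names follow the C code, but plain dicts stand in for the IIBucket/BTree containers.

K1 = 1.2
B = 0.75

def score(result, d2fitems, d2len, idf, meandoclen):
    # Same arithmetic as the C loop: BM25 term-frequency damping with
    # document-length normalization, scaled by 1024 and rounded so the
    # scores stay small integers.
    idf *= 1024.0
    for d, f in d2fitems:
        lenweight = (1.0 - B) + B * d2len[d] / meandoclen
        tf = f * (K1 + 1.0) / (f + K1 * lenweight)
        result[d] = int(tf * idf + 0.5)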
src/Products/ZCTextIndex/stopper.c deleted 100644 → 0
/*****************************************************************************
  Copyright (c) 2002 Zope Foundation and Contributors.
  All Rights Reserved.

  This software is subject to the provisions of the Zope Public License,
  Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
  FOR A PARTICULAR PURPOSE
 ****************************************************************************/

/* stopper.c
 *
 * Fast version of the StopWordRemover object.
 */

#include "Python.h"

static PyObject *
stopper_process(PyObject *unused, PyObject *args)
{
    PyObject *result = NULL;
    PyObject *dict;
    PyObject *seq;
    int len, i;

    if (!PyArg_ParseTuple(args, "O!O:process", &PyDict_Type, &dict, &seq))
        return NULL;
    seq = PySequence_Fast(seq,
                          "process() requires a sequence as argument 2");
    if (seq == NULL)
        return NULL;
    result = PyList_New(0);
    if (result == NULL)
        goto finally;
#if PY_VERSION_HEX >= 0x02020000
    /* Only available in Python 2.2 and newer. */
    len = PySequence_Fast_GET_SIZE(seq);
#else
    len = PyObject_Length(seq);
#endif
    for (i = 0; i < len; ++i) {
        PyObject *s = PySequence_Fast_GET_ITEM(seq, i);
        /*
         * PyDict_GetItem() returns NULL if there isn't a matching
         * item, but without setting an exception, so this does what
         * we want.
         */
        if (PyDict_GetItem(dict, s) == NULL) {
            if (PyList_Append(result, s) < 0) {
                Py_DECREF(result);
                result = NULL;
                goto finally;
            }
        }
    }
 finally:
    Py_DECREF(seq);
    return result;
}

static PyMethodDef stopper_functions[] = {
    {"process", stopper_process, METH_VARARGS,
     "process(dict, [str, ...]) --> [str, ...]\n"
     "Remove stop words (the keys of dict) from the input list of strings\n"
     " to create a new list."},
    {NULL}
};

void
initstopper(void)
{
    Py_InitModule3("stopper", stopper_functions,
                   "Fast StopWordRemover implementation.");
}
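The pure-Python behaviour this module accelerates is just a dictionary-membership filter. A hedged one-function equivalent (the function name mirrors the C entry point):

def process(stopdict, seq):
    # Equivalent of stopper.process(dict, seq): keep only the words
    # that are not keys of the stop-word dictionary.
    return [w for w in seq if not stopdict.has_key(w)]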
src/Products/ZCTextIndex/tests/__init__.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Test package."""
src/Products/ZCTextIndex/tests/hs-tool.py deleted 100755 → 0
#! /usr/bin/env python

import cPickle
import os.path
import sys

from hotshot.log import LogReader

def load_line_info(log):
    byline = {}
    prevloc = None
    for what, place, tdelta in log:
        if tdelta > 0:
            t, nhits = byline.get(prevloc, (0, 0))
            byline[prevloc] = (tdelta + t), (nhits + 1)
        prevloc = place
    return byline

def basename(path, cache={}):
    try:
        return cache[path]
    except KeyError:
        fn = os.path.split(path)[1]
        cache[path] = fn
        return fn

def print_results(results):
    for info, place in results:
        if place is None:
            # This is the startup time for the profiler, and only
            # occurs at the very beginning.  Just ignore it, since it
            # corresponds to frame setup of the outermost call, not
            # anything that's actually interesting.
            continue
        filename, line, funcname = place
        print '%8d %8d' % info, basename(filename), line

def annotate_results(results):
    files = {}
    for stats, place in results:
        if not place:
            continue
        time, hits = stats
        file, line, func = place
        l = files.get(file)
        if l is None:
            l = files[file] = []
        l.append((line, hits, time))
    order = files.keys()
    order.sort()
    for k in order:
        if os.path.exists(k):
            v = files[k]
            v.sort()
            annotate(k, v)

def annotate(file, lines):
    print "-" * 60
    print file
    print "-" * 60
    f = open(file)
    i = 1
    match = lines[0][0]
    for line in f:
        if match == i:
            print "%6d %8d " % lines[0][1:], line,
            del lines[0]
            if lines:
                match = lines[0][0]
            else:
                match = None
        else:
            print " " * 16, line,
        i += 1
    print

def get_cache_name(filename):
    d, fn = os.path.split(filename)
    cache_dir = os.path.join(d, '.hs-tool')
    cache_file = os.path.join(cache_dir, fn)
    return cache_dir, cache_file

def cache_results(filename, results):
    cache_dir, cache_file = get_cache_name(filename)
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    fp = open(cache_file, 'wb')
    try:
        cPickle.dump(results, fp, 1)
    finally:
        fp.close()

def main(filename, annotate):
    cache_dir, cache_file = get_cache_name(filename)
    if (os.path.isfile(cache_file)
        and os.path.getmtime(cache_file) > os.path.getmtime(filename)):
        # cached data is up-to-date:
        fp = open(cache_file, 'rb')
        results = cPickle.load(fp)
        fp.close()
    else:
        log = LogReader(filename)
        byline = load_line_info(log)
        # Sort
        results = [(v, k) for k, v in byline.items()]
        results.sort()
        cache_results(filename, results)
    if annotate:
        annotate_results(results)
    else:
        print_results(results)

if __name__ == "__main__":
    import getopt
    annotate_p = 0
    opts, args = getopt.getopt(sys.argv[1:], 'A')
    for o, v in opts:
        if o == '-A':
            annotate_p = 1
    if args:
        filename, = args
    else:
        filename = "profile.dat"
    main(filename, annotate_p)
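hs-tool.py expects a hotshot log recorded with line events. A hedged sketch of producing one (the profiled function is made up; the Profile arguments mirror the ones mailtest.py uses):

import hotshot

def work():
    # Hypothetical workload to profile.
    total = 0
    for i in range(1000):
        total += i
    return total

prof = hotshot.Profile("profile.dat", lineevents=1, linetimings=1)
prof.runcall(work)
prof.close()
# then:  python hs-tool.py -A profile.dat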
src/Products/ZCTextIndex/tests/indexhtml.py deleted 100644 → 0
#! /usr/bin/env python
"""Index a collection of HTML files on the filesystem.

usage: indexhtml.py [options] dir

Will create an index of all files in dir or its subdirectories.

options:
-f data.fs -- the path to the filestorage datafile
"""
# XXX: Products.PluginIndexes.TextIndex no longer exists

from __future__ import nested_scopes

import os
from time import clock

import ZODB
from ZODB.FileStorage import FileStorage
from BTrees.IOBTree import IOBTree
import transaction

from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
from Products.ZCTextIndex.Lexicon import Lexicon, StopWordRemover

def make_zc_index():
    # there's an elaborate dance necessary to construct an index
    class Struct:
        pass
    extra = Struct()
    extra.doc_attr = "read"
    extra.lexicon_id = "lexicon"
    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
    return ZCTextIndex("read", extra, caller)

# XXX make a splitter more like the HTMLSplitter for TextIndex
# signature is
# Splitter(string, stop_words, encoding,
#          singlechar, indexnumbers, casefolding)

class MySplitter:
    def __init__(self):
        self._v_splitter = HTMLWordSplitter()
    def __call__(self, text, stopdict, *args, **kwargs):
        words = self._v_splitter._split(text)
        def lookup(w):
            return stopdict.get(w, w)
        return filter(None, map(lookup, words))

#def make_old_index():
#    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
#    from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
#    from Products.ZCTextIndex.StopDict import get_stopdict
#
#    l = Lexicon(get_stopdict())
#    l.SplitterFunc = MySplitter()
#    return TextIndex("read", lexicon=l)

def main(db, rt, dir):
    rt["index"] = index = INDEX()
    rt["files"] = paths = IOBTree()
    transaction.commit()

    zodb_time = 0.0
    pack_time = 0.0

    files = [os.path.join(dir, file) for file in os.listdir(dir)]
    docid = 0
    t0 = clock()
    for file in files:
        if os.path.isdir(file):
            files += [os.path.join(file, sub) for sub in os.listdir(file)]
        else:
            if not file.endswith(".html"):
                continue
            docid += 1
            if LIMIT is not None and docid > LIMIT:
                break
            if VERBOSE:
                print "%5d" % docid, file
            f = open(file, "rb")
            paths[docid] = file
            index.index_object(docid, f)
            f.close()
            if docid % TXN_INTERVAL == 0:
                z0 = clock()
                transaction.commit()
                z1 = clock()
                zodb_time += z1 - z0
                if VERBOSE:
                    print "commit took", z1 - z0, zodb_time
            if docid % PACK_INTERVAL == 0:
                p0 = clock()
                db.pack()
                p1 = clock()
                zodb_time += p1 - p0
                pack_time += p1 - p0
                if VERBOSE:
                    print "pack took", p1 - p0, pack_time
    z0 = clock()
    transaction.commit()
    z1 = t1 = clock()
    total_time = t1 - t0
    zodb_time += z1 - z0
    if VERBOSE:
        print "Total index time", total_time
        print "Non-pack time", total_time - pack_time
        print "Non-ZODB time", total_time - zodb_time

if __name__ == "__main__":
    import sys
    import getopt

    VERBOSE = 0
    FSPATH = "Data.fs"
    TXN_INTERVAL = 100
    PACK_INTERVAL = 500
    LIMIT = None
    INDEX = make_zc_index
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'vf:t:p:n:T')
    except getopt.error, msg:
        print msg
        print __doc__
        sys.exit(2)
    for o, v in opts:
        if o == '-v':
            VERBOSE += 1
        if o == '-f':
            FSPATH = v
        if o == '-t':
            TXN_INTERVAL = int(v)
        if o == '-p':
            PACK_INTERVAL = int(v)
        if o == '-n':
            LIMIT = int(v)
#        if o == '-T':
#            INDEX = make_old_index
    if len(args) != 1:
        print "Expected one argument"
        print __doc__
        sys.exit(2)
    dir = args[0]
    fs = FileStorage(FSPATH)
    db = ZODB.DB(fs)
    cn = db.open()
    rt = cn.root()
    dir = os.path.join(os.getcwd(), dir)
    print dir
    main(db, rt, dir)
    cn.close()
    fs.close()
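Once indexhtml.py has run, the Data.fs can be reopened and queried later. A hedged sketch, assuming only what the script writes ("index" and "files" root keys); the query string is made up:

import ZODB
from ZODB.FileStorage import FileStorage

fs = FileStorage("Data.fs", read_only=1)
db = ZODB.DB(fs)
rt = db.open().root()
index, paths = rt["index"], rt["files"]
results, num = index.query("python AND unicode")
for docid, score in results:
    print score, paths[docid]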
src/Products/ZCTextIndex/tests/mailtest.py deleted 100644 → 0
"""Test an index with a Unix mailbox file.
usage: python mailtest.py [options] <data.fs>
options:
-v -- verbose
Index Generation
-i mailbox
-n NNN -- max number of messages to read from mailbox
-t NNN -- commit a transaction every NNN messages (default: 1)
-p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
-p 0 -- don't pack at all
-x -- exclude the message text from the data.fs
Queries
-q query
-b NNN -- return the NNN best matches (default: 10)
-c NNN -- context; if -v, show the first NNN lines of results (default: 5)
The script either indexes or queries depending on whether -q or -i is
passed as an option.
For -i mailbox, the script reads mail messages from the mailbox and
indexes them. It indexes one message at a time, then commits the
transaction.
For -q query, it performs a query on an existing index.
If both are specified, the index is performed first.
You can also interact with the index after it is completed. Load the
index from the database:
import ZODB
from ZODB.FileStorage import FileStorage
fs = FileStorage(<data.fs>
db = ZODB.DB(fs)
index = cn.open().root()["index"]
index.search("python AND unicode")
"""
import
ZODB
import
ZODB.FileStorage
import
transaction
from
Products.ZCTextIndex.Lexicon
import
\
Lexicon
,
CaseNormalizer
,
Splitter
,
StopWordRemover
from
Products.ZCTextIndex.ZCTextIndex
import
ZCTextIndex
from
BTrees.IOBTree
import
IOBTree
from
Products.ZCTextIndex.QueryParser
import
QueryParser
import
sys
import
mailbox
import
time
def
usage
(
msg
):
print
msg
print
__doc__
sys
.
exit
(
2
)
class
Message
:
total_bytes
=
0
def
__init__
(
self
,
msg
):
subject
=
msg
.
getheader
(
'subject'
,
''
)
author
=
msg
.
getheader
(
'from'
,
''
)
if
author
:
summary
=
"%s (%s)
\
n
"
%
(
subject
,
author
)
else
:
summary
=
"%s
\
n
"
%
subject
self
.
text
=
summary
+
msg
.
fp
.
read
()
Message
.
total_bytes
+=
len
(
self
.
text
)
class
Extra
:
pass
def
index
(
rt
,
mboxfile
,
db
,
profiler
):
global
NUM
idx_time
=
0
pack_time
=
0
start_time
=
time
.
time
()
lexicon
=
Lexicon
(
Splitter
(),
CaseNormalizer
(),
StopWordRemover
())
extra
=
Extra
()
extra
.
lexicon_id
=
'lexicon'
extra
.
doc_attr
=
'text'
extra
.
index_type
=
'Okapi BM25 Rank'
caller
=
Extra
()
caller
.
lexicon
=
lexicon
rt
[
"index"
]
=
idx
=
ZCTextIndex
(
"index"
,
extra
,
caller
)
if
not
EXCLUDE_TEXT
:
rt
[
"documents"
]
=
docs
=
IOBTree
()
else
:
docs
=
None
transaction
.
commit
()
mbox
=
mailbox
.
UnixMailbox
(
open
(
mboxfile
,
'rb'
))
if
VERBOSE
:
print
"opened"
,
mboxfile
if
not
NUM
:
NUM
=
sys
.
maxint
if
profiler
:
itime
,
ptime
,
i
=
profiler
.
runcall
(
indexmbox
,
mbox
,
idx
,
docs
,
db
)
else
:
itime
,
ptime
,
i
=
indexmbox
(
mbox
,
idx
,
docs
,
db
)
idx_time
+=
itime
pack_time
+=
ptime
transaction
.
commit
()
if
PACK_INTERVAL
and
i
%
PACK_INTERVAL
!=
0
:
if
VERBOSE
>=
2
:
print
"packing one last time..."
p0
=
time
.
clock
()
db
.
pack
(
time
.
time
())
p1
=
time
.
clock
()
if
VERBOSE
:
print
"pack took %s sec"
%
(
p1
-
p0
)
pack_time
+=
p1
-
p0
if
VERBOSE
:
finish_time
=
time
.
time
()
print
print
"Index time"
,
round
(
idx_time
/
60
,
3
),
"minutes"
print
"Pack time"
,
round
(
pack_time
/
60
,
3
),
"minutes"
print
"Index bytes"
,
Message
.
total_bytes
rate
=
(
Message
.
total_bytes
/
idx_time
)
/
1024
print
"Index rate %.2f KB/sec"
%
rate
print
"Indexing began"
,
time
.
ctime
(
start_time
)
print
"Indexing ended"
,
time
.
ctime
(
finish_time
)
print
"Wall clock minutes"
,
round
((
finish_time
-
start_time
)
/
60
,
3
)
def
indexmbox
(
mbox
,
idx
,
docs
,
db
):
idx_time
=
0
pack_time
=
0
i
=
0
while
i
<
NUM
:
_msg
=
mbox
.
next
()
if
_msg
is
None
:
break
i
+=
1
msg
=
Message
(
_msg
)
if
VERBOSE
>=
2
:
print
"indexing msg"
,
i
i0
=
time
.
clock
()
idx
.
index_object
(
i
,
msg
)
if
not
EXCLUDE_TEXT
:
docs
[
i
]
=
msg
if
i
%
TXN_SIZE
==
0
:
transaction
.
commit
()
i1
=
time
.
clock
()
idx_time
+=
i1
-
i0
if
VERBOSE
and
i
%
50
==
0
:
print
i
,
"messages indexed"
print
"cache size"
,
db
.
cacheSize
()
if
PACK_INTERVAL
and
i
%
PACK_INTERVAL
==
0
:
if
VERBOSE
>=
2
:
print
"packing..."
p0
=
time
.
clock
()
db
.
pack
(
time
.
time
())
p1
=
time
.
clock
()
if
VERBOSE
:
print
"pack took %s sec"
%
(
p1
-
p0
)
pack_time
+=
p1
-
p0
return
idx_time
,
pack_time
,
i
def
query
(
rt
,
query_str
,
profiler
):
idx
=
rt
[
"index"
]
docs
=
rt
[
"documents"
]
start
=
time
.
clock
()
if
profiler
is
None
:
results
,
num_results
=
idx
.
query
(
query_str
,
BEST
)
else
:
if
WARM_CACHE
:
print
"Warming the cache..."
idx
.
query
(
query_str
,
BEST
)
start
=
time
.
clock
()
results
,
num_results
=
profiler
.
runcall
(
idx
.
query
,
query_str
,
BEST
)
elapsed
=
time
.
clock
()
-
start
print
"query:"
,
query_str
print
"# results:"
,
len
(
results
),
"of"
,
num_results
,
\
"in %.2f ms"
%
(
elapsed
*
1000
)
tree
=
QueryParser
(
idx
.
lexicon
).
parseQuery
(
query_str
)
qw
=
idx
.
index
.
query_weight
(
tree
.
terms
())
for
docid
,
score
in
results
:
scaled
=
100.0
*
score
/
qw
print
"docid %7d score %6d scaled %5.2f%%"
%
(
docid
,
score
,
scaled
)
if
VERBOSE
:
msg
=
docs
[
docid
]
ctx
=
msg
.
text
.
split
(
"
\
n
"
,
CONTEXT
)
del
ctx
[
-
1
]
print
"-"
*
60
print
"message:"
for
l
in
ctx
:
print
l
print
"-"
*
60
def
main
(
fs_path
,
mbox_path
,
query_str
,
profiler
):
f
=
ZODB
.
FileStorage
.
FileStorage
(
fs_path
)
db
=
ZODB
.
DB
(
f
,
cache_size
=
CACHE_SIZE
)
cn
=
db
.
open
()
rt
=
cn
.
root
()
if
mbox_path
is
not
None
:
index
(
rt
,
mbox_path
,
db
,
profiler
)
if
query_str
is
not
None
:
query
(
rt
,
query_str
,
profiler
)
cn
.
close
()
db
.
close
()
f
.
close
()
if
__name__
==
"__main__"
:
import
getopt
NUM
=
0
VERBOSE
=
0
PACK_INTERVAL
=
500
EXCLUDE_TEXT
=
0
CACHE_SIZE
=
10000
TXN_SIZE
=
1
BEST
=
10
CONTEXT
=
5
WARM_CACHE
=
0
query_str
=
None
mbox_path
=
None
profile
=
None
old_profile
=
None
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'vn:p:i:q:b:c:xt:w'
,
[
'profile='
,
'old-profile='
])
except
getopt
.
error
,
msg
:
usage
(
msg
)
if
len
(
args
)
!=
1
:
usage
(
"exactly 1 filename argument required"
)
for
o
,
v
in
opts
:
if
o
==
'-n'
:
NUM
=
int
(
v
)
elif
o
==
'-v'
:
VERBOSE
+=
1
elif
o
==
'-p'
:
PACK_INTERVAL
=
int
(
v
)
elif
o
==
'-q'
:
query_str
=
v
elif
o
==
'-i'
:
mbox_path
=
v
elif
o
==
'-b'
:
BEST
=
int
(
v
)
elif
o
==
'-x'
:
EXCLUDE_TEXT
=
1
elif
o
==
'-t'
:
TXN_SIZE
=
int
(
v
)
elif
o
==
'-c'
:
CONTEXT
=
int
(
v
)
elif
o
==
'-w'
:
WARM_CACHE
=
1
elif
o
==
'--profile'
:
profile
=
v
elif
o
==
'--old-profile'
:
old_profile
=
v
fs_path
,
=
args
if
profile
:
import
hotshot
profiler
=
hotshot
.
Profile
(
profile
,
lineevents
=
1
,
linetimings
=
1
)
elif
old_profile
:
import
profile
profiler
=
profile
.
Profile
()
else
:
profiler
=
None
main
(
fs_path
,
mbox_path
,
query_str
,
profiler
)
if
profile
:
profiler
.
close
()
elif
old_profile
:
import
pstats
profiler
.
dump_stats
(
old_profile
)
stats
=
pstats
.
Stats
(
old_profile
)
stats
.
strip_dirs
().
sort_stats
(
'time'
).
print_stats
(
20
)
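A hedged pair of invocations, assembled from the option list in the docstring above (the mailbox path is made up):

python mailtest.py -v -i ~/Mail/inbox data.fs         # build the index
python mailtest.py -q "python AND unicode" data.fs    # query it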
src/Products/ZCTextIndex/tests/mhindex.py deleted 100644 → 0
"""MH mail indexer.
To index messages from a single folder (messages defaults to 'all'):
mhindex.py [options] -u +folder [messages ...]
To bulk index all messages from several folders:
mhindex.py [options] -b folder ...; the folder name ALL means all folders.
To execute a single query:
mhindex.py [options] query
To enter interactive query mode:
mhindex.py [options]
Common options:
-d FILE -- specify the Data.fs to use (default ~/.Data.fs)
-w -- dump the word list in alphabetical order and exit
-W -- dump the word list ordered by word id and exit
Indexing options:
-O -- do a prescan on the data to compute optimal word id assignments;
this is only useful the first time the Data.fs is used
-t N -- commit a transaction after every N messages (default 20000)
-p N -- pack after every N commits (by default no packing is done)
Querying options:
-m N -- show at most N matching lines from the message (default 3)
-n N -- show the N best matching messages (default 3)
"""
import
os
import
re
import
sys
import
time
import
mhlib
import
getopt
import
traceback
from
StringIO
import
StringIO
from
stat
import
ST_MTIME
DATAFS
=
"~/.Data.fs"
ZOPECODE
=
"~/projects/Zope/lib/python"
sys
.
path
.
append
(
os
.
path
.
expanduser
(
ZOPECODE
))
from
ZODB
import
DB
from
ZODB.FileStorage
import
FileStorage
from
Persistence
import
Persistent
from
BTrees.IOBTree
import
IOBTree
from
BTrees.OIBTree
import
OIBTree
from
BTrees.IIBTree
import
IIBTree
import
transaction
from
Products.ZCTextIndex.NBest
import
NBest
from
Products.ZCTextIndex.OkapiIndex
import
OkapiIndex
from
Products.ZCTextIndex.Lexicon
import
Lexicon
,
Splitter
from
Products.ZCTextIndex.Lexicon
import
CaseNormalizer
,
StopWordRemover
from
Products.ZCTextIndex.QueryParser
import
QueryParser
from
Products.ZCTextIndex.StopDict
import
get_stopdict
NBEST
=
3
MAXLINES
=
3
def
main
():
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
"bd:fhm:n:Op:t:uwW"
)
except
getopt
.
error
,
msg
:
print
msg
print
"use -h for help"
return
2
update
=
0
bulk
=
0
optimize
=
0
nbest
=
NBEST
maxlines
=
MAXLINES
datafs
=
os
.
path
.
expanduser
(
DATAFS
)
pack
=
0
trans
=
20000
dumpwords
=
dumpwids
=
dumpfreqs
=
0
for
o
,
a
in
opts
:
if
o
==
"-b"
:
bulk
=
1
if
o
==
"-d"
:
datafs
=
a
if
o
==
"-f"
:
dumpfreqs
=
1
if
o
==
"-h"
:
print
__doc__
return
if
o
==
"-m"
:
maxlines
=
int
(
a
)
if
o
==
"-n"
:
nbest
=
int
(
a
)
if
o
==
"-O"
:
optimize
=
1
if
o
==
"-p"
:
pack
=
int
(
a
)
if
o
==
"-t"
:
trans
=
int
(
a
)
if
o
==
"-u"
:
update
=
1
if
o
==
"-w"
:
dumpwords
=
1
if
o
==
"-W"
:
dumpwids
=
1
ix
=
Indexer
(
datafs
,
writable
=
update
or
bulk
,
trans
=
trans
,
pack
=
pack
)
if
dumpfreqs
:
ix
.
dumpfreqs
()
if
dumpwords
:
ix
.
dumpwords
()
if
dumpwids
:
ix
.
dumpwids
()
if
dumpwords
or
dumpwids
or
dumpfreqs
:
return
if
bulk
:
if
optimize
:
ix
.
optimize
(
args
)
ix
.
bulkupdate
(
args
)
elif
update
:
ix
.
update
(
args
)
elif
args
:
for
i
in
range
(
len
(
args
)):
a
=
args
[
i
]
if
" "
in
a
:
if
a
[
0
]
==
"-"
:
args
[
i
]
=
'-"'
+
a
[
1
:]
+
'"'
else
:
args
[
i
]
=
'"'
+
a
+
'"'
ix
.
query
(
" "
.
join
(
args
),
nbest
,
maxlines
)
else
:
ix
.
interact
(
nbest
)
if
pack
:
ix
.
pack
()
class
Indexer
:
filestorage
=
database
=
connection
=
root
=
None
def
__init__
(
self
,
datafs
,
writable
=
0
,
trans
=
0
,
pack
=
0
):
self
.
trans_limit
=
trans
self
.
pack_limit
=
pack
self
.
trans_count
=
0
self
.
pack_count
=
0
self
.
stopdict
=
get_stopdict
()
self
.
mh
=
mhlib
.
MH
()
self
.
filestorage
=
FileStorage
(
datafs
,
read_only
=
(
not
writable
))
self
.
database
=
DB
(
self
.
filestorage
)
self
.
connection
=
self
.
database
.
open
()
self
.
root
=
self
.
connection
.
root
()
try
:
self
.
index
=
self
.
root
[
"index"
]
except
KeyError
:
self
.
index
=
self
.
root
[
"index"
]
=
TextIndex
()
try
:
self
.
docpaths
=
self
.
root
[
"docpaths"
]
except
KeyError
:
self
.
docpaths
=
self
.
root
[
"docpaths"
]
=
IOBTree
()
try
:
self
.
doctimes
=
self
.
root
[
"doctimes"
]
except
KeyError
:
self
.
doctimes
=
self
.
root
[
"doctimes"
]
=
IIBTree
()
try
:
self
.
watchfolders
=
self
.
root
[
"watchfolders"
]
except
KeyError
:
self
.
watchfolders
=
self
.
root
[
"watchfolders"
]
=
{}
self
.
path2docid
=
OIBTree
()
for
docid
in
self
.
docpaths
.
keys
():
path
=
self
.
docpaths
[
docid
]
self
.
path2docid
[
path
]
=
docid
try
:
self
.
maxdocid
=
max
(
self
.
docpaths
.
keys
())
except
ValueError
:
self
.
maxdocid
=
0
print
len
(
self
.
docpaths
),
"Document ids"
print
len
(
self
.
path2docid
),
"Pathnames"
print
self
.
index
.
lexicon
.
length
(),
"Words"
def
dumpfreqs
(
self
):
lexicon
=
self
.
index
.
lexicon
index
=
self
.
index
.
index
assert
isinstance
(
index
,
OkapiIndex
)
L
=
[]
for
wid
in
lexicon
.
wids
():
freq
=
0
for
f
in
index
.
_wordinfo
.
get
(
wid
,
{}).
values
():
freq
+=
f
L
.
append
((
freq
,
wid
,
lexicon
.
get_word
(
wid
)))
L
.
sort
()
L
.
reverse
()
for
freq
,
wid
,
word
in
L
:
print
"%10d %10d %s"
%
(
wid
,
freq
,
word
)
def
dumpwids
(
self
):
lexicon
=
self
.
index
.
lexicon
index
=
self
.
index
.
index
assert
isinstance
(
index
,
OkapiIndex
)
for
wid
in
lexicon
.
wids
():
freq
=
0
for
f
in
index
.
_wordinfo
.
get
(
wid
,
{}).
values
():
freq
+=
f
print
"%10d %10d %s"
%
(
wid
,
freq
,
lexicon
.
get_word
(
wid
))
def
dumpwords
(
self
):
lexicon
=
self
.
index
.
lexicon
index
=
self
.
index
.
index
assert
isinstance
(
index
,
OkapiIndex
)
for
word
in
lexicon
.
words
():
wid
=
lexicon
.
get_wid
(
word
)
freq
=
0
for
f
in
index
.
_wordinfo
.
get
(
wid
,
{}).
values
():
freq
+=
f
print
"%10d %10d %s"
%
(
wid
,
freq
,
word
)
def
close
(
self
):
self
.
root
=
None
if
self
.
connection
is
not
None
:
self
.
connection
.
close
()
self
.
connection
=
None
if
self
.
database
is
not
None
:
self
.
database
.
close
()
self
.
database
=
None
if
self
.
filestorage
is
not
None
:
self
.
filestorage
.
close
()
self
.
filestorage
=
None
def
interact
(
self
,
nbest
=
NBEST
,
maxlines
=
MAXLINES
):
try
:
import
readline
except
ImportError
:
pass
text
=
""
top
=
0
results
=
[]
while
1
:
try
:
line
=
raw_input
(
"Query: "
)
except
EOFError
:
print
"
\
n
Bye."
break
line
=
line
.
strip
()
if
line
.
startswith
(
"/"
):
self
.
specialcommand
(
line
,
results
,
top
-
nbest
)
continue
if
line
:
text
=
line
top
=
0
else
:
if
not
text
:
continue
try
:
results
,
n
=
self
.
timequery
(
text
,
top
+
nbest
)
except
KeyboardInterrupt
:
raise
except
:
reportexc
()
text
=
""
continue
if
len
(
results
)
<=
top
:
if
not
n
:
print
"No hits for %r."
%
text
else
:
print
"No more hits for %r."
%
text
text
=
""
continue
print
"[Results %d-%d from %d"
%
(
top
+
1
,
min
(
n
,
top
+
nbest
),
n
),
print
"for query %s]"
%
repr
(
text
)
self
.
formatresults
(
text
,
results
,
maxlines
,
top
,
top
+
nbest
)
top
+=
nbest
def
specialcommand
(
self
,
line
,
results
,
first
):
assert
line
.
startswith
(
"/"
)
line
=
line
[
1
:]
if
not
line
:
n
=
first
else
:
try
:
n
=
int
(
line
)
-
1
except
:
print
"Huh?"
return
if
n
<
0
or
n
>=
len
(
results
):
print
"Out of range"
return
docid
,
score
=
results
[
n
]
path
=
self
.
docpaths
[
docid
]
i
=
path
.
rfind
(
"/"
)
assert
i
>
0
folder
=
path
[:
i
]
n
=
path
[
i
+
1
:]
cmd
=
"show +%s %s"
%
(
folder
,
n
)
if
os
.
getenv
(
"DISPLAY"
):
os
.
system
(
"xterm -e sh -c '%s | less' &"
%
cmd
)
else
:
os
.
system
(
cmd
)
def
query
(
self
,
text
,
nbest
=
NBEST
,
maxlines
=
MAXLINES
):
results
,
n
=
self
.
timequery
(
text
,
nbest
)
if
not
n
:
print
"No hits for %r."
%
text
return
print
"[Results 1-%d from %d]"
%
(
len
(
results
),
n
)
self
.
formatresults
(
text
,
results
,
maxlines
)
def
timequery
(
self
,
text
,
nbest
):
t0
=
time
.
time
()
c0
=
time
.
clock
()
results
,
n
=
self
.
index
.
query
(
text
,
nbest
)
t1
=
time
.
time
()
c1
=
time
.
clock
()
print
"[Query time: %.3f real, %.3f user]"
%
(
t1
-
t0
,
c1
-
c0
)
return
results
,
n
def
formatresults
(
self
,
text
,
results
,
maxlines
=
MAXLINES
,
lo
=
0
,
hi
=
sys
.
maxint
):
stop
=
self
.
stopdict
.
has_key
words
=
[
w
for
w
in
re
.
findall
(
r"\
w+
\*?"
,
text
.
lower
())
if
not
stop
(
w
)]
pattern
=
r"\b("
+
"|"
.
join
(
words
)
+
r")\b"
pattern
=
pattern
.
replace
(
"*"
,
".*"
)
# glob -> re syntax
prog
=
re
.
compile
(
pattern
,
re
.
IGNORECASE
)
print
'='
*
70
rank
=
lo
qw
=
self
.
index
.
query_weight
(
text
)
for
docid
,
score
in
results
[
lo
:
hi
]:
rank
+=
1
path
=
self
.
docpaths
[
docid
]
score
=
100.0
*
score
/
qw
print
"Rank: %d Score: %d%% File: %s"
%
(
rank
,
score
,
path
)
path
=
os
.
path
.
join
(
self
.
mh
.
getpath
(),
path
)
try
:
fp
=
open
(
path
)
except
(
IOError
,
OSError
),
msg
:
print
"Can't open:"
,
msg
continue
msg
=
mhlib
.
Message
(
"<folder>"
,
0
,
fp
)
for
header
in
"From"
,
"To"
,
"Cc"
,
"Bcc"
,
"Subject"
,
"Date"
:
h
=
msg
.
getheader
(
header
)
if
h
:
print
"%-8s %s"
%
(
header
+
":"
,
h
)
text
=
self
.
getmessagetext
(
msg
)
if
text
:
print
nleft
=
maxlines
for
part
in
text
:
for
line
in
part
.
splitlines
():
if
prog
.
search
(
line
):
print
line
nleft
-=
1
if
nleft
<=
0
:
break
if
nleft
<=
0
:
break
print
'-'
*
70
def
update
(
self
,
args
):
folder
=
None
seqs
=
[]
for
arg
in
args
:
if
arg
.
startswith
(
"+"
):
if
folder
is
None
:
folder
=
arg
[
1
:]
else
:
print
"only one folder at a time"
return
else
:
seqs
.
append
(
arg
)
if
not
folder
:
folder
=
self
.
mh
.
getcontext
()
if
not
seqs
:
seqs
=
[
'all'
]
try
:
f
=
self
.
mh
.
openfolder
(
folder
)
except
mhlib
.
Error
,
msg
:
print
msg
return
dict
=
{}
for
seq
in
seqs
:
try
:
nums
=
f
.
parsesequence
(
seq
)
except
mhlib
.
Error
,
msg
:
print
msg
or
"unparsable message sequence: %s"
%
`seq`
return
for
n
in
nums
:
dict
[
n
]
=
n
msgs
=
dict
.
keys
()
msgs
.
sort
()
self
.
updatefolder
(
f
,
msgs
)
self
.
commit
()
def
optimize
(
self
,
args
):
uniqwords
=
{}
for
folder
in
args
:
if
folder
.
startswith
(
"+"
):
folder
=
folder
[
1
:]
print
"
\
n
OPTIMIZE FOLDER"
,
folder
try
:
f
=
self
.
mh
.
openfolder
(
folder
)
except
mhlib
.
Error
,
msg
:
print
msg
continue
self
.
prescan
(
f
,
f
.
listmessages
(),
uniqwords
)
L
=
[(
uniqwords
[
word
],
word
)
for
word
in
uniqwords
.
keys
()]
L
.
sort
()
L
.
reverse
()
for
i
in
range
(
100
):
print
"%3d. %6d %s"
%
((
i
+
1
,)
+
L
[
i
])
self
.
index
.
lexicon
.
sourceToWordIds
([
word
for
(
count
,
word
)
in
L
])
def
prescan
(
self
,
f
,
msgs
,
uniqwords
):
pipeline
=
[
Splitter
(),
CaseNormalizer
(),
StopWordRemover
()]
for
n
in
msgs
:
print
"prescanning"
,
n
m
=
f
.
openmessage
(
n
)
text
=
self
.
getmessagetext
(
m
,
f
.
name
)
for
p
in
pipeline
:
text
=
p
.
process
(
text
)
for
word
in
text
:
uniqwords
[
word
]
=
uniqwords
.
get
(
word
,
0
)
+
1
def
bulkupdate
(
self
,
args
):
if
not
args
:
print
"No folders specified; use ALL to bulk-index all folders"
return
if
"ALL"
in
args
:
i
=
args
.
index
(
"ALL"
)
args
[
i
:
i
+
1
]
=
self
.
mh
.
listfolders
()
for
folder
in
args
:
if
folder
.
startswith
(
"+"
):
folder
=
folder
[
1
:]
print
"
\
n
FOLDER"
,
folder
try
:
f
=
self
.
mh
.
openfolder
(
folder
)
except
mhlib
.
Error
,
msg
:
print
msg
continue
self
.
updatefolder
(
f
,
f
.
listmessages
())
print
"Total"
,
len
(
self
.
docpaths
)
self
.
commit
()
print
len
(
self
.
index
.
lexicon
.
_words
),
"unique words."
def
updatefolder
(
self
,
f
,
msgs
):
self
.
watchfolders
[
f
.
name
]
=
self
.
getmtime
(
f
.
name
)
for
n
in
msgs
:
path
=
"%s/%s"
%
(
f
.
name
,
n
)
docid
=
self
.
path2docid
.
get
(
path
,
0
)
if
docid
and
self
.
getmtime
(
path
)
==
self
.
doctimes
.
get
(
docid
,
0
):
print
"unchanged"
,
docid
,
path
continue
docid
=
self
.
newdocid
(
path
)
try
:
m
=
f
.
openmessage
(
n
)
except
IOError
:
print
"disappeared"
,
docid
,
path
self
.
unindexpath
(
path
)
continue
text
=
self
.
getmessagetext
(
m
,
f
.
name
)
if
not
text
:
self
.
unindexpath
(
path
)
continue
print
"indexing"
,
docid
,
path
self
.
index
.
index_text
(
docid
,
text
)
self
.
maycommit
()
# Remove messages from the folder that no longer exist
for
path
in
list
(
self
.
path2docid
.
keys
(
f
.
name
)):
if
not
path
.
startswith
(
f
.
name
+
"/"
):
break
if
self
.
getmtime
(
path
)
==
0
:
self
.
unindexpath
(
path
)
print
"done."
def
unindexpath
(
self
,
path
):
if
self
.
path2docid
.
has_key
(
path
):
docid
=
self
.
path2docid
[
path
]
print
"unindexing"
,
docid
,
path
del
self
.
docpaths
[
docid
]
del
self
.
doctimes
[
docid
]
del
self
.
path2docid
[
path
]
try
:
self
.
index
.
unindex
(
docid
)
except
KeyError
,
msg
:
print
"KeyError"
,
msg
self
.
maycommit
()
def
getmessagetext
(
self
,
m
,
name
=
None
):
L
=
[]
if
name
:
L
.
append
(
"_folder "
+
name
)
# To restrict search to a folder
self
.
getheaders
(
m
,
L
)
try
:
self
.
getmsgparts
(
m
,
L
,
0
)
except
KeyboardInterrupt
:
raise
except
:
print
"(getmsgparts failed:)"
reportexc
()
return
L
def
getmsgparts
(
self
,
m
,
L
,
level
):
ctype
=
m
.
gettype
()
if
level
or
ctype
!=
"text/plain"
:
print
". "
*
level
+
str
(
ctype
)
if
ctype
==
"text/plain"
:
L
.
append
(
m
.
getbodytext
())
elif
ctype
in
(
"multipart/alternative"
,
"multipart/mixed"
):
for
part
in
m
.
getbodyparts
():
self
.
getmsgparts
(
part
,
L
,
level
+
1
)
elif
ctype
==
"message/rfc822"
:
f
=
StringIO
(
m
.
getbodytext
())
m
=
mhlib
.
Message
(
"<folder>"
,
0
,
f
)
self
.
getheaders
(
m
,
L
)
self
.
getmsgparts
(
m
,
L
,
level
+
1
)
def
getheaders
(
self
,
m
,
L
):
H
=
[]
for
key
in
"from"
,
"to"
,
"cc"
,
"bcc"
,
"subject"
:
value
=
m
.
get
(
key
)
if
value
:
H
.
append
(
value
)
if
H
:
L
.
append
(
"
\
n
"
.
join
(
H
))
def
newdocid
(
self
,
path
):
docid
=
self
.
path2docid
.
get
(
path
)
if
docid
is
not
None
:
self
.
doctimes
[
docid
]
=
self
.
getmtime
(
path
)
return
docid
docid
=
self
.
maxdocid
+
1
self
.
maxdocid
=
docid
self
.
docpaths
[
docid
]
=
path
self
.
doctimes
[
docid
]
=
self
.
getmtime
(
path
)
self
.
path2docid
[
path
]
=
docid
return
docid
def
getmtime
(
self
,
path
):
path
=
os
.
path
.
join
(
self
.
mh
.
getpath
(),
path
)
try
:
st
=
os
.
stat
(
path
)
except
os
.
error
,
msg
:
return
0
return
int
(
st
[
ST_MTIME
])
def
maycommit
(
self
):
self
.
trans_count
+=
1
if
self
.
trans_count
>=
self
.
trans_limit
>
0
:
self
.
commit
()
def
commit
(
self
):
if
self
.
trans_count
>
0
:
print
"committing..."
transaction
.
commit
()
self
.
trans_count
=
0
self
.
pack_count
+=
1
if
self
.
pack_count
>=
self
.
pack_limit
>
0
:
self
.
pack
()
def
pack
(
self
):
if
self
.
pack_count
>
0
:
print
"packing..."
self
.
database
.
pack
()
self
.
pack_count
=
0
class
TextIndex
(
Persistent
):
def
__init__
(
self
):
self
.
lexicon
=
Lexicon
(
Splitter
(),
CaseNormalizer
(),
StopWordRemover
())
self
.
index
=
OkapiIndex
(
self
.
lexicon
)
def
index_text
(
self
,
docid
,
text
):
self
.
index
.
index_doc
(
docid
,
text
)
self
.
_p_changed
=
1
# XXX
def
unindex
(
self
,
docid
):
self
.
index
.
unindex_doc
(
docid
)
self
.
_p_changed
=
1
# XXX
def
query
(
self
,
query
,
nbest
=
10
):
# returns a total hit count and a mapping from docids to scores
parser
=
QueryParser
(
self
.
lexicon
)
tree
=
parser
.
parseQuery
(
query
)
results
=
tree
.
executeQuery
(
self
.
index
)
if
results
is
None
:
return
[],
0
chooser
=
NBest
(
nbest
)
chooser
.
addmany
(
results
.
items
())
return
chooser
.
getbest
(),
len
(
results
)
def
query_weight
(
self
,
query
):
parser
=
QueryParser
(
self
.
lexicon
)
tree
=
parser
.
parseQuery
(
query
)
terms
=
tree
.
terms
()
return
self
.
index
.
query_weight
(
terms
)
def
reportexc
():
traceback
.
print_exc
()
if
__name__
==
"__main__"
:
sys
.
exit
(
main
())
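A hedged session, using only options documented in the module docstring above (the folder name is made up):

mhindex.py -d ~/.Data.fs -b ALL        # bulk-index every folder
mhindex.py -d ~/.Data.fs -u +inbox     # update just +inbox
mhindex.py -d ~/.Data.fs               # interactive query mode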
src/Products/ZCTextIndex/tests/python.txt deleted 100644 → 0
Search results for python.org
query: "nested recursive functions"
Ultraseek
83% http://www.python.org/dev/doc/maint22/whatsnew/node9.html
43% http://python.sourceforge.net/peps/pep-0227.txt
37% http://www.python.org/dev/doc/maint22/lib/module-pprint.html
37% http://www.python.org/doc/1.5.2p1/lib/module-pprint.html
37% http://www.python.org/doc/2.0.1/lib/module-pprint.html
37% http://www.python.org/doc/1.5.2/lib/module-pprint.html
37% http://www.python.org/doc/1.6/lib/module-pprint.html
37% http://www.python.org/doc/1.5.1/lib/module-pprint.html
37% http://www.python.org/doc/1.5/lib/node54.html
35% http://www.python.org/workshops/2000-01/proceedings/papers/tismers/spcpaper.htm
Google
www.python.org/peps/pep-0227.html
www.python.org/dev/doc/maint22/whatsnew/node9.html
www.python.org/cgi-bin/faqw.py?req=recent&days=28
www.python.org/peps/pep-0255.html
www.python.org/doc/current/lib/node558.html
www.python.org/doc/1.5.2/lib/node52.html
www.python.org/workshops/2000-01/proceedings/papers/tismers/spcpaper.pdf
www.python.org/2.0/
www.python.org/2.0.1/NEWS.txt
www.python.org/peps/pep-0266.html
query: "explicit better than implicit"
Ultraseek:
http://www.python.org/dev/doc/maint22/lib/differ-examples.html
http://www.python.org/doc/essays/ppt/python10/py10keynote.ppt
http://www.python.org/dev/culture.html
http://www.python.org/doc/Humor.html
http://www.python.org/dev/doc/maint22/ref/implicit-joining.html
http://www.python.org/dev/doc/maint22/ref/explicit-joining.html
http://www.python.org/workshops/2000-01/proceedings/papers/tigges-wyvill/tigges-wyvill.html
http://www.python.org/peps/pep-0285.txt
http://www.python.org/peps/pep-0285.html
Google:
www.python.org/doc/current/lib/differ-examples.html
www.python.org/doc/essays/ppt/python10/py10keynote.pdf
www.python.org/dev/culture.html
www.python.org/doc/essays/ppt/python10/py10keynote.ppt
www.python.org/peps/pep-0285.html
www.python.org/peps/pep-0287.html
www.python.org/peps/pep-0287.txt
www.python.org/peps/pep-0209.html
www.python.org/~guido/Proposal.txt
www.python.org/~guido/Proposal.doc
query: "build hpux"
Ultraseek:
51% http://www.python.org/1.5/patches-1.5.1/configure.2.txt
47% http://www.python.org/dev/doc/devel/whatsnew/node5.html
43% http://www.python.org/1.5/patches-1.5.1/
43% http://www.python.org/2.0/
41% http://www.python.org/peps/pep-0243.html
41% http://www.python.org/ftp/python/binaries-1.3/python-HP-UX-A.09.05-full.README
39% http://www.python.org/ftp/python/binaries-1.3/python-hppa1.1-hp-hpux10.10.README
39% http://www.python.org/ftp/python/binaries-1.3/python-hppa1.1-hp-hpux10.10.README
35% http://www.python.org/peps/pep-0243.txt
35% http://www.python.org/2.0.1/NEWS.txt
35% http://python.sourceforge.net/peps/pep-0243.txt
Google:
www.python.org/2.1.1/NEWS.txt
www.python.org/1.5/NEWS-152b2.txt
query: "cannot create 'method-wrapper' instances"
Ultraseek
http://python.sourceforge.net/peps/pep-0007.txt
http://www.python.org/workshops/1994-11/C++Python.txt
http://www.python.org/peps/pep-0231.txt
http://www.python.org/peps/pep-0231.html
http://python.sourceforge.net/peps/pep-0231.txt
http://www.python.org/dev/doc/maint22/lib/node383.html
http://www.python.org/workshops/1994-11/BuiltInClasses/BuiltInClasses_7.html
http://www.python.org/workshops/1994-11/persistency.html
http://www.python.org/dev/doc/maint22/lib/organizing-tests.html
http://www.python.org/dev/doc/maint22/lib/module-SocketServer.html
Google:
no matches
query: "extension module C++"
http://www.python.org/dev/doc/devel/ext/building.html
http://www.python.org/dev/doc/maint22/ext/module-defn-options.html
http://www.python.org/dev/doc/maint21/ext/building-on-unix.html
http://www.python.org/doc/1.6/ext/building-on-unix.html
http://www.python.org/sigs/c++-sig/
http://www.python.org/dev/doc/maint22/ext/intro.html
http://www.python.org/dev/doc/maint22/ext/cplusplus.html
http://www.python.org/doc/1.4/ext/node18.html
http://www.python.org/doc/1.6/ext/building-on-windows.html
http://www.python.org/doc/1.6/dist/node12.html
Google:
www.python.org/doc/current/ext/building-on-unix.html
www.python.org/doc/current/ext/intro.html
www.python.org/doc/current/ext/ext.html
www.python.org/sigs/c++-sig/
www.python.org/doc/current/ext/module-defn-options.html
www.python.org/doc/1.5.2p2/ext/building-on-unix.html
www.python.org/doc/1.5.2p2/ext/contents.html
www.python.org/doc/1.5.2p2/ext/ext.html
www.python.org/doc/2.1.2/ext/building-on-unix.html
www.python.org/doc/1.5.1/ext/intro.html
src/Products/ZCTextIndex/tests/queryhtml.py deleted 100644 → 0
# XXX: Products.PluginIndexes.TextIndex no longer exists

import os
from time import clock

import ZODB
from ZODB.FileStorage import FileStorage

QUERIES = ["nested recursive functions",
           "explicit better than implicit",
           "build hpux",
           "cannot create 'method-wrapper' instances",
           "extension module C++",
           "class method",
           "instance variable",
           "articulate information",
           "import default files",
           "gopher ftp http",
           "documentation",
           ]

def path2url(p):
    # convert the paths to a python.org URL
    # hack: only works for the way Jeremy indexed his copy of python.org
    marker = "www.python.org/."
    i = p.find(marker)
    if i == -1:
        return p
    i += len(marker)
    return "http://www.python.org" + p[i:]

#from Products.PluginIndexes.TextIndex.TextIndex import And, Or
from Products.ZCTextIndex.tests.indexhtml import MySplitter
from Products.ZCTextIndex.NBest import NBest

def main(rt):
    index = rt["index"]
    files = rt["files"]
    times = {}
    ITERS = range(50)
    for i in range(11):
        for q in QUERIES:
            terms = q.split()
            for c in " OR ", " AND ":
                query = c.join(terms)
                t0 = clock()
                if TEXTINDEX:
                    if c == " OR ":
                        op = Or
                    else:
                        op = And
                    _q = " ".join(terms)
                    for _ in ITERS:
                        b = index.query(_q, op).bucket()
                        num = len(b)
                        chooser = NBest(10)
                        chooser.addmany(b.items())
                        results = chooser.getbest()
                else:
                    try:
                        for _ in ITERS:
                            results, num = index.query(query)
                    except:
                        continue
                t1 = clock()
                print "<p>Query: \"%s\"" % query
                print "<br>Num results: %d" % num
                print "<br>time.clock(): %s" % (t1 - t0)
                key = query
                if i == 0:
                    print "<ol>"
                    for docid, score in results:
                        url = path2url(files[docid])
                        fmt = '<li><a href="%s">%s</A> score = %s'
                        print fmt % (url, url, score)
                    print "</ol>"
                    continue
                l = times.setdefault(key, [])
                l.append(t1 - t0)

    l = times.keys()
    l.sort()
    print "<hr>"
    for k in l:
        v = times[k]
        print "<p>Query: \"%s\"" % k
        print "<br>Min time: %s" % min(v)
        print "<br>All times: %s" % " ".join(map(str, v))

if __name__ == "__main__":
    import sys
    import getopt

    VERBOSE = 0
    FSPATH = "Data.fs"
    TEXTINDEX = 0
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'vf:T')
    except getopt.error, msg:
        print msg
        print __doc__
        sys.exit(2)
    for o, v in opts:
        if o == '-v':
            VERBOSE += 1
        if o == '-f':
            FSPATH = v
#        if o == '-T':
#            TEXTINDEX = 1
    fs = FileStorage(FSPATH, read_only=1)
    db = ZODB.DB(fs, cache_size=10000)
    cn = db.open()
    rt = cn.root()
    main(rt)
src/Products/ZCTextIndex/tests/testHTMLSplitter.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2009 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Test zope.index.text.htmlsplitter
"""
import unittest

class HTMLWordSplitterTests(unittest.TestCase):

    # Subclasses must define '_getBTreesFamily'
    def _getTargetClass(self):
        from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
        return HTMLWordSplitter

    def _makeOne(self):
        return self._getTargetClass()()

    def test_class_conforms_to_ISplitter(self):
        from zope.interface.verify import verifyClass
        from Products.ZCTextIndex.interfaces import ISplitter
        verifyClass(ISplitter, self._getTargetClass())

    def test_instance_conforms_to_ISplitter(self):
        from zope.interface.verify import verifyObject
        from Products.ZCTextIndex.interfaces import ISplitter
        verifyObject(ISplitter, self._makeOne())

    def test_process_empty_string(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.process(['']), [])

    def test_process_no_markup(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.process(['abc def']), ['abc', 'def'])

    def test_process_w_markup(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.process(['<h1>abc</h1> <p>def</p>']),
                         ['abc', 'def'])

    def test_process_no_markup_w_glob(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.process(['abc?def hij*klm nop* qrs?']),
                         ['abc', 'def', 'hij', 'klm', 'nop', 'qrs'])

    def test_processGlob_empty_string(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.processGlob(['']), [])

    def test_processGlob_no_markup_no_glob(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.processGlob(['abc def']), ['abc', 'def'])

    def test_processGlob_w_markup_no_glob(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.processGlob(['<h1>abc</h1> '
                                               '<p>def</p>']),
                         ['abc', 'def'])

    def test_processGlob_no_markup_w_glob(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.processGlob(['abc?def hij*klm nop* qrs?']),
                         ['abc?def', 'hij*klm', 'nop*', 'qrs?'])

def test_suite():
    return unittest.TestSuite((
        unittest.makeSuite(HTMLWordSplitterTests),
    ))
src/Products/ZCTextIndex/tests/testIndex.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
import
os
from
unittest
import
TestCase
,
TestSuite
,
main
,
makeSuite
import
transaction
from
BTrees.Length
import
Length
from
Products.ZCTextIndex.Lexicon
import
Lexicon
,
Splitter
from
Products.ZCTextIndex.CosineIndex
import
CosineIndex
from
Products.ZCTextIndex.OkapiIndex
import
OkapiIndex
# Subclasses must set a class variable IndexFactory to the appropriate
# index object constructor.
class
IndexTest
(
TestCase
):
def
setUp
(
self
):
self
.
lexicon
=
Lexicon
(
Splitter
())
self
.
index
=
self
.
IndexFactory
(
self
.
lexicon
)
def
test_index_document
(
self
,
DOCID
=
1
):
doc
=
"simple document contains five words"
self
.
assert_
(
not
self
.
index
.
has_doc
(
DOCID
))
self
.
index
.
index_doc
(
DOCID
,
doc
)
self
.
assert_
(
self
.
index
.
has_doc
(
DOCID
))
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
])
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
self
.
index
.
document_count
())
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
5
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
5
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
self
.
index
.
length
())
for
map
in
self
.
index
.
_wordinfo
.
values
():
self
.
assertEqual
(
len
(
map
),
1
)
self
.
assert_
(
map
.
has_key
(
DOCID
))
def
test_unindex_document
(
self
):
DOCID
=
1
self
.
test_index_document
(
DOCID
)
self
.
index
.
unindex_doc
(
DOCID
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
0
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
self
.
index
.
document_count
())
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
0
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
0
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
self
.
index
.
length
())
def
test_index_two_documents
(
self
):
self
.
test_index_document
()
doc
=
"another document just four"
DOCID
=
2
self
.
index
.
index_doc
(
DOCID
,
doc
)
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
])
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
2
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
self
.
index
.
document_count
())
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
8
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
2
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
4
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
self
.
index
.
length
())
wids
=
self
.
lexicon
.
termToWordIds
(
"document"
)
self
.
assertEqual
(
len
(
wids
),
1
)
document_wid
=
wids
[
0
]
for
wid
,
map
in
self
.
index
.
_wordinfo
.
items
():
if
wid
==
document_wid
:
self
.
assertEqual
(
len
(
map
),
2
)
self
.
assert_
(
map
.
has_key
(
1
))
self
.
assert_
(
map
.
has_key
(
DOCID
))
else
:
self
.
assertEqual
(
len
(
map
),
1
)
def
test_index_two_unindex_one
(
self
):
# index two documents, unindex one, and test the results
self
.
test_index_two_documents
()
self
.
index
.
unindex_doc
(
1
)
DOCID
=
2
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
self
.
index
.
document_count
())
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
])
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
4
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
4
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
self
.
index
.
length
())
for
map
in
self
.
index
.
_wordinfo
.
values
():
self
.
assertEqual
(
len
(
map
),
1
)
self
.
assert_
(
map
.
has_key
(
DOCID
))
def
test_index_duplicated_words
(
self
,
DOCID
=
1
):
doc
=
"very simple repeat repeat repeat document test"
self
.
index
.
index_doc
(
DOCID
,
doc
)
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
])
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
5
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
7
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
self
.
index
.
length
())
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
self
.
index
.
document_count
())
wids
=
self
.
lexicon
.
termToWordIds
(
"repeat"
)
self
.
assertEqual
(
len
(
wids
),
1
)
repititive_wid
=
wids
[
0
]
for
wid
,
map
in
self
.
index
.
_wordinfo
.
items
():
self
.
assertEqual
(
len
(
map
),
1
)
self
.
assert_
(
map
.
has_key
(
DOCID
))
def
test_simple_query_oneresult
(
self
):
self
.
index
.
index_doc
(
1
,
'not the same document'
)
results
=
self
.
index
.
search
(
"document"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[
1
])
def
test_simple_query_noresults
(
self
):
self
.
index
.
index_doc
(
1
,
'not the same document'
)
results
=
self
.
index
.
search
(
"frobnicate"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[])
def
test_query_oneresult
(
self
):
self
.
index
.
index_doc
(
1
,
'not the same document'
)
self
.
index
.
index_doc
(
2
,
'something about something else'
)
results
=
self
.
index
.
search
(
"document"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[
1
])
def
test_search_phrase
(
self
):
self
.
index
.
index_doc
(
1
,
"the quick brown fox jumps over the lazy dog"
)
self
.
index
.
index_doc
(
2
,
"the quick fox jumps lazy over the brown dog"
)
results
=
self
.
index
.
search_phrase
(
"quick brown fox"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[
1
])
def
test_search_glob
(
self
):
self
.
index
.
index_doc
(
1
,
"how now brown cow"
)
self
.
index
.
index_doc
(
2
,
"hough nough browne cough"
)
self
.
index
.
index_doc
(
3
,
"bar brawl"
)
results
=
self
.
index
.
search_glob
(
"bro*"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[
1
,
2
])
results
=
self
.
index
.
search_glob
(
"b*"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[
1
,
2
,
3
])
class
CosineIndexTest
(
IndexTest
):
IndexFactory
=
CosineIndex
class
OkapiIndexTest
(
IndexTest
):
IndexFactory
=
OkapiIndex
class
TestIndexConflict
(
TestCase
):
db
=
None
def
tearDown
(
self
):
if
self
.
db
is
not
None
:
self
.
db
.
close
()
self
.
storage
.
cleanup
()
def
openDB
(
self
):
from
ZODB.FileStorage
import
FileStorage
        from ZODB.DB import DB
        n = 'fs_tmp__%s' % os.getpid()
        self.storage = FileStorage(n)
        self.db = DB(self.storage)

    def test_index_doc_conflict(self):
        self.index = OkapiIndex(Lexicon())
        self.openDB()
        r1 = self.db.open().root()
        r1['i'] = self.index
        transaction.commit()

        r2 = self.db.open().root()
        copy = r2['i']
        # Make sure the data is loaded
        list(copy._docweight.items())
        list(copy._docwords.items())
        list(copy._wordinfo.items())
        list(copy._lexicon._wids.items())
        list(copy._lexicon._words.items())

        self.assertEqual(self.index._p_serial, copy._p_serial)

        self.index.index_doc(0, 'The time has come')
        transaction.commit()

        copy.index_doc(1, 'That time has gone')
        transaction.commit()

    def test_reindex_doc_conflict(self):
        self.index = OkapiIndex(Lexicon())
        self.index.index_doc(0, 'Sometimes change is good')
        self.index.index_doc(1, 'Then again, who asked')
        self.openDB()
        r1 = self.db.open().root()
        r1['i'] = self.index
        transaction.commit()

        r2 = self.db.open().root()
        copy = r2['i']
        # Make sure the data is loaded
        list(copy._docweight.items())
        list(copy._docwords.items())
        list(copy._wordinfo.items())
        list(copy._lexicon._wids.items())
        list(copy._lexicon._words.items())

        self.assertEqual(self.index._p_serial, copy._p_serial)

        self.index.index_doc(0, 'Sometimes change isn\'t bad')
        transaction.commit()

        copy.index_doc(1, 'Then again, who asked you?')
        transaction.commit()


class TestUpgrade(TestCase):

    def test_query_before_totaldoclen_upgrade(self):
        self.index1 = OkapiIndex(Lexicon(Splitter()))
        self.index1.index_doc(0, 'The quiet of night')
        # Revert index1 back to a long to simulate an older index instance
        self.index1._totaldoclen = long(self.index1._totaldoclen())
        self.assertEqual(len(self.index1.search('night')), 1)

    def test_upgrade_totaldoclen(self):
        self.index1 = OkapiIndex(Lexicon())
        self.index2 = OkapiIndex(Lexicon())
        self.index1.index_doc(0, 'The quiet of night')
        self.index2.index_doc(0, 'The quiet of night')
        # Revert index1 back to a long to simulate an older index instance
        self.index1._totaldoclen = long(self.index1._totaldoclen())
        self.index1.index_doc(1, 'gazes upon my shadow')
        self.index2.index_doc(1, 'gazes upon my shadow')
        self.assertEqual(
            self.index1._totaldoclen(), self.index2._totaldoclen())
        self.index1._totaldoclen = long(self.index1._totaldoclen())
        self.index1.unindex_doc(0)
        self.index2.unindex_doc(0)
        self.assertEqual(
            self.index1._totaldoclen(), self.index2._totaldoclen())

    def test_query_before_document_count_upgrade(self):
        self.index1 = OkapiIndex(Lexicon(Splitter()))
        self.index1.index_doc(0, 'The quiet of night')
        # Revert index1 back to a long to simulate an older index instance
        del self.index1.document_count
        self.assertEqual(len(self.index1.search('night')), 1)

    def test_upgrade_document_count(self):
        self.index1 = OkapiIndex(Lexicon())
        self.index2 = OkapiIndex(Lexicon())
        self.index1.index_doc(0, 'The quiet of night')
        self.index2.index_doc(0, 'The quiet of night')
        # Revert index1 back to simulate an older index instance
        del self.index1.document_count
        self.index1.index_doc(1, 'gazes upon my shadow')
        self.index2.index_doc(1, 'gazes upon my shadow')
        self.assert_(self.index1.document_count.__class__ is Length)
        self.assertEqual(
            self.index1.document_count(), self.index2.document_count())
        del self.index1.document_count
        self.index1.unindex_doc(0)
        self.index2.unindex_doc(0)
        self.assert_(self.index1.document_count.__class__ is Length)
        self.assertEqual(
            self.index1.document_count(), self.index2.document_count())


def test_suite():
    return TestSuite((makeSuite(CosineIndexTest),
                      makeSuite(OkapiIndexTest),
                      makeSuite(TestIndexConflict),
                      makeSuite(TestUpgrade),
                     ))

if __name__ == '__main__':
    main(defaultTest='test_suite')
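The conflict tests above use the standard two-connection ZODB pattern: persist an object, load an independent copy through a second connection, force both copies' state to load, then commit a change from each side so the second commit must either resolve the conflict or raise. A minimal, self-contained sketch of that pattern, using MappingStorage and PersistentMapping as hypothetical stand-ins (unlike the BTree internals of OkapiIndex, PersistentMapping has no conflict resolution, so here the second commit fails):

import transaction
from ZODB import DB
from ZODB.MappingStorage import MappingStorage
from ZODB.POSException import ConflictError
from persistent.mapping import PersistentMapping

db = DB(MappingStorage())
tm1 = transaction.TransactionManager()
tm2 = transaction.TransactionManager()

c1 = db.open(transaction_manager=tm1)
c1.root()['obj'] = PersistentMapping()
tm1.commit()

c2 = db.open(transaction_manager=tm2)   # independent view of the same object
obj1 = c1.root()['obj']
obj2 = c2.root()['obj']

obj1['a'] = 1
obj2['b'] = 2
tm1.commit()                 # first writer wins
try:
    tm2.commit()             # stale write; no _p_resolveConflict to save it
except ConflictError:
    tm2.abort()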
src/Products/ZCTextIndex/tests/testLexicon.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon unit tests.

$Id$
"""

import unittest
import os, sys

import ZODB
import transaction

from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer


class StupidPipelineElement:
    def __init__(self, fromword, toword):
        self.__fromword = fromword
        self.__toword = toword

    def process(self, seq):
        res = []
        for term in seq:
            if term == self.__fromword:
                res.append(self.__toword)
            else:
                res.append(term)
        return res


class WackyReversePipelineElement:
    def __init__(self, revword):
        self.__revword = revword

    def process(self, seq):
        res = []
        for term in seq:
            if term == self.__revword:
                x = list(term)
                x.reverse()
                res.append(''.join(x))
            else:
                res.append(term)
        return res


class StopWordPipelineElement:
    def __init__(self, stopdict={}):
        self.__stopdict = stopdict

    def process(self, seq):
        res = []
        for term in seq:
            if self.__stopdict.get(term):
                continue
            else:
                res.append(term)
        return res


class Test(unittest.TestCase):

    def test_z3interfaces(self):
        from Products.ZCTextIndex.interfaces import ILexicon
        from zope.interface.verify import verifyClass

        verifyClass(ILexicon, Lexicon)

    def testSourceToWordIds(self):
        lexicon = Lexicon(Splitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        self.assertEqual(wids, [1, 2, 3])

    def testTermToWordIds(self):
        lexicon = Lexicon(Splitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('dogs')
        self.assertEqual(wids, [3])

    def testMissingTermToWordIds(self):
        lexicon = Lexicon(Splitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('boxes')
        self.assertEqual(wids, [0])

    def testTermToWordIdsWithProcess_post_glob(self):
        """This test is for added process_post_glob"""
        class AddedSplitter(Splitter):
            def process_post_glob(self, lst):
                assert lst == ['dogs']
                return ['dogs']
        lexicon = Lexicon(AddedSplitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('dogs')
        self.assertEqual(wids, [3])

    def testMissingTermToWordIdsWithProcess_post_glob(self):
        """This test is for added process_post_glob"""
        class AddedSplitter(Splitter):
            def process_post_glob(self, lst):
                assert lst == ['dogs']
                return ['fox']
        lexicon = Lexicon(AddedSplitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('dogs')
        self.assertEqual(wids, [0])

    def testOnePipelineElement(self):
        lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('fish')
        self.assertEqual(wids, [3])

    def testSplitterAdaptorFold(self):
        lexicon = Lexicon(Splitter(), CaseNormalizer())
        wids = lexicon.sourceToWordIds('CATS and dogs')
        wids = lexicon.termToWordIds('cats and dogs')
        self.assertEqual(wids, [1, 2, 3])

    def testSplitterAdaptorNofold(self):
        lexicon = Lexicon(Splitter())
        wids = lexicon.sourceToWordIds('CATS and dogs')
        wids = lexicon.termToWordIds('cats and dogs')
        self.assertEqual(wids, [0, 2, 3])

    def testTwoElementPipeline(self):
        lexicon = Lexicon(Splitter(),
                          StupidPipelineElement('cats', 'fish'),
                          WackyReversePipelineElement('fish'))
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('hsif')
        self.assertEqual(wids, [1])

    def testThreeElementPipeline(self):
        lexicon = Lexicon(Splitter(),
                          StopWordPipelineElement({'and': 1}),
                          StupidPipelineElement('dogs', 'fish'),
                          WackyReversePipelineElement('fish'))
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('hsif')
        self.assertEqual(wids, [2])

    def testSplitterLocaleAwareness(self):
        from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
        import locale
        loc = locale.setlocale(locale.LC_ALL)  # get current locale
        # set German locale
        try:
            if sys.platform != 'win32':
                locale.setlocale(locale.LC_ALL, 'de_DE.ISO8859-1')
            else:
                locale.setlocale(locale.LC_ALL, 'German_Germany.1252')
        except locale.Error:
            return  # This test doesn't work here :-(
        expected = ['m\xfclltonne', 'waschb\xe4r',
                    'beh\xf6rde', '\xfcberflieger']
        words = [" ".join(expected)]
        words = Splitter().process(words)
        self.assertEqual(words, expected)
        words = HTMLWordSplitter().process(words)
        self.assertEqual(words, expected)
        locale.setlocale(locale.LC_ALL, loc)  # restore saved locale

    def testUpgradeLength(self):
        from BTrees.Length import Length
        lexicon = Lexicon(Splitter())
        del lexicon.length  # Older instances don't override length
        lexicon.sourceToWordIds('how now brown cow')
        self.assert_(lexicon.length.__class__ is Length)


class TestLexiconConflict(unittest.TestCase):

    db = None

    def tearDown(self):
        if self.db is not None:
            self.db.close()
            self.storage.cleanup()

    def openDB(self):
        from ZODB.FileStorage import FileStorage
        from ZODB.DB import DB
        n = 'fs_tmp__%s' % os.getpid()
        self.storage = FileStorage(n)
        self.db = DB(self.storage)

    def testAddWordConflict(self):
        self.l = Lexicon(Splitter())
        self.openDB()
        r1 = self.db.open().root()
        r1['l'] = self.l
        transaction.commit()

        r2 = self.db.open().root()
        copy = r2['l']
        # Make sure the data is loaded
        list(copy._wids.items())
        list(copy._words.items())
        copy.length()

        self.assertEqual(self.l._p_serial, copy._p_serial)

        self.l.sourceToWordIds('mary had a little lamb')
        transaction.commit()

        copy.sourceToWordIds('whose fleece was')
        copy.sourceToWordIds('white as snow')
        transaction.commit()
        self.assertEqual(copy.length(), 11)
        self.assertEqual(copy.length(), len(copy._words))


def test_suite():
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(Test))
    suite.addTest(unittest.makeSuite(TestLexiconConflict))
    return suite

if __name__ == '__main__':
    unittest.main(defaultTest='test_suite')
src/Products/ZCTextIndex/tests/testNBest.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

from unittest import TestCase, TestSuite, main, makeSuite

from Products.ZCTextIndex.NBest import NBest


class NBestTest(TestCase):

    def testConstructor(self):
        self.assertRaises(ValueError, NBest, 0)
        self.assertRaises(ValueError, NBest, -1)

        for n in range(1, 11):
            nb = NBest(n)
            self.assertEqual(len(nb), 0)
            self.assertEqual(nb.capacity(), n)

    def testOne(self):
        nb = NBest(1)
        nb.add('a', 0)
        self.assertEqual(nb.getbest(), [('a', 0)])

        nb.add('b', 1)
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [('b', 1)])

        nb.add('c', -1)
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [('b', 1)])

        nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [('f', 5)])

    def testMany(self):
        import random
        inputs = [(-i, i) for i in range(50)]

        reversed_inputs = inputs[:]
        reversed_inputs.reverse()

        # Test the N-best for a variety of n (1, 6, 11, ... 50).
        for n in range(1, len(inputs) + 1, 5):
            expected = inputs[-n:]
            expected.reverse()

            random_inputs = inputs[:]
            random.shuffle(random_inputs)

            for source in inputs, reversed_inputs, random_inputs:
                # Try feeding them one at a time.
                nb = NBest(n)
                for item, score in source:
                    nb.add(item, score)
                self.assertEqual(len(nb), n)
                self.assertEqual(nb.capacity(), n)
                self.assertEqual(nb.getbest(), expected)

                # And again in one gulp.
                nb = NBest(n)
                nb.addmany(source)
                self.assertEqual(len(nb), n)
                self.assertEqual(nb.capacity(), n)
                self.assertEqual(nb.getbest(), expected)

                for i in range(1, n + 1):
                    self.assertEqual(nb.pop_smallest(), expected[-i])
                self.assertRaises(IndexError, nb.pop_smallest)


def test_suite():
    return makeSuite(NBestTest)

if __name__ == '__main__':
    main(defaultTest='test_suite')
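Read together, these assertions give NBest's contract: a fixed-capacity container that retains only the n highest-scored (item, score) pairs, with getbest() listing them best-first and pop_smallest() draining from the worst end. A quick usage sketch:

from Products.ZCTextIndex.NBest import NBest

nb = NBest(3)                                        # keep the best three
nb.addmany([('a', 5), ('b', 1), ('c', 9), ('d', 7)])
print nb.getbest()                                   # [('c', 9), ('d', 7), ('a', 5)]
print nb.pop_smallest()                              # ('a', 5)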
src/Products/ZCTextIndex/tests/testParseTree.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2008 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
import unittest


class ParseTreeTests(unittest.TestCase):

    def _conforms(self, klass):
        from zope.interface.verify import verifyClass
        from Products.ZCTextIndex.interfaces import IQueryParseTree
        verifyClass(IQueryParseTree, klass)

    def test_ParseTreeNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import ParseTreeNode
        self._conforms(ParseTreeNode)

    def test_OrNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import OrNode
        self._conforms(OrNode)

    def test_AndNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        self._conforms(AndNode)

    def test_NotNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import NotNode
        self._conforms(NotNode)

    def test_GlobNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import GlobNode
        self._conforms(GlobNode)

    def test_AtomNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self._conforms(AtomNode)

    def test_PhraseNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self._conforms(PhraseNode)


def test_suite():
    return unittest.TestSuite((
        unittest.makeSuite(ParseTreeTests),
    ))

if __name__ == "__main__":
    unittest.main(defaultTest='test_suite')
src/Products/ZCTextIndex/tests/testPipelineFactory.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################

from unittest import TestCase, TestSuite, main, makeSuite

from Products.ZCTextIndex.interfaces import IPipelineElement
from Products.ZCTextIndex.PipelineFactory import PipelineElementFactory

from zope.interface import implements


class NullPipelineElement:

    implements(IPipelineElement)

    def process(source):
        pass


class PipelineFactoryTest(TestCase):

    def setUp(self):
        self.huey = NullPipelineElement()
        self.dooey = NullPipelineElement()
        self.louie = NullPipelineElement()
        self.daffy = NullPipelineElement()

    def testPipeline(self):
        pf = PipelineElementFactory()
        pf.registerFactory('donald', 'huey', self.huey)
        pf.registerFactory('donald', 'dooey', self.dooey)
        pf.registerFactory('donald', 'louie', self.louie)
        pf.registerFactory('looney', 'daffy', self.daffy)
        self.assertRaises(ValueError, pf.registerFactory,
                          'donald', 'huey', self.huey)
        self.assertEqual(pf.getFactoryGroups(), ['donald', 'looney'])
        self.assertEqual(pf.getFactoryNames('donald'),
                         ['dooey', 'huey', 'louie'])


def test_suite():
    return makeSuite(PipelineFactoryTest)

if __name__ == '__main__':
    main(defaultTest='test_suite')
src/Products/ZCTextIndex/tests/testQueryEngine.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

from unittest import TestCase, TestSuite, main, makeSuite

from BTrees.IIBTree import IIBucket

from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.ParseTree import ParseError, QueryError
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter


class FauxIndex:

    def search(self, term):
        b = IIBucket()
        if term == "foo":
            b[1] = b[3] = 1
        elif term == "bar":
            b[1] = b[2] = 1
        elif term == "ham":
            b[1] = b[2] = b[3] = b[4] = 1
        return b


class TestQueryEngine(TestCase):

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.parser = QueryParser(self.lexicon)
        self.index = FauxIndex()

    def compareSet(self, set, dict):
        d = {}
        for k, v in set.items():
            d[k] = v
        self.assertEqual(d, dict)

    def compareQuery(self, query, dict):
        tree = self.parser.parseQuery(query)
        set = tree.executeQuery(self.index)
        self.compareSet(set, dict)

    def testExecuteQuery(self):
        self.compareQuery("foo AND bar", {1: 2})
        self.compareQuery("foo OR bar", {1: 2, 2: 1, 3: 1})
        self.compareQuery("foo AND NOT bar", {3: 1})
        self.compareQuery("foo AND foo AND foo", {1: 3, 3: 3})
        self.compareQuery("foo OR foo OR foo", {1: 3, 3: 3})
        self.compareQuery("ham AND NOT foo AND NOT bar", {4: 1})
        self.compareQuery("ham OR foo OR bar", {1: 3, 2: 2, 3: 2, 4: 1})
        self.compareQuery("ham AND foo AND bar", {1: 3})

    def testInvalidQuery(self):
        from Products.ZCTextIndex.ParseTree import NotNode, AtomNode
        tree = NotNode(AtomNode("foo"))
        self.assertRaises(QueryError, tree.executeQuery, self.index)


def test_suite():
    return makeSuite(TestQueryEngine)

if __name__ == '__main__':
    main(defaultTest='test_suite')
src/Products/ZCTextIndex/tests/testQueryParser.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

from unittest import TestCase, TestSuite, main, makeSuite


class TestInterfaces(TestCase):

    def testInterfaces(self):
        from zope.interface.verify import verifyClass
        from Products.ZCTextIndex.interfaces import IQueryParser
        from Products.ZCTextIndex.QueryParser import QueryParser
        verifyClass(IQueryParser, QueryParser)


class TestQueryParserBase(TestCase):

    def setUp(self):
        from Products.ZCTextIndex.QueryParser import QueryParser
        from Products.ZCTextIndex.Lexicon import Lexicon
        from Products.ZCTextIndex.Lexicon import Splitter
        self.lexicon = Lexicon(Splitter())
        self.parser = QueryParser(self.lexicon)

    def expect(self, input, output, expected_ignored=[]):
        tree = self.parser.parseQuery(input)
        ignored = self.parser.getIgnored()
        self.compareParseTrees(tree, output)
        self.assertEqual(ignored, expected_ignored)
        # Check that parseQueryEx() == (parseQuery(), getIgnored())
        ex_tree, ex_ignored = self.parser.parseQueryEx(input)
        self.compareParseTrees(ex_tree, tree)
        self.assertEqual(ex_ignored, expected_ignored)

    def failure(self, input):
        from Products.ZCTextIndex.ParseTree import ParseError
        self.assertRaises(ParseError, self.parser.parseQuery, input)
        self.assertRaises(ParseError, self.parser.parseQueryEx, input)

    def compareParseTrees(self, got, expected, msg=None):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import GlobNode
        from Products.ZCTextIndex.ParseTree import NotNode
        from Products.ZCTextIndex.ParseTree import OrNode
        from Products.ZCTextIndex.ParseTree import ParseTreeNode
        from Products.ZCTextIndex.ParseTree import PhraseNode
        if msg is None:
            msg = repr(got)
        self.assertEqual(isinstance(got, ParseTreeNode), 1)
        self.assertEqual(got.__class__, expected.__class__, msg)
        if isinstance(got, PhraseNode):
            self.assertEqual(got.nodeType(), "PHRASE", msg)
            self.assertEqual(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, GlobNode):
            self.assertEqual(got.nodeType(), "GLOB", msg)
            self.assertEqual(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, AtomNode):
            self.assertEqual(got.nodeType(), "ATOM", msg)
            self.assertEqual(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, NotNode):
            self.assertEqual(got.nodeType(), "NOT")
            self.compareParseTrees(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, AndNode) or isinstance(got, OrNode):
            self.assertEqual(got.nodeType(),
                             isinstance(got, AndNode) and "AND" or "OR", msg)
            list1 = got.getValue()
            list2 = expected.getValue()
            self.assertEqual(len(list1), len(list2), msg)
            for i in range(len(list1)):
                self.compareParseTrees(list1[i], list2[i], msg)


class TestQueryParser(TestQueryParserBase):

    def test001(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("foo", AtomNode("foo"))

    def test002(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("note", AtomNode("note"))

    def test003(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("aa and bb AND cc",
                    AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))

    def test004(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import OrNode
        self.expect("aa OR bb or cc",
                    OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))

    def test005(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import OrNode
        self.expect("aa AND bb OR cc AnD dd",
                    OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
                            AndNode([AtomNode("cc"), AtomNode("dd")])]))

    def test006(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import OrNode
        self.expect("(aa OR bb) AND (cc OR dd)",
                    AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
                             OrNode([AtomNode("cc"), AtomNode("dd")])]))

    def test007(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import NotNode
        self.expect("aa AND NOT bb",
                    AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))

    def test010(self):
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self.expect('"foo bar"', PhraseNode(["foo", "bar"]))

    def test011(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))

    def test012(self):
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self.expect('(("foo bar"))"', PhraseNode(["foo", "bar"]))

    def test013(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))

    def test014(self):
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self.expect("foo-bar", PhraseNode(["foo", "bar"]))

    def test015(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import NotNode
        self.expect("foo -bar",
                    AndNode([AtomNode("foo"), NotNode(AtomNode("bar"))]))

    def test016(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import NotNode
        self.expect("-foo bar",
                    AndNode([AtomNode("bar"), NotNode(AtomNode("foo"))]))

    def test017(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import NotNode
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self.expect("booh -foo-bar",
                    AndNode([AtomNode("booh"),
                             NotNode(PhraseNode(["foo", "bar"]))]))

    def test018(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import NotNode
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self.expect('booh -"foo bar"',
                    AndNode([AtomNode("booh"),
                             NotNode(PhraseNode(["foo", "bar"]))]))

    def test019(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('foo"bar"', AndNode([AtomNode("foo"), AtomNode("bar")]))

    def test020(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('"foo"bar', AndNode([AtomNode("foo"), AtomNode("bar")]))

    def test021(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('foo"bar"blech',
                    AndNode([AtomNode("foo"), AtomNode("bar"),
                             AtomNode("blech")]))

    def test022(self):
        from Products.ZCTextIndex.ParseTree import GlobNode
        self.expect("foo*", GlobNode("foo*"))

    def test023(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import GlobNode
        self.expect("foo* bar", AndNode([GlobNode("foo*"), AtomNode("bar")]))

    def test024(self):
        # Split by UTF-8 fullwidth space
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("foo\xe3\x80\x80bar",
                    AndNode([AtomNode("foo"), AtomNode("bar")]))

    def test025(self):
        # Split by Unicode fullwidth space
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect(u"foo\u3000bar",
                    AndNode([AtomNode(u"foo"), AtomNode(u"bar")]))

    def test101(self):
        self.failure("")

    def test102(self):
        self.failure("not")

    def test103(self):
        self.failure("or")

    def test104(self):
        self.failure("and")

    def test105(self):
        self.failure("NOT")

    def test106(self):
        self.failure("OR")

    def test107(self):
        self.failure("AND")

    def test108(self):
        self.failure("NOT foo")

    def test109(self):
        self.failure(")")

    def test110(self):
        self.failure("(")

    def test111(self):
        self.failure("foo OR")

    def test112(self):
        self.failure("foo AND")

    def test113(self):
        self.failure("OR foo")

    def test114(self):
        self.failure("AND foo")

    def test115(self):
        self.failure("(foo) bar")

    def test116(self):
        self.failure("(foo OR)")

    def test117(self):
        self.failure("(foo AND)")

    def test118(self):
        self.failure("(NOT foo)")

    def test119(self):
        self.failure("-foo")

    def test120(self):
        self.failure("-foo -bar")

    def test121(self):
        self.failure("foo OR -bar")

    def test122(self):
        self.failure("foo AND -bar")


class StopWordTestQueryParser(TestQueryParserBase):

    def setUp(self):
        from Products.ZCTextIndex.QueryParser import QueryParser
        from Products.ZCTextIndex.Lexicon import Lexicon
        from Products.ZCTextIndex.Lexicon import Splitter
        # Only 'stop' is a stopword (but 'and' is still an operator)
        self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
        self.parser = QueryParser(self.lexicon)

    def test201(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('and/', AtomNode("and"))

    def test202(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('foo AND stop', AtomNode("foo"), ["stop"])

    def test203(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('foo AND NOT stop', AtomNode("foo"), ["stop"])

    def test204(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('stop AND foo', AtomNode("foo"), ["stop"])

    def test205(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('foo OR stop', AtomNode("foo"), ["stop"])

    def test206(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('stop OR foo', AtomNode("foo"), ["stop"])

    def test301(self):
        self.failure('stop')

    def test302(self):
        self.failure('stop stop')

    def test303(self):
        self.failure('stop AND stop')

    def test304(self):
        self.failure('stop OR stop')

    def test305(self):
        self.failure('stop -foo')

    def test306(self):
        self.failure('stop AND NOT foo')


class FakeStopWordRemover:

    def process(self, list):
        return [word for word in list if word != "stop"]


def test_suite():
    return TestSuite((makeSuite(TestQueryParser),
                      makeSuite(StopWordTestQueryParser),
                      makeSuite(TestInterfaces),
                     ))

if __name__ == "__main__":
    main(defaultTest='test_suite')
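Taken together, the cases above document the query grammar: AND/OR/NOT with parentheses, "-" as shorthand for NOT, double-quoted (or hyphenated) phrases, trailing-* globs, and bare juxtaposition meaning AND. Exercising the parser directly, under the same setup these tests use:

from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.QueryParser import QueryParser

parser = QueryParser(Lexicon(Splitter()))
tree = parser.parseQuery('(foo OR bar) AND NOT baz')
print tree.nodeType()     # 'AND'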
src/Products/ZCTextIndex/tests/testSetOps.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

from unittest import TestCase, TestSuite, main, makeSuite

from BTrees.IIBTree import IIBTree, IIBucket

from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion


class TestSetOps(TestCase):

    def testEmptyLists(self):
        self.assertEqual(len(mass_weightedIntersection([])), 0)
        self.assertEqual(len(mass_weightedUnion([])), 0)

    def testIdentity(self):
        t = IIBTree([(1, 2)])
        b = IIBucket([(1, 2)])
        for x in t, b:
            for func in mass_weightedUnion, mass_weightedIntersection:
                result = func([(x, 1)])
                self.assertEqual(len(result), 1)
                self.assertEqual(list(result.items()), list(x.items()))

    def testScalarMultiply(self):
        t = IIBTree([(1, 2), (2, 3), (3, 4)])
        allkeys = [1, 2, 3]
        b = IIBucket(t)
        for x in t, b:
            self.assertEqual(list(x.keys()), allkeys)
            for func in mass_weightedUnion, mass_weightedIntersection:
                for factor in 0, 1, 5, 10:
                    result = func([(x, factor)])
                    self.assertEqual(allkeys, list(result.keys()))
                    for key in x.keys():
                        self.assertEqual(x[key] * factor, result[key])

    def testPairs(self):
        t1 = IIBTree([(1, 10), (3, 30), (7, 70)])
        t2 = IIBTree([(3, 30), (5, 50), (7, 7), (9, 90)])
        allkeys = [1, 3, 5, 7, 9]
        b1 = IIBucket(t1)
        b2 = IIBucket(t2)
        for x in t1, t2, b1, b2:
            for key in x.keys():
                self.assertEqual(key in allkeys, 1)
            for y in t1, t2, b1, b2:
                for w1, w2 in (0, 0), (1, 10), (10, 1), (2, 3):
                    # Test the union.
                    expected = []
                    for key in allkeys:
                        if x.has_key(key) or y.has_key(key):
                            result = x.get(key, 0) * w1 + y.get(key, 0) * w2
                            expected.append((key, result))
                    expected.sort()
                    got = mass_weightedUnion([(x, w1), (y, w2)])
                    self.assertEqual(expected, list(got.items()))
                    got = mass_weightedUnion([(y, w2), (x, w1)])
                    self.assertEqual(expected, list(got.items()))

                    # Test the intersection.
                    expected = []
                    for key in allkeys:
                        if x.has_key(key) and y.has_key(key):
                            result = x[key] * w1 + y[key] * w2
                            expected.append((key, result))
                    expected.sort()
                    got = mass_weightedIntersection([(x, w1), (y, w2)])
                    self.assertEqual(expected, list(got.items()))
                    got = mass_weightedIntersection([(y, w2), (x, w1)])
                    self.assertEqual(expected, list(got.items()))

    def testMany(self):
        import random
        N = 15  # number of IIBTrees to feed in
        L = []
        commonkey = N * 1000
        allkeys = {commonkey: 1}
        for i in range(N):
            t = IIBTree()
            t[commonkey] = i
            for j in range(N - i):
                key = i + j
                allkeys[key] = 1
                t[key] = N * i + j
            L.append((t, i + 1))
        random.shuffle(L)
        allkeys = allkeys.keys()
        allkeys.sort()

        # Test the union.
        expected = []
        for key in allkeys:
            sum = 0
            for t, w in L:
                if t.has_key(key):
                    sum += t[key] * w
            expected.append((key, sum))
        # print 'union', expected
        got = mass_weightedUnion(L)
        self.assertEqual(expected, list(got.items()))

        # Test the intersection.
        expected = []
        for key in allkeys:
            sum = 0
            for t, w in L:
                if t.has_key(key):
                    sum += t[key] * w
                else:
                    break
            else:
                # We didn't break out of the loop so it's in the intersection.
                expected.append((key, sum))
        # print 'intersection', expected
        got = mass_weightedIntersection(L)
        self.assertEqual(expected, list(got.items()))


def test_suite():
    return makeSuite(TestSetOps)

if __name__ == "__main__":
    main(defaultTest='test_suite')
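The expected-value loops above amount to a compact definition: given (mapping, weight) pairs, mass_weightedUnion keeps every key found in any input with value sum(m[k] * w), while mass_weightedIntersection keeps only keys found in every input, combined the same way. A small worked instance:

from BTrees.IIBTree import IIBucket
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion

x = IIBucket([(1, 10), (3, 30)])
y = IIBucket([(3, 30), (5, 50)])
print list(mass_weightedUnion([(x, 2), (y, 3)]).items())
# -> [(1, 20), (3, 150), (5, 150)]
print list(mass_weightedIntersection([(x, 2), (y, 3)]).items())
# -> [(3, 150)]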
src/Products/ZCTextIndex/tests/testStopper.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Tests for the C version of the StopWordRemover."""

import unittest

from Products.ZCTextIndex import stopper


class StopperTest(unittest.TestCase):

    def test_process_typeerror(self):
        self.assertRaises(TypeError, stopper.process, 42, [])
        self.assertRaises(TypeError, stopper.process, {}, 42)
        self.assertRaises(TypeError, stopper.process, {})
        self.assertRaises(TypeError, stopper.process, {}, [], 'extra arg')

    def test_process_nostops(self):
        words = ['a', 'b', 'c', 'splat!']
        self.assertEqual(words, stopper.process({}, words))

    def test_process_somestops(self):
        d = {'b': 1, 'splat!': 1}
        words = ['a', 'b', 'c', 'splat!']
        self.assertEqual(['a', 'c'], stopper.process(d, words))

    def test_process_allstops(self):
        d = {'a': 1, 'b': 1, 'c': 1, 'splat!': 1}
        words = ['a', 'b', 'c', 'splat!']
        self.assertEqual([], stopper.process(d, words))


def test_suite():
    return unittest.makeSuite(StopperTest)

if __name__ == "__main__":
    unittest.main(defaultTest='test_suite')
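The contract these tests pin down for the C extension is small enough to state as a pure-Python reference sketch (an illustration of the behaviour the tests assert, not the shipped implementation):

def process(stopdict, words):
    # Return the words, in order, whose entries are absent from stopdict.
    return [w for w in words if w not in stopdict]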
src/Products/ZCTextIndex/tests/testZCTextIndex.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""ZCTextIndex unit tests.

$Id$
"""

import unittest
import re

import Acquisition
from zExceptions import NotFound

from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex, PLexicon
from Products.ZCTextIndex.tests import \
     testIndex, testQueryEngine, testQueryParser
from Products.ZCTextIndex.BaseIndex import \
     scaled_int, SCALE_FACTOR, inverse_doc_frequency
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.Lexicon import Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import ParseError


class Indexable:
    def __init__(self, text):
        self.text = text


class Indexable2:
    def __init__(self, text1, text2):
        self.text1 = text1
        self.text2 = text2


class LexiconHolder(Acquisition.Implicit):
    def __init__(self, lexicon):
        self.lexicon = lexicon

    def getPhysicalPath(self):
        return ('',)  # Pretend to be the root

    def dummyUnrestrictedTraverse(self, path):
        if path == ('', 'lexicon',):
            return self.lexicon
        raise NotFound, path

# The tests classes below create a ZCTextIndex().  Then they create
# instance variables that point to the internal components used by
# ZCTextIndex.  These tests run the individual module unit tests with
# the fully integrated ZCTextIndex.

def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
    if abs(scaled1 - scaled2) > epsilon:
        raise AssertionError, "%s != %s" % (scaled1, scaled2)

# A series of text chunks to use for the re-index tests (testDocUpdate).
text = [
    """Here's a knocking indeed! If a
    man were porter of hell-gate, he should have
    old turning the key. knock (that made sure
    sure there's at least one word in common)."""

    """Knock,
    knock, knock! Who's there, i' the name of
    Beelzebub? Here's a farmer, that hanged
    himself on the expectation of plenty: come in
    time; have napkins enow about you; here
    you'll sweat for't.""",

    """Knock,
    knock! Who's there, in the other devil's
    name? Faith, here's an equivocator, that could
    swear in both the scales against either scale;
    who committed treason enough for God's sake,
    yet could not equivocate to heaven: O, come
    in, equivocator.""",

    """Knock,
    knock, knock! Who's there? Faith, here's an
    English tailor come hither, for stealing out of
    a French hose: come in, tailor; here you may
    roast your goose.""",

    """Knock,
    knock; never at quiet! What are you? But
    this place is too cold for hell. I'll devil-porter
    it no further: I had thought to have let in
    some of all professions that go the primrose
    way to the everlasting bonfire.""",
]

# Subclasses should derive from one of testIndex.{CosineIndexTest,
# OkapiIndexTest} too.

class ZCIndexTestsBase:

    def setUp(self):
        self.lexicon = PLexicon('lexicon', '',
                                Splitter(),
                                CaseNormalizer(),
                                StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name',
                                    None,
                                    caller,
                                    self.IndexFactory,
                                    'text',
                                    'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        r, num = self.zc_index.query(query)
        self.assertEqual(num, n)
        if n:
            self.assertEqual(r[0][0], 1)

    def testMultipleAttributes(self):
        lexicon = PLexicon('lexicon', '',
                           Splitter(),
                           CaseNormalizer(),
                           StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name',
                               None,
                               caller,
                               self.IndexFactory,
                               'text1,text2',
                               'lexicon')
        doc = Indexable2('foo bar', 'alpha omega')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('foo')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha gamma')
        self.assertEqual(len(nbest), 0)

    def testListAttributes(self):
        lexicon = PLexicon('lexicon', '',
                           Splitter(),
                           CaseNormalizer(),
                           StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name',
                               None,
                               caller,
                               self.IndexFactory,
                               'text1,text2',
                               'lexicon')
        doc = Indexable2('Hello Tim',
                         ['Now is the winter of our discontent',
                          'Made glorious summer by this sun of York',
                          ])
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('York Tim')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('Tuesday Tim York')
        self.assertEqual(len(nbest), 0)

    def testStopWords(self):
        # the only non-stopword is question
        text = ("to be or not to be "
                "that is the question")
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        for word in text.split():
            if word != "question":
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)

        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        docid = 1   # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()

        d = {}  # word -> list of version numbers containing that word
        for version, i in zip(text, range(N)):
            # use a simple splitter rather than an official one
            words = [w for w in re.split("\W+", version.lower())
                     if len(w) > 1 and not stop.has_key(w)]
            word_seen = {}
            for w in words:
                if not word_seen.has_key(w):
                    d.setdefault(w, []).append(i)
                    word_seen[w] = 1

        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)
        self.assert_(len(common) > 0)
        self.assert_(len(unique) > 0)

        for version, i in zip(text, range(N)):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, "did not find %s" % w)
            for k, v in unique.items():
                if k == i:
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(total, 0,
                                     "did not expect to find %s" % w)


class CosineIndexTests(ZCIndexTestsBase, testIndex.CosineIndexTest):

    # A fairly involved test of the ranking calculations based on
    # an example set of documents in queries in Managing
    # Gigabytes, pp. 180-188.  This test peeks into many internals of the
    # cosine indexer.

    def test_z3interfaces(self):
        from Products.PluginIndexes.interfaces import IPluggableIndex
        from Products.ZCTextIndex.interfaces import IZCTextIndex
        from zope.interface.verify import verifyClass

        verifyClass(IPluggableIndex, ZCTextIndex)
        verifyClass(IZCTextIndex, ZCTextIndex)

    def testRanking(self):
        self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
                      "pease", "porridge", "pot"]
        self.docs = ["Pease porridge hot, pease porridge cold,",
                     "Pease porridge in the pot,",
                     "Nine days old.",
                     "In the pot cold, in the pot hot,",
                     "Pease porridge, pease porridge,",
                     "Eat the lot."]
        self._ranking_index()
        self._ranking_tf()
        self._ranking_idf()
        self._ranking_queries()

        # A digression to exercise re-indexing.
        docs = self.docs
        for variant in "hot cold porridge python", "pease hot pithy":
            self.zc_index.index_object(len(docs), Indexable(variant))
            try:
                self._ranking_tf()
            except (AssertionError, KeyError):
                pass
            else:
                self.fail("expected _ranking_tf() to fail -- reindex")

            try:
                self._ranking_idf()
            except (AssertionError, KeyError):
                pass
            else:
                self.fail("expected _ranking_idf() to fail -- reindex")

            try:
                self._ranking_queries()
            except AssertionError:
                pass
            else:
                self.fail("expected _ranking_queries() to fail -- reindex")

        # This should leave things exactly as they were.
        self.zc_index.index_object(len(docs), Indexable(docs[-1]))
        self._ranking_tf()
        self._ranking_idf()
        self._ranking_queries()

    def _ranking_index(self):
        docs = self.docs
        for i in range(len(docs)):
            self.zc_index.index_object(i + 1, Indexable(docs[i]))

    def _ranking_tf(self):
        # matrix of term weights for the rows are docids
        # and the columns are indexes into this list:
        l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
                 (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
                 (0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
                 (1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
                 (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
                 (0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
        l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]

        for i in range(len(l_Wd)):
            docid = i + 1
            scaled_Wd = scaled_int(l_Wd[i])
            eq(scaled_Wd, self.index._get_Wd(docid))
            wdts = [scaled_int(t) for t in l_wdt[i]]
            for j in range(len(wdts)):
                wdt = self.index._get_wdt(docid, self.words[j])
                eq(wdts[j], wdt)

    def _ranking_idf(self):
        word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
        idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
        for i in range(len(self.words)):
            word = self.words[i]
            eq(word_freqs[i], self.index._get_ft(word))
            eq(scaled_int(idfs[i]), self.index._get_wt(word))

    def _ranking_queries(self):
        queries = ["eat", "porridge", "hot OR porridge",
                   "eat OR nine OR day OR old OR porridge"]
        wqs = [1.95, 1.10, 1.77, 3.55]
        results = [[(6, 0.71)],
                   [(1, 0.61), (2, 0.58), (5, 0.71)],
                   [(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
                   [(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
        for i in range(len(queries)):
            raw = queries[i]
            q = QueryParser(self.lexicon).parseQuery(raw)
            wq = self.index.query_weight(q.terms())
            eq(wq, scaled_int(wqs[i]))
            r, n = self.zc_index.query(raw)
            self.assertEqual(len(r), len(results[i]))
            # convert the results to a dict for each checking
            d = {}
            for doc, score in results[i]:
                d[doc] = scaled_int(score)
            for doc, score in r:
                score = scaled_int(float(score / SCALE_FACTOR) / wq)
                self.assert_(0 <= score <= SCALE_FACTOR)
                eq(d[doc], score)


class OkapiIndexTests(ZCIndexTestsBase, testIndex.OkapiIndexTest):

    # A white-box test.
    def testAbsoluteScores(self):
        docs = ["one",
                "one two",
                "one two three"]

        for i in range(len(docs)):
            self.zc_index.index_object(i + 1, Indexable(docs[i]))

        self._checkAbsoluteScores()

        # Exercise re-indexing.
        for variant in "one xyz", "xyz two three", "abc def":
            self.zc_index.index_object(len(docs), Indexable(variant))
            try:
                self._checkAbsoluteScores()
            except AssertionError:
                pass
            else:
                self.fail("expected _checkAbsoluteScores() to fail "
                          "-- reindex")

        # This should leave things exactly as they were.
        self.zc_index.index_object(len(docs), Indexable(docs[-1]))
        self._checkAbsoluteScores()

    def _checkAbsoluteScores(self):
        self.assertEqual(self.index._totaldoclen(), 6)
        # So the mean doc length is 2.  We use that later.

        r, num = self.zc_index.query("one")
        self.assertEqual(num, 3)
        self.assertEqual(len(r), 3)
        # Because our Okapi's B parameter is > 0, and "one" only appears
        # once in each doc, the verbosity hypothesis favors shorter docs.
        self.assertEqual([doc for doc, score in r], [1, 2, 3])

        # The way the Okapi math works, a word that appears exactly once in
        # an average (length) doc gets tf score 1.  Our second doc has
        # an average length, so its score should be 1 (tf) times the
        # inverse doc frequency of "one".  But "one" appears in every
        # doc, so its IDF is log(1 + 3/3) = log(2).
        self.assertEqual(r[1][1], scaled_int(inverse_doc_frequency(3, 3)))

        # Similarly for "two".
        r, num = self.zc_index.query("two")
        self.assertEqual(num, 2)
        self.assertEqual(len(r), 2)
        self.assertEqual([doc for doc, score in r], [2, 3])
        self.assertEqual(r[0][1], scaled_int(inverse_doc_frequency(2, 3)))

        # And "three", except that doesn't appear in an average-size doc, so
        # the math is much more involved.
        r, num = self.zc_index.query("three")
        self.assertEqual(num, 1)
        self.assertEqual(len(r), 1)
        self.assertEqual([doc for doc, score in r], [3])
        idf = inverse_doc_frequency(1, 3)
        meandoclen = 2.0
        lengthweight = 1.0 - OkapiIndex.B + OkapiIndex.B * 3 / meandoclen
        tf = (1.0 + OkapiIndex.K1) / (1.0 + OkapiIndex.K1 * lengthweight)
        self.assertEqual(r[0][1], scaled_int(tf * idf))

    # More of a black-box test, but based on insight into how Okapi is trying
    # to think.
    def testRelativeScores(self):
        # Create 9 10-word docs.
        # All contain one instance of "one".
        # Doc #i contains i instances of "two" and 9-i of "xyz".
        for i in range(1, 10):
            doc = "one " + "two " * i + "xyz " * (9 - i)
            self.zc_index.index_object(i, Indexable(doc))

        self._checkRelativeScores()

        # Exercise re-indexing.
        self.zc_index.index_object(9, Indexable("two xyz"))
        try:
            self._checkRelativeScores()
        except AssertionError:
            pass
        else:
            self.fail("expected _checkRelativeScores() to fail after reindex")

        # This should leave things exactly as they were.
        self.zc_index.index_object(9, Indexable(doc))
        self._checkRelativeScores()

    def _checkRelativeScores(self):
        r, num = self.zc_index.query("one two")
        self.assertEqual(num, 9)
        self.assertEqual(len(r), 9)
        # The more twos in a doc, the better the score should be.
        self.assertEqual([doc for doc, score in r], range(9, 0, -1))

        # Search for "two" alone shouldn't make any difference to relative
        # results.
        r, num = self.zc_index.query("two")
        self.assertEqual(num, 9)
        self.assertEqual(len(r), 9)
        self.assertEqual([doc for doc, score in r], range(9, 0, -1))

        # Searching for xyz should skip doc 9, and favor the lower-numbered
        # docs (they have more instances of xyz).
        r, num = self.zc_index.query("xyz")
        self.assertEqual(num, 8)
        self.assertEqual(len(r), 8)
        self.assertEqual([doc for doc, score in r], range(1, 9))

        # And relative results shouldn't change if we add "one".
        r, num = self.zc_index.query("xyz one")
        self.assertEqual(num, 8)
        self.assertEqual(len(r), 8)
        self.assertEqual([doc for doc, score in r], range(1, 9))

        # But if we search for all the words, it's much muddier.  The boost
        # in going from i instances to i+1 of a given word is smaller than
        # the boost in going from i-1 to i, so the winner will be the one
        # that balances the # of twos and xyzs best.  But the test is nasty
        # that way:  doc 4 has 4 two and 5 xyz, while doc 5 has the reverse.
        # However, xyz is missing from doc 9, so xyz has a larger idf than
        # two has.  Since all the doc lengths are the same, doc lengths don't
        # matter.  So doc 4 should win, and doc 5 should come in second.
        # The loser will be the most unbalanced, but is that doc 1 (1 two 8
        # xyz) or doc 8 (8 two 1 xyz)?  Again xyz has a higher idf, so doc 1
        # is more valuable, and doc 8 is the loser.
        r, num = self.zc_index.query("xyz one two")
        self.assertEqual(num, 8)
        self.assertEqual(len(r), 8)
        self.assertEqual(r[0][0], 4)    # winner
        self.assertEqual(r[1][0], 5)    # runner up
        self.assertEqual(r[-1][0], 8)   # loser
        self.assertEqual(r[-2][0], 1)   # penultimate loser

        # And nothing about the relative results in the last test should
        # change if we leave "one" out of the search (it appears in all
        # docs, so it's a wash).
        r, num = self.zc_index.query("two xyz")
        self.assertEqual(num, 8)
        self.assertEqual(len(r), 8)
        self.assertEqual(r[0][0], 4)    # winner
        self.assertEqual(r[1][0], 5)    # runner up
        self.assertEqual(r[-1][0], 8)   # loser
        self.assertEqual(r[-2][0], 1)   # penultimate loser


############################################################################
# Subclasses of QueryTestsBase must set a class variable IndexFactory to
# the kind of index to be constructed.

class QueryTestsBase(testQueryEngine.TestQueryEngine,
                     testQueryParser.TestQueryParser):

    # The FauxIndex in testQueryEngine contains four documents.
    # docid 1: foo, bar, ham
    # docid 2: bar, ham
    # docid 3: foo, ham
    # docid 4: ham
    docs = ["foo bar ham", "bar ham", "foo ham", "ham"]

    def setUp(self):
        self.lexicon = PLexicon('lexicon', '',
                                Splitter(),
                                CaseNormalizer(),
                                StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name',
                                    None,
                                    caller,
                                    self.IndexFactory,
                                    'text',
                                    'lexicon')
        self.parser = QueryParser(self.lexicon)
        self.index = self.zc_index.index
        self.add_docs()

    def add_docs(self):
        for i in range(len(self.docs)):
            text = self.docs[i]
            obj = Indexable(text)
            self.zc_index.index_object(i + 1, obj)

    def compareSet(self, set, dict):
        # XXX The FauxIndex and the real Index score documents very
        # differently.  The set comparison can't actually compare the
        # items, but it can compare the keys.  That will have to do for now.
        setkeys = list(set.keys())
        dictkeys = dict.keys()
        setkeys.sort()
        dictkeys.sort()
        self.assertEqual(setkeys, dictkeys)


class CosineQueryTests(QueryTestsBase):
    IndexFactory = CosineIndex


class OkapiQueryTests(QueryTestsBase):
    IndexFactory = OkapiIndex


class PLexiconTests(unittest.TestCase):

    def _getTargetClass(self):
        from Products.ZCTextIndex.ZCTextIndex import PLexicon
        return PLexicon

    def _makeOne(self, id='testing', title='Testing', *pipeline):
        return self._getTargetClass()(id, title, *pipeline)

    def test_class_conforms_to_ILexicon(self):
        from Products.ZCTextIndex.interfaces import ILexicon
        from zope.interface.verify import verifyClass
        verifyClass(ILexicon, self._getTargetClass())

    def test_instance_conforms_to_ILexicon(self):
        from Products.ZCTextIndex.interfaces import ILexicon
        from zope.interface.verify import verifyObject
        verifyObject(ILexicon, self._makeOne())

    def test_class_conforms_to_IZCLexicon(self):
        from Products.ZCTextIndex.interfaces import IZCLexicon
        from zope.interface.verify import verifyClass
        verifyClass(IZCLexicon, self._getTargetClass())

    def test_instance_conforms_to_IZCLexicon(self):
        from Products.ZCTextIndex.interfaces import IZCLexicon
        from zope.interface.verify import verifyObject
        verifyObject(IZCLexicon, self._makeOne())

    def test_queryLexicon_defaults_empty(self):
        lexicon = self._makeOne()
        info = lexicon.queryLexicon(REQUEST=None, words=None)
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 20)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 0)
        self.assertEqual(info['word_count'], 0)
        self.assertEqual(list(info['page_range']), [])
        self.assertEqual(info['page_columns'], [])

    def test_queryLexicon_defaults_non_empty(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=None)
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 20)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 7)
        self.assertEqual(info['word_count'], 7)
        self.assertEqual(list(info['page_range']), [0])
        self.assertEqual(info['page_columns'], [WORDS])

    def test_queryLexicon_row_breaks(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=None, rows=4)
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 4)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 7)
        self.assertEqual(info['word_count'], 7)
        self.assertEqual(list(info['page_range']), [0])
        self.assertEqual(info['page_columns'], [WORDS[0:4], WORDS[4:]])

    def test_queryLexicon_page_breaks(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=None, rows=2, cols=2)
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 2)
        self.assertEqual(info['cols'], 2)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 4)
        self.assertEqual(info['word_count'], 7)
        self.assertEqual(list(info['page_range']), [0, 1])
        self.assertEqual(info['page_columns'], [WORDS[0:2], WORDS[2:4]])

    def test_queryLexicon_page_break_not_first(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=None,
                                    page=1, rows=2, cols=2)
        self.assertEqual(info['page'], 1)
        self.assertEqual(info['rows'], 2)
        self.assertEqual(info['cols'], 2)
        self.assertEqual(info['start_word'], 5)
        self.assertEqual(info['end_word'], 7)
        self.assertEqual(info['word_count'], 7)
        self.assertEqual(list(info['page_range']), [0, 1])
        self.assertEqual(info['page_columns'], [WORDS[4:6], WORDS[6:]])

    def test_queryLexicon_words_no_globbing(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=['aaa', 'bbb'])
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 20)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 2)
        self.assertEqual(info['word_count'], 2)
        self.assertEqual(list(info['page_range']), [0])
        self.assertEqual(info['page_columns'], [['aaa', 'bbb']])

    def test_queryLexicon_words_w_globbing(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=['aa*', 'bbb*'])
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 20)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 2)
        self.assertEqual(info['word_count'], 2)
        self.assertEqual(list(info['page_range']), [0])
        self.assertEqual(info['page_columns'], [['aaa', 'bbb']])

    def test_queryLexicon_uses_pipeline_for_normalization(self):
        from Products.ZCTextIndex.Lexicon import CaseNormalizer
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne('test', 'Testing', CaseNormalizer())
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=['AA*', 'Bbb*'])
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 20)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 2)
        self.assertEqual(info['word_count'], 2)
        self.assertEqual(list(info['page_range']), [0])
        self.assertEqual(info['page_columns'], [['aaa', 'bbb']])


def test_suite():
    s = unittest.TestSuite()
    for klass in (CosineIndexTests, OkapiIndexTests,
                  CosineQueryTests, OkapiQueryTests, PLexiconTests):
        s.addTest(unittest.makeSuite(klass))
    return s

if __name__ == '__main__':
    unittest.main(defaultTest='test_suite')
src/Products/ZCTextIndex/tests/wordstats.py deleted 100644 → 0
#! /usr/bin/env python
"""Dump statistics about each word in the index.

usage: wordstats.py data.fs [index key]
"""

import ZODB
from ZODB.FileStorage import FileStorage

def main(fspath, key):
    fs = FileStorage(fspath, read_only=1)
    db = ZODB.DB(fs)
    rt = db.open().root()
    index = rt[key]

    lex = index.lexicon
    idx = index.index
    print "Words", lex.length()
    print "Documents", idx.length()

    print "Word frequencies: count, word, wid"
    for word, wid in lex.items():
        docs = idx._wordinfo[wid]
        print len(docs), word, wid

    print "Per-doc scores: wid, (doc, score,)+"
    for wid in lex.wids():
        print wid,
        docs = idx._wordinfo[wid]
        for docid, score in docs.items():
            print docid, score,
        print

if __name__ == "__main__":
    import sys
    args = sys.argv[1:]
    index_key = "index"
    if len(args) == 1:
        fspath = args[0]
    elif len(args) == 2:
        fspath, index_key = args
    else:
        print "Expected 1 or 2 args, got", len(args)
    main(fspath, index_key)
src/Products/ZCTextIndex/www/index.gif deleted 100644 → 0 (111 Bytes)
src/Products/ZCTextIndex/www/lexicon.gif deleted 100644 → 0 (364 Bytes)
versions.cfg

@@ -13,6 +13,7 @@ initgroups = 2.13.0
 Missing = 2.13.1
 MultiMapping = 2.13.0
 Persistence = 2.13.2
+Products.ZCTextIndex = 2.13.0
 Record = 2.13.0
 RestrictedPython = 3.6.0a1
 tempstorage = 2.11.3