Products.ZCTextIndex was moved to its own distribution

1c1a53d1 · Hanno Schlichting · 48f67574 · 1c1a53d1 · 1c1a53d1 · 48f67574
Commit 1c1a53d1 authored Jun 19, 2010 by Hanno Schlichting
59 changed files
--- a/buildout.cfg
+++ b/buildout.cfg
@@ -44,6 +44,7 @@ eggs =
    Missing
    MultiMapping
    Persistence
+    Products.ZCTextIndex
    Record
    RestrictedPython
    initgroups

--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 ##############################################################################
 import os
-from setuptools import setup, find_packages, Extension
+from setuptools import setup, find_packages
 setup(name='Zope2',
@@ -29,18 +29,6 @@ setup(name='Zope2',
    packages=find_packages('src'),
    namespace_packages=['Products'],
    package_dir={'': 'src'},
-    ext_modules=[
-      # indexes
-      Extension(
-            name='Products.ZCTextIndex.stopper',
-            sources=['src/Products/ZCTextIndex/stopper.c']),
-      Extension(
-            name='Products.ZCTextIndex.okascore',
-            sources=['src/Products/ZCTextIndex/okascore.c']),
-    ],
    install_requires=[
      'AccessControl',
      'Acquisition',
@@ -50,6 +38,7 @@ setup(name='Zope2',
      'Missing',
      'MultiMapping',
      'Persistence',
+      'Products.ZCTextIndex',
      'Record',
      'RestrictedPython',
      'ZConfig',

--- a/src/Products/ZCTextIndex/BaseIndex.py
+++ b/src/Products/ZCTextIndex/BaseIndex.py
--- a/src/Products/ZCTextIndex/CosineIndex.py
+++ b/src/Products/ZCTextIndex/CosineIndex.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE
-#
-##############################################################################
-"""Full text index with relevance ranking, using a cosine measure."""
-import math
-from BTrees.IIBTree import IIBucket
-from zope.interface import implements
-from Products.ZCTextIndex.interfaces import IIndex
-from Products.ZCTextIndex.BaseIndex import BaseIndex
-from Products.ZCTextIndex.BaseIndex import inverse_doc_frequency
-from Products.ZCTextIndex.BaseIndex import scaled_int
-from Products.ZCTextIndex.BaseIndex import SCALE_FACTOR
-class CosineIndex(BaseIndex):
-    implements(IIndex)
-    def __init__(self, lexicon):
-        BaseIndex.__init__(self, lexicon)
-        # ._wordinfo for cosine is wid -> {docid -> weight};
-        # t -> D -> w(d, t)/W(d)
-        # ._docweight for cosine is
-        # docid -> W(docid)
-    # Most of the computation for computing a relevance score for the
-    # document occurs in the _search_wids() method.  The code currently
-    # implements the cosine similarity function described in Managing
-    # Gigabytes, eq. 4.3, p. 187.  The index_object() method
-    # precomputes some values that are independent of the particular
-    # query.
-    # The equation is
-    #
-    #                     sum(for t in I(d,q): w(d,t) * w(q,t))
-    #     cosine(d, q) =  -------------------------------------
-    #                                  W(d) * W(q)
-    #
-    # where
-    #    I(d, q) = the intersection of the terms in d and q.
-    #
-    #    w(d, t) = 1 + log f(d, t)
-    #        computed by doc_term_weight(); for a given word t,
-    #        self._wordinfo[t] is a map from d to w(d, t).
-    #
-    #    w(q, t) = log(1 + N/f(t))
-    #        computed by inverse_doc_frequency()
-    #
-    #    W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
-    #        computed by _get_frequencies(), and remembered in
-    #        self._docweight[d]
-    #
-    #    W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
-    #        computed by self.query_weight()
-    def _search_wids(self, wids):
-        if not wids:
-            return []
-        N = float(self.document_count())
-        L = []
-        DictType = type({})
-        for wid in wids:
-            assert self._wordinfo.has_key(wid)  # caller responsible for OOV
-            d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
-            idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
-            #print "idf = %.3f" % idf
-            if isinstance(d2w, DictType):
-                d2w = IIBucket(d2w)
-            L.append((d2w, scaled_int(idf)))
-        return L
-    def query_weight(self, terms):
-        wids = []
-        for term in terms:
-            wids += self._lexicon.termToWordIds(term)
-        N = float(self.document_count())
-        sum = 0.0
-        for wid in self._remove_oov_wids(wids):
-            wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
-            sum += wt ** 2.0
-        return scaled_int(math.sqrt(sum))
-    def _get_frequencies(self, wids):
-        d = {}
-        dget = d.get
-        for wid in wids:
-            d[wid] = dget(wid, 0) + 1
-        Wsquares = 0.0
-        for wid, count in d.items():
-            w = doc_term_weight(count)
-            Wsquares += w * w
-            d[wid] = w
-        W = math.sqrt(Wsquares)
-        #print "W = %.3f" % W
-        for wid, weight in d.items():
-            #print i, ":", "%.3f" % weight,
-            d[wid] = scaled_int(weight / W)
-            #print "->", d[wid]
-        return d, scaled_int(W)
-    # The rest are helper methods to support unit tests
-    def _get_wdt(self, d, t):
-        wid, = self._lexicon.termToWordIds(t)
-        map = self._wordinfo[wid]
-        return map.get(d, 0) * self._docweight[d] / SCALE_FACTOR
-    def _get_Wd(self, d):
-        return self._docweight[d]
-    def _get_ft(self, t):
-        wid, = self._lexicon.termToWordIds(t)
-        return len(self._wordinfo[wid])
-    def _get_wt(self, t):
-        wid, = self._lexicon.termToWordIds(t)
-        map = self._wordinfo[wid]
-        return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
-def doc_term_weight(count):
-    """Return the doc-term weight for a term that appears count times."""
-    # implements w(d, t) = 1 + log f(d, t)
-    return 1.0 + math.log(count)
--- a/src/Products/ZCTextIndex/HTMLSplitter.py
+++ b/src/Products/ZCTextIndex/HTMLSplitter.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-import re
-from zope.interface import implements
-from Products.ZCTextIndex.interfaces import ISplitter
-from Products.ZCTextIndex.PipelineFactory import element_factory
-class HTMLWordSplitter:
-    implements(ISplitter)
-    def process(self, text, wordpat=r"(?L)\w+"):
-        splat = []
-        for t in text:
-            splat += self._split(t, wordpat)
-        return splat
-    def processGlob(self, text):
-        # see Lexicon.globToWordIds()
-        return self.process(text, r"(?L)\w+[\w*?]*")
-    def _split(self, text, wordpat):
-        text = text.lower()
-        remove = [r"<[^<>]*>",
-                  r"&[A-Za-z]+;"]
-        for pat in remove:
-            text = re.sub(pat, " ", text)
-        return re.findall(wordpat, text)
-element_factory.registerFactory('Word Splitter',
-                                'HTML aware splitter',
-                                HTMLWordSplitter)
-if __name__ == "__main__":
-    import sys
-    splitter = HTMLWordSplitter()
-    for path in sys.argv[1:]:
-        f = open(path, "rb")
-        buf = f.read()
-        f.close()
-        print path
-        print splitter.process([buf])
--- a/src/Products/ZCTextIndex/IIndex.py
+++ b/src/Products/ZCTextIndex/IIndex.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Index Interface."""
-from Products.ZCTextIndex.interfaces import IIndex # BBB
--- a/src/Products/ZCTextIndex/INBest.py
+++ b/src/Products/ZCTextIndex/INBest.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-from Products.ZCTextIndex.interfaces import INBest # BBB
--- a/src/Products/ZCTextIndex/IPipelineElement.py
+++ b/src/Products/ZCTextIndex/IPipelineElement.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE
-#
-##############################################################################
-from Products.ZCTextIndex.interfaces import IPipelineElement # BBB
--- a/src/Products/ZCTextIndex/IPipelineElementFactory.py
+++ b/src/Products/ZCTextIndex/IPipelineElementFactory.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE
-#
-##############################################################################
-from Products.ZCTextIndex.interfaces import IPipelineElementFactory # BBB
--- a/src/Products/ZCTextIndex/IQueryParseTree.py
+++ b/src/Products/ZCTextIndex/IQueryParseTree.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-from Products.ZCTextIndex.interfaces import IPipelineElementFactory # BBB
--- a/src/Products/ZCTextIndex/IQueryParser.py
+++ b/src/Products/ZCTextIndex/IQueryParser.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-from Products.ZCTextIndex.interfaces import IQueryParser # BBB
--- a/src/Products/ZCTextIndex/ISplitter.py
+++ b/src/Products/ZCTextIndex/ISplitter.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE
-#
-##############################################################################
-from Products.ZCTextIndex.interfaces import ISplitter # BBB
--- a/src/Products/ZCTextIndex/Lexicon.py
+++ b/src/Products/ZCTextIndex/Lexicon.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Lexicon.
-$Id$
-"""
-import re
-from BTrees.IOBTree import IOBTree
-from BTrees.OIBTree import OIBTree
-from BTrees.Length import Length
-from Persistence import Persistent
-from zope.interface import implements
-from Products.ZCTextIndex.interfaces import ILexicon
-from Products.ZCTextIndex.StopDict import get_stopdict
-from Products.ZCTextIndex.ParseTree import QueryError
-from Products.ZCTextIndex.PipelineFactory import element_factory
-class Lexicon(Persistent):
-    implements(ILexicon)
-    def __init__(self, *pipeline):
-        self._wids = OIBTree()  # word -> wid
-        self._words = IOBTree() # wid -> word
-        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
-        # of vocabulary).  This can happen, e.g., if a query contains a word
-        # we never saw before, and that isn't a known stopword (or otherwise
-        # filtered out).  Returning a special wid value for OOV words is a
-        # way to let clients know when an OOV word appears.
-        self.length = Length()
-        self._pipeline = pipeline
-    def length(self):
-        """Return the number of unique terms in the lexicon."""
-        # Overridden in instances
-        return len(self._wids)
-    def words(self):
-        return self._wids.keys()
-    def wids(self):
-        return self._words.keys()
-    def items(self):
-        return self._wids.items()
-    def sourceToWordIds(self, text):
-        last = _text2list(text)
-        for element in self._pipeline:
-            last = element.process(last)
-        if not hasattr(self.length, 'change'):
-            # Make sure length is overridden with a BTrees.Length.Length
-            self.length = Length(self.length())        
-        # Strategically unload the length value so that we get the most
-        # recent value written to the database to minimize conflicting wids
-        # Because length is independent, this will load the most
-        # recent value stored, regardless of whether MVCC is enabled
-        self.length._p_deactivate()
-        return map(self._getWordIdCreate, last)
-    def termToWordIds(self, text):
-        last = _text2list(text)
-        for element in self._pipeline:
-            process = getattr(element, "process_post_glob", element.process) 
-            last = process(last)
-        wids = []
-        for word in last:
-            wids.append(self._wids.get(word, 0))
-        return wids
-    def parseTerms(self, text):
-        last = _text2list(text)
-        for element in self._pipeline:
-            process = getattr(element, "processGlob", element.process)
-            last = process(last)
-        return last
-    def isGlob(self, word):
-        return "*" in word or "?" in word
-    def get_word(self, wid):
-        return self._words[wid]
-    def get_wid(self, word):
-        return self._wids.get(word, 0)
-    def globToWordIds(self, pattern):
-        # Implement * and ? just as in the shell, except the pattern
-        # must not start with either of these
-        prefix = ""
-        while pattern and pattern[0] not in "*?":
-            prefix += pattern[0]
-            pattern = pattern[1:]
-        if not pattern:
-            # There were no globbing characters in the pattern
-            wid = self._wids.get(prefix, 0)
-            if wid:
-                return [wid]
-            else:
-                return []
-        if not prefix:
-            # The pattern starts with a globbing character.
-            # This is too efficient, so we raise an exception.
-            raise QueryError(
-                "pattern %r shouldn't start with glob character" % pattern)
-        pat = prefix
-        for c in pattern:
-            if c == "*":
-                pat += ".*"
-            elif c == "?":
-                pat += "."
-            else:
-                pat += re.escape(c)
-        pat += "$"
-        prog = re.compile(pat)
-        keys = self._wids.keys(prefix) # Keys starting at prefix
-        wids = []
-        for key in keys:
-            if not key.startswith(prefix):
-                break
-            if prog.match(key):
-                wids.append(self._wids[key])
-        return wids
-    def _getWordIdCreate(self, word):
-        wid = self._wids.get(word)
-        if wid is None:
-            wid = self._new_wid()
-            self._wids[word] = wid
-            self._words[wid] = word
-        return wid
-    def _new_wid(self):
-        self.length.change(1)
-        while self._words.has_key(self.length()): # just to be safe
-            self.length.change(1)
-        return self.length()
-def _text2list(text):
-    # Helper: splitter input may be a string or a list of strings
-    try:
-        text + ""
-    except:
-        return text
-    else:
-        return [text]
-# Sample pipeline elements
-class Splitter:
-    import re
-    rx = re.compile(r"(?L)\w+")
-    rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above
-    def process(self, lst):
-        result = []
-        for s in lst:
-            result += self.rx.findall(s)
-        return result
-    def processGlob(self, lst):
-        result = []
-        for s in lst:
-            result += self.rxGlob.findall(s)
-        return result
-element_factory.registerFactory('Word Splitter',
-                                 'Whitespace splitter',
-                                 Splitter)
-class CaseNormalizer:
-    def process(self, lst):
-        return [w.lower() for w in lst]
-element_factory.registerFactory('Case Normalizer',
-                                'Case Normalizer',
-                                CaseNormalizer)
-element_factory.registerFactory('Stop Words',
-                                ' Don\'t remove stop words',
-                                None)
-class StopWordRemover:
-    dict = get_stopdict().copy()
-    try:
-        from Products.ZCTextIndex.stopper import process as _process
-    except ImportError:
-        def process(self, lst):
-            has_key = self.dict.has_key
-            return [w for w in lst if not has_key(w)]
-    else:
-        def process(self, lst):
-            return self._process(self.dict, lst)
-element_factory.registerFactory('Stop Words',
-                                'Remove listed stop words only',
-                                StopWordRemover)
-class StopWordAndSingleCharRemover(StopWordRemover):
-    dict = get_stopdict().copy()
-    for c in range(255):
-        dict[chr(c)] = None
-element_factory.registerFactory('Stop Words',
-                                'Remove listed and single char words',
-                                StopWordAndSingleCharRemover)
--- a/src/Products/ZCTextIndex/NBest.py
+++ b/src/Products/ZCTextIndex/NBest.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE
-#
-##############################################################################
-"""NBest
-An NBest object remembers the N best-scoring items ever passed to its
-.add(item, score) method.  If .add() is called M times, the worst-case
-number of comparisons performed overall is M * log2(N).
-"""
-from bisect import bisect
-from zope.interface import implements
-from Products.ZCTextIndex.interfaces import INBest
-class NBest:
-    implements(INBest)
-    def __init__(self, N):
-        "Build an NBest object to remember the N best-scoring objects."
-        if N < 1:
-            raise ValueError("NBest() argument must be at least 1")
-        self._capacity = N
-        # This does a very simple thing with sorted lists.  For large
-        # N, a min-heap can be unboundedly better in terms of data
-        # movement time.
-        self._scores = []
-        self._items = []
-    def __len__(self):
-        return len(self._scores)
-    def capacity(self):
-        return self._capacity
-    def add(self, item, score):
-        self.addmany([(item, score)])
-    def addmany(self, sequence):
-        scores, items, capacity = self._scores, self._items, self._capacity
-        n = len(scores)
-        for item, score in sequence:
-            # When we're in steady-state, the usual case is that we're filled
-            # to capacity, and that an incoming item is worse than any of
-            # the best-seen so far.
-            if n >= capacity and score <= scores[0]:
-                continue
-            i = bisect(scores, score)
-            scores.insert(i, score)
-            items.insert(i, item)
-            if n == capacity:
-                del items[0], scores[0]
-            else:
-                n += 1
-        assert n == len(scores)
-    def getbest(self):
-        result = zip(self._items, self._scores)
-        result.reverse()
-        return result
-    def pop_smallest(self):
-        if self._scores:
-            return self._items.pop(0), self._scores.pop(0)
-        raise IndexError("pop_smallest() called on empty NBest object")
--- a/src/Products/ZCTextIndex/OkapiIndex.py
+++ b/src/Products/ZCTextIndex/OkapiIndex.py
--- a/src/Products/ZCTextIndex/ParseTree.py
+++ b/src/Products/ZCTextIndex/ParseTree.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Generic parser support: exception and parse tree nodes."""
-from BTrees.IIBTree import difference
-from zope.interface import implements
-from Products.ZCTextIndex.interfaces import IQueryParseTree
-from Products.ZCTextIndex.SetOps import mass_weightedIntersection
-from Products.ZCTextIndex.SetOps import mass_weightedUnion
-class QueryError(Exception):
-    pass
-class ParseError(Exception):
-    pass
-class ParseTreeNode:
-    implements(IQueryParseTree)
-    _nodeType = None
-    def __init__(self, value):
-        self._value = value
-    def nodeType(self):
-        return self._nodeType
-    def getValue(self):
-        return self._value
-    def __repr__(self):
-        return "%s(%r)" % (self.__class__.__name__, self.getValue())
-    def terms(self):
-        t = []
-        for v in self.getValue():
-            t.extend(v.terms())
-        return t
-    def executeQuery(self, index):
-        raise NotImplementedError
-class NotNode(ParseTreeNode):
-    _nodeType = "NOT"
-    def terms(self):
-        return []
-    def executeQuery(self, index):
-        raise QueryError, "NOT parse tree node cannot be executed directly"
-class AndNode(ParseTreeNode):
-    _nodeType = "AND"
-    def executeQuery(self, index):
-        L = []
-        Nots = []
-        for subnode in self.getValue():
-            if subnode.nodeType() == "NOT":
-                r = subnode.getValue().executeQuery(index)
-                # If None, technically it matches every doc, but we treat
-                # it as if it matched none (we want
-                #     real_word AND NOT stop_word
-                # to act like plain real_word).
-                if r is not None:
-                    Nots.append((r, 1))
-            else:
-                r = subnode.executeQuery(index)
-                # If None, technically it matches every doc, so needn't be
-                # included.
-                if r is not None:
-                    L.append((r, 1))
-        set = mass_weightedIntersection(L)
-        if Nots:
-            notset = mass_weightedUnion(Nots)
-            set = difference(set, notset)
-        return set
-class OrNode(ParseTreeNode):
-    _nodeType = "OR"
-    def executeQuery(self, index):
-        weighted = []
-        for node in self.getValue():
-            r = node.executeQuery(index)
-            # If None, technically it matches every doc, but we treat
-            # it as if it matched none (we want
-            #     real_word OR stop_word
-            # to act like plain real_word).
-            if r is not None:
-                weighted.append((r, 1))
-        return mass_weightedUnion(weighted)
-class AtomNode(ParseTreeNode):
-    _nodeType = "ATOM"
-    def terms(self):
-        return [self.getValue()]
-    def executeQuery(self, index):
-        return index.search(self.getValue())
-class PhraseNode(AtomNode):
-    _nodeType = "PHRASE"
-    def executeQuery(self, index):
-        return index.search_phrase(self.getValue())
-class GlobNode(AtomNode):
-    _nodeType = "GLOB"
-    def executeQuery(self, index):
-        return index.search_glob(self.getValue())
--- a/src/Products/ZCTextIndex/PipelineFactory.py
+++ b/src/Products/ZCTextIndex/PipelineFactory.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE
-#
-##############################################################################
-from zope.interface import implements
-from Products.ZCTextIndex.interfaces import IPipelineElementFactory
-class PipelineElementFactory:
-    implements(IPipelineElementFactory)
-    def __init__(self):
-        self._groups = {}
-    def registerFactory(self, group, name, factory):
-        if self._groups.has_key(group) and \
-           self._groups[group].has_key(name):
-            raise ValueError('ZCTextIndex lexicon element "%s" '
-                             'already registered in group "%s"'
-                             % (name, group))
-        elements = self._groups.get(group)
-        if elements is None:
-            elements = self._groups[group] = {}
-        elements[name] = factory
-    def getFactoryGroups(self):
-        groups = self._groups.keys()
-        groups.sort()
-        return groups
-    def getFactoryNames(self, group):
-        names = self._groups[group].keys()
-        names.sort()
-        return names
-    def instantiate(self, group, name):
-        factory = self._groups[group][name]
-        if factory is not None:
-            return factory()
-element_factory = PipelineElementFactory()
--- a/src/Products/ZCTextIndex/QueryParser.py
+++ b/src/Products/ZCTextIndex/QueryParser.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Query Parser.
-This particular parser recognizes the following syntax:
-Start = OrExpr
-OrExpr = AndExpr ('OR' AndExpr)*
-AndExpr = Term ('AND' NotExpr)*
-NotExpr = ['NOT'] Term
-Term = '(' OrExpr ')' | ATOM+
-The key words (AND, OR, NOT) are recognized in any mixture of case.
-An ATOM is either:
-+ A sequence of characters not containing whitespace or parentheses or
-  double quotes, and not equal (ignoring case) to one of the key words
-  'AND', 'OR', 'NOT'; or
-+ A non-empty string enclosed in double quotes.  The interior of the
-  string can contain whitespace, parentheses and key words, but not
-  quotes.
-+ A hyphen followed by one of the two forms above, meaning that it
-  must not be present.
-An unquoted ATOM may also contain globbing characters.  Globbing
-syntax is defined by the lexicon; for example "foo*" could mean any
-word starting with "foo".
-When multiple consecutive ATOMs are found at the leaf level, they are
-connected by an implied AND operator, and an unquoted leading hyphen
-is interpreted as a NOT operator.
-Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- * and ? are used for globbing (i.e. prefix search), e.g. ``foo*''
-"""
-import re
-from zope.interface import implements
-from Products.ZCTextIndex.interfaces import IQueryParser
-from Products.ZCTextIndex import ParseTree
-# Create unique symbols for token types.  They are interned because the
-# parser compares token types by identity (``is``), not by equality.
-_AND    = intern("AND")
-_OR     = intern("OR")
-_NOT    = intern("NOT")
-_LPAREN = intern("(")
-_RPAREN = intern(")")
-_ATOM   = intern("ATOM")
-_EOF    = intern("EOF")
-# Map keyword string to token type.  Tokens are upper-cased before the
-# lookup, so keywords match in any mixture of case; a token that is not
-# listed here is classified as an ATOM.
-_keywords = {
-    _AND:       _AND,
-    _OR:        _OR,
-    _NOT:       _NOT,
-    _LPAREN:    _LPAREN,
-    _RPAREN:    _RPAREN,
-}
-# Regular expression to tokenize.  It is applied with findall(), so
-# characters that match none of the alternatives (whitespace, an unpaired
-# double quote) are silently skipped.
-_tokenizer_regex = re.compile(r"""
-    # a paren
-    [()]
-    # or an optional hyphen
-|   -?
-    # followed by
-    (?:
-        # a string inside double quotes (and not containing these)
-        " [^"]* "
-        # or a non-empty stretch w/o whitespace, parens or double quotes
-    |    [^()\s"]+
-    )
-""", re.VERBOSE)
-# Use unicode regex to treat fullwidth space characters defined in Unicode
-# as valid whitespace.
-_tokenizer_unicode_regex = re.compile(
-    _tokenizer_regex.pattern, _tokenizer_regex.flags|re.UNICODE)
-class QueryParser:
-    # Recursive descent parser for the query syntax described in the
-    # module docstring.  The lexicon passed to the constructor must provide
-    # parseTerms(term), returning the words of an ATOM (an empty result
-    # means the term is ignored), and isGlob(word).
-    implements(IQueryParser)
-    # This class is not thread-safe;
-    # each thread should have its own instance
-    def __init__(self, lexicon):
-        self._lexicon = lexicon
-        self._ignored = None
-    # Public API methods
-    def parseQuery(self, query):
-        # Return the parse tree for `query`.  Raises ParseTree.ParseError
-        # on a syntax error or when every term of the query was ignored.
-        # Lexical analysis.
-        try:
-            # Try to use unicode and treat fullwidth whitespace as valid one.
-            if not isinstance(query, unicode):
-                query = query.decode('utf-8')
-            tokens = _tokenizer_unicode_regex.findall(query)
-        except UnicodeDecodeError:
-            # Not valid UTF-8: tokenize the undecoded string instead.
-            tokens = _tokenizer_regex.findall(query)
-        # classify tokens
-        tokentypes = [_keywords.get(token.upper(), _ATOM)
-                      for token in tokens]
-        # add _EOF, so there is always a current token to look at
-        tokens.append(_EOF)
-        tokentypes.append(_EOF)
-        self._tokens = tokens
-        self._tokentypes = tokentypes
-        self._index = 0
-        # Syntactical analysis.
-        self._ignored = [] # Ignored words in the query, for parseQueryEx
-        tree = self._parseOrExpr()
-        self._require(_EOF)
-        if tree is None:
-            raise ParseTree.ParseError(
-                "Query contains only common words: %s" % repr(query))
-        return tree
-    def getIgnored(self):
-        # Terms of the most recent parseQuery() call for which the lexicon
-        # returned no words (None before the first call).
-        return self._ignored
-    def parseQueryEx(self, query):
-        # Like parseQuery(), but return a (tree, ignored terms) pair.
-        tree = self.parseQuery(query)
-        ignored = self.getIgnored()
-        return tree, ignored
-    # Recursive descent parser
-    def _require(self, tokentype):
-        # Consume the current token, which must be of the given type.
-        if not self._check(tokentype):
-            t = self._tokens[self._index]
-            msg = "Token %r required, %r found" % (tokentype, t)
-            raise ParseTree.ParseError(msg)
-    def _check(self, tokentype):
-        # Consume the current token if it has the given type; report
-        # whether it did.
-        if self._tokentypes[self._index] is tokentype:
-            self._index += 1
-            return True
-        return False
-    def _peek(self, tokentype):
-        # Test the type of the current token without consuming it.
-        return self._tokentypes[self._index] is tokentype
-    def _get(self, tokentype):
-        # Consume and return the current token, which must be of the
-        # given type.
-        t = self._tokens[self._index]
-        self._require(tokentype)
-        return t
-    def _parseOrExpr(self):
-        # OrExpr = AndExpr ('OR' AndExpr)*
-        L = [self._parseAndExpr()]
-        while self._check(_OR):
-            L.append(self._parseAndExpr())
-        L = [node for node in L if node]
-        if not L:
-            return None # Only stopwords
-        if len(L) == 1:
-            return L[0]
-        return ParseTree.OrNode(L)
-    def _parseAndExpr(self):
-        # AndExpr = Term ('AND' NotExpr)*
-        L = []
-        t = self._parseTerm()
-        if t is not None:
-            L.append(t)
-        # NOT terms are collected separately and placed after the
-        # positive terms.
-        Nots = []
-        while self._check(_AND):
-            t = self._parseNotExpr()
-            if t is None:
-                continue
-            if isinstance(t, ParseTree.NotNode):
-                Nots.append(t)
-            else:
-                L.append(t)
-        if not L:
-            return None # Only stopwords
-        L.extend(Nots)
-        if len(L) == 1:
-            return L[0]
-        return ParseTree.AndNode(L)
-    def _parseNotExpr(self):
-        # NotExpr = ['NOT'] Term
-        if not self._check(_NOT):
-            return self._parseTerm()
-        t = self._parseTerm()
-        if t is None:
-            return None # Only stopwords
-        return ParseTree.NotNode(t)
-    def _parseTerm(self):
-        # Term = '(' OrExpr ')' | ATOM+
-        if self._check(_LPAREN):
-            tree = self._parseOrExpr()
-            self._require(_RPAREN)
-            return tree
-        nodes = [self._parseAtom()]
-        while self._peek(_ATOM):
-            nodes.append(self._parseAtom())
-        nodes = [node for node in nodes if node]
-        if not nodes:
-            return None # Only stopwords
-        # Keep the atoms in their original relative order, but move the
-        # negated ones (leading hyphen) behind the positive ones.
-        positive = [node for node in nodes
-                    if not isinstance(node, ParseTree.NotNode)]
-        negative = [node for node in nodes
-                    if isinstance(node, ParseTree.NotNode)]
-        if not positive:
-            raise ParseTree.ParseError(
-                "a term must have at least one positive word")
-        nodes = positive + negative
-        if len(nodes) == 1:
-            return nodes[0]
-        return ParseTree.AndNode(nodes)
-    def _parseAtom(self):
-        # Turn one ATOM token into a Phrase, Glob or Atom node (wrapped in
-        # a NotNode when the token starts with a hyphen).  Returns None and
-        # records the term as ignored when the lexicon yields no words.
-        term = self._get(_ATOM)
-        words = self._lexicon.parseTerms(term)
-        if not words:
-            self._ignored.append(term)
-            return None
-        if len(words) > 1:
-            tree = ParseTree.PhraseNode(words)
-        elif self._lexicon.isGlob(words[0]):
-            tree = ParseTree.GlobNode(words[0])
-        else:
-            tree = ParseTree.AtomNode(words[0])
-        if term[0] == "-":
-            tree = ParseTree.NotNode(tree)
-        return tree
--- a/src/Products/ZCTextIndex/README.txt
+++ b/src/Products/ZCTextIndex/README.txt
-ZCTextIndex
-===========
-This product is a replacement for the full text indexing facility of
-ZCatalog.  Specifically, it is an alternative to
-PluginIndexes/TextIndex.
-Advantages of using ZCTextIndex over TextIndex:
- A new query language, supporting both explicit and implicit Boolean
-  operators, parentheses, globbing, and phrase searching.  Apart from
-  explicit operators and globbing, the syntax is roughly the same as
-  that popularized by Google.
- A more refined scoring algorithm, resulting in better selectiveness:
-  it's much more likely that you'll find the document you are looking
-  for among the first few highest-ranked results.
- Actually, ZCTextIndex gives you a choice of two scoring algorithms
-  from recent literature: the Cosine ranking from the Managing
-  Gigabytes book, and Okapi from more recent research papers.  Okapi
-  usually does better, so it is the default (but your mileage may
-  vary).
- A redesigned Lexicon, using a pipeline architecture to split the
-  input text into words.  This makes it possible to mix and match
-  pipeline components, e.g. you can choose between an HTML-aware
-  splitter and a plain text splitter, and additional components can be
-  added to the pipeline for case folding, stopword removal, and other
-  features.  Enough example pipeline components are provided to get
-  you started, and it is very easy to write new components.
-Performance is roughly the same as for TextIndex, and we're expecting
-to make tweaks to the code that will make it faster.
-This code can be used outside of Zope too; all you need is a
-standalone ZODB installation to make your index persistent.  Several
-functional test programs in the tests subdirectory show how to do
-this, for example mhindex.py, mailtest.py, indexhtml.py, and
-queryhtml.py.
-See the online help for how to use ZCTextIndex within Zope.  (Included
-in the subdirectory "help".)
-Code overview
-------------
-ZMI interface:
-__init__.py			ZMI publishing code
-ZCTextIndex.py			pluggable index class
-PipelineFactory.py		ZMI helper to configure the pipeline
-Indexing:
-BaseIndex.py			common code for Cosine and Okapi index
-CosineIndex.py			Cosine index implementation
-OkapiIndex.py			Okapi index implementation
-okascore.c			C implementation of scoring loop
-Lexicon:
-Lexicon.py			lexicon and sample pipeline elements
-HTMLSplitter.py			HTML-aware splitter
-StopDict.py			list of English stopwords
-stopper.c			C implementation of stop word remover
-Query parser:
-QueryParser.py			parse a query into a parse tree
-ParseTree.py			parse tree node classes and exceptions
-Utilities:
-NBest.py			find N best items in a list without sorting
-SetOps.py			efficient weighted set operations
-WidCode.py			list compression allowing phrase searches
-RiceCode.py			list compression code (as yet unused)
-Interfaces (these speak for themselves):
-IIndex.py
-ILexicon.py
-INBest.py
-IPipelineElement.py
-IPipelineElementFactory.py
-IQueryParseTree.py
-IQueryParser.py
-ISplitter.py
-Subdirectories:
-dtml				ZMI templates
-help				ZMI help files
-tests				unittests and some functional tests/examples
-www				images used in the ZMI
-Tests
-----
-Functional tests and helpers:
-hs-tool.py			helper to interpret hotshot profiler logs
-indexhtml.py			index a collection of HTML files
-mailtest.py			index and query a Unix mailbox file
-mhindex.py			index and query a set of MH folders
-python.txt			output from benchmark queries
-queryhtml.py			query an index created by indexhtml.py
-wordstats.py			dump statistics about each indexed word
-Unit tests (these speak for themselves):
-testIndex.py			
-testLexicon.py
-testNBest.py
-testPipelineFactory.py
-testQueryEngine.py
-testQueryParser.py
-testSetOps.py
-testStopper.py
-testZCTextIndex.py
--- a/src/Products/ZCTextIndex/RiceCode.py
+++ b/src/Products/ZCTextIndex/RiceCode.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Rice coding (a variation of Golomb coding)
-Based on a Java implementation by Glen McCluskey described in a Usenix
- ;login: article at
-http://www.usenix.org/publications/login/2000-4/features/java.html
-McCluskey's article explains the approach as follows.  The encoding
-for a value x is represented as a unary part and a binary part.  The
-unary part is a sequence of 1 bits followed by a 0 bit.  The binary
-part encodes some of the lower bits of x-1.
-The encoding is parameterized by a value m that describes how many
-bits to store in the binary part.  If most of the values are smaller
-than 2**m then they can be stored in only m+1 bits.
-Compute the length of the unary part, q, where
-   q = math.floor((x-1)/ 2 ** m)
-   Emit q 1 bits followed by a 0 bit.
-Emit the lower m bits of x-1, treating x-1 as a binary value.
-"""
-import array
-class BitArray:
-    """Growable array of bits, packed eight per byte, lowest bit first."""
-    def __init__(self, buf=None):
-        # NOTE: `buf` is accepted but not used; the array starts out empty.
-        self.bytes = array.array('B')
-        self.nbits = 0
-        self.bitsleft = 0
-    def tostring(self):
-        """Return the packed bits as a string of bytes."""
-        # A real method instead of a bound method cached by __init__, so
-        # that it exists, and sees the current array, after __setstate__.
-        return self.bytes.tostring()
-    def __getitem__(self, i):
-        """Return bit i as 0 or 1."""
-        byte, offset = divmod(i, 8)
-        return (self.bytes[byte] >> offset) & 1
-    def __setitem__(self, i, val):
-        """Set bit i if val is true, clear it otherwise."""
-        byte, offset = divmod(i, 8)
-        mask = 1 << offset
-        if val:
-            self.bytes[byte] |= mask
-        else:
-            self.bytes[byte] &= ~mask
-    def __len__(self):
-        return self.nbits
-    def append(self, bit):
-        """Append a 1 if bit is true or 0 if it is false."""
-        if self.bitsleft == 0:
-            # The last byte is full (or there is none yet): start a new one.
-            self.bytes.append(0)
-            self.bitsleft = 8
-        self[self.nbits] = bit
-        self.nbits += 1
-        self.bitsleft -= 1
-    def __getstate__(self):
-        return self.nbits, self.bitsleft, self.tostring()
-    def __setstate__(self, state):
-        nbits, bitsleft, s = state
-        self.bytes = array.array('B', s)
-        self.nbits = nbits
-        self.bitsleft = bitsleft
-class RiceCode:
-    """Append-only list of integers >= 1, stored Rice-coded in a BitArray."""
-    def __init__(self, m):
-        """Construct a RiceCode for m-bit values.
-        Raises ValueError unless 0 <= m <= 16.
-        """
-        if not (0 <= m <= 16):
-            raise ValueError("m must be between 0 and 16")
-        self.init(m)
-        self.bits = BitArray()
-        self.len = 0
-    def init(self, m):
-        """Set the parameter m and the masks derived from it."""
-        self.m = m
-        self.lower = (1 << m) - 1
-        # Highest bit of the m-bit binary part; 0 when m == 0.  Written as
-        # a right shift because 1 << (m - 1) raises ValueError for m == 0.
-        self.mask = (1 << m) >> 1
-    def append(self, val):
-        """Append an item to the list.
-        Raises ValueError if val is less than 1.
-        """
-        if val < 1:
-            raise ValueError("value >= 1 expected, got %s" % repr(val))
-        val -= 1
-        bits = self.bits
-        # emit the unary part of the code
-        for i in range(val >> self.m):
-            bits.append(1)
-        bits.append(0)
-        # emit the binary part, highest bit first
-        r = val & self.lower
-        mask = self.mask
-        while mask:
-            bits.append(r & mask)
-            mask >>= 1
-        self.len += 1
-    def __len__(self):
-        return self.len
-    def tolist(self):
-        """Return the items as a list."""
-        l = []
-        i = 0 # bit offset
-        binary_range = range(self.m)
-        for j in range(self.len):
-            # unary part: count the 1 bits up to the terminating 0 bit
-            unary = 0
-            while self.bits[i] == 1:
-                unary += 1
-                i += 1
-            assert self.bits[i] == 0
-            i += 1
-            # binary part: the next m bits, highest bit first
-            binary = 0
-            for k in binary_range:
-                binary = (binary << 1) | self.bits[i]
-                i += 1
-            l.append((unary << self.m) + (binary + 1))
-        return l
-    def tostring(self):
-        """Return a binary string containing the encoded data.
-        The binary string may contain some extra zeros at the end.
-        """
-        return self.bits.tostring()
-    def __getstate__(self):
-        return self.m, self.bits
-    def __setstate__(self, state):
-        m, bits = state
-        self.init(m)
-        self.bits = bits
-        # The pickled state does not carry the item count, so recover it by
-        # walking the bit stream: each item is a run of 1 bits, one 0 bit
-        # and then m binary bits.
-        count = 0
-        i = 0
-        nbits = len(bits)
-        while i < nbits:
-            while bits[i]:
-                i += 1
-            i += 1 + m
-            count += 1
-        self.len = count
-def encode(m, l):
-    """Return a RiceCode with parameter m holding the items of l, in order."""
-    code = RiceCode(m)
-    add = code.append
-    for item in l:
-        add(item)
-    # Self-check: decoding must give back the input list.
-    assert code.tolist() == l
-    return code
-def encode_deltas(l):
-    """Return (first item, Rice-coded differences between neighbours).
-    For a one-item list the second element is a plain empty list.
-    """
-    first = l[0]
-    if len(l) == 1:
-        return first, []
-    deltas = RiceCode(6)
-    for i in range(1, len(l)):
-        deltas.append(l[i] - l[i - 1])
-    return first, deltas
-def decode_deltas(start, enc_deltas):
-    """Rebuild a list from its first item and its encoded deltas.
-    enc_deltas is either an object with a tolist() method or a plain
-    sequence of deltas (encode_deltas returns [] for a one-item list).
-    """
-    if hasattr(enc_deltas, "tolist"):
-        deltas = enc_deltas.tolist()
-    else:
-        deltas = list(enc_deltas)
-    l = [start]
-    # Every delta is applied exactly once, in order, to the running value.
-    for delta in deltas:
-        l.append(l[-1] + delta)
-    return l
-def test():
-    """Self-test: round-trip random lists through encode() and the delta codec."""
-    import random
-    sizes = [10, 20, 50, 100, 200]
-    for size in sizes:
-        l = [random.randint(1, size) for i in range(50)]
-        c = encode(random.randint(1, 16), l)
-        assert c.tolist() == l
-    for size in sizes:
-        l = range(random.randint(1, size), size + random.randint(1, size))
-        t = encode_deltas(l)
-        l2 = decode_deltas(*t)
-        # Carry both lists in the assertion message so that a failure
-        # actually shows them.
-        assert l == l2, (l, l2)
-def pickle_efficiency():
-    """Print the pickle size of plain lists next to their Rice-coded form."""
-    import pickle
-    import random
-    sizes = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]
-    elt_ranges = [10, 20, 50, 100, 200, 500, 1000]
-    for m in [4, 8, 12]:
-        for size in sizes:
-            for elt_range in elt_ranges:
-                l = [random.randint(1, elt_range) for i in range(size)]
-                raw_len = len(pickle.dumps(l, 1))
-                enc_len = len(pickle.dumps(encode(m, l), 1))
-                if raw_len > enc_len:
-                    verdict = "win"
-                else:
-                    verdict = "lose"
-                print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
-                print "%5d %5d" % (raw_len, enc_len), verdict
-# Running the module as a script executes the self-test.
-if __name__ == "__main__":
-    test()
--- a/src/Products/ZCTextIndex/SETUP.cfg
+++ b/src/Products/ZCTextIndex/SETUP.cfg
-<extension okascore>
-  source okascore.c
-</extension>
-<extension stopper>
-  source stopper.c
-</extension>
--- a/src/Products/ZCTextIndex/SetOps.py
+++ b/src/Products/ZCTextIndex/SetOps.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE
-#
-##############################################################################
-"""SetOps -- Weighted intersections and unions applied to many inputs."""
-from BTrees.IIBTree import IIBucket
-from BTrees.IIBTree import weightedIntersection
-from BTrees.IIBTree import weightedUnion
-from Products.ZCTextIndex.NBest import NBest
-def mass_weightedIntersection(L):
-    "A list of (mapping, weight) pairs -> their weightedIntersection IIBucket."
-    # Pairs whose mapping is None are dropped before intersecting.
-    pairs = [(mapping, weight) for (mapping, weight) in L
-             if mapping is not None]
-    if len(pairs) < 2:
-        return _trivial(pairs)
-    # Intersect with smallest first.  We expect the input maps to be
-    # IIBuckets, so taking their lengths is cheap
-    # (len(Bucket) is fast; len(BTree) is slow).
-    pairs.sort(key=lambda pair: len(pair[0]))
-    (first, w1), (second, w2) = pairs[:2]
-    unused, result = weightedIntersection(first, second, w1, w2)
-    # The running result already carries its weights, hence the weight 1.
-    for mapping, weight in pairs[2:]:
-        unused, result = weightedIntersection(result, mapping, 1, weight)
-    return result
-def mass_weightedUnion(L):
-    "A list of (mapping, weight) pairs -> their weightedUnion IIBucket."
-    if len(L) < 2:
-        return _trivial(L)
-    # Balance unions as closely as possible, smallest to largest.
-    queue = NBest(len(L))
-    for mapping, weight in L:
-        queue.add((mapping, weight), len(mapping))
-    while len(queue) > 1:
-        # Merge the two smallest so far, and add back to the queue.
-        (first, w1), size1 = queue.pop_smallest()
-        (second, w2), size2 = queue.pop_smallest()
-        unused, merged = weightedUnion(first, second, w1, w2)
-        # The merged mapping already carries its weights, hence weight 1.
-        queue.add((merged, 1), len(merged))
-    (result, weight), size = queue.pop_smallest()
-    return result
-def _trivial(L):
-    # L is empty or has only one (mapping, weight) pair.  An empty L gives
-    # a new empty IIBucket; a single pair gives its mapping, multiplied by
-    # its weight when that weight is not 1.
-    assert len(L) <= 1
-    if L:
-        result, weight = L[0]
-        if weight != 1:
-            # Union with an empty bucket of weight 0 scales the mapping.
-            unused, result = weightedUnion(IIBucket(), result, 0, weight)
-        return result
-    return IIBucket()
--- a/src/Products/ZCTextIndex/Setup
+++ b/src/Products/ZCTextIndex/Setup
-*shared*
-stopper stopper.c
-okascore okascore.c
--- a/src/Products/ZCTextIndex/StopDict.py
+++ b/src/Products/ZCTextIndex/StopDict.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Provide a default list of stop words for the index.
-The specific splitter and lexicon are customizable, but the default
-ZCTextIndex should do something useful.
-"""
-def get_stopdict():
-    """Return a dictionary of stopwords.
-    The module-level dictionary itself is returned, not a copy; its keys
-    are the stopwords and every value is None.
-    """
-    return _dict
-# This list of English stopwords comes from Lucene
-_words = [
-    "a", "and", "are", "as", "at", "be", "but", "by",
-    "for", "if", "in", "into", "is", "it",
-    "no", "not", "of", "on", "or", "such",
-    "that", "the", "their", "then", "there", "these",
-    "they", "this", "to", "was", "will", "with"
-]
-# Stopword lookup table: every word maps to None.  dict.fromkeys builds it
-# without leaving a stray loop variable in the module namespace.
-_dict = dict.fromkeys(_words)
--- a/src/Products/ZCTextIndex/WidCode.py
+++ b/src/Products/ZCTextIndex/WidCode.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
-# for smaller ints.  This is intended for lists of word ids (wids).  The
-# ordinary string .find() method can be used to find the encoded form of a
-# desired wid-string in an encoded wid-string.  As in UTF-8, the initial byte
-# of an encoding can't appear in the interior of an encoding, so find() can't
-# be fooled into starting a match "in the middle" of an encoding. Unlike
-# UTF-8, the initial byte does not tell you how many continuation bytes
-# follow; and there's no ASCII superset property.
-# Details:
-#
-# + Only the first byte of an encoding has the sign bit set.
-#
-# + The first byte has 7 bits of data.
-#
-# + Bytes beyond the first in an encoding have the sign bit clear, followed
-#   by 7 bits of data.
-#
-# + The first byte doesn't tell you how many continuation bytes are
-#   following.  You can tell by searching for the next byte with the
-#   high bit set (or the end of the string).
-#
-# The int to be encoded can contain no more than 28 bits.
-#
-# If it contains no more than 7 bits, 0abcdefg, the encoding is
-#     1abcdefg
-#
-# If it contains 8 thru 14 bits,
-#     00abcdef ghijkLmn
-# the encoding is
-#     1abcdefg 0hijkLmn
-#
-# Static tables _encoding and _decoding capture all encodes and decodes for
-# 14 or fewer bits.
-#
-# If it contains 15 thru 21 bits,
-#    000abcde fghijkLm nopqrstu
-# the encoding is
-#    1abcdefg 0hijkLmn 0opqrstu
-#
-# If it contains 22 thru 28 bits,
-#    0000abcd efghijkL mnopqrst uvwxyzAB
-# the encoding is
-#    1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB
-# Sanity checks for the limits used below: 0x4000 (2**14) bounds the static
-# tables and 0x10000000 (2**28) bounds the largest encodable wid.
-assert 0x80**2 == 0x4000
-assert 0x80**4 == 0x10000000
-import re
-def encode(wids):
-    # Encode a list of wids as a string.  Wids below the table size use the
-    # precomputed table; larger ones are encoded by _encode().
-    table = _encoding
-    limit = len(table)
-    pieces = []
-    for wid in wids:
-        piece = wid < limit and table[wid]
-        if not piece:
-            piece = _encode(wid)
-        pieces.append(piece)
-    return "".join(pieces)
-_encoding = [None] * 0x4000 # Filled later, and converted to a tuple
-def _encode(w):
-    # Encode a wid of 15 to 28 bits as 3 or 4 characters: 7 data bits per
-    # character, with the high bit set on the first character only.
-    assert 0x4000 <= w < 0x10000000
-    rest, low = divmod(w, 0x80)
-    high, mid = divmod(rest, 0x80)
-    tail = chr(mid) + chr(low)
-    if high < 0x80:    # no more than 21 data bits
-        return chr(high + 0x80) + tail
-    top, high = divmod(high, 0x80)
-    assert top < 0x80, (w, top, high, tail)  # else more than 28 data bits
-    return chr(top + 0x80) + chr(high) + tail
-# One encoded wid: a character with the high bit set, followed by any
-# number of characters with the high bit clear.
-_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
-def decode(code):
-    # Decode a string into a list of wids.
-    lookup = _decoding.get
-    wids = []
-    for piece in _prog.findall(code):
-        wid = lookup(piece)
-        # Obscure:  a table miss gives None, but the key '\x80' gives 0,
-        # which is false as well, so _decode('\x80') is called anyway.
-        if not wid:
-            wid = _decode(piece)
-        wids.append(wid)
-    return wids
-_decoding = {} # Filled later
-def _decode(s):
-    # Decode one 3- or 4-character encoding (or the special case '\x80').
-    if s == '\x80':
-        # See comment in decode().  This is here to allow a trick to work.
-        return 0
-    if len(s) == 3:
-        lead, mid, low = [ord(ch) for ch in s]
-        assert lead & 0x80 == 0x80 and not mid & 0x80 and not low & 0x80
-        return ((lead & 0x7F) << 14) | (mid << 7) | low
-    assert len(s) == 4, repr(s)
-    lead, high, mid, low = [ord(ch) for ch in s]
-    assert (lead & 0x80 == 0x80 and not high & 0x80
-            and not mid & 0x80 and not low & 0x80)
-    return ((lead & 0x7F) << 21) | (high << 14) | (mid << 7) | low
-def _fill():
-    # Precompute the encodings of all wids below 0x4000 and the matching
-    # decodings, then freeze the encoding table into a tuple.
-    global _encoding
-    table = _encoding
-    for wid in range(0x4000):
-        if wid < 0x80:
-            # 7 bits or fewer: a single character with the high bit set
-            code = chr(wid + 0x80)
-        else:
-            # 8 thru 14 bits: leading character plus one continuation
-            hi, lo = divmod(wid, 0x80)
-            code = chr(hi + 0x80) + chr(lo)
-        table[wid] = code
-        _decoding[code] = wid
-    _encoding = tuple(table)
-_fill()
-def test():
-    # Round-trip every wid below 2**20 through encode() and decode(),
-    # printing a progress marker every 1000 wids.
-    for wid in range(2**20):
-        if not wid % 1000:
-            print wid
-        wids = [wid]
-        code = encode(wids)
-        assert decode(code) == wids, (wids, code, decode(code))
-# Running the module as a script executes the self-test.
-if __name__ == "__main__":
-    test()
--- a/src/Products/ZCTextIndex/ZCTextIndex.py
+++ b/src/Products/ZCTextIndex/ZCTextIndex.py
--- a/src/Products/ZCTextIndex/__init__.py
+++ b/src/Products/ZCTextIndex/__init__.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE
-#
-##############################################################################
-"""ZCatalog Text Index
-Plugin text index for ZCatalog.
-"""
-from PipelineFactory import element_factory
-from Products.ZCTextIndex import ZCTextIndex, HTMLSplitter
-def initialize(context):
-    # Register the text index class, the lexicon class and the help topics
-    # with the given context.
-    # The index: its constructors are the add form, the add method and the
-    # getIndexTypes helper used by that form.
-    # NOTE(review): visibility=None presumably keeps the index out of the
-    # generic add list -- confirm against registerClass.
-    context.registerClass(
-        ZCTextIndex.ZCTextIndex,
-        permission = 'Add Pluggable Index',
-        constructors = (ZCTextIndex.manage_addZCTextIndexForm,
-                        ZCTextIndex.manage_addZCTextIndex,
-                        getIndexTypes),
-        icon='www/index.gif',
-        visibility=None
-    )
-    # The lexicon: its add form needs the two pipeline element helpers.
-    context.registerClass(
-        ZCTextIndex.PLexicon,
-        permission = 'Add Vocabularies',
-        constructors = (ZCTextIndex.manage_addLexiconForm,
-                        ZCTextIndex.manage_addLexicon,
-                        getElementGroups, getElementNames),
-        icon='www/lexicon.gif'
-    )
-    context.registerHelp()
-    context.registerHelpTitle("Zope Help")
-## Functions below are for use in the ZMI constructor forms ##
-# They are registered as constructors in initialize(); `self` is not used.
-def getElementGroups(self):
-    # Groups known to the pipeline element factory.
-    return element_factory.getFactoryGroups()
-def getElementNames(self, group):
-    # Names of the factories registered for the given group.
-    return element_factory.getFactoryNames(group)
-def getIndexTypes(self):
-    # Keys of the ZCTextIndex.index_types mapping.
-    return ZCTextIndex.index_types.keys()
-## Allow relevant exceptions to be caught in untrusted code
-# Each level of the dotted path down to the two exception classes is
-# declared public.
-from AccessControl import ModuleSecurityInfo
-ModuleSecurityInfo('Products').declarePublic('ZCTextIndex')
-ModuleSecurityInfo('Products.ZCTextIndex').declarePublic('ParseTree')
-ModuleSecurityInfo('Products.ZCTextIndex.ParseTree').declarePublic('QueryError')
-ModuleSecurityInfo('Products.ZCTextIndex.ParseTree').declarePublic('ParseError')
--- a/src/Products/ZCTextIndex/dtml/addLexicon.dtml
+++ b/src/Products/ZCTextIndex/dtml/addLexicon.dtml
-<dtml-var manage_page_header>
-<dtml-var "manage_form_title(this(), _,
-           form_title='Add ZCTextIndex Lexicon',
-           help_product='ZCTextIndex',
-           help_topic='Lexicon_Add.stx'
-	   )">
-<p class="form-help">
-  A ZCTextIndex Lexicon processes and stores the words of documents indexed
-  with a ZCTextIndex. Multiple ZCTextIndexes can share the same lexicon.
-</p>
-<form action="manage_addLexicon" method="POST">
-<table cellspacing="0" cellpadding="2" border="0">
-  <tr>
-    <td align="left" valign="top">
-    <div class="form-label">
-    Id
-    </div>
-    </td>
-    <td align="left" valign="top">
-    <input type="text" name="id" size="40" />
-    </td>
-  </tr>
-  <tr>
-    <td align="left" valign="top">
-    <div class="form-optional">
-    Title
-    </div>
-    </td>
-    <td align="left" valign="top">
-    <input type="text" name="title" size="40" />
-    </td>
-  </tr>
-  <dtml-in name="getElementGroups" prefix="group">
-    <dtml-let elements="getElementNames(group_item)">
-      <tr>
-        <td align="left" valign="top">
-          <div class="form-label">&dtml-group_item;</div>
-        </td>
-        <td align="left" valign="top">
-          <input type="hidden" name="elements.group:records" 
-                 value="&dtml-group_item;" />
-          <dtml-if expr="_.len(elements) > 1">
-            <select name="elements.name:records">
-              <dtml-in name="elements">
-                <option value="&dtml-sequence-item;"
-                >&dtml-sequence-item;</option>
-              </dtml-in>
-            </select>
-          <dtml-else>
-            <input type="checkbox" name="elements.name:records" 
-                   value="<dtml-var expr="elements[0]" html_quote>" checked />
-          </dtml-if>
-        </td>
-      </tr>
-    </dtml-let>
-  </dtml-in>
-  <tr>
-    <td align="left" valign="top">
-    </td>
-    <td align="left" valign="top">
-    <div class="form-element">
-    <input class="form-element" type="submit" name="submit" 
-     value=" Add " /> 
-    </div>
-    </td>
-  </tr>
-</table>
-</form>
-<dtml-var manage_page_footer>
--- a/src/Products/ZCTextIndex/dtml/addZCTextIndex.dtml
+++ b/src/Products/ZCTextIndex/dtml/addZCTextIndex.dtml
-<dtml-var manage_page_header>
-<dtml-var "manage_form_title(this(), _,
-           form_title='Add ZCTextIndex',
-           help_product='ZCTextIndex',
-           help_topic='ZCTextIndex_Add.stx'
-	   )">
-<p class="form-help">
-<strong>Text Indexes</strong> break text up into individual words, and 
-are often referred to as full-text indexes. Text indexes 
-sort results by score, meaning they return hits in order 
-from the most relevant to the least relevant.
-</p>
-<form action="manage_addZCTextIndex" method="post"
-      enctype="multipart/form-data">
-<table cellspacing="0" cellpadding="2" border="0">
-  <tr>
-    <td align="left" valign="top">
-    <div class="form-label">
-    Id
-    </div>
-    </td>
-    <td align="left" valign="top">
-    <input type="text" name="id" size="40" />
-    </td>
-  </tr>
-  <tr>
-    <td align="left" valign="top">
-    <div class="form-label">
-    Indexed attributes
-    </div></td>
-    <td align="left" valign="top">
-    <input type="text" name="extra.doc_attr:record" size="40" />
-    <em>attribute1,attribute2,...</em> or leave empty
-    </td>
-  </tr>
-  <tr>
-    <td align="left" valign="top">
-    <div class="form-label">
-    Ranking Strategy
-    </div>
-    </td>
-    <td align="left" valign="top">
-      <select name="extra.index_type:record">
-        <dtml-in name="getIndexTypes">
-          <option value="&dtml-sequence-item;">&dtml-sequence-item;</option>
-        </dtml-in>
-      </select>        
-    </td>
-  </tr>
-  <tr>
-    <td align="left" valign="top">
-    <div class="form-label">
-    Lexicon
-    </div></td>
-    <td>
-    <dtml-in expr="superValues('ZCTextIndex Lexicon')">
-      <dtml-if sequence-start>
-        <select name="extra.lexicon_id:record">
-      </dtml-if>
-      <option value="&dtml-id;">
-        &dtml-id; <dtml-var name="title" fmt="(%s)" null html_quote>
-      </option>
-      <dtml-if sequence-end>
-        </select>
-      </dtml-if>
-    <dtml-else>
-      <em>You must create a ZCTextIndex Lexicon first.</em>
-    </dtml-in>
-    </td> 
-  </tr>
-  <tr>
-    <td align="left" valign="top">
-    </td>
-    <td align="left" valign="top">
-    <div class="form-element">
-    <input class="form-element" type="submit" name="submit" 
-     value=" Add " /> 
-    </div>
-    </td>
-  </tr>
-</table>
-</form>
-<dtml-var manage_page_footer>
--- a/src/Products/ZCTextIndex/dtml/manageLexicon.dtml
+++ b/src/Products/ZCTextIndex/dtml/manageLexicon.dtml
-<dtml-var manage_page_header>
-<dtml-var manage_tabs>
-<p class="form-help">
-  The lexicon processes and stores the words found in objects indexed by one 
-  or more ZCTextIndexes.
-</p>
-<p class="section-bar">
-  <span class="form-label">Input Pipeline Stages</span>
-</p>
-<p class="form-help">
-  Text indexed through this lexicon is processed by the following pipeline 
-  stages
-</p>
-<ol class="form-help">
-  <dtml-in name="getPipelineNames">
-    <li>&dtml-sequence-item;</li>
-  </dtml-in>
-</ol>
-<dtml-var manage_page_footer>
--- a/src/Products/ZCTextIndex/dtml/manageZCTextIndex.dtml
+++ b/src/Products/ZCTextIndex/dtml/manageZCTextIndex.dtml
-<dtml-var manage_page_header>
-<dtml-var manage_tabs>
-<p class="form-help">
-  Name(s) of attribute(s) indexed: 
-  <em><dtml-var "', '.join(getIndexSourceNames())"></em>
-</p>
-<p class="form-help">
-  Index type: 
-  <em>&dtml-getIndexType;</em>
-</p>
-<p class="form-help">
-  ZCTextIndex Lexicon used: 
-  <dtml-if getLexiconURL>
-    <a href="&dtml-getLexiconURL;/manage_main"
-    >&dtml-getLexiconURL;</a>
-  <dtml-else>
-    <em>(Lexicon Not Found)</em>
-  </dtml-if>
-</p>
-<p class="form-help">
-  <em>Note:</em> The lexicon assigned to the index cannot be changed. To replace
-  the existing lexicon, create a new lexicon in the same place and clear the
-  index. This will make the index use the replacement lexicon.
-</p>
-<dtml-var manage_page_footer>
--- a/src/Products/ZCTextIndex/dtml/queryLexicon.dtml
+++ b/src/Products/ZCTextIndex/dtml/queryLexicon.dtml
-<dtml-var manage_page_header>
-<dtml-var manage_tabs>
-<p class="form-help">
-  Browse the words in the lexicon or enter the word(s) you are interested in
-  below. Globbing characters (*, ?) are supported
-</p>
-<dtml-let words_str="' '.join(REQUEST.get('words',[]))">
-  <form action="&dtml-URL;">
-    <p class="form-element">
-      <span class="form-label">Word(s)</span>
-      <input name="words:tokens" size="20"  value="&dtml-words_str;" />
-      <input type="submit" value="Query" />
-      <span class="form-label">&nbsp;Output Columns:</span>
-      <input name="cols:int" size="2" value="&dtml-cols;" />
-      <span class="form-label">&nbsp;Rows:</span>
-      <input name="rows:int" size="2" value="&dtml-rows;" />
-    </p>
-  </form>
-  <hr />
-  <form action="&dtml-URL;">
-    <table width="100%"  cellpadding="2" cellspacing="0" border="0">
-    <tr class="section-bar">
-      <td><span class="form-label">
-        &dtml-word_count; Words Found<dtml-if word_count>,
-        Displaying &dtml-start_word;-&dtml-end_word;
-        </dtml-if>
-        <dtml-if expr="page_count > 0">
-          </span></td>
-          <td align="right"><span class="form-label">
-            Page:
-            <select name="page:int" onchange="this.form.submit()">
-              <dtml-in name="page_range" prefix="page">
-                <option value="&dtml-page_item;"
-                <dtml-if expr="page == page_item">
-                  selected
-                </dtml-if>
-                >
-                  <dtml-var expr="page_item+1">
-                </option>
-              </dtml-in>
-            </select>
-            of &dtml-page_count;
-            <input type="submit" value="Go" />
-            <input type="hidden" name="cols:int" value="&dtml-cols;" />
-            <input type="hidden" name="rows:int" value="&dtml-rows;" />
-            <input type="hidden" name="words:tokens" value="&dtml-words_str;" />
-        </dtml-if>
-        </span></td>
-    </tr>
-    </table>
-  </form>
-</dtml-let>
-<dtml-if name="page_columns">
-  <table width="100%" cellpadding="0" cellspacing="10" border="0">
-    <tr>
-    <dtml-in name="page_columns" prefix="column">
-      <td align="left" valign="top">
-        <dtml-var expr="'<br />'.join(column_item)">
-      </td>
-    </dtml-in>
-    </tr>
-  </table>
-</dtml-if>
-<dtml-var manage_page_footer>
--- a/src/Products/ZCTextIndex/help/Lexicon_Add.stx
+++ b/src/Products/ZCTextIndex/help/Lexicon_Add.stx
-ZCTextIndex Lexicon - Add: Create a new ZCTextIndex Lexicon
-    Description
-        This view allows you to create a new ZCTextIndex Lexicon object.
-        ZCTextIndex Lexicons store the words indexed by ZCTextIndexes in a
-        ZCatalog.
-    Controls
-        'Id' -- Allows you to specify the id of the ZCTextIndex Lexicon.
-        'Title' -- Allows you to specify the title of the ZCTextIndex Lexicon.
-        Pipeline Stages
-            The remaining controls allow you to select the desired processing
-            of text to index by selecting pipeline stages.
-            The default available stages are:
-            - **Word Splitter** This is the only mandatory stage. The word
-              splitter breaks the text up into a list of words. Included is a
-              simple whitespace splitter, and a splitter that removes HTML
-              tags. The HTML aware splitter gives best results when all of
-              the incoming content to index is HTML.
-            - **Stop Words** To conserve space in the vocabulary, and possibly
-              increase performance, you can select a stop word remover which
-              subtracts very common or single letter words from the Lexicon.
-              Bear in mind that you will not be able to search on removed stop
-              words, and they will also be removed from queries passed to
-              search ZCTextIndexes using the Lexicon.
-            - **Case Normalizer** The case normalizer removes case information
-              from the words in the Lexicon. If case-sensitive searching is
-              desired, then omit this element from the pipeline.
--- a/src/Products/ZCTextIndex/help/ZCTextIndex_Add.stx
+++ b/src/Products/ZCTextIndex/help/ZCTextIndex_Add.stx
-ZCTextIndex  Add: Create a new ZCTextIndex
-    Description
-        A ZCTextIndex is an index for performing full text searches over
-        bodies of text. It includes the following features:
-        - Boolean query operators with parenthetical grouping
-        - Globbing (partial word) and phrase matching
-        - Two selectable relevance scoring algorithms
-        ZCTextIndex is designed as a replacement for standard TextIndex, and
-        has several advantages over it.
-    Controls
-        'Id' -- The id of the ZCTextIndex, must be unique for this ZCatalog.
-        'Field Name' -- The name of the field (object attribute) to be indexed.
-        'Ranking Strategy'
-        - **Okapi BM25 Rank** A relevance scoring technique that seems to
-          work well when the document text is considerably longer than the
-          query string, which is often the case with user specified query
-          strings.
-        - **Cosine Measure** A relevance scoring technique derived from the
-          "*Managing Gigabytes*":http://www.cs.mu.oz.au/mg/ book. It seems
-          to work best when the queries are similar in size and content to
-          the text they are searching.
-        'Lexicon' -- The ZCTextIndex Lexicon to be used by this ZCTextIndex.
-                     Lexicons process and store the words from the text and
-                     help in processing queries. You must define a ZCTextIndex
-                     Lexicon before you can create a ZCTextIndex. Several
-                     ZCTextIndexes can share the same Lexicon if desired.
--- a/src/Products/ZCTextIndex/interfaces.py
+++ b/src/Products/ZCTextIndex/interfaces.py
--- a/src/Products/ZCTextIndex/okascore.c
+++ b/src/Products/ZCTextIndex/okascore.c
-/*****************************************************************************
-  Copyright (c) 2002 Zope Foundation and Contributors.
-  All Rights Reserved.
-  This software is subject to the provisions of the Zope Public License,
-  Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-  FOR A PARTICULAR PURPOSE
- ****************************************************************************/
-/*	okascore.c
- *
- *	The inner scoring loop of OkapiIndex._search_wids() coded in C.
- *
- * Example from an indexed Python-Dev archive, where "python" shows up in all
- * but 2 of the 19,058 messages.  With the Python scoring loop,
- *
- *      query: python
- *      # results: 10 of 19056 in 534.77 ms
- *      query: python
- *      # results: 10 of 19056 in 277.52 ms
- *
- * The first timing is cold, the second timing from an immediate repeat of
- * the same query.  With the scoring loop here in C:
- *
- *     query: python
- *     # results: 10 of 19056 in 380.74 ms  -- 40% speedup
- *     query: python
- *     # results: 10 of 19056 in 118.96 ms  -- 133% speedup
- */
-#include "Python.h"
-#define K1 1.2
-#define B  0.75
-#ifndef PyTuple_CheckExact
-#define PyTuple_CheckExact PyTuple_Check
-#endif
-/* score(result, d2fitems, d2len, idf, meandoclen)
- *
- * For every (d, f) pair produced by d2fitems, look up d in d2len, compute
- * the scaled term score and store it into result[d].
- *
- * Returns None on success.  Returns NULL with an exception set if the
- * arguments cannot be parsed, d2fitems has no length, an item is not a
- * 2-tuple, a frequency or document length is not convertible to a C long,
- * a lookup in d2len fails, or storing into result fails.
- */
-static PyObject *
-score(PyObject *self, PyObject *args)
-{
-	/* Believe it or not, floating these common subexpressions "by hand"
-	   gets better code out of MSVC 6. */
-	const double B_FROM1 = 1.0 - B;
-	const double K1_PLUS1 = K1 + 1.0;
-	/* Inputs */
-	PyObject *result;	/* IIBucket result, maps d to score */
-	PyObject *d2fitems;	/* ._wordinfo[t].items(), maps d to f(d, t) */
-	PyObject *d2len;	/* ._docweight, maps d to # words in d */
-	double idf;		/* inverse doc frequency of t */
-	double meandoclen;	/* average number of words in a doc */
-	int n, i;
-	if (!PyArg_ParseTuple(args, "OOOdd:score", &result, &d2fitems, &d2len,
-						   &idf, &meandoclen))
-		return NULL;
-	idf *= 1024.0;	/* float out part of the scaled_int computation */
-	n = PyObject_Length(d2fitems);
-	if (n < 0)
-		/* Length lookup failed; do not return None with the
-		   exception still pending. */
-		return NULL;
-	for (i = 0; i < n; ++i) {
-		PyObject *d_and_f;	/* d2f[i], a (d, f) pair */
-		PyObject *d;
-		long fval;		/* f as a C long, before widening */
-		double f;
-		PyObject *doclen;	/* ._docweight[d] */
-		long doclenval;		/* doclen as a C long */
-		double lenweight;
-		double tf;
-		PyObject *scaled_int;
-		int status;
-		d_and_f = PySequence_GetItem(d2fitems, i);
-		if (d_and_f == NULL)
-			return NULL;
-		if (!(PyTuple_CheckExact(d_and_f) &&
-		      PyTuple_GET_SIZE(d_and_f) == 2)) {
-			PyErr_SetString(PyExc_TypeError,
-				"d2fitems must produce 2-item tuples");
-			Py_DECREF(d_and_f);
-			return NULL;
-		}
-		/* d is borrowed from d_and_f, so d_and_f must stay alive
-		   until after the PyObject_SetItem() call below. */
-		d = PyTuple_GET_ITEM(d_and_f, 0);
-		fval = PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));
-		if (fval == -1 && PyErr_Occurred()) {
-			Py_DECREF(d_and_f);
-			return NULL;
-		}
-		f = (double)fval;
-		doclen = PyObject_GetItem(d2len, d);
-		if (doclen == NULL) {
-			Py_DECREF(d_and_f);
-			return NULL;
-		}
-		/* Use the checked conversion: the unchecked PyInt_AS_LONG
-		   macro reads garbage if doclen is not a plain int. */
-		doclenval = PyInt_AsLong(doclen);
-		if (doclenval == -1 && PyErr_Occurred()) {
-			Py_DECREF(d_and_f);
-			Py_DECREF(doclen);
-			return NULL;
-		}
-		lenweight = B_FROM1 + B * doclenval / meandoclen;
-		tf = f * K1_PLUS1 / (f + K1 * lenweight);
-		scaled_int = PyInt_FromLong((long)(tf * idf + 0.5));
-		if (scaled_int == NULL)
-			status = -1;
-		else
-			status = PyObject_SetItem(result, d, scaled_int);
-		Py_DECREF(d_and_f);
-		Py_DECREF(doclen);
-		Py_XDECREF(scaled_int);
-		if (status < 0)
-			return NULL;
-	}
-	Py_INCREF(Py_None);
-	return Py_None;
-}
-/* Docstring attached to the Python-level score() function. */
-static char score__doc__[] =
-"score(result, d2fitems, d2len, idf, meandoclen)\n"
-"\n"
-"Do the inner scoring loop for an Okapi index.\n";
-/* Method table for the module: a single function, score(), taking
-   positional arguments only.  The {NULL} entry terminates the table. */
-static PyMethodDef okascore_functions[] = {
-	{"score",	   score,	  METH_VARARGS, score__doc__},
-	{NULL}
-};
-/* Module initialisation entry point: registers the "okascore" module with
-   the functions listed in okascore_functions.  The module object returned
-   by Py_InitModule3() is not needed afterwards, so it is not kept. */
-void
-initokascore(void)
-{
-	Py_InitModule3("okascore", okascore_functions,
-		       "inner scoring loop for Okapi rank");
-}
--- a/src/Products/ZCTextIndex/stopper.c
+++ b/src/Products/ZCTextIndex/stopper.c
-/*****************************************************************************
-  Copyright (c) 2002 Zope Foundation and Contributors.
-  All Rights Reserved.
-  This software is subject to the provisions of the Zope Public License,
-  Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-  FOR A PARTICULAR PURPOSE
- ****************************************************************************/
-/*  stopper.c
- *
- *  Fast version of the StopWordRemover object.
- */
-#include "Python.h"
-/* process(dict, seq): build and return a new list holding, in order, every
-   item of seq that is not a key of dict.  Returns NULL with an exception
-   set if the arguments are wrong or a list operation fails. */
-static PyObject *
-stopper_process(PyObject *unused, PyObject *args)
-{
-    PyObject *result = NULL;
-    PyObject *dict;
-    PyObject *seq;
-    int len, i;
-    /* "O!" enforces that argument 1 is a dict; argument 2 is any object. */
-    if (!PyArg_ParseTuple(args, "O!O:process", &PyDict_Type, &dict, &seq))
-        return NULL;
-    /* From here on seq is the object returned by PySequence_Fast(); it is
-       released at the "finally" label on every path. */
-    seq = PySequence_Fast(seq,
-                          "process() requires a sequence as argument 2");
-    if (seq == NULL)
-        return NULL;
-    result = PyList_New(0);
-    if (result == NULL)
-        goto finally;
-#if PY_VERSION_HEX >= 0x02020000
-    /* Only available in Python 2.2 and newer. */
-    len = PySequence_Fast_GET_SIZE(seq);
-#else
-    len = PyObject_Length(seq);
-#endif
-    for (i = 0; i < len; ++i) {
-        /* s is not DECREF'ed here; PyList_Append() takes its own reference. */
-        PyObject *s = PySequence_Fast_GET_ITEM(seq, i);
-        /*
-         * PyDict_GetItem() returns NULL if there isn't a matching
-         * item, but without setting an exception, so this does what
-         * we want.
-         */
-        if (PyDict_GetItem(dict, s) == NULL) {
-            if (PyList_Append(result, s) < 0) {
-                /* Drop the partial list and report the failure. */
-                Py_DECREF(result);
-                result = NULL;
-                goto finally;
-            }
-        }
-    }
- finally:
-    Py_DECREF(seq);
-    return result;
-}
-/* Method table for the module: a single function, process(), taking
-   positional arguments only.  The {NULL} entry terminates the table. */
-static PyMethodDef stopper_functions[] = {
-    {"process", stopper_process, METH_VARARGS,
-     "process(dict, [str, ...]) --> [str, ...]\n"
-     "Remove stop words (the keys of dict) from the input list of strings\n"
-     " to create a new list."},
-    {NULL}
-};
-/* Module initialisation entry point: registers the "stopper" module with
-   the functions listed in stopper_functions. */
-void
-initstopper(void)
-{
-    Py_InitModule3("stopper", stopper_functions,
-                   "Fast StopWordRemover implementation.");
-}
--- a/src/Products/ZCTextIndex/tests/__init__.py
+++ b/src/Products/ZCTextIndex/tests/__init__.py
-##############################################################################
-#
-# Copyright (c) 2002 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE
-#
-##############################################################################
-"""Test package."""
--- a/src/Products/ZCTextIndex/tests/hs-tool.py
+++ b/src/Products/ZCTextIndex/tests/hs-tool.py
-#! /usr/bin/env python
-import cPickle
-import os.path
-import sys
-from hotshot.log import LogReader
-def load_line_info(log):
-    byline = {}
-    prevloc = None
-    for what, place, tdelta in log:
-        if tdelta > 0:
-            t, nhits = byline.get(prevloc, (0, 0))
-            byline[prevloc] = (tdelta + t), (nhits + 1)
-            prevloc = place
-    return byline
-def basename(path, cache={}):
-    try:
-        return cache[path]
-    except KeyError:
-        fn = os.path.split(path)[1]
-        cache[path] = fn
-        return fn
-def print_results(results):
-    for info, place in results:
-        if place is None:
-            # This is the startup time for the profiler, and only
-            # occurs at the very beginning.  Just ignore it, since it
-            # corresponds to frame setup of the outermost call, not
-            # anything that's actually interesting.
-            continue
-        filename, line, funcname = place
-        print '%8d %8d' % info, basename(filename), line
-def annotate_results(results):
-    files = {}
-    for stats, place in results:
-        if not place:
-            continue
-        time, hits = stats
-        file, line, func = place
-        l = files.get(file)
-        if l is None:
-            l = files[file] = []
-        l.append((line, hits, time))
-    order = files.keys()
-    order.sort()
-    for k in order:
-        if os.path.exists(k):
-            v = files[k]
-            v.sort()
-            annotate(k, v)
-def annotate(file, lines):
-    print "-" * 60
-    print file
-    print "-" * 60
-    f = open(file)
-    i = 1
-    match = lines[0][0]
-    for line in f:
-        if match == i:
-            print "%6d %8d " % lines[0][1:], line,
-            del lines[0]
-            if lines:
-                match = lines[0][0]
-            else:
-                match = None
-        else:
-            print " " * 16, line,
-        i += 1
-    print
-def get_cache_name(filename):
-    d, fn = os.path.split(filename)
-    cache_dir = os.path.join(d, '.hs-tool')
-    cache_file = os.path.join(cache_dir, fn)
-    return cache_dir, cache_file
-def cache_results(filename, results):
-    cache_dir, cache_file = get_cache_name(filename)
-    if not os.path.exists(cache_dir):
-        os.mkdir(cache_dir)
-    fp = open(cache_file, 'wb')
-    try:
-        cPickle.dump(results, fp, 1)
-    finally:
-        fp.close()
-def main(filename, annotate):
-    cache_dir, cache_file = get_cache_name(filename)
-    if (  os.path.isfile(cache_file)
-          and os.path.getmtime(cache_file) > os.path.getmtime(filename)):
-        # cached data is up-to-date:
-        fp = open(cache_file, 'rb')
-        results = cPickle.load(fp)
-        fp.close()
-    else:
-        log = LogReader(filename)
-        byline = load_line_info(log)
-        # Sort
-        results = [(v, k) for k, v in byline.items()]
-        results.sort()
-        cache_results(filename, results)
-    if annotate:
-        annotate_results(results)
-    else:
-        print_results(results)
-if __name__ == "__main__":
-    import getopt
-    annotate_p = 0
-    opts, args = getopt.getopt(sys.argv[1:], 'A')
-    for o, v in opts:
-        if o == '-A':
-            annotate_p = 1
-    if args:
-        filename, = args
-    else:
-        filename = "profile.dat"
-    main(filename, annotate_p)
--- a/src/Products/ZCTextIndex/tests/indexhtml.py
+++ b/src/Products/ZCTextIndex/tests/indexhtml.py
-#! /usr/bin/env python
-"""Index a collection of HTML files on the filesystem.
-usage: indexhtml.py [options] dir
-Will create an index of all files in dir or its subdirectories.
-options:
--f data.fs  -- the path to the filestorage datafile
-"""
-# XXX: Products.PluginIndexes.TextIndex no longer exists
-from __future__ import nested_scopes
-import os
-from time import clock
-import ZODB
-from ZODB.FileStorage import FileStorage
-from BTrees.IOBTree import IOBTree
-import transaction
-from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
-from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
-from Products.ZCTextIndex.Lexicon import Lexicon, StopWordRemover
-def make_zc_index():
-    # there's an elaborate dance necessary to construct an index
-    class Struct:
-        pass
-    extra = Struct()
-    extra.doc_attr = "read"
-    extra.lexicon_id = "lexicon"
-    caller = Struct()
-    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
-    return ZCTextIndex("read", extra, caller)
-# XXX make a splitter more like the HTMLSplitter for TextIndex
-# signature is
-# Splitter(string, stop_words, encoding,
-#          singlechar, indexnumbers, casefolding)
-class MySplitter:
-    def __init__(self):
-        self._v_splitter = HTMLWordSplitter()
-    def __call__(self, text, stopdict, *args, **kwargs):
-        words = self._v_splitter._split(text)
-        def lookup(w):
-            return stopdict.get(w, w)
-        return filter(None, map(lookup, words))
-#def make_old_index():
-#    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
-#    from Products.PluginIndexes.TextIndex.Lexicon  import Lexicon
-#    from Products.ZCTextIndex.StopDict import get_stopdict
-#
-#    l = Lexicon(get_stopdict())
-#    l.SplitterFunc = MySplitter()
-#    return TextIndex("read", lexicon=l)
-def main(db, root, dir):
-    rt["index"] = index = INDEX()
-    rt["files"] = paths = IOBTree()
-    transaction.commit()
-    zodb_time = 0.0
-    pack_time = 0.0
-    files = [os.path.join(dir, file) for file in os.listdir(dir)]
-    docid = 0
-    t0 = clock()
-    for file in files:
-        if os.path.isdir(file):
-            files += [os.path.join(file, sub) for sub in os.listdir(file)]
-        else:
-            if not file.endswith(".html"):
-                continue
-            docid += 1
-            if LIMIT is not None and docid > LIMIT:
-                break
-            if VERBOSE:
-                print "%5d" % docid, file
-            f = open(file, "rb")
-            paths[docid] = file
-            index.index_object(docid, f)
-            f.close()
-            if docid % TXN_INTERVAL == 0:
-                z0 = clock()
-                transaction.commit()
-                z1 = clock()
-                zodb_time += z1 - z0
-                if VERBOSE:
-                    print "commit took", z1 - z0, zodb_time
-            if docid % PACK_INTERVAL == 0:
-                p0 = clock()
-                db.pack()
-                p1 = clock()
-                zodb_time += p1 - p0
-                pack_time += p1 - p0
-                if VERBOSE:
-                    print "pack took", p1 - p0, pack_time
-    z0 = clock()
-    transaction.commit()
-    z1 = t1 = clock()
-    total_time = t1 - t0
-    zodb_time += z1 - z0
-    if VERBOSE:
-        print "Total index time", total_time
-        print "Non-pack time", total_time - pack_time
-        print "Non-ZODB time", total_time - zodb_time
-if __name__ == "__main__":
-    import sys
-    import getopt
-    VERBOSE = 0
-    FSPATH = "Data.fs"
-    TXN_INTERVAL = 100
-    PACK_INTERVAL = 500
-    LIMIT = None
-    INDEX = make_zc_index
-    try:
-        opts, args = getopt.getopt(sys.argv[1:], 'vf:t:p:n:T')
-    except getopt.error, msg:
-        print msg
-        print __doc__
-        sys.exit(2)
-    for o, v in opts:
-        if o == '-v':
-            VERBOSE += 1
-        if o == '-f':
-            FSPATH = v
-        if o == '-t':
-            TXN_INTERVAL = int(v)
-        if o == '-p':
-            PACK_INTERVAL = int(v)
-        if o == '-n':
-            LIMIT = int(v)
-#        if o == '-T':
-#            INDEX = make_old_index
-    if len(args) != 1:
-        print "Expected on argument"
-        print __doc__
-        sys.exit(2)
-    dir = args[0]
-    fs = FileStorage(FSPATH)
-    db = ZODB.DB(fs)
-    cn = db.open()
-    rt = cn.root()
-    dir = os.path.join(os.getcwd(), dir)
-    print dir
-    main(db, rt, dir)
-    cn.close()
-    fs.close()
--- a/src/Products/ZCTextIndex/tests/mailtest.py
+++ b/src/Products/ZCTextIndex/tests/mailtest.py
-"""Test an index with a Unix mailbox file.
-usage: python mailtest.py [options] <data.fs>
-options:
-    -v     -- verbose
-    Index Generation
-    -i mailbox
-    -n NNN -- max number of messages to read from mailbox
-    -t NNN -- commit a transaction every NNN messages (default: 1)
-    -p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
-    -p 0   -- don't pack at all
-    -x     -- exclude the message text from the data.fs
-    Queries
-    -q query
-    -b NNN -- return the NNN best matches (default: 10)
-    -c NNN -- context; if -v, show the first NNN lines of results (default: 5)
-The script either indexes or queries depending on whether -q or -i is
-passed as an option.
-For -i mailbox, the script reads mail messages from the mailbox and
-indexes them.  It indexes one message at a time, then commits the
-transaction.
-For -q query, it performs a query on an existing index.
-If both are specified, the index is performed first.
-You can also interact with the index after it is completed. Load the
-index from the database:
-    import ZODB
-    from ZODB.FileStorage import FileStorage
-    fs = FileStorage(<data.fs>)
-    db = ZODB.DB(fs)
-    index = db.open().root()["index"]
-    index.search("python AND unicode")
-"""
-import ZODB
-import ZODB.FileStorage
-import transaction
-from Products.ZCTextIndex.Lexicon import \
-     Lexicon, CaseNormalizer, Splitter, StopWordRemover
-from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
-from BTrees.IOBTree import IOBTree
-from Products.ZCTextIndex.QueryParser import QueryParser
-import sys
-import mailbox
-import time
-def usage(msg):
-    """Print *msg* followed by the module docstring, then exit with status 2."""
-    print msg
-    print __doc__
-    sys.exit(2)
-class Message:
-    total_bytes = 0
-    def __init__(self, msg):
-        subject = msg.getheader('subject', '')
-        author = msg.getheader('from', '')
-        if author:
-            summary = "%s (%s)\n" % (subject, author)
-        else:
-            summary = "%s\n" % subject
-        self.text = summary + msg.fp.read()
-        Message.total_bytes += len(self.text)
-class Extra:
-    # Empty attribute holder; index() uses instances of it for the "extra"
-    # and "caller" arguments it passes to ZCTextIndex.
-    pass
-def index(rt, mboxfile, db, profiler):
-    global NUM
-    idx_time = 0
-    pack_time = 0
-    start_time = time.time()
-    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
-    extra = Extra()
-    extra.lexicon_id = 'lexicon'
-    extra.doc_attr = 'text'
-    extra.index_type = 'Okapi BM25 Rank'
-    caller = Extra()
-    caller.lexicon = lexicon
-    rt["index"] = idx = ZCTextIndex("index", extra, caller)
-    if not EXCLUDE_TEXT:
-        rt["documents"] = docs = IOBTree()
-    else:
-        docs = None
-    transaction.commit()
-    mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
-    if VERBOSE:
-        print "opened", mboxfile
-    if not NUM:
-        NUM = sys.maxint
-    if profiler:
-        itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
-    else:
-        itime, ptime, i = indexmbox(mbox, idx, docs, db)
-    idx_time += itime
-    pack_time += ptime
-    transaction.commit()
-    if PACK_INTERVAL and i % PACK_INTERVAL != 0:
-        if VERBOSE >= 2:
-            print "packing one last time..."
-        p0 = time.clock()
-        db.pack(time.time())
-        p1 = time.clock()
-        if VERBOSE:
-            print "pack took %s sec" % (p1 - p0)
-        pack_time += p1 - p0
-    if VERBOSE:
-        finish_time = time.time()
-        print
-        print "Index time", round(idx_time / 60, 3), "minutes"
-        print "Pack time", round(pack_time / 60, 3), "minutes"
-        print "Index bytes", Message.total_bytes
-        rate = (Message.total_bytes / idx_time) / 1024
-        print "Index rate %.2f KB/sec" % rate
-        print "Indexing began", time.ctime(start_time)
-        print "Indexing ended", time.ctime(finish_time)
-        print "Wall clock minutes", round((finish_time - start_time)/60, 3)
-def indexmbox(mbox, idx, docs, db):
-    """Index messages from *mbox* into *idx* until NUM are done or it runs out.
-
-    Each message is wrapped in a Message and indexed under its 1-based
-    sequence number; unless EXCLUDE_TEXT is set it is also stored in
-    *docs*.  Reads the module globals NUM, VERBOSE, EXCLUDE_TEXT, TXN_SIZE
-    and PACK_INTERVAL.
-
-    Returns (idx_time, pack_time, i): time spent indexing, time spent
-    packing, and the number of messages indexed.
-    """
-    idx_time = 0
-    pack_time = 0
-    i = 0
-    while i < NUM:
-        _msg = mbox.next()
-        # None from the mailbox means there are no more messages.
-        if _msg is None:
-            break
-        i += 1
-        msg = Message(_msg)
-        if VERBOSE >= 2:
-            print "indexing msg", i
-        i0 = time.clock()
-        idx.index_object(i, msg)
-        if not EXCLUDE_TEXT:
-            docs[i] = msg
-        if i % TXN_SIZE == 0:
-            transaction.commit()
-        # The commit above is inside the timed span, so idx_time includes it.
-        i1 = time.clock()
-        idx_time += i1 - i0
-        if VERBOSE and i % 50 == 0:
-            print i, "messages indexed"
-            print "cache size", db.cacheSize()
-        if PACK_INTERVAL and i % PACK_INTERVAL == 0:
-            if VERBOSE >= 2:
-                print "packing..."
-            p0 = time.clock()
-            db.pack(time.time())
-            p1 = time.clock()
-            if VERBOSE:
-                print "pack took %s sec" % (p1 - p0)
-            pack_time += p1 - p0
-    return idx_time, pack_time, i
-def query(rt, query_str, profiler):
-    idx = rt["index"]
-    docs = rt["documents"]
-    start = time.clock()
-    if profiler is None:
-        results, num_results = idx.query(query_str, BEST)
-    else:
-        if WARM_CACHE:
-            print "Warming the cache..."
-            idx.query(query_str, BEST)
-        start = time.clock()
-        results, num_results = profiler.runcall(idx.query, query_str, BEST)
-    elapsed = time.clock() - start
-    print "query:", query_str
-    print "# results:", len(results), "of", num_results, \
-          "in %.2f ms" % (elapsed * 1000)
-    tree = QueryParser(idx.lexicon).parseQuery(query_str)
-    qw = idx.index.query_weight(tree.terms())
-    for docid, score in results:
-        scaled = 100.0 * score / qw
-        print "docid %7d score %6d scaled %5.2f%%" % (docid, score, scaled)
-        if VERBOSE:
-            msg = docs[docid]
-            ctx = msg.text.split("\n", CONTEXT)
-            del ctx[-1]
-            print "-" * 60
-            print "message:"
-            for l in ctx:
-                print l
-            print "-" * 60
-def main(fs_path, mbox_path, query_str, profiler):
-    f = ZODB.FileStorage.FileStorage(fs_path)
-    db = ZODB.DB(f, cache_size=CACHE_SIZE)
-    cn = db.open()
-    rt = cn.root()
-    if mbox_path is not None:
-        index(rt, mbox_path, db, profiler)
-    if query_str is not None:
-        query(rt, query_str, profiler)
-    cn.close()
-    db.close()
-    f.close()
-if __name__ == "__main__":
-    import getopt
-    NUM = 0
-    VERBOSE = 0
-    PACK_INTERVAL = 500
-    EXCLUDE_TEXT = 0
-    CACHE_SIZE = 10000
-    TXN_SIZE = 1
-    BEST = 10
-    CONTEXT = 5
-    WARM_CACHE = 0
-    query_str = None
-    mbox_path = None
-    profile = None
-    old_profile = None
-    try:
-        opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:c:xt:w',
-                                   ['profile=', 'old-profile='])
-    except getopt.error, msg:
-        usage(msg)
-    if len(args) != 1:
-        usage("exactly 1 filename argument required")
-    for o, v in opts:
-        if o == '-n':
-            NUM = int(v)
-        elif o == '-v':
-            VERBOSE += 1
-        elif o == '-p':
-            PACK_INTERVAL = int(v)
-        elif o == '-q':
-            query_str = v
-        elif o == '-i':
-            mbox_path = v
-        elif o == '-b':
-            BEST = int(v)
-        elif o == '-x':
-            EXCLUDE_TEXT = 1
-        elif o == '-t':
-            TXN_SIZE = int(v)
-        elif o == '-c':
-            CONTEXT = int(v)
-        elif o == '-w':
-            WARM_CACHE = 1
-        elif o == '--profile':
-            profile = v
-        elif o == '--old-profile':
-            old_profile = v
-    fs_path, = args
-    if profile:
-        import hotshot
-        profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
-    elif old_profile:
-        import profile
-        profiler = profile.Profile()
-    else:
-        profiler = None
-    main(fs_path, mbox_path, query_str, profiler)
-    if profile:
-        profiler.close()
-    elif old_profile:
-        import pstats
-        profiler.dump_stats(old_profile)
-        stats = pstats.Stats(old_profile)
-        stats.strip_dirs().sort_stats('time').print_stats(20)
--- a/src/Products/ZCTextIndex/tests/mhindex.py
+++ b/src/Products/ZCTextIndex/tests/mhindex.py
--- a/src/Products/ZCTextIndex/tests/python.txt
+++ b/src/Products/ZCTextIndex/tests/python.txt
--- a/src/Products/ZCTextIndex/tests/queryhtml.py
+++ b/src/Products/ZCTextIndex/tests/queryhtml.py
--- a/src/Products/ZCTextIndex/tests/testHTMLSplitter.py
+++ b/src/Products/ZCTextIndex/tests/testHTMLSplitter.py
-##############################################################################
-#
-# Copyright (c) 2009 Zope Foundation and Contributors.
-# All Rights Reserved.
-#
-# This software is subject to the provisions of the Zope Public License,
-# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
-# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
-# FOR A PARTICULAR PURPOSE.
-#
-##############################################################################
-"""Test zope.index.text.htmlsplitter
-"""
-import unittest
-class HTMLWordSplitterTests(unittest.TestCase):
-    # Subclasses must define '_getBTreesFamily'
-    def _getTargetClass(self):
-        from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
-        return HTMLWordSplitter
-    def _makeOne(self):
-        return self._getTargetClass()()
-    def test_class_conforms_to_ISplitter(self):
-        from zope.interface.verify import verifyClass
-        from Products.ZCTextIndex.interfaces import ISplitter
-        verifyClass(ISplitter, self._getTargetClass())
-    def test_instance_conforms_to_ISplitter(self):
-        from zope.interface.verify import verifyObject
-        from Products.ZCTextIndex.interfaces import ISplitter
-        verifyObject(ISplitter, self._makeOne())
-    def test_process_empty_string(self):
-        splitter = self._makeOne()
-        self.assertEqual(splitter.process(['']), [])
-    def test_process_no_markup(self):
-        splitter = self._makeOne()
-        self.assertEqual(splitter.process(['abc def']), ['abc', 'def'])
-    def test_process_w_markup(self):
-        splitter = self._makeOne()
-        self.assertEqual(splitter.process(['<h1>abc</h1> &nbsp; <p>def</p>']),
-                         ['abc', 'def'])
-    def test_process_no_markup_w_glob(self):
-        splitter = self._makeOne()
-        self.assertEqual(splitter.process(['abc?def hij*klm nop* qrs?']),
-                         ['abc', 'def', 'hij', 'klm', 'nop', 'qrs'])
-    def test_processGlob_empty_string(self):
-        splitter = self._makeOne()
-        self.assertEqual(splitter.processGlob(['']), [])
-    def test_processGlob_no_markup_no_glob(self):
-        splitter = self._makeOne()
-        self.assertEqual(splitter.processGlob(['abc def']), ['abc', 'def'])
-    def test_processGlob_w_markup_no_glob(self):
-        splitter = self._makeOne()
-        self.assertEqual(splitter.processGlob(['<h1>abc</h1> &nbsp; '
-                                               '<p>def</p>']),
-                         ['abc', 'def'])
-    def test_processGlob_no_markup_w_glob(self):
-        splitter = self._makeOne()
-        self.assertEqual(splitter.processGlob(['abc?def hij*klm nop* qrs?']),
-                         ['abc?def', 'hij*klm', 'nop*', 'qrs?'])
-def test_suite():
-    return unittest.TestSuite((
-        unittest.makeSuite(HTMLWordSplitterTests),
-    ))
--- a/src/Products/ZCTextIndex/tests/testIndex.py
+++ b/src/Products/ZCTextIndex/tests/testIndex.py
--- a/src/Products/ZCTextIndex/tests/testLexicon.py
+++ b/src/Products/ZCTextIndex/tests/testLexicon.py
--- a/src/Products/ZCTextIndex/tests/testNBest.py
+++ b/src/Products/ZCTextIndex/tests/testNBest.py
--- a/src/Products/ZCTextIndex/tests/testParseTree.py
+++ b/src/Products/ZCTextIndex/tests/testParseTree.py
--- a/src/Products/ZCTextIndex/tests/testPipelineFactory.py
+++ b/src/Products/ZCTextIndex/tests/testPipelineFactory.py
--- a/src/Products/ZCTextIndex/tests/testQueryEngine.py
+++ b/src/Products/ZCTextIndex/tests/testQueryEngine.py
--- a/src/Products/ZCTextIndex/tests/testQueryParser.py
+++ b/src/Products/ZCTextIndex/tests/testQueryParser.py
--- a/src/Products/ZCTextIndex/tests/testSetOps.py
+++ b/src/Products/ZCTextIndex/tests/testSetOps.py
--- a/src/Products/ZCTextIndex/tests/testStopper.py
+++ b/src/Products/ZCTextIndex/tests/testStopper.py
--- a/src/Products/ZCTextIndex/tests/testZCTextIndex.py
+++ b/src/Products/ZCTextIndex/tests/testZCTextIndex.py
--- a/src/Products/ZCTextIndex/tests/wordstats.py
+++ b/src/Products/ZCTextIndex/tests/wordstats.py
--- a/src/Products/ZCTextIndex/www/index.gif
+++ b/src/Products/ZCTextIndex/www/index.gif
--- a/src/Products/ZCTextIndex/www/lexicon.gif
+++ b/src/Products/ZCTextIndex/www/lexicon.gif
--- a/versions.cfg
+++ b/versions.cfg
@@ -13,6 +13,7 @@ initgroups = 2.13.0
 Missing = 2.13.1
 MultiMapping = 2.13.0
 Persistence = 2.13.2
+Products.ZCTextIndex = 2.13.0
 Record = 2.13.0
 RestrictedPython = 3.6.0a1
 tempstorage = 2.11.3