Commit 1c1a53d1 authored by Hanno Schlichting's avatar Hanno Schlichting

Products.ZCTextIndex was moved to its own distribution

parent 48f67574
...@@ -44,6 +44,7 @@ eggs = ...@@ -44,6 +44,7 @@ eggs =
Missing Missing
MultiMapping MultiMapping
Persistence Persistence
Products.ZCTextIndex
Record Record
RestrictedPython RestrictedPython
initgroups initgroups
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
############################################################################## ##############################################################################
import os import os
from setuptools import setup, find_packages, Extension from setuptools import setup, find_packages
setup(name='Zope2', setup(name='Zope2',
...@@ -29,18 +29,6 @@ setup(name='Zope2', ...@@ -29,18 +29,6 @@ setup(name='Zope2',
packages=find_packages('src'), packages=find_packages('src'),
namespace_packages=['Products'], namespace_packages=['Products'],
package_dir={'': 'src'}, package_dir={'': 'src'},
ext_modules=[
# indexes
Extension(
name='Products.ZCTextIndex.stopper',
sources=['src/Products/ZCTextIndex/stopper.c']),
Extension(
name='Products.ZCTextIndex.okascore',
sources=['src/Products/ZCTextIndex/okascore.c']),
],
install_requires=[ install_requires=[
'AccessControl', 'AccessControl',
'Acquisition', 'Acquisition',
...@@ -50,6 +38,7 @@ setup(name='Zope2', ...@@ -50,6 +38,7 @@ setup(name='Zope2',
'Missing', 'Missing',
'MultiMapping', 'MultiMapping',
'Persistence', 'Persistence',
'Products.ZCTextIndex',
'Record', 'Record',
'RestrictedPython', 'RestrictedPython',
'ZConfig', 'ZConfig',
......
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Abstract base class for full text index with relevance ranking."""
import math
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree
from BTrees.IIBTree import IIBucket
from BTrees.IIBTree import IITreeSet
from BTrees.IIBTree import difference
from BTrees.IIBTree import intersection
from BTrees.Length import Length
from Persistence import Persistent
from zope.interface import implements
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.interfaces import IIndex
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion
# Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024
# is large enough to get about 3 decimal digits of fractional info, and
# small enough so that scaled values should almost always fit in a signed
# 16-bit int (we're generally storing logs, so a few bits before the radix
# point goes a long way; on the flip side, for reasonably small numbers x
# most of the info in log(x) is in the fractional bits, so we do want to
# save a lot of those).
SCALE_FACTOR = 1024.0
def scaled_int(f, scale=SCALE_FACTOR):
# We expect only positive inputs, so "add a half and chop" is the
# same as round(). Surprising, calling round() is significantly more
# expensive.
return int(f * scale + 0.5)
def unique(L):
"""Return a list of the unique elements in L."""
return IITreeSet(L).keys()
class BaseIndex(Persistent):
implements(IIndex)
def __init__(self, lexicon):
self._lexicon = lexicon
# wid -> {docid -> weight}; t -> D -> w(D, t)
# Different indexers have different notions of term weight, but we
# expect each indexer to use ._wordinfo to map wids to its notion
# of a docid-to-weight map.
# There are two kinds of OOV words: wid 0 is explicitly OOV,
# and it's possible that the lexicon will return a non-zero wid
# for a word we don't currently know about. For example, if we
# unindex the last doc containing a particular word, that wid
# remains in the lexicon, but is no longer in our _wordinfo map;
# lexicons can also be shared across indices, and some other index
# may introduce a lexicon word we've never seen.
# A word is in-vocabulary for this index if and only if
# _wordinfo.has_key(wid). Note that wid 0 must not be a key.
self._wordinfo = IOBTree()
# docid -> weight
# Different indexers have different notions of doc weight, but we
# expect each indexer to use ._docweight to map docids to its
# notion of what a doc weight is.
self._docweight = IIBTree()
# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
self._docwords = IOBTree()
# Use a BTree length for efficient length computation w/o conflicts
self.length = Length()
self.document_count = Length()
def length(self):
"""Return the number of words in the index."""
# This is overridden per instance
return len(self._wordinfo)
def document_count(self):
"""Return the number of documents in the index"""
# This is overridden per instance
return len(self._docweight)
def get_words(self, docid):
"""Return a list of the wordids for a given docid."""
# Note this is overridden in the instance
return WidCode.decode(self._docwords[docid])
# A subclass may wish to extend or override this.
def index_doc(self, docid, text):
if self._docwords.has_key(docid):
return self._reindex_doc(docid, text)
wids = self._lexicon.sourceToWordIds(text)
wid2weight, docweight = self._get_frequencies(wids)
self._mass_add_wordinfo(wid2weight, docid)
self._docweight[docid] = docweight
self._docwords[docid] = WidCode.encode(wids)
try:
self.document_count.change(1)
except AttributeError:
# Upgrade document_count to Length object
self.document_count = Length(self.document_count())
return len(wids)
# A subclass may wish to extend or override this. This is for adjusting
# to a new version of a doc that already exists. The goal is to be
# faster than simply unindexing the old version in its entirety and then
# adding the new version in its entirety.
def _reindex_doc(self, docid, text):
# Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
old_wids = self.get_words(docid)
old_wid2w, old_docw = self._get_frequencies(old_wids)
new_wids = self._lexicon.sourceToWordIds(text)
new_wid2w, new_docw = self._get_frequencies(new_wids)
old_widset = IITreeSet(old_wid2w.keys())
new_widset = IITreeSet(new_wid2w.keys())
in_both_widset = intersection(old_widset, new_widset)
only_old_widset = difference(old_widset, in_both_widset)
only_new_widset = difference(new_widset, in_both_widset)
del old_widset, new_widset
for wid in only_old_widset.keys():
self._del_wordinfo(wid, docid)
for wid in only_new_widset.keys():
self._add_wordinfo(wid, new_wid2w[wid], docid)
for wid in in_both_widset.keys():
# For the Okapi indexer, the "if" will trigger only for words
# whose counts have changed. For the cosine indexer, the "if"
# may trigger for every wid, since W(d) probably changed and
# W(d) is divided into every score.
newscore = new_wid2w[wid]
if old_wid2w[wid] != newscore:
self._add_wordinfo(wid, newscore, docid)
self._docweight[docid] = new_docw
self._docwords[docid] = WidCode.encode(new_wids)
return len(new_wids)
# Subclass must override.
def _get_frequencies(self, wids):
# Compute term frequencies and a doc weight, whatever those mean
# to an indexer.
# Return pair:
# {wid0: w(d, wid0), wid1: w(d, wid1), ...],
# docweight
# The wid->weight mappings are fed into _add_wordinfo, and docweight
# becomes the value of _docweight[docid].
raise NotImplementedError
def has_doc(self, docid):
return self._docwords.has_key(docid)
# A subclass may wish to extend or override this.
def unindex_doc(self, docid):
for wid in unique(self.get_words(docid)):
self._del_wordinfo(wid, docid)
del self._docwords[docid]
del self._docweight[docid]
try:
self.document_count.change(-1)
except AttributeError:
# Upgrade document_count to Length object
self.document_count = Length(self.document_count())
def search(self, term):
wids = self._lexicon.termToWordIds(term)
if not wids:
return None # All docs match
wids = self._remove_oov_wids(wids)
return mass_weightedUnion(self._search_wids(wids))
def search_glob(self, pattern):
wids = self._lexicon.globToWordIds(pattern)
wids = self._remove_oov_wids(wids)
return mass_weightedUnion(self._search_wids(wids))
def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
cleaned_wids = self._remove_oov_wids(wids)
if len(wids) != len(cleaned_wids):
# At least one wid was OOV: can't possibly find it.
return IIBTree()
scores = self._search_wids(wids)
hits = mass_weightedIntersection(scores)
if not hits:
return hits
code = WidCode.encode(wids)
result = IIBTree()
for docid, weight in hits.items():
docwords = self._docwords[docid]
if docwords.find(code) >= 0:
result[docid] = weight
return result
def _remove_oov_wids(self, wids):
return filter(self._wordinfo.has_key, wids)
# Subclass must override.
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to
# TF(D,t) * IDF(t) for every docid D containing t. wids must not
# contain any OOV words.
def _search_wids(self, wids):
raise NotImplementedError
# Subclass must override.
# It's not clear what it should do. It must return an upper bound on
# document scores for the query. It would be nice if a document score
# divided by the query's query_weight gave the proabability that a
# document was relevant, but nobody knows how to do that. For
# CosineIndex, the ratio is the cosine of the angle between the document
# and query vectors. For OkapiIndex, the ratio is a (probably
# unachievable) upper bound with no "intuitive meaning" beyond that.
def query_weight(self, terms):
raise NotImplementedError
DICT_CUTOFF = 10
def _add_wordinfo(self, wid, f, docid):
# Store a wordinfo in a dict as long as there are less than
# DICT_CUTOFF docids in the dict. Otherwise use an IIBTree.
# The pickle of a dict is smaller than the pickle of an
# IIBTree, substantially so for small mappings. Thus, we use
# a dictionary until the mapping reaches DICT_CUTOFF elements.
# The cutoff is chosen based on the implementation
# characteristics of Python dictionaries. The dict hashtable
# always has 2**N slots and is resized whenever it is 2/3s
# full. A pickled dict with 10 elts is half the size of an
# IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4. So
# choose 10 as the cutoff for now.
# The IIBTree has a smaller in-memory representation than a
# dictionary, so pickle size isn't the only consideration when
# choosing the threshold. The pickle of a 500-elt dict is 92%
# of the size of the same IIBTree, but the dict uses more
# space when it is live in memory. An IIBTree stores two C
# arrays of ints, one for the keys and one for the values. It
# holds up to 120 key-value pairs in a single bucket.
doc2score = self._wordinfo.get(wid)
if doc2score is None:
doc2score = {}
self.length.change(1)
else:
# _add_wordinfo() is called for each update. If the map
# size exceeds the DICT_CUTOFF, convert to an IIBTree.
# Obscure: First check the type. If it's not a dict, it
# can't need conversion, and then we can avoid an expensive
# len(IIBTree).
if (isinstance(doc2score, type({})) and
len(doc2score) == self.DICT_CUTOFF):
doc2score = IIBTree(doc2score)
doc2score[docid] = f
self._wordinfo[wid] = doc2score # not redundant: Persistency!
# self._mass_add_wordinfo(wid2weight, docid)
#
# is the same as
#
# for wid, weight in wid2weight.items():
# self._add_wordinfo(wid, weight, docid)
#
# except that _mass_add_wordinfo doesn't require so many function calls.
def _mass_add_wordinfo(self, wid2weight, docid):
dicttype = type({})
get_doc2score = self._wordinfo.get
new_word_count = 0
for wid, weight in wid2weight.items():
doc2score = get_doc2score(wid)
if doc2score is None:
doc2score = {}
new_word_count += 1
elif (isinstance(doc2score, dicttype) and
len(doc2score) == self.DICT_CUTOFF):
doc2score = IIBTree(doc2score)
doc2score[docid] = weight
self._wordinfo[wid] = doc2score # not redundant: Persistency!
self.length.change(new_word_count)
def _del_wordinfo(self, wid, docid):
doc2score = self._wordinfo[wid]
del doc2score[docid]
if doc2score:
self._wordinfo[wid] = doc2score # not redundant: Persistency!
else:
del self._wordinfo[wid]
self.length.change(-1)
def inverse_doc_frequency(term_count, num_items):
"""Return the inverse doc frequency for a term,
that appears in term_count items in a collection with num_items
total items.
"""
# implements IDF(q, t) = log(1 + N/f(t))
return math.log(1.0 + float(num_items) / term_count)
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using a cosine measure."""
import math
from BTrees.IIBTree import IIBucket
from zope.interface import implements
from Products.ZCTextIndex.interfaces import IIndex
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.BaseIndex import inverse_doc_frequency
from Products.ZCTextIndex.BaseIndex import scaled_int
from Products.ZCTextIndex.BaseIndex import SCALE_FACTOR
class CosineIndex(BaseIndex):
implements(IIndex)
def __init__(self, lexicon):
BaseIndex.__init__(self, lexicon)
# ._wordinfo for cosine is wid -> {docid -> weight};
# t -> D -> w(d, t)/W(d)
# ._docweight for cosine is
# docid -> W(docid)
# Most of the computation for computing a relevance score for the
# document occurs in the _search_wids() method. The code currently
# implements the cosine similarity function described in Managing
# Gigabytes, eq. 4.3, p. 187. The index_object() method
# precomputes some values that are independent of the particular
# query.
# The equation is
#
# sum(for t in I(d,q): w(d,t) * w(q,t))
# cosine(d, q) = -------------------------------------
# W(d) * W(q)
#
# where
# I(d, q) = the intersection of the terms in d and q.
#
# w(d, t) = 1 + log f(d, t)
# computed by doc_term_weight(); for a given word t,
# self._wordinfo[t] is a map from d to w(d, t).
#
# w(q, t) = log(1 + N/f(t))
# computed by inverse_doc_frequency()
#
# W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
# computed by _get_frequencies(), and remembered in
# self._docweight[d]
#
# W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
# computed by self.query_weight()
def _search_wids(self, wids):
if not wids:
return []
N = float(self.document_count())
L = []
DictType = type({})
for wid in wids:
assert self._wordinfo.has_key(wid) # caller responsible for OOV
d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
idf = inverse_doc_frequency(len(d2w), N) # an unscaled float
#print "idf = %.3f" % idf
if isinstance(d2w, DictType):
d2w = IIBucket(d2w)
L.append((d2w, scaled_int(idf)))
return L
def query_weight(self, terms):
wids = []
for term in terms:
wids += self._lexicon.termToWordIds(term)
N = float(self.document_count())
sum = 0.0
for wid in self._remove_oov_wids(wids):
wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
sum += wt ** 2.0
return scaled_int(math.sqrt(sum))
def _get_frequencies(self, wids):
d = {}
dget = d.get
for wid in wids:
d[wid] = dget(wid, 0) + 1
Wsquares = 0.0
for wid, count in d.items():
w = doc_term_weight(count)
Wsquares += w * w
d[wid] = w
W = math.sqrt(Wsquares)
#print "W = %.3f" % W
for wid, weight in d.items():
#print i, ":", "%.3f" % weight,
d[wid] = scaled_int(weight / W)
#print "->", d[wid]
return d, scaled_int(W)
# The rest are helper methods to support unit tests
def _get_wdt(self, d, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return map.get(d, 0) * self._docweight[d] / SCALE_FACTOR
def _get_Wd(self, d):
return self._docweight[d]
def _get_ft(self, t):
wid, = self._lexicon.termToWordIds(t)
return len(self._wordinfo[wid])
def _get_wt(self, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
def doc_term_weight(count):
"""Return the doc-term weight for a term that appears count times."""
# implements w(d, t) = 1 + log f(d, t)
return 1.0 + math.log(count)
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
import re
from zope.interface import implements
from Products.ZCTextIndex.interfaces import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory
class HTMLWordSplitter:
implements(ISplitter)
def process(self, text, wordpat=r"(?L)\w+"):
splat = []
for t in text:
splat += self._split(t, wordpat)
return splat
def processGlob(self, text):
# see Lexicon.globToWordIds()
return self.process(text, r"(?L)\w+[\w*?]*")
def _split(self, text, wordpat):
text = text.lower()
remove = [r"<[^<>]*>",
r"&[A-Za-z]+;"]
for pat in remove:
text = re.sub(pat, " ", text)
return re.findall(wordpat, text)
element_factory.registerFactory('Word Splitter',
'HTML aware splitter',
HTMLWordSplitter)
if __name__ == "__main__":
import sys
splitter = HTMLWordSplitter()
for path in sys.argv[1:]:
f = open(path, "rb")
buf = f.read()
f.close()
print path
print splitter.process([buf])
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index Interface."""
from Products.ZCTextIndex.interfaces import IIndex # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import INBest # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElement # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElementFactory # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElementFactory # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import IQueryParser # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import ISplitter # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon.
$Id$
"""
import re
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.Length import Length
from Persistence import Persistent
from zope.interface import implements
from Products.ZCTextIndex.interfaces import ILexicon
from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import QueryError
from Products.ZCTextIndex.PipelineFactory import element_factory
class Lexicon(Persistent):
implements(ILexicon)
def __init__(self, *pipeline):
self._wids = OIBTree() # word -> wid
self._words = IOBTree() # wid -> word
# wid 0 is reserved for words that aren't in the lexicon (OOV -- out
# of vocabulary). This can happen, e.g., if a query contains a word
# we never saw before, and that isn't a known stopword (or otherwise
# filtered out). Returning a special wid value for OOV words is a
# way to let clients know when an OOV word appears.
self.length = Length()
self._pipeline = pipeline
def length(self):
"""Return the number of unique terms in the lexicon."""
# Overridden in instances
return len(self._wids)
def words(self):
return self._wids.keys()
def wids(self):
return self._words.keys()
def items(self):
return self._wids.items()
def sourceToWordIds(self, text):
last = _text2list(text)
for element in self._pipeline:
last = element.process(last)
if not hasattr(self.length, 'change'):
# Make sure length is overridden with a BTrees.Length.Length
self.length = Length(self.length())
# Strategically unload the length value so that we get the most
# recent value written to the database to minimize conflicting wids
# Because length is independent, this will load the most
# recent value stored, regardless of whether MVCC is enabled
self.length._p_deactivate()
return map(self._getWordIdCreate, last)
def termToWordIds(self, text):
last = _text2list(text)
for element in self._pipeline:
process = getattr(element, "process_post_glob", element.process)
last = process(last)
wids = []
for word in last:
wids.append(self._wids.get(word, 0))
return wids
def parseTerms(self, text):
last = _text2list(text)
for element in self._pipeline:
process = getattr(element, "processGlob", element.process)
last = process(last)
return last
def isGlob(self, word):
return "*" in word or "?" in word
def get_word(self, wid):
return self._words[wid]
def get_wid(self, word):
return self._wids.get(word, 0)
def globToWordIds(self, pattern):
# Implement * and ? just as in the shell, except the pattern
# must not start with either of these
prefix = ""
while pattern and pattern[0] not in "*?":
prefix += pattern[0]
pattern = pattern[1:]
if not pattern:
# There were no globbing characters in the pattern
wid = self._wids.get(prefix, 0)
if wid:
return [wid]
else:
return []
if not prefix:
# The pattern starts with a globbing character.
# This is too efficient, so we raise an exception.
raise QueryError(
"pattern %r shouldn't start with glob character" % pattern)
pat = prefix
for c in pattern:
if c == "*":
pat += ".*"
elif c == "?":
pat += "."
else:
pat += re.escape(c)
pat += "$"
prog = re.compile(pat)
keys = self._wids.keys(prefix) # Keys starting at prefix
wids = []
for key in keys:
if not key.startswith(prefix):
break
if prog.match(key):
wids.append(self._wids[key])
return wids
def _getWordIdCreate(self, word):
wid = self._wids.get(word)
if wid is None:
wid = self._new_wid()
self._wids[word] = wid
self._words[wid] = word
return wid
def _new_wid(self):
self.length.change(1)
while self._words.has_key(self.length()): # just to be safe
self.length.change(1)
return self.length()
def _text2list(text):
# Helper: splitter input may be a string or a list of strings
try:
text + ""
except:
return text
else:
return [text]
# Sample pipeline elements
class Splitter:
import re
rx = re.compile(r"(?L)\w+")
rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above
def process(self, lst):
result = []
for s in lst:
result += self.rx.findall(s)
return result
def processGlob(self, lst):
result = []
for s in lst:
result += self.rxGlob.findall(s)
return result
element_factory.registerFactory('Word Splitter',
'Whitespace splitter',
Splitter)
class CaseNormalizer:
def process(self, lst):
return [w.lower() for w in lst]
element_factory.registerFactory('Case Normalizer',
'Case Normalizer',
CaseNormalizer)
element_factory.registerFactory('Stop Words',
' Don\'t remove stop words',
None)
class StopWordRemover:
dict = get_stopdict().copy()
try:
from Products.ZCTextIndex.stopper import process as _process
except ImportError:
def process(self, lst):
has_key = self.dict.has_key
return [w for w in lst if not has_key(w)]
else:
def process(self, lst):
return self._process(self.dict, lst)
element_factory.registerFactory('Stop Words',
'Remove listed stop words only',
StopWordRemover)
class StopWordAndSingleCharRemover(StopWordRemover):
dict = get_stopdict().copy()
for c in range(255):
dict[chr(c)] = None
element_factory.registerFactory('Stop Words',
'Remove listed and single char words',
StopWordAndSingleCharRemover)
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""NBest
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
from bisect import bisect
from zope.interface import implements
from Products.ZCTextIndex.interfaces import INBest
class NBest:
implements(INBest)
def __init__(self, N):
"Build an NBest object to remember the N best-scoring objects."
if N < 1:
raise ValueError("NBest() argument must be at least 1")
self._capacity = N
# This does a very simple thing with sorted lists. For large
# N, a min-heap can be unboundedly better in terms of data
# movement time.
self._scores = []
self._items = []
def __len__(self):
return len(self._scores)
def capacity(self):
return self._capacity
def add(self, item, score):
self.addmany([(item, score)])
def addmany(self, sequence):
scores, items, capacity = self._scores, self._items, self._capacity
n = len(scores)
for item, score in sequence:
# When we're in steady-state, the usual case is that we're filled
# to capacity, and that an incoming item is worse than any of
# the best-seen so far.
if n >= capacity and score <= scores[0]:
continue
i = bisect(scores, score)
scores.insert(i, score)
items.insert(i, item)
if n == capacity:
del items[0], scores[0]
else:
n += 1
assert n == len(scores)
def getbest(self):
result = zip(self._items, self._scores)
result.reverse()
return result
def pop_smallest(self):
if self._scores:
return self._items.pop(0), self._scores.pop(0)
raise IndexError("pop_smallest() called on empty NBest object")
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using an Okapi BM25 rank."""
# Lots of comments are at the bottom of this file. Read them to
# understand what's going on.
from BTrees.IIBTree import IIBucket
from BTrees.Length import Length
from zope.interface import implements
from Products.ZCTextIndex.interfaces import IIndex
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.BaseIndex import inverse_doc_frequency
from Products.ZCTextIndex.BaseIndex import scaled_int
from Products.ZCTextIndex.okascore import score
class OkapiIndex(BaseIndex):
implements(IIndex)
# BM25 free parameters.
K1 = 1.2
B = 0.75
assert K1 >= 0.0
assert 0.0 <= B <= 1.0
def __init__(self, lexicon):
BaseIndex.__init__(self, lexicon)
# ._wordinfo for Okapi is
# wid -> {docid -> frequency}; t -> D -> f(D, t)
# ._docweight for Okapi is
# docid -> # of words in the doc
# This is just len(self._docwords[docid]), but _docwords is stored
# in compressed form, so uncompressing it just to count the list
# length would be ridiculously expensive.
# sum(self._docweight.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter.
# Use a BTree.Length.Length object to avoid concurrent write conflicts
self._totaldoclen = Length(0L)
def index_doc(self, docid, text):
count = BaseIndex.index_doc(self, docid, text)
self._change_doc_len(count)
return count
def _reindex_doc(self, docid, text):
self._change_doc_len(-self._docweight[docid])
return BaseIndex._reindex_doc(self, docid, text)
def unindex_doc(self, docid):
self._change_doc_len(-self._docweight[docid])
BaseIndex.unindex_doc(self, docid)
def _change_doc_len(self, delta):
# Change total doc length used for scoring
try:
self._totaldoclen.change(delta)
except AttributeError:
# Opportunistically upgrade _totaldoclen attribute to Length object
self._totaldoclen = Length(long(self._totaldoclen + delta))
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to
# TF(D,t) * IDF(t) for every docid D containing t.
# As currently written, the weights are always 1, and the IIBucket maps
# D to TF(D,t)*IDF(t) directly, where the product is computed as a float
# but stored as a scaled_int.
# NOTE: This is overridden below, by a function that computes the
# same thing but with the inner scoring loop in C.
def _search_wids(self, wids):
if not wids:
return []
N = float(self.document_count()) # total # of docs
try:
doclen = self._totaldoclen()
except TypeError:
# _totaldoclen has not yet been upgraded
doclen = self._totaldoclen
meandoclen = doclen / N
K1 = self.K1
B = self.B
K1_plus1 = K1 + 1.0
B_from1 = 1.0 - B
# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
L = []
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket()
for docid, f in d2f.items():
lenweight = B_from1 + B * docid2len[docid] / meandoclen
tf = f * K1_plus1 / (f + K1 * lenweight)
result[docid] = scaled_int(tf * idf)
L.append((result, 1))
return L
# Note about the above: the result is tf * idf. tf is small -- it
# can't be larger than k1+1 = 2.2. idf is formally unbounded, but
# is less than 14 for a term that appears in only 1 of a million
# documents. So the product is probably less than 32, or 5 bits
# before the radix point. If we did the scaled-int business on
# both of them, we'd be up to 25 bits. Add 64 of those and we'd
# be in overflow territory. That's pretty unlikely, so we *could*
# just store scaled_int(tf) in result[docid], and use scaled_int(idf)
# as an invariant weight across the whole result. But besides
# skating near the edge, it's not a speed cure, since the computation
# of tf would still be done at Python speed, and it's a lot more
# work than just multiplying by idf.
# The same function as _search_wids above, but with the inner scoring
# loop written in C (module okascore, function score()).
# Cautions: okascore hardcodes the values of K, B1, and the scaled_int
# function.
def _search_wids(self, wids):
if not wids:
return []
N = float(self.document_count()) # total # of docs
try:
doclen = self._totaldoclen()
except TypeError:
# _totaldoclen has not yet been upgraded
doclen = self._totaldoclen
meandoclen = doclen / N
#K1 = self.K1
#B = self.B
#K1_plus1 = K1 + 1.0
#B_from1 = 1.0 - B
# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
L = []
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket()
score(result, d2f.items(), docid2len, idf, meandoclen)
L.append((result, 1))
return L
def query_weight(self, terms):
# Get the wids.
wids = []
for term in terms:
termwids = self._lexicon.termToWordIds(term)
wids.extend(termwids)
# The max score for term t is the maximum value of
# TF(D, t) * IDF(Q, t)
# We can compute IDF directly, and as noted in the comments below
# TF(D, t) is bounded above by 1+K1.
N = float(len(self._docweight))
tfmax = 1.0 + self.K1
sum = 0
for t in self._remove_oov_wids(wids):
idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
sum += scaled_int(idf * tfmax)
return sum
def _get_frequencies(self, wids):
d = {}
dget = d.get
for wid in wids:
d[wid] = dget(wid, 0) + 1
return d, len(wids)
"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
It's based on probability arguments about how words are distributed in
documents, not on an abstract vector space model. A long paper by its
principal inventors gives an excellent overview of how it was derived:
A probabilistic model of information retrieval: development and status
K. Sparck Jones, S. Walker, S.E. Robertson
http://citeseer.nj.nec.com/jones98probabilistic.html
Spellings that ignore relevance information (which we don't have) are of this
high-level form:
score(D, Q) = sum(for t in D&Q: TF(D, t) * IDF(Q, t))
where
D a specific document
Q a specific query
t a term (word, atomic phrase, whatever)
D&Q the terms common to D and Q
TF(D, t) a measure of t's importance in D -- a kind of term frequency
weight
IDF(Q, t) a measure of t's importance in the query and in the set of
documents as a whole -- a kind of inverse document frequency
weight
The IDF(Q, t) here is identical to the one used for our cosine measure.
Since queries are expected to be short, it ignores Q entirely:
IDF(Q, t) = log(1.0 + N / f(t))
where
N the total number of documents
f(t) the number of documents in which t appears
Most Okapi literature seems to use log(N/f(t)) instead. We don't, because
that becomes 0 for a term that's in every document, and, e.g., if someone
is searching for "documentation" on python.org (a term that may well show
up on every page, due to the top navigation bar), we still want to find the
pages that use the word a lot (which is TF's job to find, not IDF's -- we
just want to stop IDF from considering this t to be irrelevant).
The TF(D, t) spellings are more interesting. With lots of variations, the
most basic spelling is of the form
f(D, t)
TF(D, t) = ---------------
f(D, t) + K(D)
where
f(D, t) the number of times t appears in D
K(D) a measure of the length of D, normalized to mean doc length
The functional *form* f/(f+K) is clever. It's a gross approximation to a
mixture of two distinct Poisson distributions, based on the idea that t
probably appears in D for one of two reasons:
1. More or less at random.
2. Because it's important to D's purpose in life ("eliteness" in papers).
Note that f/(f+K) is always between 0 and 1. If f is very large compared to
K, it approaches 1. If K is very large compared to f, it approaches 0. If
t appears in D more or less "for random reasons", f is likely to be small,
and so K will dominate unless it's a very small doc, and the ratio will be
small. OTOH, if t appears a lot in D, f will dominate unless it's a very
large doc, and the ratio will be close to 1.
We use a variation on that simple theme, a simplification of what's called
BM25 in the literature (it was the 25th stab at a Best Match function from
the Okapi group; "a simplification" means we're setting some of BM25's more
esoteric free parameters to 0):
f(D, t) * (k1 + 1)
TF(D, t) = --------------------
f(D, t) + k1 * K(D)
where
k1 a "tuning factor", typically between 1.0 and 2.0. We use 1.2,
the usual default value. This constant adjusts the curve to
look more like a theoretical 2-Poisson curve.
Note that as f(D, t) increases, TF(D, t) increases monotonically, approaching
an asymptote of k1+1 from below.
Finally, we use
K(D) = (1-b) + b * len(D)/E(len(D))
where
b is another free parameter, discussed below. We use 0.75.
len(D) the length of D in words
E(len(D)) the expected value of len(D) across the whole document set;
or, IOW, the average document length
b is a free parameter between 0.0 and 1.0, and adjusts for the expected effect
of the "Verbosity Hypothesis". Suppose b is 1, and some word t appears
10 times as often in document d2 than in document d1. If document d2 is
also 10 times as long as d1, TF(d1, t) and TF(d2, t) are identical:
f(d2, t) * (k1 + 1)
TF(d2, t) = --------------------------------- =
f(d2, t) + k1 * len(d2)/E(len(D))
10 * f(d1, t) * (k1 + 1)
----------------------------------------------- = TF(d1, t)
10 * f(d1, t) + k1 * (10 * len(d1))/E(len(D))
because the 10's cancel out. This is appropriate if we believe that a word
appearing 10x more often in a doc 10x as long is simply due to that the
longer doc is more verbose. If we do believe that, the longer doc and the
shorter doc are probably equally relevant. OTOH, it *could* be that the
longer doc is talking about t in greater depth too, in which case it's
probably more relevant than the shorter doc.
At the other extreme, if we set b to 0, the len(D)/E(len(D)) term vanishes
completely, and a doc scores higher for having more occurences of a word
regardless of the doc's length.
Reality is between these extremes, and probably varies by document and word
too. Reports in the literature suggest that b=0.75 is a good compromise "in
general", favoring the "verbosity hypothesis" end of the scale.
Putting it all together, the final TF function is
f(D, t) * (k1 + 1)
TF(D, t) = --------------------------------------------
f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
with k1=1.2 and b=0.75.
Query Term Weighting
--------------------
I'm ignoring the query adjustment part of Okapi BM25 because I expect our
queries are very short. Full BM25 takes them into account by adding the
following to every score(D, Q); it depends on the lengths of D and Q, but
not on the specific words in Q, or even on whether they appear in D(!):
E(len(D)) - len(D)
k2 * len(Q) * -------------------
E(len(D)) + len(D)
Here k2 is another "tuning constant", len(Q) is the number of words in Q, and
len(D) & E(len(D)) were defined above. The Okapi group set k2 to 0 in TREC-9,
so it apparently doesn't do much good (or may even hurt).
Full BM25 *also* multiplies the following factor into IDF(Q, t):
f(Q, t) * (k3 + 1)
------------------
f(Q, t) + k3
where k3 is yet another free parameter, and f(Q,t) is the number of times t
appears in Q. Since we're using short "web style" queries, I expect f(Q,t)
to always be 1, and then that quotient is
1 * (k3 + 1)
------------ = 1
1 + k3
regardless of k3's value. So, in a trivial sense, we are incorporating
this measure (and optimizing it by not bothering to multiply by 1 <wink>).
"""
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Generic parser support: exception and parse tree nodes."""
from BTrees.IIBTree import difference
from zope.interface import implements
from Products.ZCTextIndex.interfaces import IQueryParseTree
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion
class QueryError(Exception):
pass
class ParseError(Exception):
pass
class ParseTreeNode:
implements(IQueryParseTree)
_nodeType = None
def __init__(self, value):
self._value = value
def nodeType(self):
return self._nodeType
def getValue(self):
return self._value
def __repr__(self):
return "%s(%r)" % (self.__class__.__name__, self.getValue())
def terms(self):
t = []
for v in self.getValue():
t.extend(v.terms())
return t
def executeQuery(self, index):
raise NotImplementedError
class NotNode(ParseTreeNode):
_nodeType = "NOT"
def terms(self):
return []
def executeQuery(self, index):
raise QueryError, "NOT parse tree node cannot be executed directly"
class AndNode(ParseTreeNode):
_nodeType = "AND"
def executeQuery(self, index):
L = []
Nots = []
for subnode in self.getValue():
if subnode.nodeType() == "NOT":
r = subnode.getValue().executeQuery(index)
# If None, technically it matches every doc, but we treat
# it as if it matched none (we want
# real_word AND NOT stop_word
# to act like plain real_word).
if r is not None:
Nots.append((r, 1))
else:
r = subnode.executeQuery(index)
# If None, technically it matches every doc, so needn't be
# included.
if r is not None:
L.append((r, 1))
set = mass_weightedIntersection(L)
if Nots:
notset = mass_weightedUnion(Nots)
set = difference(set, notset)
return set
class OrNode(ParseTreeNode):
_nodeType = "OR"
def executeQuery(self, index):
weighted = []
for node in self.getValue():
r = node.executeQuery(index)
# If None, technically it matches every doc, but we treat
# it as if it matched none (we want
# real_word OR stop_word
# to act like plain real_word).
if r is not None:
weighted.append((r, 1))
return mass_weightedUnion(weighted)
class AtomNode(ParseTreeNode):
_nodeType = "ATOM"
def terms(self):
return [self.getValue()]
def executeQuery(self, index):
return index.search(self.getValue())
class PhraseNode(AtomNode):
_nodeType = "PHRASE"
def executeQuery(self, index):
return index.search_phrase(self.getValue())
class GlobNode(AtomNode):
_nodeType = "GLOB"
def executeQuery(self, index):
return index.search_glob(self.getValue())
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from zope.interface import implements
from Products.ZCTextIndex.interfaces import IPipelineElementFactory
class PipelineElementFactory:
implements(IPipelineElementFactory)
def __init__(self):
self._groups = {}
def registerFactory(self, group, name, factory):
if self._groups.has_key(group) and \
self._groups[group].has_key(name):
raise ValueError('ZCTextIndex lexicon element "%s" '
'already registered in group "%s"'
% (name, group))
elements = self._groups.get(group)
if elements is None:
elements = self._groups[group] = {}
elements[name] = factory
def getFactoryGroups(self):
groups = self._groups.keys()
groups.sort()
return groups
def getFactoryNames(self, group):
names = self._groups[group].keys()
names.sort()
return names
def instantiate(self, group, name):
factory = self._groups[group][name]
if factory is not None:
return factory()
element_factory = PipelineElementFactory()
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser.
This particular parser recognizes the following syntax:
Start = OrExpr
OrExpr = AndExpr ('OR' AndExpr)*
AndExpr = Term ('AND' NotExpr)*
NotExpr = ['NOT'] Term
Term = '(' OrExpr ')' | ATOM+
The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal (ignoring case) to one of the key words
'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the
string can contain whitespace, parentheses and key words, but not
quotes.
+ A hyphen followed by one of the two forms above, meaning that it
must not be present.
An unquoted ATOM may also contain globbing characters. Globbing
syntax is defined by the lexicon; for example "foo*" could mean any
word starting with "foo".
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
is interpreted as a NOT operator.
Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- * and ? are used for globbing (i.e. prefix search), e.g. ``foo*''
"""
import re
from zope.interface import implements
from Products.ZCTextIndex.interfaces import IQueryParser
from Products.ZCTextIndex import ParseTree
# Create unique symbols for token types.
_AND = intern("AND")
_OR = intern("OR")
_NOT = intern("NOT")
_LPAREN = intern("(")
_RPAREN = intern(")")
_ATOM = intern("ATOM")
_EOF = intern("EOF")
# Map keyword string to token type.
_keywords = {
_AND: _AND,
_OR: _OR,
_NOT: _NOT,
_LPAREN: _LPAREN,
_RPAREN: _RPAREN,
}
# Regular expression to tokenize.
_tokenizer_regex = re.compile(r"""
# a paren
[()]
# or an optional hyphen
| -?
# followed by
(?:
# a string inside double quotes (and not containing these)
" [^"]* "
# or a non-empty stretch w/o whitespace, parens or double quotes
| [^()\s"]+
)
""", re.VERBOSE)
# Use unicode regex to treat fullwidth space characters defined in Unicode
# as valid whitespace.
_tokenizer_unicode_regex = re.compile(
_tokenizer_regex.pattern, _tokenizer_regex.flags|re.UNICODE)
class QueryParser:
implements(IQueryParser)
# This class is not thread-safe;
# each thread should have its own instance
def __init__(self, lexicon):
self._lexicon = lexicon
self._ignored = None
# Public API methods
def parseQuery(self, query):
# Lexical analysis.
try:
# Try to use unicode and treat fullwidth whitespace as valid one.
if not isinstance(query, unicode):
query = query.decode('utf-8')
tokens = _tokenizer_unicode_regex.findall(query)
except UnicodeDecodeError:
tokens = _tokenizer_regex.findall(query)
self._tokens = tokens
# classify tokens
self._tokentypes = [_keywords.get(token.upper(), _ATOM)
for token in tokens]
# add _EOF
self._tokens.append(_EOF)
self._tokentypes.append(_EOF)
self._index = 0
# Syntactical analysis.
self._ignored = [] # Ignored words in the query, for parseQueryEx
tree = self._parseOrExpr()
self._require(_EOF)
if tree is None:
raise ParseTree.ParseError(
"Query contains only common words: %s" % repr(query))
return tree
def getIgnored(self):
return self._ignored
def parseQueryEx(self, query):
tree = self.parseQuery(query)
ignored = self.getIgnored()
return tree, ignored
# Recursive descent parser
def _require(self, tokentype):
if not self._check(tokentype):
t = self._tokens[self._index]
msg = "Token %r required, %r found" % (tokentype, t)
raise ParseTree.ParseError, msg
def _check(self, tokentype):
if self._tokentypes[self._index] is tokentype:
self._index += 1
return 1
else:
return 0
def _peek(self, tokentype):
return self._tokentypes[self._index] is tokentype
def _get(self, tokentype):
t = self._tokens[self._index]
self._require(tokentype)
return t
def _parseOrExpr(self):
L = []
L.append(self._parseAndExpr())
while self._check(_OR):
L.append(self._parseAndExpr())
L = filter(None, L)
if not L:
return None # Only stopwords
elif len(L) == 1:
return L[0]
else:
return ParseTree.OrNode(L)
def _parseAndExpr(self):
L = []
t = self._parseTerm()
if t is not None:
L.append(t)
Nots = []
while self._check(_AND):
t = self._parseNotExpr()
if t is None:
continue
if isinstance(t, ParseTree.NotNode):
Nots.append(t)
else:
L.append(t)
if not L:
return None # Only stopwords
L.extend(Nots)
if len(L) == 1:
return L[0]
else:
return ParseTree.AndNode(L)
def _parseNotExpr(self):
if self._check(_NOT):
t = self._parseTerm()
if t is None:
return None # Only stopwords
return ParseTree.NotNode(t)
else:
return self._parseTerm()
def _parseTerm(self):
if self._check(_LPAREN):
tree = self._parseOrExpr()
self._require(_RPAREN)
else:
nodes = []
nodes = [self._parseAtom()]
while self._peek(_ATOM):
nodes.append(self._parseAtom())
nodes = filter(None, nodes)
if not nodes:
return None # Only stopwords
structure = [(isinstance(nodes[i], ParseTree.NotNode), i, nodes[i])
for i in range(len(nodes))]
structure.sort()
nodes = [node for (bit, index, node) in structure]
if isinstance(nodes[0], ParseTree.NotNode):
raise ParseTree.ParseError(
"a term must have at least one positive word")
if len(nodes) == 1:
return nodes[0]
tree = ParseTree.AndNode(nodes)
return tree
def _parseAtom(self):
term = self._get(_ATOM)
words = self._lexicon.parseTerms(term)
if not words:
self._ignored.append(term)
return None
if len(words) > 1:
tree = ParseTree.PhraseNode(words)
elif self._lexicon.isGlob(words[0]):
tree = ParseTree.GlobNode(words[0])
else:
tree = ParseTree.AtomNode(words[0])
if term[0] == "-":
tree = ParseTree.NotNode(tree)
return tree
ZCTextIndex
===========
This product is a replacement for the full text indexing facility of
ZCatalog. Specifically, it is an alternative to
PluginIndexes/TextIndex.
Advantages of using ZCTextIndex over TextIndex:
- A new query language, supporting both explicit and implicit Boolean
operators, parentheses, globbing, and phrase searching. Apart from
explicit operators and globbing, the syntax is roughly the same as
that popularized by Google.
- A more refined scoring algorithm, resulting in better selectiveness:
it's much more likely that you'll find the document you are looking
for among the first few highest-ranked results.
- Actually, ZCTextIndex gives you a choice of two scoring algorithms
from recent literature: the Cosine ranking from the Managing
Gigabytes book, and Okapi from more recent research papers. Okapi
usually does better, so it is the default (but your milage may
vary).
- A redesigned Lexicon, using a pipeline architecture to split the
input text into words. This makes it possible to mix and match
pipeline components, e.g. you can choose between an HTML-aware
splitter and a plain text splitter, and additional components can be
added to the pipeline for case folding, stopword removal, and other
features. Enough example pipeline components are provided to get
you started, and it is very easy to write new components.
Performance is roughly the same as for TextIndex, and we're expecting
to make tweaks to the code that will make it faster.
This code can be used outside of Zope too; all you need is a
standalone ZODB installation to make your index persistent. Several
functional test programs in the tests subdirectory show how to do
this, for example mhindex.py, mailtest.py, indexhtml.py, and
queryhtml.py.
See the online help for how to use ZCTextIndex within Zope. (Included
in the subdirectory "help".)
Code overview
-------------
ZMI interface:
__init__.py ZMI publishing code
ZCTextIndex.py pluggable index class
PipelineFactory.py ZMI helper to configure the pipeline
Indexing:
BaseIndex.py common code for Cosine and Okapi index
CosineIndex.py Cosine index implementation
OkapiIndex.py Okapi index implementation
okascore.c C implementation of scoring loop
Lexicon:
Lexicon.py lexicon and sample pipeline elements
HTMLSplitter.py HTML-aware splitter
StopDict.py list of English stopwords
stopper.c C implementation of stop word remover
Query parser:
QueryParser.py parse a query into a parse tree
ParseTree.py parse tree node classes and exceptions
Utilities:
NBest.py find N best items in a list without sorting
SetOps.py efficient weighted set operations
WidCode.py list compression allowing phrase searches
RiceCode.py list compression code (as yet unused)
Interfaces (these speak for themselves):
IIndex.py
ILexicon.py
INBest.py
IPipelineElement.py
IPipelineElementFactory.py
IQueryParseTree.py
IQueryParser.py
ISplitter.py
Subdirectories:
dtml ZMI templates
help ZMI help files
tests unittests and some functional tests/examples
www images used in the ZMI
Tests
-----
Functional tests and helpers:
hs-tool.py helper to interpret hotshot profiler logs
indexhtml.py index a collection of HTML files
mailtest.py index and query a Unix mailbox file
mhindex.py index and query a set of MH folders
python.txt output from benchmark queries
queryhtml.py query an index created by indexhtml.py
wordstats.py dump statistics about each indexed word
Unit tests (these speak for themselves):
testIndex.py
testLexicon.py
testNBest.py
testPipelineFactory.py
testQueryEngine.py
testQueryParser.py
testSetOps.py
testStopper.py
testZCTextIndex.py
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Rice coding (a variation of Golomb coding)
Based on a Java implementation by Glen McCluskey described in a Usenix
;login: article at
http://www.usenix.org/publications/login/2000-4/features/java.html
McCluskey's article explains the approach as follows. The encoding
for a value x is represented as a unary part and a binary part. The
unary part is a sequence of 1 bits followed by a 0 bit. The binary
part encodes some of the lower bits of x-1.
The encoding is parameterized by a value m that describes how many
bits to store in the binary part. If most of the values are smaller
than 2**m then they can be stored in only m+1 bits.
Compute the length of the unary part, q, where
q = math.floor((x-1)/ 2 ** m)
Emit q 1 bits followed by a 0 bit.
Emit the lower m bits of x-1, treating x-1 as a binary value.
"""
import array
class BitArray:
def __init__(self, buf=None):
self.bytes = array.array('B')
self.nbits = 0
self.bitsleft = 0
self.tostring = self.bytes.tostring
def __getitem__(self, i):
byte, offset = divmod(i, 8)
mask = 2 ** offset
if self.bytes[byte] & mask:
return 1
else:
return 0
def __setitem__(self, i, val):
byte, offset = divmod(i, 8)
mask = 2 ** offset
if val:
self.bytes[byte] |= mask
else:
self.bytes[byte] &= ~mask
def __len__(self):
return self.nbits
def append(self, bit):
"""Append a 1 if bit is true or 1 if it is false."""
if self.bitsleft == 0:
self.bytes.append(0)
self.bitsleft = 8
self.__setitem__(self.nbits, bit)
self.nbits += 1
self.bitsleft -= 1
def __getstate__(self):
return self.nbits, self.bitsleft, self.tostring()
def __setstate__(self, (nbits, bitsleft, s)):
self.bytes = array.array('B', s)
self.nbits = nbits
self.bitsleft = bitsleft
class RiceCode:
def __init__(self, m):
"""Constructor a RiceCode for m-bit values."""
if not (0 <= m <= 16):
raise ValueError, "m must be between 0 and 16"
self.init(m)
self.bits = BitArray()
self.len = 0
def init(self, m):
self.m = m
self.lower = (1 << m) - 1
self.mask = 1 << (m - 1)
def append(self, val):
"""Append an item to the list."""
if val < 1:
raise ValueError, "value >= 1 expected, got %s" % `val`
val -= 1
# emit the unary part of the code
q = val >> self.m
for i in range(q):
self.bits.append(1)
self.bits.append(0)
# emit the binary part
r = val & self.lower
mask = self.mask
while mask:
self.bits.append(r & mask)
mask >>= 1
self.len += 1
def __len__(self):
return self.len
def tolist(self):
"""Return the items as a list."""
l = []
i = 0 # bit offset
binary_range = range(self.m)
for j in range(self.len):
unary = 0
while self.bits[i] == 1:
unary += 1
i += 1
assert self.bits[i] == 0
i += 1
binary = 0
for k in binary_range:
binary = (binary << 1) | self.bits[i]
i += 1
l.append((unary << self.m) + (binary + 1))
return l
def tostring(self):
"""Return a binary string containing the encoded data.
The binary string may contain some extra zeros at the end.
"""
return self.bits.tostring()
def __getstate__(self):
return self.m, self.bits
def __setstate__(self, (m, bits)):
self.init(m)
self.bits = bits
def encode(m, l):
c = RiceCode(m)
for elt in l:
c.append(elt)
assert c.tolist() == l
return c
def encode_deltas(l):
if len(l) == 1:
return l[0], []
deltas = RiceCode(6)
deltas.append(l[1] - l[0])
for i in range(2, len(l)):
deltas.append(l[i] - l[i - 1])
return l[0], deltas
def decode_deltas(start, enc_deltas):
deltas = enc_deltas.tolist()
l = [start]
for i in range(1, len(deltas)):
l.append(l[i-1] + deltas[i])
l.append(l[-1] + deltas[-1])
return l
def test():
import random
for size in [10, 20, 50, 100, 200]:
l = [random.randint(1, size) for i in range(50)]
c = encode(random.randint(1, 16), l)
assert c.tolist() == l
for size in [10, 20, 50, 100, 200]:
l = range(random.randint(1, size), size + random.randint(1, size))
t = encode_deltas(l)
l2 = decode_deltas(*t)
assert l == l2
if l != l2:
print l
print l2
def pickle_efficiency():
import pickle
import random
for m in [4, 8, 12]:
for size in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
for elt_range in [10, 20, 50, 100, 200, 500, 1000]:
l = [random.randint(1, elt_range) for i in range(size)]
raw = pickle.dumps(l, 1)
enc = pickle.dumps(encode(m, l), 1)
print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
print "%5d %5d" % (len(raw), len(enc)),
if len(raw) > len(enc):
print "win"
else:
print "lose"
if __name__ == "__main__":
test()
<extension okascore>
source okascore.c
</extension>
<extension stopper>
source stopper.c
</extension>
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""SetOps -- Weighted intersections and unions applied to many inputs."""
from BTrees.IIBTree import IIBucket
from BTrees.IIBTree import weightedIntersection
from BTrees.IIBTree import weightedUnion
from Products.ZCTextIndex.NBest import NBest
def mass_weightedIntersection(L):
"A list of (mapping, weight) pairs -> their weightedIntersection IIBucket."
L = [(x, wx) for (x, wx) in L if x is not None]
if len(L) < 2:
return _trivial(L)
# Intersect with smallest first. We expect the input maps to be
# IIBuckets, so it doesn't hurt to get their lengths repeatedly
# (len(Bucket) is fast; len(BTree) is slow).
L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
(x, wx), (y, wy) = L[:2]
dummy, result = weightedIntersection(x, y, wx, wy)
for x, wx in L[2:]:
dummy, result = weightedIntersection(result, x, 1, wx)
return result
def mass_weightedUnion(L):
"A list of (mapping, weight) pairs -> their weightedUnion IIBucket."
if len(L) < 2:
return _trivial(L)
# Balance unions as closely as possible, smallest to largest.
merge = NBest(len(L))
for x, weight in L:
merge.add((x, weight), len(x))
while len(merge) > 1:
# Merge the two smallest so far, and add back to the queue.
(x, wx), dummy = merge.pop_smallest()
(y, wy), dummy = merge.pop_smallest()
dummy, z = weightedUnion(x, y, wx, wy)
merge.add((z, 1), len(z))
(result, weight), dummy = merge.pop_smallest()
return result
def _trivial(L):
# L is empty or has only one (mapping, weight) pair. If there is a
# pair, we may still need to multiply the mapping by its weight.
assert len(L) <= 1
if len(L) == 0:
return IIBucket()
[(result, weight)] = L
if weight != 1:
dummy, result = weightedUnion(IIBucket(), result, 0, weight)
return result
*shared*
stopper stopper.c
okascore okascore.c
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Provide a default list of stop words for the index.
The specific splitter and lexicon are customizable, but the default
ZCTextIndex should do something useful.
"""
def get_stopdict():
"""Return a dictionary of stopwords."""
return _dict
# This list of English stopwords comes from Lucene
_words = [
"a", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
]
_dict = {}
for w in _words:
_dict[w] = None
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
# for smaller ints. This is intended for lists of word ids (wids). The
# ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding. Unlike
# UTF-8, the initial byte does not tell you how many continuation bytes
# follow; and there's no ASCII superset property.
# Details:
#
# + Only the first byte of an encoding has the sign bit set.
#
# + The first byte has 7 bits of data.
#
# + Bytes beyond the first in an encoding have the sign bit clear, followed
# by 7 bits of data.
#
# + The first byte doesn't tell you how many continuation bytes are
# following. You can tell by searching for the next byte with the
# high bit set (or the end of the string).
#
# The int to be encoded can contain no more than 28 bits.
#
# If it contains no more than 7 bits, 0abcdefg, the encoding is
# 1abcdefg
#
# If it contains 8 thru 14 bits,
# 00abcdef ghijkLmn
# the encoding is
# 1abcdefg 0hijkLmn
#
# Static tables _encoding and _decoding capture all encodes and decodes for
# 14 or fewer bits.
#
# If it contains 15 thru 21 bits,
# 000abcde fghijkLm nopqrstu
# the encoding is
# 1abcdefg 0hijkLmn 0opqrstu
#
# If it contains 22 thru 28 bits,
# 0000abcd efghijkL mnopqrst uvwxyzAB
# the encoding is
# 1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB
assert 0x80**2 == 0x4000
assert 0x80**4 == 0x10000000
import re
def encode(wids):
# Encode a list of wids as a string.
wid2enc = _encoding
n = len(wid2enc)
return "".join([w < n and wid2enc[w] or _encode(w) for w in wids])
_encoding = [None] * 0x4000 # Filled later, and converted to a tuple
def _encode(w):
assert 0x4000 <= w < 0x10000000
b, c = divmod(w, 0x80)
a, b = divmod(b, 0x80)
s = chr(b) + chr(c)
if a < 0x80: # no more than 21 data bits
return chr(a + 0x80) + s
a, b = divmod(a, 0x80)
assert a < 0x80, (w, a, b, s) # else more than 28 data bits
return (chr(a + 0x80) + chr(b)) + s
_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
def decode(code):
# Decode a string into a list of wids.
get = _decoding.get
# Obscure: while _decoding does have the key '\x80', its value is 0,
# so the "or" here calls _decode('\x80') anyway.
return [get(p) or _decode(p) for p in _prog.findall(code)]
_decoding = {} # Filled later
def _decode(s):
if s == '\x80':
# See comment in decode(). This is here to allow a trick to work.
return 0
if len(s) == 3:
a, b, c = map(ord, s)
assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80
return ((a & 0x7F) << 14) | (b << 7) | c
assert len(s) == 4, `s`
a, b, c, d = map(ord, s)
assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80
return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d
def _fill():
global _encoding
for i in range(0x80):
s = chr(i + 0x80)
_encoding[i] = s
_decoding[s] = i
for i in range(0x80, 0x4000):
hi, lo = divmod(i, 0x80)
s = chr(hi + 0x80) + chr(lo)
_encoding[i] = s
_decoding[s] = i
_encoding = tuple(_encoding)
_fill()
def test():
for i in range(2**20):
if i % 1000 == 0: print i
wids = [i]
code = encode(wids)
assert decode(code) == wids, (wids, code, decode(code))
if __name__ == "__main__":
test()
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Plug in text index for ZCatalog with relevance ranking.
$Id$
"""
from cgi import escape
from AccessControl.class_init import InitializeClass
from AccessControl.Permissions import manage_vocabulary
from AccessControl.Permissions import manage_zcatalog_indexes
from AccessControl.Permissions import query_vocabulary
from AccessControl.Permissions import search_zcatalog
from AccessControl.SecurityInfo import ClassSecurityInfo
from Acquisition import aq_base
from Acquisition import aq_inner
from Acquisition import aq_parent
from Acquisition import Implicit
from App.special_dtml import DTMLFile
from OFS.SimpleItem import SimpleItem
from Persistence import Persistent
from zope.interface import implements
from Products.PluginIndexes.common.util import parseIndexRequest
from Products.PluginIndexes.common import safe_callable
from Products.PluginIndexes.interfaces import IPluggableIndex
from Products.ZCTextIndex.Lexicon import CaseNormalizer
from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter
from Products.ZCTextIndex.Lexicon import StopWordRemover
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.interfaces import ILexicon
from Products.ZCTextIndex.interfaces import IZCLexicon
from Products.ZCTextIndex.interfaces import IZCTextIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.PipelineFactory import element_factory
index_types = {'Okapi BM25 Rank':OkapiIndex,
'Cosine Measure':CosineIndex}
class ZCTextIndex(Persistent, Implicit, SimpleItem):
"""Persistent text index.
"""
implements(IZCTextIndex, IPluggableIndex)
## Magic class attributes ##
meta_type = 'ZCTextIndex'
query_options = ('query',)
manage_options = (
{'label': 'Overview', 'action': 'manage_main'},
)
security = ClassSecurityInfo()
security.declareObjectProtected(manage_zcatalog_indexes)
## Constructor ##
def __init__(self, id, extra=None, caller=None, index_factory=None,
field_name=None, lexicon_id=None):
self.id = id
# Arguments can be passed directly to the constructor or
# via the silly "extra" record.
self._fieldname = field_name or getattr(extra, 'doc_attr', '') or id
self._indexed_attrs = self._fieldname.split(',')
self._indexed_attrs = [ attr.strip()
for attr in self._indexed_attrs if attr ]
lexicon_id = lexicon_id or getattr(extra, 'lexicon_id', '')
lexicon = getattr(caller, lexicon_id, None)
if lexicon is None:
raise LookupError, 'Lexicon "%s" not found' % escape(lexicon_id)
if not ILexicon.providedBy(lexicon):
raise ValueError('Object "%s" does not implement '
'ZCTextIndex Lexicon interface'
% lexicon.getId())
self.lexicon_id = lexicon.getId()
self._v_lexicon = lexicon
if index_factory is None:
if extra.index_type not in index_types.keys():
raise ValueError, 'Invalid index type "%s"' % escape(
extra.index_type)
self._index_factory = index_types[extra.index_type]
self._index_type = extra.index_type
else:
self._index_factory = index_factory
self.index = self._index_factory(aq_base(self.getLexicon()))
## Private Methods ##
security.declarePrivate('getLexicon')
def getLexicon(self):
"""Get the lexicon for this index
"""
if hasattr(aq_base(self), 'lexicon'):
# Fix up old ZCTextIndexes by removing direct lexicon ref
# and changing it to an ID
lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon.getId())
self.lexicon_id = lexicon.getId()
del self.lexicon
if getattr(aq_base(self), 'lexicon_path', None):
# Fix up slightly less old ZCTextIndexes by removing
# the physical path and changing it to an ID.
# There's no need to use a physical path, which otherwise
# makes it difficult to move or rename ZCatalogs.
self.lexicon_id = self.lexicon_path[-1]
del self.lexicon_path
try:
return self._v_lexicon
except AttributeError:
lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon_id)
if not ILexicon.providedBy(lexicon):
raise TypeError('Object "%s" is not a ZCTextIndex Lexicon'
% repr(lexicon))
self._v_lexicon = lexicon
return lexicon
## External methods not in the Pluggable Index API ##
security.declareProtected(search_zcatalog, 'query')
def query(self, query, nbest=10):
"""Return pair (mapping from docids to scores, num results).
The num results is the total number of results before trimming
to the nbest results.
"""
tree = QueryParser(self.getLexicon()).parseQuery(query)
results = tree.executeQuery(self.index)
if results is None:
return [], 0
chooser = NBest(nbest)
chooser.addmany(results.items())
return chooser.getbest(), len(results)
## Pluggable Index APIs ##
def index_object(self, documentId, obj, threshold=None):
"""Wrapper for index_doc() handling indexing of multiple attributes.
Enter the document with the specified documentId in the index
under the terms extracted from the indexed text attributes,
each of which should yield either a string or a list of
strings (Unicode or otherwise) to be passed to index_doc().
"""
# XXX We currently ignore subtransaction threshold
# needed for backward compatibility
try: fields = self._indexed_attrs
except: fields = [ self._fieldname ]
res = 0
all_texts = []
for attr in fields:
text = getattr(obj, attr, None)
if text is None:
continue
if safe_callable(text):
text = text()
if text is None:
continue
if text:
if isinstance(text, (list, tuple, )):
all_texts.extend(text)
else:
all_texts.append(text)
# Check that we're sending only strings
all_texts = filter(lambda text: isinstance(text, basestring), \
all_texts)
if all_texts:
return self.index.index_doc(documentId, all_texts)
return res
def unindex_object(self, docid):
if self.index.has_doc(docid):
self.index.unindex_doc(docid)
def _apply_index(self, request):
"""Apply query specified by request, a mapping containing the query.
Returns two object on success, the resultSet containing the
matching record numbers and a tuple containing the names of
the fields used
Returns None if request is not valid for this index.
"""
record = parseIndexRequest(request, self.id, self.query_options)
if record.keys is None:
return None
query_str = ' '.join(record.keys)
if not query_str:
return None
tree = QueryParser(self.getLexicon()).parseQuery(query_str)
results = tree.executeQuery(self.index)
return results, (self.id,)
def getEntryForObject(self, documentId, default=None):
"""Return the list of words indexed for documentId"""
try:
word_ids = self.index.get_words(documentId)
except KeyError:
return default
get_word = self.getLexicon().get_word
return [get_word(wid) for wid in word_ids]
def uniqueValues(self, name=None, withLengths=0):
raise NotImplementedError
## The ZCatalog Index management screen uses these methods ##
def numObjects(self):
"""Return number of unique words in the index"""
return self.index.length()
def indexSize(self):
"""Return the number of indexes objects """
return self.index.document_count()
def clear(self):
"""reinitialize the index (but not the lexicon)"""
try:
# Remove the cached reference to the lexicon
# So that it is refreshed
del self._v_lexicon
except (AttributeError, KeyError):
pass
self.index = self._index_factory(aq_base(self.getLexicon()))
## User Interface Methods ##
manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
def getIndexSourceNames(self):
"""Return sequence of names of indexed attributes"""
try:
return self._indexed_attrs
except:
return [self._fieldname]
def getIndexType(self):
"""Return index type string"""
return getattr(self, '_index_type', self._index_factory.__name__)
def getLexiconURL(self):
"""Return the url of the lexicon used by the index"""
try:
lex = self.getLexicon()
except (KeyError, AttributeError):
return None
else:
return lex.absolute_url()
InitializeClass(ZCTextIndex)
def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
RESPONSE=None):
"""Add a text index"""
if REQUEST is None:
URL3 = None
else:
URL3 = REQUEST.URL3
return self.manage_addIndex(id, 'ZCTextIndex', extra,
REQUEST, RESPONSE, URL3)
manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
def manage_addLexicon(self, id, title='', elements=[], REQUEST=None):
"""Add ZCTextIndex Lexicon"""
pipeline = []
for el_record in elements:
if not hasattr(el_record, 'name'):
continue # Skip over records that only specify element group
element = element_factory.instantiate(el_record.group, el_record.name)
if element is not None:
if el_record.group == 'Word Splitter':
# I don't like hardcoding this, but its a simple solution
# to get the splitter element first in the pipeline
pipeline.insert(0, element)
else:
pipeline.append(element)
lexicon = PLexicon(id, title, *pipeline)
self._setObject(id, lexicon)
if REQUEST is not None:
return self.manage_main(self, REQUEST, update_menu=1)
# I am borrowing the existing vocabulary permissions for now to avoid
# adding new permissions. This may change when old style Vocabs go away
LexiconQueryPerm = query_vocabulary
LexiconMgmtPerm = manage_vocabulary
class PLexicon(Lexicon, Implicit, SimpleItem):
"""Lexicon for ZCTextIndex.
"""
implements(IZCLexicon)
meta_type = 'ZCTextIndex Lexicon'
manage_options = ({'label':'Overview', 'action':'manage_main'},
{'label':'Query', 'action':'queryLexicon'},
) + SimpleItem.manage_options
security = ClassSecurityInfo()
security.declareObjectProtected(LexiconQueryPerm)
def __init__(self, id, title='', *pipeline):
self.id = str(id)
self.title = str(title)
PLexicon.inheritedAttribute('__init__')(self, *pipeline)
## User Interface Methods ##
def getPipelineNames(self):
"""Return list of names of pipeline element classes"""
return [element.__class__.__name__ for element in self._pipeline]
_queryLexicon = DTMLFile('dtml/queryLexicon', globals())
security.declareProtected(LexiconQueryPerm, 'queryLexicon')
def queryLexicon(self, REQUEST, words=None, page=0, rows=20, cols=4):
"""Lexicon browser/query user interface
"""
if words:
wids = []
for word in self.parseTerms(words):
wids.extend(self.globToWordIds(word))
words = [self.get_word(wid) for wid in wids]
else:
words = self.words()
word_count = len(words)
rows = max(min(rows, 500), 1)
cols = max(min(cols, 12), 1)
page_count = word_count / (rows * cols) + \
(word_count % (rows * cols) > 0)
page = max(min(page, page_count - 1), 0)
start = rows * cols * page
end = min(rows * cols * (page + 1), word_count)
if word_count:
words = list(words[start:end])
else:
words = []
columns = []
i = 0
while i < len(words):
columns.append(words[i:i + rows])
i += rows
info = dict(page=page,
rows=rows,
cols=cols,
start_word=start+1,
end_word=end,
word_count=word_count,
page_count=page_count,
page_range=xrange(page_count),
page_columns=columns)
if REQUEST is not None:
return self._queryLexicon(self, REQUEST, **info)
return info
security.declareProtected(LexiconMgmtPerm, 'manage_main')
manage_main = DTMLFile('dtml/manageLexicon', globals())
InitializeClass(PLexicon)
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""ZCatalog Text Index
Plugin text index for ZCatalog.
"""
from PipelineFactory import element_factory
from Products.ZCTextIndex import ZCTextIndex, HTMLSplitter
def initialize(context):
context.registerClass(
ZCTextIndex.ZCTextIndex,
permission = 'Add Pluggable Index',
constructors = (ZCTextIndex.manage_addZCTextIndexForm,
ZCTextIndex.manage_addZCTextIndex,
getIndexTypes),
icon='www/index.gif',
visibility=None
)
context.registerClass(
ZCTextIndex.PLexicon,
permission = 'Add Vocabularies',
constructors = (ZCTextIndex.manage_addLexiconForm,
ZCTextIndex.manage_addLexicon,
getElementGroups, getElementNames),
icon='www/lexicon.gif'
)
context.registerHelp()
context.registerHelpTitle("Zope Help")
## Functions below are for use in the ZMI constructor forms ##
def getElementGroups(self):
return element_factory.getFactoryGroups()
def getElementNames(self, group):
return element_factory.getFactoryNames(group)
def getIndexTypes(self):
return ZCTextIndex.index_types.keys()
## Allow relevent exceptions to be caught in untrusted code
from AccessControl import ModuleSecurityInfo
ModuleSecurityInfo('Products').declarePublic('ZCTextIndex')
ModuleSecurityInfo('Products.ZCTextIndex').declarePublic('ParseTree')
ModuleSecurityInfo('Products.ZCTextIndex.ParseTree').declarePublic('QueryError')
ModuleSecurityInfo('Products.ZCTextIndex.ParseTree').declarePublic('ParseError')
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex Lexicon',
help_product='ZCTextIndex',
help_topic='Lexicon_Add.stx'
)">
<p class="form-help">
A ZCTextIndex Lexicon processes and stores the words of documents indexed
with a ZCTextIndex. Multiple ZCTextIndexes can share the same lexicon.
</p>
<form action="manage_addLexicon" method="POST">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Title
</div>
</td>
<td align="left" valign="top">
<input type="text" name="title" size="40" />
</td>
</tr>
<dtml-in name="getElementGroups" prefix="group">
<dtml-let elements="getElementNames(group_item)">
<tr>
<td align="left" valign="top">
<div class="form-label">&dtml-group_item;</div>
</td>
<td align="left" valign="top">
<input type="hidden" name="elements.group:records"
value="&dtml-group_item;" />
<dtml-if expr="_.len(elements) > 1">
<select name="elements.name:records">
<dtml-in name="elements">
<option value="&dtml-sequence-item;"
>&dtml-sequence-item;</option>
</dtml-in>
</select>
<dtml-else>
<input type="checkbox" name="elements.name:records"
value="<dtml-var expr="elements[0]" html_quote>" checked />
</dtml-if>
</td>
</tr>
</dtml-let>
</dtml-in>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex',
help_product='ZCTextIndex',
help_topic='ZCTextIndex_Add.stx'
)">
<p class="form-help">
<strong>Text Indexes</strong> break text up into individual words, and
are often referred to as full-text indexes. Text indexes
sort results by score, meaning they return hits in order
from the most relevant to the least relevant.
</p>
<form action="manage_addZCTextIndex" method="post"
enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Indexed attributes
</div></td>
<td align="left" valign="top">
<input type="text" name="extra.doc_attr:record" size="40" />
<em>attribute1,attribute2,...</em> or leave empty
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Ranking Strategy
</div>
</td>
<td align="left" valign="top">
<select name="extra.index_type:record">
<dtml-in name="getIndexTypes">
<option value="&dtml-sequence-item;">&dtml-sequence-item;</option>
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign"top">
<div class="form-label">
Lexicon
</div></td>
<td>
<dtml-in expr="superValues('ZCTextIndex Lexicon')">
<dtml-if sequence-start>
<select name="extra.lexicon_id:record">
</dtml-if>
<option value="&dtml-id;">
&dtml-id; <dtml-var name="title" fmt="(%s)" null html_quote>
</option>
<dtml-if sequence-end>
</select>
</dtml-if>
<dtml-else>
<em>You must create a ZCTextIndex Lexicon first.</em>
</dtml-in>
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
The lexicon processes and stores the words found in objects indexed by one
or more ZCTextIndexes.
</p>
<p class="section-bar">
<span class="form-label">Input Pipeline Stages</span>
</p>
<p class="form-help">
Text indexed through this lexicon is processed by the following pipeline
stages
</p>
<ol class="form-help">
<dtml-in name="getPipelineNames">
<li>&dtml-sequence-item;</li>
</dtml-in>
</ol>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
Name(s) of attribute(s) indexed:
<em><dtml-var "', '.join(getIndexSourceNames())"></em>
</p>
<p class="form-help">
Index type:
<em>&dtml-getIndexType;</em>
</p>
<p class="form-help">
ZCTextIndex Lexicon used:
<dtml-if getLexiconURL>
<a href="&dtml-getLexiconURL;/manage_main"
>&dtml-getLexiconURL;</a>
<dtml-else>
<em>(Lexicon Not Found)</em>
</dtml-if>
</p>
<p class="form-help">
<em>Note:</em> The lexicon assigned to the index cannot be changed. To replace
the existing lexicon, create a new lexicon in the same place and clear the
index. This will make the index use the replacement lexicon.
</p>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
Browse the words in the lexicon or enter the word(s) you are interested in
below. Globbing characters (*, ?) are supported
</p>
<dtml-let words_str="' '.join(REQUEST.get('words',[]))">
<form action="&dtml-URL;">
<p class="form-element">
<span class="form-label">Word(s)</span>
<input name="words:tokens" size="20" value="&dtml-words_str;" />
<input type="submit" value="Query" />
<span class="form-label">&nbsp;Output Columns:</span>
<input name="cols:int" size="2" value="&dtml-cols;" />
<span class="form-label">&nbsp;Rows:</span>
<input name="rows:int" size="2" value="&dtml-rows;" />
</p>
</form>
<hr />
<form action="&dtml-URL;">
<table width="100%" cellpadding="2" cellspacing="0" border="0">
<tr class="section-bar">
<td><span class="form-label">
&dtml-word_count; Words Found<dtml-if word_count>,
Displaying &dtml-start_word;-&dtml-end_word;
</dtml-if>
<dtml-if expr="page_count > 0">
</span></td>
<td align="right"><span class="form-label">
Page:
<select name="page:int" onchange="this.form.submit()">
<dtml-in name="page_range" prefix="page">
<option value="&dtml-page_item;"
<dtml-if expr="page == page_item">
selected
</dtml-if>
>
<dtml-var expr="page_item+1">
</option>
</dtml-in>
</select>
of &dtml-page_count;
<input type="submit" value="Go" />
<input type="hidden" name="cols:int" value="&dtml-cols;" />
<input type="hidden" name="rows:int" value="&dtml-rows;" />
<input type="hidden" name="words:tokens" value="&dtml-words_str;" />
</dtml-if>
</span></td>
</tr>
</table>
</form>
</dtml-let>
<dtml-if name="page_columns">
<table width="100%" cellpadding="0" cellspacing="10" border="0">
<tr>
<dtml-in name="page_columns" prefix="column">
<td align="left" valign="top">
<dtml-var expr="'<br />'.join(column_item)">
</td>
</dtml-in>
</tr>
</table>
</dtml-if>
<dtml-var manage_page_footer>
ZCTextIndex Lexicon - Add: Create a new ZCTextIndex Lexicon
Description
This view allows you to create a new ZCTextIndex Lexicon object.
ZCTextIndex Lexicons store the words indexed by ZCTextIndexes in a
ZCatalog.
Controls
'Id' -- Allows you to specify the id of the ZCTextIndex Lexicon.
'Title' -- Allows you to specify the title of the ZCTextIndex Lexicon.
Pipeline Stages
The remaining controls allow you to select the desired processing
of text to index by selecting pipeline stages.
The default available stages are:
- **Word Splitter** This is the only mandatory stage. The word
splitter breaks the text up into a list of words. Included is a
simple whitespace splitter, and a splitter that removes HTML
tags. The HTML aware splitter gives best results when all of
the incoming content to index is HTML.
- **Stop Words** To conserve space in the vocabulary, and possibly
increase performance, you can select a stop word remover which
subtracts very common or single letter words from the Lexicon.
Bear in mind that you will not be able to search on removed stop
words, and they will also be removed from queries passed to
search ZCTextIndexes using the Lexicon.
- **Case Normalizer** The case normalizer removes case information
from the words in the Lexicon. If case-sensitive searching is
desires, then omit this element from the pipeline.
ZCTextIndex Add: Create a new ZCTextIndex
Description
A ZCTextIndex is an index for performing full text searches over
bodies of text. It includes the following features:
- Boolean query operators with parenthetical grouping
- Globbing (partial word) and phrase matching
- Two selectable relevance scoring algorithms
ZCTextIndex is designed as a replacement for standard TextIndex, and
has several advantages over it.
Controls
'Id' -- The id of the ZCTextIndex, must be unique for this ZCatalog.
'Field Name' -- The name of the field (object attribute) to be indexed.
'Ranking Strategy'
- **Okapi BM25 Rank** A relevance scoring technique that seems to
work well when the document text is considerably longer than the
query string, which is often the case with user specified query
strings.
- **Cosine Measure** A relevance scoring technique derived from the
"*Managing Gigabytes*":http://www.cs.mu.oz.au/mg/ book. It seems
to work best when the queries are similar in size and content to
the text they are searching.
'Lexicon' -- The ZCTextIndex Lexicon to be used by this ZCTextIndex.
Lexicons process and store the words from the text and
help in processing queries. You must define a ZCTextIndex
Lexicon before you can create a ZCTextIndex. Several
ZCTextIndexes can share the same Lexicon if desired.
##############################################################################
#
# Copyright (c) 2005 Zope Foundation and Contributors.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""ZCTextIndex z3 interfaces.
$Id$
"""
from zope.interface import Interface
class IZCTextIndex(Interface):
"""Persistent text index.
"""
class ILexicon(Interface):
"""Object responsible for converting text to word identifiers.
"""
def termToWordIds(text):
"""Return a sequence of ids of the words parsed from the text.
The input text may be either a string or a list of strings.
Parse the text as if they are search terms, and skips words
that aren't in the lexicon.
"""
def sourceToWordIds(text):
"""Return a sequence of ids of the words parsed from the text.
The input text may be either a string or a list of strings.
Parse the text as if they come from a source document, and
creates new word ids for words that aren't (yet) in the
lexicon.
"""
def globToWordIds(pattern):
"""Return a sequence of ids of words matching the pattern.
The argument should be a single word using globbing syntax,
e.g. 'foo*' meaning anything starting with 'foo'.
Return the wids for all words in the lexicon that match the
pattern.
"""
def length():
"""Return the number of unique term in the lexicon.
"""
def get_word(wid):
"""Return the word for the given word id.
Raise KeyError if the word id is not in the lexicon.
"""
def get_wid(word):
"""Return the wird id for the given word.
Return 0 of the word is not in the lexicon.
"""
def parseTerms(text):
"""Pass the text through the pipeline.
Return a list of words, normalized by the pipeline
(e.g. stopwords removed, case normalized etc.).
"""
def isGlob(word):
"""Return true if the word is a globbing pattern.
The word should be one of the words returned by parseTerm().
"""
class IZCLexicon(Interface):
"""Lexicon for ZCTextIndex.
"""
class ISplitter(Interface):
"""A splitter."""
def process(text):
"""Run the splitter over the input text, returning a list of terms.
"""
class IPipelineElement(Interface):
def process(source):
"""Provide a text processing step.
Process a source sequence of words into a result sequence.
"""
def processGlob(source):
"""Process, passing through globbing metacharaters.
This is an optional method; if it is not used, process() is used.
"""
class IPipelineElementFactory(Interface):
"""Class for creating pipeline elements by name"""
def registerFactory(group, name, factory):
"""Registers a pipeline factory by name and element group.
Each name can be registered only once for a given group. Duplicate
registrations will raise a ValueError
"""
def getFactoryGroups():
"""Returns a sorted list of element group names
"""
def getFactoryNames(group):
"""Returns a sorted list of registered pipeline factory names
in the specified element group
"""
def instantiate(group, name):
"""Instantiates a pipeline element by group and name. If name is not
registered raise a KeyError.
"""
class IQueryParseTree(Interface):
"""Interface for parse trees returned by parseQuery()."""
def nodeType():
"""Return the node type.
This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
"""
def getValue():
"""Return a node-type specific value.
For node type: Return:
'AND' a list of parse trees
'OR' a list of parse trees
'NOT' a parse tree
'ATOM' a string (representing a single search term)
'PHRASE' a string (representing a search phrase)
'GLOB' a string (representing a pattern, e.g. "foo*")
"""
def terms():
"""Return a list of all terms in this node, excluding NOT subtrees."""
def executeQuery(index):
"""Execute the query represented by this node against the index.
The index argument must implement the IIndex interface.
Return an IIBucket or IIBTree mapping document ids to scores
(higher scores mean better results).
May raise ParseTree.QueryError.
"""
class IQueryParser(Interface):
"""Interface for Query Parsers."""
def parseQuery(query):
"""Parse a query string.
Return a parse tree (which implements IQueryParseTree).
Some of the query terms may be ignored because they are
stopwords; use getIgnored() to find out which terms were
ignored. But if the entire query consists only of stop words,
or of stopwords and one or more negated terms, an exception is
raised.
May raise ParseTree.ParseError.
"""
def getIgnored():
"""Return the list of ignored terms.
Return the list of terms that were ignored by the most recent
call to parseQuery() because they were stopwords.
If parseQuery() was never called this returns None.
"""
def parseQueryEx(query):
"""Parse a query string.
Return a tuple (tree, ignored) where 'tree' is the parse tree
as returned by parseQuery(), and 'ignored' is a list of
ignored terms as returned by getIgnored().
May raise ParseTree.ParseError.
"""
class IIndex(Interface):
"""Interface for an Index."""
def length():
"""Return the number of words in the index."""
def document_count():
"""Return the number of documents in the index."""
def get_words(docid):
"""Return a list of wordids for the given docid."""
def search(term):
"""Execute a search on a single term given as a string.
Return an IIBTree mapping docid to score, or None if all docs
match due to the lexicon returning no wids for the term (e.g.,
if the term is entirely composed of stopwords).
"""
def search_phrase(phrase):
"""Execute a search on a phrase given as a string.
Return an IIBtree mapping docid to score.
"""
def search_glob(pattern):
"""Execute a pattern search.
The pattern represents a set of words by using * and ?. For
example, "foo*" represents the set of all words in the lexicon
starting with "foo".
Return an IIBTree mapping docid to score.
"""
def query_weight(terms):
"""Return the weight for a set of query terms.
'terms' is a sequence of all terms included in the query,
although not terms with a not. If a term appears more than
once in a query, it should appear more than once in terms.
Nothing is defined about what "weight" means, beyond that the
result is an upper bound on document scores returned for the
query.
"""
def index_doc(docid, text):
"""Add a document with the specified id and text to the index. If a
document by that id already exists, replace its text with the new
text provided
text may be either a string (Unicode or otherwise) or a list
of strings from which to extract the terms under which to
index the source document.
"""
def unindex_doc(docid):
"""Remove the document with the specified id from the index"""
def has_doc(docid):
"""Returns true if docid is an id of a document in the index"""
class INBest(Interface):
"""NBest chooser Interface.
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
def add(item, score):
"""Record that item 'item' has score 'score'. No return value.
The N best-scoring items are remembered, where N was passed to
the constructor. 'item' can by anything. 'score' should be
a number, and larger numbers are considered better.
"""
def addmany(sequence):
"""Like "for item, score in sequence: self.add(item, score)".
This is simply faster than calling add() len(seq) times.
"""
def getbest():
"""Return the (at most) N best-scoring items as a sequence.
The return value is a sequence of 2-tuples, (item, score), with
the largest score first. If .add() has been called fewer than
N times, this sequence will contain fewer than N pairs.
"""
def pop_smallest():
"""Return and remove the (item, score) pair with lowest score.
If len(self) is 0, raise IndexError.
To be cleaer, this is the lowest score among the N best-scoring
seen so far. This is most useful if the capacity of the NBest
object is never exceeded, in which case pop_smallest() allows
using the object as an ordinary smallest-in-first-out priority
queue.
"""
def __len__():
"""Return the number of (item, score) pairs currently known.
This is N (the value passed to the constructor), unless .add()
has been called fewer than N times.
"""
def capacity():
"""Return the maximum number of (item, score) pairs.
This is N (the value passed to the constructor).
"""
/*****************************************************************************
Copyright (c) 2002 Zope Foundation and Contributors.
All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
/* okascore.c
*
* The inner scoring loop of OkapiIndex._search_wids() coded in C.
*
* Example from an indexed Python-Dev archive, where "python" shows up in all
* but 2 of the 19,058 messages. With the Python scoring loop,
*
* query: python
* # results: 10 of 19056 in 534.77 ms
* query: python
* # results: 10 of 19056 in 277.52 ms
*
* The first timing is cold, the second timing from an immediate repeat of
* the same query. With the scoring loop here in C:
*
* query: python
* # results: 10 of 19056 in 380.74 ms -- 40% speedup
* query: python
* # results: 10 of 19056 in 118.96 ms -- 133% speedup
*/
#include "Python.h"
#define K1 1.2
#define B 0.75
#ifndef PyTuple_CheckExact
#define PyTuple_CheckExact PyTuple_Check
#endif
static PyObject *
score(PyObject *self, PyObject *args)
{
/* Believe it or not, floating these common subexpressions "by hand"
gets better code out of MSVC 6. */
const double B_FROM1 = 1.0 - B;
const double K1_PLUS1 = K1 + 1.0;
/* Inputs */
PyObject *result; /* IIBucket result, maps d to score */
PyObject *d2fitems; /* ._wordinfo[t].items(), maps d to f(d, t) */
PyObject *d2len; /* ._docweight, maps d to # words in d */
double idf; /* inverse doc frequency of t */
double meandoclen; /* average number of words in a doc */
int n, i;
if (!PyArg_ParseTuple(args, "OOOdd:score", &result, &d2fitems, &d2len,
&idf, &meandoclen))
return NULL;
idf *= 1024.0; /* float out part of the scaled_int computation */
n = PyObject_Length(d2fitems);
for (i = 0; i < n; ++i) {
PyObject *d_and_f; /* d2f[i], a (d, f) pair */
PyObject *d;
double f;
PyObject *doclen; /* ._docweight[d] */
double lenweight;
double tf;
PyObject *scaled_int;
int status;
d_and_f = PySequence_GetItem(d2fitems, i);
if (d_and_f == NULL)
return NULL;
if (!(PyTuple_CheckExact(d_and_f) &&
PyTuple_GET_SIZE(d_and_f) == 2)) {
PyErr_SetString(PyExc_TypeError,
"d2fitems must produce 2-item tuples");
Py_DECREF(d_and_f);
return NULL;
}
d = PyTuple_GET_ITEM(d_and_f, 0);
f = (double)PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));
doclen = PyObject_GetItem(d2len, d);
if (doclen == NULL) {
Py_DECREF(d_and_f);
return NULL;
}
lenweight = B_FROM1 + B * PyInt_AS_LONG(doclen) / meandoclen;
tf = f * K1_PLUS1 / (f + K1 * lenweight);
scaled_int = PyInt_FromLong((long)(tf * idf + 0.5));
if (scaled_int == NULL)
status = -1;
else
status = PyObject_SetItem(result, d, scaled_int);
Py_DECREF(d_and_f);
Py_DECREF(doclen);
Py_XDECREF(scaled_int);
if (status < 0)
return NULL;
}
Py_INCREF(Py_None);
return Py_None;
}
static char score__doc__[] =
"score(result, d2fitems, d2len, idf, meandoclen)\n"
"\n"
"Do the inner scoring loop for an Okapi index.\n";
static PyMethodDef okascore_functions[] = {
{"score", score, METH_VARARGS, score__doc__},
{NULL}
};
void
initokascore(void)
{
PyObject *m;
m = Py_InitModule3("okascore", okascore_functions,
"inner scoring loop for Okapi rank");
}
/*****************************************************************************
Copyright (c) 2002 Zope Foundation and Contributors.
All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
/* stopper.c
*
* Fast version of the StopWordRemover object.
*/
#include "Python.h"
static PyObject *
stopper_process(PyObject *unused, PyObject *args)
{
PyObject *result = NULL;
PyObject *dict;
PyObject *seq;
int len, i;
if (!PyArg_ParseTuple(args, "O!O:process", &PyDict_Type, &dict, &seq))
return NULL;
seq = PySequence_Fast(seq,
"process() requires a sequence as argument 2");
if (seq == NULL)
return NULL;
result = PyList_New(0);
if (result == NULL)
goto finally;
#if PY_VERSION_HEX >= 0x02020000
/* Only available in Python 2.2 and newer. */
len = PySequence_Fast_GET_SIZE(seq);
#else
len = PyObject_Length(seq);
#endif
for (i = 0; i < len; ++i) {
PyObject *s = PySequence_Fast_GET_ITEM(seq, i);
/*
* PyDict_GetItem() returns NULL if there isn't a matching
* item, but without setting an exception, so this does what
* we want.
*/
if (PyDict_GetItem(dict, s) == NULL) {
if (PyList_Append(result, s) < 0) {
Py_DECREF(result);
result = NULL;
goto finally;
}
}
}
finally:
Py_DECREF(seq);
return result;
}
static PyMethodDef stopper_functions[] = {
{"process", stopper_process, METH_VARARGS,
"process(dict, [str, ...]) --> [str, ...]\n"
"Remove stop words (the keys of dict) from the input list of strings\n"
" to create a new list."},
{NULL}
};
void
initstopper(void)
{
Py_InitModule3("stopper", stopper_functions,
"Fast StopWordRemover implementation.");
}
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Test package."""
#! /usr/bin/env python
import cPickle
import os.path
import sys
from hotshot.log import LogReader
def load_line_info(log):
byline = {}
prevloc = None
for what, place, tdelta in log:
if tdelta > 0:
t, nhits = byline.get(prevloc, (0, 0))
byline[prevloc] = (tdelta + t), (nhits + 1)
prevloc = place
return byline
def basename(path, cache={}):
try:
return cache[path]
except KeyError:
fn = os.path.split(path)[1]
cache[path] = fn
return fn
def print_results(results):
for info, place in results:
if place is None:
# This is the startup time for the profiler, and only
# occurs at the very beginning. Just ignore it, since it
# corresponds to frame setup of the outermost call, not
# anything that's actually interesting.
continue
filename, line, funcname = place
print '%8d %8d' % info, basename(filename), line
def annotate_results(results):
files = {}
for stats, place in results:
if not place:
continue
time, hits = stats
file, line, func = place
l = files.get(file)
if l is None:
l = files[file] = []
l.append((line, hits, time))
order = files.keys()
order.sort()
for k in order:
if os.path.exists(k):
v = files[k]
v.sort()
annotate(k, v)
def annotate(file, lines):
print "-" * 60
print file
print "-" * 60
f = open(file)
i = 1
match = lines[0][0]
for line in f:
if match == i:
print "%6d %8d " % lines[0][1:], line,
del lines[0]
if lines:
match = lines[0][0]
else:
match = None
else:
print " " * 16, line,
i += 1
print
def get_cache_name(filename):
d, fn = os.path.split(filename)
cache_dir = os.path.join(d, '.hs-tool')
cache_file = os.path.join(cache_dir, fn)
return cache_dir, cache_file
def cache_results(filename, results):
cache_dir, cache_file = get_cache_name(filename)
if not os.path.exists(cache_dir):
os.mkdir(cache_dir)
fp = open(cache_file, 'wb')
try:
cPickle.dump(results, fp, 1)
finally:
fp.close()
def main(filename, annotate):
cache_dir, cache_file = get_cache_name(filename)
if ( os.path.isfile(cache_file)
and os.path.getmtime(cache_file) > os.path.getmtime(filename)):
# cached data is up-to-date:
fp = open(cache_file, 'rb')
results = cPickle.load(fp)
fp.close()
else:
log = LogReader(filename)
byline = load_line_info(log)
# Sort
results = [(v, k) for k, v in byline.items()]
results.sort()
cache_results(filename, results)
if annotate:
annotate_results(results)
else:
print_results(results)
if __name__ == "__main__":
import getopt
annotate_p = 0
opts, args = getopt.getopt(sys.argv[1:], 'A')
for o, v in opts:
if o == '-A':
annotate_p = 1
if args:
filename, = args
else:
filename = "profile.dat"
main(filename, annotate_p)
#! /usr/bin/env python
"""Index a collection of HTML files on the filesystem.
usage: indexhtml.py [options] dir
Will create an index of all files in dir or its subdirectories.
options:
-f data.fs -- the path to the filestorage datafile
"""
# XXX: Products.PluginIndexes.TextIndex no longer exists
from __future__ import nested_scopes
import os
from time import clock
import ZODB
from ZODB.FileStorage import FileStorage
from BTrees.IOBTree import IOBTree
import transaction
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
from Products.ZCTextIndex.Lexicon import Lexicon, StopWordRemover
def make_zc_index():
# there's an elaborate dance necessary to construct an index
class Struct:
pass
extra = Struct()
extra.doc_attr = "read"
extra.lexicon_id = "lexicon"
caller = Struct()
caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
return ZCTextIndex("read", extra, caller)
# XXX make a splitter more like the HTMLSplitter for TextIndex
# signature is
# Splitter(string, stop_words, encoding,
# singlechar, indexnumbers, casefolding)
class MySplitter:
def __init__(self):
self._v_splitter = HTMLWordSplitter()
def __call__(self, text, stopdict, *args, **kwargs):
words = self._v_splitter._split(text)
def lookup(w):
return stopdict.get(w, w)
return filter(None, map(lookup, words))
#def make_old_index():
# from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
# from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
# from Products.ZCTextIndex.StopDict import get_stopdict
#
# l = Lexicon(get_stopdict())
# l.SplitterFunc = MySplitter()
# return TextIndex("read", lexicon=l)
def main(db, root, dir):
rt["index"] = index = INDEX()
rt["files"] = paths = IOBTree()
transaction.commit()
zodb_time = 0.0
pack_time = 0.0
files = [os.path.join(dir, file) for file in os.listdir(dir)]
docid = 0
t0 = clock()
for file in files:
if os.path.isdir(file):
files += [os.path.join(file, sub) for sub in os.listdir(file)]
else:
if not file.endswith(".html"):
continue
docid += 1
if LIMIT is not None and docid > LIMIT:
break
if VERBOSE:
print "%5d" % docid, file
f = open(file, "rb")
paths[docid] = file
index.index_object(docid, f)
f.close()
if docid % TXN_INTERVAL == 0:
z0 = clock()
transaction.commit()
z1 = clock()
zodb_time += z1 - z0
if VERBOSE:
print "commit took", z1 - z0, zodb_time
if docid % PACK_INTERVAL == 0:
p0 = clock()
db.pack()
p1 = clock()
zodb_time += p1 - p0
pack_time += p1 - p0
if VERBOSE:
print "pack took", p1 - p0, pack_time
z0 = clock()
transaction.commit()
z1 = t1 = clock()
total_time = t1 - t0
zodb_time += z1 - z0
if VERBOSE:
print "Total index time", total_time
print "Non-pack time", total_time - pack_time
print "Non-ZODB time", total_time - zodb_time
if __name__ == "__main__":
import sys
import getopt
VERBOSE = 0
FSPATH = "Data.fs"
TXN_INTERVAL = 100
PACK_INTERVAL = 500
LIMIT = None
INDEX = make_zc_index
try:
opts, args = getopt.getopt(sys.argv[1:], 'vf:t:p:n:T')
except getopt.error, msg:
print msg
print __doc__
sys.exit(2)
for o, v in opts:
if o == '-v':
VERBOSE += 1
if o == '-f':
FSPATH = v
if o == '-t':
TXN_INTERVAL = int(v)
if o == '-p':
PACK_INTERVAL = int(v)
if o == '-n':
LIMIT = int(v)
# if o == '-T':
# INDEX = make_old_index
if len(args) != 1:
print "Expected on argument"
print __doc__
sys.exit(2)
dir = args[0]
fs = FileStorage(FSPATH)
db = ZODB.DB(fs)
cn = db.open()
rt = cn.root()
dir = os.path.join(os.getcwd(), dir)
print dir
main(db, rt, dir)
cn.close()
fs.close()
"""Test an index with a Unix mailbox file.
usage: python mailtest.py [options] <data.fs>
options:
-v -- verbose
Index Generation
-i mailbox
-n NNN -- max number of messages to read from mailbox
-t NNN -- commit a transaction every NNN messages (default: 1)
-p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
-p 0 -- don't pack at all
-x -- exclude the message text from the data.fs
Queries
-q query
-b NNN -- return the NNN best matches (default: 10)
-c NNN -- context; if -v, show the first NNN lines of results (default: 5)
The script either indexes or queries depending on whether -q or -i is
passed as an option.
For -i mailbox, the script reads mail messages from the mailbox and
indexes them. It indexes one message at a time, then commits the
transaction.
For -q query, it performs a query on an existing index.
If both are specified, the index is performed first.
You can also interact with the index after it is completed. Load the
index from the database:
import ZODB
from ZODB.FileStorage import FileStorage
fs = FileStorage(<data.fs>
db = ZODB.DB(fs)
index = cn.open().root()["index"]
index.search("python AND unicode")
"""
import ZODB
import ZODB.FileStorage
import transaction
from Products.ZCTextIndex.Lexicon import \
Lexicon, CaseNormalizer, Splitter, StopWordRemover
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from BTrees.IOBTree import IOBTree
from Products.ZCTextIndex.QueryParser import QueryParser
import sys
import mailbox
import time
def usage(msg):
print msg
print __doc__
sys.exit(2)
class Message:
total_bytes = 0
def __init__(self, msg):
subject = msg.getheader('subject', '')
author = msg.getheader('from', '')
if author:
summary = "%s (%s)\n" % (subject, author)
else:
summary = "%s\n" % subject
self.text = summary + msg.fp.read()
Message.total_bytes += len(self.text)
class Extra:
pass
def index(rt, mboxfile, db, profiler):
global NUM
idx_time = 0
pack_time = 0
start_time = time.time()
lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
extra = Extra()
extra.lexicon_id = 'lexicon'
extra.doc_attr = 'text'
extra.index_type = 'Okapi BM25 Rank'
caller = Extra()
caller.lexicon = lexicon
rt["index"] = idx = ZCTextIndex("index", extra, caller)
if not EXCLUDE_TEXT:
rt["documents"] = docs = IOBTree()
else:
docs = None
transaction.commit()
mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
if VERBOSE:
print "opened", mboxfile
if not NUM:
NUM = sys.maxint
if profiler:
itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
else:
itime, ptime, i = indexmbox(mbox, idx, docs, db)
idx_time += itime
pack_time += ptime
transaction.commit()
if PACK_INTERVAL and i % PACK_INTERVAL != 0:
if VERBOSE >= 2:
print "packing one last time..."
p0 = time.clock()
db.pack(time.time())
p1 = time.clock()
if VERBOSE:
print "pack took %s sec" % (p1 - p0)
pack_time += p1 - p0
if VERBOSE:
finish_time = time.time()
print
print "Index time", round(idx_time / 60, 3), "minutes"
print "Pack time", round(pack_time / 60, 3), "minutes"
print "Index bytes", Message.total_bytes
rate = (Message.total_bytes / idx_time) / 1024
print "Index rate %.2f KB/sec" % rate
print "Indexing began", time.ctime(start_time)
print "Indexing ended", time.ctime(finish_time)
print "Wall clock minutes", round((finish_time - start_time)/60, 3)
def indexmbox(mbox, idx, docs, db):
idx_time = 0
pack_time = 0
i = 0
while i < NUM:
_msg = mbox.next()
if _msg is None:
break
i += 1
msg = Message(_msg)
if VERBOSE >= 2:
print "indexing msg", i
i0 = time.clock()
idx.index_object(i, msg)
if not EXCLUDE_TEXT:
docs[i] = msg
if i % TXN_SIZE == 0:
transaction.commit()
i1 = time.clock()
idx_time += i1 - i0
if VERBOSE and i % 50 == 0:
print i, "messages indexed"
print "cache size", db.cacheSize()
if PACK_INTERVAL and i % PACK_INTERVAL == 0:
if VERBOSE >= 2:
print "packing..."
p0 = time.clock()
db.pack(time.time())
p1 = time.clock()
if VERBOSE:
print "pack took %s sec" % (p1 - p0)
pack_time += p1 - p0
return idx_time, pack_time, i
def query(rt, query_str, profiler):
idx = rt["index"]
docs = rt["documents"]
start = time.clock()
if profiler is None:
results, num_results = idx.query(query_str, BEST)
else:
if WARM_CACHE:
print "Warming the cache..."
idx.query(query_str, BEST)
start = time.clock()
results, num_results = profiler.runcall(idx.query, query_str, BEST)
elapsed = time.clock() - start
print "query:", query_str
print "# results:", len(results), "of", num_results, \
"in %.2f ms" % (elapsed * 1000)
tree = QueryParser(idx.lexicon).parseQuery(query_str)
qw = idx.index.query_weight(tree.terms())
for docid, score in results:
scaled = 100.0 * score / qw
print "docid %7d score %6d scaled %5.2f%%" % (docid, score, scaled)
if VERBOSE:
msg = docs[docid]
ctx = msg.text.split("\n", CONTEXT)
del ctx[-1]
print "-" * 60
print "message:"
for l in ctx:
print l
print "-" * 60
def main(fs_path, mbox_path, query_str, profiler):
f = ZODB.FileStorage.FileStorage(fs_path)
db = ZODB.DB(f, cache_size=CACHE_SIZE)
cn = db.open()
rt = cn.root()
if mbox_path is not None:
index(rt, mbox_path, db, profiler)
if query_str is not None:
query(rt, query_str, profiler)
cn.close()
db.close()
f.close()
if __name__ == "__main__":
import getopt
NUM = 0
VERBOSE = 0
PACK_INTERVAL = 500
EXCLUDE_TEXT = 0
CACHE_SIZE = 10000
TXN_SIZE = 1
BEST = 10
CONTEXT = 5
WARM_CACHE = 0
query_str = None
mbox_path = None
profile = None
old_profile = None
try:
opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:c:xt:w',
['profile=', 'old-profile='])
except getopt.error, msg:
usage(msg)
if len(args) != 1:
usage("exactly 1 filename argument required")
for o, v in opts:
if o == '-n':
NUM = int(v)
elif o == '-v':
VERBOSE += 1
elif o == '-p':
PACK_INTERVAL = int(v)
elif o == '-q':
query_str = v
elif o == '-i':
mbox_path = v
elif o == '-b':
BEST = int(v)
elif o == '-x':
EXCLUDE_TEXT = 1
elif o == '-t':
TXN_SIZE = int(v)
elif o == '-c':
CONTEXT = int(v)
elif o == '-w':
WARM_CACHE = 1
elif o == '--profile':
profile = v
elif o == '--old-profile':
old_profile = v
fs_path, = args
if profile:
import hotshot
profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
elif old_profile:
import profile
profiler = profile.Profile()
else:
profiler = None
main(fs_path, mbox_path, query_str, profiler)
if profile:
profiler.close()
elif old_profile:
import pstats
profiler.dump_stats(old_profile)
stats = pstats.Stats(old_profile)
stats.strip_dirs().sort_stats('time').print_stats(20)
"""MH mail indexer.
To index messages from a single folder (messages defaults to 'all'):
mhindex.py [options] -u +folder [messages ...]
To bulk index all messages from several folders:
mhindex.py [options] -b folder ...; the folder name ALL means all folders.
To execute a single query:
mhindex.py [options] query
To enter interactive query mode:
mhindex.py [options]
Common options:
-d FILE -- specify the Data.fs to use (default ~/.Data.fs)
-w -- dump the word list in alphabetical order and exit
-W -- dump the word list ordered by word id and exit
Indexing options:
-O -- do a prescan on the data to compute optimal word id assignments;
this is only useful the first time the Data.fs is used
-t N -- commit a transaction after every N messages (default 20000)
-p N -- pack after every N commits (by default no packing is done)
Querying options:
-m N -- show at most N matching lines from the message (default 3)
-n N -- show the N best matching messages (default 3)
"""
import os
import re
import sys
import time
import mhlib
import getopt
import traceback
from StringIO import StringIO
from stat import ST_MTIME
DATAFS = "~/.Data.fs"
ZOPECODE = "~/projects/Zope/lib/python"
sys.path.append(os.path.expanduser(ZOPECODE))
from ZODB import DB
from ZODB.FileStorage import FileStorage
from Persistence import Persistent
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.IIBTree import IIBTree
import transaction
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.StopDict import get_stopdict
NBEST = 3
MAXLINES = 3
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "bd:fhm:n:Op:t:uwW")
except getopt.error, msg:
print msg
print "use -h for help"
return 2
update = 0
bulk = 0
optimize = 0
nbest = NBEST
maxlines = MAXLINES
datafs = os.path.expanduser(DATAFS)
pack = 0
trans = 20000
dumpwords = dumpwids = dumpfreqs = 0
for o, a in opts:
if o == "-b":
bulk = 1
if o == "-d":
datafs = a
if o == "-f":
dumpfreqs = 1
if o == "-h":
print __doc__
return
if o == "-m":
maxlines = int(a)
if o == "-n":
nbest = int(a)
if o == "-O":
optimize = 1
if o == "-p":
pack = int(a)
if o == "-t":
trans = int(a)
if o == "-u":
update = 1
if o == "-w":
dumpwords = 1
if o == "-W":
dumpwids = 1
ix = Indexer(datafs, writable=update or bulk, trans=trans, pack=pack)
if dumpfreqs:
ix.dumpfreqs()
if dumpwords:
ix.dumpwords()
if dumpwids:
ix.dumpwids()
if dumpwords or dumpwids or dumpfreqs:
return
if bulk:
if optimize:
ix.optimize(args)
ix.bulkupdate(args)
elif update:
ix.update(args)
elif args:
for i in range(len(args)):
a = args[i]
if " " in a:
if a[0] == "-":
args[i] = '-"' + a[1:] + '"'
else:
args[i] = '"' + a + '"'
ix.query(" ".join(args), nbest, maxlines)
else:
ix.interact(nbest)
if pack:
ix.pack()
class Indexer:
filestorage = database = connection = root = None
def __init__(self, datafs, writable=0, trans=0, pack=0):
self.trans_limit = trans
self.pack_limit = pack
self.trans_count = 0
self.pack_count = 0
self.stopdict = get_stopdict()
self.mh = mhlib.MH()
self.filestorage = FileStorage(datafs, read_only=(not writable))
self.database = DB(self.filestorage)
self.connection = self.database.open()
self.root = self.connection.root()
try:
self.index = self.root["index"]
except KeyError:
self.index = self.root["index"] = TextIndex()
try:
self.docpaths = self.root["docpaths"]
except KeyError:
self.docpaths = self.root["docpaths"] = IOBTree()
try:
self.doctimes = self.root["doctimes"]
except KeyError:
self.doctimes = self.root["doctimes"] = IIBTree()
try:
self.watchfolders = self.root["watchfolders"]
except KeyError:
self.watchfolders = self.root["watchfolders"] = {}
self.path2docid = OIBTree()
for docid in self.docpaths.keys():
path = self.docpaths[docid]
self.path2docid[path] = docid
try:
self.maxdocid = max(self.docpaths.keys())
except ValueError:
self.maxdocid = 0
print len(self.docpaths), "Document ids"
print len(self.path2docid), "Pathnames"
print self.index.lexicon.length(), "Words"
def dumpfreqs(self):
lexicon = self.index.lexicon
index = self.index.index
assert isinstance(index, OkapiIndex)
L = []
for wid in lexicon.wids():
freq = 0
for f in index._wordinfo.get(wid, {}).values():
freq += f
L.append((freq, wid, lexicon.get_word(wid)))
L.sort()
L.reverse()
for freq, wid, word in L:
print "%10d %10d %s" % (wid, freq, word)
def dumpwids(self):
lexicon = self.index.lexicon
index = self.index.index
assert isinstance(index, OkapiIndex)
for wid in lexicon.wids():
freq = 0
for f in index._wordinfo.get(wid, {}).values():
freq += f
print "%10d %10d %s" % (wid, freq, lexicon.get_word(wid))
def dumpwords(self):
lexicon = self.index.lexicon
index = self.index.index
assert isinstance(index, OkapiIndex)
for word in lexicon.words():
wid = lexicon.get_wid(word)
freq = 0
for f in index._wordinfo.get(wid, {}).values():
freq += f
print "%10d %10d %s" % (wid, freq, word)
def close(self):
self.root = None
if self.connection is not None:
self.connection.close()
self.connection = None
if self.database is not None:
self.database.close()
self.database = None
if self.filestorage is not None:
self.filestorage.close()
self.filestorage = None
def interact(self, nbest=NBEST, maxlines=MAXLINES):
try:
import readline
except ImportError:
pass
text = ""
top = 0
results = []
while 1:
try:
line = raw_input("Query: ")
except EOFError:
print "\nBye."
break
line = line.strip()
if line.startswith("/"):
self.specialcommand(line, results, top - nbest)
continue
if line:
text = line
top = 0
else:
if not text:
continue
try:
results, n = self.timequery(text, top + nbest)
except KeyboardInterrupt:
raise
except:
reportexc()
text = ""
continue
if len(results) <= top:
if not n:
print "No hits for %r." % text
else:
print "No more hits for %r." % text
text = ""
continue
print "[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
print "for query %s]" % repr(text)
self.formatresults(text, results, maxlines, top, top+nbest)
top += nbest
def specialcommand(self, line, results, first):
assert line.startswith("/")
line = line[1:]
if not line:
n = first
else:
try:
n = int(line) - 1
except:
print "Huh?"
return
if n < 0 or n >= len(results):
print "Out of range"
return
docid, score = results[n]
path = self.docpaths[docid]
i = path.rfind("/")
assert i > 0
folder = path[:i]
n = path[i+1:]
cmd = "show +%s %s" % (folder, n)
if os.getenv("DISPLAY"):
os.system("xterm -e sh -c '%s | less' &" % cmd)
else:
os.system(cmd)
def query(self, text, nbest=NBEST, maxlines=MAXLINES):
results, n = self.timequery(text, nbest)
if not n:
print "No hits for %r." % text
return
print "[Results 1-%d from %d]" % (len(results), n)
self.formatresults(text, results, maxlines)
def timequery(self, text, nbest):
t0 = time.time()
c0 = time.clock()
results, n = self.index.query(text, nbest)
t1 = time.time()
c1 = time.clock()
print "[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0)
return results, n
def formatresults(self, text, results, maxlines=MAXLINES,
lo=0, hi=sys.maxint):
stop = self.stopdict.has_key
words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
pattern = r"\b(" + "|".join(words) + r")\b"
pattern = pattern.replace("*", ".*") # glob -> re syntax
prog = re.compile(pattern, re.IGNORECASE)
print '='*70
rank = lo
qw = self.index.query_weight(text)
for docid, score in results[lo:hi]:
rank += 1
path = self.docpaths[docid]
score = 100.0*score/qw
print "Rank: %d Score: %d%% File: %s" % (rank, score, path)
path = os.path.join(self.mh.getpath(), path)
try:
fp = open(path)
except (IOError, OSError), msg:
print "Can't open:", msg
continue
msg = mhlib.Message("<folder>", 0, fp)
for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
h = msg.getheader(header)
if h:
print "%-8s %s" % (header+":", h)
text = self.getmessagetext(msg)
if text:
print
nleft = maxlines
for part in text:
for line in part.splitlines():
if prog.search(line):
print line
nleft -= 1
if nleft <= 0:
break
if nleft <= 0:
break
print '-'*70
def update(self, args):
folder = None
seqs = []
for arg in args:
if arg.startswith("+"):
if folder is None:
folder = arg[1:]
else:
print "only one folder at a time"
return
else:
seqs.append(arg)
if not folder:
folder = self.mh.getcontext()
if not seqs:
seqs = ['all']
try:
f = self.mh.openfolder(folder)
except mhlib.Error, msg:
print msg
return
dict = {}
for seq in seqs:
try:
nums = f.parsesequence(seq)
except mhlib.Error, msg:
print msg or "unparsable message sequence: %s" % `seq`
return
for n in nums:
dict[n] = n
msgs = dict.keys()
msgs.sort()
self.updatefolder(f, msgs)
self.commit()
def optimize(self, args):
uniqwords = {}
for folder in args:
if folder.startswith("+"):
folder = folder[1:]
print "\nOPTIMIZE FOLDER", folder
try:
f = self.mh.openfolder(folder)
except mhlib.Error, msg:
print msg
continue
self.prescan(f, f.listmessages(), uniqwords)
L = [(uniqwords[word], word) for word in uniqwords.keys()]
L.sort()
L.reverse()
for i in range(100):
print "%3d. %6d %s" % ((i+1,) + L[i])
self.index.lexicon.sourceToWordIds([word for (count, word) in L])
def prescan(self, f, msgs, uniqwords):
pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
for n in msgs:
print "prescanning", n
m = f.openmessage(n)
text = self.getmessagetext(m, f.name)
for p in pipeline:
text = p.process(text)
for word in text:
uniqwords[word] = uniqwords.get(word, 0) + 1
def bulkupdate(self, args):
if not args:
print "No folders specified; use ALL to bulk-index all folders"
return
if "ALL" in args:
i = args.index("ALL")
args[i:i+1] = self.mh.listfolders()
for folder in args:
if folder.startswith("+"):
folder = folder[1:]
print "\nFOLDER", folder
try:
f = self.mh.openfolder(folder)
except mhlib.Error, msg:
print msg
continue
self.updatefolder(f, f.listmessages())
print "Total", len(self.docpaths)
self.commit()
print len(self.index.lexicon._words), "unique words."
def updatefolder(self, f, msgs):
self.watchfolders[f.name] = self.getmtime(f.name)
for n in msgs:
path = "%s/%s" % (f.name, n)
docid = self.path2docid.get(path, 0)
if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
print "unchanged", docid, path
continue
docid = self.newdocid(path)
try:
m = f.openmessage(n)
except IOError:
print "disappeared", docid, path
self.unindexpath(path)
continue
text = self.getmessagetext(m, f.name)
if not text:
self.unindexpath(path)
continue
print "indexing", docid, path
self.index.index_text(docid, text)
self.maycommit()
# Remove messages from the folder that no longer exist
for path in list(self.path2docid.keys(f.name)):
if not path.startswith(f.name + "/"):
break
if self.getmtime(path) == 0:
self.unindexpath(path)
print "done."
def unindexpath(self, path):
if self.path2docid.has_key(path):
docid = self.path2docid[path]
print "unindexing", docid, path
del self.docpaths[docid]
del self.doctimes[docid]
del self.path2docid[path]
try:
self.index.unindex(docid)
except KeyError, msg:
print "KeyError", msg
self.maycommit()
def getmessagetext(self, m, name=None):
L = []
if name:
L.append("_folder " + name) # To restrict search to a folder
self.getheaders(m, L)
try:
self.getmsgparts(m, L, 0)
except KeyboardInterrupt:
raise
except:
print "(getmsgparts failed:)"
reportexc()
return L
def getmsgparts(self, m, L, level):
ctype = m.gettype()
if level or ctype != "text/plain":
print ". "*level + str(ctype)
if ctype == "text/plain":
L.append(m.getbodytext())
elif ctype in ("multipart/alternative", "multipart/mixed"):
for part in m.getbodyparts():
self.getmsgparts(part, L, level+1)
elif ctype == "message/rfc822":
f = StringIO(m.getbodytext())
m = mhlib.Message("<folder>", 0, f)
self.getheaders(m, L)
self.getmsgparts(m, L, level+1)
def getheaders(self, m, L):
H = []
for key in "from", "to", "cc", "bcc", "subject":
value = m.get(key)
if value:
H.append(value)
if H:
L.append("\n".join(H))
def newdocid(self, path):
docid = self.path2docid.get(path)
if docid is not None:
self.doctimes[docid] = self.getmtime(path)
return docid
docid = self.maxdocid + 1
self.maxdocid = docid
self.docpaths[docid] = path
self.doctimes[docid] = self.getmtime(path)
self.path2docid[path] = docid
return docid
def getmtime(self, path):
path = os.path.join(self.mh.getpath(), path)
try:
st = os.stat(path)
except os.error, msg:
return 0
return int(st[ST_MTIME])
def maycommit(self):
self.trans_count += 1
if self.trans_count >= self.trans_limit > 0:
self.commit()
def commit(self):
if self.trans_count > 0:
print "committing..."
transaction.commit()
self.trans_count = 0
self.pack_count += 1
if self.pack_count >= self.pack_limit > 0:
self.pack()
def pack(self):
if self.pack_count > 0:
print "packing..."
self.database.pack()
self.pack_count = 0
class TextIndex(Persistent):
def __init__(self):
self.lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
self.index = OkapiIndex(self.lexicon)
def index_text(self, docid, text):
self.index.index_doc(docid, text)
self._p_changed = 1 # XXX
def unindex(self, docid):
self.index.unindex_doc(docid)
self._p_changed = 1 # XXX
def query(self, query, nbest=10):
# returns a total hit count and a mapping from docids to scores
parser = QueryParser(self.lexicon)
tree = parser.parseQuery(query)
results = tree.executeQuery(self.index)
if results is None:
return [], 0
chooser = NBest(nbest)
chooser.addmany(results.items())
return chooser.getbest(), len(results)
def query_weight(self, query):
parser = QueryParser(self.lexicon)
tree = parser.parseQuery(query)
terms = tree.terms()
return self.index.query_weight(terms)
def reportexc():
traceback.print_exc()
if __name__ == "__main__":
sys.exit(main())
Search results for python.org
query: "nested recursive functions"
Ultraseek
83% http://www.python.org/dev/doc/maint22/whatsnew/node9.html
43% http://python.sourceforge.net/peps/pep-0227.txt
37% http://www.python.org/dev/doc/maint22/lib/module-pprint.html
37% http://www.python.org/doc/1.5.2p1/lib/module-pprint.html
37% http://www.python.org/doc/2.0.1/lib/module-pprint.html
37% http://www.python.org/doc/1.5.2/lib/module-pprint.html
37% http://www.python.org/doc/1.6/lib/module-pprint.html
37% http://www.python.org/doc/1.5.1/lib/module-pprint.html
37% http://www.python.org/doc/1.5/lib/node54.html
35% http://www.python.org/workshops/2000-01/proceedings/papers/tismers/
spcpaper.htm
Google
www.python.org/peps/pep-0227.html
www.python.org/dev/doc/maint22/whatsnew/node9.html
www.python.org/cgi-bin/faqw.py?req=recent&days=28
www.python.org/peps/pep-0255.html
www.python.org/doc/current/lib/node558.html
www.python.org/doc/1.5.2/lib/node52.html
www.python.org/workshops/2000-01/proceedings/papers/tismers/spcpaper.pdf
www.python.org/2.0/
www.python.org/2.0.1/NEWS.txt
www.python.org/peps/pep-0266.html
query: "explicit better than implicit"
Ultraseek:
http://www.python.org/dev/doc/maint22/lib/differ-examples.html
http://www.python.org/doc/essays/ppt/python10/py10keynote.ppt
http://www.python.org/dev/culture.html
http://www.python.org/doc/Humor.html
http://www.python.org/dev/doc/maint22/ref/implicit-joining.html
http://www.python.org/dev/doc/maint22/ref/explicit-joining.html
ttp://www.python.org/workshops/2000-01/proceedings/papers/
tigges-wyvill/tigges-wyvill.html
http://www.python.org/peps/pep-0285.txt
http://www.python.org/peps/pep-0285.html
Google:
www.python.org/doc/current/lib/differ-examples.html
www.python.org/doc/essays/ppt/python10/py10keynote.pdf
www.python.org/dev/culture.html
www.python.org/doc/essays/ppt/python10/py10keynote.ppt
www.python.org/peps/pep-0285.html
www.python.org/peps/pep-0287.html
www.python.org/peps/pep-0287.txt
www.python.org/peps/pep-0209.html
www.python.org/~guido/Proposal.txt
www.python.org/~guido/Proposal.doc
query: "build hpux"
Ultraseek:
51% http://www.python.org/1.5/patches-1.5.1/configure.2.txt
47% http://www.python.org/dev/doc/devel/whatsnew/node5.html
43% http://www.python.org/1.5/patches-1.5.1/
43% http://www.python.org/2.0/
41% http://www.python.org/peps/pep-0243.html
41 % http://www.python.org/ftp/python/binaries-1.3/ python-HP-UX-A.09.05-full.README
39% http://www.python.org/ftp/python/binaries-1.3/ python-hppa1.1-hp-hpux10.10.README
39% http://www.python.org/ftp/python/binaries-1.3/ python-hppa1.1-hp-hpux10.10.README
35% http://www.python.org/peps/pep-0243.txt
35% http://www.python.org/2.0.1/NEWS.txt
35% http://python.sourceforge.net/peps/pep-0243.txt
Google:
www.python.org/2.1.1/NEWS.txt
www.python.org/1.5/NEWS-152b2.txt
query: "cannot create 'method-wrapper' instances"
Ultraseek
http://python.sourceforge.net/peps/pep-0007.txt
http://www.python.org/workshops/1994-11/C++Python.txt
http://www.python.org/peps/pep-0231.txt
http://www.python.org/peps/pep-0231.html
http://python.sourceforge.net/peps/pep-0231.txt
http://www.python.org/dev/doc/maint22/lib/node383.html
http://www.python.org/workshops/1994-11/BuiltInClasses/ BuiltInClasses_7.html
http://www.python.org/workshops/1994-11/persistency.html
http://www.python.org/dev/doc/maint22/lib/organizing-tests.html
http://www.python.org/dev/doc/maint22/lib/module-SocketServer.html
Google:
no matches
query: "extension module C++"
http://www.python.org/dev/doc/devel/ext/building.html
http://www.python.org/dev/doc/maint22/ext/module-defn-options.html
http://www.python.org/dev/doc/maint21/ext/building-on-unix.html
http://www.python.org/doc/1.6/ext/building-on-unix.html
http://www.python.org/sigs/c++-sig/
http://www.python.org/dev/doc/maint22/ext/intro.html
http://www.python.org/dev/doc/maint22/ext/cplusplus.html
http://www.python.org/doc/1.4/ext/node18.html
http://www.python.org/doc/1.6/ext/building-on-windows.html
http://www.python.org/doc/1.6/dist/node12.html
Google:
www.python.org/doc/current/ext/building-on-unix.html
www.python.org/doc/current/ext/intro.html
www.python.org/doc/current/ext/ext.html
www.python.org/sigs/c++-sig/
www.python.org/doc/current/ext/ module-defn-options.html
www.python.org/doc/1.5.2p2/ext/building-on-unix.html
www.python.org/doc/1.5.2p2/ext/contents.html
www.python.org/doc/1.5.2p2/ext/ext.html
www.python.org/doc/2.1.2/ext/building-on-unix.html
www.python.org/doc/1.5.1/ext/intro.html
# XXX: Products.PluginIndexes.TextIndex no longer exists
import os
from time import clock
import ZODB
from ZODB.FileStorage import FileStorage
QUERIES = ["nested recursive functions",
"explicit better than implicit",
"build hpux",
"cannot create 'method-wrapper' instances",
"extension module C++",
"class method",
"instance variable",
"articulate information",
"import default files",
"gopher ftp http",
"documentation",
]
def path2url(p):
# convert the paths to a python.org URL
# hack: only works for the way Jeremy indexed his copy of python.org
marker = "www.python.org/."
i = p.find(marker)
if i == -1:
return p
i += len(marker)
return "http://www.python.org" + p[i:]
#from Products.PluginIndexes.TextIndex.TextIndex import And, Or
from Products.ZCTextIndex.tests.indexhtml import MySplitter
from Products.ZCTextIndex.NBest import NBest
def main(rt):
index = rt["index"]
files = rt["files"]
times = {}
ITERS = range(50)
for i in range(11):
for q in QUERIES:
terms = q.split()
for c in " OR ", " AND ":
query = c.join(terms)
t0 = clock()
if TEXTINDEX:
if c == " OR ":
op = Or
else:
op = And
_q = " ".join(terms)
for _ in ITERS:
b = index.query(_q, op).bucket()
num = len(b)
chooser = NBest(10)
chooser.addmany(b.items())
results = chooser.getbest()
else:
try:
for _ in ITERS:
results, num = index.query(query)
except:
continue
t1 = clock()
print "<p>Query: \"%s\"" % query
print "<br>Num results: %d" % num
print "<br>time.clock(): %s" % (t1 - t0)
key = query
if i == 0:
print "<ol>"
for docid, score in results:
url = path2url(files[docid])
fmt = '<li><a href="%s">%s</A> score = %s'
print fmt % (url, url, score)
print "</ol>"
continue
l = times.setdefault(key, [])
l.append(t1 - t0)
l = times.keys()
l.sort()
print "<hr>"
for k in l:
v = times[k]
print "<p>Query: \"%s\"" % k
print "<br>Min time: %s" % min(v)
print "<br>All times: %s" % " ".join(map(str, v))
if __name__ == "__main__":
import sys
import getopt
VERBOSE = 0
FSPATH = "Data.fs"
TEXTINDEX = 0
try:
opts, args = getopt.getopt(sys.argv[1:], 'vf:T')
except getopt.error, msg:
print msg
print __doc__
sys.exit(2)
for o, v in opts:
if o == '-v':
VERBOSE += 1
if o == '-f':
FSPATH = v
# if o == '-T':
# TEXTINDEX = 1
fs = FileStorage(FSPATH, read_only=1)
db = ZODB.DB(fs, cache_size=10000)
cn = db.open()
rt = cn.root()
main(rt)
##############################################################################
#
# Copyright (c) 2009 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Test zope.index.text.htmlsplitter
"""
import unittest
class HTMLWordSplitterTests(unittest.TestCase):
# Subclasses must define '_getBTreesFamily'
def _getTargetClass(self):
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
return HTMLWordSplitter
def _makeOne(self):
return self._getTargetClass()()
def test_class_conforms_to_ISplitter(self):
from zope.interface.verify import verifyClass
from Products.ZCTextIndex.interfaces import ISplitter
verifyClass(ISplitter, self._getTargetClass())
def test_instance_conforms_to_ISplitter(self):
from zope.interface.verify import verifyObject
from Products.ZCTextIndex.interfaces import ISplitter
verifyObject(ISplitter, self._makeOne())
def test_process_empty_string(self):
splitter = self._makeOne()
self.assertEqual(splitter.process(['']), [])
def test_process_no_markup(self):
splitter = self._makeOne()
self.assertEqual(splitter.process(['abc def']), ['abc', 'def'])
def test_process_w_markup(self):
splitter = self._makeOne()
self.assertEqual(splitter.process(['<h1>abc</h1> &nbsp; <p>def</p>']),
['abc', 'def'])
def test_process_no_markup_w_glob(self):
splitter = self._makeOne()
self.assertEqual(splitter.process(['abc?def hij*klm nop* qrs?']),
['abc', 'def', 'hij', 'klm', 'nop', 'qrs'])
def test_processGlob_empty_string(self):
splitter = self._makeOne()
self.assertEqual(splitter.processGlob(['']), [])
def test_processGlob_no_markup_no_glob(self):
splitter = self._makeOne()
self.assertEqual(splitter.processGlob(['abc def']), ['abc', 'def'])
def test_processGlob_w_markup_no_glob(self):
splitter = self._makeOne()
self.assertEqual(splitter.processGlob(['<h1>abc</h1> &nbsp; '
'<p>def</p>']),
['abc', 'def'])
def test_processGlob_no_markup_w_glob(self):
splitter = self._makeOne()
self.assertEqual(splitter.processGlob(['abc?def hij*klm nop* qrs?']),
['abc?def', 'hij*klm', 'nop*', 'qrs?'])
def test_suite():
return unittest.TestSuite((
unittest.makeSuite(HTMLWordSplitterTests),
))
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
import os
from unittest import TestCase, TestSuite, main, makeSuite
import transaction
from BTrees.Length import Length
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
# Subclasses must set a class variable IndexFactory to the appropriate
# index object constructor.
class IndexTest(TestCase):
def setUp(self):
self.lexicon = Lexicon(Splitter())
self.index = self.IndexFactory(self.lexicon)
def test_index_document(self, DOCID=1):
doc = "simple document contains five words"
self.assert_(not self.index.has_doc(DOCID))
self.index.index_doc(DOCID, doc)
self.assert_(self.index.has_doc(DOCID))
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._docweight), 1)
self.assertEqual(
len(self.index._docweight), self.index.document_count())
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 5)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
for map in self.index._wordinfo.values():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
def test_unindex_document(self):
DOCID = 1
self.test_index_document(DOCID)
self.index.unindex_doc(DOCID)
self.assertEqual(len(self.index._docweight), 0)
self.assertEqual(
len(self.index._docweight), self.index.document_count())
self.assertEqual(len(self.index._wordinfo), 0)
self.assertEqual(len(self.index._docwords), 0)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
def test_index_two_documents(self):
self.test_index_document()
doc = "another document just four"
DOCID = 2
self.index.index_doc(DOCID, doc)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._docweight), 2)
self.assertEqual(
len(self.index._docweight), self.index.document_count())
self.assertEqual(len(self.index._wordinfo), 8)
self.assertEqual(len(self.index._docwords), 2)
self.assertEqual(len(self.index.get_words(DOCID)), 4)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
wids = self.lexicon.termToWordIds("document")
self.assertEqual(len(wids), 1)
document_wid = wids[0]
for wid, map in self.index._wordinfo.items():
if wid == document_wid:
self.assertEqual(len(map), 2)
self.assert_(map.has_key(1))
self.assert_(map.has_key(DOCID))
else:
self.assertEqual(len(map), 1)
def test_index_two_unindex_one(self):
# index two documents, unindex one, and test the results
self.test_index_two_documents()
self.index.unindex_doc(1)
DOCID = 2
self.assertEqual(len(self.index._docweight), 1)
self.assertEqual(
len(self.index._docweight), self.index.document_count())
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 4)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 4)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
for map in self.index._wordinfo.values():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
def test_index_duplicated_words(self, DOCID=1):
doc = "very simple repeat repeat repeat document test"
self.index.index_doc(DOCID, doc)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 7)
self.assertEqual(len(self.index._wordinfo),
self.index.length())
self.assertEqual(
len(self.index._docweight), self.index.document_count())
wids = self.lexicon.termToWordIds("repeat")
self.assertEqual(len(wids), 1)
repititive_wid = wids[0]
for wid, map in self.index._wordinfo.items():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
def test_simple_query_oneresult(self):
self.index.index_doc(1, 'not the same document')
results = self.index.search("document")
self.assertEqual(list(results.keys()), [1])
def test_simple_query_noresults(self):
self.index.index_doc(1, 'not the same document')
results = self.index.search("frobnicate")
self.assertEqual(list(results.keys()), [])
def test_query_oneresult(self):
self.index.index_doc(1, 'not the same document')
self.index.index_doc(2, 'something about something else')
results = self.index.search("document")
self.assertEqual(list(results.keys()), [1])
def test_search_phrase(self):
self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
results = self.index.search_phrase("quick brown fox")
self.assertEqual(list(results.keys()), [1])
def test_search_glob(self):
self.index.index_doc(1, "how now brown cow")
self.index.index_doc(2, "hough nough browne cough")
self.index.index_doc(3, "bar brawl")
results = self.index.search_glob("bro*")
self.assertEqual(list(results.keys()), [1, 2])
results = self.index.search_glob("b*")
self.assertEqual(list(results.keys()), [1, 2, 3])
class CosineIndexTest(IndexTest):
IndexFactory = CosineIndex
class OkapiIndexTest(IndexTest):
IndexFactory = OkapiIndex
class TestIndexConflict(TestCase):
db = None
def tearDown(self):
if self.db is not None:
self.db.close()
self.storage.cleanup()
def openDB(self):
from ZODB.FileStorage import FileStorage
from ZODB.DB import DB
n = 'fs_tmp__%s' % os.getpid()
self.storage = FileStorage(n)
self.db = DB(self.storage)
def test_index_doc_conflict(self):
self.index = OkapiIndex(Lexicon())
self.openDB()
r1 = self.db.open().root()
r1['i'] = self.index
transaction.commit()
r2 = self.db.open().root()
copy = r2['i']
# Make sure the data is loaded
list(copy._docweight.items())
list(copy._docwords.items())
list(copy._wordinfo.items())
list(copy._lexicon._wids.items())
list(copy._lexicon._words.items())
self.assertEqual(self.index._p_serial, copy._p_serial)
self.index.index_doc(0, 'The time has come')
transaction.commit()
copy.index_doc(1, 'That time has gone')
transaction.commit()
def test_reindex_doc_conflict(self):
self.index = OkapiIndex(Lexicon())
self.index.index_doc(0, 'Sometimes change is good')
self.index.index_doc(1, 'Then again, who asked')
self.openDB()
r1 = self.db.open().root()
r1['i'] = self.index
transaction.commit()
r2 = self.db.open().root()
copy = r2['i']
# Make sure the data is loaded
list(copy._docweight.items())
list(copy._docwords.items())
list(copy._wordinfo.items())
list(copy._lexicon._wids.items())
list(copy._lexicon._words.items())
self.assertEqual(self.index._p_serial, copy._p_serial)
self.index.index_doc(0, 'Sometimes change isn\'t bad')
transaction.commit()
copy.index_doc(1, 'Then again, who asked you?')
transaction.commit()
class TestUpgrade(TestCase):
def test_query_before_totaldoclen_upgrade(self):
self.index1 = OkapiIndex(Lexicon(Splitter()))
self.index1.index_doc(0, 'The quiet of night')
# Revert index1 back to a long to simulate an older index instance
self.index1._totaldoclen = long(self.index1._totaldoclen())
self.assertEqual(len(self.index1.search('night')), 1)
def test_upgrade_totaldoclen(self):
self.index1 = OkapiIndex(Lexicon())
self.index2 = OkapiIndex(Lexicon())
self.index1.index_doc(0, 'The quiet of night')
self.index2.index_doc(0, 'The quiet of night')
# Revert index1 back to a long to simulate an older index instance
self.index1._totaldoclen = long(self.index1._totaldoclen())
self.index1.index_doc(1, 'gazes upon my shadow')
self.index2.index_doc(1, 'gazes upon my shadow')
self.assertEqual(
self.index1._totaldoclen(), self.index2._totaldoclen())
self.index1._totaldoclen = long(self.index1._totaldoclen())
self.index1.unindex_doc(0)
self.index2.unindex_doc(0)
self.assertEqual(
self.index1._totaldoclen(), self.index2._totaldoclen())
def test_query_before_document_count_upgrade(self):
self.index1 = OkapiIndex(Lexicon(Splitter()))
self.index1.index_doc(0, 'The quiet of night')
# Revert index1 back to a long to simulate an older index instance
del self.index1.document_count
self.assertEqual(len(self.index1.search('night')), 1)
def test_upgrade_document_count(self):
self.index1 = OkapiIndex(Lexicon())
self.index2 = OkapiIndex(Lexicon())
self.index1.index_doc(0, 'The quiet of night')
self.index2.index_doc(0, 'The quiet of night')
# Revert index1 back to simulate an older index instance
del self.index1.document_count
self.index1.index_doc(1, 'gazes upon my shadow')
self.index2.index_doc(1, 'gazes upon my shadow')
self.assert_(self.index1.document_count.__class__ is Length)
self.assertEqual(
self.index1.document_count(), self.index2.document_count())
del self.index1.document_count
self.index1.unindex_doc(0)
self.index2.unindex_doc(0)
self.assert_(self.index1.document_count.__class__ is Length)
self.assertEqual(
self.index1.document_count(), self.index2.document_count())
def test_suite():
return TestSuite((makeSuite(CosineIndexTest),
makeSuite(OkapiIndexTest),
makeSuite(TestIndexConflict),
makeSuite(TestUpgrade),
))
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon unit tests.
$Id$
"""
import unittest
import os, sys
import ZODB
import transaction
from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer
class StupidPipelineElement:
def __init__(self, fromword, toword):
self.__fromword = fromword
self.__toword = toword
def process(self, seq):
res = []
for term in seq:
if term == self.__fromword:
res.append(self.__toword)
else:
res.append(term)
return res
class WackyReversePipelineElement:
def __init__(self, revword):
self.__revword = revword
def process(self, seq):
res = []
for term in seq:
if term == self.__revword:
x = list(term)
x.reverse()
res.append(''.join(x))
else:
res.append(term)
return res
class StopWordPipelineElement:
def __init__(self, stopdict={}):
self.__stopdict = stopdict
def process(self, seq):
res = []
for term in seq:
if self.__stopdict.get(term):
continue
else:
res.append(term)
return res
class Test(unittest.TestCase):
def test_z3interfaces(self):
from Products.ZCTextIndex.interfaces import ILexicon
from zope.interface.verify import verifyClass
verifyClass(ILexicon, Lexicon)
def testSourceToWordIds(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
self.assertEqual(wids, [1, 2, 3])
def testTermToWordIds(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('dogs')
self.assertEqual(wids, [3])
def testMissingTermToWordIds(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('boxes')
self.assertEqual(wids, [0])
def testTermToWordIdsWithProcess_post_glob(self):
"""This test is for added process_post_glob"""
class AddedSplitter(Splitter):
def process_post_glob(self, lst):
assert lst == ['dogs']
return ['dogs']
lexicon = Lexicon(AddedSplitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('dogs')
self.assertEqual(wids, [3])
def testMissingTermToWordIdsWithProcess_post_glob(self):
"""This test is for added process_post_glob"""
class AddedSplitter(Splitter):
def process_post_glob(self, lst):
assert lst == ['dogs']
return ['fox']
lexicon = Lexicon(AddedSplitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('dogs')
self.assertEqual(wids, [0])
def testOnePipelineElement(self):
lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('fish')
self.assertEqual(wids, [3])
def testSplitterAdaptorFold(self):
lexicon = Lexicon(Splitter(), CaseNormalizer())
wids = lexicon.sourceToWordIds('CATS and dogs')
wids = lexicon.termToWordIds('cats and dogs')
self.assertEqual(wids, [1, 2, 3])
def testSplitterAdaptorNofold(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('CATS and dogs')
wids = lexicon.termToWordIds('cats and dogs')
self.assertEqual(wids, [0, 2, 3])
def testTwoElementPipeline(self):
lexicon = Lexicon(Splitter(),
StupidPipelineElement('cats', 'fish'),
WackyReversePipelineElement('fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
self.assertEqual(wids, [1])
def testThreeElementPipeline(self):
lexicon = Lexicon(Splitter(),
StopWordPipelineElement({'and':1}),
StupidPipelineElement('dogs', 'fish'),
WackyReversePipelineElement('fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
self.assertEqual(wids, [2])
def testSplitterLocaleAwareness(self):
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
import locale
loc = locale.setlocale(locale.LC_ALL) # get current locale
# set German locale
try:
if sys.platform != 'win32':
locale.setlocale(locale.LC_ALL, 'de_DE.ISO8859-1')
else:
locale.setlocale(locale.LC_ALL, 'German_Germany.1252')
except locale.Error:
return # This test doesn't work here :-(
expected = ['m\xfclltonne', 'waschb\xe4r',
'beh\xf6rde', '\xfcberflieger']
words = [" ".join(expected)]
words = Splitter().process(words)
self.assertEqual(words, expected)
words = HTMLWordSplitter().process(words)
self.assertEqual(words, expected)
locale.setlocale(locale.LC_ALL, loc) # restore saved locale
def testUpgradeLength(self):
from BTrees.Length import Length
lexicon = Lexicon(Splitter())
del lexicon.length # Older instances don't override length
lexicon.sourceToWordIds('how now brown cow')
self.assert_(lexicon.length.__class__ is Length)
class TestLexiconConflict(unittest.TestCase):
db = None
def tearDown(self):
if self.db is not None:
self.db.close()
self.storage.cleanup()
def openDB(self):
from ZODB.FileStorage import FileStorage
from ZODB.DB import DB
n = 'fs_tmp__%s' % os.getpid()
self.storage = FileStorage(n)
self.db = DB(self.storage)
def testAddWordConflict(self):
self.l = Lexicon(Splitter())
self.openDB()
r1 = self.db.open().root()
r1['l'] = self.l
transaction.commit()
r2 = self.db.open().root()
copy = r2['l']
# Make sure the data is loaded
list(copy._wids.items())
list(copy._words.items())
copy.length()
self.assertEqual(self.l._p_serial, copy._p_serial)
self.l.sourceToWordIds('mary had a little lamb')
transaction.commit()
copy.sourceToWordIds('whose fleece was')
copy.sourceToWordIds('white as snow')
transaction.commit()
self.assertEqual(copy.length(), 11)
self.assertEqual(copy.length(), len(copy._words))
def test_suite():
suite = unittest.TestSuite()
suite.addTest(unittest.makeSuite(Test))
suite.addTest(unittest.makeSuite(TestLexiconConflict))
return suite
if __name__=='__main__':
unittest.main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.NBest import NBest
class NBestTest(TestCase):
def testConstructor(self):
self.assertRaises(ValueError, NBest, 0)
self.assertRaises(ValueError, NBest, -1)
for n in range(1, 11):
nb = NBest(n)
self.assertEqual(len(nb), 0)
self.assertEqual(nb.capacity(), n)
def testOne(self):
nb = NBest(1)
nb.add('a', 0)
self.assertEqual(nb.getbest(), [('a', 0)])
nb.add('b', 1)
self.assertEqual(len(nb), 1)
self.assertEqual(nb.capacity(), 1)
self.assertEqual(nb.getbest(), [('b', 1)])
nb.add('c', -1)
self.assertEqual(len(nb), 1)
self.assertEqual(nb.capacity(), 1)
self.assertEqual(nb.getbest(), [('b', 1)])
nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
self.assertEqual(len(nb), 1)
self.assertEqual(nb.capacity(), 1)
self.assertEqual(nb.getbest(), [('f', 5)])
def testMany(self):
import random
inputs = [(-i, i) for i in range(50)]
reversed_inputs = inputs[:]
reversed_inputs.reverse()
# Test the N-best for a variety of n (1, 6, 11, ... 50).
for n in range(1, len(inputs)+1, 5):
expected = inputs[-n:]
expected.reverse()
random_inputs = inputs[:]
random.shuffle(random_inputs)
for source in inputs, reversed_inputs, random_inputs:
# Try feeding them one at a time.
nb = NBest(n)
for item, score in source:
nb.add(item, score)
self.assertEqual(len(nb), n)
self.assertEqual(nb.capacity(), n)
self.assertEqual(nb.getbest(), expected)
# And again in one gulp.
nb = NBest(n)
nb.addmany(source)
self.assertEqual(len(nb), n)
self.assertEqual(nb.capacity(), n)
self.assertEqual(nb.getbest(), expected)
for i in range(1, n+1):
self.assertEqual(nb.pop_smallest(), expected[-i])
self.assertRaises(IndexError, nb.pop_smallest)
def test_suite():
return makeSuite(NBestTest)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2008 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
import unittest
class ParseTreeTests(unittest.TestCase):
def _conforms(self, klass):
from zope.interface.verify import verifyClass
from Products.ZCTextIndex.interfaces import IQueryParseTree
verifyClass(IQueryParseTree, klass)
def test_ParseTreeNode_conforms_to_IQueryParseTree(self):
from Products.ZCTextIndex.ParseTree import ParseTreeNode
self._conforms(ParseTreeNode)
def test_OrNode_conforms_to_IQueryParseTree(self):
from Products.ZCTextIndex.ParseTree import OrNode
self._conforms(OrNode)
def test_AndNode_conforms_to_IQueryParseTree(self):
from Products.ZCTextIndex.ParseTree import AndNode
self._conforms(AndNode)
def test_NotNode_conforms_to_IQueryParseTree(self):
from Products.ZCTextIndex.ParseTree import NotNode
self._conforms(NotNode)
def test_GlobNode_conforms_to_IQueryParseTree(self):
from Products.ZCTextIndex.ParseTree import GlobNode
self._conforms(GlobNode)
def test_AtomNode_conforms_to_IQueryParseTree(self):
from Products.ZCTextIndex.ParseTree import AtomNode
self._conforms(AtomNode)
def test_PhraseNode_conforms_to_IQueryParseTree(self):
from Products.ZCTextIndex.ParseTree import PhraseNode
self._conforms(PhraseNode)
def test_suite():
return unittest.TestSuite((
unittest.makeSuite(ParseTreeTests),
))
if __name__=="__main__":
unittest.main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.interfaces import IPipelineElement
from Products.ZCTextIndex.PipelineFactory import PipelineElementFactory
from zope.interface import implements
class NullPipelineElement:
implements(IPipelineElement)
def process(source):
pass
class PipelineFactoryTest(TestCase):
def setUp(self):
self.huey = NullPipelineElement()
self.dooey = NullPipelineElement()
self.louie = NullPipelineElement()
self.daffy = NullPipelineElement()
def testPipeline(self):
pf = PipelineElementFactory()
pf.registerFactory('donald', 'huey', self.huey)
pf.registerFactory('donald', 'dooey', self.dooey)
pf.registerFactory('donald', 'louie', self.louie)
pf.registerFactory('looney', 'daffy', self.daffy)
self.assertRaises(ValueError, pf.registerFactory,'donald', 'huey',
self.huey)
self.assertEqual(pf.getFactoryGroups(), ['donald', 'looney'])
self.assertEqual(pf.getFactoryNames('donald'),
['dooey', 'huey', 'louie'])
def test_suite():
return makeSuite(PipelineFactoryTest)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from BTrees.IIBTree import IIBucket
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.ParseTree import ParseError, QueryError
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
class FauxIndex:
def search(self, term):
b = IIBucket()
if term == "foo":
b[1] = b[3] = 1
elif term == "bar":
b[1] = b[2] = 1
elif term == "ham":
b[1] = b[2] = b[3] = b[4] = 1
return b
class TestQueryEngine(TestCase):
def setUp(self):
self.lexicon = Lexicon(Splitter())
self.parser = QueryParser(self.lexicon)
self.index = FauxIndex()
def compareSet(self, set, dict):
d = {}
for k, v in set.items():
d[k] = v
self.assertEqual(d, dict)
def compareQuery(self, query, dict):
tree = self.parser.parseQuery(query)
set = tree.executeQuery(self.index)
self.compareSet(set, dict)
def testExecuteQuery(self):
self.compareQuery("foo AND bar", {1: 2})
self.compareQuery("foo OR bar", {1: 2, 2: 1, 3:1})
self.compareQuery("foo AND NOT bar", {3: 1})
self.compareQuery("foo AND foo AND foo", {1: 3, 3: 3})
self.compareQuery("foo OR foo OR foo", {1: 3, 3: 3})
self.compareQuery("ham AND NOT foo AND NOT bar", {4: 1})
self.compareQuery("ham OR foo OR bar", {1: 3, 2: 2, 3: 2, 4: 1})
self.compareQuery("ham AND foo AND bar", {1: 3})
def testInvalidQuery(self):
from Products.ZCTextIndex.ParseTree import NotNode, AtomNode
tree = NotNode(AtomNode("foo"))
self.assertRaises(QueryError, tree.executeQuery, self.index)
def test_suite():
return makeSuite(TestQueryEngine)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
class TestInterfaces(TestCase):
def testInterfaces(self):
from zope.interface.verify import verifyClass
from Products.ZCTextIndex.interfaces import IQueryParser
from Products.ZCTextIndex.QueryParser import QueryParser
verifyClass(IQueryParser, QueryParser)
class TestQueryParserBase(TestCase):
def setUp(self):
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter
self.lexicon = Lexicon(Splitter())
self.parser = QueryParser(self.lexicon)
def expect(self, input, output, expected_ignored=[]):
tree = self.parser.parseQuery(input)
ignored = self.parser.getIgnored()
self.compareParseTrees(tree, output)
self.assertEqual(ignored, expected_ignored)
# Check that parseQueryEx() == (parseQuery(), getIgnored())
ex_tree, ex_ignored = self.parser.parseQueryEx(input)
self.compareParseTrees(ex_tree, tree)
self.assertEqual(ex_ignored, expected_ignored)
def failure(self, input):
from Products.ZCTextIndex.ParseTree import ParseError
self.assertRaises(ParseError, self.parser.parseQuery, input)
self.assertRaises(ParseError, self.parser.parseQueryEx, input)
def compareParseTrees(self, got, expected, msg=None):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
from Products.ZCTextIndex.ParseTree import GlobNode
from Products.ZCTextIndex.ParseTree import NotNode
from Products.ZCTextIndex.ParseTree import OrNode
from Products.ZCTextIndex.ParseTree import ParseTreeNode
from Products.ZCTextIndex.ParseTree import PhraseNode
if msg is None:
msg = repr(got)
self.assertEqual(isinstance(got, ParseTreeNode), 1)
self.assertEqual(got.__class__, expected.__class__, msg)
if isinstance(got, PhraseNode):
self.assertEqual(got.nodeType(), "PHRASE", msg)
self.assertEqual(got.getValue(), expected.getValue(), msg)
elif isinstance(got, GlobNode):
self.assertEqual(got.nodeType(), "GLOB", msg)
self.assertEqual(got.getValue(), expected.getValue(), msg)
elif isinstance(got, AtomNode):
self.assertEqual(got.nodeType(), "ATOM", msg)
self.assertEqual(got.getValue(), expected.getValue(), msg)
elif isinstance(got, NotNode):
self.assertEqual(got.nodeType(), "NOT")
self.compareParseTrees(got.getValue(), expected.getValue(), msg)
elif isinstance(got, AndNode) or isinstance(got, OrNode):
self.assertEqual(got.nodeType(),
isinstance(got, AndNode) and "AND" or "OR", msg)
list1 = got.getValue()
list2 = expected.getValue()
self.assertEqual(len(list1), len(list2), msg)
for i in range(len(list1)):
self.compareParseTrees(list1[i], list2[i], msg)
class TestQueryParser(TestQueryParserBase):
def test001(self):
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect("foo", AtomNode("foo"))
def test002(self):
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect("note", AtomNode("note"))
def test003(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect("aa and bb AND cc",
AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
def test004(self):
from Products.ZCTextIndex.ParseTree import AtomNode
from Products.ZCTextIndex.ParseTree import OrNode
self.expect("aa OR bb or cc",
OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
def test005(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
from Products.ZCTextIndex.ParseTree import OrNode
self.expect("aa AND bb OR cc AnD dd",
OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
AndNode([AtomNode("cc"), AtomNode("dd")])]))
def test006(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
from Products.ZCTextIndex.ParseTree import OrNode
self.expect("(aa OR bb) AND (cc OR dd)",
AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
OrNode([AtomNode("cc"), AtomNode("dd")])]))
def test007(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
from Products.ZCTextIndex.ParseTree import NotNode
self.expect("aa AND NOT bb",
AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))
def test010(self):
from Products.ZCTextIndex.ParseTree import PhraseNode
self.expect('"foo bar"', PhraseNode(["foo", "bar"]))
def test011(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
def test012(self):
from Products.ZCTextIndex.ParseTree import PhraseNode
self.expect('(("foo bar"))"', PhraseNode(["foo", "bar"]))
def test013(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
def test014(self):
from Products.ZCTextIndex.ParseTree import PhraseNode
self.expect("foo-bar", PhraseNode(["foo", "bar"]))
def test015(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
from Products.ZCTextIndex.ParseTree import NotNode
self.expect("foo -bar", AndNode([AtomNode("foo"),
NotNode(AtomNode("bar"))]))
def test016(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
from Products.ZCTextIndex.ParseTree import NotNode
self.expect("-foo bar", AndNode([AtomNode("bar"),
NotNode(AtomNode("foo"))]))
def test017(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
from Products.ZCTextIndex.ParseTree import NotNode
from Products.ZCTextIndex.ParseTree import PhraseNode
self.expect("booh -foo-bar",
AndNode([AtomNode("booh"),
NotNode(PhraseNode(["foo", "bar"]))]))
def test018(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
from Products.ZCTextIndex.ParseTree import NotNode
from Products.ZCTextIndex.ParseTree import PhraseNode
self.expect('booh -"foo bar"',
AndNode([AtomNode("booh"),
NotNode(PhraseNode(["foo", "bar"]))]))
def test019(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect('foo"bar"',
AndNode([AtomNode("foo"), AtomNode("bar")]))
def test020(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect('"foo"bar',
AndNode([AtomNode("foo"), AtomNode("bar")]))
def test021(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect('foo"bar"blech',
AndNode([AtomNode("foo"), AtomNode("bar"),
AtomNode("blech")]))
def test022(self):
from Products.ZCTextIndex.ParseTree import GlobNode
self.expect("foo*", GlobNode("foo*"))
def test023(self):
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
from Products.ZCTextIndex.ParseTree import GlobNode
self.expect("foo* bar", AndNode([GlobNode("foo*"),
AtomNode("bar")]))
def test024(self):
# Split by UTF-8 fullwidth space
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect("foo\xe3\x80\x80bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
def test025(self):
# Split by Unicode fullwidth space
from Products.ZCTextIndex.ParseTree import AndNode
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect(u"foo\u3000bar", AndNode([AtomNode(u"foo"), AtomNode(u"bar")]))
def test101(self):
self.failure("")
def test102(self):
self.failure("not")
def test103(self):
self.failure("or")
def test104(self):
self.failure("and")
def test105(self):
self.failure("NOT")
def test106(self):
self.failure("OR")
def test107(self):
self.failure("AND")
def test108(self):
self.failure("NOT foo")
def test109(self):
self.failure(")")
def test110(self):
self.failure("(")
def test111(self):
self.failure("foo OR")
def test112(self):
self.failure("foo AND")
def test113(self):
self.failure("OR foo")
def test114(self):
self.failure("AND foo")
def test115(self):
self.failure("(foo) bar")
def test116(self):
self.failure("(foo OR)")
def test117(self):
self.failure("(foo AND)")
def test118(self):
self.failure("(NOT foo)")
def test119(self):
self.failure("-foo")
def test120(self):
self.failure("-foo -bar")
def test121(self):
self.failure("foo OR -bar")
def test122(self):
self.failure("foo AND -bar")
class StopWordTestQueryParser(TestQueryParserBase):
def setUp(self):
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter
# Only 'stop' is a stopword (but 'and' is still an operator)
self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
self.parser = QueryParser(self.lexicon)
def test201(self):
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect('and/', AtomNode("and"))
def test202(self):
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect('foo AND stop', AtomNode("foo"), ["stop"])
def test203(self):
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect('foo AND NOT stop', AtomNode("foo"), ["stop"])
def test204(self):
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect('stop AND foo', AtomNode("foo"), ["stop"])
def test205(self):
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect('foo OR stop', AtomNode("foo"), ["stop"])
def test206(self):
from Products.ZCTextIndex.ParseTree import AtomNode
self.expect('stop OR foo', AtomNode("foo"), ["stop"])
def test301(self):
self.failure('stop')
def test302(self):
self.failure('stop stop')
def test303(self):
self.failure('stop AND stop')
def test304(self):
self.failure('stop OR stop')
def test305(self):
self.failure('stop -foo')
def test306(self):
self.failure('stop AND NOT foo')
class FakeStopWordRemover:
def process(self, list):
return [word for word in list if word != "stop"]
def test_suite():
return TestSuite((makeSuite(TestQueryParser),
makeSuite(StopWordTestQueryParser),
makeSuite(TestInterfaces),
))
if __name__=="__main__":
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from BTrees.IIBTree import IIBTree, IIBucket
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion
class TestSetOps(TestCase):
def testEmptyLists(self):
self.assertEqual(len(mass_weightedIntersection([])), 0)
self.assertEqual(len(mass_weightedUnion([])), 0)
def testIdentity(self):
t = IIBTree([(1, 2)])
b = IIBucket([(1, 2)])
for x in t, b:
for func in mass_weightedUnion, mass_weightedIntersection:
result = func([(x, 1)])
self.assertEqual(len(result), 1)
self.assertEqual(list(result.items()), list(x.items()))
def testScalarMultiply(self):
t = IIBTree([(1, 2), (2, 3), (3, 4)])
allkeys = [1, 2, 3]
b = IIBucket(t)
for x in t, b:
self.assertEqual(list(x.keys()), allkeys)
for func in mass_weightedUnion, mass_weightedIntersection:
for factor in 0, 1, 5, 10:
result = func([(x, factor)])
self.assertEqual(allkeys, list(result.keys()))
for key in x.keys():
self.assertEqual(x[key] * factor, result[key])
def testPairs(self):
t1 = IIBTree([(1, 10), (3, 30), (7, 70)])
t2 = IIBTree([(3, 30), (5, 50), (7, 7), (9, 90)])
allkeys = [1, 3, 5, 7, 9]
b1 = IIBucket(t1)
b2 = IIBucket(t2)
for x in t1, t2, b1, b2:
for key in x.keys():
self.assertEqual(key in allkeys, 1)
for y in t1, t2, b1, b2:
for w1, w2 in (0, 0), (1, 10), (10, 1), (2, 3):
# Test the union.
expected = []
for key in allkeys:
if x.has_key(key) or y.has_key(key):
result = x.get(key, 0) * w1 + y.get(key, 0) * w2
expected.append((key, result))
expected.sort()
got = mass_weightedUnion([(x, w1), (y, w2)])
self.assertEqual(expected, list(got.items()))
got = mass_weightedUnion([(y, w2), (x, w1)])
self.assertEqual(expected, list(got.items()))
# Test the intersection.
expected = []
for key in allkeys:
if x.has_key(key) and y.has_key(key):
result = x[key] * w1 + y[key] * w2
expected.append((key, result))
expected.sort()
got = mass_weightedIntersection([(x, w1), (y, w2)])
self.assertEqual(expected, list(got.items()))
got = mass_weightedIntersection([(y, w2), (x, w1)])
self.assertEqual(expected, list(got.items()))
def testMany(self):
import random
N = 15 # number of IIBTrees to feed in
L = []
commonkey = N * 1000
allkeys = {commonkey: 1}
for i in range(N):
t = IIBTree()
t[commonkey] = i
for j in range(N-i):
key = i + j
allkeys[key] = 1
t[key] = N*i + j
L.append((t, i+1))
random.shuffle(L)
allkeys = allkeys.keys()
allkeys.sort()
# Test the union.
expected = []
for key in allkeys:
sum = 0
for t, w in L:
if t.has_key(key):
sum += t[key] * w
expected.append((key, sum))
# print 'union', expected
got = mass_weightedUnion(L)
self.assertEqual(expected, list(got.items()))
# Test the intersection.
expected = []
for key in allkeys:
sum = 0
for t, w in L:
if t.has_key(key):
sum += t[key] * w
else:
break
else:
# We didn't break out of the loop so it's in the intersection.
expected.append((key, sum))
# print 'intersection', expected
got = mass_weightedIntersection(L)
self.assertEqual(expected, list(got.items()))
def test_suite():
return makeSuite(TestSetOps)
if __name__=="__main__":
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Tests for the C version of the StopWordRemover."""
import unittest
from Products.ZCTextIndex import stopper
class StopperTest(unittest.TestCase):
def test_process_typeerror(self):
self.assertRaises(TypeError, stopper.process, 42, [])
self.assertRaises(TypeError, stopper.process, {}, 42)
self.assertRaises(TypeError, stopper.process, {})
self.assertRaises(TypeError, stopper.process, {}, [], 'extra arg')
def test_process_nostops(self):
words = ['a', 'b', 'c', 'splat!']
self.assertEqual(words, stopper.process({}, words))
def test_process_somestops(self):
d = {'b':1, 'splat!':1}
words = ['a', 'b', 'c', 'splat!']
self.assertEqual(['a', 'c'], stopper.process(d, words))
def test_process_allstops(self):
d = {'a':1, 'b':1, 'c':1, 'splat!':1}
words = ['a', 'b', 'c', 'splat!']
self.assertEqual([], stopper.process(d, words))
def test_suite():
return unittest.makeSuite(StopperTest)
if __name__ == "__main__":
unittest.main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""ZCTextIndex unit tests.
$Id$
"""
import unittest
import re
import Acquisition
from zExceptions import NotFound
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex, PLexicon
from Products.ZCTextIndex.tests import \
testIndex, testQueryEngine, testQueryParser
from Products.ZCTextIndex.BaseIndex import \
scaled_int, SCALE_FACTOR, inverse_doc_frequency
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.Lexicon import Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import ParseError
class Indexable:
def __init__(self, text):
self.text = text
class Indexable2:
def __init__(self, text1, text2):
self.text1 = text1
self.text2 = text2
class LexiconHolder(Acquisition.Implicit):
def __init__(self, lexicon):
self.lexicon = lexicon
def getPhysicalPath(self):
return ('',) # Pretend to be the root
def dummyUnrestrictedTraverse(self, path):
if path == ('', 'lexicon',):
return self.lexicon
raise NotFound, path
# The tests classes below create a ZCTextIndex(). Then they create
# instance variables that point to the internal components used by
# ZCTextIndex. These tests run the individual module unit tests with
# the fully integrated ZCTextIndex.
def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
if abs(scaled1 - scaled2) > epsilon:
raise AssertionError, "%s != %s" % (scaled1, scaled2)
# A series of text chunks to use for the re-index tests (testDocUpdate).
text = [
"""Here's a knocking indeed! If a
man were porter of hell-gate, he should have
old turning the key. knock (that made sure
sure there's at least one word in common)."""
"""Knock,
knock, knock! Who's there, i' the name of
Beelzebub? Here's a farmer, that hanged
himself on the expectation of plenty: come in
time; have napkins enow about you; here
you'll sweat for't.""",
"""Knock,
knock! Who's there, in the other devil's
name? Faith, here's an equivocator, that could
swear in both the scales against either scale;
who committed treason enough for God's sake,
yet could not equivocate to heaven: O, come
in, equivocator.""",
"""Knock,
knock, knock! Who's there? Faith, here's an
English tailor come hither, for stealing out of
a French hose: come in, tailor; here you may
roast your goose.""",
"""Knock,
knock; never at quiet! What are you? But
this place is too cold for hell. I'll devil-porter
it no further: I had thought to have let in
some of all professions that go the primrose
way to the everlasting bonfire."""
]
# Subclasses should derive from one of testIndex.{CosineIndexTest,
# OkapiIndexTest} too.
class ZCIndexTestsBase:
def setUp(self):
self.lexicon = PLexicon('lexicon', '',
Splitter(),
CaseNormalizer(),
StopWordRemover())
caller = LexiconHolder(self.lexicon)
self.zc_index = ZCTextIndex('name',
None,
caller,
self.IndexFactory,
'text',
'lexicon')
self.index = self.zc_index.index
def parserFailure(self, query):
self.assertRaises(ParseError, self.zc_index.query, query)
def parserSuccess(self, query, n):
r, num = self.zc_index.query(query)
self.assertEqual(num, n)
if n:
self.assertEqual(r[0][0], 1)
def testMultipleAttributes(self):
lexicon = PLexicon('lexicon', '',
Splitter(),
CaseNormalizer(),
StopWordRemover())
caller = LexiconHolder(self.lexicon)
zc_index = ZCTextIndex('name',
None,
caller,
self.IndexFactory,
'text1,text2',
'lexicon')
doc = Indexable2('foo bar', 'alpha omega')
zc_index.index_object(1, doc)
nbest, total = zc_index.query('foo')
self.assertEqual(len(nbest), 1)
nbest, total = zc_index.query('foo alpha')
self.assertEqual(len(nbest), 1)
nbest, total = zc_index.query('foo alpha gamma')
self.assertEqual(len(nbest), 0)
def testListAttributes(self):
lexicon = PLexicon('lexicon', '',
Splitter(),
CaseNormalizer(),
StopWordRemover())
caller = LexiconHolder(self.lexicon)
zc_index = ZCTextIndex('name',
None,
caller,
self.IndexFactory,
'text1,text2',
'lexicon')
doc = Indexable2('Hello Tim', \
['Now is the winter of our discontent',
'Made glorious summer by this sun of York', ])
zc_index.index_object(1, doc)
nbest, total = zc_index.query('glorious')
self.assertEqual(len(nbest), 1)
nbest, total = zc_index.query('York Tim')
self.assertEqual(len(nbest), 1)
nbest, total = zc_index.query('Tuesday Tim York')
self.assertEqual(len(nbest), 0)
def testStopWords(self):
# the only non-stopword is question
text = ("to be or not to be "
"that is the question")
doc = Indexable(text)
self.zc_index.index_object(1, doc)
for word in text.split():
if word != "question":
wids = self.lexicon.termToWordIds(word)
self.assertEqual(wids, [])
self.assertEqual(len(self.index.get_words(1)), 1)
self.parserSuccess('question', 1)
self.parserSuccess('question AND to AND be', 1)
self.parserSuccess('to AND question AND be', 1)
self.parserSuccess('question AND NOT gardenia', 1)
self.parserSuccess('question AND gardenia', 0)
self.parserSuccess('gardenia', 0)
self.parserSuccess('question OR gardenia', 1)
self.parserSuccess('question AND NOT to AND NOT be', 1)
self.parserSuccess('question OR to OR be', 1)
self.parserSuccess('question to be', 1)
self.parserFailure('to be')
self.parserFailure('to AND be')
self.parserFailure('to OR be')
self.parserFailure('to AND NOT be')
self.parserFailure('to AND NOT question')
self.parserFailure('to AND NOT gardenia')
def testDocUpdate(self):
docid = 1 # doesn't change -- we index the same doc repeatedly
N = len(text)
stop = get_stopdict()
d = {} # word -> list of version numbers containing that word
for version, i in zip(text, range(N)):
# use a simple splitter rather than an official one
words = [w for w in re.split("\W+", version.lower())
if len(w) > 1 and not stop.has_key(w)]
word_seen = {}
for w in words:
if not word_seen.has_key(w):
d.setdefault(w, []).append(i)
word_seen[w] = 1
unique = {} # version number -> list of words unique to that version
common = [] # list of words common to all versions
for w, versionlist in d.items():
if len(versionlist) == 1:
unique.setdefault(versionlist[0], []).append(w)
elif len(versionlist) == N:
common.append(w)
self.assert_(len(common) > 0)
self.assert_(len(unique) > 0)
for version, i in zip(text, range(N)):
doc = Indexable(version)
self.zc_index.index_object(docid, doc)
for w in common:
nbest, total = self.zc_index.query(w)
self.assertEqual(total, 1, "did not find %s" % w)
for k, v in unique.items():
if k == i:
continue
for w in v:
nbest, total = self.zc_index.query(w)
self.assertEqual(total, 0, "did not expect to find %s" % w)
class CosineIndexTests(ZCIndexTestsBase, testIndex.CosineIndexTest):
# A fairly involved test of the ranking calculations based on
# an example set of documents in queries in Managing
# Gigabytes, pp. 180-188. This test peeks into many internals of the
# cosine indexer.
def test_z3interfaces(self):
from Products.PluginIndexes.interfaces import IPluggableIndex
from Products.ZCTextIndex.interfaces import IZCTextIndex
from zope.interface.verify import verifyClass
verifyClass(IPluggableIndex, ZCTextIndex)
verifyClass(IZCTextIndex, ZCTextIndex)
def testRanking(self):
self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
"pease", "porridge", "pot"]
self.docs = ["Pease porridge hot, pease porridge cold,",
"Pease porridge in the pot,",
"Nine days old.",
"In the pot cold, in the pot hot,",
"Pease porridge, pease porridge,",
"Eat the lot."]
self._ranking_index()
self._ranking_tf()
self._ranking_idf()
self._ranking_queries()
# A digression to exercise re-indexing.
docs = self.docs
for variant in "hot cold porridge python", "pease hot pithy":
self.zc_index.index_object(len(docs), Indexable(variant))
try:
self._ranking_tf()
except (AssertionError, KeyError):
pass
else:
self.fail("expected _ranking_tf() to fail -- reindex")
try:
self._ranking_idf()
except (AssertionError, KeyError):
pass
else:
self.fail("expected _ranking_idf() to fail -- reindex")
try:
self._ranking_queries()
except AssertionError:
pass
else:
self.fail("expected _ranking_queries() to fail -- reindex")
# This should leave things exactly as they were.
self.zc_index.index_object(len(docs), Indexable(docs[-1]))
self._ranking_tf()
self._ranking_idf()
self._ranking_queries()
def _ranking_index(self):
docs = self.docs
for i in range(len(docs)):
self.zc_index.index_object(i + 1, Indexable(docs[i]))
def _ranking_tf(self):
# matrix of term weights for the rows are docids
# and the columns are indexes into this list:
l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
(0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]
for i in range(len(l_Wd)):
docid = i + 1
scaled_Wd = scaled_int(l_Wd[i])
eq(scaled_Wd, self.index._get_Wd(docid))
wdts = [scaled_int(t) for t in l_wdt[i]]
for j in range(len(wdts)):
wdt = self.index._get_wdt(docid, self.words[j])
eq(wdts[j], wdt)
def _ranking_idf(self):
word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
for i in range(len(self.words)):
word = self.words[i]
eq(word_freqs[i], self.index._get_ft(word))
eq(scaled_int(idfs[i]), self.index._get_wt(word))
def _ranking_queries(self):
queries = ["eat", "porridge", "hot OR porridge",
"eat OR nine OR day OR old OR porridge"]
wqs = [1.95, 1.10, 1.77, 3.55]
results = [[(6, 0.71)],
[(1, 0.61), (2, 0.58), (5, 0.71)],
[(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
[(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
for i in range(len(queries)):
raw = queries[i]
q = QueryParser(self.lexicon).parseQuery(raw)
wq = self.index.query_weight(q.terms())
eq(wq, scaled_int(wqs[i]))
r, n = self.zc_index.query(raw)
self.assertEqual(len(r), len(results[i]))
# convert the results to a dict for each checking
d = {}
for doc, score in results[i]:
d[doc] = scaled_int(score)
for doc, score in r:
score = scaled_int(float(score / SCALE_FACTOR) / wq)
self.assert_(0 <= score <= SCALE_FACTOR)
eq(d[doc], score)
class OkapiIndexTests(ZCIndexTestsBase, testIndex.OkapiIndexTest):
# A white-box test.
def testAbsoluteScores(self):
docs = ["one",
"one two",
"one two three"]
for i in range(len(docs)):
self.zc_index.index_object(i + 1, Indexable(docs[i]))
self._checkAbsoluteScores()
# Exercise re-indexing.
for variant in "one xyz", "xyz two three", "abc def":
self.zc_index.index_object(len(docs), Indexable(variant))
try:
self._checkAbsoluteScores()
except AssertionError:
pass
else:
self.fail("expected _checkAbsoluteScores() to fail -- reindex")
# This should leave things exactly as they were.
self.zc_index.index_object(len(docs), Indexable(docs[-1]))
self._checkAbsoluteScores()
def _checkAbsoluteScores(self):
self.assertEqual(self.index._totaldoclen(), 6)
# So the mean doc length is 2. We use that later.
r, num = self.zc_index.query("one")
self.assertEqual(num, 3)
self.assertEqual(len(r), 3)
# Because our Okapi's B parameter is > 0, and "one" only appears
# once in each doc, the verbosity hypothesis favors shorter docs.
self.assertEqual([doc for doc, score in r], [1, 2, 3])
# The way the Okapi math works, a word that appears exactly once in
# an average (length) doc gets tf score 1. Our second doc has
# an average length, so its score should by 1 (tf) times the
# inverse doc frequency of "one". But "one" appears in every
# doc, so its IDF is log(1 + 3/3) = log(2).
self.assertEqual(r[1][1], scaled_int(inverse_doc_frequency(3, 3)))
# Similarly for "two".
r, num = self.zc_index.query("two")
self.assertEqual(num, 2)
self.assertEqual(len(r), 2)
self.assertEqual([doc for doc, score in r], [2, 3])
self.assertEqual(r[0][1], scaled_int(inverse_doc_frequency(2, 3)))
# And "three", except that doesn't appear in an average-size doc, so
# the math is much more involved.
r, num = self.zc_index.query("three")
self.assertEqual(num, 1)
self.assertEqual(len(r), 1)
self.assertEqual([doc for doc, score in r], [3])
idf = inverse_doc_frequency(1, 3)
meandoclen = 2.0
lengthweight = 1.0 - OkapiIndex.B + OkapiIndex.B * 3 / meandoclen
tf = (1.0 + OkapiIndex.K1) / (1.0 + OkapiIndex.K1 * lengthweight)
self.assertEqual(r[0][1], scaled_int(tf * idf))
# More of a black-box test, but based on insight into how Okapi is trying
# to think.
def testRelativeScores(self):
# Create 9 10-word docs.
# All contain one instance of "one".
# Doc #i contains i instances of "two" and 9-i of "xyz".
for i in range(1, 10):
doc = "one " + "two " * i + "xyz " * (9 - i)
self.zc_index.index_object(i, Indexable(doc))
self._checkRelativeScores()
# Exercise re-indexing.
self.zc_index.index_object(9, Indexable("two xyz"))
try:
self._checkRelativeScores()
except AssertionError:
pass
else:
self.fail("expected _checkRelativeScores() to fail after reindex")
# This should leave things exactly as they were.
self.zc_index.index_object(9, Indexable(doc))
self._checkRelativeScores()
def _checkRelativeScores(self):
r, num = self.zc_index.query("one two")
self.assertEqual(num, 9)
self.assertEqual(len(r), 9)
# The more twos in a doc, the better the score should be.
self.assertEqual([doc for doc, score in r], range(9, 0, -1))
# Search for "two" alone shouldn't make any difference to relative
# results.
r, num = self.zc_index.query("two")
self.assertEqual(num, 9)
self.assertEqual(len(r), 9)
self.assertEqual([doc for doc, score in r], range(9, 0, -1))
# Searching for xyz should skip doc 9, and favor the lower-numbered
# docs (they have more instances of xyz).
r, num = self.zc_index.query("xyz")
self.assertEqual(num, 8)
self.assertEqual(len(r), 8)
self.assertEqual([doc for doc, score in r], range(1, 9))
# And relative results shouldn't change if we add "one".
r, num = self.zc_index.query("xyz one")
self.assertEqual(num, 8)
self.assertEqual(len(r), 8)
self.assertEqual([doc for doc, score in r], range(1, 9))
# But if we search for all the words, it's much muddier. The boost
# in going from i instances to i+1 of a given word is smaller than
# the boost in going from i-1 to i, so the winner will be the one
# that balances the # of twos and xyzs best. But the test is nasty
# that way: doc 4 has 4 two and 5 xyz, while doc 5 has the reverse.
# However, xyz is missing from doc 9, so xyz has a larger idf than
# two has. Since all the doc lengths are the same, doc lengths don't
# matter. So doc 4 should win, and doc 5 should come in second.
# The loser will be the most unbalanced, but is that doc 1 (1 two 8
# xyz) or doc 8 (8 two 1 xyz)? Again xyz has a higher idf, so doc 1
# is more valuable, and doc 8 is the loser.
r, num = self.zc_index.query("xyz one two")
self.assertEqual(num, 8)
self.assertEqual(len(r), 8)
self.assertEqual(r[0][0], 4) # winner
self.assertEqual(r[1][0], 5) # runner up
self.assertEqual(r[-1][0], 8) # loser
self.assertEqual(r[-2][0], 1) # penultimate loser
# And nothing about the relative results in the last test should
# change if we leave "one" out of the search (it appears in all
# docs, so it's a wash).
r, num = self.zc_index.query("two xyz")
self.assertEqual(num, 8)
self.assertEqual(len(r), 8)
self.assertEqual(r[0][0], 4) # winner
self.assertEqual(r[1][0], 5) # runner up
self.assertEqual(r[-1][0], 8) # loser
self.assertEqual(r[-2][0], 1) # penultimate loser
############################################################################
# Subclasses of QueryTestsBase must set a class variable IndexFactory to
# the kind of index to be constructed.
class QueryTestsBase(testQueryEngine.TestQueryEngine,
testQueryParser.TestQueryParser):
# The FauxIndex in testQueryEngine contains four documents.
# docid 1: foo, bar, ham
# docid 2: bar, ham
# docid 3: foo, ham
# docid 4: ham
docs = ["foo bar ham", "bar ham", "foo ham", "ham"]
def setUp(self):
self.lexicon = PLexicon('lexicon', '',
Splitter(),
CaseNormalizer(),
StopWordRemover())
caller = LexiconHolder(self.lexicon)
self.zc_index = ZCTextIndex('name',
None,
caller,
self.IndexFactory,
'text',
'lexicon')
self.parser = QueryParser(self.lexicon)
self.index = self.zc_index.index
self.add_docs()
def add_docs(self):
for i in range(len(self.docs)):
text = self.docs[i]
obj = Indexable(text)
self.zc_index.index_object(i + 1, obj)
def compareSet(self, set, dict):
# XXX The FauxIndex and the real Index score documents very
# differently. The set comparison can't actually compare the
# items, but it can compare the keys. That will have to do for now.
setkeys = list(set.keys())
dictkeys = dict.keys()
setkeys.sort()
dictkeys.sort()
self.assertEqual(setkeys, dictkeys)
class CosineQueryTests(QueryTestsBase):
IndexFactory = CosineIndex
class OkapiQueryTests(QueryTestsBase):
IndexFactory = OkapiIndex
class PLexiconTests(unittest.TestCase):
def _getTargetClass(self):
from Products.ZCTextIndex.ZCTextIndex import PLexicon
return PLexicon
def _makeOne(self, id='testing', title='Testing', *pipeline):
return self._getTargetClass()(id, title, *pipeline)
def test_class_conforms_to_ILexicon(self):
from Products.ZCTextIndex.interfaces import ILexicon
from zope.interface.verify import verifyClass
verifyClass(ILexicon, self._getTargetClass())
def test_instance_conforms_to_ILexicon(self):
from Products.ZCTextIndex.interfaces import ILexicon
from zope.interface.verify import verifyObject
verifyObject(ILexicon, self._makeOne())
def test_class_conforms_to_IZCLexicon(self):
from Products.ZCTextIndex.interfaces import IZCLexicon
from zope.interface.verify import verifyClass
verifyClass(IZCLexicon, self._getTargetClass())
def test_instance_conforms_to_IZCLexicon(self):
from Products.ZCTextIndex.interfaces import IZCLexicon
from zope.interface.verify import verifyObject
verifyObject(IZCLexicon, self._makeOne())
def test_queryLexicon_defaults_empty(self):
lexicon = self._makeOne()
info = lexicon.queryLexicon(REQUEST=None, words=None)
self.assertEqual(info['page'], 0)
self.assertEqual(info['rows'], 20)
self.assertEqual(info['cols'], 4)
self.assertEqual(info['start_word'], 1)
self.assertEqual(info['end_word'], 0)
self.assertEqual(info['word_count'], 0)
self.assertEqual(list(info['page_range']), [])
self.assertEqual(info['page_columns'], [])
def test_queryLexicon_defaults_non_empty(self):
WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
lexicon = self._makeOne()
lexicon.sourceToWordIds(WORDS)
info = lexicon.queryLexicon(REQUEST=None, words=None)
self.assertEqual(info['page'], 0)
self.assertEqual(info['rows'], 20)
self.assertEqual(info['cols'], 4)
self.assertEqual(info['start_word'], 1)
self.assertEqual(info['end_word'], 7)
self.assertEqual(info['word_count'], 7)
self.assertEqual(list(info['page_range']), [0])
self.assertEqual(info['page_columns'], [WORDS])
def test_queryLexicon_row_breaks(self):
WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
lexicon = self._makeOne()
lexicon.sourceToWordIds(WORDS)
info = lexicon.queryLexicon(REQUEST=None, words=None, rows=4)
self.assertEqual(info['page'], 0)
self.assertEqual(info['rows'], 4)
self.assertEqual(info['cols'], 4)
self.assertEqual(info['start_word'], 1)
self.assertEqual(info['end_word'], 7)
self.assertEqual(info['word_count'], 7)
self.assertEqual(list(info['page_range']), [0])
self.assertEqual(info['page_columns'], [WORDS[0:4], WORDS[4:]])
def test_queryLexicon_page_breaks(self):
WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
lexicon = self._makeOne()
lexicon.sourceToWordIds(WORDS)
info = lexicon.queryLexicon(REQUEST=None, words=None, rows=2, cols=2)
self.assertEqual(info['page'], 0)
self.assertEqual(info['rows'], 2)
self.assertEqual(info['cols'], 2)
self.assertEqual(info['start_word'], 1)
self.assertEqual(info['end_word'], 4)
self.assertEqual(info['word_count'], 7)
self.assertEqual(list(info['page_range']), [0, 1])
self.assertEqual(info['page_columns'], [WORDS[0:2], WORDS[2:4]])
def test_queryLexicon_page_break_not_first(self):
WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
lexicon = self._makeOne()
lexicon.sourceToWordIds(WORDS)
info = lexicon.queryLexicon(REQUEST=None, words=None,
page=1, rows=2, cols=2)
self.assertEqual(info['page'], 1)
self.assertEqual(info['rows'], 2)
self.assertEqual(info['cols'], 2)
self.assertEqual(info['start_word'], 5)
self.assertEqual(info['end_word'], 7)
self.assertEqual(info['word_count'], 7)
self.assertEqual(list(info['page_range']), [0, 1])
self.assertEqual(info['page_columns'], [WORDS[4:6], WORDS[6:]])
def test_queryLexicon_words_no_globbing(self):
WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
lexicon = self._makeOne()
lexicon.sourceToWordIds(WORDS)
info = lexicon.queryLexicon(REQUEST=None, words=['aaa', 'bbb'])
self.assertEqual(info['page'], 0)
self.assertEqual(info['rows'], 20)
self.assertEqual(info['cols'], 4)
self.assertEqual(info['start_word'], 1)
self.assertEqual(info['end_word'], 2)
self.assertEqual(info['word_count'], 2)
self.assertEqual(list(info['page_range']), [0])
self.assertEqual(info['page_columns'], [['aaa', 'bbb']])
def test_queryLexicon_words_w_globbing(self):
WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
lexicon = self._makeOne()
lexicon.sourceToWordIds(WORDS)
info = lexicon.queryLexicon(REQUEST=None, words=['aa*', 'bbb*'])
self.assertEqual(info['page'], 0)
self.assertEqual(info['rows'], 20)
self.assertEqual(info['cols'], 4)
self.assertEqual(info['start_word'], 1)
self.assertEqual(info['end_word'], 2)
self.assertEqual(info['word_count'], 2)
self.assertEqual(list(info['page_range']), [0])
self.assertEqual(info['page_columns'], [['aaa', 'bbb']])
def test_queryLexicon_uses_pipeline_for_normalization(self):
from Products.ZCTextIndex.Lexicon import CaseNormalizer
WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
lexicon = self._makeOne('test', 'Testing', CaseNormalizer())
lexicon.sourceToWordIds(WORDS)
info = lexicon.queryLexicon(REQUEST=None, words=['AA*', 'Bbb*'])
self.assertEqual(info['page'], 0)
self.assertEqual(info['rows'], 20)
self.assertEqual(info['cols'], 4)
self.assertEqual(info['start_word'], 1)
self.assertEqual(info['end_word'], 2)
self.assertEqual(info['word_count'], 2)
self.assertEqual(list(info['page_range']), [0])
self.assertEqual(info['page_columns'], [['aaa', 'bbb']])
def test_suite():
s = unittest.TestSuite()
for klass in (CosineIndexTests, OkapiIndexTests,
CosineQueryTests, OkapiQueryTests, PLexiconTests):
s.addTest(unittest.makeSuite(klass))
return s
if __name__=='__main__':
unittest.main(defaultTest='test_suite')
#! /usr/bin/env python
"""Dump statistics about each word in the index.
usage: wordstats.py data.fs [index key]
"""
import ZODB
from ZODB.FileStorage import FileStorage
def main(fspath, key):
fs = FileStorage(fspath, read_only=1)
db = ZODB.DB(fs)
rt = db.open().root()
index = rt[key]
lex = index.lexicon
idx = index.index
print "Words", lex.length()
print "Documents", idx.length()
print "Word frequencies: count, word, wid"
for word, wid in lex.items():
docs = idx._wordinfo[wid]
print len(docs), word, wid
print "Per-doc scores: wid, (doc, score,)+"
for wid in lex.wids():
print wid,
docs = idx._wordinfo[wid]
for docid, score in docs.items():
print docid, score,
print
if __name__ == "__main__":
import sys
args = sys.argv[1:]
index_key = "index"
if len(args) == 1:
fspath = args[0]
elif len(args) == 2:
fspath, index_key = args
else:
print "Expected 1 or 2 args, got", len(args)
main(fspath, index_key)
...@@ -13,6 +13,7 @@ initgroups = 2.13.0 ...@@ -13,6 +13,7 @@ initgroups = 2.13.0
Missing = 2.13.1 Missing = 2.13.1
MultiMapping = 2.13.0 MultiMapping = 2.13.0
Persistence = 2.13.2 Persistence = 2.13.2
Products.ZCTextIndex = 2.13.0
Record = 2.13.0 Record = 2.13.0
RestrictedPython = 3.6.0a1 RestrictedPython = 3.6.0a1
tempstorage = 2.11.3 tempstorage = 2.11.3
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment