Commit 1c1a53d1 authored by Hanno Schlichting's avatar Hanno Schlichting

Products.ZCTextIndex was moved to its own distribution

parent 48f67574
...@@ -44,6 +44,7 @@ eggs = ...@@ -44,6 +44,7 @@ eggs =
Missing Missing
MultiMapping MultiMapping
Persistence Persistence
Products.ZCTextIndex
Record Record
RestrictedPython RestrictedPython
initgroups initgroups
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
############################################################################## ##############################################################################
import os import os
from setuptools import setup, find_packages, Extension from setuptools import setup, find_packages
setup(name='Zope2', setup(name='Zope2',
...@@ -29,18 +29,6 @@ setup(name='Zope2', ...@@ -29,18 +29,6 @@ setup(name='Zope2',
packages=find_packages('src'), packages=find_packages('src'),
namespace_packages=['Products'], namespace_packages=['Products'],
package_dir={'': 'src'}, package_dir={'': 'src'},
ext_modules=[
# indexes
Extension(
name='Products.ZCTextIndex.stopper',
sources=['src/Products/ZCTextIndex/stopper.c']),
Extension(
name='Products.ZCTextIndex.okascore',
sources=['src/Products/ZCTextIndex/okascore.c']),
],
install_requires=[ install_requires=[
'AccessControl', 'AccessControl',
'Acquisition', 'Acquisition',
...@@ -50,6 +38,7 @@ setup(name='Zope2', ...@@ -50,6 +38,7 @@ setup(name='Zope2',
'Missing', 'Missing',
'MultiMapping', 'MultiMapping',
'Persistence', 'Persistence',
'Products.ZCTextIndex',
'Record', 'Record',
'RestrictedPython', 'RestrictedPython',
'ZConfig', 'ZConfig',
......
This diff is collapsed.
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using a cosine measure."""
import math
from BTrees.IIBTree import IIBucket
from zope.interface import implements
from Products.ZCTextIndex.interfaces import IIndex
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.BaseIndex import inverse_doc_frequency
from Products.ZCTextIndex.BaseIndex import scaled_int
from Products.ZCTextIndex.BaseIndex import SCALE_FACTOR
class CosineIndex(BaseIndex):
    # Full text index that ranks documents with a cosine similarity measure.
    # (Documented with comments rather than docstrings on purpose: in Zope 2
    # a docstring can make an attribute publishable.)

    implements(IIndex)

    def __init__(self, lexicon):
        BaseIndex.__init__(self, lexicon)
        # ._wordinfo for cosine is wid -> {docid -> weight};
        # t -> D -> w(d, t)/W(d)
        # ._docweight for cosine is
        # docid -> W(docid)

    # Most of the computation for computing a relevance score for the
    # document occurs in the _search_wids() method.  The code currently
    # implements the cosine similarity function described in Managing
    # Gigabytes, eq. 4.3, p. 187.  The index_object() method
    # precomputes some values that are independent of the particular
    # query.

    # The equation is
    #
    #                     sum(for t in I(d,q): w(d,t) * w(q,t))
    #     cosine(d, q) =  -------------------------------------
    #                                  W(d) * W(q)
    #
    # where
    #    I(d, q) = the intersection of the terms in d and q.
    #
    #    w(d, t) = 1 + log f(d, t)
    #        computed by doc_term_weight(); for a given word t,
    #        self._wordinfo[t] is a map from d to w(d, t).
    #
    #    w(q, t) = log(1 + N/f(t))
    #        computed by inverse_doc_frequency()
    #
    #    W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
    #        computed by _get_frequencies(), and remembered in
    #        self._docweight[d]
    #
    #    W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
    #        computed by self.query_weight()

    def _search_wids(self, wids):
        # Return a list of (d2w, scaled idf) pairs, one per wid, where d2w
        # maps docid -> stored weight.  An empty wid list gives [].
        if not wids:
            return []
        N = float(self.document_count())
        L = []
        for wid in wids:
            assert wid in self._wordinfo  # caller responsible for OOV
            d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
            idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
            if isinstance(d2w, dict):
                # Plain dicts are converted so every entry is an IIBucket.
                d2w = IIBucket(d2w)
            L.append((d2w, scaled_int(idf)))
        return L

    def query_weight(self, terms):
        # Return W(q) as a scaled int: the square root of the sum of the
        # squared inverse document frequencies of the in-vocabulary wids.
        wids = []
        for term in terms:
            wids += self._lexicon.termToWordIds(term)
        N = float(self.document_count())
        total = 0.0
        for wid in self._remove_oov_wids(wids):
            wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
            total += wt ** 2.0
        return scaled_int(math.sqrt(total))

    def _get_frequencies(self, wids):
        # Return (wid -> scaled w(d, t)/W(d), scaled W(d)) for one document
        # given the sequence of wids it contains (duplicates are counted).
        d = {}
        dget = d.get
        for wid in wids:
            d[wid] = dget(wid, 0) + 1
        Wsquares = 0.0
        # Only values are rebound while iterating, never keys.
        for wid, count in d.items():
            w = doc_term_weight(count)
            Wsquares += w * w
            d[wid] = w
        W = math.sqrt(Wsquares)
        for wid, weight in d.items():
            d[wid] = scaled_int(weight / W)
        return d, scaled_int(W)

    # The rest are helper methods to support unit tests

    def _get_wdt(self, d, t):
        # The trailing comma asserts that t maps to exactly one wid.
        wid, = self._lexicon.termToWordIds(t)
        d2w = self._wordinfo[wid]
        return d2w.get(d, 0) * self._docweight[d] / SCALE_FACTOR

    def _get_Wd(self, d):
        return self._docweight[d]

    def _get_ft(self, t):
        wid, = self._lexicon.termToWordIds(t)
        return len(self._wordinfo[wid])

    def _get_wt(self, t):
        wid, = self._lexicon.termToWordIds(t)
        d2w = self._wordinfo[wid]
        return scaled_int(
            math.log(1 + len(self._docweight) / float(len(d2w))))
def doc_term_weight(count):
    """Return w(d, t) = 1 + log f(d, t) for a term occurring count times.

    The natural logarithm is used, so a single occurrence weighs 1.0.
    """
    log_frequency = math.log(count)
    return 1.0 + log_frequency
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
import re
from zope.interface import implements
from Products.ZCTextIndex.interfaces import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory
class HTMLWordSplitter:
    # Splitter that lower-cases text, blanks out markup tags and alphabetic
    # character entities, and then extracts the words.

    implements(ISplitter)

    def process(self, text, wordpat=r"(?L)\w+"):
        # ``text`` is an iterable of strings; the words found in each one
        # are concatenated into a single flat list.
        words = []
        for chunk in text:
            words.extend(self._split(chunk, wordpat))
        return words

    def processGlob(self, text):
        # see Lexicon.globToWordIds()
        # Same as process(), but '*' and '?' may follow the first word
        # character so that glob patterns survive splitting.
        glob_wordpat = r"(?L)\w+[\w*?]*"
        return self.process(text, glob_wordpat)

    def _split(self, text, wordpat):
        lowered = text.lower()
        # Tags are replaced first, then entities such as "&amp;"; each is
        # replaced by a space so the neighbouring words stay separated.
        without_tags = re.sub(r"<[^<>]*>", " ", lowered)
        without_entities = re.sub(r"&[A-Za-z]+;", " ", without_tags)
        return re.findall(wordpat, without_entities)
# Register the HTML-aware splitter in the 'Word Splitter' group of the
# shared pipeline element factory, under the name 'HTML aware splitter'.
element_factory.registerFactory('Word Splitter',
'HTML aware splitter',
HTMLWordSplitter)
if __name__ == "__main__":
    # Command-line check: for every file named on the command line, print
    # its path followed by the list of words the splitter extracts.
    import sys
    splitter = HTMLWordSplitter()
    for path in sys.argv[1:]:
        # The with-block closes the file even when read() raises.
        with open(path, "rb") as f:
            buf = f.read()
        # Single-argument print(...) parses on both Python 2 and 3.
        print(path)
        print(splitter.process([buf]))
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index Interface."""
from Products.ZCTextIndex.interfaces import IIndex # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import INBest # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElement # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElementFactory # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElementFactory # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import IQueryParser # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import ISplitter # BBB
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon.
$Id$
"""
import re
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.Length import Length
from Persistence import Persistent
from zope.interface import implements
from Products.ZCTextIndex.interfaces import ILexicon
from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import QueryError
from Products.ZCTextIndex.PipelineFactory import element_factory
class Lexicon(Persistent):
    # Persistent two-way mapping between words and integer word ids (wids).
    # Text is run through the configured pipeline elements before lookup.
    # NOTE(review): new documentation is written as comments, not
    # docstrings; the Zope 2 publisher is understood to treat a docstring
    # as a marker for publishable attributes -- confirm before converting.

    implements(ILexicon)

    def __init__(self, *pipeline):
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self.length = Length()
        self._pipeline = pipeline

    def length(self):
        """Return the number of unique terms in the lexicon."""
        # Overridden in instances
        return len(self._wids)

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        # Index-time entry point: run text through the pipeline and return
        # the list of wids, creating new wids for unseen words.
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not hasattr(self.length, 'change'):
            # Make sure length is overridden with a BTrees.Length.Length
            self.length = Length(self.length())
        # Strategically unload the length value so that we get the most
        # recent value written to the database to minimize conflicting wids
        # Because length is independent, this will load the most
        # recent value stored, regardless of whether MVCC is enabled
        self.length._p_deactivate()
        return [self._getWordIdCreate(word) for word in last]

    def termToWordIds(self, text):
        # Query-time lookup: unknown words map to the reserved wid 0 and
        # nothing is added to the lexicon.
        last = _text2list(text)
        for element in self._pipeline:
            # Elements may supply a dedicated post-glob processing hook.
            process = getattr(element, "process_post_glob", element.process)
            last = process(last)
        return [self._wids.get(word, 0) for word in last]

    def parseTerms(self, text):
        # Split query text into terms, using each element's glob-aware
        # processGlob() when it has one.
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these.
        # Returns the list of wids whose word matches; raises QueryError
        # when the pattern starts with a globbing character.
        split = 0
        while split < len(pattern) and pattern[split] not in "*?":
            split += 1
        prefix, pattern = pattern[:split], pattern[split:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too inefficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        # The prefix is literal text, so escape it like every other
        # non-glob character; otherwise a regex metacharacter in the prefix
        # would break compilation or change what is matched.
        pat = re.escape(prefix)
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            # Keys are sorted, so the first key without the prefix ends the
            # range of candidates.
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        # Return the wid for word, allocating and recording a new one when
        # the word has not been seen before.
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        # Incrementing before use keeps wid 0 reserved for OOV words.
        self.length.change(1)
        while self.length() in self._words:  # just to be safe
            self.length.change(1)
        return self.length()
def _text2list(text):
# Helper: splitter input may be a string or a list of strings
try:
text + ""
except:
return text
else:
return [text]
# Sample pipeline elements
class Splitter:
import re
rx = re.compile(r"(?L)\w+")
rxGlob = re.compile(r"(?L)\w+[\w*?]*") # See globToWordIds() above
def process(self, lst):
result = []
for s in lst:
result += self.rx.findall(s)
return result
def processGlob(self, lst):
result = []
for s in lst:
result += self.rxGlob.findall(s)
return result
# Register the plain splitter in the 'Word Splitter' group under the name
# 'Whitespace splitter'.
element_factory.registerFactory('Word Splitter',
'Whitespace splitter',
Splitter)
class CaseNormalizer:
    # Sample pipeline element that lower-cases every word.

    def process(self, lst):
        # Build a new list; the input sequence is left untouched.
        lowered = []
        for word in lst:
            lowered.append(word.lower())
        return lowered
# Register the case normalizer as the single choice of its own group.
element_factory.registerFactory('Case Normalizer',
'Case Normalizer',
CaseNormalizer)
# Registering None as the factory adds a 'Stop Words' choice that stands for
# "no stop word element".  The leading space is part of the registered name
# (presumably so that it sorts ahead of the other choices -- confirm).
element_factory.registerFactory('Stop Words',
' Don\'t remove stop words',
None)
class StopWordRemover:
    # Pipeline element that removes every word present in ``dict``.

    # Stop word mapping consulted by process(); a private copy, so changes
    # to it do not affect the mapping returned by get_stopdict().
    dict = get_stopdict().copy()

    try:
        # Prefer the optional C implementation when it can be imported.
        from Products.ZCTextIndex.stopper import process as _process
    except ImportError:
        # Pure Python fallback.
        def process(self, lst):
            stopwords = self.dict
            return [w for w in lst if w not in stopwords]
    else:
        def process(self, lst):
            # NOTE(review): relies on _process not being bound as a method
            # (presumably a builtin function from the C extension), so the
            # mapping is its first argument -- confirm.
            return self._process(self.dict, lst)
# Register the plain stop word remover in the 'Stop Words' group.
element_factory.registerFactory('Stop Words',
'Remove listed stop words only',
StopWordRemover)
class StopWordAndSingleCharRemover(StopWordRemover):
    # Variant of StopWordRemover whose stop word mapping also contains
    # one-character strings, so single-character words are removed too.

    # Start from a fresh copy so the parent's mapping is not modified.
    dict = get_stopdict().copy()
    # Adds chr(0) through chr(254); range(255) stops short of chr(255).
    # NOTE(review): confirm whether leaving out chr(255) is intentional.
    for c in range(255):
        dict[chr(c)] = None
# Register the stop word and single character remover in the 'Stop Words'
# group.
element_factory.registerFactory('Stop Words',
'Remove listed and single char words',
StopWordAndSingleCharRemover)
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""NBest
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
from bisect import bisect
from zope.interface import implements
from Products.ZCTextIndex.interfaces import INBest
class NBest:
    # Remembers the N best-scoring items passed to add()/addmany(), kept in
    # two parallel lists sorted by ascending score.

    implements(INBest)

    def __init__(self, N):
        "Build an NBest object to remember the N best-scoring objects."
        if N < 1:
            raise ValueError("NBest() argument must be at least 1")
        self._capacity = N
        # This does a very simple thing with sorted lists.  For large
        # N, a min-heap can be unboundedly better in terms of data
        # movement time.
        self._scores = []
        self._items = []

    def __len__(self):
        return len(self._scores)

    def capacity(self):
        return self._capacity

    def add(self, item, score):
        # A single item is just a one-element batch.
        pair = (item, score)
        self.addmany([pair])

    def addmany(self, sequence):
        scores = self._scores
        items = self._items
        capacity = self._capacity
        n = len(scores)
        for item, score in sequence:
            # When we're in steady-state, the usual case is that we're filled
            # to capacity, and that an incoming item is worse than any of
            # the best-seen so far.
            if n >= capacity and score <= scores[0]:
                continue
            position = bisect(scores, score)
            scores.insert(position, score)
            items.insert(position, item)
            if n != capacity:
                n += 1
            else:
                # Already full: drop the current worst entry again.
                del items[0], scores[0]
        assert n == len(scores)

    def getbest(self):
        # (item, score) pairs ordered from best to worst.
        pairs = list(zip(self._items, self._scores))
        pairs.reverse()
        return pairs

    def pop_smallest(self):
        if not self._scores:
            raise IndexError("pop_smallest() called on empty NBest object")
        return self._items.pop(0), self._scores.pop(0)
This diff is collapsed.
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Generic parser support: exception and parse tree nodes."""
from BTrees.IIBTree import difference
from zope.interface import implements
from Products.ZCTextIndex.interfaces import IQueryParseTree
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion
class QueryError(Exception):
    """Raised when a query cannot be executed."""
class ParseError(Exception):
    """Raised when a query string cannot be parsed."""
class ParseTreeNode:
    # Base class of all parse tree nodes; a node wraps a single value
    # (for the generic case, a sequence of child nodes).

    implements(IQueryParseTree)

    _nodeType = None

    def __init__(self, value):
        self._value = value

    def nodeType(self):
        return self._nodeType

    def getValue(self):
        return self._value

    def __repr__(self):
        class_name = self.__class__.__name__
        return "%s(%r)" % (class_name, self.getValue())

    def terms(self):
        # Concatenate the terms of every child, in child order.
        collected = []
        for child in self.getValue():
            collected += child.terms()
        return collected

    def executeQuery(self, index):
        # Subclasses supply the actual query execution.
        raise NotImplementedError
class NotNode(ParseTreeNode):
    # Negation of a single subtree.  It contributes no search terms and is
    # only meaningful when an enclosing node evaluates it.

    _nodeType = "NOT"

    def terms(self):
        return []

    def executeQuery(self, index):
        # Call form of raise: valid on Python 2 and 3, unlike the
        # comma form "raise X, msg".
        raise QueryError("NOT parse tree node cannot be executed directly")
class AndNode(ParseTreeNode):
    # Conjunction: intersects the results of the positive children and
    # subtracts the union of the results of the negated children.

    _nodeType = "AND"

    def executeQuery(self, index):
        positives = []
        negatives = []
        for subnode in self.getValue():
            negated = subnode.nodeType() == "NOT"
            if negated:
                # Execute the subtree wrapped by the NOT node.
                target = subnode.getValue()
            else:
                target = subnode
            r = target.executeQuery(index)
            if r is None:
                # For a negated child: technically it matches every doc,
                # but we treat it as if it matched none (we want
                #     real_word AND NOT stop_word
                # to act like plain real_word).
                # For a positive child: technically it matches every doc,
                # so needn't be included.
                continue
            if negated:
                negatives.append((r, 1))
            else:
                positives.append((r, 1))
        result = mass_weightedIntersection(positives)
        if negatives:
            excluded = mass_weightedUnion(negatives)
            result = difference(result, excluded)
        return result
class OrNode(ParseTreeNode):
    # Disjunction: the weighted union of the children's results.

    _nodeType = "OR"

    def executeQuery(self, index):
        results = [node.executeQuery(index) for node in self.getValue()]
        # If None, technically it matches every doc, but we treat
        # it as if it matched none (we want
        #     real_word OR stop_word
        # to act like plain real_word).
        weighted = [(r, 1) for r in results if r is not None]
        return mass_weightedUnion(weighted)
class AtomNode(ParseTreeNode):
    # Leaf node whose value is one search term.

    _nodeType = "ATOM"

    def terms(self):
        # A leaf's only term is its own value.
        term = self.getValue()
        return [term]

    def executeQuery(self, index):
        term = self.getValue()
        return index.search(term)
class PhraseNode(AtomNode):
    # Leaf node executed as a phrase search on its value.

    _nodeType = "PHRASE"

    def executeQuery(self, index):
        phrase = self.getValue()
        return index.search_phrase(phrase)
class GlobNode(AtomNode):
    # Leaf node executed as a glob search on its value.

    _nodeType = "GLOB"

    def executeQuery(self, index):
        pattern = self.getValue()
        return index.search_glob(pattern)
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from zope.interface import implements
from Products.ZCTextIndex.interfaces import IPipelineElementFactory
class PipelineElementFactory:
    # Registry of pipeline element factories, stored as
    # group name -> {element name -> factory}.

    implements(IPipelineElementFactory)

    def __init__(self):
        self._groups = {}

    def registerFactory(self, group, name, factory):
        # A new group cannot already hold ``name``, so creating the group
        # first does not change when the duplicate error is raised.
        elements = self._groups.setdefault(group, {})
        if name in elements:
            raise ValueError('ZCTextIndex lexicon element "%s" '
                             'already registered in group "%s"'
                             % (name, group))
        elements[name] = factory

    def getFactoryGroups(self):
        # Sorted list of the registered group names.
        return sorted(self._groups)

    def getFactoryNames(self, group):
        # Sorted list of the element names registered in ``group``.
        return sorted(self._groups[group])

    def instantiate(self, group, name):
        # A factory registered as None yields None instead of an element.
        factory = self._groups[group][name]
        if factory is None:
            return None
        return factory()
# Module-level registry instance; other modules import this object and call
# its registerFactory() to make their pipeline elements available.
element_factory = PipelineElementFactory()
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser.
This particular parser recognizes the following syntax:
Start = OrExpr
OrExpr = AndExpr ('OR' AndExpr)*
AndExpr = Term ('AND' NotExpr)*
NotExpr = ['NOT'] Term
Term = '(' OrExpr ')' | ATOM+
The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal (ignoring case) to one of the key words
'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the
string can contain whitespace, parentheses and key words, but not
quotes.
+ A hyphen followed by one of the two forms above, meaning that it
must not be present.
An unquoted ATOM may also contain globbing characters. Globbing
syntax is defined by the lexicon; for example "foo*" could mean any
word starting with "foo".
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
is interpreted as a NOT operator.
Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- * and ? are used for globbing (i.e. prefix search), e.g. ``foo*''
"""
import re
from zope.interface import implements
from Products.ZCTextIndex.interfaces import IQueryParser
from Products.ZCTextIndex import ParseTree
# Create unique symbols for token types.
# The parser compares token types by identity ("is"), so every token type
# must be one of these exact objects.
_AND = intern("AND")
_OR = intern("OR")
_NOT = intern("NOT")
_LPAREN = intern("(")
_RPAREN = intern(")")
_ATOM = intern("ATOM")
_EOF = intern("EOF")
# Map keyword string to token type.
# Looked up with the upper-cased token text; any text missing from this
# table is classified as _ATOM by the caller.
_keywords = {
_AND: _AND,
_OR: _OR,
_NOT: _NOT,
_LPAREN: _LPAREN,
_RPAREN: _RPAREN,
}
# Regular expression to tokenize.
# Each match is one token: a single parenthesis, or an optionally
# hyphen-prefixed double-quoted string or bare word.  The pattern has no
# capturing groups, so findall() returns the matched text itself.
_tokenizer_regex = re.compile(r"""
# a paren
[()]
# or an optional hyphen
| -?
# followed by
(?:
# a string inside double quotes (and not containing these)
" [^"]* "
# or a non-empty stretch w/o whitespace, parens or double quotes
| [^()\s"]+
)
""", re.VERBOSE)
# Use unicode regex to treat fullwidth space characters defined in Unicode
# as valid whitespace.
# Built from the same pattern text and flags, with re.UNICODE added.
_tokenizer_unicode_regex = re.compile(
_tokenizer_regex.pattern, _tokenizer_regex.flags|re.UNICODE)
class QueryParser:
    # Recursive descent parser that turns a query string into a tree of
    # ParseTree nodes, using the lexicon to split and classify atoms.

    implements(IQueryParser)

    # This class is not thread-safe;
    # each thread should have its own instance

    def __init__(self, lexicon):
        self._lexicon = lexicon
        self._ignored = None

    # Public API methods

    def parseQuery(self, query):
        # Return the parse tree for query.  Raises ParseTree.ParseError on
        # a syntax error or when nothing but ignored words remains.
        # Lexical analysis.
        try:
            # Try to use unicode and treat fullwidth whitespace as valid one.
            if not isinstance(query, unicode):
                query = query.decode('utf-8')
            tokens = _tokenizer_unicode_regex.findall(query)
        except UnicodeDecodeError:
            # Not valid UTF-8: tokenize the undecoded string instead.
            tokens = _tokenizer_regex.findall(query)
        self._tokens = tokens
        # classify tokens
        self._tokentypes = [_keywords.get(token.upper(), _ATOM)
                            for token in tokens]
        # add _EOF
        self._tokens.append(_EOF)
        self._tokentypes.append(_EOF)
        self._index = 0

        # Syntactical analysis.
        self._ignored = []  # Ignored words in the query, for parseQueryEx
        tree = self._parseOrExpr()
        self._require(_EOF)
        if tree is None:
            raise ParseTree.ParseError(
                "Query contains only common words: %s" % repr(query))
        return tree

    def getIgnored(self):
        return self._ignored

    def parseQueryEx(self, query):
        # Like parseQuery(), but also return the ignored terms.
        tree = self.parseQuery(query)
        ignored = self.getIgnored()
        return tree, ignored

    # Recursive descent parser

    def _require(self, tokentype):
        # Consume a token of the given type or raise ParseError.
        if not self._check(tokentype):
            t = self._tokens[self._index]
            msg = "Token %r required, %r found" % (tokentype, t)
            # Call form of raise: valid on Python 2 and 3.
            raise ParseTree.ParseError(msg)

    def _check(self, tokentype):
        # Consume the current token and return 1 when it has the given
        # type; otherwise leave the position unchanged and return 0.
        if self._tokentypes[self._index] is tokentype:
            self._index += 1
            return 1
        else:
            return 0

    def _peek(self, tokentype):
        return self._tokentypes[self._index] is tokentype

    def _get(self, tokentype):
        t = self._tokens[self._index]
        self._require(tokentype)
        return t

    def _parseOrExpr(self):
        L = [self._parseAndExpr()]
        while self._check(_OR):
            L.append(self._parseAndExpr())
        # Drop branches that reduced to nothing (None).
        L = [node for node in L if node]
        if not L:
            return None  # Only stopwords
        elif len(L) == 1:
            return L[0]
        else:
            return ParseTree.OrNode(L)

    def _parseAndExpr(self):
        L = []
        t = self._parseTerm()
        if t is not None:
            L.append(t)
        Nots = []
        while self._check(_AND):
            t = self._parseNotExpr()
            if t is None:
                continue
            if isinstance(t, ParseTree.NotNode):
                Nots.append(t)
            else:
                L.append(t)
        if not L:
            return None  # Only stopwords
        # Negated operands go after all positive ones.
        L.extend(Nots)
        if len(L) == 1:
            return L[0]
        else:
            return ParseTree.AndNode(L)

    def _parseNotExpr(self):
        if self._check(_NOT):
            t = self._parseTerm()
            if t is None:
                return None  # Only stopwords
            return ParseTree.NotNode(t)
        else:
            return self._parseTerm()

    def _parseTerm(self):
        if self._check(_LPAREN):
            tree = self._parseOrExpr()
            self._require(_RPAREN)
        else:
            nodes = [self._parseAtom()]
            while self._peek(_ATOM):
                nodes.append(self._parseAtom())
            nodes = [node for node in nodes if node]
            if not nodes:
                return None  # Only stopwords
            # Stable partition: positive nodes first, then NotNodes, each
            # group keeping its original order.
            positives = [node for node in nodes
                         if not isinstance(node, ParseTree.NotNode)]
            negatives = [node for node in nodes
                         if isinstance(node, ParseTree.NotNode)]
            nodes = positives + negatives
            if isinstance(nodes[0], ParseTree.NotNode):
                raise ParseTree.ParseError(
                    "a term must have at least one positive word")
            if len(nodes) == 1:
                return nodes[0]
            tree = ParseTree.AndNode(nodes)
        return tree

    def _parseAtom(self):
        # Consume one ATOM token and return its node, or None (recording
        # the term as ignored) when the lexicon yields no words for it.
        term = self._get(_ATOM)
        words = self._lexicon.parseTerms(term)
        if not words:
            self._ignored.append(term)
            return None
        if len(words) > 1:
            tree = ParseTree.PhraseNode(words)
        elif self._lexicon.isGlob(words[0]):
            tree = ParseTree.GlobNode(words[0])
        else:
            tree = ParseTree.AtomNode(words[0])
        if term[0] == "-":
            # A leading hyphen on the raw token negates the atom.
            tree = ParseTree.NotNode(tree)
        return tree
ZCTextIndex
===========
This product is a replacement for the full text indexing facility of
ZCatalog. Specifically, it is an alternative to
PluginIndexes/TextIndex.
Advantages of using ZCTextIndex over TextIndex:
- A new query language, supporting both explicit and implicit Boolean
operators, parentheses, globbing, and phrase searching. Apart from
explicit operators and globbing, the syntax is roughly the same as
that popularized by Google.
- A more refined scoring algorithm, resulting in better selectiveness:
it's much more likely that you'll find the document you are looking
for among the first few highest-ranked results.
- Actually, ZCTextIndex gives you a choice of two scoring algorithms
from recent literature: the Cosine ranking from the Managing
Gigabytes book, and Okapi from more recent research papers. Okapi
usually does better, so it is the default (but your mileage may
vary).
- A redesigned Lexicon, using a pipeline architecture to split the
input text into words. This makes it possible to mix and match
pipeline components, e.g. you can choose between an HTML-aware
splitter and a plain text splitter, and additional components can be
added to the pipeline for case folding, stopword removal, and other
features. Enough example pipeline components are provided to get
you started, and it is very easy to write new components.
Performance is roughly the same as for TextIndex, and we're expecting
to make tweaks to the code that will make it faster.
This code can be used outside of Zope too; all you need is a
standalone ZODB installation to make your index persistent. Several
functional test programs in the tests subdirectory show how to do
this, for example mhindex.py, mailtest.py, indexhtml.py, and
queryhtml.py.
See the online help for how to use ZCTextIndex within Zope. (Included
in the subdirectory "help".)
Code overview
-------------
ZMI interface:
__init__.py ZMI publishing code
ZCTextIndex.py pluggable index class
PipelineFactory.py ZMI helper to configure the pipeline
Indexing:
BaseIndex.py common code for Cosine and Okapi index
CosineIndex.py Cosine index implementation
OkapiIndex.py Okapi index implementation
okascore.c C implementation of scoring loop
Lexicon:
Lexicon.py lexicon and sample pipeline elements
HTMLSplitter.py HTML-aware splitter
StopDict.py list of English stopwords
stopper.c C implementation of stop word remover
Query parser:
QueryParser.py parse a query into a parse tree
ParseTree.py parse tree node classes and exceptions
Utilities:
NBest.py find N best items in a list without sorting
SetOps.py efficient weighted set operations
WidCode.py list compression allowing phrase searches
RiceCode.py list compression code (as yet unused)
Interfaces (these speak for themselves):
IIndex.py
ILexicon.py
INBest.py
IPipelineElement.py
IPipelineElementFactory.py
IQueryParseTree.py
IQueryParser.py
ISplitter.py
Subdirectories:
dtml ZMI templates
help ZMI help files
tests unittests and some functional tests/examples
www images used in the ZMI
Tests
-----
Functional tests and helpers:
hs-tool.py helper to interpret hotshot profiler logs
indexhtml.py index a collection of HTML files
mailtest.py index and query a Unix mailbox file
mhindex.py index and query a set of MH folders
python.txt output from benchmark queries
queryhtml.py query an index created by indexhtml.py
wordstats.py dump statistics about each indexed word
Unit tests (these speak for themselves):
testIndex.py
testLexicon.py
testNBest.py
testPipelineFactory.py
testQueryEngine.py
testQueryParser.py
testSetOps.py
testStopper.py
testZCTextIndex.py
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Rice coding (a variation of Golomb coding)
Based on a Java implementation by Glen McCluskey described in a Usenix
;login: article at
http://www.usenix.org/publications/login/2000-4/features/java.html
McCluskey's article explains the approach as follows. The encoding
for a value x is represented as a unary part and a binary part. The
unary part is a sequence of 1 bits followed by a 0 bit. The binary
part encodes some of the lower bits of x-1.
The encoding is parameterized by a value m that describes how many
bits to store in the binary part. If most of the values are smaller
than 2**m then they can be stored in only m+1 bits.
Compute the length of the unary part, q, where
q = math.floor((x-1)/ 2 ** m)
Emit q 1 bits followed by a 0 bit.
Emit the lower m bits of x-1, treating x-1 as a binary value.
"""
import array
class BitArray:
    """Growable sequence of bits packed eight to a byte.

    Bit i lives in byte i // 8 under mask 2 ** (i % 8), i.e. the lowest
    bit of each byte is filled first.
    """

    def __init__(self, buf=None):
        # NOTE(review): buf is accepted for interface compatibility but is
        # not used; the array always starts out empty.
        self.bytes = array.array('B')
        self.nbits = 0      # number of bits appended so far
        self.bitsleft = 0   # unused bits remaining in the last byte

    def tostring(self):
        """Return the packed bytes as a binary string.

        This is a real method rather than an attribute bound in __init__
        so that it also exists on instances rebuilt by __setstate__,
        which does not run __init__.
        """
        data = self.bytes
        # array.tostring() is spelled tobytes() on newer Pythons.
        if hasattr(data, "tobytes"):
            return data.tobytes()
        return data.tostring()

    def __getitem__(self, i):
        """Return bit i as 1 or 0."""
        byte, offset = divmod(i, 8)
        if self.bytes[byte] & (1 << offset):
            return 1
        return 0

    def __setitem__(self, i, val):
        """Set bit i if val is true, clear it otherwise."""
        byte, offset = divmod(i, 8)
        mask = 1 << offset
        if val:
            self.bytes[byte] |= mask
        else:
            self.bytes[byte] &= ~mask

    def __len__(self):
        """Return the number of bits appended."""
        return self.nbits

    def append(self, bit):
        """Append a 1 if bit is true or a 0 if it is false."""
        if self.bitsleft == 0:
            # Last byte is full (or there is none yet): start a new one.
            self.bytes.append(0)
            self.bitsleft = 8
        self.__setitem__(self.nbits, bit)
        self.nbits += 1
        self.bitsleft -= 1

    def __getstate__(self):
        """Return (nbits, bitsleft, packed bytes) for pickling."""
        return self.nbits, self.bitsleft, self.tostring()

    def __setstate__(self, state):
        """Restore from the tuple produced by __getstate__."""
        nbits, bitsleft, s = state
        self.bytes = array.array('B', s)
        self.nbits = nbits
        self.bitsleft = bitsleft
class RiceCode:
    """Append-only list of integers >= 1 stored as Rice codes.

    Each value x is written to a BitArray as (x-1) >> m one-bits, a
    single zero bit, and then the low m bits of x-1, most significant
    bit first.
    """

    def __init__(self, m):
        """Construct an empty RiceCode for m-bit values.

        Raises ValueError unless 0 <= m <= 16.
        """
        if not (0 <= m <= 16):
            raise ValueError("m must be between 0 and 16")
        self.init(m)
        self.bits = BitArray()
        self.len = 0

    def init(self, m):
        """Store m and the masks derived from it."""
        self.m = m
        self.lower = (1 << m) - 1
        # mask selects the top bit of the m-bit binary part.  With m == 0
        # there is no binary part, and 1 << -1 would raise ValueError.
        if m > 0:
            self.mask = 1 << (m - 1)
        else:
            self.mask = 0

    def append(self, val):
        """Append an item to the list.

        Raises ValueError if val is less than 1.
        """
        if val < 1:
            raise ValueError("value >= 1 expected, got %s" % repr(val))
        val -= 1
        # emit the unary part of the code
        q = val >> self.m
        for i in range(q):
            self.bits.append(1)
        self.bits.append(0)
        # emit the binary part, most significant bit first
        r = val & self.lower
        mask = self.mask
        while mask:
            self.bits.append(r & mask)
            mask >>= 1
        self.len += 1

    def __len__(self):
        """Return the number of items appended."""
        return self.len

    def tolist(self):
        """Return the items as a list."""
        l = []
        i = 0  # bit offset
        binary_range = range(self.m)
        for j in range(self.len):
            unary = 0
            while self.bits[i] == 1:
                unary += 1
                i += 1
            assert self.bits[i] == 0
            i += 1
            binary = 0
            for k in binary_range:
                binary = (binary << 1) | self.bits[i]
                i += 1
            l.append((unary << self.m) + (binary + 1))
        return l

    def tostring(self):
        """Return a binary string containing the encoded data.

        The binary string may contain some extra zeros at the end.
        """
        return self.bits.tostring()

    def __getstate__(self):
        """Return (m, bits) for pickling; the item count is not stored."""
        return self.m, self.bits

    def __setstate__(self, state):
        """Restore from (m, bits) and recompute the item count.

        The pickled state does not carry self.len, so it is rebuilt by
        walking the bit stream; without it len() and tolist() fail on an
        unpickled object.
        """
        m, bits = state
        self.init(m)
        self.bits = bits
        self.len = self._count_items()

    def _count_items(self):
        """Count the codes held in self.bits by scanning code boundaries."""
        bits = self.bits
        total = len(bits)
        pos = 0
        count = 0
        while pos < total:
            while bits[pos]:
                pos += 1
            # skip the terminating 0 bit and the m-bit binary part
            pos += 1 + self.m
            count += 1
        return count
def encode(m, l):
    """Return a RiceCode with an m-bit binary part holding the items of l.

    Asserts that decoding the result gives back l.
    """
    coded = RiceCode(m)
    add = coded.append
    for item in l:
        add(item)
    assert coded.tolist() == l
    return coded
def encode_deltas(l):
    """Return (first item, differences between neighbouring items of l).

    The differences are stored in a RiceCode(6); for a one-item list the
    second element is an empty plain list instead.
    """
    if len(l) == 1:
        return l[0], []
    deltas = RiceCode(6)
    for pos in range(1, len(l)):
        deltas.append(l[pos] - l[pos - 1])
    return l[0], deltas
def decode_deltas(start, enc_deltas):
    """Rebuild a list from its first item and its successive differences.

    enc_deltas is either an object with a tolist() method giving the
    differences, or a plain sequence of them (encode_deltas() returns an
    empty list for one-item input).  Every difference is applied exactly
    once, in order; previously the first one was skipped and the last
    one applied twice.

    Returns [start, start + d0, start + d0 + d1, ...].
    """
    if hasattr(enc_deltas, "tolist"):
        deltas = enc_deltas.tolist()
    else:
        deltas = list(enc_deltas)
    values = [start]
    for delta in deltas:
        values.append(values[-1] + delta)
    return values
def test():
    """Round-trip random data through encode() and the delta helpers."""
    import random
    for size in [10, 20, 50, 100, 200]:
        l = [random.randint(1, size) for i in range(50)]
        c = encode(random.randint(1, 16), l)
        assert c.tolist() == l
    for size in [10, 20, 50, 100, 200]:
        l = list(range(random.randint(1, size),
                       size + random.randint(1, size)))
        t = encode_deltas(l)
        l2 = decode_deltas(*t)
        # Show both lists before asserting; with the assert first, this
        # diagnostic output could never be reached.
        if l != l2:
            print(l)
            print(l2)
        assert l == l2
def pickle_efficiency():
    """Print pickled sizes of plain lists against Rice-coded lists.

    For each combination of m, list size and element range, one line is
    printed with the parameters, the two pickle lengths, and "win" when
    the encoded pickle is the shorter one, else "lose".
    """
    import pickle
    import random
    sizes = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]
    elt_ranges = [10, 20, 50, 100, 200, 500, 1000]
    for m in [4, 8, 12]:
        for size in sizes:
            for elt_range in elt_ranges:
                l = [random.randint(1, elt_range) for i in range(size)]
                raw = pickle.dumps(l, 1)
                enc = pickle.dumps(encode(m, l), 1)
                params = "m=%2d size=%4d range=%4d" % (m, size, elt_range)
                lengths = "%5d %5d" % (len(raw), len(enc))
                if len(raw) > len(enc):
                    verdict = "win"
                else:
                    verdict = "lose"
                print(" ".join([params, lengths, verdict]))
if __name__ == "__main__":
test()
<extension okascore>
source okascore.c
</extension>
<extension stopper>
source stopper.c
</extension>
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""SetOps -- Weighted intersections and unions applied to many inputs."""
from BTrees.IIBTree import IIBucket
from BTrees.IIBTree import weightedIntersection
from BTrees.IIBTree import weightedUnion
from Products.ZCTextIndex.NBest import NBest
def mass_weightedIntersection(L):
    """A list of (mapping, weight) pairs -> their weightedIntersection IIBucket.

    Pairs whose mapping is None are ignored.  With fewer than two pairs
    left, the result comes from _trivial().
    """
    L = [(x, wx) for (x, wx) in L if x is not None]
    if len(L) < 2:
        return _trivial(L)
    # Intersect with smallest first.  We expect the input maps to be
    # IIBuckets, so it doesn't hurt to get their lengths repeatedly
    # (len(Bucket) is fast; len(BTree) is slow).  A key function gives
    # the same stable ordering as the old cmp-style comparison.
    L.sort(key=lambda pair: len(pair[0]))
    (x, wx), (y, wy) = L[:2]
    dummy, result = weightedIntersection(x, y, wx, wy)
    # Earlier weights are already folded into result, so it carries 1.
    for x, wx in L[2:]:
        dummy, result = weightedIntersection(result, x, 1, wx)
    return result
def mass_weightedUnion(L):
    """A list of (mapping, weight) pairs -> their weightedUnion IIBucket.

    With fewer than two pairs the result comes from _trivial().
    """
    if len(L) < 2:
        return _trivial(L)
    # Balance unions as closely as possible, smallest to largest: keep
    # every pending (mapping, weight) pair in a queue scored by size.
    pending = NBest(len(L))
    for mapping, weight in L:
        pending.add((mapping, weight), len(mapping))
    while len(pending) > 1:
        # Merge the two smallest so far, and add back to the queue with
        # weight 1, since their own weights have now been applied.
        (first, w_first), ignored = pending.pop_smallest()
        (second, w_second), ignored = pending.pop_smallest()
        ignored, merged = weightedUnion(first, second, w_first, w_second)
        pending.add((merged, 1), len(merged))
    (result, weight), ignored = pending.pop_smallest()
    return result
def _trivial(L):
    """Handle the zero- and one-pair cases for the mass_* functions.

    An empty L gives an empty IIBucket.  A single (mapping, weight) pair
    gives the mapping itself when the weight is 1; otherwise the mapping
    is multiplied by its weight through a weightedUnion with an empty
    bucket of weight 0.
    """
    assert len(L) <= 1
    if len(L) == 0:
        return IIBucket()
    [(mapping, weight)] = L
    if weight == 1:
        return mapping
    ignored, scaled = weightedUnion(IIBucket(), mapping, 0, weight)
    return scaled
*shared*
stopper stopper.c
okascore okascore.c
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Provide a default list of stop words for the index.
The specific splitter and lexicon are customizable, but the default
ZCTextIndex should do something useful.
"""
def get_stopdict():
    """Return a dictionary of stopwords.

    The same module-level dictionary is returned on every call; its keys
    are the stopwords and every value is None.
    """
    return _dict

# This list of English stopwords comes from Lucene
_words = [
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
]

# dict.fromkeys maps every word to None without leaving a stray loop
# variable behind in the module namespace.
_dict = dict.fromkeys(_words)
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
# for smaller ints. This is intended for lists of word ids (wids). The
# ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding. Unlike
# UTF-8, the initial byte does not tell you how many continuation bytes
# follow; and there's no ASCII superset property.
# Details:
#
# + Only the first byte of an encoding has the sign bit set.
#
# + The first byte has 7 bits of data.
#
# + Bytes beyond the first in an encoding have the sign bit clear, followed
# by 7 bits of data.
#
# + The first byte doesn't tell you how many continuation bytes are
# following. You can tell by searching for the next byte with the
# high bit set (or the end of the string).
#
# The int to be encoded can contain no more than 28 bits.
#
# If it contains no more than 7 bits, 0abcdefg, the encoding is
# 1abcdefg
#
# If it contains 8 thru 14 bits,
# 00abcdef ghijkLmn
# the encoding is
# 1abcdefg 0hijkLmn
#
# Static tables _encoding and _decoding capture all encodes and decodes for
# 14 or fewer bits.
#
# If it contains 15 thru 21 bits,
# 000abcde fghijkLm nopqrstu
# the encoding is
# 1abcdefg 0hijkLmn 0opqrstu
#
# If it contains 22 thru 28 bits,
# 0000abcd efghijkL mnopqrst uvwxyzAB
# the encoding is
# 1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB
assert 0x80**2 == 0x4000
assert 0x80**4 == 0x10000000
import re
def encode(wids):
    """Encode a list of wids as a string.

    Wids below len(_encoding) use the precomputed table; anything else
    (or a table slot holding a false value) goes through _encode().
    """
    table = _encoding
    limit = len(table)
    pieces = []
    for wid in wids:
        piece = wid < limit and table[wid]
        if not piece:
            piece = _encode(wid)
        pieces.append(piece)
    return "".join(pieces)
_encoding = [None] * 0x4000 # Filled later, and converted to a tuple
def _encode(w):
assert 0x4000 <= w < 0x10000000
b, c = divmod(w, 0x80)
a, b = divmod(b, 0x80)
s = chr(b) + chr(c)
if a < 0x80: # no more than 21 data bits
return chr(a + 0x80) + s
a, b = divmod(a, 0x80)
assert a < 0x80, (w, a, b, s) # else more than 28 data bits
return (chr(a + 0x80) + chr(b)) + s
_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
def decode(code):
    """Decode a string into a list of wids.

    The string is cut into encodings by _prog.  Each piece is looked up
    in _decoding, and _decode() is used when the lookup gives nothing.
    """
    lookup = _decoding.get
    wids = []
    for piece in _prog.findall(code):
        wid = lookup(piece)
        # Obscure: _decoding maps '\x80' to 0, which is false, so that
        # piece is handed to _decode() as well (it returns 0 for it).
        if not wid:
            wid = _decode(piece)
        wids.append(wid)
    return wids
_decoding = {} # Filled later
def _decode(s):
if s == '\x80':
# See comment in decode(). This is here to allow a trick to work.
return 0
if len(s) == 3:
a, b, c = map(ord, s)
assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80
return ((a & 0x7F) << 14) | (b << 7) | c
assert len(s) == 4, `s`
a, b, c, d = map(ord, s)
assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80
return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d
def _fill():
    """Populate _encoding and _decoding for every wid below 0x4000.

    Wids below 0x80 get a one-character encoding and the rest get two
    characters.  _encoding is rebound to a tuple once it is filled.
    """
    global _encoding
    for wid in range(0x4000):
        if wid < 0x80:
            enc = chr(wid + 0x80)
        else:
            enc = chr((wid >> 7) + 0x80) + chr(wid & 0x7F)
        _encoding[wid] = enc
        _decoding[enc] = wid
    _encoding = tuple(_encoding)
_fill()
def test():
    """Round-trip every wid below 2**20, printing progress every 1000."""
    for wid in range(2**20):
        if not wid % 1000:
            print(wid)
        wids = [wid]
        code = encode(wids)
        assert decode(code) == wids, (wids, code, decode(code))
if __name__ == "__main__":
test()
This diff is collapsed.
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""ZCatalog Text Index
Plugin text index for ZCatalog.
"""
from PipelineFactory import element_factory
from Products.ZCTextIndex import ZCTextIndex, HTMLSplitter
def initialize(context):
    # Product initialization hook: registers the ZCTextIndex and PLexicon
    # classes, with their constructor forms and helper functions, and
    # then registers the product help.
    index_constructors = (
        ZCTextIndex.manage_addZCTextIndexForm,
        ZCTextIndex.manage_addZCTextIndex,
        getIndexTypes,
    )
    context.registerClass(
        ZCTextIndex.ZCTextIndex,
        permission='Add Pluggable Index',
        constructors=index_constructors,
        icon='www/index.gif',
        visibility=None,
    )

    lexicon_constructors = (
        ZCTextIndex.manage_addLexiconForm,
        ZCTextIndex.manage_addLexicon,
        getElementGroups,
        getElementNames,
    )
    context.registerClass(
        ZCTextIndex.PLexicon,
        permission='Add Vocabularies',
        constructors=lexicon_constructors,
        icon='www/lexicon.gif',
    )

    context.registerHelp()
    context.registerHelpTitle("Zope Help")
## Functions below are for use in the ZMI constructor forms ##
# Returns the factory groups known to element_factory.
# NOTE(review): documented with comments on purpose -- Zope's publisher
# keys on docstrings, so confirm before giving this function one.
def getElementGroups(self):
    groups = element_factory.getFactoryGroups()
    return groups
# Returns the factory names element_factory reports for the given group.
# NOTE(review): documented with comments on purpose -- Zope's publisher
# keys on docstrings, so confirm before giving this function one.
def getElementNames(self, group):
    names = element_factory.getFactoryNames(group)
    return names
# Returns the keys of ZCTextIndex.index_types.
# NOTE(review): documented with comments on purpose -- Zope's publisher
# keys on docstrings, so confirm before giving this function one.
def getIndexTypes(self):
    index_types = ZCTextIndex.index_types
    return index_types.keys()
## Allow relevant exceptions to be caught in untrusted code
from AccessControl import ModuleSecurityInfo
ModuleSecurityInfo('Products').declarePublic('ZCTextIndex')
ModuleSecurityInfo('Products.ZCTextIndex').declarePublic('ParseTree')
ModuleSecurityInfo('Products.ZCTextIndex.ParseTree').declarePublic('QueryError')
ModuleSecurityInfo('Products.ZCTextIndex.ParseTree').declarePublic('ParseError')
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex Lexicon',
help_product='ZCTextIndex',
help_topic='Lexicon_Add.stx'
)">
<p class="form-help">
A ZCTextIndex Lexicon processes and stores the words of documents indexed
with a ZCTextIndex. Multiple ZCTextIndexes can share the same lexicon.
</p>
<form action="manage_addLexicon" method="POST">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Title
</div>
</td>
<td align="left" valign="top">
<input type="text" name="title" size="40" />
</td>
</tr>
<dtml-in name="getElementGroups" prefix="group">
<dtml-let elements="getElementNames(group_item)">
<tr>
<td align="left" valign="top">
<div class="form-label">&dtml-group_item;</div>
</td>
<td align="left" valign="top">
<input type="hidden" name="elements.group:records"
value="&dtml-group_item;" />
<dtml-if expr="_.len(elements) > 1">
<select name="elements.name:records">
<dtml-in name="elements">
<option value="&dtml-sequence-item;"
>&dtml-sequence-item;</option>
</dtml-in>
</select>
<dtml-else>
<input type="checkbox" name="elements.name:records"
value="<dtml-var expr="elements[0]" html_quote>" checked />
</dtml-if>
</td>
</tr>
</dtml-let>
</dtml-in>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex',
help_product='ZCTextIndex',
help_topic='ZCTextIndex_Add.stx'
)">
<p class="form-help">
<strong>Text Indexes</strong> break text up into individual words, and
are often referred to as full-text indexes. Text indexes
sort results by score, meaning they return hits in order
from the most relevant to the least relevant.
</p>
<form action="manage_addZCTextIndex" method="post"
enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Indexed attributes
</div></td>
<td align="left" valign="top">
<input type="text" name="extra.doc_attr:record" size="40" />
<em>attribute1,attribute2,...</em> or leave empty
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Ranking Strategy
</div>
</td>
<td align="left" valign="top">
<select name="extra.index_type:record">
<dtml-in name="getIndexTypes">
<option value="&dtml-sequence-item;">&dtml-sequence-item;</option>
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Lexicon
</div></td>
<td>
<dtml-in expr="superValues('ZCTextIndex Lexicon')">
<dtml-if sequence-start>
<select name="extra.lexicon_id:record">
</dtml-if>
<option value="&dtml-id;">
&dtml-id; <dtml-var name="title" fmt="(%s)" null html_quote>
</option>
<dtml-if sequence-end>
</select>
</dtml-if>
<dtml-else>
<em>You must create a ZCTextIndex Lexicon first.</em>
</dtml-in>
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
The lexicon processes and stores the words found in objects indexed by one
or more ZCTextIndexes.
</p>
<p class="section-bar">
<span class="form-label">Input Pipeline Stages</span>
</p>
<p class="form-help">
Text indexed through this lexicon is processed by the following pipeline
stages
</p>
<ol class="form-help">
<dtml-in name="getPipelineNames">
<li>&dtml-sequence-item;</li>
</dtml-in>
</ol>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
Name(s) of attribute(s) indexed:
<em><dtml-var "', '.join(getIndexSourceNames())"></em>
</p>
<p class="form-help">
Index type:
<em>&dtml-getIndexType;</em>
</p>
<p class="form-help">
ZCTextIndex Lexicon used:
<dtml-if getLexiconURL>
<a href="&dtml-getLexiconURL;/manage_main"
>&dtml-getLexiconURL;</a>
<dtml-else>
<em>(Lexicon Not Found)</em>
</dtml-if>
</p>
<p class="form-help">
<em>Note:</em> The lexicon assigned to the index cannot be changed. To replace
the existing lexicon, create a new lexicon in the same place and clear the
index. This will make the index use the replacement lexicon.
</p>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
Browse the words in the lexicon or enter the word(s) you are interested in
below. Globbing characters (*, ?) are supported
</p>
<dtml-let words_str="' '.join(REQUEST.get('words',[]))">
<form action="&dtml-URL;">
<p class="form-element">
<span class="form-label">Word(s)</span>
<input name="words:tokens" size="20" value="&dtml-words_str;" />
<input type="submit" value="Query" />
<span class="form-label">&nbsp;Output Columns:</span>
<input name="cols:int" size="2" value="&dtml-cols;" />
<span class="form-label">&nbsp;Rows:</span>
<input name="rows:int" size="2" value="&dtml-rows;" />
</p>
</form>
<hr />
<form action="&dtml-URL;">
<table width="100%" cellpadding="2" cellspacing="0" border="0">
<tr class="section-bar">
<td><span class="form-label">
&dtml-word_count; Words Found<dtml-if word_count>,
Displaying &dtml-start_word;-&dtml-end_word;
</dtml-if>
<dtml-if expr="page_count > 0">
</span></td>
<td align="right"><span class="form-label">
Page:
<select name="page:int" onchange="this.form.submit()">
<dtml-in name="page_range" prefix="page">
<option value="&dtml-page_item;"
<dtml-if expr="page == page_item">
selected
</dtml-if>
>
<dtml-var expr="page_item+1">
</option>
</dtml-in>
</select>
of &dtml-page_count;
<input type="submit" value="Go" />
<input type="hidden" name="cols:int" value="&dtml-cols;" />
<input type="hidden" name="rows:int" value="&dtml-rows;" />
<input type="hidden" name="words:tokens" value="&dtml-words_str;" />
</dtml-if>
</span></td>
</tr>
</table>
</form>
</dtml-let>
<dtml-if name="page_columns">
<table width="100%" cellpadding="0" cellspacing="10" border="0">
<tr>
<dtml-in name="page_columns" prefix="column">
<td align="left" valign="top">
<dtml-var expr="'<br />'.join(column_item)">
</td>
</dtml-in>
</tr>
</table>
</dtml-if>
<dtml-var manage_page_footer>
ZCTextIndex Lexicon - Add: Create a new ZCTextIndex Lexicon
Description
This view allows you to create a new ZCTextIndex Lexicon object.
ZCTextIndex Lexicons store the words indexed by ZCTextIndexes in a
ZCatalog.
Controls
'Id' -- Allows you to specify the id of the ZCTextIndex Lexicon.
'Title' -- Allows you to specify the title of the ZCTextIndex Lexicon.
Pipeline Stages
The remaining controls allow you to select the desired processing
of text to index by selecting pipeline stages.
The default available stages are:
- **Word Splitter** This is the only mandatory stage. The word
splitter breaks the text up into a list of words. Included is a
simple whitespace splitter, and a splitter that removes HTML
tags. The HTML aware splitter gives best results when all of
the incoming content to index is HTML.
- **Stop Words** To conserve space in the vocabulary, and possibly
increase performance, you can select a stop word remover which
subtracts very common or single letter words from the Lexicon.
Bear in mind that you will not be able to search on removed stop
words, and they will also be removed from queries passed to
search ZCTextIndexes using the Lexicon.
- **Case Normalizer** The case normalizer removes case information
from the words in the Lexicon. If case-sensitive searching is
desired, then omit this element from the pipeline.
ZCTextIndex Add: Create a new ZCTextIndex
Description
A ZCTextIndex is an index for performing full text searches over
bodies of text. It includes the following features:
- Boolean query operators with parenthetical grouping
- Globbing (partial word) and phrase matching
- Two selectable relevance scoring algorithms
ZCTextIndex is designed as a replacement for standard TextIndex, and
has several advantages over it.
Controls
'Id' -- The id of the ZCTextIndex, must be unique for this ZCatalog.
'Field Name' -- The name of the field (object attribute) to be indexed.
'Ranking Strategy'
- **Okapi BM25 Rank** A relevance scoring technique that seems to
work well when the document text is considerably longer than the
query string, which is often the case with user specified query
strings.
- **Cosine Measure** A relevance scoring technique derived from the
"*Managing Gigabytes*":http://www.cs.mu.oz.au/mg/ book. It seems
to work best when the queries are similar in size and content to
the text they are searching.
'Lexicon' -- The ZCTextIndex Lexicon to be used by this ZCTextIndex.
Lexicons process and store the words from the text and
help in processing queries. You must define a ZCTextIndex
Lexicon before you can create a ZCTextIndex. Several
ZCTextIndexes can share the same Lexicon if desired.
This diff is collapsed.
/*****************************************************************************
Copyright (c) 2002 Zope Foundation and Contributors.
All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
/* okascore.c
*
* The inner scoring loop of OkapiIndex._search_wids() coded in C.
*
* Example from an indexed Python-Dev archive, where "python" shows up in all
* but 2 of the 19,058 messages. With the Python scoring loop,
*
* query: python
* # results: 10 of 19056 in 534.77 ms
* query: python
* # results: 10 of 19056 in 277.52 ms
*
* The first timing is cold, the second timing from an immediate repeat of
* the same query. With the scoring loop here in C:
*
* query: python
* # results: 10 of 19056 in 380.74 ms -- 40% speedup
* query: python
* # results: 10 of 19056 in 118.96 ms -- 133% speedup
*/
#include "Python.h"
#define K1 1.2
#define B 0.75
#ifndef PyTuple_CheckExact
#define PyTuple_CheckExact PyTuple_Check
#endif
/* score(result, d2fitems, d2len, idf, meandoclen)
 *
 * For every (d, f) pair in d2fitems, compute the scaled integer score
 *     (long)(tf * idf * 1024 + 0.5)
 * with tf = f * (K1 + 1) / (f + K1 * ((1 - B) + B * d2len[d] / meandoclen)),
 * and store it as result[d].
 *
 * Returns None on success, or NULL with an exception set when argument
 * parsing, a length/item lookup, an integer conversion or the store fails.
 */
static PyObject *
score(PyObject *self, PyObject *args)
{
    /* Believe it or not, floating these common subexpressions "by hand"
       gets better code out of MSVC 6. */
    const double B_FROM1 = 1.0 - B;
    const double K1_PLUS1 = K1 + 1.0;

    /* Inputs */
    PyObject *result;   /* IIBucket result, maps d to score */
    PyObject *d2fitems; /* ._wordinfo[t].items(), maps d to f(d, t) */
    PyObject *d2len;    /* ._docweight, maps d to # words in d */
    double idf;         /* inverse doc frequency of t */
    double meandoclen;  /* average number of words in a doc */

    int n, i;

    if (!PyArg_ParseTuple(args, "OOOdd:score", &result, &d2fitems, &d2len,
                          &idf, &meandoclen))
        return NULL;

    idf *= 1024.0;  /* float out part of the scaled_int computation */

    n = PyObject_Length(d2fitems);
    if (n < 0)
        return NULL;    /* len() failed; do not return None over an error */

    for (i = 0; i < n; ++i) {
        PyObject *d_and_f;  /* d2f[i], a (d, f) pair */
        PyObject *d;
        long f_long;
        double f;
        PyObject *doclen;   /* ._docweight[d] */
        long doclen_long;
        double lenweight;
        double tf;
        PyObject *scaled_int;
        int status;

        d_and_f = PySequence_GetItem(d2fitems, i);
        if (d_and_f == NULL)
            return NULL;

        if (!(PyTuple_CheckExact(d_and_f) &&
              PyTuple_GET_SIZE(d_and_f) == 2)) {
            PyErr_SetString(PyExc_TypeError,
                            "d2fitems must produce 2-item tuples");
            Py_DECREF(d_and_f);
            return NULL;
        }

        /* d is borrowed from d_and_f, which stays alive until the end of
           this iteration. */
        d = PyTuple_GET_ITEM(d_and_f, 0);
        f_long = PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));
        if (f_long == -1 && PyErr_Occurred()) {
            Py_DECREF(d_and_f);
            return NULL;
        }
        f = (double)f_long;

        doclen = PyObject_GetItem(d2len, d);
        if (doclen == NULL) {
            Py_DECREF(d_and_f);
            return NULL;
        }
        /* Checked conversion: the unchecked macro would read garbage if
           d2len ever produced something other than an int. */
        doclen_long = PyInt_AsLong(doclen);
        if (doclen_long == -1 && PyErr_Occurred()) {
            Py_DECREF(doclen);
            Py_DECREF(d_and_f);
            return NULL;
        }

        lenweight = B_FROM1 + B * doclen_long / meandoclen;
        tf = f * K1_PLUS1 / (f + K1 * lenweight);
        scaled_int = PyInt_FromLong((long)(tf * idf + 0.5));
        if (scaled_int == NULL)
            status = -1;
        else
            status = PyObject_SetItem(result, d, scaled_int);
        Py_DECREF(d_and_f);
        Py_DECREF(doclen);
        Py_XDECREF(scaled_int);
        if (status < 0)
            return NULL;
    }
    Py_INCREF(Py_None);
    return Py_None;
}
/* Docstring attached to the module-level score() function. */
static char score__doc__[] =
"score(result, d2fitems, d2len, idf, meandoclen)\n"
"\n"
"Do the inner scoring loop for an Okapi index.\n";
/* Method table for the okascore module: score() takes positional
   arguments only (METH_VARARGS).  The {NULL} entry ends the table. */
static PyMethodDef okascore_functions[] = {
{"score", score, METH_VARARGS, score__doc__},
{NULL}
};
/* Module initialization entry point: creates the "okascore" module from
   okascore_functions.  The module object returned by Py_InitModule3 is
   not needed here, so it is no longer stored in an unused local. */
void
initokascore(void)
{
    Py_InitModule3("okascore", okascore_functions,
                   "inner scoring loop for Okapi rank");
}
/*****************************************************************************
Copyright (c) 2002 Zope Foundation and Contributors.
All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
/* stopper.c
*
* Fast version of the StopWordRemover object.
*/
#include "Python.h"
/* process(dict, seq) -> list
 *
 * Builds a new list holding, in order, every item of seq that is not
 * found as a key of dict.  Returns NULL with an exception set if the
 * arguments are wrong or the list cannot be built.
 */
static PyObject *
stopper_process(PyObject *unused, PyObject *args)
{
    PyObject *stopwords;
    PyObject *words;
    PyObject *kept;
    int count, pos;

    if (!PyArg_ParseTuple(args, "O!O:process",
                          &PyDict_Type, &stopwords, &words))
        return NULL;

    /* From here on `words` is a new reference owned by this function. */
    words = PySequence_Fast(words,
                            "process() requires a sequence as argument 2");
    if (words == NULL)
        return NULL;

    kept = PyList_New(0);
    if (kept != NULL) {
#if PY_VERSION_HEX >= 0x02020000
        /* Only available in Python 2.2 and newer. */
        count = PySequence_Fast_GET_SIZE(words);
#else
        count = PyObject_Length(words);
#endif
        for (pos = 0; pos < count; ++pos) {
            PyObject *word = PySequence_Fast_GET_ITEM(words, pos);

            /*
             * PyDict_GetItem() returns NULL if there isn't a matching
             * item, but without setting an exception, so this does what
             * we want.
             */
            if (PyDict_GetItem(stopwords, word) != NULL)
                continue;
            if (PyList_Append(kept, word) < 0) {
                Py_DECREF(kept);
                kept = NULL;
                break;
            }
        }
    }

    Py_DECREF(words);
    return kept;
}
/* Docstring attached to the module-level process() function. */
static char process__doc__[] =
    "process(dict, [str, ...]) --> [str, ...]\n"
    "Remove stop words (the keys of dict) from the input list of strings\n"
    " to create a new list.";

/* Method table handed to Py_InitModule3(); terminated by an all-zero entry. */
static PyMethodDef stopper_functions[] = {
    {"process", stopper_process, METH_VARARGS, process__doc__},
    {NULL, NULL, 0, NULL}   /* sentinel */
};
/* Docstring of the "stopper" module itself. */
static char stopper__doc__[] = "Fast StopWordRemover implementation.";

/*
 * Module initialisation hook for the "stopper" extension module: registers
 * the functions listed in stopper_functions.  The returned module object is
 * deliberately discarded.
 */
void
initstopper(void)
{
    (void) Py_InitModule3("stopper", stopper_functions, stopper__doc__);
}
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Test package."""
#! /usr/bin/env python
import cPickle
import os.path
import sys
from hotshot.log import LogReader
def load_line_info(log):
    """Accumulate time and hit counts per location from profiler events.

    `log` is iterated as ``(what, place, tdelta)`` triples.  Each positive
    ``tdelta`` is charged to the location of the previous counted event,
    not to the event's own ``place``; the very first one is therefore
    charged to the key ``None``.

    Returns a dict mapping location -> ``(total_tdelta, hit_count)``.
    """
    totals = {}
    previous = None
    for _what, location, delta in log:
        if not delta > 0:
            continue
        elapsed, hits = totals.get(previous, (0, 0))
        totals[previous] = (delta + elapsed, hits + 1)
        previous = location
    return totals
def basename(path, cache={}):
    """Return the final component of `path`, memoised in `cache`.

    The mutable default argument is intentional: it is the memo table
    shared by every call that does not pass its own `cache`.
    """
    if path not in cache:
        cache[path] = os.path.split(path)[1]
    return cache[path]
def print_results(results):
for info, place in results:
if place is None:
# This is the startup time for the profiler, and only
# occurs at the very beginning. Just ignore it, since it
# corresponds to frame setup of the outermost call, not
# anything that's actually interesting.
continue
filename, line, funcname = place
print '%8d %8d' % info, basename(filename), line
def annotate_results(results):
    """Group `results` by file and print an annotated listing of each file.

    `results` holds ``((time, hits), (file, line, func))`` pairs; entries
    with a false location are ignored.  Files are handled in sorted name
    order, and files that do not exist on disk are skipped.
    """
    by_file = {}
    for stats, place in results:
        if not place:
            continue
        elapsed, hits = stats
        path, lineno, func = place
        by_file.setdefault(path, []).append((lineno, hits, elapsed))

    for path in sorted(by_file):
        if not os.path.exists(path):
            continue
        rows = by_file[path]
        rows.sort()
        annotate(path, rows)
def annotate(file, lines):
print "-" * 60
print file
print "-" * 60
f = open(file)
i = 1
match = lines[0][0]
for line in f:
if match == i:
print "%6d %8d " % lines[0][1:], line,
del lines[0]
if lines:
match = lines[0][0]
else:
match = None
else:
print " " * 16, line,
i += 1
print
def get_cache_name(filename):
    """Return ``(cache_dir, cache_file)`` for the profile log `filename`.

    The cache directory is a ``.hs-tool`` directory next to `filename`;
    the cache file has the same base name as `filename` inside it.
    """
    cache_dir = os.path.join(os.path.dirname(filename), '.hs-tool')
    return cache_dir, os.path.join(cache_dir, os.path.basename(filename))
def cache_results(filename, results):
    """Pickle `results` into the cache file belonging to `filename`.

    The cache directory is created first when it does not exist yet.  The
    data is written with pickle protocol 1.
    """
    directory, target = get_cache_name(filename)
    if not os.path.exists(directory):
        os.mkdir(directory)
    out = open(target, 'wb')
    try:
        cPickle.dump(results, out, 1)
    finally:
        out.close()
def main(filename, annotate):
    """Report the per-line profile stored in the log `filename`.

    When a cache file newer than `filename` exists, the sorted results are
    loaded from it; otherwise the log is parsed, sorted and cached.  If
    `annotate` is true an annotated source listing is printed, else a
    plain table.

    The cache file is now closed even when unpickling fails.
    """
    cache_file = get_cache_name(filename)[1]
    if (os.path.isfile(cache_file)
        and os.path.getmtime(cache_file) > os.path.getmtime(filename)):
        # cached data is up-to-date:
        # NOTE(review): this unpickles whatever is stored in the cache
        # file; only use the tool on cache directories you trust.
        fp = open(cache_file, 'rb')
        try:
            results = cPickle.load(fp)
        finally:
            fp.close()
    else:
        log = LogReader(filename)
        byline = load_line_info(log)
        # Sort by (time, hits), ascending.
        results = [(v, k) for k, v in byline.items()]
        results.sort()
        cache_results(filename, results)

    if annotate:
        annotate_results(results)
    else:
        print_results(results)
if __name__ == "__main__":
    # Command line: [-A] [logfile]
    #   -A       print annotated source listings instead of the table
    #   logfile  profile log to read (default: "profile.dat")
    import getopt

    annotate_p = 0
    filename = "profile.dat"

    opts, args = getopt.getopt(sys.argv[1:], 'A')
    for o, v in opts:
        if o == '-A':
            annotate_p = 1
    if args:
        # Exactly one positional argument is accepted.
        filename, = args

    main(filename, annotate_p)
#! /usr/bin/env python
"""Index a collection of HTML files on the filesystem.
usage: indexhtml.py [options] dir
Will create an index of all files in dir or its subdirectories.
options:
-f data.fs -- the path to the filestorage datafile
"""
# XXX: Products.PluginIndexes.TextIndex no longer exists
from __future__ import nested_scopes
import os
from time import clock
import ZODB
from ZODB.FileStorage import FileStorage
from BTrees.IOBTree import IOBTree
import transaction
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
from Products.ZCTextIndex.Lexicon import Lexicon, StopWordRemover
def make_zc_index():
    """Build and return a ZCTextIndex with id "read".

    There's an elaborate dance necessary to construct an index: the
    constructor takes an `extra` object carrying the settings and a
    `caller` object carrying the lexicon as an attribute.
    """
    class Struct:
        pass

    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())

    extra = Struct()
    extra.doc_attr = "read"
    extra.lexicon_id = "lexicon"

    return ZCTextIndex("read", extra, caller)
# XXX make a splitter more like the HTMLSplitter for TextIndex
# signature is
# Splitter(string, stop_words, encoding,
# singlechar, indexnumbers, casefolding)
class MySplitter:
    """Callable that splits text with an HTMLWordSplitter and maps each
    word through a stop-word dictionary."""

    def __init__(self):
        self._v_splitter = HTMLWordSplitter()

    def __call__(self, text, stopdict, *args, **kwargs):
        """Return the list of words of `text` after stop-word mapping.

        Each word is replaced by ``stopdict.get(word, word)``; words whose
        replacement is false are dropped.  Extra arguments are ignored.
        """
        kept = []
        for word in self._v_splitter._split(text):
            mapped = stopdict.get(word, word)
            if mapped:
                kept.append(mapped)
        return kept
#def make_old_index():
# from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
# from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
# from Products.ZCTextIndex.StopDict import get_stopdict
#
# l = Lexicon(get_stopdict())
# l.SplitterFunc = MySplitter()
# return TextIndex("read", lexicon=l)
def main(db, root, dir):
rt["index"] = index = INDEX()
rt["files"] = paths = IOBTree()
transaction.commit()
zodb_time = 0.0
pack_time = 0.0
files = [os.path.join(dir, file) for file in os.listdir(dir)]
docid = 0
t0 = clock()
for file in files:
if os.path.isdir(file):
files += [os.path.join(file, sub) for sub in os.listdir(file)]
else:
if not file.endswith(".html"):
continue
docid += 1
if LIMIT is not None and docid > LIMIT:
break
if VERBOSE:
print "%5d" % docid, file
f = open(file, "rb")
paths[docid] = file
index.index_object(docid, f)
f.close()
if docid % TXN_INTERVAL == 0:
z0 = clock()
transaction.commit()
z1 = clock()
zodb_time += z1 - z0
if VERBOSE:
print "commit took", z1 - z0, zodb_time
if docid % PACK_INTERVAL == 0:
p0 = clock()
db.pack()
p1 = clock()
zodb_time += p1 - p0
pack_time += p1 - p0
if VERBOSE:
print "pack took", p1 - p0, pack_time
z0 = clock()
transaction.commit()
z1 = t1 = clock()
total_time = t1 - t0
zodb_time += z1 - z0
if VERBOSE:
print "Total index time", total_time
print "Non-pack time", total_time - pack_time
print "Non-ZODB time", total_time - zodb_time
if __name__ == "__main__":
import sys
import getopt
VERBOSE = 0
FSPATH = "Data.fs"
TXN_INTERVAL = 100
PACK_INTERVAL = 500
LIMIT = None
INDEX = make_zc_index
try:
opts, args = getopt.getopt(sys.argv[1:], 'vf:t:p:n:T')
except getopt.error, msg:
print msg
print __doc__
sys.exit(2)
for o, v in opts:
if o == '-v':
VERBOSE += 1
if o == '-f':
FSPATH = v
if o == '-t':
TXN_INTERVAL = int(v)
if o == '-p':
PACK_INTERVAL = int(v)
if o == '-n':
LIMIT = int(v)
# if o == '-T':
# INDEX = make_old_index
if len(args) != 1:
print "Expected on argument"
print __doc__
sys.exit(2)
dir = args[0]
fs = FileStorage(FSPATH)
db = ZODB.DB(fs)
cn = db.open()
rt = cn.root()
dir = os.path.join(os.getcwd(), dir)
print dir
main(db, rt, dir)
cn.close()
fs.close()
"""Test an index with a Unix mailbox file.
usage: python mailtest.py [options] <data.fs>
options:
-v -- verbose
Index Generation
-i mailbox
-n NNN -- max number of messages to read from mailbox
-t NNN -- commit a transaction every NNN messages (default: 1)
-p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
-p 0 -- don't pack at all
-x -- exclude the message text from the data.fs
Queries
-q query
-b NNN -- return the NNN best matches (default: 10)
-c NNN -- context; if -v, show the first NNN lines of results (default: 5)
The script either indexes or queries depending on whether -q or -i is
passed as an option.
For -i mailbox, the script reads mail messages from the mailbox and
indexes them. It indexes one message at a time, then commits the
transaction.
For -q query, it performs a query on an existing index.
If both are specified, the index is performed first.
You can also interact with the index after it is completed. Load the
index from the database:
import ZODB
from ZODB.FileStorage import FileStorage
fs = FileStorage(<data.fs>)
db = ZODB.DB(fs)
index = db.open().root()["index"]
index.search("python AND unicode")
"""
import ZODB
import ZODB.FileStorage
import transaction
from Products.ZCTextIndex.Lexicon import \
Lexicon, CaseNormalizer, Splitter, StopWordRemover
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from BTrees.IOBTree import IOBTree
from Products.ZCTextIndex.QueryParser import QueryParser
import sys
import mailbox
import time
def usage(msg):
print msg
print __doc__
sys.exit(2)
class Message:
    """Indexable text built from one mail message.

    ``text`` is a summary line (the subject, plus the sender in
    parentheses when there is one) followed by whatever ``msg.fp.read()``
    returns.
    """

    # Running total of len(text) over every Message created so far.
    total_bytes = 0

    def __init__(self, msg):
        title = msg.getheader('subject', '')
        sender = msg.getheader('from', '')
        if not sender:
            headline = "%s\n" % title
        else:
            headline = "%s (%s)\n" % (title, sender)
        self.text = headline + msg.fp.read()
        Message.total_bytes += len(self.text)
class Extra:
    """Empty attribute holder; instances get their attributes assigned
    after creation."""
def index(rt, mboxfile, db, profiler):
global NUM
idx_time = 0
pack_time = 0
start_time = time.time()
lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
extra = Extra()
extra.lexicon_id = 'lexicon'
extra.doc_attr = 'text'
extra.index_type = 'Okapi BM25 Rank'
caller = Extra()
caller.lexicon = lexicon
rt["index"] = idx = ZCTextIndex("index", extra, caller)
if not EXCLUDE_TEXT:
rt["documents"] = docs = IOBTree()
else:
docs = None
transaction.commit()
mbox = mailbox.UnixMailbox(open(mboxfile, 'rb'))
if VERBOSE:
print "opened", mboxfile
if not NUM:
NUM = sys.maxint
if profiler:
itime, ptime, i = profiler.runcall(indexmbox, mbox, idx, docs, db)
else:
itime, ptime, i = indexmbox(mbox, idx, docs, db)
idx_time += itime
pack_time += ptime
transaction.commit()
if PACK_INTERVAL and i % PACK_INTERVAL != 0:
if VERBOSE >= 2:
print "packing one last time..."
p0 = time.clock()
db.pack(time.time())
p1 = time.clock()
if VERBOSE:
print "pack took %s sec" % (p1 - p0)
pack_time += p1 - p0
if VERBOSE:
finish_time = time.time()
print
print "Index time", round(idx_time / 60, 3), "minutes"
print "Pack time", round(pack_time / 60, 3), "minutes"
print "Index bytes", Message.total_bytes
rate = (Message.total_bytes / idx_time) / 1024
print "Index rate %.2f KB/sec" % rate
print "Indexing began", time.ctime(start_time)
print "Indexing ended", time.ctime(finish_time)
print "Wall clock minutes", round((finish_time - start_time)/60, 3)
def indexmbox(mbox, idx, docs, db):
idx_time = 0
pack_time = 0
i = 0
while i < NUM:
_msg = mbox.next()
if _msg is None:
break
i += 1
msg = Message(_msg)
if VERBOSE >= 2:
print "indexing msg", i
i0 = time.clock()
idx.index_object(i, msg)
if not EXCLUDE_TEXT:
docs[i] = msg
if i % TXN_SIZE == 0:
transaction.commit()
i1 = time.clock()
idx_time += i1 - i0
if VERBOSE and i % 50 == 0:
print i, "messages indexed"
print "cache size", db.cacheSize()
if PACK_INTERVAL and i % PACK_INTERVAL == 0:
if VERBOSE >= 2:
print "packing..."
p0 = time.clock()
db.pack(time.time())
p1 = time.clock()
if VERBOSE:
print "pack took %s sec" % (p1 - p0)
pack_time += p1 - p0
return idx_time, pack_time, i
def query(rt, query_str, profiler):
idx = rt["index"]
docs = rt["documents"]
start = time.clock()
if profiler is None:
results, num_results = idx.query(query_str, BEST)
else:
if WARM_CACHE:
print "Warming the cache..."
idx.query(query_str, BEST)
start = time.clock()
results, num_results = profiler.runcall(idx.query, query_str, BEST)
elapsed = time.clock() - start
print "query:", query_str
print "# results:", len(results), "of", num_results, \
"in %.2f ms" % (elapsed * 1000)
tree = QueryParser(idx.lexicon).parseQuery(query_str)
qw = idx.index.query_weight(tree.terms())
for docid, score in results:
scaled = 100.0 * score / qw
print "docid %7d score %6d scaled %5.2f%%" % (docid, score, scaled)
if VERBOSE:
msg = docs[docid]
ctx = msg.text.split("\n", CONTEXT)
del ctx[-1]
print "-" * 60
print "message:"
for l in ctx:
print l
print "-" * 60
def main(fs_path, mbox_path, query_str, profiler):
    """Open the database at `fs_path`, then index and/or query it.

    Indexing runs when `mbox_path` is not None and querying when
    `query_str` is not None; if both are given, indexing happens first.
    The connection, database and storage are closed afterwards, in that
    order.
    """
    storage = ZODB.FileStorage.FileStorage(fs_path)
    database = ZODB.DB(storage, cache_size=CACHE_SIZE)
    connection = database.open()
    root = connection.root()

    if mbox_path is not None:
        index(root, mbox_path, database, profiler)
    if query_str is not None:
        query(root, query_str, profiler)

    for resource in (connection, database, storage):
        resource.close()
if __name__ == "__main__":
import getopt
NUM = 0
VERBOSE = 0
PACK_INTERVAL = 500
EXCLUDE_TEXT = 0
CACHE_SIZE = 10000
TXN_SIZE = 1
BEST = 10
CONTEXT = 5
WARM_CACHE = 0
query_str = None
mbox_path = None
profile = None
old_profile = None
try:
opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:c:xt:w',
['profile=', 'old-profile='])
except getopt.error, msg:
usage(msg)
if len(args) != 1:
usage("exactly 1 filename argument required")
for o, v in opts:
if o == '-n':
NUM = int(v)
elif o == '-v':
VERBOSE += 1
elif o == '-p':
PACK_INTERVAL = int(v)
elif o == '-q':
query_str = v
elif o == '-i':
mbox_path = v
elif o == '-b':
BEST = int(v)
elif o == '-x':
EXCLUDE_TEXT = 1
elif o == '-t':
TXN_SIZE = int(v)
elif o == '-c':
CONTEXT = int(v)
elif o == '-w':
WARM_CACHE = 1
elif o == '--profile':
profile = v
elif o == '--old-profile':
old_profile = v
fs_path, = args
if profile:
import hotshot
profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
elif old_profile:
import profile
profiler = profile.Profile()
else:
profiler = None
main(fs_path, mbox_path, query_str, profiler)
if profile:
profiler.close()
elif old_profile:
import pstats
profiler.dump_stats(old_profile)
stats = pstats.Stats(old_profile)
stats.strip_dirs().sort_stats('time').print_stats(20)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
##############################################################################
#
# Copyright (c) 2009 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Test Products.ZCTextIndex.HTMLSplitter
"""
import unittest
class HTMLWordSplitterTests(unittest.TestCase):
    """Tests for Products.ZCTextIndex.HTMLSplitter.HTMLWordSplitter."""

    def _getTargetClass(self):
        # Imported lazily so that an import failure shows up as a test
        # error rather than breaking collection of the module.
        from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
        return HTMLWordSplitter

    def _makeOne(self):
        klass = self._getTargetClass()
        return klass()

    # -- interface conformance -------------------------------------------

    def test_class_conforms_to_ISplitter(self):
        from zope.interface.verify import verifyClass
        from Products.ZCTextIndex.interfaces import ISplitter
        verifyClass(ISplitter, self._getTargetClass())

    def test_instance_conforms_to_ISplitter(self):
        from zope.interface.verify import verifyObject
        from Products.ZCTextIndex.interfaces import ISplitter
        verifyObject(ISplitter, self._makeOne())

    # -- process(): markup and glob characters are both removed ----------

    def test_process_empty_string(self):
        found = self._makeOne().process([''])
        self.assertEqual(found, [])

    def test_process_no_markup(self):
        found = self._makeOne().process(['abc def'])
        self.assertEqual(found, ['abc', 'def'])

    def test_process_w_markup(self):
        found = self._makeOne().process(['<h1>abc</h1> &nbsp; <p>def</p>'])
        self.assertEqual(found, ['abc', 'def'])

    def test_process_no_markup_w_glob(self):
        found = self._makeOne().process(['abc?def hij*klm nop* qrs?'])
        self.assertEqual(found, ['abc', 'def', 'hij', 'klm', 'nop', 'qrs'])

    # -- processGlob(): markup is removed, glob characters are kept ------

    def test_processGlob_empty_string(self):
        found = self._makeOne().processGlob([''])
        self.assertEqual(found, [])

    def test_processGlob_no_markup_no_glob(self):
        found = self._makeOne().processGlob(['abc def'])
        self.assertEqual(found, ['abc', 'def'])

    def test_processGlob_w_markup_no_glob(self):
        found = self._makeOne().processGlob(['<h1>abc</h1> &nbsp; '
                                             '<p>def</p>'])
        self.assertEqual(found, ['abc', 'def'])

    def test_processGlob_no_markup_w_glob(self):
        found = self._makeOne().processGlob(['abc?def hij*klm nop* qrs?'])
        self.assertEqual(found, ['abc?def', 'hij*klm', 'nop*', 'qrs?'])
def test_suite():
    """Return a suite holding every test of HTMLWordSplitterTests."""
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(HTMLWordSplitterTests))
    return suite
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -13,6 +13,7 @@ initgroups = 2.13.0 ...@@ -13,6 +13,7 @@ initgroups = 2.13.0
Missing = 2.13.1 Missing = 2.13.1
MultiMapping = 2.13.0 MultiMapping = 2.13.0
Persistence = 2.13.2 Persistence = 2.13.2
Products.ZCTextIndex = 2.13.0
Record = 2.13.0 Record = 2.13.0
RestrictedPython = 3.6.0a1 RestrictedPython = 3.6.0a1
tempstorage = 2.11.3 tempstorage = 2.11.3
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment