Commit 61e89f2f authored by Guido van Rossum

Merged TextIndexDS9-branch into trunk.

parent a340cb9d
from Products.ZCTextIndex.ISplitter import ISplitter
import re
class HTMLSplitter:
    """Splitter that drops HTML tags and splits the rest on whitespace."""

    __implements__ = ISplitter

    def process(self, text):
        """Return the whitespace-separated terms of text, tags removed.

        text may be a single string or a sequence of strings.  Every
        substring matching '<[^>]*>' is replaced by a space before the
        text is split.
        """
        # Only string-like objects can be concatenated with ""; anything
        # else is treated as a sequence of strings.
        try:
            text + ""
        except TypeError:
            chunks = text
        else:
            chunks = [text]
        terms = []
        for chunk in chunks:
            terms.extend(re.sub('<[^>]*>', ' ', chunk).split())
        return terms
class HTMLWordSplitter:
    """Splitter that turns HTML text into a list of lower-cased words."""

    __implements__ = ISplitter

    # Patterns whose matches are replaced by a single space, applied in
    # this order: tags, named character entities, runs of non-word
    # characters.  Compiled once instead of on every split() call.
    _remove = (
        re.compile(r"<[^>]*>"),
        re.compile(r"&[A-Za-z]+;"),
        re.compile(r"\W+"),
    )
    # A word is kept only if it contains at least one ASCII letter.
    _has_letter = re.compile("[A-Za-z]")

    def process(self, text):
        """Split every string in the sequence text; return all the words."""
        splat = []
        for t in text:
            splat += self.split(t)
        return splat

    def split(self, text):
        """Return the words of one string.

        The text is lower-cased, the _remove patterns are blanked out,
        and the remainder is split on whitespace.  Words shorter than two
        characters or without any letter are dropped.
        """
        text = text.lower()
        for rx in self._remove:
            text = rx.sub(" ", text)
        has_letter = self._has_letter.search
        return [word for word in text.split()
                if len(word) > 1 and has_letter(word)]
if __name__ == "__main__":
    # Command-line driver: for every path given on the command line,
    # print the path followed by the word list HTMLWordSplitter produces
    # for the file's contents.
    import sys
    splitter = HTMLWordSplitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        try:
            buf = f.read()
        finally:
            # Release the handle even when the read fails.
            f.close()
        print(path)
        print(splitter.process([buf]))
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index Interface."""
import Interface
class IIndex(Interface.Base):
    """Interface for an Index."""

    def search(term):
        """Execute a search on a single term given as a string.

        Return an IIBucket.
        """

    def search_phrase(phrase):
        """Execute a search on a phrase given as a string.

        Return an IIBucket.
        """

    def search_glob(pattern):
        """Execute a pattern search.

        The pattern represents a set of words by using * and ?.  For
        example, "foo*" represents the set of all words in the lexicon
        starting with "foo".

        NOTE: Currently only a single trailing * is supported.

        Return an IIBucket.
        """

    def query_weight(terms):
        """Return the weight for a set of query terms.

        'terms' is a sequence of all terms included in the query,
        excluding terms that appear under a NOT.  If a term appears more
        than once in a query, it should appear more than once in terms.
        """

    def index_doc(docid, text):
        """Add the document identified by docid, with content text.

        NOTE(review): the original docstring was only a placeholder
        ("XXX"); the return value and the behaviour for a docid that is
        already indexed are not specified here.
        """

    def unindex_doc(docid):
        """Remove the document identified by docid from the index.

        NOTE(review): the original docstring was only a placeholder
        ("XXX"); the behaviour for an unknown docid is not specified
        here.
        """
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Base as Interface
class ILexicon(Interface):
    """Object responsible for converting text to word identifiers."""

    def termToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parses the text as if it consists of search terms, and skips
        words that aren't in the lexicon.
        """

    def sourceToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parses the text as if it comes from a source document, and
        creates new word ids for words that aren't (yet) in the lexicon.
        """

    def globToWordIds(pattern):
        """Return a sequence of ids of words matching the pattern.

        The argument should be a single word using globbing syntax,
        e.g. 'foo*' meaning anything starting with 'foo'.

        NOTE: Currently only a single trailing * is supported.

        Returns the wids for all words in the lexicon that match the
        pattern.
        """

    def length():
        """Return the number of unique terms in the lexicon."""
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""NBest Interface.
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
import Interface
class INBest(Interface.Base):
    """Interface for an N-Best chooser."""

    def add(item, score):
        """Record that item 'item' has score 'score'.  No return value.

        The N best-scoring items are remembered, where N was passed to
        the constructor.  'item' can be anything.  'score' should be
        a number, and larger numbers are considered better.
        """

    def addmany(sequence):
        """Like "for item, score in sequence: self.add(item, score)".

        This is simply faster than calling add() len(sequence) times.
        """

    def getbest():
        """Return the (at most) N best-scoring items as a sequence.

        The return value is a sequence of 2-tuples, (item, score), with
        the largest score first.  If .add() has been called fewer than
        N times, this sequence will contain fewer than N pairs.
        """

    def pop_smallest():
        """Return and remove the (item, score) pair with lowest score.

        If len(self) is 0, raise IndexError.

        To be clearer, this is the lowest score among the N best-scoring
        seen so far.  This is most useful if the capacity of the NBest
        object is never exceeded, in which case pop_smallest() allows
        using the object as an ordinary smallest-in-first-out priority
        queue.
        """

    def __len__():
        """Return the number of (item, score) pairs currently known.

        This is N (the value passed to the constructor), unless .add()
        has been called fewer than N times.
        """

    def capacity():
        """Return the maximum number of (item, score) pairs.

        This is N (the value passed to the constructor).
        """
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Base as Interface
class IPipelineElement(Interface):
    """Interface for one text processing step of a pipeline."""

    def process(source):
        """Provide a text processing step.

        Process a source sequence of words into a result sequence.
        """
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser Interface."""
import Interface
class IQueryParser(Interface.Base):
    """Interface for Query Parsers."""

    def parseQuery(query):
        """Parse a query string.

        'query' is the query text as a single string.

        Return a parse tree (which implements IQueryParseTree).

        May raise ParseTree.ParseError.
        """
class IQueryParseTree(Interface.Base):
    """Interface for parse trees returned by parseQuery()."""

    def nodeType():
        """Return the node type.

        This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
        """

    def getValue():
        """Return a node-type specific value.

        For node type:    Return:
        'AND'             a list of parse trees
        'OR'              a list of parse trees
        'NOT'             a parse tree
        'ATOM'            a string (representing a single search term)
        'PHRASE'          a string (representing a search phrase)
        'GLOB'            a string (representing a pattern, e.g. "foo*")
        """

    def terms():
        """Return a list of all terms in this node, excluding NOT subtrees."""

    def executeQuery(index):
        """Execute the query represented by this node against the index.

        The index argument must implement the IIndex interface.

        Return an IIBucket or IIBTree mapping document ids to scores
        (higher scores mean better results).

        May raise ParseTree.QueryError.
        """
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Base as Interface
class ISplitter(Interface):
    """Interface for a splitter, which breaks input text into terms."""

    def process(text):
        """Run the splitter over the input text, returning a list of terms."""
This diff is collapsed.
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
import re
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.StopDict import get_stopdict
class Lexicon:
    # Two-way mapping between words and integer word ids (wids).  Text is
    # turned into words by running it through the pipeline elements given
    # to the constructor, each of which must provide process(list).
    #
    # NOTE(review): explanations are kept in comments rather than new
    # docstrings so that no attribute's __doc__ changes.

    __implements__ = ILexicon

    def __init__(self, *pipeline):
        self.__wids = OIBTree()    # word -> wid
        self.__words = IOBTree()   # wid -> word
        # XXX we're reserving wid 0, but that might be yagni
        self.__nextwid = 1
        self.__pipeline = pipeline

    def length(self):
        """Return the number of unique terms in the lexicon."""
        return self.__nextwid - 1

    def words(self):
        # All known words (the keys of the word -> wid tree).
        return self.__wids.keys()

    def wids(self):
        # All assigned wids (the keys of the wid -> word tree).
        return self.__words.keys()

    def items(self):
        # (word, wid) pairs.
        return self.__wids.items()

    def sourceToWordIds(self, text):
        # Words not seen before get a fresh wid.
        return [self._getWordIdCreate(word)
                for word in self.__apply_pipeline(text)]

    def termToWordIds(self, text):
        # Words that are not in the lexicon are skipped.
        wids = []
        for word in self.__apply_pipeline(text):
            wid = self.__wids.get(word)
            if wid is not None:
                wids.append(wid)
        return wids

    def globToWordIds(self, pattern):
        # Only "<word characters>*" is supported; anything else matches
        # nothing.  \Z (unlike $) also rejects a trailing newline, so a
        # match guarantees the pattern ends with the single "*".
        if not re.match(r"\w+\*\Z", pattern):
            return []
        prefix = pattern.lower()[:-1]
        wids = []
        # keys(prefix) yields the keys starting at prefix, in order, so
        # the scan can stop at the first key without that prefix.
        for key in self.__wids.keys(prefix):
            if not key.startswith(prefix):
                break
            wids.append(self.__wids[key])
        return wids

    def __apply_pipeline(self, text):
        # Shared by sourceToWordIds and termToWordIds: normalize the
        # input to a list and feed it through every pipeline element.
        last = _text2list(text)
        for element in self.__pipeline:
            last = element.process(last)
        return last

    def _getWordIdCreate(self, word):
        # Return the wid for word, registering the word in both trees
        # if it is new.
        wid = self.__wids.get(word)
        if wid is None:
            wid = self.__new_wid()
            self.__wids[word] = wid
            self.__words[wid] = word
        return wid

    def __new_wid(self):
        # Hand out the next unused wid.
        wid = self.__nextwid
        self.__nextwid += 1
        return wid
def _text2list(text):
    """Return text as a list of strings.

    Helper: splitter input may be a string or a list of strings.  A
    string is wrapped in a one-element list; anything that cannot be
    concatenated with "" is assumed to be a sequence already and is
    returned unchanged.
    """
    try:
        text + ""
    except TypeError:
        # Not string-like.  Only TypeError is caught so that unrelated
        # errors (e.g. KeyboardInterrupt) are no longer swallowed.
        return text
    return [text]
# Sample pipeline elements
class Splitter:
    """Sample pipeline element: cut strings into runs of word characters."""

    import re
    rx = re.compile(r"\w+")

    def process(self, lst):
        """Return all matches of rx found in the strings of lst, in order."""
        find_words = self.rx.findall
        words = []
        for chunk in lst:
            words.extend(find_words(chunk))
        return words
class CaseNormalizer:
    """Sample pipeline element: lower-case every word."""

    def process(self, lst):
        """Return a new list holding the lower-cased form of each word."""
        lowered = []
        for word in lst:
            lowered.append(word.lower())
        return lowered
class StopWordRemover:
    """Sample pipeline element: drop stop words and one-character words."""

    # Private copy of the default stop word table, extended below so that
    # every single-character string is treated as a stop word as well.
    dict = get_stopdict().copy()
    # range(256), not range(255): chr(255) is a single character too and
    # was previously left out.
    for c in range(256):
        dict[chr(c)] = None

    def process(self, lst):
        """Return the words of lst that are not keys of self.dict."""
        has_key = self.dict.has_key
        return [w for w in lst if not has_key(w)]
# If the optional 'stopper' module can be imported, the name
# StopWordRemover is rebound from the pure-Python class above to a factory
# function that returns objects created by that module.
try:
    from Products.ZCTextIndex import stopper as _stopper
except ImportError:
    # No 'stopper' module: keep the pure-Python class.
    pass
else:
    # Capture the class-level stop word table *before* the name is rebound
    # by the def below.
    _stopwords = StopWordRemover.dict
    def StopWordRemover():
        # NOTE(review): presumably _stopper.new() returns a pipeline
        # element whose 'dict' attribute is its stop word table -- confirm
        # against stopper.c.
        swr = _stopper.new()
        swr.dict.update(_stopwords)
        return swr
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""NBest
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
from bisect import bisect
from Products.ZCTextIndex.INBest import INBest
class NBest:
    """Remember the N best-scoring (item, score) pairs seen so far.

    The pairs are kept in two parallel lists, both ordered by ascending
    score, so index 0 always holds the worst of the remembered pairs.
    """

    __implements__ = INBest

    def __init__(self, N):
        "Build an NBest object to remember the N best-scoring objects."
        if N < 1:
            raise ValueError("NBest() argument must be at least 1")
        # This does a very simple thing with sorted lists.  For large
        # N, a min-heap can be unboundedly better in terms of data
        # movement time.
        self.items = []
        self.scores = []
        self._capacity = N

    def __len__(self):
        """Return how many pairs are currently remembered."""
        return len(self.scores)

    def capacity(self):
        """Return N, the maximum number of pairs that are remembered."""
        return self._capacity

    def add(self, item, score):
        """Record a single (item, score) pair."""
        self.addmany([(item, score)])

    def addmany(self, sequence):
        """Record every (item, score) pair of sequence."""
        scores = self.scores
        items = self.items
        limit = self._capacity
        count = len(scores)
        for item, score in sequence:
            # When we're in steady-state, the usual case is that we're
            # filled to capacity, and that an incoming item is worse than
            # any of the best-seen so far.
            if count >= limit and score <= scores[0]:
                continue
            pos = bisect(scores, score)
            scores.insert(pos, score)
            items.insert(pos, item)
            if count == limit:
                # Was already full: evict the (new) worst pair.
                del items[0], scores[0]
            else:
                count += 1
        assert count == len(scores)

    def getbest(self):
        """Return the remembered (item, score) pairs, best score first."""
        pairs = list(zip(self.items, self.scores))
        pairs.reverse()
        return pairs

    def pop_smallest(self):
        """Remove and return the (item, score) pair with the lowest score.

        Raises IndexError when nothing is remembered.
        """
        if not self.scores:
            raise IndexError("pop_smallest() called on empty NBest object")
        return self.items.pop(0), self.scores.pop(0)
This diff is collapsed.
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Generic parser support: exception and parse tree nodes."""
from BTrees.IIBTree import difference, weightedIntersection, weightedUnion
from Products.ZCTextIndex.NBest import NBest
class QueryError(Exception):
    """Raised when a parse tree cannot be executed as a query."""
class ParseError(Exception):
    """Raised when a query string cannot be parsed."""
class ParseTreeNode:
    """Base class of parse tree nodes: a node type plus one value."""

    # Overridden by each concrete node class.
    _nodeType = None

    def __init__(self, value):
        self._value = value

    def nodeType(self):
        """Return the class-level node type string (None on the base)."""
        return self._nodeType

    def getValue(self):
        """Return the value given to the constructor."""
        return self._value

    def __repr__(self):
        class_name = self.__class__.__name__
        return "%s(%r)" % (class_name, self.getValue())

    def terms(self):
        """Return the concatenated terms() of every node in the value.

        This default treats the value as a sequence of sub-nodes.
        """
        collected = []
        for child in self.getValue():
            collected += child.terms()
        return collected

    def executeQuery(self, index):
        """Not implemented on the base class."""
        raise NotImplementedError
class NotNode(ParseTreeNode):
    """Negation node; its value is the single parse tree being negated."""

    _nodeType = "NOT"

    def terms(self):
        """Return no terms: negated terms are not reported."""
        return []

    def executeQuery(self, index):
        """Always raise QueryError.

        A NOT node is never executed itself; AndNode.executeQuery
        handles its NOT children directly.
        """
        raise QueryError("NOT operator must occur right after AND")
class AndNode(ParseTreeNode):
    """Conjunction node; its value is a list of parse trees."""

    _nodeType = "AND"

    def executeQuery(self, index):
        """Intersect the positive sub-results, then subtract the NOT ones.

        Each NOT child contributes the result of the tree it wraps.  At
        least one child must not be a NOT node.
        """
        positives = []
        negatives = []
        for child in self.getValue():
            if child.nodeType() == "NOT":
                negatives.append(child.getValue().executeQuery(index))
            else:
                positives.append(child.executeQuery(index))
        assert positives
        # Combine smallest results first.
        by_size = lambda x, y: cmp(len(x), len(y))
        positives.sort(by_size)
        result = positives[0]
        for other in positives[1:]:
            dummy, result = weightedIntersection(result, other)
        if negatives:
            negatives.sort(by_size)
            excluded = negatives[0]
            for other in negatives[1:]:
                dummy, excluded = weightedUnion(excluded, other)
            result = difference(result, excluded)
        return result
class OrNode(ParseTreeNode):
    """Disjunction node; its value is a list of parse trees."""

    _nodeType = "OR"

    def executeQuery(self, index):
        """Return the weighted union of all sub-results."""
        # Balance unions as closely as possible, smallest to largest.
        # The NBest object serves as a priority queue keyed on size; its
        # capacity is never exceeded because each round removes two
        # entries and adds one.
        children = self.getValue()
        queue = NBest(len(children))
        for child in children:
            hits = child.executeQuery(index)
            queue.add(hits, len(hits))
        while len(queue) > 1:
            # Merge the two smallest so far, and add back to the queue.
            first, dummy = queue.pop_smallest()
            second, dummy = queue.pop_smallest()
            dummy, merged = weightedUnion(first, second)
            queue.add(merged, len(merged))
        return queue.pop_smallest()[0]
class AtomNode(ParseTreeNode):
    """Leaf node; its value is a single search term (a string)."""

    _nodeType = "ATOM"

    def terms(self):
        """Return a one-element list holding this node's value."""
        term = self.getValue()
        return [term]

    def executeQuery(self, index):
        """Return index.search() of this node's value."""
        term = self.getValue()
        return index.search(term)
class PhraseNode(AtomNode):
    """Leaf node; its value is a search phrase (a string)."""

    _nodeType = "PHRASE"

    def executeQuery(self, index):
        """Return index.search_phrase() of this node's value."""
        phrase = self.getValue()
        return index.search_phrase(phrase)
class GlobNode(AtomNode):
    """Leaf node; its value is a glob pattern such as "foo*"."""

    _nodeType = "GLOB"

    def executeQuery(self, index):
        """Return index.search_glob() of this node's value."""
        pattern = self.getValue()
        return index.search_glob(pattern)
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser.
This particular parser recognizes the following syntax:
Start = OrExpr
OrExpr = AndExpr ('OR' AndExpr)*
AndExpr = Term ('AND' NotExpr)*
NotExpr = ['NOT'] Term
Term = '(' OrExpr ')' | ATOM+
The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal to one of the key words 'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the string
can contain whitespace, parentheses and key words.
In addition, an ATOM may optionally be preceded by a hyphen, meaning
that it must not be present.
An unquoted ATOM may also end in a star. This is a primitive
"globbing" function, meaning to search for any word with a given
prefix.
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
is interpreted as a NOT operator.
Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- a trailing * means globbing (i.e. prefix search), e.g. ``foo*''
"""
import re
import ParseTree # relative import
# Create unique symbols for token types.
# The strings are interned so that the parser can compare token types by
# identity ("is") instead of by value.
_AND = intern("AND")
_OR = intern("OR")
_NOT = intern("NOT")
_LPAREN = intern("(")
_RPAREN = intern(")")
_ATOM = intern("ATOM")
_EOF = intern("EOF")
# Map keyword string to token type.
# Tokens are looked up here after upper-casing; any token that is not a
# key of this table is classified as an ATOM.
_keywords = {
    _AND: _AND,
    _OR: _OR,
    _NOT: _NOT,
    _LPAREN: _LPAREN,
    _RPAREN: _RPAREN,
}
# Regular expression to tokenize.
# Each match is one token: either a single parenthesis, or an atom that
# may start with a hyphen and is either a double-quoted string or a run of
# characters free of whitespace, parentheses and double quotes.  Input
# that matches neither alternative is skipped by findall().
_tokenizer_regex = re.compile(r"""
# a paren
[()]
# or an optional hyphen
| -?
# followed by
(?:
# a string
" [^"]* "
# or a non-empty stretch w/o whitespace, parens or double quotes
| [^()\s"]+
)
""", re.VERBOSE)
class QueryParser:
    # Recursive-descent parser for the grammar in the module docstring.
    # The token list, the parallel list of token types and the current
    # position are stored on the instance for the duration of a
    # parseQuery() call.
    #
    # NOTE(review): explanations are kept in comments rather than new
    # docstrings so that no attribute's __doc__ changes.

    def __init__(self):
        pass # This parser has no persistent state

    def parseQuery(self, query):
        # Lexical analysis: split into tokens and classify each one as a
        # keyword/paren token type or as an ATOM.
        tokens = _tokenizer_regex.findall(query)
        tokentypes = []
        for token in tokens:
            tokentypes.append(_keywords.get(token.upper(), _ATOM))
        # Terminate both lists with the _EOF marker.
        tokens.append(_EOF)
        tokentypes.append(_EOF)
        self.__tokens = tokens
        self.__tokentypes = tokentypes
        self.__index = 0
        # Syntactical analysis: the whole input must be one OrExpr.
        tree = self._parseOrExpr()
        self._require(_EOF)
        return tree

    # Recursive descent parser

    def _require(self, tokentype):
        # Consume the current token, which must have the given type;
        # otherwise raise ParseError.
        if self._check(tokentype):
            return
        found = self.__tokens[self.__index]
        msg = "Token %r required, %r found" % (tokentype, found)
        raise ParseTree.ParseError(msg)

    def _check(self, tokentype):
        # Consume the current token and return 1 if it has the given
        # type; otherwise leave the position alone and return 0.
        if self.__tokentypes[self.__index] is not tokentype:
            return 0
        self.__index += 1
        return 1

    def _peek(self, tokentype):
        # Like _check(), but never consumes the token.
        return self.__tokentypes[self.__index] is tokentype

    def _get(self, tokentype):
        # Consume a token of the given type and return its text.
        token = self.__tokens[self.__index]
        self._require(tokentype)
        return token

    def _parseOrExpr(self):
        # OrExpr = AndExpr ('OR' AndExpr)*
        operands = [self._parseAndExpr()]
        while self._check(_OR):
            operands.append(self._parseAndExpr())
        if len(operands) > 1:
            return ParseTree.OrNode(operands)
        return operands[0]

    def _parseAndExpr(self):
        # AndExpr = Term ('AND' NotExpr)*
        operands = [self._parseTerm()]
        while self._check(_AND):
            operands.append(self._parseNotExpr())
        if len(operands) > 1:
            return ParseTree.AndNode(operands)
        return operands[0]

    def _parseNotExpr(self):
        # NotExpr = ['NOT'] Term
        if not self._check(_NOT):
            return self._parseTerm()
        return ParseTree.NotNode(self._parseTerm())

    def _parseTerm(self):
        # Term = '(' OrExpr ')' | ATOM+
        if self._check(_LPAREN):
            tree = self._parseOrExpr()
            self._require(_RPAREN)
            return tree
        atoms = [self._get(_ATOM)]
        while self._peek(_ATOM):
            atoms.append(self._get(_ATOM))
        positives = []
        negatives = []
        for atom in atoms:
            words = re.findall(r"\w+\*?", atom)
            if not words:
                # Nothing searchable in this atom (e.g. only punctuation).
                continue
            if len(words) > 1:
                node = ParseTree.PhraseNode(" ".join(words))
            elif words[0].endswith("*"):
                node = ParseTree.GlobNode(words[0])
            else:
                node = ParseTree.AtomNode(words[0])
            if atom[0] == "-":
                # A leading hyphen negates the atom.
                negatives.append(ParseTree.NotNode(node))
            else:
                positives.append(node)
        if not positives:
            text = " ".join(atoms)
            msg = "At least one positive term required: %r" % text
            raise ParseTree.ParseError(msg)
        # Positive nodes first, then the negated ones.
        nodes = positives + negatives
        if len(nodes) == 1:
            return nodes[0]
        return ParseTree.AndNode(nodes)
"""Rice coding (a varaitn of Golomb coding)
Based on a Java implementation by Glen McCluskey described in a Usenix
;login: article at
http://www.usenix.org/publications/login/2000-4/features/java.html
McCluskey's article explains the approach as follows. The encoding
for a value x is represented as a unary part and a binary part. The
unary part is a sequence of 1 bits followed by a 0 bit. The binary
part encodes some of the lower bits of x-1.
The encoding is parameterized by a value m that describes how many
bits to store in the binary part. If most of the values are smaller
than 2**m then they can be stored in only m+1 bits.
Compute the length of the unary part, q, where
q = math.floor((x-1)/ 2 ** m)
Emit q 1 bits followed by a 0 bit.
Emit the lower m bits of x-1, treating x-1 as a binary value.
"""
import array
class BitArray:
    """Growable array of bits stored in an array of unsigned bytes.

    Bit i lives in byte i // 8 at bit position i % 8, counted from the
    least significant bit.
    """

    def __init__(self, buf=None):
        # NOTE(review): buf is accepted but ignored, as in the original
        # code; the array always starts out empty.
        self.bytes = array.array('B')
        self.nbits = 0      # number of bits appended so far
        self.bitsleft = 0   # unused bits remaining in the last byte

    def tostring(self):
        """Return the underlying bytes as a binary string.

        This is a real method rather than a bound method captured in
        __init__, so it keeps working after __setstate__ has replaced
        self.bytes (unpickling does not call __init__).
        """
        data = self.bytes
        if hasattr(data, "tobytes"):
            # array.tostring() is gone from newer Pythons.
            return data.tobytes()
        return data.tostring()

    def __getitem__(self, i):
        """Return bit i as 1 or 0."""
        byte, offset = divmod(i, 8)
        mask = 1 << offset
        if self.bytes[byte] & mask:
            return 1
        else:
            return 0

    def __setitem__(self, i, val):
        """Set bit i if val is true, clear it otherwise."""
        byte, offset = divmod(i, 8)
        mask = 1 << offset
        if val:
            self.bytes[byte] |= mask
        else:
            self.bytes[byte] &= ~mask

    def __len__(self):
        """Return the number of bits appended so far."""
        return self.nbits

    def append(self, bit):
        """Append a 1 if bit is true or a 0 if it is false."""
        if self.bitsleft == 0:
            # Last byte is full (or there is none yet): add a fresh one.
            self.bytes.append(0)
            self.bitsleft = 8
        self.__setitem__(self.nbits, bit)
        self.nbits += 1
        self.bitsleft -= 1

    def __getstate__(self):
        return self.nbits, self.bitsleft, self.tostring()

    def __setstate__(self, state):
        nbits, bitsleft, s = state
        self.bytes = array.array('B', s)
        self.nbits = nbits
        self.bitsleft = bitsleft
class RiceCode:
    """Rice-coded list of integers >= 1.

    Each value x is stored as (x-1) >> m one-bits, a zero bit, and then
    the low m bits of x-1, most significant bit first.
    """

    def __init__(self, m):
        """Construct a RiceCode for m-bit values."""
        if not (0 <= m <= 16):
            raise ValueError("m must be between 0 and 16")
        self.init(m)
        self.bits = BitArray()
        self.len = 0

    def init(self, m):
        """Set the parameter m and the masks derived from it."""
        self.m = m
        self.lower = (1 << m) - 1
        # Highest bit of the binary part.  Written as a right shift so
        # that m == 0 yields 0 (no binary part) instead of the negative
        # shift count that 1 << (m - 1) would produce.
        self.mask = (1 << m) >> 1

    def append(self, val):
        """Append an item to the list."""
        if val < 1:
            raise ValueError("value >= 1 expected, got %s" % repr(val))
        val -= 1
        # emit the unary part of the code
        q = val >> self.m
        for i in range(q):
            self.bits.append(1)
        self.bits.append(0)
        # emit the binary part
        r = val & self.lower
        mask = self.mask
        while mask:
            self.bits.append(r & mask)
            mask >>= 1
        self.len += 1

    def __len__(self):
        """Return the number of items appended."""
        return self.len

    def tolist(self):
        """Return the items as a list."""
        l = []
        i = 0 # bit offset
        binary_range = range(self.m)
        for j in range(self.len):
            unary = 0
            while self.bits[i] == 1:
                unary += 1
                i += 1
            assert self.bits[i] == 0
            i += 1
            binary = 0
            for k in binary_range:
                binary = (binary << 1) | self.bits[i]
                i += 1
            l.append((unary << self.m) + (binary + 1))
        return l

    def tostring(self):
        """Return a binary string containing the encoded data.

        The binary string may contain some extra zeros at the end.
        """
        return self.bits.tostring()

    def __getstate__(self):
        # The item count is part of the state; without it a restored
        # object has no 'len' attribute and cannot be decoded.
        return self.m, self.bits, self.len

    def __setstate__(self, state):
        # Accept both the current (m, bits, len) state and the older
        # (m, bits) state, for which the item count is recomputed.
        if len(state) == 3:
            m, bits, count = state
        else:
            m, bits = state
            count = None
        self.init(m)
        self.bits = bits
        if count is None:
            count = self._count_items()
        self.len = count

    def _count_items(self):
        # Count the codes stored in self.bits by stepping over them:
        # a run of one-bits, the terminating zero, then m binary bits.
        bits = self.bits
        total = len(bits)
        i = 0
        count = 0
        while i < total:
            while bits[i]:
                i += 1
            i += 1 + self.m
            count += 1
        return count
def encode(m, l):
    """Return a RiceCode with parameter m holding the items of l.

    As a self-check, the result is decoded again and compared with l.
    """
    code = RiceCode(m)
    add = code.append
    for value in l:
        add(value)
    assert code.tolist() == l
    return code
def encode_deltas(l):
    """Return (l[0], deltas) for the sequence l.

    deltas is a RiceCode(6) holding the differences between consecutive
    items of l; for a one-item sequence it is a plain empty list instead.
    """
    if len(l) == 1:
        return l[0], []
    deltas = RiceCode(6)
    for i in range(1, len(l)):
        deltas.append(l[i] - l[i - 1])
    return l[0], deltas
def decode_deltas(start, enc_deltas):
    """Rebuild the list encoded by encode_deltas().

    start is the first item.  enc_deltas holds the differences between
    consecutive items, either as an object with a tolist() method or as
    a plain sequence (encode_deltas() returns [] for one-item input).

    Every delta is applied exactly once, in order; the previous code
    skipped the first delta and applied the last one twice, which only
    gave the right answer when all deltas were equal.
    """
    tolist = getattr(enc_deltas, "tolist", None)
    if tolist is not None:
        deltas = tolist()
    else:
        deltas = list(enc_deltas)
    values = [start]
    for delta in deltas:
        values.append(values[-1] + delta)
    return values
def test():
    """Round-trip random data through encode() and the delta functions."""
    import random
    for size in [10, 20, 50, 100, 200]:
        l = [random.randint(1, size) for i in range(50)]
        c = encode(random.randint(1, 16), l)
        assert c.tolist() == l
    for size in [10, 20, 50, 100, 200]:
        l = list(range(random.randint(1, size),
                       size + random.randint(1, size)))
        t = encode_deltas(l)
        l2 = decode_deltas(*t)
        # Show both lists *before* asserting; with the assert first the
        # diagnostics could never be printed.
        if l != l2:
            print(l)
            print(l2)
        assert l == l2
def pickle_efficiency():
    """Print pickle sizes of plain lists versus their RiceCode form.

    For each combination of m, list size and value range, one line shows
    the parameters, the two pickle lengths, and "win" when the encoded
    pickle is smaller than the raw one, "lose" otherwise.
    """
    import pickle
    import random
    sizes = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]
    elt_ranges = [10, 20, 50, 100, 200, 500, 1000]
    for m in [4, 8, 12]:
        for size in sizes:
            for elt_range in elt_ranges:
                l = [random.randint(1, elt_range) for i in range(size)]
                raw = pickle.dumps(l, 1)
                enc = pickle.dumps(encode(m, l), 1)
                if len(raw) > len(enc):
                    verdict = "win"
                else:
                    verdict = "lose"
                columns = [
                    "m=%2d size=%4d range=%4d" % (m, size, elt_range),
                    "%5d %5d" % (len(raw), len(enc)),
                    verdict,
                ]
                print(" ".join(columns))
# Run the self-test when the module is executed as a script.
if __name__ == "__main__":
    test()
*shared*
stopper stopper.c
"""Provide a default list of stop words for the index.
The specific splitter and lexicon are customizable, but the default
ZCTextIndex should do something useful.
"""
def get_stopdict():
    """Return a dictionary of stopwords.

    The keys are the stop words; every value is None.  The module-level
    dictionary itself is returned, not a copy.
    """
    return _dict
# This list of English stopwords comes from Lucene
_words = [
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
]
# Build the lookup table returned by get_stopdict(): each stop word is a
# key, and the value is always None (only key membership matters).
_dict = {}
for w in _words:
    _dict[w] = None
# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
# for smaller ints. This is intended for lists of word ids (wids). The
# ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding.
# Details:
#
# + Only the first byte of an encoding has the sign bit set.
#
# + The number of bytes in the encoding is encoded in unary at the start of
# the first byte (i.e., an encoding with n bytes begins with n 1-bits
# followed by a 0 bit).
#
# + Bytes beyond the first in an encoding have the sign bit clear, followed
# by 7 bits of data.
#
# + The number of data bits in the first byte of an encoding varies.
#
# The int to be encoded can contain no more than 24 bits.
# XXX this could certainly be increased
#
# If it contains no more than 6 bits, 00abcdef, the encoding is
# 10abcdef
#
# If it contains 7 thru 12 bits,
# 0000abcd efghijkL
# the encoding is
# 110abcde 0fghijkL
#
# Static tables _encoding and _decoding capture all encodes and decodes for
# 12 or fewer bits.
#
# If it contains 13 thru 18 bits,
# 000000ab cdefghij kLmnopqr
# the encoding is
# 1110abcd 0efghijk 0Lmnopqr
#
# If it contains 19 thru 24 bits,
# abcdefgh ijkLmnop qrstuvwx
# the encoding is
# 11110abc 0defghij 0kLmnopq 0rstuvwx
import re
def encode(wids):
    """Encode a list of wids as a string.

    Wids below the size of the _encoding table use the precomputed
    entry; all others (and any wid whose table entry is still empty)
    are encoded by _encode().
    """
    table = _encoding
    limit = len(table)
    pieces = []
    for w in wids:
        if w < limit:
            piece = table[w] or _encode(w)
        else:
            piece = _encode(w)
        pieces.append(piece)
    return "".join(pieces)
# Table indexed by wid giving the encoding of every wid below 0x1000.
_encoding = [None] * 0x1000 # Filled later, and converted to a tuple
def _encode(w):
    """Return the 3- or 4-byte encoding of a wid of 13 to 24 bits.

    The first byte carries the length marker (0xE0 for three bytes, 0xF0
    for four) plus the most significant data bits; each following byte
    carries 7 data bits with the high bit clear.
    """
    assert 0x1000 <= w < 0x1000000
    rest, low = divmod(w, 0x80)
    high, mid = divmod(rest, 0x80)
    tail = chr(mid) + chr(low)
    if high < 0x10:     # no more than 18 data bits
        return chr(high + 0xE0) + tail
    top, upper = divmod(high, 0x80)
    # The 4-byte form has 3 data bits in its first byte (11110abc), so
    # 'top' may be as large as 7.  The previous limit of 0x4 rejected
    # 24-bit wids that the range check above and _decode() both accept.
    assert top < 0x8, (w, top, upper, tail) # else more than 24 data bits
    return (chr(top + 0xF0) + chr(upper)) + tail
# Matches one encoded wid: a byte with the high bit set followed by any
# number of bytes with the high bit clear.
_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
def decode(code):
    """Decode a string into a list of wids."""
    lookup = _decoding.get
    wids = []
    for chunk in _prog.findall(code):
        # Obscure: while _decoding does have the key '\x80', its value
        # is 0, so the "or" here calls _decode('\x80') anyway.
        wids.append(lookup(chunk) or _decode(chunk))
    return wids
# Maps the 1- and 2-byte encodings back to their wids.
_decoding = {} # Filled later
def _decode(s):
    """Return the wid for a 3- or 4-byte encoding, or for '\\x80'."""
    if s == '\x80':
        # See comment in decode().  This is here to allow a trick to work.
        return 0
    size = len(s)
    if size == 3:
        a, b, c = [ord(ch) for ch in s]
        assert a & 0xF0 == 0xE0 and not b & 0x80 and not c & 0x80
        return ((a & 0xF) << 14) | (b << 7) | c
    assert size == 4, repr(s)
    a, b, c, d = [ord(ch) for ch in s]
    assert a & 0xF8 == 0xF0 and not b & 0x80 and not c & 0x80 and not d & 0x80
    return ((a & 0x7) << 21) | (b << 14) | (c << 7) | d
def _fill():
    """Populate _encoding and _decoding for every wid below 0x1000.

    Wids below 0x40 get the one-byte form, the rest the two-byte form.
    Afterwards _encoding is rebound to a tuple.
    """
    global _encoding
    for wid in range(0x1000):
        if wid < 0x40:
            code = chr(wid + 0x80)
        else:
            hi, lo = divmod(wid, 0x80)
            code = chr(hi + 0xC0) + chr(lo)
        _encoding[wid] = code
        _decoding[code] = wid
    _encoding = tuple(_encoding)

_fill()
def test():
    """Check that every wid below 2**20 survives an encode/decode trip.

    Progress is printed every 1000 wids.
    """
    for wid in range(2**20):
        if not wid % 1000:
            print(wid)
        expected = [wid]
        code = encode(expected)
        assert decode(code) == expected, (expected, code, decode(code))
# Run the self-test when the module is executed as a script.
if __name__ == "__main__":
    test()
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Plug in text index for ZCatalog with relevance ranking."""
import ZODB
from Persistence import Persistent
import Acquisition
from OFS.SimpleItem import SimpleItem
from Products.PluginIndexes.common.PluggableIndex \
import PluggableIndexInterface
from Products.ZCTextIndex.Index import Index
from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from Globals import DTMLFile
from Interface import verify_class_implementation
class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
    # Pluggable text index: wraps an Index built on a Lexicon looked up
    # from the object that creates it.  (Comments rather than docstrings
    # are used throughout so that nothing new becomes web-publishable.)

    __implements__ = PluggableIndexInterface

    meta_type = 'ZCTextIndex'

    manage_options= (
        {'label': 'Settings', 'action': 'manage_main'},
    )

    def __init__(self, id, extra, caller):
        # extra.doc_attr  -- name of the attribute (or method) on indexed
        #                    objects that supplies the text
        # extra.lexicon_id -- name of the lexicon attribute on caller
        self.id = id
        self._fieldname = extra.doc_attr
        lexicon_id = extra.lexicon_id
        lexicon = getattr(caller, lexicon_id, None)
        if lexicon is None:
            raise LookupError('Lexicon "%s" not found' % lexicon_id)

        verify_class_implementation(ILexicon, lexicon.__class__)

        self.lexicon = lexicon
        self.index = Index(self.lexicon)
        self.parser = QueryParser()

    def index_object(self, docid, obj):
        text = self._get_object_text(obj)
        self.index.index_doc(docid, text)
        self._p_changed = 1 # XXX

    def unindex_object(self, docid):
        self.index.unindex_doc(docid)
        self._p_changed = 1 # XXX

    def _apply_index(self, req):
        pass # XXX

    def query(self, query, nbest=10):
        # Parse and run the query, then return whatever NBest.getbest()
        # yields for the nbest highest-scoring (docid, score) items.
        parse_tree = self.parser.parseQuery(query)
        scores = parse_tree.executeQuery(self.index)
        best = NBest(nbest)
        best.addmany(scores.items())
        return best.getbest()

    def _get_object_text(self, obj):
        # The configured field may be a plain attribute or a callable.
        value = getattr(obj, self._fieldname)
        if not callable(value):
            return value
        return value()

    ## User Interface Methods ##

    manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
                          RESPONSE=None):
    """Add a text index"""
    # REQUEST is optional, so only read URL3 from it when it was supplied;
    # previously a call without REQUEST died with AttributeError on None.
    if REQUEST is None:
        URL3 = None
    else:
        URL3 = REQUEST.URL3
    return self.manage_addIndex(id, 'ZCTextIndex', extra,
                                REQUEST, RESPONSE, URL3)
manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
def manage_addLexicon(self, id, title, splitter=None, normalizer=None,
                      stopword=None, REQUEST=None):
    """Add a Lexicon whose pipeline holds the selected elements."""
    # Imported here because this module's import block does not bring
    # these names into scope.
    from Products.ZCTextIndex.Lexicon import \
         Lexicon, Splitter, CaseNormalizer, StopWordRemover

    # NOTE(review): 'title' is accepted from the form but not stored on
    # the lexicon -- confirm whether it should be.
    elements = []
    if splitter:
        elements.append(Splitter())
    if normalizer:
        elements.append(CaseNormalizer())
    # The parameter (and the form field) is 'stopword'; the old code
    # tested an undefined name 'stopwords'.
    if stopword:
        elements.append(StopWordRemover())
    lexicon = Lexicon(*elements)
    self._setObject(id, lexicon)
    if REQUEST is not None:
        return self.manage_main(self, REQUEST, update_menu=1)
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""ZCatalog Text Index
Experimental plugin text index for ZCatalog.
"""
def initialize(context):
    """Register the ZCTextIndex class and its add form/action with context."""
    from Products.ZCTextIndex import ZCTextIndex

    index_class = ZCTextIndex.ZCTextIndex
    add_form = ZCTextIndex.manage_addZCTextIndexForm
    add_action = ZCTextIndex.manage_addZCTextIndex

    context.registerClass(
        index_class,
        permission='Add Pluggable Index',
        constructors=(add_form, add_action),
        visibility=None
    )
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add Lexicon',
)">
<FORM ACTION="manage_addLexicon" METHOD="POST">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Title
</div>
</td>
<td align="left" valign="top">
<input type="text" name="title" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
splitter?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="splitter" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
case normalizer?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="normalizer" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
remove stop words?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="stopword" />
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex',
)">
<p class="form-help">
<strong>Text Indexes</strong> break text up into individual words, and
are often referred to as full-text indexes. Text indexes
sort results by score, meaning they return hits in order
from the most relevant to the least relevant.
</p>
<form action="manage_addZCTextIndex" method="post"
enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Vocabulary
</div>
</td>
<td>
<select name="extra.vocabulary:record">
<dtml-in "this().aq_parent.objectItems('Vocabulary')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Field name
</div></td>
<td align="left" valign="top">
<input type="text" name="extra.doc_attr:record" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Lexicon
</div></td>
<td>
<select name="extra.lexicon_id:record">
<dtml-in "this().aq_parent.objectItems('Lexicon')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Type
</div>
</td>
<td align="left" valign="top">
ZCTextIndex
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
There is nothing to manage here. Move along.
</p>
<dtml-var manage_page_footer>
/* stopper.c
*
* Fast version of the StopWordRemover object.
*/
#include "Python.h"
#include "structmember.h"
/* Instance layout of a stop-word remover object. */
typedef struct {
    PyObject_HEAD
    /* Dictionary whose keys are looked up by process() to decide which
     * items to drop; exposed read-only to Python as the "dict" attribute. */
    PyObject *swr_dict;
} StopWordRemover;
/*
 * process(seq) -> list
 *
 * Return a new list holding, in order, every item of seq that is not a
 * key of the stop-word dictionary.  Returns NULL with an exception set
 * if the argument is not a sequence or an allocation fails.
 */
static PyObject *
swr_process(StopWordRemover *self, PyObject *args)
{
    PyObject *arg;
    PyObject *items;
    PyObject *kept;
    int count, pos;

    if (!PyArg_ParseTuple(args, "O:process", &arg))
        return NULL;
    items = PySequence_Fast(arg,
                            "process() requires a sequence as the argument");
    if (items == NULL)
        return NULL;
    kept = PyList_New(0);
    if (kept == NULL) {
        Py_DECREF(items);
        return NULL;
    }
#if PY_VERSION_HEX >= 0x02020000
    /* Only available in Python 2.2 and newer. */
    count = PySequence_Fast_GET_SIZE(items);
#else
    count = PyObject_Length(items);
#endif
    for (pos = 0; pos < count; ++pos) {
        /* Borrowed reference; PyList_Append() takes its own. */
        PyObject *word = PySequence_Fast_GET_ITEM(items, pos);

        /*
         * PyDict_GetItem() returns NULL if there isn't a matching
         * item, but without setting an exception, so a NULL result
         * simply means "not a stop word".
         */
        if (PyDict_GetItem(self->swr_dict, word) != NULL)
            continue;
        if (PyList_Append(kept, word) < 0) {
            Py_DECREF(kept);
            kept = NULL;
            break;
        }
    }
    Py_DECREF(items);
    return kept;
}
/* Data attributes visible from Python: a read-only "dict" mapped onto
 * the swr_dict field. */
static struct memberlist swr_members[] = {
    {"dict", T_OBJECT, offsetof(StopWordRemover, swr_dict), READONLY},
    {NULL}
};
/* Methods of StopWordRemover instances. */
static PyMethodDef swr_methods[] = {
    {"process", (PyCFunction)swr_process, METH_VARARGS,
     "process([str, ...]) --> [str, ...]\n"
     "Remove stop words from the input list of strings to create a new list."},
    {NULL}
};
/*
 * tp_getattr slot: look the name up among the methods first and, if it
 * is not found there, fall back to the member table.
 */
static PyObject *
swr_getattr(PyObject *self, char *name)
{
    PyObject *method = Py_FindMethod(swr_methods, self, name);

    if (method == NULL) {
        /* Discard the failed method lookup's error before trying members. */
        PyErr_Clear();
        return PyMember_Get((char *)self, swr_members, name);
    }
    return method;
}
/* tp_dealloc slot: drop the reference to the stop-word dictionary, then
 * free the object itself. */
static void
swr_dealloc(StopWordRemover *self)
{
    /* XDECREF because swr_new() may deallocate before the dict exists. */
    Py_XDECREF(self->swr_dict);
    PyObject_Del(self);
}
/* Type object for stop-word removers.  ob_type is left NULL here and
 * filled in by initstopper(); slots after tp_setattr are not listed. */
static PyTypeObject StopWordRemover_Type = {
    PyObject_HEAD_INIT(NULL) /* ob_type */
    0, /* ob_size */
    "stopper.StopWordRemover", /* tp_name */
    sizeof(StopWordRemover), /* tp_basicsize */
    0, /* tp_itemsize */
    (destructor)swr_dealloc, /* tp_dealloc */
    0, /* tp_print */
    (getattrfunc)swr_getattr, /* tp_getattr */
    0, /* tp_setattr */
};
/*
 * new([dict]) -> StopWordRemover
 *
 * Create a remover that uses the given dictionary, or a fresh empty
 * dictionary when none is passed.  Returns NULL with an exception set
 * on bad arguments or allocation failure.
 */
static PyObject *
swr_new(PyObject *notused, PyObject *args)
{
    StopWordRemover *remover;
    PyObject *stopwords = NULL;

    if (!PyArg_ParseTuple(args, "|O!:new", &PyDict_Type, &stopwords))
        return NULL;
    remover = PyObject_New(StopWordRemover, &StopWordRemover_Type);
    if (remover == NULL)
        return NULL;
    if (stopwords == NULL) {
        remover->swr_dict = PyDict_New();
        if (remover->swr_dict == NULL) {
            /* swr_dealloc() copes with the NULL dict. */
            Py_DECREF(remover);
            return NULL;
        }
    }
    else {
        /* The argument is borrowed; keep our own reference. */
        Py_INCREF(stopwords);
        remover->swr_dict = stopwords;
    }
    return (PyObject *) remover;
}
/* The module's "new" function, looked up once by initstopper() and used
 * as the reconstructor in the pickle reduce value. */
static PyObject*
pickle_constructor = NULL;

/*
 * _pickler(remover) -> (constructor, (dict,))
 *
 * Reduce function registered with copy_reg: returns the constructor and
 * the argument tuple needed to rebuild the remover.  Declared static,
 * like every other function in this file except the init function, so
 * the symbol does not leak out of the extension module.
 */
static PyObject *
swr_pickler(PyObject *unused, PyObject *args)
{
    StopWordRemover *swr;
    PyObject *result = NULL;

    if (PyArg_ParseTuple(args, "O!:_pickler", &StopWordRemover_Type, &swr)) {
        result = Py_BuildValue("O(O)", pickle_constructor, swr->swr_dict);
    }
    return result;
}
/* Module-level functions of the "stopper" module. */
static PyMethodDef stopper_functions[] = {
    {"new", swr_new, METH_VARARGS,
     "new() -> StopWordRemover instance\n"
     "Create & return a new stop-word remover."},
    {"_pickler", swr_pickler, METH_VARARGS,
     "_pickler(StopWordRemover instance) -> pickle magic\n"
     "Internal magic used to make stop-word removers picklable."},
    {NULL}
};
/*
 * Module initialization: create the "stopper" module, publish the type
 * object as StopWordRemoverType, and register the pickling support
 * functions with copy_reg.
 */
void
initstopper(void)
{
    PyObject *m, *copy_reg;

    StopWordRemover_Type.ob_type = &PyType_Type;
    m = Py_InitModule3("stopper", stopper_functions,
                       "Fast StopWordRemover implementation.");
    if (m == NULL)
        return;
    if (PyObject_SetAttrString(m, "StopWordRemoverType",
                               (PyObject *) &StopWordRemover_Type) < 0)
        return;

    /* register to support pickling */
    copy_reg = PyImport_ImportModule("copy_reg");
    if (copy_reg != NULL) {
        PyObject *pickler;

        if (pickle_constructor == NULL) {
            /* PyObject_GetAttrString() returns a new reference, which the
             * global keeps for the life of the process; no extra INCREF
             * is needed. */
            pickle_constructor = PyObject_GetAttrString(m, "new");
        }
        pickler = PyObject_GetAttrString(m, "_pickler");
        if ((pickle_constructor != NULL) && (pickler != NULL)) {
            PyObject *res;
            res = PyObject_CallMethod(
                    copy_reg, "pickle", "OOO", &StopWordRemover_Type,
                    pickler, pickle_constructor);
            Py_XDECREF(res);
        }
        /* Release our own reference to the pickler (was leaked before). */
        Py_XDECREF(pickler);
        Py_DECREF(copy_reg);
    }
}
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""
Revision information:
$Id: __init__.py,v 1.2 2002/05/14 15:12:34 gvanrossum Exp $
"""
#! /usr/bin/env python
import cPickle
import os.path
import sys
from hotshot.log import LogReader
def load_line_info(log):
    """Accumulate time and hit counts per location from a profile log.

    log is iterated as (what, place, tdelta) triples.  Each positive
    tdelta is charged to the location of the previous positive-tdelta
    record (None for the first one).  Returns a dict mapping location to
    a (total time, hit count) pair.
    """
    totals = {}
    previous = None
    for what, place, tdelta in log:
        if not tdelta > 0:
            continue
        elapsed, hits = totals.get(previous, (0, 0))
        totals[previous] = (tdelta + elapsed), (hits + 1)
        previous = place
    return totals
def basename(path, cache={}):
    """Return the final component of path, memoized in cache.

    The mutable default is deliberate: it is the memo shared by all calls.
    """
    if path not in cache:
        cache[path] = os.path.split(path)[1]
    return cache[path]
def print_results(results):
for info, place in results:
if not place:
print 'Bad unpack:', info, place
continue
filename, line, funcname = place
print '%8d %8d' % info, basename(filename), line
def annotate_results(results):
    """Group results by source file and annotate each file that exists.

    Entries without a location are ignored.  Files are handled in sorted
    name order, each with its (line, hits, time) entries sorted.
    """
    by_file = {}
    for stats, place in results:
        if not place:
            continue
        elapsed, hits = stats
        path, lineno, func = place
        by_file.setdefault(path, []).append((lineno, hits, elapsed))
    paths = by_file.keys()
    paths.sort()
    for path in paths:
        if not os.path.exists(path):
            continue
        entries = by_file[path]
        entries.sort()
        annotate(path, entries)
def annotate(file, lines):
print "-" * 60
print file
print "-" * 60
f = open(file)
i = 1
match = lines[0][0]
for line in f:
if match == i:
print "%6d %8d " % lines[0][1:], line,
del lines[0]
if lines:
match = lines[0][0]
else:
match = None
else:
print " " * 16, line,
i += 1
print
def get_cache_name(filename):
    """Return (cache directory, cache file) for a profile data file.

    The cache lives in a '.hs-tool' directory next to filename and has
    the same base name.
    """
    directory, name = os.path.split(filename)
    cache_dir = os.path.join(directory, '.hs-tool')
    return cache_dir, os.path.join(cache_dir, name)
def cache_results(filename, results):
    """Pickle results into the cache file for filename, creating the
    cache directory if it does not exist yet."""
    directory, target = get_cache_name(filename)
    if not os.path.exists(directory):
        os.mkdir(directory)
    out = open(target, 'wb')
    try:
        # Third argument 1 selects the binary pickle format.
        cPickle.dump(results, out, 1)
    finally:
        out.close()
def main(filename, annotate):
    """Load (or compute and cache) the per-line results for filename, then
    either annotate the source files or print a summary table."""
    cache_dir, cache_file = get_cache_name(filename)
    cache_is_fresh = (os.path.isfile(cache_file) and
                      os.path.getmtime(cache_file) > os.path.getmtime(filename))
    if not cache_is_fresh:
        byline = load_line_info(LogReader(filename))
        # Sort by (time, hits) by putting the stats first.
        results = [(stats, place) for place, stats in byline.items()]
        results.sort()
        cache_results(filename, results)
    else:
        # cached data is up-to-date:
        fp = open(cache_file, 'rb')
        results = cPickle.load(fp)
        fp.close()
    if annotate:
        report = annotate_results
    else:
        report = print_results
    report(results)
if __name__ == "__main__":
    # usage: [-A] [profile-file]; -A annotates source files instead of
    # printing the summary.  The file defaults to "profile.dat".
    import getopt

    opts, args = getopt.getopt(sys.argv[1:], 'A')
    annotate_p = 0
    for o, v in opts:
        if o == '-A':
            annotate_p = 1
    if not args:
        filename = "profile.dat"
    else:
        filename, = args
    main(filename, annotate_p)
#! /usr/bin/env python
"""Index a collection of HTML files on the filesystem.
usage: indexhtml.py [options] dir
Will create an index of all files in dir or its subdirectories.
options:
-f data.fs -- the path to the filestorage datafile
"""
import os
import ZODB
from ZODB.FileStorage import FileStorage
from BTrees.IOBTree import IOBTree
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
from Products.ZCTextIndex.Lexicon import Lexicon, StopWordRemover
def make_index():
    """Build a ZCTextIndex over an HTML-splitting, stop-word-removing lexicon.

    The index reads document text by calling the "read" attribute of the
    indexed object.
    """
    # there's an elaborate dance necessary to construct an index
    class Struct:
        pass
    extra = Struct()
    extra.doc_attr = "read"
    extra.lexicon_id = "lexicon"
    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
    # ZCTextIndex.__init__ takes (id, extra, caller); the id was missing.
    return ZCTextIndex("index", extra, caller)
def main(db, root, dir):
rt["index"] = index = make_index()
rt["files"] = paths = IOBTree()
get_transaction().commit()
files = [os.path.join(dir, file) for file in os.listdir(dir)]
docid = 0
for file in files:
if os.path.isdir(file):
files += [os.path.join(file, sub) for sub in os.listdir(file)]
else:
if not file.endswith(".html"):
continue
docid += 1
print "%5d" % docid, file
f = open(file, "rb")
paths[docid] = file
index.index_object(docid, f)
f.close()
if docid % TXN_INTERVAL == 0:
get_transaction().commit()
if docid % PACK_INTERVAL == 0:
db.pack()
get_transaction().commit()
if __name__ == "__main__":
import sys
import getopt
VERBOSE = 0
FSPATH = "Data.fs"
TXN_INTERVAL = 100
PACK_INTERVAL = 500
try:
opts, args = getopt.getopt(sys.argv[1:], 'vf:')
except getopt.error, msg:
print msg
print __doc__
sys.exit(2)
for o, v in opts:
if o == '-v':
VERBOSE += 1
if o == '-f':
FSPATH = v
if len(args) != 1:
print "Expected on argument"
print __doc__
sys.exit(2)
dir = args[0]
fs = FileStorage(FSPATH)
db = ZODB.DB(fs)
cn = db.open()
rt = cn.root()
dir = os.path.join(os.getcwd(), dir)
print dir
main(db, rt, dir)
cn.close()
fs.close()
"""Test an index with a Unix mailbox file.
usage: python mailtest.py [options] <data.fs>
options:
-v -- verbose
-n NNN -- max number of messages to read from mailbox
-q query
-i mailbox
-p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
-p 0 -- don't pack at all
-b NNN -- return the NNN best matches (default: 10)
-x -- exclude the message text from the data.fs
-t NNN -- commit a transaction every NNN messages (default: 1)
The script either indexes or queries depending on whether -q or -i is
passed as an option.
For -i mailbox, the script reads mail messages from the mailbox and
indexes them. It indexes one message at a time, then commits the
transaction.
For -q query, it performs a query on an existing index.
If both are specified, the index is performed first.
You can also interact with the index after it is completed. Load the
index from the database:
import ZODB
from ZODB.FileStorage import FileStorage
fs = FileStorage(<data.fs>)
db = ZODB.DB(fs)
index = db.open().root()["index"]
index.query("python AND unicode")
"""
import ZODB
import ZODB.FileStorage
from Products.ZCTextIndex.Lexicon import Lexicon, \
CaseNormalizer, Splitter, StopWordRemover
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from BTrees.IOBTree import IOBTree
import sys
import mailbox
import time
def usage(msg):
print msg
print __doc__
sys.exit(2)
class Message:
    """Indexable text of one mail message: a summary line plus the body."""

    # Running total of len(text) over all Message instances created.
    total_bytes = 0

    def __init__(self, msg):
        subject = msg.getheader('subject', '')
        author = msg.getheader('from', '')
        if not author:
            summary = "%s\n" % subject
        else:
            summary = "%s (%s)\n" % (subject, author)
        body = msg.fp.read()
        self.text = summary + body
        Message.total_bytes += len(self.text)
class Extra:
    """Empty attribute holder used for the 'extra' and 'caller' arguments."""
def index(rt, mboxfile, db):
global NUM
idx_time = 0
pack_time = 0
lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
extra = Extra()
extra.lexicon_id = 'lexicon'
extra.doc_attr = 'text'
caller = Extra()
caller.lexicon = lexicon
rt["index"] = idx = ZCTextIndex("index", extra, caller)
if not EXCLUDE_TEXT:
rt["documents"] = docs = IOBTree()
get_transaction().commit()
mbox = mailbox.UnixMailbox(open(mboxfile))
if VERBOSE:
print "opened", mboxfile
if not NUM:
NUM = sys.maxint
i = 0
while i < NUM:
_msg = mbox.next()
if _msg is None:
break
i += 1
msg = Message(_msg)
if VERBOSE >= 2:
print "indexing msg", i
i0 = time.clock()
idx.index_object(i, msg)
if not EXCLUDE_TEXT:
docs[i] = msg
if i % TXN_SIZE == 0:
get_transaction().commit()
i1 = time.clock()
idx_time += i1 - i0
if VERBOSE and i % 50 == 0:
print i, "messages indexed"
print "cache size", db.cacheSize()
if PACK_INTERVAL and i % PACK_INTERVAL == 0:
if VERBOSE >= 2:
print "packing..."
p0 = time.clock()
db.pack(time.time())
p1 = time.clock()
if VERBOSE:
print "pack took %s sec" % (p1 - p0)
pack_time += p1 - p0
get_transaction().commit()
if PACK_INTERVAL and i % PACK_INTERVAL != 0:
if VERBOSE >= 2:
print "packing one last time..."
p0 = time.clock()
db.pack(time.time())
p1 = time.clock()
if VERBOSE:
print "pack took %s sec" % (p1 - p0)
pack_time += p1 - p0
if VERBOSE:
print "Index time", idx_time
print "Index bytes", Message.total_bytes
rate = (Message.total_bytes / idx_time) / 1024
print "Index rate %d KB/sec" % int(rate)
def query(rt, query_str):
idx = rt["index"]
docs = rt["documents"]
results = idx.query(query_str, BEST)
print "query:", query_str
print "# results:", len(results)
for docid, score in results:
print "docid %4d score %2d" % (docid, score)
if VERBOSE:
msg = docs[docid]
# print 3 lines of context
CONTEXT = 5
ctx = msg.text.split("\n", CONTEXT)
del ctx[-1]
print "-" * 60
print "message:"
for l in ctx:
print l
print "-" * 60
def main(fs_path, mbox_path, query_str):
    """Open the database at fs_path, then index mbox_path and/or run
    query_str, whichever are not None (indexing first).

    The connection, database and storage are closed even when indexing
    or querying raises.
    """
    f = ZODB.FileStorage.FileStorage(fs_path)
    try:
        db = ZODB.DB(f, cache_size=CACHE_SIZE)
        try:
            cn = db.open()
            try:
                rt = cn.root()
                if mbox_path is not None:
                    index(rt, mbox_path, db)
                if query_str is not None:
                    query(rt, query_str)
            finally:
                cn.close()
        finally:
            db.close()
    finally:
        f.close()
if __name__ == "__main__":
import getopt
NUM = 0
BEST = 10
VERBOSE = 0
PACK_INTERVAL = 500
EXCLUDE_TEXT = 0
CACHE_SIZE = 10000
TXN_SIZE = 1
query_str = None
mbox_path = None
profile = None
old_profile = None
try:
opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:xt:',
['profile=', 'old-profile='])
except getopt.error, msg:
usage(msg)
if len(args) != 1:
usage("exactly 1 filename argument required")
for o, v in opts:
if o == '-n':
NUM = int(v)
elif o == '-v':
VERBOSE += 1
elif o == '-p':
PACK_INTERVAL = int(v)
elif o == '-q':
query_str = v
elif o == '-i':
mbox_path = v
elif o == '-b':
BEST = int(v)
elif o == '-x':
EXCLUDE_TEXT = 1
elif o == '-t':
TXN_SIZE = int(v)
elif o == '--profile':
profile = v
elif o == '--old-profile':
old_profile = v
fs_path, = args
if profile:
import hotshot
profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
profiler.runcall(main, fs_path, mbox_path, query_str)
profiler.close()
elif old_profile:
import profile, pstats
profiler = profile.Profile()
profiler.runcall(main, fs_path, mbox_path, query_str)
profiler.dump_stats(old_profile)
stats = pstats.Stats(old_profile)
stats.strip_dirs().sort_stats('time').print_stats(20)
else:
main(fs_path, mbox_path, query_str)
This diff is collapsed.
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.Index import Index
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
class IndexTest(TestCase):
    # Exercises Index over a Lexicon that uses only the plain Splitter.

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = Index(self.lexicon)

    # -- helpers --------------------------------------------------------

    def _check_sizes(self, wordinfo, docwords):
        self.assertEqual(len(self.index._wordinfo), wordinfo)
        self.assertEqual(len(self.index._docwords), docwords)

    def _check_every_word_only_in(self, docid):
        # Each word must map to exactly one document: docid.
        for docmap in self.index._wordinfo.values():
            self.assertEqual(len(docmap), 1)
            self.assert_(docmap.has_key(docid))

    def _check_hits(self, results, expected):
        self.assertEqual(list(results.keys()), expected)

    # -- indexing and unindexing ------------------------------------------

    def test_index_document(self, DOCID=1):
        doc = "simple document contains five words"
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self._check_sizes(5, 1)
        self.assertEqual(len(self.index._get_undoinfo(DOCID)), 5)
        self._check_every_word_only_in(DOCID)

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        self.assertEqual(len(self.index._docweight), 0)
        self._check_sizes(0, 0)

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self._check_sizes(8, 2)
        self.assertEqual(len(self.index._get_undoinfo(DOCID)), 4)
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        # Only "document" is shared by both documents.
        for wid, docmap in self.index._wordinfo.items():
            if wid != document_wid:
                self.assertEqual(len(docmap), 1)
            else:
                self.assertEqual(len(docmap), 2)
                self.assert_(docmap.has_key(1))
                self.assert_(docmap.has_key(DOCID))

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assert_(self.index._docweight[DOCID])
        self._check_sizes(4, 1)
        self.assertEqual(len(self.index._get_undoinfo(DOCID)), 4)
        self._check_every_word_only_in(DOCID)

    def test_index_duplicated_words(self, DOCID=1):
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self._check_sizes(5, 1)
##        self.assertEqual(len(self.index._get_undoinfo(DOCID)), 5)
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        self._check_every_word_only_in(DOCID)

    # -- searching ----------------------------------------------------------

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self._check_hits(self.index.search("document"), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        self._check_hits(self.index.search("frobnicate"), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        self._check_hits(self.index.search("document"), [1])

    def test_search_phrase(self):
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        self._check_hits(self.index.search_phrase("quick brown fox"), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        self._check_hits(self.index.search_glob("bro*"), [1, 2])
        self._check_hits(self.index.search_glob("b*"), [1, 2, 3])
def test_suite():
    # Suite holding every test method of IndexTest.
    suite = makeSuite(IndexTest)
    return suite

if __name__ == '__main__':
    main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer
class StupidPipelineElement:
    """Pipeline element that replaces one given word with another."""

    def __init__(self, fromword, toword):
        self.__fromword = fromword
        self.__toword = toword

    def __swap(self, term):
        # Only an exact match of the source word is replaced.
        if term == self.__fromword:
            return self.__toword
        return term

    def process(self, seq):
        return [self.__swap(term) for term in seq]
class WackyReversePipelineElement:
    """Pipeline element that spells one given word backwards."""

    def __init__(self, revword):
        self.__revword = revword

    def __flip(self, term):
        # Everything except the chosen word passes through untouched.
        if term != self.__revword:
            return term
        letters = list(term)
        letters.reverse()
        return ''.join(letters)

    def process(self, seq):
        return [self.__flip(term) for term in seq]
class StopWordPipelineElement:
    """Pipeline element that drops terms mapped to a true value in a dict."""

    def __init__(self, stopdict=None):
        # A fresh dict per instance instead of one shared mutable default.
        if stopdict is None:
            stopdict = {}
        self.__stopdict = stopdict

    def process(self, seq):
        # A term is removed only when its dictionary value is true; a key
        # with a false value is kept.
        res = []
        for term in seq:
            if self.__stopdict.get(term):
                continue
            else:
                res.append(term)
        return res
class Test(TestCase):
    # Lexicon behaviour with various pipelines.  Word ids are expected to
    # be handed out in order of first appearance, starting at 1.

    def _primed(self, source, *pipeline):
        # Build a lexicon from the pipeline elements and feed it source.
        lexicon = Lexicon(*pipeline)
        lexicon.sourceToWordIds(source)
        return lexicon

    def testSourceToWordIds(self):
        lexicon = Lexicon(Splitter())
        self.assertEqual(lexicon.sourceToWordIds('cats and dogs'), [1, 2, 3])

    def testTermToWordIds(self):
        lexicon = self._primed('cats and dogs', Splitter())
        self.assertEqual(lexicon.termToWordIds('dogs'), [3])

    def testMissingTermToWordIds(self):
        lexicon = self._primed('cats and dogs', Splitter())
        self.assertEqual(lexicon.termToWordIds('boxes'), [])

    def testOnePipelineElement(self):
        lexicon = self._primed('cats and dogs', Splitter(),
                               StupidPipelineElement('dogs', 'fish'))
        self.assertEqual(lexicon.termToWordIds('fish'), [3])

    def testSplitterAdaptorFold(self):
        lexicon = self._primed('CATS and dogs', Splitter(), CaseNormalizer())
        self.assertEqual(lexicon.termToWordIds('cats and dogs'), [1, 2, 3])

    def testSplitterAdaptorNofold(self):
        lexicon = self._primed('CATS and dogs', Splitter())
        self.assertEqual(lexicon.termToWordIds('cats and dogs'), [2, 3])

    def testTwoElementPipeline(self):
        lexicon = self._primed('cats and dogs', Splitter(),
                               StupidPipelineElement('cats', 'fish'),
                               WackyReversePipelineElement('fish'))
        self.assertEqual(lexicon.termToWordIds('hsif'), [1])

    def testThreeElementPipeline(self):
        lexicon = self._primed('cats and dogs', Splitter(),
                               StopWordPipelineElement({'and':1}),
                               StupidPipelineElement('dogs', 'fish'),
                               WackyReversePipelineElement('fish'))
        self.assertEqual(lexicon.termToWordIds('hsif'), [2])
def test_suite():
    # Suite holding every test method of Test.
    suite = makeSuite(Test)
    return suite

if __name__ == '__main__':
    main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.NBest import NBest
class NBestTest(TestCase):
    # Checks NBest: construction limits, a capacity-1 chooser, and larger
    # choosers fed in several input orders.

    def _check(self, nb, size, capacity, best):
        self.assertEqual(len(nb), size)
        self.assertEqual(nb.capacity(), capacity)
        self.assertEqual(nb.getbest(), best)

    def testConstructor(self):
        for bad in 0, -1:
            self.assertRaises(ValueError, NBest, bad)

        for n in range(1, 11):
            nb = NBest(n)
            self.assertEqual(len(nb), 0)
            self.assertEqual(nb.capacity(), n)

    def testOne(self):
        nb = NBest(1)
        nb.add('a', 0)
        self.assertEqual(nb.getbest(), [('a', 0)])

        # A higher score replaces the current best ...
        nb.add('b', 1)
        self._check(nb, 1, 1, [('b', 1)])

        # ... a lower one does not.
        nb.add('c', -1)
        self._check(nb, 1, 1, [('b', 1)])

        nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
        self._check(nb, 1, 1, [('f', 5)])

    def testMany(self):
        import random
        inputs = [(-i, i) for i in range(50)]

        reversed_inputs = inputs[:]
        reversed_inputs.reverse()

        # Test the N-best for a variety of n (1, 6, 11, ... 50).
        for n in range(1, len(inputs)+1, 5):
            expected = inputs[-n:]
            expected.reverse()

            random_inputs = inputs[:]
            random.shuffle(random_inputs)

            for source in inputs, reversed_inputs, random_inputs:
                # Try feeding them one at a time.
                nb = NBest(n)
                for item, score in source:
                    nb.add(item, score)
                self._check(nb, n, n, expected)

                # And again in one gulp.
                nb = NBest(n)
                nb.addmany(source)
                self._check(nb, n, n, expected)

                # Popping yields the kept items from worst to best, then
                # fails once the chooser is empty.
                for i in range(1, n+1):
                    self.assertEqual(nb.pop_smallest(), expected[-i])
                self.assertRaises(IndexError, nb.pop_smallest)
def test_suite():
    # Suite holding every test method of NBestTest.
    suite = makeSuite(NBestTest)
    return suite

if __name__ == '__main__':
    main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from BTrees.IIBTree import IIBucket
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.ParseTree import ParseError, QueryError
class FauxIndex:
    """Stand-in index whose search() knows three fixed terms."""

    # term -> document ids that contain it (each scored 1)
    _docids = {
        "foo": (1, 3),
        "bar": (1, 2),
        "ham": (1, 2, 3, 4),
    }

    def search(self, term):
        # Unknown terms produce an empty bucket.
        bucket = IIBucket()
        for docid in self._docids.get(term, ()):
            bucket[docid] = 1
        return bucket
class TestQueryEngine(TestCase):
    # Runs parsed queries against FauxIndex and checks the scored results.

    def setUp(self):
        self.parser = QueryParser()
        self.index = FauxIndex()

    def compareSet(self, set, dict):
        # Copy the result mapping into a plain dictionary for comparison.
        found = {}
        for docid, score in set.items():
            found[docid] = score
        self.assertEqual(found, dict)

    def compareQuery(self, query, dict):
        parse_tree = self.parser.parseQuery(query)
        self.compareSet(parse_tree.executeQuery(self.index), dict)

    def testExecuteQuery(self):
        cases = [
            ("foo AND bar", {1: 2}),
            ("foo OR bar", {1: 2, 2: 1, 3:1}),
            ("foo AND NOT bar", {3: 1}),
            ("foo AND foo AND foo", {1: 3, 3: 3}),
            ("foo OR foo OR foo", {1: 3, 3: 3}),
            ("ham AND NOT foo AND NOT bar", {4: 1}),
            ("ham OR foo OR bar", {1: 3, 2: 2, 3: 2, 4: 1}),
            ("ham AND foo AND bar", {1: 3}),
        ]
        for query, expected in cases:
            self.compareQuery(query, expected)

    def testInvalidQuery(self):
        # A bare NOT node cannot be executed on its own.
        from Products.ZCTextIndex.ParseTree import NotNode, AtomNode
        bare_not = NotNode(AtomNode("foo"))
        self.assertRaises(QueryError, bare_not.executeQuery, self.index)
def test_suite():
    """Return a suite holding every test method of TestQueryEngine."""
    suite = makeSuite(TestQueryEngine)
    return suite
# Script entry point: let unittest's ``main`` run the tests produced by
# the module-level ``test_suite`` callable.
if __name__ == '__main__':
    main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.ParseTree import ParseError, ParseTreeNode
from Products.ZCTextIndex.ParseTree import OrNode, AndNode, NotNode
from Products.ZCTextIndex.ParseTree import AtomNode, PhraseNode, GlobNode
class TestQueryParser(TestCase):
    """Check the trees QueryParser builds and the queries it rejects.

    The parser under test is exposed as ``self.p``.
    """

    def setUp(self):
        self.p = QueryParser()

    def compareParseTrees(self, got, expected):
        """Recursively assert that parse tree *got* matches *expected*.

        Both nodes must have the same class and *got* must report the
        node type belonging to that class.  Phrase, glob and atom nodes
        are compared by value, a NOT node by its single subtree, and
        AND/OR nodes child by child.
        """
        self.assertEqual(isinstance(got, ParseTreeNode), 1)
        self.assertEqual(got.__class__, expected.__class__)

        # The first matching class wins, so the order of this table is
        # significant (presumably the more specific node classes may
        # derive from AtomNode -- not visible from here).
        leaf_kinds = ((PhraseNode, "PHRASE"),
                      (GlobNode, "GLOB"),
                      (AtomNode, "ATOM"))
        for leaf_class, label in leaf_kinds:
            if isinstance(got, leaf_class):
                self.assertEqual(got.nodeType(), label)
                self.assertEqual(got.getValue(), expected.getValue())
                return

        if isinstance(got, NotNode):
            self.assertEqual(got.nodeType(), "NOT")
            self.compareParseTrees(got.getValue(), expected.getValue())
            return

        if isinstance(got, AndNode):
            label = "AND"
        elif isinstance(got, OrNode):
            label = "OR"
        else:
            # Any other node class gets no checks beyond the two above.
            return
        self.assertEqual(got.nodeType(), label)
        got_children = got.getValue()
        expected_children = expected.getValue()
        self.assertEqual(len(got_children), len(expected_children))
        for got_child, expected_child in zip(got_children,
                                             expected_children):
            self.compareParseTrees(got_child, expected_child)

    def expect(self, input, output):
        """Assert that parsing *input* produces the tree *output*."""
        self.compareParseTrees(self.p.parseQuery(input), output)

    def failure(self, input):
        """Assert that parsing *input* raises ParseError."""
        self.assertRaises(ParseError, self.p.parseQuery, input)

    def testParseQuery(self):
        # (query text, expected parse tree) pairs, checked in order.
        cases = [
            ("foo", AtomNode("foo")),
            ("note", AtomNode("note")),
            ("a and b AND c",
             AndNode([AtomNode("a"), AtomNode("b"), AtomNode("c")])),
            ("a OR b or c",
             OrNode([AtomNode("a"), AtomNode("b"), AtomNode("c")])),
            ("a AND b OR c AnD d",
             OrNode([AndNode([AtomNode("a"), AtomNode("b")]),
                     AndNode([AtomNode("c"), AtomNode("d")])])),
            ("(a OR b) AND (c OR d)",
             AndNode([OrNode([AtomNode("a"), AtomNode("b")]),
                      OrNode([AtomNode("c"), AtomNode("d")])])),
            ("a AND not b",
             AndNode([AtomNode("a"), NotNode(AtomNode("b"))])),
            ('"foo bar"', PhraseNode("foo bar")),
            ("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")])),
            ('(("foo bar"))"', PhraseNode("foo bar")),
            ("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")])),
            ('and/', AtomNode("and")),
            ("foo-bar", PhraseNode("foo bar")),
            ("foo -bar",
             AndNode([AtomNode("foo"), NotNode(AtomNode("bar"))])),
            ("-foo bar",
             AndNode([AtomNode("bar"), NotNode(AtomNode("foo"))])),
            ("booh -foo-bar",
             AndNode([AtomNode("booh"), NotNode(PhraseNode("foo bar"))])),
            ('booh -"foo bar"',
             AndNode([AtomNode("booh"), NotNode(PhraseNode("foo bar"))])),
            ('foo"bar"', AndNode([AtomNode("foo"), AtomNode("bar")])),
            ('"foo"bar', AndNode([AtomNode("foo"), AtomNode("bar")])),
            ('foo"bar"blech',
             AndNode([AtomNode("foo"), AtomNode("bar"),
                      AtomNode("blech")])),
            ("foo*", GlobNode("foo*")),
            ("foo* bar", AndNode([GlobNode("foo*"), AtomNode("bar")])),
        ]
        for query, tree in cases:
            self.expect(query, tree)

    def testParseFailures(self):
        # Every one of these queries must be rejected with ParseError.
        bad_queries = [
            "",
            "not",
            "OR",
            "AND",
            "not foo",
            ")",
            "(",
            "foo OR",
            "foo AND",
            "OR foo",
            "and foo",
            "(foo) bar",
            "(foo OR)",
            "(foo AND)",
            "(NOT foo)",
            "-foo",
            "-foo -bar",
            '""',
        ]
        for query in bad_queries:
            self.failure(query)
def test_suite():
    """Return a suite holding every test method of TestQueryParser."""
    suite = makeSuite(TestQueryParser)
    return suite
# Script entry point: let unittest's ``main`` run the tests produced by
# the module-level ``test_suite`` callable.
if __name__ == "__main__":
    main(defaultTest='test_suite')
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.tests \
import testIndex, testQueryEngine, testQueryParser
from Products.ZCTextIndex.Index import scaled_int, SCALE_FACTOR
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
import unittest
class Indexable:
    """Minimal document object: it only carries a ``text`` attribute.

    The test fixtures configure the index with ``doc_attr = 'text'``,
    which matches the attribute stored here.
    """

    def __init__(self, text):
        # Keep the document body exactly as given.
        self.text = text
class LexiconHolder:
    """Container exposing a lexicon through its ``lexicon`` attribute.

    The test fixtures pass an instance as the third argument of
    ZCTextIndex together with ``lexicon_id = 'lexicon'``.
    """

    def __init__(self, lexicon):
        # Stored untouched; nothing is copied or validated.
        self.lexicon = lexicon
class Extra:
    """Empty attribute bag.

    The test fixtures set ``doc_attr`` and ``lexicon_id`` on an instance
    and hand it to ZCTextIndex as its second argument.
    """
# The test classes below create a ZCTextIndex(). Then they create
# instance variables that point to the internal components used by
# ZCTextIndex. These tests run the individual module unit tests with
# the fully integrated ZCTextIndex.
def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
    """Assert that two scaled integers are equal to within *epsilon*.

    scaled1, scaled2 -- the values to compare
    epsilon -- largest tolerated absolute difference; the default,
               scaled_int(0.01), is computed once when the function is
               defined

    Raises AssertionError with a "<scaled1> != <scaled2>" message when
    the values differ by more than epsilon.
    """
    if abs(scaled1 - scaled2) > epsilon:
        # Call form of raise: the "raise Class, message" spelling is
        # rejected by Python 3, while this one works on every version.
        raise AssertionError("%s != %s" % (scaled1, scaled2))
class IndexTests(testIndex.IndexTest):
    """Run the inherited index tests against a fully assembled ZCTextIndex.

    ``self.index`` and ``self.lexicon`` point at the components owned by
    the ZCTextIndex instance.  Stop-word and ranking tests are added on
    top of what testIndex.IndexTest provides.
    """

    def setUp(self):
        lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
        holder = LexiconHolder(lexicon)
        extra = Extra()
        extra.doc_attr = 'text'
        extra.lexicon_id = 'lexicon'
        self.zc_index = ZCTextIndex('name', extra, holder)
        self.index = self.zc_index.index
        self.lexicon = self.zc_index.lexicon

    def testStopWords(self):
        # the only non-stopword is question
        text = ("to be or not to be "
                "that is the question")
        self.zc_index.index_object(1, Indexable(text))
        for word in text.split():
            if word == "question":
                continue
            # Stop words must not have been given a word id.
            self.assertEqual(self.lexicon.termToWordIds(word), [])
        self.assertEqual(len(self.index._get_undoinfo(1)), 1)

    def testRanking(self):
        # A fairly involved test of the ranking calculations based on
        # an example set of documents and queries in Managing
        # Gigabytes, pp. 180-188.
        self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
                      "pease", "porridge", "pot"]
        self._ranking_index()
        self._ranking_tf()
        self._ranking_idf()
        self._ranking_queries()

    def _ranking_index(self):
        """Index the six example documents under docids 1 through 6."""
        docs = ["Pease porridge hot, pease porridge cold,",
                "Pease porridge in the pot,",
                "Nine days old.",
                "In the pot cold, in the pot hot,",
                "Pease porridge, pease porridge,",
                "Eat the lot."]
        docid = 0
        for text in docs:
            docid += 1
            self.zc_index.index_object(docid, Indexable(text))

    def _ranking_tf(self):
        """Check the per-document weight and every per-term weight."""
        # Matrix of expected term weights: one row per document (in
        # docid order), one column per entry of self.words.
        l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
                 (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
                 (0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
                 (1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
                 (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
                 (0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
        # Expected document weights, again in docid order.
        l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]
        docid = 0
        for doc_weight, row in zip(l_Wd, l_wdt):
            docid += 1
            eq(scaled_int(doc_weight), self.index._get_Wd(docid))
            expected_weights = [scaled_int(weight) for weight in row]
            for expected, word in zip(expected_weights, self.words):
                eq(expected, self.index._get_wdt(docid, word))

    def _ranking_idf(self):
        """Check document frequency and term weight of each word."""
        word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
        idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
        for word, freq, idf in zip(self.words, word_freqs, idfs):
            eq(freq, self.index._get_ft(word))
            eq(scaled_int(idf), self.index._get_wt(word))

    def _ranking_queries(self):
        """Check query weights and normalised scores for four queries."""
        queries = ["eat", "porridge", "hot OR porridge",
                   "eat OR nine OR day OR old OR porridge"]
        wqs = [1.95, 1.10, 1.77, 3.55]
        results = [[(6, 0.71)],
                   [(1, 0.61), (2, 0.58), (5, 0.71)],
                   [(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
                   [(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
        for raw, expected_wq, expected_hits in zip(queries, wqs, results):
            q = self.zc_index.parser.parseQuery(raw)
            wq = self.index.query_weight(q.terms())
            eq(wq, scaled_int(expected_wq))
            r = self.zc_index.query(raw)
            self.assertEqual(len(r), len(expected_hits))
            # convert the expected results to a dict for easy checking
            wanted = {}
            for doc, score in expected_hits:
                wanted[doc] = scaled_int(score)
            for doc, score in r:
                # NOTE(review): score / SCALE_FACTOR is evaluated before
                # float() is applied; if both operands are ints this
                # truncates under Python 2 division -- confirm that is
                # intended.
                score = scaled_int(float(score / SCALE_FACTOR) / wq)
                self.assert_(0 <= score <= SCALE_FACTOR)
                eq(wanted[doc], score)
class QueryTests(testQueryEngine.TestQueryEngine,
                 testQueryParser.TestQueryParser):
    """Run the query engine and query parser tests on a real ZCTextIndex.

    setUp swaps the fixtures the inherited tests rely on
    (``self.parser``, ``self.p`` and ``self.index``) for the components
    of a ZCTextIndex loaded with ``docs``.
    """

    # The FauxIndex in testQueryEngine contains four documents.
    # docid 1: foo, bar, ham
    # docid 2: bar, ham
    # docid 3: foo, ham
    # docid 4: ham
    docs = ["foo bar ham", "bar ham", "foo ham", "ham"]

    def setUp(self):
        extra = Extra()
        extra.doc_attr = 'text'
        extra.lexicon_id = 'lexicon'
        caller = LexiconHolder(Lexicon(Splitter(), CaseNormalizer(),
                                       StopWordRemover()))
        self.zc_index = ZCTextIndex('name', extra, caller)
        self.p = self.parser = self.zc_index.parser
        self.index = self.zc_index.index
        self.add_docs()

    def add_docs(self):
        """Index every entry of ``docs``; docids start at 1."""
        for i in range(len(self.docs)):
            text = self.docs[i]
            obj = Indexable(text)
            self.zc_index.index_object(i + 1, obj)

    def compareSet(self, set, dict):
        """Assert that *set* and *dict* contain the same docids.

        Scores are deliberately ignored (see the XXX note below).
        """
        # XXX The FauxIndex and the real Index score documents very
        # differently.  The set comparison can't actually compare the
        # items, but it can compare the keys.  That will have to do for now.
        got = [docid for docid, score in set.items()]
        want = list(dict.keys())
        # Sort both sides: the order in which a dictionary lists its keys
        # is an implementation detail, so comparing the raw key lists
        # could fail even though the docids are the same.
        got.sort()
        want.sort()
        self.assertEqual(got, want)
def test_suite():
    """Return one suite combining the IndexTests and QueryTests cases."""
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(IndexTests))
    suite.addTest(unittest.makeSuite(QueryTests))
    return suite
# Script entry point: let unittest run the tests produced by the
# module-level ``test_suite`` callable.
if __name__ == '__main__':
    unittest.main(defaultTest='test_suite')
#! /usr/bin/env python
"""Dump statistics about each word in the index.
usage: wordstats.py data.fs [index key]
"""
import ZODB
from ZODB.FileStorage import FileStorage
def main(fspath, key):
    """Print statistics about every word of a text index to stdout.

    fspath -- path of the FileStorage file; it is opened read-only
    key -- key under which the index is stored in the database root

    The output consists of the word and document counts, one
    "count word wid" line per lexicon entry, and one line per word id
    listing the wid followed by its (docid, score) pairs.
    """
    fs = FileStorage(fspath, read_only=1)
    db = ZODB.DB(fs)
    try:
        rt = db.open().root()
        index = rt[key]
        lex = index.lexicon
        idx = index.index
        # Each line is formatted into a single string first, so the
        # print call produces the same output under Python 2 and 3.
        print("Words %s" % (lex.length(),))
        print("Documents %s" % (idx.length(),))

        print("Word frequencies: count, word, wid")
        for word, wid in lex.items():
            docs = idx._wordinfo[wid]
            print("%s %s %s" % (len(docs), word, wid))

        print("Per-doc scores: wid, (doc, score,)+")
        for wid in lex.wids():
            fields = [str(wid)]
            docs = idx._wordinfo[wid]
            for docid, score in docs.items():
                fields.append(str(docid))
                fields.append(str(score))
            print(" ".join(fields))
    finally:
        # Release the database (and with it the storage) even when the
        # key is missing or printing fails part-way through.
        db.close()
# Script entry point: wordstats.py data.fs [index key]
if __name__ == "__main__":
    import sys
    args = sys.argv[1:]
    index_key = "index"  # default key when only the storage path is given
    if len(args) == 1:
        fspath = args[0]
    elif len(args) == 2:
        fspath, index_key = args
    else:
        print("Expected 1 or 2 args, got %s" % len(args))
        # Stop here: fspath was never assigned, so carrying on would
        # only fail with a NameError in the call below.
        sys.exit(2)
    main(fspath, index_key)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment