Kirill Smelkov / Zope · Commits

Commit 1c1a53d1, authored Jun 19, 2010 by Hanno Schlichting
Parent: 48f67574

Products.ZCTextIndex was moved to its own distribution

Showing 59 changed files with 4 additions and 7278 deletions (+4 / -7278)
buildout.cfg                                           +1    -0
setup.py                                               +2   -13
src/Products/ZCTextIndex/BaseIndex.py                  +0  -321
src/Products/ZCTextIndex/CosineIndex.py                +0  -138
src/Products/ZCTextIndex/HTMLSplitter.py               +0   -55
src/Products/ZCTextIndex/IIndex.py                     +0   -17
src/Products/ZCTextIndex/INBest.py                     +0   -15
src/Products/ZCTextIndex/IPipelineElement.py           +0   -15
src/Products/ZCTextIndex/IPipelineElementFactory.py    +0   -15
src/Products/ZCTextIndex/IQueryParseTree.py            +0   -15
src/Products/ZCTextIndex/IQueryParser.py               +0   -15
src/Products/ZCTextIndex/ISplitter.py                  +0   -15
src/Products/ZCTextIndex/Lexicon.py                    +0  -226
src/Products/ZCTextIndex/NBest.py                      +0   -77
src/Products/ZCTextIndex/OkapiIndex.py                 +0  -366
src/Products/ZCTextIndex/ParseTree.py                  +0  -132
src/Products/ZCTextIndex/PipelineFactory.py            +0   -52
src/Products/ZCTextIndex/QueryParser.py                +0  -255
src/Products/ZCTextIndex/README.txt                    +0  -123
src/Products/ZCTextIndex/RiceCode.py                   +0  -208
src/Products/ZCTextIndex/SETUP.cfg                     +0    -7
src/Products/ZCTextIndex/SetOps.py                     +0   -64
src/Products/ZCTextIndex/Setup                         +0    -3
src/Products/ZCTextIndex/StopDict.py                   +0   -36
src/Products/ZCTextIndex/WidCode.py                    +0  -131
src/Products/ZCTextIndex/ZCTextIndex.py                +0  -405
src/Products/ZCTextIndex/__init__.py                   +0   -62
src/Products/ZCTextIndex/dtml/addLexicon.dtml          +0   -77
src/Products/ZCTextIndex/dtml/addZCTextIndex.dtml      +0   -93
src/Products/ZCTextIndex/dtml/manageLexicon.dtml       +0   -24
src/Products/ZCTextIndex/dtml/manageZCTextIndex.dtml   +0   -26
src/Products/ZCTextIndex/dtml/queryLexicon.dtml        +0   -71
src/Products/ZCTextIndex/help/Lexicon_Add.stx          +0   -37
src/Products/ZCTextIndex/help/ZCTextIndex_Add.stx      +0   -39
src/Products/ZCTextIndex/interfaces.py                 +0  -329
src/Products/ZCTextIndex/okascore.c                    +0  -131
src/Products/ZCTextIndex/stopper.c                     +0   -78
src/Products/ZCTextIndex/tests/__init__.py             +0   -14
src/Products/ZCTextIndex/tests/hs-tool.py              +0  -129
src/Products/ZCTextIndex/tests/indexhtml.py            +0  -156
src/Products/ZCTextIndex/tests/mailtest.py             +0  -288
src/Products/ZCTextIndex/tests/mhindex.py              +0  -601
src/Products/ZCTextIndex/tests/python.txt              +0  -114
src/Products/ZCTextIndex/tests/queryhtml.py            +0  -117
src/Products/ZCTextIndex/tests/testHTMLSplitter.py     +0   -77
src/Products/ZCTextIndex/tests/testIndex.py            +0  -290
src/Products/ZCTextIndex/tests/testLexicon.py          +0  -231
src/Products/ZCTextIndex/tests/testNBest.py            +0   -89
src/Products/ZCTextIndex/tests/testParseTree.py        +0   -59
src/Products/ZCTextIndex/tests/testPipelineFactory.py  +0   -51
src/Products/ZCTextIndex/tests/testQueryEngine.py      +0   -72
src/Products/ZCTextIndex/tests/testQueryParser.py      +0  -359
src/Products/ZCTextIndex/tests/testSetOps.py           +0  -135
src/Products/ZCTextIndex/tests/testStopper.py          +0   -47
src/Products/ZCTextIndex/tests/testZCTextIndex.py      +0  -718
src/Products/ZCTextIndex/tests/wordstats.py            +0   -45
src/Products/ZCTextIndex/www/index.gif                 +0    -0
src/Products/ZCTextIndex/www/lexicon.gif               +0    -0
versions.cfg                                           +1    -0
buildout.cfg
...
@@ -44,6 +44,7 @@ eggs =
     Missing
     MultiMapping
     Persistence
+    Products.ZCTextIndex
     Record
     RestrictedPython
     initgroups
...
setup.py
...
@@ -13,7 +13,7 @@
 ##############################################################################
 import os
-from setuptools import setup, find_packages, Extension
+from setuptools import setup, find_packages
 setup(name='Zope2',
...
@@ -29,18 +29,6 @@ setup(name='Zope2',
       packages=find_packages('src'),
       namespace_packages=['Products'],
       package_dir={'': 'src'},
-      ext_modules=[
-          # indexes
-          Extension(name='Products.ZCTextIndex.stopper',
-                    sources=['src/Products/ZCTextIndex/stopper.c']),
-          Extension(name='Products.ZCTextIndex.okascore',
-                    sources=['src/Products/ZCTextIndex/okascore.c']),
-      ],
       install_requires=[
         'AccessControl',
         'Acquisition',
...
@@ -50,6 +38,7 @@ setup(name='Zope2',
         'Missing',
         'MultiMapping',
         'Persistence',
+        'Products.ZCTextIndex',
         'Record',
         'RestrictedPython',
         'ZConfig',
...
src/Products/ZCTextIndex/BaseIndex.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Abstract base class for full text index with relevance ranking."""

import math

from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree
from BTrees.IIBTree import IIBucket
from BTrees.IIBTree import IITreeSet
from BTrees.IIBTree import difference
from BTrees.IIBTree import intersection
from BTrees.Length import Length
from Persistence import Persistent
from zope.interface import implements

from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.interfaces import IIndex
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion

# Instead of storing floats, we generally store scaled ints.  Binary pickles
# can store those more efficiently.  The default SCALE_FACTOR of 1024
# is large enough to get about 3 decimal digits of fractional info, and
# small enough so that scaled values should almost always fit in a signed
# 16-bit int (we're generally storing logs, so a few bits before the radix
# point goes a long way; on the flip side, for reasonably small numbers x
# most of the info in log(x) is in the fractional bits, so we do want to
# save a lot of those).
SCALE_FACTOR = 1024.0

def scaled_int(f, scale=SCALE_FACTOR):
    # We expect only positive inputs, so "add a half and chop" is the
    # same as round().  Surprisingly, calling round() is significantly more
    # expensive.
    return int(f * scale + 0.5)

def unique(L):
    """Return a list of the unique elements in L."""
    return IITreeSet(L).keys()

class BaseIndex(Persistent):

    implements(IIndex)

    def __init__(self, lexicon):
        self._lexicon = lexicon

        # wid -> {docid -> weight}; t -> D -> w(D, t)
        # Different indexers have different notions of term weight, but we
        # expect each indexer to use ._wordinfo to map wids to its notion
        # of a docid-to-weight map.
        # There are two kinds of OOV words:  wid 0 is explicitly OOV,
        # and it's possible that the lexicon will return a non-zero wid
        # for a word we don't currently know about.  For example, if we
        # unindex the last doc containing a particular word, that wid
        # remains in the lexicon, but is no longer in our _wordinfo map;
        # lexicons can also be shared across indices, and some other index
        # may introduce a lexicon word we've never seen.
        # A word is in-vocabulary for this index if and only if
        # _wordinfo.has_key(wid).  Note that wid 0 must not be a key.
        self._wordinfo = IOBTree()

        # docid -> weight
        # Different indexers have different notions of doc weight, but we
        # expect each indexer to use ._docweight to map docids to its
        # notion of what a doc weight is.
        self._docweight = IIBTree()

        # docid -> WidCode'd list of wids
        # Used for un-indexing, and for phrase search.
        self._docwords = IOBTree()

        # Use a BTree length for efficient length computation w/o conflicts
        self.length = Length()
        self.document_count = Length()

    def length(self):
        """Return the number of words in the index."""
        # This is overridden per instance
        return len(self._wordinfo)

    def document_count(self):
        """Return the number of documents in the index"""
        # This is overridden per instance
        return len(self._docweight)

    def get_words(self, docid):
        """Return a list of the wordids for a given docid."""
        # Note this is overridden in the instance
        return WidCode.decode(self._docwords[docid])

    # A subclass may wish to extend or override this.
    def index_doc(self, docid, text):
        if self._docwords.has_key(docid):
            return self._reindex_doc(docid, text)
        wids = self._lexicon.sourceToWordIds(text)
        wid2weight, docweight = self._get_frequencies(wids)
        self._mass_add_wordinfo(wid2weight, docid)
        self._docweight[docid] = docweight
        self._docwords[docid] = WidCode.encode(wids)
        try:
            self.document_count.change(1)
        except AttributeError:
            # Upgrade document_count to Length object
            self.document_count = Length(self.document_count())
        return len(wids)

    # A subclass may wish to extend or override this.  This is for adjusting
    # to a new version of a doc that already exists.  The goal is to be
    # faster than simply unindexing the old version in its entirety and then
    # adding the new version in its entirety.
    def _reindex_doc(self, docid, text):
        # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
        old_wids = self.get_words(docid)
        old_wid2w, old_docw = self._get_frequencies(old_wids)

        new_wids = self._lexicon.sourceToWordIds(text)
        new_wid2w, new_docw = self._get_frequencies(new_wids)

        old_widset = IITreeSet(old_wid2w.keys())
        new_widset = IITreeSet(new_wid2w.keys())

        in_both_widset = intersection(old_widset, new_widset)
        only_old_widset = difference(old_widset, in_both_widset)
        only_new_widset = difference(new_widset, in_both_widset)
        del old_widset, new_widset

        for wid in only_old_widset.keys():
            self._del_wordinfo(wid, docid)

        for wid in only_new_widset.keys():
            self._add_wordinfo(wid, new_wid2w[wid], docid)

        for wid in in_both_widset.keys():
            # For the Okapi indexer, the "if" will trigger only for words
            # whose counts have changed.  For the cosine indexer, the "if"
            # may trigger for every wid, since W(d) probably changed and
            # W(d) is divided into every score.
            newscore = new_wid2w[wid]
            if old_wid2w[wid] != newscore:
                self._add_wordinfo(wid, newscore, docid)

        self._docweight[docid] = new_docw
        self._docwords[docid] = WidCode.encode(new_wids)
        return len(new_wids)

    # Subclass must override.
    def _get_frequencies(self, wids):
        # Compute term frequencies and a doc weight, whatever those mean
        # to an indexer.
        # Return pair:
        #    {wid0: w(d, wid0), wid1: w(d, wid1), ...],
        #    docweight
        # The wid->weight mappings are fed into _add_wordinfo, and docweight
        # becomes the value of _docweight[docid].
        raise NotImplementedError

    def has_doc(self, docid):
        return self._docwords.has_key(docid)

    # A subclass may wish to extend or override this.
    def unindex_doc(self, docid):
        for wid in unique(self.get_words(docid)):
            self._del_wordinfo(wid, docid)
        del self._docwords[docid]
        del self._docweight[docid]
        try:
            self.document_count.change(-1)
        except AttributeError:
            # Upgrade document_count to Length object
            self.document_count = Length(self.document_count())

    def search(self, term):
        wids = self._lexicon.termToWordIds(term)
        if not wids:
            return None  # All docs match
        wids = self._remove_oov_wids(wids)
        return mass_weightedUnion(self._search_wids(wids))

    def search_glob(self, pattern):
        wids = self._lexicon.globToWordIds(pattern)
        wids = self._remove_oov_wids(wids)
        return mass_weightedUnion(self._search_wids(wids))

    def search_phrase(self, phrase):
        wids = self._lexicon.termToWordIds(phrase)
        cleaned_wids = self._remove_oov_wids(wids)
        if len(wids) != len(cleaned_wids):
            # At least one wid was OOV:  can't possibly find it.
            return IIBTree()
        scores = self._search_wids(wids)
        hits = mass_weightedIntersection(scores)
        if not hits:
            return hits
        code = WidCode.encode(wids)
        result = IIBTree()
        for docid, weight in hits.items():
            docwords = self._docwords[docid]
            if docwords.find(code) >= 0:
                result[docid] = weight
        return result

    def _remove_oov_wids(self, wids):
        return filter(self._wordinfo.has_key, wids)

    # Subclass must override.
    # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
    # for each wid t in wids.  The IIBucket, times the weight, maps D to
    # TF(D,t) * IDF(t) for every docid D containing t.  wids must not
    # contain any OOV words.
    def _search_wids(self, wids):
        raise NotImplementedError

    # Subclass must override.
    # It's not clear what it should do.  It must return an upper bound on
    # document scores for the query.  It would be nice if a document score
    # divided by the query's query_weight gave the probability that a
    # document was relevant, but nobody knows how to do that.  For
    # CosineIndex, the ratio is the cosine of the angle between the document
    # and query vectors.  For OkapiIndex, the ratio is a (probably
    # unachievable) upper bound with no "intuitive meaning" beyond that.
    def query_weight(self, terms):
        raise NotImplementedError

    DICT_CUTOFF = 10

    def _add_wordinfo(self, wid, f, docid):
        # Store a wordinfo in a dict as long as there are less than
        # DICT_CUTOFF docids in the dict.  Otherwise use an IIBTree.

        # The pickle of a dict is smaller than the pickle of an
        # IIBTree, substantially so for small mappings.  Thus, we use
        # a dictionary until the mapping reaches DICT_CUTOFF elements.

        # The cutoff is chosen based on the implementation
        # characteristics of Python dictionaries.  The dict hashtable
        # always has 2**N slots and is resized whenever it is 2/3s
        # full.  A pickled dict with 10 elts is half the size of an
        # IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4.  So
        # choose 10 as the cutoff for now.

        # The IIBTree has a smaller in-memory representation than a
        # dictionary, so pickle size isn't the only consideration when
        # choosing the threshold.  The pickle of a 500-elt dict is 92%
        # of the size of the same IIBTree, but the dict uses more
        # space when it is live in memory.  An IIBTree stores two C
        # arrays of ints, one for the keys and one for the values.  It
        # holds up to 120 key-value pairs in a single bucket.
        doc2score = self._wordinfo.get(wid)
        if doc2score is None:
            doc2score = {}
            self.length.change(1)
        else:
            # _add_wordinfo() is called for each update.  If the map
            # size exceeds the DICT_CUTOFF, convert to an IIBTree.
            # Obscure:  First check the type.  If it's not a dict, it
            # can't need conversion, and then we can avoid an expensive
            # len(IIBTree).
            if (isinstance(doc2score, type({}))
                and len(doc2score) == self.DICT_CUTOFF):
                doc2score = IIBTree(doc2score)
        doc2score[docid] = f
        self._wordinfo[wid] = doc2score  # not redundant:  Persistency!

    #    self._mass_add_wordinfo(wid2weight, docid)
    #
    # is the same as
    #
    #    for wid, weight in wid2weight.items():
    #        self._add_wordinfo(wid, weight, docid)
    #
    # except that _mass_add_wordinfo doesn't require so many function calls.
    def _mass_add_wordinfo(self, wid2weight, docid):
        dicttype = type({})
        get_doc2score = self._wordinfo.get
        new_word_count = 0
        for wid, weight in wid2weight.items():
            doc2score = get_doc2score(wid)
            if doc2score is None:
                doc2score = {}
                new_word_count += 1
            elif (isinstance(doc2score, dicttype)
                  and len(doc2score) == self.DICT_CUTOFF):
                doc2score = IIBTree(doc2score)
            doc2score[docid] = weight
            self._wordinfo[wid] = doc2score  # not redundant:  Persistency!
        self.length.change(new_word_count)

    def _del_wordinfo(self, wid, docid):
        doc2score = self._wordinfo[wid]
        del doc2score[docid]
        if doc2score:
            self._wordinfo[wid] = doc2score  # not redundant:  Persistency!
        else:
            del self._wordinfo[wid]
            self.length.change(-1)

def inverse_doc_frequency(term_count, num_items):
    """Return the inverse doc frequency for a term,

    that appears in term_count items in a collection with num_items
    total items.
    """
    # implements IDF(q, t) = log(1 + N/f(t))
    return math.log(1.0 + float(num_items) / term_count)
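Aside (illustrative, not part of the commit): the scaled-int convention and the IDF formula above can be checked by hand with a minimal sketch that repeats only the two module-level helpers just shown.

    import math

    SCALE_FACTOR = 1024.0

    def scaled_int(f, scale=SCALE_FACTOR):
        # positive inputs only: "add a half and chop" == round()
        return int(f * scale + 0.5)

    def inverse_doc_frequency(term_count, num_items):
        # IDF(q, t) = log(1 + N/f(t))
        return math.log(1.0 + float(num_items) / term_count)

    # a term appearing in 10 of 1000 documents:
    idf = inverse_doc_frequency(10, 1000)  # log(101) ~ 4.615
    print(scaled_int(idf))                 # 4726 == int(4.615 * 1024 + 0.5)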
src/Products/ZCTextIndex/CosineIndex.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using a cosine measure."""

import math

from BTrees.IIBTree import IIBucket
from zope.interface import implements

from Products.ZCTextIndex.interfaces import IIndex
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.BaseIndex import inverse_doc_frequency
from Products.ZCTextIndex.BaseIndex import scaled_int
from Products.ZCTextIndex.BaseIndex import SCALE_FACTOR

class CosineIndex(BaseIndex):

    implements(IIndex)

    def __init__(self, lexicon):
        BaseIndex.__init__(self, lexicon)

        # ._wordinfo for cosine is wid -> {docid -> weight};
        # t -> D -> w(d, t)/W(d)

        # ._docweight for cosine is
        # docid -> W(docid)

    # Most of the computation for computing a relevance score for the
    # document occurs in the _search_wids() method.  The code currently
    # implements the cosine similarity function described in Managing
    # Gigabytes, eq. 4.3, p. 187.  The index_object() method
    # precomputes some values that are independent of the particular
    # query.
    #
    # The equation is
    #
    #                 sum(for t in I(d,q): w(d,t) * w(q,t))
    # cosine(d, q) =  -------------------------------------
    #                             W(d) * W(q)
    #
    # where
    #    I(d, q) = the intersection of the terms in d and q.
    #
    #    w(d, t) = 1 + log f(d, t)
    #        computed by doc_term_weight(); for a given word t,
    #        self._wordinfo[t] is a map from d to w(d, t).
    #
    #    w(q, t) = log(1 + N/f(t))
    #        computed by inverse_doc_frequency()
    #
    #    W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
    #        computed by _get_frequencies(), and remembered in
    #        self._docweight[d]
    #
    #    W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
    #        computed by self.query_weight()

    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())
        L = []
        DictType = type({})
        for wid in wids:
            assert self._wordinfo.has_key(wid)  # caller responsible for OOV
            d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
            idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
            #print "idf = %.3f" % idf
            if isinstance(d2w, DictType):
                d2w = IIBucket(d2w)
            L.append((d2w, scaled_int(idf)))
        return L

    def query_weight(self, terms):
        wids = []
        for term in terms:
            wids += self._lexicon.termToWordIds(term)
        N = float(self.document_count())
        sum = 0.0
        for wid in self._remove_oov_wids(wids):
            wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
            sum += wt ** 2.0
        return scaled_int(math.sqrt(sum))

    def _get_frequencies(self, wids):
        d = {}
        dget = d.get
        for wid in wids:
            d[wid] = dget(wid, 0) + 1
        Wsquares = 0.0
        for wid, count in d.items():
            w = doc_term_weight(count)
            Wsquares += w * w
            d[wid] = w
        W = math.sqrt(Wsquares)
        #print "W = %.3f" % W
        for wid, weight in d.items():
            #print i, ":", "%.3f" % weight,
            d[wid] = scaled_int(weight / W)
            #print "->", d[wid]
        return d, scaled_int(W)

    # The rest are helper methods to support unit tests

    def _get_wdt(self, d, t):
        wid, = self._lexicon.termToWordIds(t)
        map = self._wordinfo[wid]
        return map.get(d, 0) * self._docweight[d] / SCALE_FACTOR

    def _get_Wd(self, d):
        return self._docweight[d]

    def _get_ft(self, t):
        wid, = self._lexicon.termToWordIds(t)
        return len(self._wordinfo[wid])

    def _get_wt(self, t):
        wid, = self._lexicon.termToWordIds(t)
        map = self._wordinfo[wid]
        return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))

def doc_term_weight(count):
    """Return the doc-term weight for a term that appears count times."""
    # implements w(d, t) = 1 + log f(d, t)
    return 1.0 + math.log(count)
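Worked example (not from the commit): tracing _get_frequencies() by hand for a toy document in which a hypothetical wid 1 occurs three times and wid 2 once.

    import math

    w1 = 1.0 + math.log(3)        # doc_term_weight(3) ~ 2.099
    w2 = 1.0 + math.log(1)        # doc_term_weight(1) == 1.0
    W = math.sqrt(w1*w1 + w2*w2)  # W(d) ~ 2.325
    # stored per-wid weights are scaled_int(w/W):
    #   wid 1 -> int(w1/W * 1024 + 0.5) == 924
    #   wid 2 -> int(w2/W * 1024 + 0.5) == 440
    # and the returned docweight is scaled_int(W) == 2380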
src/Products/ZCTextIndex/HTMLSplitter.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
import re

from zope.interface import implements

from Products.ZCTextIndex.interfaces import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

class HTMLWordSplitter:

    implements(ISplitter)

    def process(self, text, wordpat=r"(?L)\w+"):
        splat = []
        for t in text:
            splat += self._split(t, wordpat)
        return splat

    def processGlob(self, text):
        # see Lexicon.globToWordIds()
        return self.process(text, r"(?L)\w+[\w*?]*")

    def _split(self, text, wordpat):
        text = text.lower()
        remove = [r"<[^<>]*>",
                  r"&[A-Za-z]+;"]
        for pat in remove:
            text = re.sub(pat, " ", text)
        return re.findall(wordpat, text)

element_factory.registerFactory('Word Splitter',
                                'HTML aware splitter',
                                HTMLWordSplitter)

if __name__ == "__main__":
    import sys
    splitter = HTMLWordSplitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        buf = f.read()
        f.close()
        print path
        print splitter.process([buf])
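Illustration (not part of the commit): the effect of _split() can be reproduced standalone; the regular expressions below are copied from the class above (Python 2, like the rest of this code).

    import re

    def split_html(text, wordpat=r"(?L)\w+"):
        # mirror of HTMLWordSplitter._split(): lowercase, blank out
        # tags and character entities, then pick off the words
        text = text.lower()
        for pat in (r"<[^<>]*>", r"&[A-Za-z]+;"):
            text = re.sub(pat, " ", text)
        return re.findall(wordpat, text)

    print(split_html("<p>Hello&nbsp;World</p>"))  # ['hello', 'world']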
src/Products/ZCTextIndex/IIndex.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index Interface."""

from Products.ZCTextIndex.interfaces import IIndex # BBB
src/Products/ZCTextIndex/INBest.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import INBest # BBB
src/Products/ZCTextIndex/IPipelineElement.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElement # BBB
src/Products/ZCTextIndex/IPipelineElementFactory.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElementFactory # BBB
src/Products/ZCTextIndex/IQueryParseTree.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import IPipelineElementFactory # BBB
src/Products/ZCTextIndex/IQueryParser.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.interfaces import IQueryParser # BBB
src/Products/ZCTextIndex/ISplitter.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.interfaces import ISplitter # BBB
src/Products/ZCTextIndex/Lexicon.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon.

$Id$
"""
import re

from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from BTrees.Length import Length
from Persistence import Persistent
from zope.interface import implements

from Products.ZCTextIndex.interfaces import ILexicon
from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import QueryError
from Products.ZCTextIndex.PipelineFactory import element_factory

class Lexicon(Persistent):

    implements(ILexicon)

    def __init__(self, *pipeline):
        self._wids = OIBTree()   # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self.length = Length()
        self._pipeline = pipeline

    def length(self):
        """Return the number of unique terms in the lexicon."""
        # Overridden in instances
        return len(self._wids)

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not hasattr(self.length, 'change'):
            # Make sure length is overridden with a BTrees.Length.Length
            self.length = Length(self.length())
        # Strategically unload the length value so that we get the most
        # recent value written to the database to minimize conflicting wids
        # Because length is independent, this will load the most
        # recent value stored, regardless of whether MVCC is enabled
        self.length._p_deactivate()
        return map(self._getWordIdCreate, last)

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "process_post_glob", element.process)
            last = process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too efficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        self.length.change(1)
        while self._words.has_key(self.length()):
            # just to be safe
            self.length.change(1)
        return self.length()

def _text2list(text):
    # Helper: splitter input may be a string or a list of strings
    try:
        text + ""
    except:
        return text
    else:
        return [text]

# Sample pipeline elements

class Splitter:

    import re
    rx = re.compile(r"(?L)\w+")
    rxGlob = re.compile(r"(?L)\w+[\w*?]*")  # See globToWordIds() above

    def process(self, lst):
        result = []
        for s in lst:
            result += self.rx.findall(s)
        return result

    def processGlob(self, lst):
        result = []
        for s in lst:
            result += self.rxGlob.findall(s)
        return result

element_factory.registerFactory('Word Splitter',
                                'Whitespace splitter',
                                Splitter)

class CaseNormalizer:

    def process(self, lst):
        return [w.lower() for w in lst]

element_factory.registerFactory('Case Normalizer',
                                'Case Normalizer',
                                CaseNormalizer)

element_factory.registerFactory('Stop Words',
                                ' Don\'t remove stop words',
                                None)

class StopWordRemover:

    dict = get_stopdict().copy()

    try:
        from Products.ZCTextIndex.stopper import process as _process
    except ImportError:
        def process(self, lst):
            has_key = self.dict.has_key
            return [w for w in lst if not has_key(w)]
    else:
        def process(self, lst):
            return self._process(self.dict, lst)

element_factory.registerFactory('Stop Words',
                                'Remove listed stop words only',
                                StopWordRemover)

class StopWordAndSingleCharRemover(StopWordRemover):

    dict = get_stopdict().copy()
    for c in range(255):
        dict[chr(c)] = None

element_factory.registerFactory('Stop Words',
                                'Remove listed and single char words',
                                StopWordAndSingleCharRemover)
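Aside (illustrative, not from the commit): globToWordIds() compiles the glob into an anchored regex and then scans lexicon words sharing the literal prefix. The translation loop can be mirrored standalone.

    import re

    def glob_to_regex(pattern):
        # mirror of the translation loop in globToWordIds()
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        return pat + "$"

    print(glob_to_regex("foo*baz?"))  # foo.*baz.$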
src/Products/ZCTextIndex/NBest.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""NBest

An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method.  If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""

from bisect import bisect

from zope.interface import implements

from Products.ZCTextIndex.interfaces import INBest

class NBest:

    implements(INBest)

    def __init__(self, N):
        "Build an NBest object to remember the N best-scoring objects."
        if N < 1:
            raise ValueError("NBest() argument must be at least 1")
        self._capacity = N

        # This does a very simple thing with sorted lists.  For large
        # N, a min-heap can be unboundedly better in terms of data
        # movement time.
        self._scores = []
        self._items = []

    def __len__(self):
        return len(self._scores)

    def capacity(self):
        return self._capacity

    def add(self, item, score):
        self.addmany([(item, score)])

    def addmany(self, sequence):
        scores, items, capacity = self._scores, self._items, self._capacity
        n = len(scores)
        for item, score in sequence:
            # When we're in steady-state, the usual case is that we're filled
            # to capacity, and that an incoming item is worse than any of
            # the best-seen so far.
            if n >= capacity and score <= scores[0]:
                continue
            i = bisect(scores, score)
            scores.insert(i, score)
            items.insert(i, item)
            if n == capacity:
                del items[0], scores[0]
            else:
                n += 1
        assert n == len(scores)

    def getbest(self):
        result = zip(self._items, self._scores)
        result.reverse()
        return result

    def pop_smallest(self):
        if self._scores:
            return self._items.pop(0), self._scores.pop(0)
        raise IndexError("pop_smallest() called on empty NBest object")
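Usage sketch (not part of the commit), exercising only the API shown above as it stood before this commit:

    from Products.ZCTextIndex.NBest import NBest

    nb = NBest(2)        # remember the 2 best-scoring items
    nb.add('a', 10)
    nb.add('b', 30)
    nb.add('c', 20)      # evicts 'a', the current worst
    print(nb.getbest())  # [('b', 30), ('c', 20)] -- best first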
src/Products/ZCTextIndex/OkapiIndex.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using an Okapi BM25 rank."""

# Lots of comments are at the bottom of this file.  Read them to
# understand what's going on.

from BTrees.IIBTree import IIBucket
from BTrees.Length import Length
from zope.interface import implements

from Products.ZCTextIndex.interfaces import IIndex
from Products.ZCTextIndex.BaseIndex import BaseIndex
from Products.ZCTextIndex.BaseIndex import inverse_doc_frequency
from Products.ZCTextIndex.BaseIndex import scaled_int
from Products.ZCTextIndex.okascore import score

class OkapiIndex(BaseIndex):

    implements(IIndex)

    # BM25 free parameters.
    K1 = 1.2
    B = 0.75
    assert K1 >= 0.0
    assert 0.0 <= B <= 1.0

    def __init__(self, lexicon):
        BaseIndex.__init__(self, lexicon)

        # ._wordinfo for Okapi is
        # wid -> {docid -> frequency}; t -> D -> f(D, t)

        # ._docweight for Okapi is
        # docid -> # of words in the doc
        # This is just len(self._docwords[docid]), but _docwords is stored
        # in compressed form, so uncompressing it just to count the list
        # length would be ridiculously expensive.

        # sum(self._docweight.values()), the total # of words in all docs
        # This is a long for "better safe than sorry" reasons.  It isn't
        # used often enough that speed should matter.
        # Use a BTree.Length.Length object to avoid concurrent write conflicts
        self._totaldoclen = Length(0L)

    def index_doc(self, docid, text):
        count = BaseIndex.index_doc(self, docid, text)
        self._change_doc_len(count)
        return count

    def _reindex_doc(self, docid, text):
        self._change_doc_len(-self._docweight[docid])
        return BaseIndex._reindex_doc(self, docid, text)

    def unindex_doc(self, docid):
        self._change_doc_len(-self._docweight[docid])
        BaseIndex.unindex_doc(self, docid)

    def _change_doc_len(self, delta):
        # Change total doc length used for scoring
        try:
            self._totaldoclen.change(delta)
        except AttributeError:
            # Opportunistically upgrade _totaldoclen attribute to Length object
            self._totaldoclen = Length(long(self._totaldoclen + delta))

    # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
    # for each wid t in wids.  The IIBucket, times the weight, maps D to
    # TF(D,t) * IDF(t) for every docid D containing t.
    # As currently written, the weights are always 1, and the IIBucket maps
    # D to TF(D,t)*IDF(t) directly, where the product is computed as a float
    # but stored as a scaled_int.
    # NOTE:  This is overridden below, by a function that computes the
    # same thing but with the inner scoring loop in C.
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = scaled_int(tf * idf)
            L.append((result, 1))
        return L

    # Note about the above:  the result is tf * idf.  tf is small -- it
    # can't be larger than k1+1 = 2.2.  idf is formally unbounded, but
    # is less than 14 for a term that appears in only 1 of a million
    # documents.  So the product is probably less than 32, or 5 bits
    # before the radix point.  If we did the scaled-int business on
    # both of them, we'd be up to 25 bits.  Add 64 of those and we'd
    # be in overflow territory.  That's pretty unlikely, so we *could*
    # just store scaled_int(tf) in result[docid], and use scaled_int(idf)
    # as an invariant weight across the whole result.  But besides
    # skating near the edge, it's not a speed cure, since the computation
    # of tf would still be done at Python speed, and it's a lot more
    # work than just multiplying by idf.

    # The same function as _search_wids above, but with the inner scoring
    # loop written in C (module okascore, function score()).
    # Cautions:  okascore hardcodes the values of K, B1, and the scaled_int
    # function.
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        #K1 = self.K1
        #B = self.B
        #K1_plus1 = K1 + 1.0
        #B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            score(result, d2f.items(), docid2len, idf, meandoclen)
            L.append((result, 1))
        return L

    def query_weight(self, terms):
        # Get the wids.
        wids = []
        for term in terms:
            termwids = self._lexicon.termToWordIds(term)
            wids.extend(termwids)
        # The max score for term t is the maximum value of
        #     TF(D, t) * IDF(Q, t)
        # We can compute IDF directly, and as noted in the comments below
        # TF(D, t) is bounded above by 1+K1.
        N = float(len(self._docweight))
        tfmax = 1.0 + self.K1
        sum = 0
        for t in self._remove_oov_wids(wids):
            idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
            sum += scaled_int(idf * tfmax)
        return sum

    def _get_frequencies(self, wids):
        d = {}
        dget = d.get
        for wid in wids:
            d[wid] = dget(wid, 0) + 1
        return d, len(wids)

"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
It's based on probability arguments about how words are distributed in
documents, not on an abstract vector space model.  A long paper by its
principal inventors gives an excellent overview of how it was derived:

    A probabilistic model of information retrieval:  development and status
    K. Sparck Jones, S. Walker, S.E. Robertson
    http://citeseer.nj.nec.com/jones98probabilistic.html

Spellings that ignore relevance information (which we don't have) are of this
high-level form:

    score(D, Q) = sum(for t in D&Q: TF(D, t) * IDF(Q, t))

where

    D         a specific document
    Q         a specific query
    t         a term (word, atomic phrase, whatever)
    D&Q       the terms common to D and Q
    TF(D, t)  a measure of t's importance in D -- a kind of term frequency
              weight
    IDF(Q, t) a measure of t's importance in the query and in the set of
              documents as a whole -- a kind of inverse document frequency
              weight

The IDF(Q, t) here is identical to the one used for our cosine measure.
Since queries are expected to be short, it ignores Q entirely:

    IDF(Q, t) = log(1.0 + N / f(t))

where

    N     the total number of documents
    f(t)  the number of documents in which t appears

Most Okapi literature seems to use log(N/f(t)) instead.  We don't, because
that becomes 0 for a term that's in every document, and, e.g., if someone
is searching for "documentation" on python.org (a term that may well show
up on every page, due to the top navigation bar), we still want to find the
pages that use the word a lot (which is TF's job to find, not IDF's -- we
just want to stop IDF from considering this t to be irrelevant).

The TF(D, t) spellings are more interesting.  With lots of variations, the
most basic spelling is of the form

                   f(D, t)
    TF(D, t) = ---------------
                f(D, t) + K(D)

where

    f(D, t)  the number of times t appears in D
    K(D)     a measure of the length of D, normalized to mean doc length

The functional *form* f/(f+K) is clever.  It's a gross approximation to a
mixture of two distinct Poisson distributions, based on the idea that t
probably appears in D for one of two reasons:

1. More or less at random.

2. Because it's important to D's purpose in life ("eliteness" in papers).

Note that f/(f+K) is always between 0 and 1.  If f is very large compared to
K, it approaches 1.  If K is very large compared to f, it approaches 0.  If
t appears in D more or less "for random reasons", f is likely to be small,
and so K will dominate unless it's a very small doc, and the ratio will be
small.  OTOH, if t appears a lot in D, f will dominate unless it's a very
large doc, and the ratio will be close to 1.

We use a variation on that simple theme, a simplification of what's called
BM25 in the literature (it was the 25th stab at a Best Match function from
the Okapi group; "a simplification" means we're setting some of BM25's more
esoteric free parameters to 0):

                f(D, t) * (k1 + 1)
    TF(D, t) = --------------------
                f(D, t) + k1 * K(D)

where

    k1  a "tuning factor", typically between 1.0 and 2.0.  We use 1.2,
        the usual default value.  This constant adjusts the curve to
        look more like a theoretical 2-Poisson curve.

Note that as f(D, t) increases, TF(D, t) increases monotonically, approaching
an asymptote of k1+1 from below.

Finally, we use

    K(D) = (1-b) + b * len(D)/E(len(D))

where

    b          is another free parameter, discussed below.  We use 0.75.
    len(D)     the length of D in words
    E(len(D))  the expected value of len(D) across the whole document set;
               or, IOW, the average document length

b is a free parameter between 0.0 and 1.0, and adjusts for the expected effect
of the "Verbosity Hypothesis".  Suppose b is 1, and some word t appears
10 times as often in document d2 as in document d1.  If document d2 is
also 10 times as long as d1, TF(d1, t) and TF(d2, t) are identical:

                     f(d2, t) * (k1 + 1)
   TF(d2, t) = --------------------------------- =
                f(d2, t) + k1 * len(d2)/E(len(D))

                      10 * f(d1, t) * (k1 + 1)
   ----------------------------------------------- = TF(d1, t)
    10 * f(d1, t) + k1 * (10 * len(d1))/E(len(D))

because the 10's cancel out.  This is appropriate if we believe that a word
appearing 10x more often in a doc 10x as long is simply due to that the
longer doc is more verbose.  If we do believe that, the longer doc and the
shorter doc are probably equally relevant.  OTOH, it *could* be that the
longer doc is talking about t in greater depth too, in which case it's
probably more relevant than the shorter doc.

At the other extreme, if we set b to 0, the len(D)/E(len(D)) term vanishes
completely, and a doc scores higher for having more occurrences of a word
regardless of the doc's length.

Reality is between these extremes, and probably varies by document and word
too.  Reports in the literature suggest that b=0.75 is a good compromise "in
general", favoring the "verbosity hypothesis" end of the scale.

Putting it all together, the final TF function is

                           f(D, t) * (k1 + 1)
    TF(D, t) = --------------------------------------------
                f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

with k1=1.2 and b=0.75.


Query Term Weighting
--------------------

I'm ignoring the query adjustment part of Okapi BM25 because I expect our
queries are very short.  Full BM25 takes them into account by adding the
following to every score(D, Q); it depends on the lengths of D and Q, but
not on the specific words in Q, or even on whether they appear in D(!):

                   E(len(D)) - len(D)
    k2 * len(Q) * --------------------
                   E(len(D)) + len(D)

Here k2 is another "tuning constant", len(Q) is the number of words in Q, and
len(D) & E(len(D)) were defined above.  The Okapi group set k2 to 0 in TREC-9,
so it apparently doesn't do much good (or may even hurt).

Full BM25 *also* multiplies the following factor into IDF(Q, t):

    f(Q, t) * (k3 + 1)
    ------------------
       f(Q, t) + k3

where k3 is yet another free parameter, and f(Q,t) is the number of times t
appears in Q.  Since we're using short "web style" queries, I expect f(Q,t)
to always be 1, and then that quotient is

     1 * (k3 + 1)
     ------------ = 1
        1 + k3

regardless of k3's value.  So, in a trivial sense, we are incorporating
this measure (and optimizing it by not bothering to multiply by 1 <wink>).
"""
src/Products/ZCTextIndex/ParseTree.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Generic parser support: exception and parse tree nodes."""

from BTrees.IIBTree import difference
from zope.interface import implements

from Products.ZCTextIndex.interfaces import IQueryParseTree
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion

class QueryError(Exception):
    pass

class ParseError(Exception):
    pass

class ParseTreeNode:

    implements(IQueryParseTree)

    _nodeType = None

    def __init__(self, value):
        self._value = value

    def nodeType(self):
        return self._nodeType

    def getValue(self):
        return self._value

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.getValue())

    def terms(self):
        t = []
        for v in self.getValue():
            t.extend(v.terms())
        return t

    def executeQuery(self, index):
        raise NotImplementedError

class NotNode(ParseTreeNode):

    _nodeType = "NOT"

    def terms(self):
        return []

    def executeQuery(self, index):
        raise QueryError, "NOT parse tree node cannot be executed directly"

class AndNode(ParseTreeNode):

    _nodeType = "AND"

    def executeQuery(self, index):
        L = []
        Nots = []
        for subnode in self.getValue():
            if subnode.nodeType() == "NOT":
                r = subnode.getValue().executeQuery(index)
                # If None, technically it matches every doc, but we treat
                # it as if it matched none (we want
                #     real_word AND NOT stop_word
                # to act like plain real_word).
                if r is not None:
                    Nots.append((r, 1))
            else:
                r = subnode.executeQuery(index)
                # If None, technically it matches every doc, so needn't be
                # included.
                if r is not None:
                    L.append((r, 1))
        set = mass_weightedIntersection(L)
        if Nots:
            notset = mass_weightedUnion(Nots)
            set = difference(set, notset)
        return set

class OrNode(ParseTreeNode):

    _nodeType = "OR"

    def executeQuery(self, index):
        weighted = []
        for node in self.getValue():
            r = node.executeQuery(index)
            # If None, technically it matches every doc, but we treat
            # it as if it matched none (we want
            #     real_word OR stop_word
            # to act like plain real_word).
            if r is not None:
                weighted.append((r, 1))
        return mass_weightedUnion(weighted)

class AtomNode(ParseTreeNode):

    _nodeType = "ATOM"

    def terms(self):
        return [self.getValue()]

    def executeQuery(self, index):
        return index.search(self.getValue())

class PhraseNode(AtomNode):

    _nodeType = "PHRASE"

    def executeQuery(self, index):
        return index.search_phrase(self.getValue())

class GlobNode(AtomNode):

    _nodeType = "GLOB"

    def executeQuery(self, index):
        return index.search_glob(self.getValue())
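Sketch (not part of the commit): a query such as ``foo AND NOT bar'' corresponds to the node structure below; terms() collects only the positive terms, since NotNode.terms() returns [].

    from Products.ZCTextIndex.ParseTree import AndNode, AtomNode, NotNode

    tree = AndNode([AtomNode('foo'), NotNode(AtomNode('bar'))])
    print(tree.nodeType())  # AND
    print(tree.terms())     # ['foo']
    # tree.executeQuery(index) intersects the 'foo' hits and then
    # subtracts the 'bar' hits, per AndNode.executeQuery() above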
src/Products/ZCTextIndex/PipelineFactory.py deleted 100644 → 0

##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from zope.interface import implements

from Products.ZCTextIndex.interfaces import IPipelineElementFactory

class PipelineElementFactory:

    implements(IPipelineElementFactory)

    def __init__(self):
        self._groups = {}

    def registerFactory(self, group, name, factory):
        if self._groups.has_key(group) and \
           self._groups[group].has_key(name):
            raise ValueError('ZCTextIndex lexicon element "%s" '
                             'already registered in group "%s"'
                             % (name, group))

        elements = self._groups.get(group)
        if elements is None:
            elements = self._groups[group] = {}
        elements[name] = factory

    def getFactoryGroups(self):
        groups = self._groups.keys()
        groups.sort()
        return groups

    def getFactoryNames(self, group):
        names = self._groups[group].keys()
        names.sort()
        return names

    def instantiate(self, group, name):
        factory = self._groups[group][name]
        if factory is not None:
            return factory()

element_factory = PipelineElementFactory()
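Usage sketch (not from the commit); the registrations referenced here appear in Lexicon.py and HTMLSplitter.py above, and are present once those modules have been imported.

    from Products.ZCTextIndex.PipelineFactory import element_factory

    print(element_factory.getFactoryNames('Word Splitter'))
    # e.g. ['HTML aware splitter', 'Whitespace splitter']
    splitter = element_factory.instantiate('Word Splitter',
                                           'Whitespace splitter')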
src/Products/ZCTextIndex/QueryParser.py
deleted
100644 → 0
View file @
48f67574
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser.
This particular parser recognizes the following syntax:
Start = OrExpr
OrExpr = AndExpr ('OR' AndExpr)*
AndExpr = Term ('AND' NotExpr)*
NotExpr = ['NOT'] Term
Term = '(' OrExpr ')' | ATOM+
The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal (ignoring case) to one of the key words
'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the
string can contain whitespace, parentheses and key words, but not
quotes.
+ A hyphen followed by one of the two forms above, meaning that it
must not be present.
An unquoted ATOM may also contain globbing characters. Globbing
syntax is defined by the lexicon; for example "foo*" could mean any
word starting with "foo".
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
is interpreted as a NOT operator.
Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- * and ? are used for globbing (i.e. prefix search), e.g. ``foo*''
"""
import re

from zope.interface import implements

from Products.ZCTextIndex.interfaces import IQueryParser
from Products.ZCTextIndex import ParseTree

# Create unique symbols for token types.
_AND    = intern("AND")
_OR     = intern("OR")
_NOT    = intern("NOT")
_LPAREN = intern("(")
_RPAREN = intern(")")
_ATOM   = intern("ATOM")
_EOF    = intern("EOF")

# Map keyword string to token type.
_keywords = {
    _AND:       _AND,
    _OR:        _OR,
    _NOT:       _NOT,
    _LPAREN:    _LPAREN,
    _RPAREN:    _RPAREN,
}

# Regular expression to tokenize.
_tokenizer_regex = re.compile(r"""
    # a paren
    [()]
    # or an optional hyphen
|   -?
    # followed by
    (?:
        # a string inside double quotes (and not containing these)
        " [^"]* "
        # or a non-empty stretch w/o whitespace, parens or double quotes
    |   [^()\s"]+
    )
""", re.VERBOSE)

# Use unicode regex to treat fullwidth space characters defined in Unicode
# as valid whitespace.
_tokenizer_unicode_regex = re.compile(
    _tokenizer_regex.pattern, _tokenizer_regex.flags | re.UNICODE)

class QueryParser:

    implements(IQueryParser)

    # This class is not thread-safe;
    # each thread should have its own instance

    def __init__(self, lexicon):
        self._lexicon = lexicon
        self._ignored = None

    # Public API methods

    def parseQuery(self, query):
        # Lexical analysis.
        try:
            # Try to use unicode and treat fullwidth whitespace as valid one.
            if not isinstance(query, unicode):
                query = query.decode('utf-8')
            tokens = _tokenizer_unicode_regex.findall(query)
        except UnicodeDecodeError:
            tokens = _tokenizer_regex.findall(query)
        self._tokens = tokens
        # classify tokens
        self._tokentypes = [_keywords.get(token.upper(), _ATOM)
                            for token in tokens]
        # add _EOF
        self._tokens.append(_EOF)
        self._tokentypes.append(_EOF)
        self._index = 0

        # Syntactical analysis.
        self._ignored = []  # Ignored words in the query, for parseQueryEx
        tree = self._parseOrExpr()
        self._require(_EOF)
        if tree is None:
            raise ParseTree.ParseError(
                "Query contains only common words: %s" % repr(query))
        return tree

    def getIgnored(self):
        return self._ignored

    def parseQueryEx(self, query):
        tree = self.parseQuery(query)
        ignored = self.getIgnored()
        return tree, ignored

    # Recursive descent parser

    def _require(self, tokentype):
        if not self._check(tokentype):
            t = self._tokens[self._index]
            msg = "Token %r required, %r found" % (tokentype, t)
            raise ParseTree.ParseError, msg

    def _check(self, tokentype):
        if self._tokentypes[self._index] is tokentype:
            self._index += 1
            return 1
        else:
            return 0

    def _peek(self, tokentype):
        return self._tokentypes[self._index] is tokentype

    def _get(self, tokentype):
        t = self._tokens[self._index]
        self._require(tokentype)
        return t

    def _parseOrExpr(self):
        L = []
        L.append(self._parseAndExpr())
        while self._check(_OR):
            L.append(self._parseAndExpr())
        L = filter(None, L)
        if not L:
            return None  # Only stopwords
        elif len(L) == 1:
            return L[0]
        else:
            return ParseTree.OrNode(L)

    def _parseAndExpr(self):
        L = []
        t = self._parseTerm()
        if t is not None:
            L.append(t)
        Nots = []
        while self._check(_AND):
            t = self._parseNotExpr()
            if t is None:
                continue
            if isinstance(t, ParseTree.NotNode):
                Nots.append(t)
            else:
                L.append(t)
        if not L:
            return None  # Only stopwords
        L.extend(Nots)
        if len(L) == 1:
            return L[0]
        else:
            return ParseTree.AndNode(L)

    def _parseNotExpr(self):
        if self._check(_NOT):
            t = self._parseTerm()
            if t is None:
                return None  # Only stopwords
            return ParseTree.NotNode(t)
        else:
            return self._parseTerm()

    def _parseTerm(self):
        if self._check(_LPAREN):
            tree = self._parseOrExpr()
            self._require(_RPAREN)
        else:
            nodes = [self._parseAtom()]
            while self._peek(_ATOM):
                nodes.append(self._parseAtom())
            nodes = filter(None, nodes)
            if not nodes:
                return None  # Only stopwords
            structure = [(isinstance(nodes[i], ParseTree.NotNode), i, nodes[i])
                         for i in range(len(nodes))]
            structure.sort()
            nodes = [node for (bit, index, node) in structure]
            if isinstance(nodes[0], ParseTree.NotNode):
                raise ParseTree.ParseError(
                    "a term must have at least one positive word")
            if len(nodes) == 1:
                return nodes[0]
            tree = ParseTree.AndNode(nodes)
        return tree

    def _parseAtom(self):
        term = self._get(_ATOM)
        words = self._lexicon.parseTerms(term)
        if not words:
            self._ignored.append(term)
            return None
        if len(words) > 1:
            tree = ParseTree.PhraseNode(words)
        elif self._lexicon.isGlob(words[0]):
            tree = ParseTree.GlobNode(words[0])
        else:
            tree = ParseTree.AtomNode(words[0])
        if term[0] == "-":
            tree = ParseTree.NotNode(tree)
        return tree
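A quick parsing sketch (assumes the package is importable; the query string is arbitrary). A bare Splitter-only lexicon is enough to exercise the grammar described in the module docstring:

from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.QueryParser import QueryParser

parser = QueryParser(Lexicon(Splitter()))
tree = parser.parseQuery('foo bar OR "ham eggs"')
print tree.nodeType()     # OR
for sub in tree.getValue():
    print sub.nodeType()  # AND (the implicit operator in "foo bar"), then PHRASE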
src/Products/ZCTextIndex/README.txt deleted 100644 → 0
ZCTextIndex
===========
This product is a replacement for the full text indexing facility of
ZCatalog. Specifically, it is an alternative to
PluginIndexes/TextIndex.
Advantages of using ZCTextIndex over TextIndex:
- A new query language, supporting both explicit and implicit Boolean
operators, parentheses, globbing, and phrase searching. Apart from
explicit operators and globbing, the syntax is roughly the same as
that popularized by Google.
- A more refined scoring algorithm, resulting in better selectiveness:
it's much more likely that you'll find the document you are looking
for among the first few highest-ranked results.
- Actually, ZCTextIndex gives you a choice of two scoring algorithms
from recent literature: the Cosine ranking from the Managing
Gigabytes book, and Okapi from more recent research papers. Okapi
usually does better, so it is the default (but your mileage may
vary).
- A redesigned Lexicon, using a pipeline architecture to split the
input text into words. This makes it possible to mix and match
pipeline components, e.g. you can choose between an HTML-aware
splitter and a plain text splitter, and additional components can be
added to the pipeline for case folding, stopword removal, and other
features. Enough example pipeline components are provided to get
you started, and it is very easy to write new components.
Performance is roughly the same as for TextIndex, and we're expecting
to make tweaks to the code that will make it faster.
This code can be used outside of Zope too; all you need is a
standalone ZODB installation to make your index persistent. Several
functional test programs in the tests subdirectory show how to do
this, for example mhindex.py, mailtest.py, indexhtml.py, and
queryhtml.py.
See the online help for how to use ZCTextIndex within Zope. (Included
in the subdirectory "help".)
Code overview
-------------
ZMI interface:
__init__.py ZMI publishing code
ZCTextIndex.py pluggable index class
PipelineFactory.py ZMI helper to configure the pipeline
Indexing:
BaseIndex.py common code for Cosine and Okapi index
CosineIndex.py Cosine index implementation
OkapiIndex.py Okapi index implementation
okascore.c C implementation of scoring loop
Lexicon:
Lexicon.py lexicon and sample pipeline elements
HTMLSplitter.py HTML-aware splitter
StopDict.py list of English stopwords
stopper.c C implementation of stop word remover
Query parser:
QueryParser.py parse a query into a parse tree
ParseTree.py parse tree node classes and exceptions
Utilities:
NBest.py find N best items in a list without sorting
SetOps.py efficient weighted set operations
WidCode.py list compression allowing phrase searches
RiceCode.py list compression code (as yet unused)
Interfaces (these speak for themselves):
IIndex.py
ILexicon.py
INBest.py
IPipelineElement.py
IPipelineElementFactory.py
IQueryParseTree.py
IQueryParser.py
ISplitter.py
Subdirectories:
dtml ZMI templates
help ZMI help files
tests unittests and some functional tests/examples
www images used in the ZMI
Tests
-----
Functional tests and helpers:
hs-tool.py helper to interpret hotshot profiler logs
indexhtml.py index a collection of HTML files
mailtest.py index and query a Unix mailbox file
mhindex.py index and query a set of MH folders
python.txt output from benchmark queries
queryhtml.py query an index created by indexhtml.py
wordstats.py dump statistics about each indexed word
Unit tests (these speak for themselves):
testIndex.py
testLexicon.py
testNBest.py
testPipelineFactory.py
testQueryEngine.py
testQueryParser.py
testSetOps.py
testStopper.py
testZCTextIndex.py
src/Products/ZCTextIndex/RiceCode.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Rice coding (a variation of Golomb coding)
Based on a Java implementation by Glen McCluskey described in a Usenix
;login: article at
http://www.usenix.org/publications/login/2000-4/features/java.html
McCluskey's article explains the approach as follows. The encoding
for a value x is represented as a unary part and a binary part. The
unary part is a sequence of 1 bits followed by a 0 bit. The binary
part encodes some of the lower bits of x-1.
The encoding is parameterized by a value m that describes how many
bits to store in the binary part. If most of the values are smaller
than 2**m then they can be stored in only m+1 bits.
Compute the length of the unary part, q, where
q = math.floor((x-1)/ 2 ** m)
Emit q 1 bits followed by a 0 bit.
Emit the lower m bits of x-1, treating x-1 as a binary value.
"""
import array

class BitArray:

    def __init__(self, buf=None):
        self.bytes = array.array('B')
        self.nbits = 0
        self.bitsleft = 0
        self.tostring = self.bytes.tostring

    def __getitem__(self, i):
        byte, offset = divmod(i, 8)
        mask = 2 ** offset
        if self.bytes[byte] & mask:
            return 1
        else:
            return 0

    def __setitem__(self, i, val):
        byte, offset = divmod(i, 8)
        mask = 2 ** offset
        if val:
            self.bytes[byte] |= mask
        else:
            self.bytes[byte] &= ~mask

    def __len__(self):
        return self.nbits

    def append(self, bit):
        """Append a 1 if bit is true or a 0 if it is false."""
        if self.bitsleft == 0:
            self.bytes.append(0)
            self.bitsleft = 8
        self.__setitem__(self.nbits, bit)
        self.nbits += 1
        self.bitsleft -= 1

    def __getstate__(self):
        return self.nbits, self.bitsleft, self.tostring()

    def __setstate__(self, (nbits, bitsleft, s)):
        self.bytes = array.array('B', s)
        self.nbits = nbits
        self.bitsleft = bitsleft

class RiceCode:

    def __init__(self, m):
        """Construct a RiceCode for m-bit values."""
        if not (0 <= m <= 16):
            raise ValueError, "m must be between 0 and 16"
        self.init(m)
        self.bits = BitArray()
        self.len = 0

    def init(self, m):
        self.m = m
        self.lower = (1 << m) - 1
        self.mask = 1 << (m - 1)

    def append(self, val):
        """Append an item to the list."""
        if val < 1:
            raise ValueError, "value >= 1 expected, got %s" % `val`
        val -= 1
        # emit the unary part of the code
        q = val >> self.m
        for i in range(q):
            self.bits.append(1)
        self.bits.append(0)
        # emit the binary part
        r = val & self.lower
        mask = self.mask
        while mask:
            self.bits.append(r & mask)
            mask >>= 1
        self.len += 1

    def __len__(self):
        return self.len

    def tolist(self):
        """Return the items as a list."""
        l = []
        i = 0  # bit offset
        binary_range = range(self.m)
        for j in range(self.len):
            unary = 0
            while self.bits[i] == 1:
                unary += 1
                i += 1
            assert self.bits[i] == 0
            i += 1
            binary = 0
            for k in binary_range:
                binary = (binary << 1) | self.bits[i]
                i += 1
            l.append((unary << self.m) + (binary + 1))
        return l

    def tostring(self):
        """Return a binary string containing the encoded data.

        The binary string may contain some extra zeros at the end.
        """
        return self.bits.tostring()

    def __getstate__(self):
        return self.m, self.bits

    def __setstate__(self, (m, bits)):
        self.init(m)
        self.bits = bits

def encode(m, l):
    c = RiceCode(m)
    for elt in l:
        c.append(elt)
    assert c.tolist() == l
    return c

def encode_deltas(l):
    if len(l) == 1:
        return l[0], []
    deltas = RiceCode(6)
    deltas.append(l[1] - l[0])
    for i in range(2, len(l)):
        deltas.append(l[i] - l[i-1])
    return l[0], deltas

def decode_deltas(start, enc_deltas):
    deltas = enc_deltas.tolist()
    l = [start]
    # each delta is relative to the previously reconstructed value
    for d in deltas:
        l.append(l[-1] + d)
    return l

def test():
    import random
    for size in [10, 20, 50, 100, 200]:
        l = [random.randint(1, size) for i in range(50)]
        c = encode(random.randint(1, 16), l)
        assert c.tolist() == l
    for size in [10, 20, 50, 100, 200]:
        l = range(random.randint(1, size), size + random.randint(1, size))
        t = encode_deltas(l)
        l2 = decode_deltas(*t)
        assert l == l2
        if l != l2:
            print l
            print l2

def pickle_efficiency():
    import pickle
    import random
    for m in [4, 8, 12]:
        for size in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
            for elt_range in [10, 20, 50, 100, 200, 500, 1000]:
                l = [random.randint(1, elt_range) for i in range(size)]
                raw = pickle.dumps(l, 1)
                enc = pickle.dumps(encode(m, l), 1)
                print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
                print "%5d %5d" % (len(raw), len(enc)),
                if len(raw) > len(enc):
                    print "win"
                else:
                    print "lose"

if __name__ == "__main__":
    test()
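A worked instance of the encoding rule from the module docstring, as a hypothetical helper (not part of the module): for x = 13 and m = 3, q = floor(12 / 8) = 1, so the unary part is "10"; the low 3 bits of 12 give "100"; the whole code is "10100", using q + 1 + m = 5 bits.

def rice_encode_value(x, m):
    # unary part: q one-bits followed by a zero, where q = (x-1) >> m
    q = (x - 1) >> m
    bits = [1] * q + [0]
    # binary part: the low m bits of x-1, most significant bit first
    for k in range(m - 1, -1, -1):
        bits.append(((x - 1) >> k) & 1)
    return bits

print rice_encode_value(13, 3)  # [1, 0, 1, 0, 0]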
src/Products/ZCTextIndex/SETUP.cfg deleted 100644 → 0
<extension okascore>
source okascore.c
</extension>
<extension stopper>
source stopper.c
</extension>
src/Products/ZCTextIndex/SetOps.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""SetOps -- Weighted intersections and unions applied to many inputs."""
from BTrees.IIBTree import IIBucket
from BTrees.IIBTree import weightedIntersection
from BTrees.IIBTree import weightedUnion

from Products.ZCTextIndex.NBest import NBest

def mass_weightedIntersection(L):
    "A list of (mapping, weight) pairs -> their weightedIntersection IIBucket."
    L = [(x, wx) for (x, wx) in L if x is not None]
    if len(L) < 2:
        return _trivial(L)
    # Intersect with smallest first.  We expect the input maps to be
    # IIBuckets, so it doesn't hurt to get their lengths repeatedly
    # (len(Bucket) is fast; len(BTree) is slow).
    L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
    (x, wx), (y, wy) = L[:2]
    dummy, result = weightedIntersection(x, y, wx, wy)
    for x, wx in L[2:]:
        dummy, result = weightedIntersection(result, x, 1, wx)
    return result

def mass_weightedUnion(L):
    "A list of (mapping, weight) pairs -> their weightedUnion IIBucket."
    if len(L) < 2:
        return _trivial(L)
    # Balance unions as closely as possible, smallest to largest.
    merge = NBest(len(L))
    for x, weight in L:
        merge.add((x, weight), len(x))
    while len(merge) > 1:
        # Merge the two smallest so far, and add back to the queue.
        (x, wx), dummy = merge.pop_smallest()
        (y, wy), dummy = merge.pop_smallest()
        dummy, z = weightedUnion(x, y, wx, wy)
        merge.add((z, 1), len(z))
    (result, weight), dummy = merge.pop_smallest()
    return result

def _trivial(L):
    # L is empty or has only one (mapping, weight) pair.  If there is a
    # pair, we may still need to multiply the mapping by its weight.
    assert len(L) <= 1
    if len(L) == 0:
        return IIBucket()
    [(result, weight)] = L
    if weight != 1:
        dummy, result = weightedUnion(IIBucket(), result, 0, weight)
    return result
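A small sketch of the union helper (assumes the BTrees package is available; the doc ids and weights are made up). Each input mapping is multiplied by its weight before merging:

from BTrees.IIBTree import IIBucket
from Products.ZCTextIndex.SetOps import mass_weightedUnion

a = IIBucket({1: 10, 2: 20})
b = IIBucket({2: 5})
# doc 2 scores 20*1 + 5*2 = 30; doc 1 keeps its weighted score of 10
print list(mass_weightedUnion([(a, 1), (b, 2)]).items())  # [(1, 10), (2, 30)]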
src/Products/ZCTextIndex/Setup deleted 100644 → 0
*shared*
stopper stopper.c
okascore okascore.c
src/Products/ZCTextIndex/StopDict.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Provide a default list of stop words for the index.
The specific splitter and lexicon are customizable, but the default
ZCTextIndex should do something useful.
"""
def get_stopdict():
    """Return a dictionary of stopwords."""
    return _dict

# This list of English stopwords comes from Lucene
_words = [
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with",
]

_dict = {}
for w in _words:
    _dict[w] = None
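Usage is a plain dictionary membership test (Python 2 style); the values are all None, only the keys matter:

from Products.ZCTextIndex.StopDict import get_stopdict

print get_stopdict().has_key("the")   # True
print get_stopdict().has_key("zope")  # False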
src/Products/ZCTextIndex/WidCode.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
# for smaller ints. This is intended for lists of word ids (wids). The
# ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding. Unlike
# UTF-8, the initial byte does not tell you how many continuation bytes
# follow; and there's no ASCII superset property.
# Details:
#
# + Only the first byte of an encoding has the sign bit set.
#
# + The first byte has 7 bits of data.
#
# + Bytes beyond the first in an encoding have the sign bit clear, followed
# by 7 bits of data.
#
# + The first byte doesn't tell you how many continuation bytes are
# following. You can tell by searching for the next byte with the
# high bit set (or the end of the string).
#
# The int to be encoded can contain no more than 28 bits.
#
# If it contains no more than 7 bits, 0abcdefg, the encoding is
# 1abcdefg
#
# If it contains 8 thru 14 bits,
# 00abcdef ghijkLmn
# the encoding is
# 1abcdefg 0hijkLmn
#
# Static tables _encoding and _decoding capture all encodes and decodes for
# 14 or fewer bits.
#
# If it contains 15 thru 21 bits,
# 000abcde fghijkLm nopqrstu
# the encoding is
# 1abcdefg 0hijkLmn 0opqrstu
#
# If it contains 22 thru 28 bits,
# 0000abcd efghijkL mnopqrst uvwxyzAB
# the encoding is
# 1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB
assert 0x80**2 == 0x4000
assert 0x80**4 == 0x10000000

import re

def encode(wids):
    # Encode a list of wids as a string.
    wid2enc = _encoding
    n = len(wid2enc)
    return "".join([w < n and wid2enc[w] or _encode(w)
                    for w in wids])

_encoding = [None] * 0x4000  # Filled later, and converted to a tuple

def _encode(w):
    assert 0x4000 <= w < 0x10000000
    b, c = divmod(w, 0x80)
    a, b = divmod(b, 0x80)
    s = chr(b) + chr(c)
    if a < 0x80:  # no more than 21 data bits
        return chr(a + 0x80) + s
    a, b = divmod(a, 0x80)
    assert a < 0x80, (w, a, b, s)  # else more than 28 data bits
    return (chr(a + 0x80) + chr(b)) + s

_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")

def decode(code):
    # Decode a string into a list of wids.
    get = _decoding.get
    # Obscure:  while _decoding does have the key '\x80', its value is 0,
    # so the "or" here calls _decode('\x80') anyway.
    return [get(p) or _decode(p)
            for p in _prog.findall(code)]

_decoding = {}  # Filled later

def _decode(s):
    if s == '\x80':
        # See comment in decode().  This is here to allow a trick to work.
        return 0
    if len(s) == 3:
        a, b, c = map(ord, s)
        assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80
        return ((a & 0x7F) << 14) | (b << 7) | c
    assert len(s) == 4, `s`
    a, b, c, d = map(ord, s)
    assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80
    return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d

def _fill():
    global _encoding
    for i in range(0x80):
        s = chr(i + 0x80)
        _encoding[i] = s
        _decoding[s] = i
    for i in range(0x80, 0x4000):
        hi, lo = divmod(i, 0x80)
        s = chr(hi + 0x80) + chr(lo)
        _encoding[i] = s
        _decoding[s] = i
    _encoding = tuple(_encoding)

_fill()

def test():
    for i in range(2**20):
        if i % 1000 == 0:
            print i
        wids = [i]
        code = encode(wids)
        assert decode(code) == wids, (wids, code, decode(code))

if __name__ == "__main__":
    test()
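A round-trip sketch (the wids are arbitrary). Because only the first byte of each code has the high bit set, the encoding of one wid sequence can be located inside another with the ordinary string .find(), which is what phrase search exploits:

from Products.ZCTextIndex import WidCode

s = WidCode.encode([1, 130, 100000])
print WidCode.decode(s)                           # [1, 130, 100000]
print s.find(WidCode.encode([130, 100000])) >= 0  # True: contiguous subsequence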
src/Products/ZCTextIndex/ZCTextIndex.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Plug in text index for ZCatalog with relevance ranking.
$Id$
"""
from cgi import escape

from AccessControl.class_init import InitializeClass
from AccessControl.Permissions import manage_vocabulary
from AccessControl.Permissions import manage_zcatalog_indexes
from AccessControl.Permissions import query_vocabulary
from AccessControl.Permissions import search_zcatalog
from AccessControl.SecurityInfo import ClassSecurityInfo
from Acquisition import aq_base
from Acquisition import aq_inner
from Acquisition import aq_parent
from Acquisition import Implicit
from App.special_dtml import DTMLFile
from OFS.SimpleItem import SimpleItem
from Persistence import Persistent
from zope.interface import implements

from Products.PluginIndexes.common.util import parseIndexRequest
from Products.PluginIndexes.common import safe_callable
from Products.PluginIndexes.interfaces import IPluggableIndex

from Products.ZCTextIndex.Lexicon import CaseNormalizer
from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter
from Products.ZCTextIndex.Lexicon import StopWordRemover
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.interfaces import ILexicon
from Products.ZCTextIndex.interfaces import IZCLexicon
from Products.ZCTextIndex.interfaces import IZCTextIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.PipelineFactory import element_factory

index_types = {'Okapi BM25 Rank': OkapiIndex,
               'Cosine Measure': CosineIndex}

class ZCTextIndex(Persistent, Implicit, SimpleItem):

    """Persistent text index.
    """

    implements(IZCTextIndex, IPluggableIndex)

    ## Magic class attributes ##

    meta_type = 'ZCTextIndex'
    query_options = ('query',)

    manage_options = (
        {'label': 'Overview', 'action': 'manage_main'},
    )

    security = ClassSecurityInfo()
    security.declareObjectProtected(manage_zcatalog_indexes)

    ## Constructor ##

    def __init__(self, id, extra=None, caller=None, index_factory=None,
                 field_name=None, lexicon_id=None):
        self.id = id

        # Arguments can be passed directly to the constructor or
        # via the silly "extra" record.
        self._fieldname = field_name or getattr(extra, 'doc_attr', '') or id
        self._indexed_attrs = self._fieldname.split(',')
        self._indexed_attrs = [attr.strip()
                               for attr in self._indexed_attrs if attr]

        lexicon_id = lexicon_id or getattr(extra, 'lexicon_id', '')
        lexicon = getattr(caller, lexicon_id, None)

        if lexicon is None:
            raise LookupError, 'Lexicon "%s" not found' % escape(lexicon_id)

        if not ILexicon.providedBy(lexicon):
            raise ValueError('Object "%s" does not implement '
                             'ZCTextIndex Lexicon interface'
                             % lexicon.getId())

        self.lexicon_id = lexicon.getId()
        self._v_lexicon = lexicon

        if index_factory is None:
            if extra.index_type not in index_types.keys():
                raise ValueError, 'Invalid index type "%s"' % escape(
                    extra.index_type)
            self._index_factory = index_types[extra.index_type]
            self._index_type = extra.index_type
        else:
            self._index_factory = index_factory

        self.index = self._index_factory(aq_base(self.getLexicon()))

    ## Private Methods ##

    security.declarePrivate('getLexicon')

    def getLexicon(self):
        """Get the lexicon for this index
        """
        if hasattr(aq_base(self), 'lexicon'):
            # Fix up old ZCTextIndexes by removing direct lexicon ref
            # and changing it to an ID
            lexicon = getattr(aq_parent(aq_inner(self)),
                              self.lexicon.getId())
            self.lexicon_id = lexicon.getId()
            del self.lexicon

        if getattr(aq_base(self), 'lexicon_path', None):
            # Fix up slightly less old ZCTextIndexes by removing
            # the physical path and changing it to an ID.
            # There's no need to use a physical path, which otherwise
            # makes it difficult to move or rename ZCatalogs.
            self.lexicon_id = self.lexicon_path[-1]
            del self.lexicon_path

        try:
            return self._v_lexicon
        except AttributeError:
            lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon_id)
            if not ILexicon.providedBy(lexicon):
                raise TypeError('Object "%s" is not a ZCTextIndex Lexicon'
                                % repr(lexicon))
            self._v_lexicon = lexicon
            return lexicon

    ## External methods not in the Pluggable Index API ##

    security.declareProtected(search_zcatalog, 'query')

    def query(self, query, nbest=10):
        """Return pair (mapping from docids to scores, num results).

        The num results is the total number of results before trimming
        to the nbest results.
        """
        tree = QueryParser(self.getLexicon()).parseQuery(query)
        results = tree.executeQuery(self.index)
        if results is None:
            return [], 0
        chooser = NBest(nbest)
        chooser.addmany(results.items())
        return chooser.getbest(), len(results)

    ## Pluggable Index APIs ##

    def index_object(self, documentId, obj, threshold=None):
        """Wrapper for index_doc() handling indexing of multiple attributes.

        Enter the document with the specified documentId in the index
        under the terms extracted from the indexed text attributes,
        each of which should yield either a string or a list of
        strings (Unicode or otherwise) to be passed to index_doc().
        """
        # XXX We currently ignore subtransaction threshold

        # needed for backward compatibility
        try:
            fields = self._indexed_attrs
        except:
            fields = [self._fieldname]

        res = 0
        all_texts = []
        for attr in fields:
            text = getattr(obj, attr, None)
            if text is None:
                continue
            if safe_callable(text):
                text = text()
            if text is None:
                continue
            if text:
                if isinstance(text, (list, tuple, )):
                    all_texts.extend(text)
                else:
                    all_texts.append(text)

        # Check that we're sending only strings
        all_texts = filter(lambda text: isinstance(text, basestring),
                           all_texts)
        if all_texts:
            return self.index.index_doc(documentId, all_texts)

        return res

    def unindex_object(self, docid):
        if self.index.has_doc(docid):
            self.index.unindex_doc(docid)

    def _apply_index(self, request):
        """Apply query specified by request, a mapping containing the query.

        Returns two objects on success: the resultSet containing the
        matching record numbers, and a tuple containing the names of
        the fields used.

        Returns None if request is not valid for this index.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        query_str = ' '.join(record.keys)
        if not query_str:
            return None
        tree = QueryParser(self.getLexicon()).parseQuery(query_str)
        results = tree.executeQuery(self.index)
        return results, (self.id,)

    def getEntryForObject(self, documentId, default=None):
        """Return the list of words indexed for documentId"""
        try:
            word_ids = self.index.get_words(documentId)
        except KeyError:
            return default
        get_word = self.getLexicon().get_word
        return [get_word(wid) for wid in word_ids]

    def uniqueValues(self, name=None, withLengths=0):
        raise NotImplementedError

    ## The ZCatalog Index management screen uses these methods ##

    def numObjects(self):
        """Return number of unique words in the index"""
        return self.index.length()

    def indexSize(self):
        """Return the number of indexed objects"""
        return self.index.document_count()

    def clear(self):
        """reinitialize the index (but not the lexicon)"""
        try:
            # Remove the cached reference to the lexicon
            # So that it is refreshed
            del self._v_lexicon
        except (AttributeError, KeyError):
            pass
        self.index = self._index_factory(aq_base(self.getLexicon()))

    ## User Interface Methods ##

    manage_main = DTMLFile('dtml/manageZCTextIndex', globals())

    def getIndexSourceNames(self):
        """Return sequence of names of indexed attributes"""
        try:
            return self._indexed_attrs
        except:
            return [self._fieldname]

    def getIndexType(self):
        """Return index type string"""
        return getattr(self, '_index_type', self._index_factory.__name__)

    def getLexiconURL(self):
        """Return the url of the lexicon used by the index"""
        try:
            lex = self.getLexicon()
        except (KeyError, AttributeError):
            return None
        else:
            return lex.absolute_url()

InitializeClass(ZCTextIndex)

def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
                          RESPONSE=None):
    """Add a text index"""
    if REQUEST is None:
        URL3 = None
    else:
        URL3 = REQUEST.URL3
    return self.manage_addIndex(id, 'ZCTextIndex', extra,
                                REQUEST, RESPONSE, URL3)

manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())

manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())

def manage_addLexicon(self, id, title='', elements=[], REQUEST=None):
    """Add ZCTextIndex Lexicon"""
    pipeline = []
    for el_record in elements:
        if not hasattr(el_record, 'name'):
            continue  # Skip over records that only specify element group
        element = element_factory.instantiate(el_record.group, el_record.name)
        if element is not None:
            if el_record.group == 'Word Splitter':
                # I don't like hardcoding this, but it's a simple solution
                # to get the splitter element first in the pipeline
                pipeline.insert(0, element)
            else:
                pipeline.append(element)

    lexicon = PLexicon(id, title, *pipeline)
    self._setObject(id, lexicon)
    if REQUEST is not None:
        return self.manage_main(self, REQUEST, update_menu=1)

# I am borrowing the existing vocabulary permissions for now to avoid
# adding new permissions. This may change when old style Vocabs go away
LexiconQueryPerm = query_vocabulary
LexiconMgmtPerm = manage_vocabulary

class PLexicon(Lexicon, Implicit, SimpleItem):

    """Lexicon for ZCTextIndex.
    """

    implements(IZCLexicon)

    meta_type = 'ZCTextIndex Lexicon'

    manage_options = ({'label': 'Overview', 'action': 'manage_main'},
                      {'label': 'Query', 'action': 'queryLexicon'},
                     ) + SimpleItem.manage_options

    security = ClassSecurityInfo()
    security.declareObjectProtected(LexiconQueryPerm)

    def __init__(self, id, title='', *pipeline):
        self.id = str(id)
        self.title = str(title)
        PLexicon.inheritedAttribute('__init__')(self, *pipeline)

    ## User Interface Methods ##

    def getPipelineNames(self):
        """Return list of names of pipeline element classes"""
        return [element.__class__.__name__ for element in self._pipeline]

    _queryLexicon = DTMLFile('dtml/queryLexicon', globals())

    security.declareProtected(LexiconQueryPerm, 'queryLexicon')

    def queryLexicon(self, REQUEST, words=None, page=0, rows=20, cols=4):
        """Lexicon browser/query user interface
        """
        if words:
            wids = []
            for word in self.parseTerms(words):
                wids.extend(self.globToWordIds(word))
            words = [self.get_word(wid) for wid in wids]
        else:
            words = self.words()

        word_count = len(words)
        rows = max(min(rows, 500), 1)
        cols = max(min(cols, 12), 1)
        page_count = word_count / (rows * cols) + \
                     (word_count % (rows * cols) > 0)
        page = max(min(page, page_count - 1), 0)
        start = rows * cols * page
        end = min(rows * cols * (page + 1), word_count)

        if word_count:
            words = list(words[start:end])
        else:
            words = []

        columns = []
        i = 0
        while i < len(words):
            columns.append(words[i:i + rows])
            i += rows

        info = dict(page=page,
                    rows=rows,
                    cols=cols,
                    start_word=start + 1,
                    end_word=end,
                    word_count=word_count,
                    page_count=page_count,
                    page_range=xrange(page_count),
                    page_columns=columns)

        if REQUEST is not None:
            return self._queryLexicon(self, REQUEST, **info)

        return info

    security.declareProtected(LexiconMgmtPerm, 'manage_main')
    manage_main = DTMLFile('dtml/manageLexicon', globals())

InitializeClass(PLexicon)
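A minimal sketch of the same parse/execute/trim flow that ZCTextIndex.query() performs, wired up by hand outside Zope (assumes the package and its BTrees dependency are importable; the doc ids and texts are invented):

from Products.ZCTextIndex.Lexicon import Lexicon, Splitter, CaseNormalizer
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.NBest import NBest

lexicon = Lexicon(Splitter(), CaseNormalizer())
index = OkapiIndex(lexicon)
index.index_doc(1, "the quick brown fox")
index.index_doc(2, "lazy dogs sleep")

results = QueryParser(lexicon).parseQuery("quick OR lazy").executeQuery(index)
chooser = NBest(10)
chooser.addmany(results.items())
print chooser.getbest()  # [(docid, score), ...], best score first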
src/Products/ZCTextIndex/__init__.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""ZCatalog Text Index
Plugin text index for ZCatalog.
"""
from PipelineFactory import element_factory
from Products.ZCTextIndex import ZCTextIndex, HTMLSplitter

def initialize(context):
    context.registerClass(
        ZCTextIndex.ZCTextIndex,
        permission='Add Pluggable Index',
        constructors=(ZCTextIndex.manage_addZCTextIndexForm,
                      ZCTextIndex.manage_addZCTextIndex,
                      getIndexTypes),
        icon='www/index.gif',
        visibility=None,
    )

    context.registerClass(
        ZCTextIndex.PLexicon,
        permission='Add Vocabularies',
        constructors=(ZCTextIndex.manage_addLexiconForm,
                      ZCTextIndex.manage_addLexicon,
                      getElementGroups, getElementNames),
        icon='www/lexicon.gif',
    )

    context.registerHelp()
    context.registerHelpTitle("Zope Help")

## Functions below are for use in the ZMI constructor forms ##

def getElementGroups(self):
    return element_factory.getFactoryGroups()

def getElementNames(self, group):
    return element_factory.getFactoryNames(group)

def getIndexTypes(self):
    return ZCTextIndex.index_types.keys()

## Allow relevant exceptions to be caught in untrusted code
from AccessControl import ModuleSecurityInfo
ModuleSecurityInfo('Products').declarePublic('ZCTextIndex')
ModuleSecurityInfo('Products.ZCTextIndex').declarePublic('ParseTree')
ModuleSecurityInfo('Products.ZCTextIndex.ParseTree').declarePublic('QueryError')
ModuleSecurityInfo('Products.ZCTextIndex.ParseTree').declarePublic('ParseError')
src/Products/ZCTextIndex/dtml/addLexicon.dtml deleted 100644 → 0
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex Lexicon',
help_product='ZCTextIndex',
help_topic='Lexicon_Add.stx'
)">
<p class="form-help">
A ZCTextIndex Lexicon processes and stores the words of documents indexed
with a ZCTextIndex. Multiple ZCTextIndexes can share the same lexicon.
</p>
<form action="manage_addLexicon" method="POST">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Title
</div>
</td>
<td align="left" valign="top">
<input type="text" name="title" size="40" />
</td>
</tr>
<dtml-in name="getElementGroups" prefix="group">
<dtml-let elements="getElementNames(group_item)">
<tr>
<td align="left" valign="top">
<div class="form-label">&dtml-group_item;</div>
</td>
<td align="left" valign="top">
<input type="hidden" name="elements.group:records"
value="&dtml-group_item;" />
<dtml-if expr="_.len(elements) > 1">
<select name="elements.name:records">
<dtml-in name="elements">
<option value="&dtml-sequence-item;"
>&dtml-sequence-item;</option>
</dtml-in>
</select>
<dtml-else>
<input type="checkbox" name="elements.name:records"
value="<dtml-var expr="elements[0]" html_quote>" checked />
</dtml-if>
</td>
</tr>
</dtml-let>
</dtml-in>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
src/Products/ZCTextIndex/dtml/addZCTextIndex.dtml deleted 100644 → 0
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex',
help_product='ZCTextIndex',
help_topic='ZCTextIndex_Add.stx'
)">
<p class="form-help">
<strong>Text Indexes</strong> break text up into individual words, and
are often referred to as full-text indexes. Text indexes
sort results by score, meaning they return hits in order
from the most relevant to the least relevant.
</p>
<form action="manage_addZCTextIndex" method="post"
enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Indexed attributes
</div></td>
<td align="left" valign="top">
<input type="text" name="extra.doc_attr:record" size="40" />
<em>attribute1,attribute2,...</em> or leave empty
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Ranking Strategy
</div>
</td>
<td align="left" valign="top">
<select name="extra.index_type:record">
<dtml-in name="getIndexTypes">
<option value="&dtml-sequence-item;">&dtml-sequence-item;</option>
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Lexicon
</div></td>
<td>
<dtml-in expr="superValues('ZCTextIndex Lexicon')">
<dtml-if sequence-start>
<select name="extra.lexicon_id:record">
</dtml-if>
<option value="&dtml-id;">
&dtml-id; <dtml-var name="title" fmt="(%s)" null html_quote>
</option>
<dtml-if sequence-end>
</select>
</dtml-if>
<dtml-else>
<em>You must create a ZCTextIndex Lexicon first.</em>
</dtml-in>
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
src/Products/ZCTextIndex/dtml/manageLexicon.dtml deleted 100644 → 0
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
The lexicon processes and stores the words found in objects indexed by one
or more ZCTextIndexes.
</p>
<p class="section-bar">
<span class="form-label">Input Pipeline Stages</span>
</p>
<p class="form-help">
Text indexed through this lexicon is processed by the following pipeline
stages:
</p>
<ol class="form-help">
<dtml-in name="getPipelineNames">
<li>&dtml-sequence-item;</li>
</dtml-in>
</ol>
<dtml-var manage_page_footer>
src/Products/ZCTextIndex/dtml/manageZCTextIndex.dtml deleted 100644 → 0
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
Name(s) of attribute(s) indexed:
<em><dtml-var "', '.join(getIndexSourceNames())"></em>
</p>
<p class="form-help">
Index type:
<em>&dtml-getIndexType;</em>
</p>
<p class="form-help">
ZCTextIndex Lexicon used:
<dtml-if getLexiconURL>
<a href="&dtml-getLexiconURL;/manage_main"
>&dtml-getLexiconURL;</a>
<dtml-else>
<em>(Lexicon Not Found)</em>
</dtml-if>
</p>
<p class="form-help">
<em>Note:</em> The lexicon assigned to the index cannot be changed. To replace
the existing lexicon, create a new lexicon in the same place and clear the
index. This will make the index use the replacement lexicon.
</p>
<dtml-var manage_page_footer>
src/Products/ZCTextIndex/dtml/queryLexicon.dtml deleted 100644 → 0
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
Browse the words in the lexicon or enter the word(s) you are interested in
below. Globbing characters (*, ?) are supported.
</p>
<dtml-let words_str="' '.join(REQUEST.get('words',[]))">
<form action="&dtml-URL;">
<p class="form-element">
<span class="form-label">Word(s)</span>
<input name="words:tokens" size="20" value="&dtml-words_str;" />
<input type="submit" value="Query" />
<span class="form-label"> Output Columns:</span>
<input name="cols:int" size="2" value="&dtml-cols;" />
<span class="form-label"> Rows:</span>
<input name="rows:int" size="2" value="&dtml-rows;" />
</p>
</form>
<hr />
<form action="&dtml-URL;">
<table width="100%" cellpadding="2" cellspacing="0" border="0">
<tr class="section-bar">
<td><span class="form-label">
&dtml-word_count; Words Found<dtml-if word_count>,
Displaying &dtml-start_word;-&dtml-end_word;
</dtml-if>
<dtml-if expr="page_count > 0">
</span></td>
<td align="right"><span class="form-label">
Page:
<select name="page:int" onchange="this.form.submit()">
<dtml-in name="page_range" prefix="page">
<option value="&dtml-page_item;"
<dtml-if expr="page == page_item">
selected
</dtml-if>
>
<dtml-var expr="page_item+1">
</option>
</dtml-in>
</select>
of &dtml-page_count;
<input type="submit" value="Go" />
<input type="hidden" name="cols:int" value="&dtml-cols;" />
<input type="hidden" name="rows:int" value="&dtml-rows;" />
<input type="hidden" name="words:tokens" value="&dtml-words_str;" />
</dtml-if>
</span></td>
</tr>
</table>
</form>
</dtml-let>
<dtml-if name="page_columns">
<table width="100%" cellpadding="0" cellspacing="10" border="0">
<tr>
<dtml-in name="page_columns" prefix="column">
<td align="left" valign="top">
<dtml-var expr="'<br />'.join(column_item)">
</td>
</dtml-in>
</tr>
</table>
</dtml-if>
<dtml-var manage_page_footer>
src/Products/ZCTextIndex/help/Lexicon_Add.stx deleted 100644 → 0
ZCTextIndex Lexicon - Add: Create a new ZCTextIndex Lexicon
Description
This view allows you to create a new ZCTextIndex Lexicon object.
ZCTextIndex Lexicons store the words indexed by ZCTextIndexes in a
ZCatalog.
Controls
'Id' -- Allows you to specify the id of the ZCTextIndex Lexicon.
'Title' -- Allows you to specify the title of the ZCTextIndex Lexicon.
Pipeline Stages
The remaining controls allow you to select the desired processing
of text to index by selecting pipeline stages.
The default available stages are:
- **Word Splitter** This is the only mandatory stage. The word
splitter breaks the text up into a list of words. Included is a
simple whitespace splitter, and a splitter that removes HTML
tags. The HTML aware splitter gives best results when all of
the incoming content to index is HTML.
- **Stop Words** To conserve space in the vocabulary, and possibly
increase performance, you can select a stop word remover which
subtracts very common or single letter words from the Lexicon.
Bear in mind that you will not be able to search on removed stop
words, and they will also be removed from queries passed to
search ZCTextIndexes using the Lexicon.
- **Case Normalizer** The case normalizer removes case information
from the words in the Lexicon. If case-sensitive searching is
desired, then omit this element from the pipeline.
src/Products/ZCTextIndex/help/ZCTextIndex_Add.stx deleted 100644 → 0
ZCTextIndex Add: Create a new ZCTextIndex
Description
A ZCTextIndex is an index for performing full text searches over
bodies of text. It includes the following features:
- Boolean query operators with parenthetical grouping
- Globbing (partial word) and phrase matching
- Two selectable relevance scoring algorithms
ZCTextIndex is designed as a replacement for standard TextIndex, and
has several advantages over it.
Controls
'Id' -- The id of the ZCTextIndex, must be unique for this ZCatalog.
'Field Name' -- The name of the field (object attribute) to be indexed.
'Ranking Strategy'
- **Okapi BM25 Rank** A relevance scoring technique that seems to
work well when the document text is considerably longer than the
query string, which is often the case with user specified query
strings.
- **Cosine Measure** A relevance scoring technique derived from the
"*Managing Gigabytes*":http://www.cs.mu.oz.au/mg/ book. It seems
to work best when the queries are similar in size and content to
the text they are searching.
'Lexicon' -- The ZCTextIndex Lexicon to be used by this ZCTextIndex.
Lexicons process and store the words from the text and
help in processing queries. You must define a ZCTextIndex
Lexicon before you can create a ZCTextIndex. Several
ZCTextIndexes can share the same Lexicon if desired.
src/Products/ZCTextIndex/interfaces.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2005 Zope Foundation and Contributors.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""ZCTextIndex z3 interfaces.
$Id$
"""
from zope.interface import Interface

class IZCTextIndex(Interface):
    """Persistent text index.
    """

class ILexicon(Interface):
    """Object responsible for converting text to word identifiers.
    """

    def termToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parse the text as if it consists of search terms, and skip
        words that aren't in the lexicon.
        """

    def sourceToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parse the text as if it comes from a source document, and
        create new word ids for words that aren't (yet) in the
        lexicon.
        """

    def globToWordIds(pattern):
        """Return a sequence of ids of words matching the pattern.

        The argument should be a single word using globbing syntax,
        e.g. 'foo*' meaning anything starting with 'foo'.

        Return the wids for all words in the lexicon that match the
        pattern.
        """

    def length():
        """Return the number of unique terms in the lexicon.
        """

    def get_word(wid):
        """Return the word for the given word id.

        Raise KeyError if the word id is not in the lexicon.
        """

    def get_wid(word):
        """Return the word id for the given word.

        Return 0 if the word is not in the lexicon.
        """

    def parseTerms(text):
        """Pass the text through the pipeline.

        Return a list of words, normalized by the pipeline
        (e.g. stopwords removed, case normalized etc.).
        """

    def isGlob(word):
        """Return true if the word is a globbing pattern.

        The word should be one of the words returned by parseTerms().
        """

class IZCLexicon(Interface):
    """Lexicon for ZCTextIndex.
    """

class ISplitter(Interface):
    """A splitter."""

    def process(text):
        """Run the splitter over the input text, returning a list of terms.
        """

class IPipelineElement(Interface):

    def process(source):
        """Provide a text processing step.

        Process a source sequence of words into a result sequence.
        """

    def processGlob(source):
        """Process, passing through globbing metacharacters.

        This is an optional method; if it is not used, process() is used.
        """

class IPipelineElementFactory(Interface):
    """Class for creating pipeline elements by name"""

    def registerFactory(group, name, factory):
        """Registers a pipeline factory by name and element group.

        Each name can be registered only once for a given group.
        Duplicate registrations will raise a ValueError.
        """

    def getFactoryGroups():
        """Returns a sorted list of element group names.
        """

    def getFactoryNames(group):
        """Returns a sorted list of registered pipeline factory names
        in the specified element group.
        """

    def instantiate(group, name):
        """Instantiates a pipeline element by group and name. If name is
        not registered, raise a KeyError.
        """

class IQueryParseTree(Interface):
    """Interface for parse trees returned by parseQuery()."""

    def nodeType():
        """Return the node type.

        This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
        """

    def getValue():
        """Return a node-type specific value.

        For node type:    Return:
        'AND'             a list of parse trees
        'OR'              a list of parse trees
        'NOT'             a parse tree
        'ATOM'            a string (representing a single search term)
        'PHRASE'          a string (representing a search phrase)
        'GLOB'            a string (representing a pattern, e.g. "foo*")
        """

    def terms():
        """Return a list of all terms in this node, excluding NOT subtrees."""

    def executeQuery(index):
        """Execute the query represented by this node against the index.

        The index argument must implement the IIndex interface.

        Return an IIBucket or IIBTree mapping document ids to scores
        (higher scores mean better results).

        May raise ParseTree.QueryError.
        """

class IQueryParser(Interface):
    """Interface for Query Parsers."""

    def parseQuery(query):
        """Parse a query string.

        Return a parse tree (which implements IQueryParseTree).

        Some of the query terms may be ignored because they are
        stopwords; use getIgnored() to find out which terms were
        ignored.  But if the entire query consists only of stop words,
        or of stopwords and one or more negated terms, an exception is
        raised.

        May raise ParseTree.ParseError.
        """

    def getIgnored():
        """Return the list of ignored terms.

        Return the list of terms that were ignored by the most recent
        call to parseQuery() because they were stopwords.

        If parseQuery() was never called this returns None.
        """

    def parseQueryEx(query):
        """Parse a query string.

        Return a tuple (tree, ignored) where 'tree' is the parse tree
        as returned by parseQuery(), and 'ignored' is a list of
        ignored terms as returned by getIgnored().

        May raise ParseTree.ParseError.
        """

class IIndex(Interface):
    """Interface for an Index."""

    def length():
        """Return the number of words in the index."""

    def document_count():
        """Return the number of documents in the index."""

    def get_words(docid):
        """Return a list of wordids for the given docid."""

    def search(term):
        """Execute a search on a single term given as a string.

        Return an IIBTree mapping docid to score, or None if all docs
        match due to the lexicon returning no wids for the term (e.g.,
        if the term is entirely composed of stopwords).
        """

    def search_phrase(phrase):
        """Execute a search on a phrase given as a string.

        Return an IIBTree mapping docid to score.
        """

    def search_glob(pattern):
        """Execute a pattern search.

        The pattern represents a set of words by using * and ?.  For
        example, "foo*" represents the set of all words in the lexicon
        starting with "foo".

        Return an IIBTree mapping docid to score.
        """

    def query_weight(terms):
        """Return the weight for a set of query terms.

        'terms' is a sequence of all terms included in the query,
        although not terms with a not.  If a term appears more than
        once in a query, it should appear more than once in terms.

        Nothing is defined about what "weight" means, beyond that the
        result is an upper bound on document scores returned for the
        query.
        """

    def index_doc(docid, text):
        """Add a document with the specified id and text to the index.  If a
        document by that id already exists, replace its text with the new
        text provided.

        text may be either a string (Unicode or otherwise) or a list
        of strings from which to extract the terms under which to
        index the source document.
        """

    def unindex_doc(docid):
        """Remove the document with the specified id from the index"""

    def has_doc(docid):
        """Returns true if docid is an id of a document in the index"""

class INBest(Interface):
    """NBest chooser Interface.

    An NBest object remembers the N best-scoring items ever passed to its
    .add(item, score) method.  If .add() is called M times, the worst-case
    number of comparisons performed overall is M * log2(N).
    """

    def add(item, score):
        """Record that item 'item' has score 'score'.  No return value.

        The N best-scoring items are remembered, where N was passed to
        the constructor.  'item' can be anything.  'score' should be
        a number, and larger numbers are considered better.
        """

    def addmany(sequence):
        """Like "for item, score in sequence: self.add(item, score)".

        This is simply faster than calling add() len(seq) times.
        """

    def getbest():
        """Return the (at most) N best-scoring items as a sequence.

        The return value is a sequence of 2-tuples, (item, score), with
        the largest score first.  If .add() has been called fewer than
        N times, this sequence will contain fewer than N pairs.
        """

    def pop_smallest():
        """Return and remove the (item, score) pair with lowest score.

        If len(self) is 0, raise IndexError.

        To be clearer, this is the lowest score among the N best-scoring
        seen so far.  This is most useful if the capacity of the NBest
        object is never exceeded, in which case pop_smallest() allows
        using the object as an ordinary smallest-in-first-out priority
        queue.
        """

    def __len__():
        """Return the number of (item, score) pairs currently known.

        This is N (the value passed to the constructor), unless .add()
        has been called fewer than N times.
        """

    def capacity():
        """Return the maximum number of (item, score) pairs.

        This is N (the value passed to the constructor).
        """
src/Products/ZCTextIndex/okascore.c deleted 100644 → 0
/*****************************************************************************
  Copyright (c) 2002 Zope Foundation and Contributors.
  All Rights Reserved.

  This software is subject to the provisions of the Zope Public License,
  Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
  FOR A PARTICULAR PURPOSE
 ****************************************************************************/

/* okascore.c
 *
 * The inner scoring loop of OkapiIndex._search_wids() coded in C.
 *
 * Example from an indexed Python-Dev archive, where "python" shows up in all
 * but 2 of the 19,058 messages.  With the Python scoring loop,
 *
 *     query: python
 *     # results: 10 of 19056 in 534.77 ms
 *     query: python
 *     # results: 10 of 19056 in 277.52 ms
 *
 * The first timing is cold, the second timing from an immediate repeat of
 * the same query.  With the scoring loop here in C:
 *
 *     query: python
 *     # results: 10 of 19056 in 380.74 ms -- 40% speedup
 *     query: python
 *     # results: 10 of 19056 in 118.96 ms -- 133% speedup
 */

#include "Python.h"

#define K1 1.2
#define B 0.75

#ifndef PyTuple_CheckExact
#define PyTuple_CheckExact PyTuple_Check
#endif

static PyObject *
score(PyObject *self, PyObject *args)
{
    /* Believe it or not, floating these common subexpressions "by hand"
       gets better code out of MSVC 6. */
    const double B_FROM1 = 1.0 - B;
    const double K1_PLUS1 = K1 + 1.0;

    /* Inputs */
    PyObject *result;   /* IIBucket result, maps d to score */
    PyObject *d2fitems; /* ._wordinfo[t].items(), maps d to f(d, t) */
    PyObject *d2len;    /* ._docweight, maps d to # words in d */
    double idf;         /* inverse doc frequency of t */
    double meandoclen;  /* average number of words in a doc */

    int n, i;

    if (!PyArg_ParseTuple(args, "OOOdd:score",
                          &result, &d2fitems, &d2len, &idf, &meandoclen))
        return NULL;

    idf *= 1024.0;  /* float out part of the scaled_int computation */
    n = PyObject_Length(d2fitems);
    for (i = 0; i < n; ++i) {
        PyObject *d_and_f;  /* d2f[i], a (d, f) pair */
        PyObject *d;
        double f;
        PyObject *doclen;   /* ._docweight[d] */
        double lenweight;
        double tf;
        PyObject *scaled_int;
        int status;

        d_and_f = PySequence_GetItem(d2fitems, i);
        if (d_and_f == NULL)
            return NULL;
        if (!(PyTuple_CheckExact(d_and_f) &&
              PyTuple_GET_SIZE(d_and_f) == 2)) {
            PyErr_SetString(PyExc_TypeError,
                            "d2fitems must produce 2-item tuples");
            Py_DECREF(d_and_f);
            return NULL;
        }
        d = PyTuple_GET_ITEM(d_and_f, 0);
        f = (double)PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));

        doclen = PyObject_GetItem(d2len, d);
        if (doclen == NULL) {
            Py_DECREF(d_and_f);
            return NULL;
        }

        lenweight = B_FROM1 + B * PyInt_AS_LONG(doclen) / meandoclen;

        tf = f * K1_PLUS1 / (f + K1 * lenweight);
        scaled_int = PyInt_FromLong((long)(tf * idf + 0.5));
        if (scaled_int == NULL)
            status = -1;
        else
            status = PyObject_SetItem(result, d, scaled_int);
        Py_DECREF(d_and_f);
        Py_DECREF(doclen);
        Py_XDECREF(scaled_int);
        if (status < 0)
            return NULL;
    }
    Py_INCREF(Py_None);
    return Py_None;
}

static char score__doc__[] =
"score(result, d2fitems, d2len, idf, meandoclen)\n"
"\n"
"Do the inner scoring loop for an Okapi index.\n";

static PyMethodDef okascore_functions[] = {
    {"score", score, METH_VARARGS, score__doc__},
    {NULL}
};

void
initokascore(void)
{
    PyObject *m;

    m = Py_InitModule3("okascore", okascore_functions,
                       "inner scoring loop for Okapi rank");
}
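In Python terms, the loop above computes the classic Okapi BM25 term weight. Here is a hedged sketch of the same arithmetic; variable names follow the C code, but plain dicts stand in for the IIBucket/BTree containers.

K1 = 1.2
B = 0.75

def score(result, d2fitems, d2len, idf, meandoclen):
    # Same arithmetic as the C loop: BM25 term-frequency damping with
    # document-length normalization, scaled by 1024 and rounded so the
    # scores stay small integers.
    idf *= 1024.0
    for d, f in d2fitems:
        lenweight = (1.0 - B) + B * d2len[d] / meandoclen
        tf = f * (K1 + 1.0) / (f + K1 * lenweight)
        result[d] = int(tf * idf + 0.5)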
src/Products/ZCTextIndex/stopper.c deleted 100644 → 0
/*****************************************************************************
  Copyright (c) 2002 Zope Foundation and Contributors.
  All Rights Reserved.

  This software is subject to the provisions of the Zope Public License,
  Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
  FOR A PARTICULAR PURPOSE
 ****************************************************************************/

/* stopper.c
 *
 * Fast version of the StopWordRemover object.
 */

#include "Python.h"

static PyObject *
stopper_process(PyObject *unused, PyObject *args)
{
    PyObject *result = NULL;
    PyObject *dict;
    PyObject *seq;
    int len, i;

    if (!PyArg_ParseTuple(args, "O!O:process", &PyDict_Type, &dict, &seq))
        return NULL;
    seq = PySequence_Fast(seq,
                          "process() requires a sequence as argument 2");
    if (seq == NULL)
        return NULL;
    result = PyList_New(0);
    if (result == NULL)
        goto finally;
#if PY_VERSION_HEX >= 0x02020000
    /* Only available in Python 2.2 and newer. */
    len = PySequence_Fast_GET_SIZE(seq);
#else
    len = PyObject_Length(seq);
#endif
    for (i = 0; i < len; ++i) {
        PyObject *s = PySequence_Fast_GET_ITEM(seq, i);
        /*
         * PyDict_GetItem() returns NULL if there isn't a matching
         * item, but without setting an exception, so this does what
         * we want.
         */
        if (PyDict_GetItem(dict, s) == NULL) {
            if (PyList_Append(result, s) < 0) {
                Py_DECREF(result);
                result = NULL;
                goto finally;
            }
        }
    }
 finally:
    Py_DECREF(seq);
    return result;
}

static PyMethodDef stopper_functions[] = {
    {"process", stopper_process, METH_VARARGS,
     "process(dict, [str, ...]) --> [str, ...]\n"
     "Remove stop words (the keys of dict) from the input list of strings\n"
     " to create a new list."},
    {NULL}
};

void
initstopper(void)
{
    Py_InitModule3("stopper", stopper_functions,
                   "Fast StopWordRemover implementation.");
}
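The pure-Python behaviour this module accelerates is just a dictionary-membership filter. A hedged one-function equivalent (the function name mirrors the C entry point):

def process(stopdict, seq):
    # Equivalent of stopper.process(dict, seq): keep only the words
    # that are not keys of the stop-word dictionary.
    return [w for w in seq if not stopdict.has_key(w)]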
src/Products/ZCTextIndex/tests/__init__.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Test package."""
src/Products/ZCTextIndex/tests/hs-tool.py deleted 100755 → 0
#! /usr/bin/env python

import cPickle
import os.path
import sys

from hotshot.log import LogReader

def load_line_info(log):
    byline = {}
    prevloc = None
    for what, place, tdelta in log:
        if tdelta > 0:
            t, nhits = byline.get(prevloc, (0, 0))
            byline[prevloc] = (tdelta + t), (nhits + 1)
        prevloc = place
    return byline

def basename(path, cache={}):
    try:
        return cache[path]
    except KeyError:
        fn = os.path.split(path)[1]
        cache[path] = fn
        return fn

def print_results(results):
    for info, place in results:
        if place is None:
            # This is the startup time for the profiler, and only
            # occurs at the very beginning.  Just ignore it, since it
            # corresponds to frame setup of the outermost call, not
            # anything that's actually interesting.
            continue
        filename, line, funcname = place
        print '%8d %8d' % info, basename(filename), line

def annotate_results(results):
    files = {}
    for stats, place in results:
        if not place:
            continue
        time, hits = stats
        file, line, func = place
        l = files.get(file)
        if l is None:
            l = files[file] = []
        l.append((line, hits, time))
    order = files.keys()
    order.sort()
    for k in order:
        if os.path.exists(k):
            v = files[k]
            v.sort()
            annotate(k, v)

def annotate(file, lines):
    print "-" * 60
    print file
    print "-" * 60
    f = open(file)
    i = 1
    match = lines[0][0]
    for line in f:
        if match == i:
            print "%6d %8d " % lines[0][1:], line,
            del lines[0]
            if lines:
                match = lines[0][0]
            else:
                match = None
        else:
            print " " * 16, line,
        i += 1
    print

def get_cache_name(filename):
    d, fn = os.path.split(filename)
    cache_dir = os.path.join(d, '.hs-tool')
    cache_file = os.path.join(cache_dir, fn)
    return cache_dir, cache_file

def cache_results(filename, results):
    cache_dir, cache_file = get_cache_name(filename)
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    fp = open(cache_file, 'wb')
    try:
        cPickle.dump(results, fp, 1)
    finally:
        fp.close()

def main(filename, annotate):
    cache_dir, cache_file = get_cache_name(filename)
    if (os.path.isfile(cache_file)
        and os.path.getmtime(cache_file) > os.path.getmtime(filename)):
        # cached data is up-to-date:
        fp = open(cache_file, 'rb')
        results = cPickle.load(fp)
        fp.close()
    else:
        log = LogReader(filename)
        byline = load_line_info(log)
        # Sort
        results = [(v, k) for k, v in byline.items()]
        results.sort()
        cache_results(filename, results)
    if annotate:
        annotate_results(results)
    else:
        print_results(results)

if __name__ == "__main__":
    import getopt
    annotate_p = 0
    opts, args = getopt.getopt(sys.argv[1:], 'A')
    for o, v in opts:
        if o == '-A':
            annotate_p = 1
    if args:
        filename, = args
    else:
        filename = "profile.dat"
    main(filename, annotate_p)
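hs-tool.py expects a hotshot log recorded with line events. A hedged sketch of producing one (the profiled function is made up; the Profile arguments mirror the ones mailtest.py uses):

import hotshot

def work():
    # Hypothetical workload to profile.
    total = 0
    for i in range(1000):
        total += i
    return total

prof = hotshot.Profile("profile.dat", lineevents=1, linetimings=1)
prof.runcall(work)
prof.close()
# then:  python hs-tool.py -A profile.dat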
src/Products/ZCTextIndex/tests/indexhtml.py deleted 100644 → 0
#! /usr/bin/env python
"""Index a collection of HTML files on the filesystem.

usage: indexhtml.py [options] dir

Will create an index of all files in dir or its subdirectories.

options:
-f data.fs -- the path to the filestorage datafile
"""
# XXX: Products.PluginIndexes.TextIndex no longer exists

from __future__ import nested_scopes

import os
from time import clock

import ZODB
from ZODB.FileStorage import FileStorage
from BTrees.IOBTree import IOBTree
import transaction

from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
from Products.ZCTextIndex.Lexicon import Lexicon, StopWordRemover

def make_zc_index():
    # there's an elaborate dance necessary to construct an index
    class Struct:
        pass
    extra = Struct()
    extra.doc_attr = "read"
    extra.lexicon_id = "lexicon"
    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
    return ZCTextIndex("read", extra, caller)

# XXX make a splitter more like the HTMLSplitter for TextIndex
# signature is
# Splitter(string, stop_words, encoding,
#          singlechar, indexnumbers, casefolding)

class MySplitter:
    def __init__(self):
        self._v_splitter = HTMLWordSplitter()
    def __call__(self, text, stopdict, *args, **kwargs):
        words = self._v_splitter._split(text)
        def lookup(w):
            return stopdict.get(w, w)
        return filter(None, map(lookup, words))

#def make_old_index():
#    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
#    from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
#    from Products.ZCTextIndex.StopDict import get_stopdict
#
#    l = Lexicon(get_stopdict())
#    l.SplitterFunc = MySplitter()
#    return TextIndex("read", lexicon=l)

def main(db, rt, dir):
    rt["index"] = index = INDEX()
    rt["files"] = paths = IOBTree()
    transaction.commit()

    zodb_time = 0.0
    pack_time = 0.0

    files = [os.path.join(dir, file) for file in os.listdir(dir)]
    docid = 0
    t0 = clock()
    for file in files:
        if os.path.isdir(file):
            files += [os.path.join(file, sub) for sub in os.listdir(file)]
        else:
            if not file.endswith(".html"):
                continue
            docid += 1
            if LIMIT is not None and docid > LIMIT:
                break
            if VERBOSE:
                print "%5d" % docid, file
            f = open(file, "rb")
            paths[docid] = file
            index.index_object(docid, f)
            f.close()
            if docid % TXN_INTERVAL == 0:
                z0 = clock()
                transaction.commit()
                z1 = clock()
                zodb_time += z1 - z0
                if VERBOSE:
                    print "commit took", z1 - z0, zodb_time
            if docid % PACK_INTERVAL == 0:
                p0 = clock()
                db.pack()
                p1 = clock()
                zodb_time += p1 - p0
                pack_time += p1 - p0
                if VERBOSE:
                    print "pack took", p1 - p0, pack_time
    z0 = clock()
    transaction.commit()
    z1 = t1 = clock()
    total_time = t1 - t0
    zodb_time += z1 - z0
    if VERBOSE:
        print "Total index time", total_time
        print "Non-pack time", total_time - pack_time
        print "Non-ZODB time", total_time - zodb_time

if __name__ == "__main__":
    import sys
    import getopt

    VERBOSE = 0
    FSPATH = "Data.fs"
    TXN_INTERVAL = 100
    PACK_INTERVAL = 500
    LIMIT = None
    INDEX = make_zc_index
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'vf:t:p:n:T')
    except getopt.error, msg:
        print msg
        print __doc__
        sys.exit(2)
    for o, v in opts:
        if o == '-v':
            VERBOSE += 1
        if o == '-f':
            FSPATH = v
        if o == '-t':
            TXN_INTERVAL = int(v)
        if o == '-p':
            PACK_INTERVAL = int(v)
        if o == '-n':
            LIMIT = int(v)
#        if o == '-T':
#            INDEX = make_old_index
    if len(args) != 1:
        print "Expected one argument"
        print __doc__
        sys.exit(2)
    dir = args[0]
    fs = FileStorage(FSPATH)
    db = ZODB.DB(fs)
    cn = db.open()
    rt = cn.root()
    dir = os.path.join(os.getcwd(), dir)
    print dir
    main(db, rt, dir)
    cn.close()
    fs.close()
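Once indexhtml.py has run, the Data.fs can be reopened and queried later. A hedged sketch, assuming only what the script writes ("index" and "files" root keys); the query string is made up:

import ZODB
from ZODB.FileStorage import FileStorage

fs = FileStorage("Data.fs", read_only=1)
db = ZODB.DB(fs)
rt = db.open().root()
index, paths = rt["index"], rt["files"]
results, num = index.query("python AND unicode")
for docid, score in results:
    print score, paths[docid]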
src/Products/ZCTextIndex/tests/mailtest.py deleted 100644 → 0
"""Test an index with a Unix mailbox file.
usage: python mailtest.py [options] <data.fs>
options:
-v -- verbose
Index Generation
-i mailbox
-n NNN -- max number of messages to read from mailbox
-t NNN -- commit a transaction every NNN messages (default: 1)
-p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
-p 0 -- don't pack at all
-x -- exclude the message text from the data.fs
Queries
-q query
-b NNN -- return the NNN best matches (default: 10)
-c NNN -- context; if -v, show the first NNN lines of results (default: 5)
The script either indexes or queries depending on whether -q or -i is
passed as an option.
For -i mailbox, the script reads mail messages from the mailbox and
indexes them. It indexes one message at a time, then commits the
transaction.
For -q query, it performs a query on an existing index.
If both are specified, the index is performed first.
You can also interact with the index after it is completed. Load the
index from the database:
import ZODB
from ZODB.FileStorage import FileStorage
fs = FileStorage(<data.fs>
db = ZODB.DB(fs)
index = cn.open().root()["index"]
index.search("python AND unicode")
"""
import
ZODB
import
ZODB.FileStorage
import
transaction
from
Products.ZCTextIndex.Lexicon
import
\
Lexicon
,
CaseNormalizer
,
Splitter
,
StopWordRemover
from
Products.ZCTextIndex.ZCTextIndex
import
ZCTextIndex
from
BTrees.IOBTree
import
IOBTree
from
Products.ZCTextIndex.QueryParser
import
QueryParser
import
sys
import
mailbox
import
time
def
usage
(
msg
):
print
msg
print
__doc__
sys
.
exit
(
2
)
class
Message
:
total_bytes
=
0
def
__init__
(
self
,
msg
):
subject
=
msg
.
getheader
(
'subject'
,
''
)
author
=
msg
.
getheader
(
'from'
,
''
)
if
author
:
summary
=
"%s (%s)
\
n
"
%
(
subject
,
author
)
else
:
summary
=
"%s
\
n
"
%
subject
self
.
text
=
summary
+
msg
.
fp
.
read
()
Message
.
total_bytes
+=
len
(
self
.
text
)
class
Extra
:
pass
def
index
(
rt
,
mboxfile
,
db
,
profiler
):
global
NUM
idx_time
=
0
pack_time
=
0
start_time
=
time
.
time
()
lexicon
=
Lexicon
(
Splitter
(),
CaseNormalizer
(),
StopWordRemover
())
extra
=
Extra
()
extra
.
lexicon_id
=
'lexicon'
extra
.
doc_attr
=
'text'
extra
.
index_type
=
'Okapi BM25 Rank'
caller
=
Extra
()
caller
.
lexicon
=
lexicon
rt
[
"index"
]
=
idx
=
ZCTextIndex
(
"index"
,
extra
,
caller
)
if
not
EXCLUDE_TEXT
:
rt
[
"documents"
]
=
docs
=
IOBTree
()
else
:
docs
=
None
transaction
.
commit
()
mbox
=
mailbox
.
UnixMailbox
(
open
(
mboxfile
,
'rb'
))
if
VERBOSE
:
print
"opened"
,
mboxfile
if
not
NUM
:
NUM
=
sys
.
maxint
if
profiler
:
itime
,
ptime
,
i
=
profiler
.
runcall
(
indexmbox
,
mbox
,
idx
,
docs
,
db
)
else
:
itime
,
ptime
,
i
=
indexmbox
(
mbox
,
idx
,
docs
,
db
)
idx_time
+=
itime
pack_time
+=
ptime
transaction
.
commit
()
if
PACK_INTERVAL
and
i
%
PACK_INTERVAL
!=
0
:
if
VERBOSE
>=
2
:
print
"packing one last time..."
p0
=
time
.
clock
()
db
.
pack
(
time
.
time
())
p1
=
time
.
clock
()
if
VERBOSE
:
print
"pack took %s sec"
%
(
p1
-
p0
)
pack_time
+=
p1
-
p0
if
VERBOSE
:
finish_time
=
time
.
time
()
print
print
"Index time"
,
round
(
idx_time
/
60
,
3
),
"minutes"
print
"Pack time"
,
round
(
pack_time
/
60
,
3
),
"minutes"
print
"Index bytes"
,
Message
.
total_bytes
rate
=
(
Message
.
total_bytes
/
idx_time
)
/
1024
print
"Index rate %.2f KB/sec"
%
rate
print
"Indexing began"
,
time
.
ctime
(
start_time
)
print
"Indexing ended"
,
time
.
ctime
(
finish_time
)
print
"Wall clock minutes"
,
round
((
finish_time
-
start_time
)
/
60
,
3
)
def
indexmbox
(
mbox
,
idx
,
docs
,
db
):
idx_time
=
0
pack_time
=
0
i
=
0
while
i
<
NUM
:
_msg
=
mbox
.
next
()
if
_msg
is
None
:
break
i
+=
1
msg
=
Message
(
_msg
)
if
VERBOSE
>=
2
:
print
"indexing msg"
,
i
i0
=
time
.
clock
()
idx
.
index_object
(
i
,
msg
)
if
not
EXCLUDE_TEXT
:
docs
[
i
]
=
msg
if
i
%
TXN_SIZE
==
0
:
transaction
.
commit
()
i1
=
time
.
clock
()
idx_time
+=
i1
-
i0
if
VERBOSE
and
i
%
50
==
0
:
print
i
,
"messages indexed"
print
"cache size"
,
db
.
cacheSize
()
if
PACK_INTERVAL
and
i
%
PACK_INTERVAL
==
0
:
if
VERBOSE
>=
2
:
print
"packing..."
p0
=
time
.
clock
()
db
.
pack
(
time
.
time
())
p1
=
time
.
clock
()
if
VERBOSE
:
print
"pack took %s sec"
%
(
p1
-
p0
)
pack_time
+=
p1
-
p0
return
idx_time
,
pack_time
,
i
def
query
(
rt
,
query_str
,
profiler
):
idx
=
rt
[
"index"
]
docs
=
rt
[
"documents"
]
start
=
time
.
clock
()
if
profiler
is
None
:
results
,
num_results
=
idx
.
query
(
query_str
,
BEST
)
else
:
if
WARM_CACHE
:
print
"Warming the cache..."
idx
.
query
(
query_str
,
BEST
)
start
=
time
.
clock
()
results
,
num_results
=
profiler
.
runcall
(
idx
.
query
,
query_str
,
BEST
)
elapsed
=
time
.
clock
()
-
start
print
"query:"
,
query_str
print
"# results:"
,
len
(
results
),
"of"
,
num_results
,
\
"in %.2f ms"
%
(
elapsed
*
1000
)
tree
=
QueryParser
(
idx
.
lexicon
).
parseQuery
(
query_str
)
qw
=
idx
.
index
.
query_weight
(
tree
.
terms
())
for
docid
,
score
in
results
:
scaled
=
100.0
*
score
/
qw
print
"docid %7d score %6d scaled %5.2f%%"
%
(
docid
,
score
,
scaled
)
if
VERBOSE
:
msg
=
docs
[
docid
]
ctx
=
msg
.
text
.
split
(
"
\
n
"
,
CONTEXT
)
del
ctx
[
-
1
]
print
"-"
*
60
print
"message:"
for
l
in
ctx
:
print
l
print
"-"
*
60
def
main
(
fs_path
,
mbox_path
,
query_str
,
profiler
):
f
=
ZODB
.
FileStorage
.
FileStorage
(
fs_path
)
db
=
ZODB
.
DB
(
f
,
cache_size
=
CACHE_SIZE
)
cn
=
db
.
open
()
rt
=
cn
.
root
()
if
mbox_path
is
not
None
:
index
(
rt
,
mbox_path
,
db
,
profiler
)
if
query_str
is
not
None
:
query
(
rt
,
query_str
,
profiler
)
cn
.
close
()
db
.
close
()
f
.
close
()
if
__name__
==
"__main__"
:
import
getopt
NUM
=
0
VERBOSE
=
0
PACK_INTERVAL
=
500
EXCLUDE_TEXT
=
0
CACHE_SIZE
=
10000
TXN_SIZE
=
1
BEST
=
10
CONTEXT
=
5
WARM_CACHE
=
0
query_str
=
None
mbox_path
=
None
profile
=
None
old_profile
=
None
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'vn:p:i:q:b:c:xt:w'
,
[
'profile='
,
'old-profile='
])
except
getopt
.
error
,
msg
:
usage
(
msg
)
if
len
(
args
)
!=
1
:
usage
(
"exactly 1 filename argument required"
)
for
o
,
v
in
opts
:
if
o
==
'-n'
:
NUM
=
int
(
v
)
elif
o
==
'-v'
:
VERBOSE
+=
1
elif
o
==
'-p'
:
PACK_INTERVAL
=
int
(
v
)
elif
o
==
'-q'
:
query_str
=
v
elif
o
==
'-i'
:
mbox_path
=
v
elif
o
==
'-b'
:
BEST
=
int
(
v
)
elif
o
==
'-x'
:
EXCLUDE_TEXT
=
1
elif
o
==
'-t'
:
TXN_SIZE
=
int
(
v
)
elif
o
==
'-c'
:
CONTEXT
=
int
(
v
)
elif
o
==
'-w'
:
WARM_CACHE
=
1
elif
o
==
'--profile'
:
profile
=
v
elif
o
==
'--old-profile'
:
old_profile
=
v
fs_path
,
=
args
if
profile
:
import
hotshot
profiler
=
hotshot
.
Profile
(
profile
,
lineevents
=
1
,
linetimings
=
1
)
elif
old_profile
:
import
profile
profiler
=
profile
.
Profile
()
else
:
profiler
=
None
main
(
fs_path
,
mbox_path
,
query_str
,
profiler
)
if
profile
:
profiler
.
close
()
elif
old_profile
:
import
pstats
profiler
.
dump_stats
(
old_profile
)
stats
=
pstats
.
Stats
(
old_profile
)
stats
.
strip_dirs
().
sort_stats
(
'time'
).
print_stats
(
20
)
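A hedged pair of invocations, assembled from the option list in the docstring above (the mailbox path is made up):

python mailtest.py -v -i ~/Mail/inbox data.fs         # build the index
python mailtest.py -q "python AND unicode" data.fs    # query it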
src/Products/ZCTextIndex/tests/mhindex.py deleted 100644 → 0
"""MH mail indexer.
To index messages from a single folder (messages defaults to 'all'):
mhindex.py [options] -u +folder [messages ...]
To bulk index all messages from several folders:
mhindex.py [options] -b folder ...; the folder name ALL means all folders.
To execute a single query:
mhindex.py [options] query
To enter interactive query mode:
mhindex.py [options]
Common options:
-d FILE -- specify the Data.fs to use (default ~/.Data.fs)
-w -- dump the word list in alphabetical order and exit
-W -- dump the word list ordered by word id and exit
Indexing options:
-O -- do a prescan on the data to compute optimal word id assignments;
this is only useful the first time the Data.fs is used
-t N -- commit a transaction after every N messages (default 20000)
-p N -- pack after every N commits (by default no packing is done)
Querying options:
-m N -- show at most N matching lines from the message (default 3)
-n N -- show the N best matching messages (default 3)
"""
import
os
import
re
import
sys
import
time
import
mhlib
import
getopt
import
traceback
from
StringIO
import
StringIO
from
stat
import
ST_MTIME
DATAFS
=
"~/.Data.fs"
ZOPECODE
=
"~/projects/Zope/lib/python"
sys
.
path
.
append
(
os
.
path
.
expanduser
(
ZOPECODE
))
from
ZODB
import
DB
from
ZODB.FileStorage
import
FileStorage
from
Persistence
import
Persistent
from
BTrees.IOBTree
import
IOBTree
from
BTrees.OIBTree
import
OIBTree
from
BTrees.IIBTree
import
IIBTree
import
transaction
from
Products.ZCTextIndex.NBest
import
NBest
from
Products.ZCTextIndex.OkapiIndex
import
OkapiIndex
from
Products.ZCTextIndex.Lexicon
import
Lexicon
,
Splitter
from
Products.ZCTextIndex.Lexicon
import
CaseNormalizer
,
StopWordRemover
from
Products.ZCTextIndex.QueryParser
import
QueryParser
from
Products.ZCTextIndex.StopDict
import
get_stopdict
NBEST
=
3
MAXLINES
=
3
def
main
():
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
"bd:fhm:n:Op:t:uwW"
)
except
getopt
.
error
,
msg
:
print
msg
print
"use -h for help"
return
2
update
=
0
bulk
=
0
optimize
=
0
nbest
=
NBEST
maxlines
=
MAXLINES
datafs
=
os
.
path
.
expanduser
(
DATAFS
)
pack
=
0
trans
=
20000
dumpwords
=
dumpwids
=
dumpfreqs
=
0
for
o
,
a
in
opts
:
if
o
==
"-b"
:
bulk
=
1
if
o
==
"-d"
:
datafs
=
a
if
o
==
"-f"
:
dumpfreqs
=
1
if
o
==
"-h"
:
print
__doc__
return
if
o
==
"-m"
:
maxlines
=
int
(
a
)
if
o
==
"-n"
:
nbest
=
int
(
a
)
if
o
==
"-O"
:
optimize
=
1
if
o
==
"-p"
:
pack
=
int
(
a
)
if
o
==
"-t"
:
trans
=
int
(
a
)
if
o
==
"-u"
:
update
=
1
if
o
==
"-w"
:
dumpwords
=
1
if
o
==
"-W"
:
dumpwids
=
1
ix
=
Indexer
(
datafs
,
writable
=
update
or
bulk
,
trans
=
trans
,
pack
=
pack
)
if
dumpfreqs
:
ix
.
dumpfreqs
()
if
dumpwords
:
ix
.
dumpwords
()
if
dumpwids
:
ix
.
dumpwids
()
if
dumpwords
or
dumpwids
or
dumpfreqs
:
return
if
bulk
:
if
optimize
:
ix
.
optimize
(
args
)
ix
.
bulkupdate
(
args
)
elif
update
:
ix
.
update
(
args
)
elif
args
:
for
i
in
range
(
len
(
args
)):
a
=
args
[
i
]
if
" "
in
a
:
if
a
[
0
]
==
"-"
:
args
[
i
]
=
'-"'
+
a
[
1
:]
+
'"'
else
:
args
[
i
]
=
'"'
+
a
+
'"'
ix
.
query
(
" "
.
join
(
args
),
nbest
,
maxlines
)
else
:
ix
.
interact
(
nbest
)
if
pack
:
ix
.
pack
()
class
Indexer
:
filestorage
=
database
=
connection
=
root
=
None
def
__init__
(
self
,
datafs
,
writable
=
0
,
trans
=
0
,
pack
=
0
):
self
.
trans_limit
=
trans
self
.
pack_limit
=
pack
self
.
trans_count
=
0
self
.
pack_count
=
0
self
.
stopdict
=
get_stopdict
()
self
.
mh
=
mhlib
.
MH
()
self
.
filestorage
=
FileStorage
(
datafs
,
read_only
=
(
not
writable
))
self
.
database
=
DB
(
self
.
filestorage
)
self
.
connection
=
self
.
database
.
open
()
self
.
root
=
self
.
connection
.
root
()
try
:
self
.
index
=
self
.
root
[
"index"
]
except
KeyError
:
self
.
index
=
self
.
root
[
"index"
]
=
TextIndex
()
try
:
self
.
docpaths
=
self
.
root
[
"docpaths"
]
except
KeyError
:
self
.
docpaths
=
self
.
root
[
"docpaths"
]
=
IOBTree
()
try
:
self
.
doctimes
=
self
.
root
[
"doctimes"
]
except
KeyError
:
self
.
doctimes
=
self
.
root
[
"doctimes"
]
=
IIBTree
()
try
:
self
.
watchfolders
=
self
.
root
[
"watchfolders"
]
except
KeyError
:
self
.
watchfolders
=
self
.
root
[
"watchfolders"
]
=
{}
self
.
path2docid
=
OIBTree
()
for
docid
in
self
.
docpaths
.
keys
():
path
=
self
.
docpaths
[
docid
]
self
.
path2docid
[
path
]
=
docid
try
:
self
.
maxdocid
=
max
(
self
.
docpaths
.
keys
())
except
ValueError
:
self
.
maxdocid
=
0
print
len
(
self
.
docpaths
),
"Document ids"
print
len
(
self
.
path2docid
),
"Pathnames"
print
self
.
index
.
lexicon
.
length
(),
"Words"
def
dumpfreqs
(
self
):
lexicon
=
self
.
index
.
lexicon
index
=
self
.
index
.
index
assert
isinstance
(
index
,
OkapiIndex
)
L
=
[]
for
wid
in
lexicon
.
wids
():
freq
=
0
for
f
in
index
.
_wordinfo
.
get
(
wid
,
{}).
values
():
freq
+=
f
L
.
append
((
freq
,
wid
,
lexicon
.
get_word
(
wid
)))
L
.
sort
()
L
.
reverse
()
for
freq
,
wid
,
word
in
L
:
print
"%10d %10d %s"
%
(
wid
,
freq
,
word
)
def
dumpwids
(
self
):
lexicon
=
self
.
index
.
lexicon
index
=
self
.
index
.
index
assert
isinstance
(
index
,
OkapiIndex
)
for
wid
in
lexicon
.
wids
():
freq
=
0
for
f
in
index
.
_wordinfo
.
get
(
wid
,
{}).
values
():
freq
+=
f
print
"%10d %10d %s"
%
(
wid
,
freq
,
lexicon
.
get_word
(
wid
))
def
dumpwords
(
self
):
lexicon
=
self
.
index
.
lexicon
index
=
self
.
index
.
index
assert
isinstance
(
index
,
OkapiIndex
)
for
word
in
lexicon
.
words
():
wid
=
lexicon
.
get_wid
(
word
)
freq
=
0
for
f
in
index
.
_wordinfo
.
get
(
wid
,
{}).
values
():
freq
+=
f
print
"%10d %10d %s"
%
(
wid
,
freq
,
word
)
def
close
(
self
):
self
.
root
=
None
if
self
.
connection
is
not
None
:
self
.
connection
.
close
()
self
.
connection
=
None
if
self
.
database
is
not
None
:
self
.
database
.
close
()
self
.
database
=
None
if
self
.
filestorage
is
not
None
:
self
.
filestorage
.
close
()
self
.
filestorage
=
None
def
interact
(
self
,
nbest
=
NBEST
,
maxlines
=
MAXLINES
):
try
:
import
readline
except
ImportError
:
pass
text
=
""
top
=
0
results
=
[]
while
1
:
try
:
line
=
raw_input
(
"Query: "
)
except
EOFError
:
print
"
\
n
Bye."
break
line
=
line
.
strip
()
if
line
.
startswith
(
"/"
):
self
.
specialcommand
(
line
,
results
,
top
-
nbest
)
continue
if
line
:
text
=
line
top
=
0
else
:
if
not
text
:
continue
try
:
results
,
n
=
self
.
timequery
(
text
,
top
+
nbest
)
except
KeyboardInterrupt
:
raise
except
:
reportexc
()
text
=
""
continue
if
len
(
results
)
<=
top
:
if
not
n
:
print
"No hits for %r."
%
text
else
:
print
"No more hits for %r."
%
text
text
=
""
continue
print
"[Results %d-%d from %d"
%
(
top
+
1
,
min
(
n
,
top
+
nbest
),
n
),
print
"for query %s]"
%
repr
(
text
)
self
.
formatresults
(
text
,
results
,
maxlines
,
top
,
top
+
nbest
)
top
+=
nbest
def
specialcommand
(
self
,
line
,
results
,
first
):
assert
line
.
startswith
(
"/"
)
line
=
line
[
1
:]
if
not
line
:
n
=
first
else
:
try
:
n
=
int
(
line
)
-
1
except
:
print
"Huh?"
return
if
n
<
0
or
n
>=
len
(
results
):
print
"Out of range"
return
docid
,
score
=
results
[
n
]
path
=
self
.
docpaths
[
docid
]
i
=
path
.
rfind
(
"/"
)
assert
i
>
0
folder
=
path
[:
i
]
n
=
path
[
i
+
1
:]
cmd
=
"show +%s %s"
%
(
folder
,
n
)
if
os
.
getenv
(
"DISPLAY"
):
os
.
system
(
"xterm -e sh -c '%s | less' &"
%
cmd
)
else
:
os
.
system
(
cmd
)
def
query
(
self
,
text
,
nbest
=
NBEST
,
maxlines
=
MAXLINES
):
results
,
n
=
self
.
timequery
(
text
,
nbest
)
if
not
n
:
print
"No hits for %r."
%
text
return
print
"[Results 1-%d from %d]"
%
(
len
(
results
),
n
)
self
.
formatresults
(
text
,
results
,
maxlines
)
def
timequery
(
self
,
text
,
nbest
):
t0
=
time
.
time
()
c0
=
time
.
clock
()
results
,
n
=
self
.
index
.
query
(
text
,
nbest
)
t1
=
time
.
time
()
c1
=
time
.
clock
()
print
"[Query time: %.3f real, %.3f user]"
%
(
t1
-
t0
,
c1
-
c0
)
return
results
,
n
def
formatresults
(
self
,
text
,
results
,
maxlines
=
MAXLINES
,
lo
=
0
,
hi
=
sys
.
maxint
):
stop
=
self
.
stopdict
.
has_key
words
=
[
w
for
w
in
re
.
findall
(
r"\
w+
\*?"
,
text
.
lower
())
if
not
stop
(
w
)]
pattern
=
r"\b("
+
"|"
.
join
(
words
)
+
r")\b"
pattern
=
pattern
.
replace
(
"*"
,
".*"
)
# glob -> re syntax
prog
=
re
.
compile
(
pattern
,
re
.
IGNORECASE
)
print
'='
*
70
rank
=
lo
qw
=
self
.
index
.
query_weight
(
text
)
for
docid
,
score
in
results
[
lo
:
hi
]:
rank
+=
1
path
=
self
.
docpaths
[
docid
]
score
=
100.0
*
score
/
qw
print
"Rank: %d Score: %d%% File: %s"
%
(
rank
,
score
,
path
)
path
=
os
.
path
.
join
(
self
.
mh
.
getpath
(),
path
)
try
:
fp
=
open
(
path
)
except
(
IOError
,
OSError
),
msg
:
print
"Can't open:"
,
msg
continue
msg
=
mhlib
.
Message
(
"<folder>"
,
0
,
fp
)
for
header
in
"From"
,
"To"
,
"Cc"
,
"Bcc"
,
"Subject"
,
"Date"
:
h
=
msg
.
getheader
(
header
)
if
h
:
print
"%-8s %s"
%
(
header
+
":"
,
h
)
text
=
self
.
getmessagetext
(
msg
)
if
text
:
print
nleft
=
maxlines
for
part
in
text
:
for
line
in
part
.
splitlines
():
if
prog
.
search
(
line
):
print
line
nleft
-=
1
if
nleft
<=
0
:
break
if
nleft
<=
0
:
break
print
'-'
*
70
def
update
(
self
,
args
):
folder
=
None
seqs
=
[]
for
arg
in
args
:
if
arg
.
startswith
(
"+"
):
if
folder
is
None
:
folder
=
arg
[
1
:]
else
:
print
"only one folder at a time"
return
else
:
seqs
.
append
(
arg
)
if
not
folder
:
folder
=
self
.
mh
.
getcontext
()
if
not
seqs
:
seqs
=
[
'all'
]
try
:
f
=
self
.
mh
.
openfolder
(
folder
)
except
mhlib
.
Error
,
msg
:
print
msg
return
dict
=
{}
for
seq
in
seqs
:
try
:
nums
=
f
.
parsesequence
(
seq
)
except
mhlib
.
Error
,
msg
:
print
msg
or
"unparsable message sequence: %s"
%
`seq`
return
for
n
in
nums
:
dict
[
n
]
=
n
msgs
=
dict
.
keys
()
msgs
.
sort
()
self
.
updatefolder
(
f
,
msgs
)
self
.
commit
()
def
optimize
(
self
,
args
):
uniqwords
=
{}
for
folder
in
args
:
if
folder
.
startswith
(
"+"
):
folder
=
folder
[
1
:]
print
"
\
n
OPTIMIZE FOLDER"
,
folder
try
:
f
=
self
.
mh
.
openfolder
(
folder
)
except
mhlib
.
Error
,
msg
:
print
msg
continue
self
.
prescan
(
f
,
f
.
listmessages
(),
uniqwords
)
L
=
[(
uniqwords
[
word
],
word
)
for
word
in
uniqwords
.
keys
()]
L
.
sort
()
L
.
reverse
()
for
i
in
range
(
100
):
print
"%3d. %6d %s"
%
((
i
+
1
,)
+
L
[
i
])
self
.
index
.
lexicon
.
sourceToWordIds
([
word
for
(
count
,
word
)
in
L
])
def
prescan
(
self
,
f
,
msgs
,
uniqwords
):
pipeline
=
[
Splitter
(),
CaseNormalizer
(),
StopWordRemover
()]
for
n
in
msgs
:
print
"prescanning"
,
n
m
=
f
.
openmessage
(
n
)
text
=
self
.
getmessagetext
(
m
,
f
.
name
)
for
p
in
pipeline
:
text
=
p
.
process
(
text
)
for
word
in
text
:
uniqwords
[
word
]
=
uniqwords
.
get
(
word
,
0
)
+
1
def
bulkupdate
(
self
,
args
):
if
not
args
:
print
"No folders specified; use ALL to bulk-index all folders"
return
if
"ALL"
in
args
:
i
=
args
.
index
(
"ALL"
)
args
[
i
:
i
+
1
]
=
self
.
mh
.
listfolders
()
for
folder
in
args
:
if
folder
.
startswith
(
"+"
):
folder
=
folder
[
1
:]
print
"
\
n
FOLDER"
,
folder
try
:
f
=
self
.
mh
.
openfolder
(
folder
)
except
mhlib
.
Error
,
msg
:
print
msg
continue
self
.
updatefolder
(
f
,
f
.
listmessages
())
print
"Total"
,
len
(
self
.
docpaths
)
self
.
commit
()
print
len
(
self
.
index
.
lexicon
.
_words
),
"unique words."
def
updatefolder
(
self
,
f
,
msgs
):
self
.
watchfolders
[
f
.
name
]
=
self
.
getmtime
(
f
.
name
)
for
n
in
msgs
:
path
=
"%s/%s"
%
(
f
.
name
,
n
)
docid
=
self
.
path2docid
.
get
(
path
,
0
)
if
docid
and
self
.
getmtime
(
path
)
==
self
.
doctimes
.
get
(
docid
,
0
):
print
"unchanged"
,
docid
,
path
continue
docid
=
self
.
newdocid
(
path
)
try
:
m
=
f
.
openmessage
(
n
)
except
IOError
:
print
"disappeared"
,
docid
,
path
self
.
unindexpath
(
path
)
continue
text
=
self
.
getmessagetext
(
m
,
f
.
name
)
if
not
text
:
self
.
unindexpath
(
path
)
continue
print
"indexing"
,
docid
,
path
self
.
index
.
index_text
(
docid
,
text
)
self
.
maycommit
()
# Remove messages from the folder that no longer exist
for
path
in
list
(
self
.
path2docid
.
keys
(
f
.
name
)):
if
not
path
.
startswith
(
f
.
name
+
"/"
):
break
if
self
.
getmtime
(
path
)
==
0
:
self
.
unindexpath
(
path
)
print
"done."
def
unindexpath
(
self
,
path
):
if
self
.
path2docid
.
has_key
(
path
):
docid
=
self
.
path2docid
[
path
]
print
"unindexing"
,
docid
,
path
del
self
.
docpaths
[
docid
]
del
self
.
doctimes
[
docid
]
del
self
.
path2docid
[
path
]
try
:
self
.
index
.
unindex
(
docid
)
except
KeyError
,
msg
:
print
"KeyError"
,
msg
self
.
maycommit
()
def
getmessagetext
(
self
,
m
,
name
=
None
):
L
=
[]
if
name
:
L
.
append
(
"_folder "
+
name
)
# To restrict search to a folder
self
.
getheaders
(
m
,
L
)
try
:
self
.
getmsgparts
(
m
,
L
,
0
)
except
KeyboardInterrupt
:
raise
except
:
print
"(getmsgparts failed:)"
reportexc
()
return
L
def
getmsgparts
(
self
,
m
,
L
,
level
):
ctype
=
m
.
gettype
()
if
level
or
ctype
!=
"text/plain"
:
print
". "
*
level
+
str
(
ctype
)
if
ctype
==
"text/plain"
:
L
.
append
(
m
.
getbodytext
())
elif
ctype
in
(
"multipart/alternative"
,
"multipart/mixed"
):
for
part
in
m
.
getbodyparts
():
self
.
getmsgparts
(
part
,
L
,
level
+
1
)
elif
ctype
==
"message/rfc822"
:
f
=
StringIO
(
m
.
getbodytext
())
m
=
mhlib
.
Message
(
"<folder>"
,
0
,
f
)
self
.
getheaders
(
m
,
L
)
self
.
getmsgparts
(
m
,
L
,
level
+
1
)
def
getheaders
(
self
,
m
,
L
):
H
=
[]
for
key
in
"from"
,
"to"
,
"cc"
,
"bcc"
,
"subject"
:
value
=
m
.
get
(
key
)
if
value
:
H
.
append
(
value
)
if
H
:
L
.
append
(
"
\
n
"
.
join
(
H
))
def
newdocid
(
self
,
path
):
docid
=
self
.
path2docid
.
get
(
path
)
if
docid
is
not
None
:
self
.
doctimes
[
docid
]
=
self
.
getmtime
(
path
)
return
docid
docid
=
self
.
maxdocid
+
1
self
.
maxdocid
=
docid
self
.
docpaths
[
docid
]
=
path
self
.
doctimes
[
docid
]
=
self
.
getmtime
(
path
)
self
.
path2docid
[
path
]
=
docid
return
docid
def
getmtime
(
self
,
path
):
path
=
os
.
path
.
join
(
self
.
mh
.
getpath
(),
path
)
try
:
st
=
os
.
stat
(
path
)
except
os
.
error
,
msg
:
return
0
return
int
(
st
[
ST_MTIME
])
def
maycommit
(
self
):
self
.
trans_count
+=
1
if
self
.
trans_count
>=
self
.
trans_limit
>
0
:
self
.
commit
()
def
commit
(
self
):
if
self
.
trans_count
>
0
:
print
"committing..."
transaction
.
commit
()
self
.
trans_count
=
0
self
.
pack_count
+=
1
if
self
.
pack_count
>=
self
.
pack_limit
>
0
:
self
.
pack
()
def
pack
(
self
):
if
self
.
pack_count
>
0
:
print
"packing..."
self
.
database
.
pack
()
self
.
pack_count
=
0
class
TextIndex
(
Persistent
):
def
__init__
(
self
):
self
.
lexicon
=
Lexicon
(
Splitter
(),
CaseNormalizer
(),
StopWordRemover
())
self
.
index
=
OkapiIndex
(
self
.
lexicon
)
def
index_text
(
self
,
docid
,
text
):
self
.
index
.
index_doc
(
docid
,
text
)
self
.
_p_changed
=
1
# XXX
def
unindex
(
self
,
docid
):
self
.
index
.
unindex_doc
(
docid
)
self
.
_p_changed
=
1
# XXX
def
query
(
self
,
query
,
nbest
=
10
):
# returns a total hit count and a mapping from docids to scores
parser
=
QueryParser
(
self
.
lexicon
)
tree
=
parser
.
parseQuery
(
query
)
results
=
tree
.
executeQuery
(
self
.
index
)
if
results
is
None
:
return
[],
0
chooser
=
NBest
(
nbest
)
chooser
.
addmany
(
results
.
items
())
return
chooser
.
getbest
(),
len
(
results
)
def
query_weight
(
self
,
query
):
parser
=
QueryParser
(
self
.
lexicon
)
tree
=
parser
.
parseQuery
(
query
)
terms
=
tree
.
terms
()
return
self
.
index
.
query_weight
(
terms
)
def
reportexc
():
traceback
.
print_exc
()
if
__name__
==
"__main__"
:
sys
.
exit
(
main
())
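A hedged session, using only options documented in the module docstring above (the folder name is made up):

mhindex.py -d ~/.Data.fs -b ALL        # bulk-index every folder
mhindex.py -d ~/.Data.fs -u +inbox     # update just +inbox
mhindex.py -d ~/.Data.fs               # interactive query mode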
src/Products/ZCTextIndex/tests/python.txt deleted 100644 → 0
Search results for python.org
query: "nested recursive functions"
Ultraseek
83% http://www.python.org/dev/doc/maint22/whatsnew/node9.html
43% http://python.sourceforge.net/peps/pep-0227.txt
37% http://www.python.org/dev/doc/maint22/lib/module-pprint.html
37% http://www.python.org/doc/1.5.2p1/lib/module-pprint.html
37% http://www.python.org/doc/2.0.1/lib/module-pprint.html
37% http://www.python.org/doc/1.5.2/lib/module-pprint.html
37% http://www.python.org/doc/1.6/lib/module-pprint.html
37% http://www.python.org/doc/1.5.1/lib/module-pprint.html
37% http://www.python.org/doc/1.5/lib/node54.html
35% http://www.python.org/workshops/2000-01/proceedings/papers/tismers/spcpaper.htm
Google
www.python.org/peps/pep-0227.html
www.python.org/dev/doc/maint22/whatsnew/node9.html
www.python.org/cgi-bin/faqw.py?req=recent&days=28
www.python.org/peps/pep-0255.html
www.python.org/doc/current/lib/node558.html
www.python.org/doc/1.5.2/lib/node52.html
www.python.org/workshops/2000-01/proceedings/papers/tismers/spcpaper.pdf
www.python.org/2.0/
www.python.org/2.0.1/NEWS.txt
www.python.org/peps/pep-0266.html
query: "explicit better than implicit"
Ultraseek:
http://www.python.org/dev/doc/maint22/lib/differ-examples.html
http://www.python.org/doc/essays/ppt/python10/py10keynote.ppt
http://www.python.org/dev/culture.html
http://www.python.org/doc/Humor.html
http://www.python.org/dev/doc/maint22/ref/implicit-joining.html
http://www.python.org/dev/doc/maint22/ref/explicit-joining.html
http://www.python.org/workshops/2000-01/proceedings/papers/tigges-wyvill/tigges-wyvill.html
http://www.python.org/peps/pep-0285.txt
http://www.python.org/peps/pep-0285.html
Google:
www.python.org/doc/current/lib/differ-examples.html
www.python.org/doc/essays/ppt/python10/py10keynote.pdf
www.python.org/dev/culture.html
www.python.org/doc/essays/ppt/python10/py10keynote.ppt
www.python.org/peps/pep-0285.html
www.python.org/peps/pep-0287.html
www.python.org/peps/pep-0287.txt
www.python.org/peps/pep-0209.html
www.python.org/~guido/Proposal.txt
www.python.org/~guido/Proposal.doc
query: "build hpux"
Ultraseek:
51% http://www.python.org/1.5/patches-1.5.1/configure.2.txt
47% http://www.python.org/dev/doc/devel/whatsnew/node5.html
43% http://www.python.org/1.5/patches-1.5.1/
43% http://www.python.org/2.0/
41% http://www.python.org/peps/pep-0243.html
41% http://www.python.org/ftp/python/binaries-1.3/python-HP-UX-A.09.05-full.README
39% http://www.python.org/ftp/python/binaries-1.3/python-hppa1.1-hp-hpux10.10.README
39% http://www.python.org/ftp/python/binaries-1.3/python-hppa1.1-hp-hpux10.10.README
35% http://www.python.org/peps/pep-0243.txt
35% http://www.python.org/2.0.1/NEWS.txt
35% http://python.sourceforge.net/peps/pep-0243.txt
Google:
www.python.org/2.1.1/NEWS.txt
www.python.org/1.5/NEWS-152b2.txt
query: "cannot create 'method-wrapper' instances"
Ultraseek
http://python.sourceforge.net/peps/pep-0007.txt
http://www.python.org/workshops/1994-11/C++Python.txt
http://www.python.org/peps/pep-0231.txt
http://www.python.org/peps/pep-0231.html
http://python.sourceforge.net/peps/pep-0231.txt
http://www.python.org/dev/doc/maint22/lib/node383.html
http://www.python.org/workshops/1994-11/BuiltInClasses/BuiltInClasses_7.html
http://www.python.org/workshops/1994-11/persistency.html
http://www.python.org/dev/doc/maint22/lib/organizing-tests.html
http://www.python.org/dev/doc/maint22/lib/module-SocketServer.html
Google:
no matches
query: "extension module C++"
http://www.python.org/dev/doc/devel/ext/building.html
http://www.python.org/dev/doc/maint22/ext/module-defn-options.html
http://www.python.org/dev/doc/maint21/ext/building-on-unix.html
http://www.python.org/doc/1.6/ext/building-on-unix.html
http://www.python.org/sigs/c++-sig/
http://www.python.org/dev/doc/maint22/ext/intro.html
http://www.python.org/dev/doc/maint22/ext/cplusplus.html
http://www.python.org/doc/1.4/ext/node18.html
http://www.python.org/doc/1.6/ext/building-on-windows.html
http://www.python.org/doc/1.6/dist/node12.html
Google:
www.python.org/doc/current/ext/building-on-unix.html
www.python.org/doc/current/ext/intro.html
www.python.org/doc/current/ext/ext.html
www.python.org/sigs/c++-sig/
www.python.org/doc/current/ext/module-defn-options.html
www.python.org/doc/1.5.2p2/ext/building-on-unix.html
www.python.org/doc/1.5.2p2/ext/contents.html
www.python.org/doc/1.5.2p2/ext/ext.html
www.python.org/doc/2.1.2/ext/building-on-unix.html
www.python.org/doc/1.5.1/ext/intro.html
src/Products/ZCTextIndex/tests/queryhtml.py deleted 100644 → 0
# XXX: Products.PluginIndexes.TextIndex no longer exists

import os
from time import clock

import ZODB
from ZODB.FileStorage import FileStorage

QUERIES = ["nested recursive functions",
           "explicit better than implicit",
           "build hpux",
           "cannot create 'method-wrapper' instances",
           "extension module C++",
           "class method",
           "instance variable",
           "articulate information",
           "import default files",
           "gopher ftp http",
           "documentation",
           ]

def path2url(p):
    # convert the paths to a python.org URL
    # hack: only works for the way Jeremy indexed his copy of python.org
    marker = "www.python.org/."
    i = p.find(marker)
    if i == -1:
        return p
    i += len(marker)
    return "http://www.python.org" + p[i:]

#from Products.PluginIndexes.TextIndex.TextIndex import And, Or
from Products.ZCTextIndex.tests.indexhtml import MySplitter
from Products.ZCTextIndex.NBest import NBest

def main(rt):
    index = rt["index"]
    files = rt["files"]
    times = {}
    ITERS = range(50)
    for i in range(11):
        for q in QUERIES:
            terms = q.split()
            for c in " OR ", " AND ":
                query = c.join(terms)
                t0 = clock()
                if TEXTINDEX:
                    if c == " OR ":
                        op = Or
                    else:
                        op = And
                    _q = " ".join(terms)
                    for _ in ITERS:
                        b = index.query(_q, op).bucket()
                        num = len(b)
                        chooser = NBest(10)
                        chooser.addmany(b.items())
                        results = chooser.getbest()
                else:
                    try:
                        for _ in ITERS:
                            results, num = index.query(query)
                    except:
                        continue
                t1 = clock()
                print "<p>Query: \"%s\"" % query
                print "<br>Num results: %d" % num
                print "<br>time.clock(): %s" % (t1 - t0)
                key = query
                if i == 0:
                    print "<ol>"
                    for docid, score in results:
                        url = path2url(files[docid])
                        fmt = '<li><a href="%s">%s</A> score = %s'
                        print fmt % (url, url, score)
                    print "</ol>"
                    continue
                l = times.setdefault(key, [])
                l.append(t1 - t0)

    l = times.keys()
    l.sort()
    print "<hr>"
    for k in l:
        v = times[k]
        print "<p>Query: \"%s\"" % k
        print "<br>Min time: %s" % min(v)
        print "<br>All times: %s" % " ".join(map(str, v))

if __name__ == "__main__":
    import sys
    import getopt

    VERBOSE = 0
    FSPATH = "Data.fs"
    TEXTINDEX = 0
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'vf:T')
    except getopt.error, msg:
        print msg
        print __doc__
        sys.exit(2)
    for o, v in opts:
        if o == '-v':
            VERBOSE += 1
        if o == '-f':
            FSPATH = v
#        if o == '-T':
#            TEXTINDEX = 1
    fs = FileStorage(FSPATH, read_only=1)
    db = ZODB.DB(fs, cache_size=10000)
    cn = db.open()
    rt = cn.root()
    main(rt)
src/Products/ZCTextIndex/tests/testHTMLSplitter.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2009 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Test zope.index.text.htmlsplitter
"""
import unittest

class HTMLWordSplitterTests(unittest.TestCase):

    # Subclasses must define '_getBTreesFamily'
    def _getTargetClass(self):
        from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
        return HTMLWordSplitter

    def _makeOne(self):
        return self._getTargetClass()()

    def test_class_conforms_to_ISplitter(self):
        from zope.interface.verify import verifyClass
        from Products.ZCTextIndex.interfaces import ISplitter
        verifyClass(ISplitter, self._getTargetClass())

    def test_instance_conforms_to_ISplitter(self):
        from zope.interface.verify import verifyObject
        from Products.ZCTextIndex.interfaces import ISplitter
        verifyObject(ISplitter, self._makeOne())

    def test_process_empty_string(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.process(['']), [])

    def test_process_no_markup(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.process(['abc def']), ['abc', 'def'])

    def test_process_w_markup(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.process(['<h1>abc</h1> <p>def</p>']),
                         ['abc', 'def'])

    def test_process_no_markup_w_glob(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.process(['abc?def hij*klm nop* qrs?']),
                         ['abc', 'def', 'hij', 'klm', 'nop', 'qrs'])

    def test_processGlob_empty_string(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.processGlob(['']), [])

    def test_processGlob_no_markup_no_glob(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.processGlob(['abc def']), ['abc', 'def'])

    def test_processGlob_w_markup_no_glob(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.processGlob(['<h1>abc</h1> '
                                               '<p>def</p>']),
                         ['abc', 'def'])

    def test_processGlob_no_markup_w_glob(self):
        splitter = self._makeOne()
        self.assertEqual(splitter.processGlob(['abc?def hij*klm nop* qrs?']),
                         ['abc?def', 'hij*klm', 'nop*', 'qrs?'])

def test_suite():
    return unittest.TestSuite((
        unittest.makeSuite(HTMLWordSplitterTests),
    ))
src/Products/ZCTextIndex/tests/testIndex.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
import
os
from
unittest
import
TestCase
,
TestSuite
,
main
,
makeSuite
import
transaction
from
BTrees.Length
import
Length
from
Products.ZCTextIndex.Lexicon
import
Lexicon
,
Splitter
from
Products.ZCTextIndex.CosineIndex
import
CosineIndex
from
Products.ZCTextIndex.OkapiIndex
import
OkapiIndex
# Subclasses must set a class variable IndexFactory to the appropriate
# index object constructor.
class
IndexTest
(
TestCase
):
def
setUp
(
self
):
self
.
lexicon
=
Lexicon
(
Splitter
())
self
.
index
=
self
.
IndexFactory
(
self
.
lexicon
)
def
test_index_document
(
self
,
DOCID
=
1
):
doc
=
"simple document contains five words"
self
.
assert_
(
not
self
.
index
.
has_doc
(
DOCID
))
self
.
index
.
index_doc
(
DOCID
,
doc
)
self
.
assert_
(
self
.
index
.
has_doc
(
DOCID
))
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
])
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
self
.
index
.
document_count
())
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
5
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
5
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
self
.
index
.
length
())
for
map
in
self
.
index
.
_wordinfo
.
values
():
self
.
assertEqual
(
len
(
map
),
1
)
self
.
assert_
(
map
.
has_key
(
DOCID
))
def
test_unindex_document
(
self
):
DOCID
=
1
self
.
test_index_document
(
DOCID
)
self
.
index
.
unindex_doc
(
DOCID
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
0
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
self
.
index
.
document_count
())
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
0
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
0
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
self
.
index
.
length
())
def
test_index_two_documents
(
self
):
self
.
test_index_document
()
doc
=
"another document just four"
DOCID
=
2
self
.
index
.
index_doc
(
DOCID
,
doc
)
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
])
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
2
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
self
.
index
.
document_count
())
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
8
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
2
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
4
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
self
.
index
.
length
())
wids
=
self
.
lexicon
.
termToWordIds
(
"document"
)
self
.
assertEqual
(
len
(
wids
),
1
)
document_wid
=
wids
[
0
]
for
wid
,
map
in
self
.
index
.
_wordinfo
.
items
():
if
wid
==
document_wid
:
self
.
assertEqual
(
len
(
map
),
2
)
self
.
assert_
(
map
.
has_key
(
1
))
self
.
assert_
(
map
.
has_key
(
DOCID
))
else
:
self
.
assertEqual
(
len
(
map
),
1
)
def
test_index_two_unindex_one
(
self
):
# index two documents, unindex one, and test the results
self
.
test_index_two_documents
()
self
.
index
.
unindex_doc
(
1
)
DOCID
=
2
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
self
.
index
.
document_count
())
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
])
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
4
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
4
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
self
.
index
.
length
())
for
map
in
self
.
index
.
_wordinfo
.
values
():
self
.
assertEqual
(
len
(
map
),
1
)
self
.
assert_
(
map
.
has_key
(
DOCID
))
def
test_index_duplicated_words
(
self
,
DOCID
=
1
):
doc
=
"very simple repeat repeat repeat document test"
self
.
index
.
index_doc
(
DOCID
,
doc
)
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
])
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
5
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
7
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
self
.
index
.
length
())
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
self
.
index
.
document_count
())
wids
=
self
.
lexicon
.
termToWordIds
(
"repeat"
)
self
.
assertEqual
(
len
(
wids
),
1
)
repititive_wid
=
wids
[
0
]
for
wid
,
map
in
self
.
index
.
_wordinfo
.
items
():
self
.
assertEqual
(
len
(
map
),
1
)
self
.
assert_
(
map
.
has_key
(
DOCID
))
def
test_simple_query_oneresult
(
self
):
self
.
index
.
index_doc
(
1
,
'not the same document'
)
results
=
self
.
index
.
search
(
"document"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[
1
])
def
test_simple_query_noresults
(
self
):
self
.
index
.
index_doc
(
1
,
'not the same document'
)
results
=
self
.
index
.
search
(
"frobnicate"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[])
def
test_query_oneresult
(
self
):
self
.
index
.
index_doc
(
1
,
'not the same document'
)
self
.
index
.
index_doc
(
2
,
'something about something else'
)
results
=
self
.
index
.
search
(
"document"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[
1
])
def
test_search_phrase
(
self
):
self
.
index
.
index_doc
(
1
,
"the quick brown fox jumps over the lazy dog"
)
self
.
index
.
index_doc
(
2
,
"the quick fox jumps lazy over the brown dog"
)
results
=
self
.
index
.
search_phrase
(
"quick brown fox"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[
1
])
def
test_search_glob
(
self
):
self
.
index
.
index_doc
(
1
,
"how now brown cow"
)
self
.
index
.
index_doc
(
2
,
"hough nough browne cough"
)
self
.
index
.
index_doc
(
3
,
"bar brawl"
)
results
=
self
.
index
.
search_glob
(
"bro*"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[
1
,
2
])
results
=
self
.
index
.
search_glob
(
"b*"
)
self
.
assertEqual
(
list
(
results
.
keys
()),
[
1
,
2
,
3
])
class
CosineIndexTest
(
IndexTest
):
IndexFactory
=
CosineIndex
class
OkapiIndexTest
(
IndexTest
):
IndexFactory
=
OkapiIndex
class
TestIndexConflict
(
TestCase
):
db
=
None
def
tearDown
(
self
):
if
self
.
db
is
not
None
:
self
.
db
.
close
()
self
.
storage
.
cleanup
()
def
openDB
(
self
):
from
ZODB.FileStorage
import
FileStorage
        from ZODB.DB import DB
        n = 'fs_tmp__%s' % os.getpid()
        self.storage = FileStorage(n)
        self.db = DB(self.storage)

    def test_index_doc_conflict(self):
        self.index = OkapiIndex(Lexicon())
        self.openDB()
        r1 = self.db.open().root()
        r1['i'] = self.index
        transaction.commit()

        r2 = self.db.open().root()
        copy = r2['i']
        # Make sure the data is loaded
        list(copy._docweight.items())
        list(copy._docwords.items())
        list(copy._wordinfo.items())
        list(copy._lexicon._wids.items())
        list(copy._lexicon._words.items())

        self.assertEqual(self.index._p_serial, copy._p_serial)

        self.index.index_doc(0, 'The time has come')
        transaction.commit()

        copy.index_doc(1, 'That time has gone')
        transaction.commit()

    def test_reindex_doc_conflict(self):
        self.index = OkapiIndex(Lexicon())
        self.index.index_doc(0, 'Sometimes change is good')
        self.index.index_doc(1, 'Then again, who asked')
        self.openDB()
        r1 = self.db.open().root()
        r1['i'] = self.index
        transaction.commit()

        r2 = self.db.open().root()
        copy = r2['i']
        # Make sure the data is loaded
        list(copy._docweight.items())
        list(copy._docwords.items())
        list(copy._wordinfo.items())
        list(copy._lexicon._wids.items())
        list(copy._lexicon._words.items())

        self.assertEqual(self.index._p_serial, copy._p_serial)

        self.index.index_doc(0, 'Sometimes change isn\'t bad')
        transaction.commit()

        copy.index_doc(1, 'Then again, who asked you?')
        transaction.commit()


class TestUpgrade(TestCase):

    def test_query_before_totaldoclen_upgrade(self):
        self.index1 = OkapiIndex(Lexicon(Splitter()))
        self.index1.index_doc(0, 'The quiet of night')
        # Revert index1 back to a long to simulate an older index instance
        self.index1._totaldoclen = long(self.index1._totaldoclen())
        self.assertEqual(len(self.index1.search('night')), 1)

    def test_upgrade_totaldoclen(self):
        self.index1 = OkapiIndex(Lexicon())
        self.index2 = OkapiIndex(Lexicon())
        self.index1.index_doc(0, 'The quiet of night')
        self.index2.index_doc(0, 'The quiet of night')
        # Revert index1 back to a long to simulate an older index instance
        self.index1._totaldoclen = long(self.index1._totaldoclen())
        self.index1.index_doc(1, 'gazes upon my shadow')
        self.index2.index_doc(1, 'gazes upon my shadow')
        self.assertEqual(
            self.index1._totaldoclen(), self.index2._totaldoclen())
        self.index1._totaldoclen = long(self.index1._totaldoclen())
        self.index1.unindex_doc(0)
        self.index2.unindex_doc(0)
        self.assertEqual(
            self.index1._totaldoclen(), self.index2._totaldoclen())

    def test_query_before_document_count_upgrade(self):
        self.index1 = OkapiIndex(Lexicon(Splitter()))
        self.index1.index_doc(0, 'The quiet of night')
        # Revert index1 back to a long to simulate an older index instance
        del self.index1.document_count
        self.assertEqual(len(self.index1.search('night')), 1)

    def test_upgrade_document_count(self):
        self.index1 = OkapiIndex(Lexicon())
        self.index2 = OkapiIndex(Lexicon())
        self.index1.index_doc(0, 'The quiet of night')
        self.index2.index_doc(0, 'The quiet of night')
        # Revert index1 back to simulate an older index instance
        del self.index1.document_count
        self.index1.index_doc(1, 'gazes upon my shadow')
        self.index2.index_doc(1, 'gazes upon my shadow')
        self.assert_(self.index1.document_count.__class__ is Length)
        self.assertEqual(
            self.index1.document_count(), self.index2.document_count())
        del self.index1.document_count
        self.index1.unindex_doc(0)
        self.index2.unindex_doc(0)
        self.assert_(self.index1.document_count.__class__ is Length)
        self.assertEqual(
            self.index1.document_count(), self.index2.document_count())


def test_suite():
    return TestSuite((makeSuite(CosineIndexTest),
                      makeSuite(OkapiIndexTest),
                      makeSuite(TestIndexConflict),
                      makeSuite(TestUpgrade),
                     ))

if __name__ == '__main__':
    main(defaultTest='test_suite')
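The conflict tests above use the standard two-connection ZODB pattern: persist an object, load an independent copy through a second connection, force both copies' state to load, then commit a change from each side so the second commit must either resolve the conflict or raise. A minimal, self-contained sketch of that pattern, using MappingStorage and PersistentMapping as hypothetical stand-ins (unlike the BTree internals of OkapiIndex, PersistentMapping has no conflict resolution, so here the second commit fails):

import transaction
from ZODB import DB
from ZODB.MappingStorage import MappingStorage
from ZODB.POSException import ConflictError
from persistent.mapping import PersistentMapping

db = DB(MappingStorage())
tm1 = transaction.TransactionManager()
tm2 = transaction.TransactionManager()

c1 = db.open(transaction_manager=tm1)
c1.root()['obj'] = PersistentMapping()
tm1.commit()

c2 = db.open(transaction_manager=tm2)   # independent view of the same object
obj1 = c1.root()['obj']
obj2 = c2.root()['obj']

obj1['a'] = 1
obj2['b'] = 2
tm1.commit()                 # first writer wins
try:
    tm2.commit()             # stale write; no _p_resolveConflict to save it
except ConflictError:
    tm2.abort()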
src/Products/ZCTextIndex/tests/testLexicon.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Lexicon unit tests.

$Id$
"""

import unittest
import os, sys

import ZODB
import transaction

from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer


class StupidPipelineElement:
    def __init__(self, fromword, toword):
        self.__fromword = fromword
        self.__toword = toword

    def process(self, seq):
        res = []
        for term in seq:
            if term == self.__fromword:
                res.append(self.__toword)
            else:
                res.append(term)
        return res


class WackyReversePipelineElement:
    def __init__(self, revword):
        self.__revword = revword

    def process(self, seq):
        res = []
        for term in seq:
            if term == self.__revword:
                x = list(term)
                x.reverse()
                res.append(''.join(x))
            else:
                res.append(term)
        return res


class StopWordPipelineElement:
    def __init__(self, stopdict={}):
        self.__stopdict = stopdict

    def process(self, seq):
        res = []
        for term in seq:
            if self.__stopdict.get(term):
                continue
            else:
                res.append(term)
        return res


class Test(unittest.TestCase):

    def test_z3interfaces(self):
        from Products.ZCTextIndex.interfaces import ILexicon
        from zope.interface.verify import verifyClass

        verifyClass(ILexicon, Lexicon)

    def testSourceToWordIds(self):
        lexicon = Lexicon(Splitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        self.assertEqual(wids, [1, 2, 3])

    def testTermToWordIds(self):
        lexicon = Lexicon(Splitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('dogs')
        self.assertEqual(wids, [3])

    def testMissingTermToWordIds(self):
        lexicon = Lexicon(Splitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('boxes')
        self.assertEqual(wids, [0])

    def testTermToWordIdsWithProcess_post_glob(self):
        """This test is for added process_post_glob"""
        class AddedSplitter(Splitter):
            def process_post_glob(self, lst):
                assert lst == ['dogs']
                return ['dogs']
        lexicon = Lexicon(AddedSplitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('dogs')
        self.assertEqual(wids, [3])

    def testMissingTermToWordIdsWithProcess_post_glob(self):
        """This test is for added process_post_glob"""
        class AddedSplitter(Splitter):
            def process_post_glob(self, lst):
                assert lst == ['dogs']
                return ['fox']
        lexicon = Lexicon(AddedSplitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('dogs')
        self.assertEqual(wids, [0])

    def testOnePipelineElement(self):
        lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('fish')
        self.assertEqual(wids, [3])

    def testSplitterAdaptorFold(self):
        lexicon = Lexicon(Splitter(), CaseNormalizer())
        wids = lexicon.sourceToWordIds('CATS and dogs')
        wids = lexicon.termToWordIds('cats and dogs')
        self.assertEqual(wids, [1, 2, 3])

    def testSplitterAdaptorNofold(self):
        lexicon = Lexicon(Splitter())
        wids = lexicon.sourceToWordIds('CATS and dogs')
        wids = lexicon.termToWordIds('cats and dogs')
        self.assertEqual(wids, [0, 2, 3])

    def testTwoElementPipeline(self):
        lexicon = Lexicon(Splitter(),
                          StupidPipelineElement('cats', 'fish'),
                          WackyReversePipelineElement('fish'))
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('hsif')
        self.assertEqual(wids, [1])

    def testThreeElementPipeline(self):
        lexicon = Lexicon(Splitter(),
                          StopWordPipelineElement({'and': 1}),
                          StupidPipelineElement('dogs', 'fish'),
                          WackyReversePipelineElement('fish'))
        wids = lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('hsif')
        self.assertEqual(wids, [2])

    def testSplitterLocaleAwareness(self):
        from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
        import locale
        loc = locale.setlocale(locale.LC_ALL)  # get current locale
        # set German locale
        try:
            if sys.platform != 'win32':
                locale.setlocale(locale.LC_ALL, 'de_DE.ISO8859-1')
            else:
                locale.setlocale(locale.LC_ALL, 'German_Germany.1252')
        except locale.Error:
            return  # This test doesn't work here :-(
        expected = ['m\xfclltonne', 'waschb\xe4r',
                    'beh\xf6rde', '\xfcberflieger']
        words = [" ".join(expected)]
        words = Splitter().process(words)
        self.assertEqual(words, expected)
        words = HTMLWordSplitter().process(words)
        self.assertEqual(words, expected)
        locale.setlocale(locale.LC_ALL, loc)  # restore saved locale

    def testUpgradeLength(self):
        from BTrees.Length import Length
        lexicon = Lexicon(Splitter())
        del lexicon.length  # Older instances don't override length
        lexicon.sourceToWordIds('how now brown cow')
        self.assert_(lexicon.length.__class__ is Length)


class TestLexiconConflict(unittest.TestCase):

    db = None

    def tearDown(self):
        if self.db is not None:
            self.db.close()
            self.storage.cleanup()

    def openDB(self):
        from ZODB.FileStorage import FileStorage
        from ZODB.DB import DB
        n = 'fs_tmp__%s' % os.getpid()
        self.storage = FileStorage(n)
        self.db = DB(self.storage)

    def testAddWordConflict(self):
        self.l = Lexicon(Splitter())
        self.openDB()
        r1 = self.db.open().root()
        r1['l'] = self.l
        transaction.commit()

        r2 = self.db.open().root()
        copy = r2['l']
        # Make sure the data is loaded
        list(copy._wids.items())
        list(copy._words.items())
        copy.length()

        self.assertEqual(self.l._p_serial, copy._p_serial)

        self.l.sourceToWordIds('mary had a little lamb')
        transaction.commit()

        copy.sourceToWordIds('whose fleece was')
        copy.sourceToWordIds('white as snow')
        transaction.commit()
        self.assertEqual(copy.length(), 11)
        self.assertEqual(copy.length(), len(copy._words))


def test_suite():
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(Test))
    suite.addTest(unittest.makeSuite(TestLexiconConflict))
    return suite

if __name__ == '__main__':
    unittest.main(defaultTest='test_suite')
src/Products/ZCTextIndex/tests/testNBest.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

from unittest import TestCase, TestSuite, main, makeSuite

from Products.ZCTextIndex.NBest import NBest


class NBestTest(TestCase):

    def testConstructor(self):
        self.assertRaises(ValueError, NBest, 0)
        self.assertRaises(ValueError, NBest, -1)

        for n in range(1, 11):
            nb = NBest(n)
            self.assertEqual(len(nb), 0)
            self.assertEqual(nb.capacity(), n)

    def testOne(self):
        nb = NBest(1)
        nb.add('a', 0)
        self.assertEqual(nb.getbest(), [('a', 0)])

        nb.add('b', 1)
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [('b', 1)])

        nb.add('c', -1)
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [('b', 1)])

        nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [('f', 5)])

    def testMany(self):
        import random
        inputs = [(-i, i) for i in range(50)]

        reversed_inputs = inputs[:]
        reversed_inputs.reverse()

        # Test the N-best for a variety of n (1, 6, 11, ... 50).
        for n in range(1, len(inputs) + 1, 5):
            expected = inputs[-n:]
            expected.reverse()

            random_inputs = inputs[:]
            random.shuffle(random_inputs)

            for source in inputs, reversed_inputs, random_inputs:
                # Try feeding them one at a time.
                nb = NBest(n)
                for item, score in source:
                    nb.add(item, score)
                self.assertEqual(len(nb), n)
                self.assertEqual(nb.capacity(), n)
                self.assertEqual(nb.getbest(), expected)

                # And again in one gulp.
                nb = NBest(n)
                nb.addmany(source)
                self.assertEqual(len(nb), n)
                self.assertEqual(nb.capacity(), n)
                self.assertEqual(nb.getbest(), expected)

                for i in range(1, n + 1):
                    self.assertEqual(nb.pop_smallest(), expected[-i])
                self.assertRaises(IndexError, nb.pop_smallest)


def test_suite():
    return makeSuite(NBestTest)

if __name__ == '__main__':
    main(defaultTest='test_suite')
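Read together, these assertions give NBest's contract: a fixed-capacity container that retains only the n highest-scored (item, score) pairs, with getbest() listing them best-first and pop_smallest() draining from the worst end. A quick usage sketch:

from Products.ZCTextIndex.NBest import NBest

nb = NBest(3)                                        # keep the best three
nb.addmany([('a', 5), ('b', 1), ('c', 9), ('d', 7)])
print nb.getbest()                                   # [('c', 9), ('d', 7), ('a', 5)]
print nb.pop_smallest()                              # ('a', 5)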
src/Products/ZCTextIndex/tests/testParseTree.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2008 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
import unittest


class ParseTreeTests(unittest.TestCase):

    def _conforms(self, klass):
        from zope.interface.verify import verifyClass
        from Products.ZCTextIndex.interfaces import IQueryParseTree
        verifyClass(IQueryParseTree, klass)

    def test_ParseTreeNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import ParseTreeNode
        self._conforms(ParseTreeNode)

    def test_OrNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import OrNode
        self._conforms(OrNode)

    def test_AndNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        self._conforms(AndNode)

    def test_NotNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import NotNode
        self._conforms(NotNode)

    def test_GlobNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import GlobNode
        self._conforms(GlobNode)

    def test_AtomNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self._conforms(AtomNode)

    def test_PhraseNode_conforms_to_IQueryParseTree(self):
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self._conforms(PhraseNode)


def test_suite():
    return unittest.TestSuite((
        unittest.makeSuite(ParseTreeTests),
    ))

if __name__ == "__main__":
    unittest.main(defaultTest='test_suite')
src/Products/ZCTextIndex/tests/testPipelineFactory.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################

from unittest import TestCase, TestSuite, main, makeSuite

from Products.ZCTextIndex.interfaces import IPipelineElement
from Products.ZCTextIndex.PipelineFactory import PipelineElementFactory

from zope.interface import implements


class NullPipelineElement:

    implements(IPipelineElement)

    def process(source):
        pass


class PipelineFactoryTest(TestCase):

    def setUp(self):
        self.huey = NullPipelineElement()
        self.dooey = NullPipelineElement()
        self.louie = NullPipelineElement()
        self.daffy = NullPipelineElement()

    def testPipeline(self):
        pf = PipelineElementFactory()
        pf.registerFactory('donald', 'huey', self.huey)
        pf.registerFactory('donald', 'dooey', self.dooey)
        pf.registerFactory('donald', 'louie', self.louie)
        pf.registerFactory('looney', 'daffy', self.daffy)
        self.assertRaises(ValueError, pf.registerFactory,
                          'donald', 'huey', self.huey)
        self.assertEqual(pf.getFactoryGroups(), ['donald', 'looney'])
        self.assertEqual(pf.getFactoryNames('donald'),
                         ['dooey', 'huey', 'louie'])


def test_suite():
    return makeSuite(PipelineFactoryTest)

if __name__ == '__main__':
    main(defaultTest='test_suite')
src/Products/ZCTextIndex/tests/testQueryEngine.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

from unittest import TestCase, TestSuite, main, makeSuite

from BTrees.IIBTree import IIBucket

from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.ParseTree import ParseError, QueryError
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter


class FauxIndex:

    def search(self, term):
        b = IIBucket()
        if term == "foo":
            b[1] = b[3] = 1
        elif term == "bar":
            b[1] = b[2] = 1
        elif term == "ham":
            b[1] = b[2] = b[3] = b[4] = 1
        return b


class TestQueryEngine(TestCase):

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.parser = QueryParser(self.lexicon)
        self.index = FauxIndex()

    def compareSet(self, set, dict):
        d = {}
        for k, v in set.items():
            d[k] = v
        self.assertEqual(d, dict)

    def compareQuery(self, query, dict):
        tree = self.parser.parseQuery(query)
        set = tree.executeQuery(self.index)
        self.compareSet(set, dict)

    def testExecuteQuery(self):
        self.compareQuery("foo AND bar", {1: 2})
        self.compareQuery("foo OR bar", {1: 2, 2: 1, 3: 1})
        self.compareQuery("foo AND NOT bar", {3: 1})
        self.compareQuery("foo AND foo AND foo", {1: 3, 3: 3})
        self.compareQuery("foo OR foo OR foo", {1: 3, 3: 3})
        self.compareQuery("ham AND NOT foo AND NOT bar", {4: 1})
        self.compareQuery("ham OR foo OR bar", {1: 3, 2: 2, 3: 2, 4: 1})
        self.compareQuery("ham AND foo AND bar", {1: 3})

    def testInvalidQuery(self):
        from Products.ZCTextIndex.ParseTree import NotNode, AtomNode
        tree = NotNode(AtomNode("foo"))
        self.assertRaises(QueryError, tree.executeQuery, self.index)


def test_suite():
    return makeSuite(TestQueryEngine)

if __name__ == '__main__':
    main(defaultTest='test_suite')
src/Products/ZCTextIndex/tests/testQueryParser.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

from unittest import TestCase, TestSuite, main, makeSuite


class TestInterfaces(TestCase):

    def testInterfaces(self):
        from zope.interface.verify import verifyClass
        from Products.ZCTextIndex.interfaces import IQueryParser
        from Products.ZCTextIndex.QueryParser import QueryParser
        verifyClass(IQueryParser, QueryParser)


class TestQueryParserBase(TestCase):

    def setUp(self):
        from Products.ZCTextIndex.QueryParser import QueryParser
        from Products.ZCTextIndex.Lexicon import Lexicon
        from Products.ZCTextIndex.Lexicon import Splitter
        self.lexicon = Lexicon(Splitter())
        self.parser = QueryParser(self.lexicon)

    def expect(self, input, output, expected_ignored=[]):
        tree = self.parser.parseQuery(input)
        ignored = self.parser.getIgnored()
        self.compareParseTrees(tree, output)
        self.assertEqual(ignored, expected_ignored)
        # Check that parseQueryEx() == (parseQuery(), getIgnored())
        ex_tree, ex_ignored = self.parser.parseQueryEx(input)
        self.compareParseTrees(ex_tree, tree)
        self.assertEqual(ex_ignored, expected_ignored)

    def failure(self, input):
        from Products.ZCTextIndex.ParseTree import ParseError
        self.assertRaises(ParseError, self.parser.parseQuery, input)
        self.assertRaises(ParseError, self.parser.parseQueryEx, input)

    def compareParseTrees(self, got, expected, msg=None):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import GlobNode
        from Products.ZCTextIndex.ParseTree import NotNode
        from Products.ZCTextIndex.ParseTree import OrNode
        from Products.ZCTextIndex.ParseTree import ParseTreeNode
        from Products.ZCTextIndex.ParseTree import PhraseNode
        if msg is None:
            msg = repr(got)
        self.assertEqual(isinstance(got, ParseTreeNode), 1)
        self.assertEqual(got.__class__, expected.__class__, msg)
        if isinstance(got, PhraseNode):
            self.assertEqual(got.nodeType(), "PHRASE", msg)
            self.assertEqual(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, GlobNode):
            self.assertEqual(got.nodeType(), "GLOB", msg)
            self.assertEqual(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, AtomNode):
            self.assertEqual(got.nodeType(), "ATOM", msg)
            self.assertEqual(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, NotNode):
            self.assertEqual(got.nodeType(), "NOT")
            self.compareParseTrees(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, AndNode) or isinstance(got, OrNode):
            self.assertEqual(got.nodeType(),
                             isinstance(got, AndNode) and "AND" or "OR", msg)
            list1 = got.getValue()
            list2 = expected.getValue()
            self.assertEqual(len(list1), len(list2), msg)
            for i in range(len(list1)):
                self.compareParseTrees(list1[i], list2[i], msg)


class TestQueryParser(TestQueryParserBase):

    def test001(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("foo", AtomNode("foo"))

    def test002(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("note", AtomNode("note"))

    def test003(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("aa and bb AND cc",
                    AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))

    def test004(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import OrNode
        self.expect("aa OR bb or cc",
                    OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))

    def test005(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import OrNode
        self.expect("aa AND bb OR cc AnD dd",
                    OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
                            AndNode([AtomNode("cc"), AtomNode("dd")])]))

    def test006(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import OrNode
        self.expect("(aa OR bb) AND (cc OR dd)",
                    AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
                             OrNode([AtomNode("cc"), AtomNode("dd")])]))

    def test007(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import NotNode
        self.expect("aa AND NOT bb",
                    AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))

    def test010(self):
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self.expect('"foo bar"', PhraseNode(["foo", "bar"]))

    def test011(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))

    def test012(self):
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self.expect('(("foo bar"))"', PhraseNode(["foo", "bar"]))

    def test013(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))

    def test014(self):
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self.expect("foo-bar", PhraseNode(["foo", "bar"]))

    def test015(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import NotNode
        self.expect("foo -bar",
                    AndNode([AtomNode("foo"), NotNode(AtomNode("bar"))]))

    def test016(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import NotNode
        self.expect("-foo bar",
                    AndNode([AtomNode("bar"), NotNode(AtomNode("foo"))]))

    def test017(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import NotNode
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self.expect("booh -foo-bar",
                    AndNode([AtomNode("booh"),
                             NotNode(PhraseNode(["foo", "bar"]))]))

    def test018(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import NotNode
        from Products.ZCTextIndex.ParseTree import PhraseNode
        self.expect('booh -"foo bar"',
                    AndNode([AtomNode("booh"),
                             NotNode(PhraseNode(["foo", "bar"]))]))

    def test019(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('foo"bar"', AndNode([AtomNode("foo"), AtomNode("bar")]))

    def test020(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('"foo"bar', AndNode([AtomNode("foo"), AtomNode("bar")]))

    def test021(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('foo"bar"blech',
                    AndNode([AtomNode("foo"), AtomNode("bar"),
                             AtomNode("blech")]))

    def test022(self):
        from Products.ZCTextIndex.ParseTree import GlobNode
        self.expect("foo*", GlobNode("foo*"))

    def test023(self):
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        from Products.ZCTextIndex.ParseTree import GlobNode
        self.expect("foo* bar", AndNode([GlobNode("foo*"), AtomNode("bar")]))

    def test024(self):
        # Split by UTF-8 fullwidth space
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect("foo\xe3\x80\x80bar",
                    AndNode([AtomNode("foo"), AtomNode("bar")]))

    def test025(self):
        # Split by Unicode fullwidth space
        from Products.ZCTextIndex.ParseTree import AndNode
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect(u"foo\u3000bar",
                    AndNode([AtomNode(u"foo"), AtomNode(u"bar")]))

    def test101(self):
        self.failure("")

    def test102(self):
        self.failure("not")

    def test103(self):
        self.failure("or")

    def test104(self):
        self.failure("and")

    def test105(self):
        self.failure("NOT")

    def test106(self):
        self.failure("OR")

    def test107(self):
        self.failure("AND")

    def test108(self):
        self.failure("NOT foo")

    def test109(self):
        self.failure(")")

    def test110(self):
        self.failure("(")

    def test111(self):
        self.failure("foo OR")

    def test112(self):
        self.failure("foo AND")

    def test113(self):
        self.failure("OR foo")

    def test114(self):
        self.failure("AND foo")

    def test115(self):
        self.failure("(foo) bar")

    def test116(self):
        self.failure("(foo OR)")

    def test117(self):
        self.failure("(foo AND)")

    def test118(self):
        self.failure("(NOT foo)")

    def test119(self):
        self.failure("-foo")

    def test120(self):
        self.failure("-foo -bar")

    def test121(self):
        self.failure("foo OR -bar")

    def test122(self):
        self.failure("foo AND -bar")


class StopWordTestQueryParser(TestQueryParserBase):

    def setUp(self):
        from Products.ZCTextIndex.QueryParser import QueryParser
        from Products.ZCTextIndex.Lexicon import Lexicon
        from Products.ZCTextIndex.Lexicon import Splitter
        # Only 'stop' is a stopword (but 'and' is still an operator)
        self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
        self.parser = QueryParser(self.lexicon)

    def test201(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('and/', AtomNode("and"))

    def test202(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('foo AND stop', AtomNode("foo"), ["stop"])

    def test203(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('foo AND NOT stop', AtomNode("foo"), ["stop"])

    def test204(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('stop AND foo', AtomNode("foo"), ["stop"])

    def test205(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('foo OR stop', AtomNode("foo"), ["stop"])

    def test206(self):
        from Products.ZCTextIndex.ParseTree import AtomNode
        self.expect('stop OR foo', AtomNode("foo"), ["stop"])

    def test301(self):
        self.failure('stop')

    def test302(self):
        self.failure('stop stop')

    def test303(self):
        self.failure('stop AND stop')

    def test304(self):
        self.failure('stop OR stop')

    def test305(self):
        self.failure('stop -foo')

    def test306(self):
        self.failure('stop AND NOT foo')


class FakeStopWordRemover:

    def process(self, list):
        return [word for word in list if word != "stop"]


def test_suite():
    return TestSuite((makeSuite(TestQueryParser),
                      makeSuite(StopWordTestQueryParser),
                      makeSuite(TestInterfaces),
                     ))

if __name__ == "__main__":
    main(defaultTest='test_suite')
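Taken together, the cases above document the query grammar: AND/OR/NOT with parentheses, "-" as shorthand for NOT, double-quoted (or hyphenated) phrases, trailing-* globs, and bare juxtaposition meaning AND. Exercising the parser directly, under the same setup these tests use:

from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.QueryParser import QueryParser

parser = QueryParser(Lexicon(Splitter()))
tree = parser.parseQuery('(foo OR bar) AND NOT baz')
print tree.nodeType()     # 'AND'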
src/Products/ZCTextIndex/tests/testSetOps.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

from unittest import TestCase, TestSuite, main, makeSuite

from BTrees.IIBTree import IIBTree, IIBucket

from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion


class TestSetOps(TestCase):

    def testEmptyLists(self):
        self.assertEqual(len(mass_weightedIntersection([])), 0)
        self.assertEqual(len(mass_weightedUnion([])), 0)

    def testIdentity(self):
        t = IIBTree([(1, 2)])
        b = IIBucket([(1, 2)])
        for x in t, b:
            for func in mass_weightedUnion, mass_weightedIntersection:
                result = func([(x, 1)])
                self.assertEqual(len(result), 1)
                self.assertEqual(list(result.items()), list(x.items()))

    def testScalarMultiply(self):
        t = IIBTree([(1, 2), (2, 3), (3, 4)])
        allkeys = [1, 2, 3]
        b = IIBucket(t)
        for x in t, b:
            self.assertEqual(list(x.keys()), allkeys)
            for func in mass_weightedUnion, mass_weightedIntersection:
                for factor in 0, 1, 5, 10:
                    result = func([(x, factor)])
                    self.assertEqual(allkeys, list(result.keys()))
                    for key in x.keys():
                        self.assertEqual(x[key] * factor, result[key])

    def testPairs(self):
        t1 = IIBTree([(1, 10), (3, 30), (7, 70)])
        t2 = IIBTree([(3, 30), (5, 50), (7, 7), (9, 90)])
        allkeys = [1, 3, 5, 7, 9]
        b1 = IIBucket(t1)
        b2 = IIBucket(t2)
        for x in t1, t2, b1, b2:
            for key in x.keys():
                self.assertEqual(key in allkeys, 1)
            for y in t1, t2, b1, b2:
                for w1, w2 in (0, 0), (1, 10), (10, 1), (2, 3):
                    # Test the union.
                    expected = []
                    for key in allkeys:
                        if x.has_key(key) or y.has_key(key):
                            result = x.get(key, 0) * w1 + y.get(key, 0) * w2
                            expected.append((key, result))
                    expected.sort()
                    got = mass_weightedUnion([(x, w1), (y, w2)])
                    self.assertEqual(expected, list(got.items()))
                    got = mass_weightedUnion([(y, w2), (x, w1)])
                    self.assertEqual(expected, list(got.items()))

                    # Test the intersection.
                    expected = []
                    for key in allkeys:
                        if x.has_key(key) and y.has_key(key):
                            result = x[key] * w1 + y[key] * w2
                            expected.append((key, result))
                    expected.sort()
                    got = mass_weightedIntersection([(x, w1), (y, w2)])
                    self.assertEqual(expected, list(got.items()))
                    got = mass_weightedIntersection([(y, w2), (x, w1)])
                    self.assertEqual(expected, list(got.items()))

    def testMany(self):
        import random
        N = 15  # number of IIBTrees to feed in
        L = []
        commonkey = N * 1000
        allkeys = {commonkey: 1}
        for i in range(N):
            t = IIBTree()
            t[commonkey] = i
            for j in range(N - i):
                key = i + j
                allkeys[key] = 1
                t[key] = N * i + j
            L.append((t, i + 1))
        random.shuffle(L)
        allkeys = allkeys.keys()
        allkeys.sort()

        # Test the union.
        expected = []
        for key in allkeys:
            sum = 0
            for t, w in L:
                if t.has_key(key):
                    sum += t[key] * w
            expected.append((key, sum))
        # print 'union', expected
        got = mass_weightedUnion(L)
        self.assertEqual(expected, list(got.items()))

        # Test the intersection.
        expected = []
        for key in allkeys:
            sum = 0
            for t, w in L:
                if t.has_key(key):
                    sum += t[key] * w
                else:
                    break
            else:
                # We didn't break out of the loop so it's in the intersection.
                expected.append((key, sum))
        # print 'intersection', expected
        got = mass_weightedIntersection(L)
        self.assertEqual(expected, list(got.items()))


def test_suite():
    return makeSuite(TestSetOps)

if __name__ == "__main__":
    main(defaultTest='test_suite')
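The expected-value loops above amount to a compact definition: given (mapping, weight) pairs, mass_weightedUnion keeps every key found in any input with value sum(m[k] * w), while mass_weightedIntersection keeps only keys found in every input, combined the same way. A small worked instance:

from BTrees.IIBTree import IIBucket
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.SetOps import mass_weightedUnion

x = IIBucket([(1, 10), (3, 30)])
y = IIBucket([(3, 30), (5, 50)])
print list(mass_weightedUnion([(x, 2), (y, 3)]).items())
# -> [(1, 20), (3, 150), (5, 150)]
print list(mass_weightedIntersection([(x, 2), (y, 3)]).items())
# -> [(3, 150)]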
src/Products/ZCTextIndex/tests/testStopper.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Tests for the C version of the StopWordRemover."""

import unittest

from Products.ZCTextIndex import stopper


class StopperTest(unittest.TestCase):

    def test_process_typeerror(self):
        self.assertRaises(TypeError, stopper.process, 42, [])
        self.assertRaises(TypeError, stopper.process, {}, 42)
        self.assertRaises(TypeError, stopper.process, {})
        self.assertRaises(TypeError, stopper.process, {}, [], 'extra arg')

    def test_process_nostops(self):
        words = ['a', 'b', 'c', 'splat!']
        self.assertEqual(words, stopper.process({}, words))

    def test_process_somestops(self):
        d = {'b': 1, 'splat!': 1}
        words = ['a', 'b', 'c', 'splat!']
        self.assertEqual(['a', 'c'], stopper.process(d, words))

    def test_process_allstops(self):
        d = {'a': 1, 'b': 1, 'c': 1, 'splat!': 1}
        words = ['a', 'b', 'c', 'splat!']
        self.assertEqual([], stopper.process(d, words))


def test_suite():
    return unittest.makeSuite(StopperTest)

if __name__ == "__main__":
    unittest.main(defaultTest='test_suite')
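The contract these tests pin down for the C extension is small enough to state as a pure-Python reference sketch (an illustration of the behaviour the tests assert, not the shipped implementation):

def process(stopdict, words):
    # Return the words, in order, whose entries are absent from stopdict.
    return [w for w in words if w not in stopdict]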
src/Products/ZCTextIndex/tests/testZCTextIndex.py deleted 100644 → 0
##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""ZCTextIndex unit tests.

$Id$
"""

import unittest
import re

import Acquisition
from zExceptions import NotFound

from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex, PLexicon
from Products.ZCTextIndex.tests import \
     testIndex, testQueryEngine, testQueryParser
from Products.ZCTextIndex.BaseIndex import \
     scaled_int, SCALE_FACTOR, inverse_doc_frequency
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.Lexicon import Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import ParseError


class Indexable:
    def __init__(self, text):
        self.text = text


class Indexable2:
    def __init__(self, text1, text2):
        self.text1 = text1
        self.text2 = text2


class LexiconHolder(Acquisition.Implicit):
    def __init__(self, lexicon):
        self.lexicon = lexicon

    def getPhysicalPath(self):
        return ('',)  # Pretend to be the root

    def dummyUnrestrictedTraverse(self, path):
        if path == ('', 'lexicon',):
            return self.lexicon
        raise NotFound, path

# The tests classes below create a ZCTextIndex().  Then they create
# instance variables that point to the internal components used by
# ZCTextIndex.  These tests run the individual module unit tests with
# the fully integrated ZCTextIndex.

def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
    if abs(scaled1 - scaled2) > epsilon:
        raise AssertionError, "%s != %s" % (scaled1, scaled2)

# A series of text chunks to use for the re-index tests (testDocUpdate).
text = [
    """Here's a knocking indeed! If a
    man were porter of hell-gate, he should have
    old turning the key. knock (that made sure
    sure there's at least one word in common)."""

    """Knock,
    knock, knock! Who's there, i' the name of
    Beelzebub? Here's a farmer, that hanged
    himself on the expectation of plenty: come in
    time; have napkins enow about you; here
    you'll sweat for't.""",

    """Knock,
    knock! Who's there, in the other devil's
    name? Faith, here's an equivocator, that could
    swear in both the scales against either scale;
    who committed treason enough for God's sake,
    yet could not equivocate to heaven: O, come
    in, equivocator.""",

    """Knock,
    knock, knock! Who's there? Faith, here's an
    English tailor come hither, for stealing out of
    a French hose: come in, tailor; here you may
    roast your goose.""",

    """Knock,
    knock; never at quiet! What are you? But
    this place is too cold for hell. I'll devil-porter
    it no further: I had thought to have let in
    some of all professions that go the primrose
    way to the everlasting bonfire.""",
]

# Subclasses should derive from one of testIndex.{CosineIndexTest,
# OkapiIndexTest} too.

class ZCIndexTestsBase:

    def setUp(self):
        self.lexicon = PLexicon('lexicon', '',
                                Splitter(),
                                CaseNormalizer(),
                                StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name',
                                    None,
                                    caller,
                                    self.IndexFactory,
                                    'text',
                                    'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        r, num = self.zc_index.query(query)
        self.assertEqual(num, n)
        if n:
            self.assertEqual(r[0][0], 1)

    def testMultipleAttributes(self):
        lexicon = PLexicon('lexicon', '',
                           Splitter(),
                           CaseNormalizer(),
                           StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name',
                               None,
                               caller,
                               self.IndexFactory,
                               'text1,text2',
                               'lexicon')
        doc = Indexable2('foo bar', 'alpha omega')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('foo')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('foo alpha gamma')
        self.assertEqual(len(nbest), 0)

    def testListAttributes(self):
        lexicon = PLexicon('lexicon', '',
                           Splitter(),
                           CaseNormalizer(),
                           StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name',
                               None,
                               caller,
                               self.IndexFactory,
                               'text1,text2',
                               'lexicon')
        doc = Indexable2('Hello Tim',
                         ['Now is the winter of our discontent',
                          'Made glorious summer by this sun of York',
                          ])
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('York Tim')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('Tuesday Tim York')
        self.assertEqual(len(nbest), 0)

    def testStopWords(self):
        # the only non-stopword is question
        text = ("to be or not to be "
                "that is the question")
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        for word in text.split():
            if word != "question":
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)

        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        docid = 1   # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()

        d = {}  # word -> list of version numbers containing that word
        for version, i in zip(text, range(N)):
            # use a simple splitter rather than an official one
            words = [w for w in re.split("\W+", version.lower())
                     if len(w) > 1 and not stop.has_key(w)]
            word_seen = {}
            for w in words:
                if not word_seen.has_key(w):
                    d.setdefault(w, []).append(i)
                    word_seen[w] = 1

        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)
        self.assert_(len(common) > 0)
        self.assert_(len(unique) > 0)

        for version, i in zip(text, range(N)):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, "did not find %s" % w)
            for k, v in unique.items():
                if k == i:
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(total, 0,
                                     "did not expect to find %s" % w)


class CosineIndexTests(ZCIndexTestsBase, testIndex.CosineIndexTest):

    # A fairly involved test of the ranking calculations based on
    # an example set of documents in queries in Managing
    # Gigabytes, pp. 180-188.  This test peeks into many internals of the
    # cosine indexer.

    def test_z3interfaces(self):
        from Products.PluginIndexes.interfaces import IPluggableIndex
        from Products.ZCTextIndex.interfaces import IZCTextIndex
        from zope.interface.verify import verifyClass

        verifyClass(IPluggableIndex, ZCTextIndex)
        verifyClass(IZCTextIndex, ZCTextIndex)

    def testRanking(self):
        self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
                      "pease", "porridge", "pot"]
        self.docs = ["Pease porridge hot, pease porridge cold,",
                     "Pease porridge in the pot,",
                     "Nine days old.",
                     "In the pot cold, in the pot hot,",
                     "Pease porridge, pease porridge,",
                     "Eat the lot."]
        self._ranking_index()
        self._ranking_tf()
        self._ranking_idf()
        self._ranking_queries()

        # A digression to exercise re-indexing.
        docs = self.docs
        for variant in "hot cold porridge python", "pease hot pithy":
            self.zc_index.index_object(len(docs), Indexable(variant))
            try:
                self._ranking_tf()
            except (AssertionError, KeyError):
                pass
            else:
                self.fail("expected _ranking_tf() to fail -- reindex")

            try:
                self._ranking_idf()
            except (AssertionError, KeyError):
                pass
            else:
                self.fail("expected _ranking_idf() to fail -- reindex")

            try:
                self._ranking_queries()
            except AssertionError:
                pass
            else:
                self.fail("expected _ranking_queries() to fail -- reindex")

        # This should leave things exactly as they were.
        self.zc_index.index_object(len(docs), Indexable(docs[-1]))
        self._ranking_tf()
        self._ranking_idf()
        self._ranking_queries()

    def _ranking_index(self):
        docs = self.docs
        for i in range(len(docs)):
            self.zc_index.index_object(i + 1, Indexable(docs[i]))

    def _ranking_tf(self):
        # matrix of term weights for the rows are docids
        # and the columns are indexes into this list:
        l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
                 (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
                 (0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
                 (1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
                 (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
                 (0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
        l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]

        for i in range(len(l_Wd)):
            docid = i + 1
            scaled_Wd = scaled_int(l_Wd[i])
            eq(scaled_Wd, self.index._get_Wd(docid))
            wdts = [scaled_int(t) for t in l_wdt[i]]
            for j in range(len(wdts)):
                wdt = self.index._get_wdt(docid, self.words[j])
                eq(wdts[j], wdt)

    def _ranking_idf(self):
        word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
        idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
        for i in range(len(self.words)):
            word = self.words[i]
            eq(word_freqs[i], self.index._get_ft(word))
            eq(scaled_int(idfs[i]), self.index._get_wt(word))

    def _ranking_queries(self):
        queries = ["eat", "porridge", "hot OR porridge",
                   "eat OR nine OR day OR old OR porridge"]
        wqs = [1.95, 1.10, 1.77, 3.55]
        results = [[(6, 0.71)],
                   [(1, 0.61), (2, 0.58), (5, 0.71)],
                   [(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
                   [(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
        for i in range(len(queries)):
            raw = queries[i]
            q = QueryParser(self.lexicon).parseQuery(raw)
            wq = self.index.query_weight(q.terms())
            eq(wq, scaled_int(wqs[i]))
            r, n = self.zc_index.query(raw)
            self.assertEqual(len(r), len(results[i]))
            # convert the results to a dict for each checking
            d = {}
            for doc, score in results[i]:
                d[doc] = scaled_int(score)
            for doc, score in r:
                score = scaled_int(float(score / SCALE_FACTOR) / wq)
                self.assert_(0 <= score <= SCALE_FACTOR)
                eq(d[doc], score)


class OkapiIndexTests(ZCIndexTestsBase, testIndex.OkapiIndexTest):

    # A white-box test.
    def testAbsoluteScores(self):
        docs = ["one",
                "one two",
                "one two three"]

        for i in range(len(docs)):
            self.zc_index.index_object(i + 1, Indexable(docs[i]))

        self._checkAbsoluteScores()

        # Exercise re-indexing.
        for variant in "one xyz", "xyz two three", "abc def":
            self.zc_index.index_object(len(docs), Indexable(variant))
            try:
                self._checkAbsoluteScores()
            except AssertionError:
                pass
            else:
                self.fail("expected _checkAbsoluteScores() to fail "
                          "-- reindex")

        # This should leave things exactly as they were.
        self.zc_index.index_object(len(docs), Indexable(docs[-1]))
        self._checkAbsoluteScores()

    def _checkAbsoluteScores(self):
        self.assertEqual(self.index._totaldoclen(), 6)
        # So the mean doc length is 2.  We use that later.

        r, num = self.zc_index.query("one")
        self.assertEqual(num, 3)
        self.assertEqual(len(r), 3)
        # Because our Okapi's B parameter is > 0, and "one" only appears
        # once in each doc, the verbosity hypothesis favors shorter docs.
        self.assertEqual([doc for doc, score in r], [1, 2, 3])

        # The way the Okapi math works, a word that appears exactly once in
        # an average (length) doc gets tf score 1.  Our second doc has
        # an average length, so its score should be 1 (tf) times the
        # inverse doc frequency of "one".  But "one" appears in every
        # doc, so its IDF is log(1 + 3/3) = log(2).
        self.assertEqual(r[1][1], scaled_int(inverse_doc_frequency(3, 3)))

        # Similarly for "two".
        r, num = self.zc_index.query("two")
        self.assertEqual(num, 2)
        self.assertEqual(len(r), 2)
        self.assertEqual([doc for doc, score in r], [2, 3])
        self.assertEqual(r[0][1], scaled_int(inverse_doc_frequency(2, 3)))

        # And "three", except that doesn't appear in an average-size doc, so
        # the math is much more involved.
        r, num = self.zc_index.query("three")
        self.assertEqual(num, 1)
        self.assertEqual(len(r), 1)
        self.assertEqual([doc for doc, score in r], [3])
        idf = inverse_doc_frequency(1, 3)
        meandoclen = 2.0
        lengthweight = 1.0 - OkapiIndex.B + OkapiIndex.B * 3 / meandoclen
        tf = (1.0 + OkapiIndex.K1) / (1.0 + OkapiIndex.K1 * lengthweight)
        self.assertEqual(r[0][1], scaled_int(tf * idf))

    # More of a black-box test, but based on insight into how Okapi is trying
    # to think.
    def testRelativeScores(self):
        # Create 9 10-word docs.
        # All contain one instance of "one".
        # Doc #i contains i instances of "two" and 9-i of "xyz".
        for i in range(1, 10):
            doc = "one " + "two " * i + "xyz " * (9 - i)
            self.zc_index.index_object(i, Indexable(doc))

        self._checkRelativeScores()

        # Exercise re-indexing.
        self.zc_index.index_object(9, Indexable("two xyz"))
        try:
            self._checkRelativeScores()
        except AssertionError:
            pass
        else:
            self.fail("expected _checkRelativeScores() to fail after reindex")

        # This should leave things exactly as they were.
        self.zc_index.index_object(9, Indexable(doc))
        self._checkRelativeScores()

    def _checkRelativeScores(self):
        r, num = self.zc_index.query("one two")
        self.assertEqual(num, 9)
        self.assertEqual(len(r), 9)
        # The more twos in a doc, the better the score should be.
        self.assertEqual([doc for doc, score in r], range(9, 0, -1))

        # Search for "two" alone shouldn't make any difference to relative
        # results.
        r, num = self.zc_index.query("two")
        self.assertEqual(num, 9)
        self.assertEqual(len(r), 9)
        self.assertEqual([doc for doc, score in r], range(9, 0, -1))

        # Searching for xyz should skip doc 9, and favor the lower-numbered
        # docs (they have more instances of xyz).
        r, num = self.zc_index.query("xyz")
        self.assertEqual(num, 8)
        self.assertEqual(len(r), 8)
        self.assertEqual([doc for doc, score in r], range(1, 9))

        # And relative results shouldn't change if we add "one".
        r, num = self.zc_index.query("xyz one")
        self.assertEqual(num, 8)
        self.assertEqual(len(r), 8)
        self.assertEqual([doc for doc, score in r], range(1, 9))

        # But if we search for all the words, it's much muddier.  The boost
        # in going from i instances to i+1 of a given word is smaller than
        # the boost in going from i-1 to i, so the winner will be the one
        # that balances the # of twos and xyzs best.  But the test is nasty
        # that way:  doc 4 has 4 two and 5 xyz, while doc 5 has the reverse.
        # However, xyz is missing from doc 9, so xyz has a larger idf than
        # two has.  Since all the doc lengths are the same, doc lengths don't
        # matter.  So doc 4 should win, and doc 5 should come in second.
        # The loser will be the most unbalanced, but is that doc 1 (1 two 8
        # xyz) or doc 8 (8 two 1 xyz)?  Again xyz has a higher idf, so doc 1
        # is more valuable, and doc 8 is the loser.
        r, num = self.zc_index.query("xyz one two")
        self.assertEqual(num, 8)
        self.assertEqual(len(r), 8)
        self.assertEqual(r[0][0], 4)    # winner
        self.assertEqual(r[1][0], 5)    # runner up
        self.assertEqual(r[-1][0], 8)   # loser
        self.assertEqual(r[-2][0], 1)   # penultimate loser

        # And nothing about the relative results in the last test should
        # change if we leave "one" out of the search (it appears in all
        # docs, so it's a wash).
        r, num = self.zc_index.query("two xyz")
        self.assertEqual(num, 8)
        self.assertEqual(len(r), 8)
        self.assertEqual(r[0][0], 4)    # winner
        self.assertEqual(r[1][0], 5)    # runner up
        self.assertEqual(r[-1][0], 8)   # loser
        self.assertEqual(r[-2][0], 1)   # penultimate loser


############################################################################
# Subclasses of QueryTestsBase must set a class variable IndexFactory to
# the kind of index to be constructed.

class QueryTestsBase(testQueryEngine.TestQueryEngine,
                     testQueryParser.TestQueryParser):

    # The FauxIndex in testQueryEngine contains four documents.
    # docid 1: foo, bar, ham
    # docid 2: bar, ham
    # docid 3: foo, ham
    # docid 4: ham
    docs = ["foo bar ham", "bar ham", "foo ham", "ham"]

    def setUp(self):
        self.lexicon = PLexicon('lexicon', '',
                                Splitter(),
                                CaseNormalizer(),
                                StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name',
                                    None,
                                    caller,
                                    self.IndexFactory,
                                    'text',
                                    'lexicon')
        self.parser = QueryParser(self.lexicon)
        self.index = self.zc_index.index
        self.add_docs()

    def add_docs(self):
        for i in range(len(self.docs)):
            text = self.docs[i]
            obj = Indexable(text)
            self.zc_index.index_object(i + 1, obj)

    def compareSet(self, set, dict):
        # XXX The FauxIndex and the real Index score documents very
        # differently.  The set comparison can't actually compare the
        # items, but it can compare the keys.  That will have to do for now.
        setkeys = list(set.keys())
        dictkeys = dict.keys()
        setkeys.sort()
        dictkeys.sort()
        self.assertEqual(setkeys, dictkeys)


class CosineQueryTests(QueryTestsBase):
    IndexFactory = CosineIndex


class OkapiQueryTests(QueryTestsBase):
    IndexFactory = OkapiIndex


class PLexiconTests(unittest.TestCase):

    def _getTargetClass(self):
        from Products.ZCTextIndex.ZCTextIndex import PLexicon
        return PLexicon

    def _makeOne(self, id='testing', title='Testing', *pipeline):
        return self._getTargetClass()(id, title, *pipeline)

    def test_class_conforms_to_ILexicon(self):
        from Products.ZCTextIndex.interfaces import ILexicon
        from zope.interface.verify import verifyClass
        verifyClass(ILexicon, self._getTargetClass())

    def test_instance_conforms_to_ILexicon(self):
        from Products.ZCTextIndex.interfaces import ILexicon
        from zope.interface.verify import verifyObject
        verifyObject(ILexicon, self._makeOne())

    def test_class_conforms_to_IZCLexicon(self):
        from Products.ZCTextIndex.interfaces import IZCLexicon
        from zope.interface.verify import verifyClass
        verifyClass(IZCLexicon, self._getTargetClass())

    def test_instance_conforms_to_IZCLexicon(self):
        from Products.ZCTextIndex.interfaces import IZCLexicon
        from zope.interface.verify import verifyObject
        verifyObject(IZCLexicon, self._makeOne())

    def test_queryLexicon_defaults_empty(self):
        lexicon = self._makeOne()
        info = lexicon.queryLexicon(REQUEST=None, words=None)
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 20)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 0)
        self.assertEqual(info['word_count'], 0)
        self.assertEqual(list(info['page_range']), [])
        self.assertEqual(info['page_columns'], [])

    def test_queryLexicon_defaults_non_empty(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=None)
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 20)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 7)
        self.assertEqual(info['word_count'], 7)
        self.assertEqual(list(info['page_range']), [0])
        self.assertEqual(info['page_columns'], [WORDS])

    def test_queryLexicon_row_breaks(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=None, rows=4)
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 4)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 7)
        self.assertEqual(info['word_count'], 7)
        self.assertEqual(list(info['page_range']), [0])
        self.assertEqual(info['page_columns'], [WORDS[0:4], WORDS[4:]])

    def test_queryLexicon_page_breaks(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=None, rows=2, cols=2)
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 2)
        self.assertEqual(info['cols'], 2)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 4)
        self.assertEqual(info['word_count'], 7)
        self.assertEqual(list(info['page_range']), [0, 1])
        self.assertEqual(info['page_columns'], [WORDS[0:2], WORDS[2:4]])

    def test_queryLexicon_page_break_not_first(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=None,
                                    page=1, rows=2, cols=2)
        self.assertEqual(info['page'], 1)
        self.assertEqual(info['rows'], 2)
        self.assertEqual(info['cols'], 2)
        self.assertEqual(info['start_word'], 5)
        self.assertEqual(info['end_word'], 7)
        self.assertEqual(info['word_count'], 7)
        self.assertEqual(list(info['page_range']), [0, 1])
        self.assertEqual(info['page_columns'], [WORDS[4:6], WORDS[6:]])

    def test_queryLexicon_words_no_globbing(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=['aaa', 'bbb'])
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 20)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 2)
        self.assertEqual(info['word_count'], 2)
        self.assertEqual(list(info['page_range']), [0])
        self.assertEqual(info['page_columns'], [['aaa', 'bbb']])

    def test_queryLexicon_words_w_globbing(self):
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne()
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=['aa*', 'bbb*'])
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 20)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 2)
        self.assertEqual(info['word_count'], 2)
        self.assertEqual(list(info['page_range']), [0])
        self.assertEqual(info['page_columns'], [['aaa', 'bbb']])

    def test_queryLexicon_uses_pipeline_for_normalization(self):
        from Products.ZCTextIndex.Lexicon import CaseNormalizer
        WORDS = 'aaa bbb ccc ddd eee fff ggg'.split()
        lexicon = self._makeOne('test', 'Testing', CaseNormalizer())
        lexicon.sourceToWordIds(WORDS)
        info = lexicon.queryLexicon(REQUEST=None, words=['AA*', 'Bbb*'])
        self.assertEqual(info['page'], 0)
        self.assertEqual(info['rows'], 20)
        self.assertEqual(info['cols'], 4)
        self.assertEqual(info['start_word'], 1)
        self.assertEqual(info['end_word'], 2)
        self.assertEqual(info['word_count'], 2)
        self.assertEqual(list(info['page_range']), [0])
        self.assertEqual(info['page_columns'], [['aaa', 'bbb']])


def test_suite():
    s = unittest.TestSuite()
    for klass in (CosineIndexTests, OkapiIndexTests,
                  CosineQueryTests, OkapiQueryTests, PLexiconTests):
        s.addTest(unittest.makeSuite(klass))
    return s

if __name__ == '__main__':
    unittest.main(defaultTest='test_suite')
src/Products/ZCTextIndex/tests/wordstats.py deleted 100644 → 0
#! /usr/bin/env python
"""Dump statistics about each word in the index.

usage: wordstats.py data.fs [index key]
"""

import ZODB
from ZODB.FileStorage import FileStorage

def main(fspath, key):
    fs = FileStorage(fspath, read_only=1)
    db = ZODB.DB(fs)
    rt = db.open().root()
    index = rt[key]

    lex = index.lexicon
    idx = index.index
    print "Words", lex.length()
    print "Documents", idx.length()

    print "Word frequencies: count, word, wid"
    for word, wid in lex.items():
        docs = idx._wordinfo[wid]
        print len(docs), word, wid

    print "Per-doc scores: wid, (doc, score,)+"
    for wid in lex.wids():
        print wid,
        docs = idx._wordinfo[wid]
        for docid, score in docs.items():
            print docid, score,
        print

if __name__ == "__main__":
    import sys
    args = sys.argv[1:]
    index_key = "index"
    if len(args) == 1:
        fspath = args[0]
    elif len(args) == 2:
        fspath, index_key = args
    else:
        print "Expected 1 or 2 args, got", len(args)
    main(fspath, index_key)
src/Products/ZCTextIndex/www/index.gif deleted 100644 → 0 (111 Bytes)
src/Products/ZCTextIndex/www/lexicon.gif deleted 100644 → 0 (364 Bytes)
versions.cfg

@@ -13,6 +13,7 @@ initgroups = 2.13.0
 Missing = 2.13.1
 MultiMapping = 2.13.0
 Persistence = 2.13.2
+Products.ZCTextIndex = 2.13.0
 Record = 2.13.0
 RestrictedPython = 3.6.0a1
 tempstorage = 2.11.3