Commit 0226c34d authored by Casey Duncan's avatar Casey Duncan

Integration with Zope complete. ZCTextIndex is now a bona fide plug-in index.

Some additional plug-in index APIs were added to ZCTextIndex, and supporting APIs were added to Index and Lexicon.

_apply_index does not use NBest since ZCatalog has an incompatible strategy for finding the top results. NBest might be abstracted from this product for general consumption in application code.
parent e5cbcd43
......@@ -23,6 +23,9 @@ from BTrees.IIBTree import weightedIntersection, weightedUnion
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
import ZODB
from Persistence import Persistent
# Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024
# is large enough to get about 3 decimal digits of fractional info, and
......@@ -39,7 +42,7 @@ def scaled_int(f, scale=SCALE_FACTOR):
# expensive.
return int(f * scale + 0.5)
class Index:
class Index(Persistent):
__implements__ = IIndex
......@@ -59,6 +62,10 @@ class Index:
def length(self):
"""Return the number of documents in the index."""
return len(self._docwords)
def get_words(self, docid):
    """Return the decoded word ids stored for the given docid."""
    # _docwords holds one WidCode-encoded entry per docid; a missing
    # docid raises whatever the underlying mapping lookup raises.
    encoded = self._docwords[docid]
    return WidCode.decode(encoded)
# Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently
......@@ -97,6 +104,7 @@ class Index:
self._add_wordinfo(uniqwids[i], freqs[i], docid)
self._docweight[docid] = docweight
self._add_undoinfo(docid, wids)
return len(wids)
def unindex_doc(self, docid):
for wid in self._get_undoinfo(docid):
......
......@@ -59,6 +59,10 @@ class Lexicon:
if wid is not None:
wids.append(wid)
return wids
def get_word(self, wid):
    """Look up and return the word stored under word id `wid`."""
    words = self.__words
    return words[wid]
def globToWordIds(self, pattern):
if not re.match("^\w+\*$", pattern):
......
......@@ -21,15 +21,21 @@ from OFS.SimpleItem import SimpleItem
from Products.PluginIndexes.common.PluggableIndex \
import PluggableIndexInterface
from Products.PluginIndexes.common.util import parseIndexRequest
from Products.ZCTextIndex.Index import Index
from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.Lexicon \
import Lexicon, Splitter, CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from Globals import DTMLFile
from Globals import DTMLFile, InitializeClass
from Interface import verify_class_implementation
from AccessControl.SecurityInfo import ClassSecurityInfo
class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
"""Persistent TextIndex"""
__implements__ = PluggableIndexInterface
meta_type = 'ZCTextIndex'
......@@ -37,6 +43,8 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
manage_options= (
{'label': 'Settings', 'action': 'manage_main'},
)
query_options = ['query', 'nbest']
def __init__(self, id, extra, caller):
self.id = id
......@@ -45,23 +53,46 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
if lexicon is None:
raise LookupError, 'Lexicon "%s" not found' % extra.lexicon_id
verify_class_implementation(ILexicon, lexicon.__class__)
if not ILexicon.isImplementedBy(lexicon):
raise ValueError, \
'Object "%s" does not implement lexicon interface' \
% lexicon.getId()
self.lexicon = lexicon
self.index = Index(self.lexicon)
self.parser = QueryParser()
## Pluggable Index APIs ##
def index_object(self, docid, obj):
self.index.index_doc(docid, self._get_object_text(obj))
def index_object(self, docid, obj, threshold=None):
    # Extract the text of obj, hand it to the wrapped index under docid,
    # and pass back whatever count index_doc() reports.
    # XXX The subtransaction threshold is currently ignored.
    text = self._get_object_text(obj)
    indexed = self.index.index_doc(docid, text)
    self._p_changed = 1  # XXX
    return indexed
def unindex_object(self, docid):
    # Remove docid from the wrapped index, then flag this object as
    # changed.
    wrapped = self.index
    wrapped.unindex_doc(docid)
    self._p_changed = 1  # XXX
def _apply_index(self, req):
pass # XXX
def _apply_index(self, request, cid=''):
    """Apply the query specified by request, a mapping containing the query.

    Returns two objects on success: the result set containing the
    matching record numbers and a tuple containing the names of the
    fields used.

    Returns None if request is not valid for this index.
    """
    # `cid` is accepted but not used by this implementation.
    record = parseIndexRequest(request, self.id, self.query_options)
    # Identity test: None is a singleton, and `is` avoids invoking any
    # custom __eq__/__cmp__ on the parsed keys.
    if record.keys is None:
        return None
    # All query keys are combined into a single query string.
    query_str = ' '.join(record.keys)
    tree = self.parser.parseQuery(query_str)
    # The full result of the query is returned; no NBest selection is
    # applied here (unlike query()).
    results = tree.executeQuery(self.index)
    return results, (self._fieldname,)
def query(self, query, nbest=10):
# returns a mapping from docids to scores
......@@ -70,7 +101,20 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
chooser = NBest(nbest)
chooser.addmany(results.items())
return chooser.getbest()
def numObjects(self):
    """Return the number of objects indexed."""
    doc_count = self.index.length()
    return doc_count
def getEntryForObject(self, documentId, default=None):
    """Return the words indexed for documentId as a list.

    `default` is returned when the index raises KeyError for
    documentId.
    """
    try:
        wids = self.index.get_words(documentId)
    except KeyError:
        return default
    else:
        # Translate each word id back to its word through the lexicon.
        lookup = self.lexicon.get_word
        words = []
        for wid in wids:
            words.append(lookup(wid))
        return words
def _get_object_text(self, obj):
x = getattr(obj, self._fieldname)
if callable(x):
......@@ -82,6 +126,8 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
InitializeClass(ZCTextIndex)
def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
RESPONSE=None):
"""Add a text index"""
......@@ -93,15 +139,30 @@ manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
def manage_addLexicon(self, id, title, splitter=None, normalizer=None,
stopword=None, REQUEST=None):
stopwords=None, REQUEST=None):
"""Add ZCTextIndex Lexicon"""
elements = []
if splitter:
elements.append(Lexicon.Splitter())
elements.append(Splitter())
if normalizer:
elements.append(CaseNormalizer())
if stopwords:
elements.append(StopWordRemover())
lexicon = Lexicon(*elements)
lexicon = PLexicon(id, title, *elements)
self._setObject(id, lexicon)
if REQUEST is not None:
return self.manage_main(self, REQUEST, update_menu=1)
class PLexicon(Lexicon, Persistent, Acquisition.Implicit, SimpleItem):
    """Persistent Lexicon for ZCTextIndex"""

    meta_type = 'ZCTextIndex Lexicon'

    def __init__(self, id, title='', *pipeline):
        # Store id and title as plain strings, then let the inherited
        # initializer receive the pipeline elements.
        self.id, self.title = str(id), str(title)
        inherited_init = PLexicon.inheritedAttribute('__init__')
        inherited_init(self, *pipeline)
InitializeClass(PLexicon)
......@@ -21,8 +21,15 @@ def initialize(context):
context.registerClass(
ZCTextIndex.ZCTextIndex,
permission='Add Pluggable Index',
constructors=(ZCTextIndex.manage_addZCTextIndexForm,
permission = 'Add Pluggable Index',
constructors = (ZCTextIndex.manage_addZCTextIndexForm,
ZCTextIndex.manage_addZCTextIndex),
visibility=None
)
context.registerClass(
ZCTextIndex.PLexicon,
permission = 'Add Vocabularies',
constructors = (ZCTextIndex.manage_addLexiconForm,
ZCTextIndex.manage_addLexicon),
)
......@@ -33,7 +33,7 @@
splitter?
</td>
<td align="left" valign="top">
<input type="checkbox" name="splitter" />
<input type="checkbox" name="splitter" checked />
</td>
</tr>
......@@ -43,7 +43,7 @@
case normalizer?
</td>
<td align="left" valign="top">
<input type="checkbox" name="normalizer" />
<input type="checkbox" name="normalizer" checked />
</td>
</tr>
......@@ -53,7 +53,7 @@
remove stop words?
</td>
<td align="left" valign="top">
<input type="checkbox" name="stopword" />
<input type="checkbox" name="stopwords" checked />
</td>
</tr>
......
......@@ -27,23 +27,6 @@ from the most relevant to the least relevant.
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Vocabulary
</div>
</td>
<td>
<select name="extra.vocabulary:record">
<dtml-in "this().aq_parent.objectItems('Vocabulary')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
......@@ -60,12 +43,19 @@ from the most relevant to the least relevant.
Lexicon
</div></td>
<td>
<select name="extra.lexicon_id:record">
<dtml-in "this().aq_parent.objectItems('Lexicon')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
<dtml-in expr="superValues('ZCTextIndex Lexicon')">
<dtml-if sequence-start>
<select name="extra.lexicon_id:record">
</dtml-if>
<option value="&dtml-id;">
&dtml-id; <dtml-var title fmt="(%s)" missing>
</option>
<dtml-if sequence-end>
</select>
</dtml-if>
<dtml-else>
<em>You must create a ZCTextIndex Lexicon first.</em>
</dtml-in>
</td>
</tr>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment