Commit 0226c34d authored by Casey Duncan

Integration with Zope complete. ZCTextIndex is now a bona fide plug-in index.

Some additional plug-in index APIs were added to ZCTextIndex, and supporting APIs were added to Index and Lexicon.

_apply_index does not use NBest since ZCatalog has an incompatible strategy for finding the top results. NBest might be abstracted from this product for general consumption in application code.
parent e5cbcd43
...@@ -23,6 +23,9 @@ from BTrees.IIBTree import weightedIntersection, weightedUnion ...@@ -23,6 +23,9 @@ from BTrees.IIBTree import weightedIntersection, weightedUnion
from Products.ZCTextIndex.IIndex import IIndex from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode from Products.ZCTextIndex import WidCode
import ZODB
from Persistence import Persistent
# Instead of storing floats, we generally store scaled ints. Binary pickles # Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024 # can store those more efficiently. The default SCALE_FACTOR of 1024
# is large enough to get about 3 decimal digits of fractional info, and # is large enough to get about 3 decimal digits of fractional info, and
...@@ -39,7 +42,7 @@ def scaled_int(f, scale=SCALE_FACTOR): ...@@ -39,7 +42,7 @@ def scaled_int(f, scale=SCALE_FACTOR):
# expensive. # expensive.
return int(f * scale + 0.5) return int(f * scale + 0.5)
class Index: class Index(Persistent):
__implements__ = IIndex __implements__ = IIndex
...@@ -59,6 +62,10 @@ class Index: ...@@ -59,6 +62,10 @@ class Index:
def length(self): def length(self):
"""Return the number of documents in the index.""" """Return the number of documents in the index."""
return len(self._docwords) return len(self._docwords)
def get_words(self, docid):
"""Returns the wordids for a given docid"""
return WidCode.decode(self._docwords[docid])
# Most of the computation for computing a relevance score for the # Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently # document occurs in the search() method. The code currently
...@@ -97,6 +104,7 @@ class Index: ...@@ -97,6 +104,7 @@ class Index:
self._add_wordinfo(uniqwids[i], freqs[i], docid) self._add_wordinfo(uniqwids[i], freqs[i], docid)
self._docweight[docid] = docweight self._docweight[docid] = docweight
self._add_undoinfo(docid, wids) self._add_undoinfo(docid, wids)
return len(wids)
def unindex_doc(self, docid): def unindex_doc(self, docid):
for wid in self._get_undoinfo(docid): for wid in self._get_undoinfo(docid):
......
...@@ -59,6 +59,10 @@ class Lexicon: ...@@ -59,6 +59,10 @@ class Lexicon:
if wid is not None: if wid is not None:
wids.append(wid) wids.append(wid)
return wids return wids
def get_word(self, wid):
"""Return the word for the given word id"""
return self.__words[wid]
def globToWordIds(self, pattern): def globToWordIds(self, pattern):
if not re.match("^\w+\*$", pattern): if not re.match("^\w+\*$", pattern):
......
...@@ -21,15 +21,21 @@ from OFS.SimpleItem import SimpleItem ...@@ -21,15 +21,21 @@ from OFS.SimpleItem import SimpleItem
from Products.PluginIndexes.common.PluggableIndex \ from Products.PluginIndexes.common.PluggableIndex \
import PluggableIndexInterface import PluggableIndexInterface
from Products.PluginIndexes.common.util import parseIndexRequest
from Products.ZCTextIndex.Index import Index from Products.ZCTextIndex.Index import Index
from Products.ZCTextIndex.ILexicon import ILexicon from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.Lexicon \
import Lexicon, Splitter, CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.NBest import NBest from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser from Products.ZCTextIndex.QueryParser import QueryParser
from Globals import DTMLFile from Globals import DTMLFile, InitializeClass
from Interface import verify_class_implementation from Interface import verify_class_implementation
from AccessControl.SecurityInfo import ClassSecurityInfo
class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem): class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
"""Persistent TextIndex"""
__implements__ = PluggableIndexInterface __implements__ = PluggableIndexInterface
meta_type = 'ZCTextIndex' meta_type = 'ZCTextIndex'
...@@ -37,6 +43,8 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem): ...@@ -37,6 +43,8 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
manage_options= ( manage_options= (
{'label': 'Settings', 'action': 'manage_main'}, {'label': 'Settings', 'action': 'manage_main'},
) )
query_options = ['query', 'nbest']
def __init__(self, id, extra, caller): def __init__(self, id, extra, caller):
self.id = id self.id = id
...@@ -45,23 +53,46 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem): ...@@ -45,23 +53,46 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
if lexicon is None: if lexicon is None:
raise LookupError, 'Lexicon "%s" not found' % extra.lexicon_id raise LookupError, 'Lexicon "%s" not found' % extra.lexicon_id
verify_class_implementation(ILexicon, lexicon.__class__) if not ILexicon.isImplementedBy(lexicon):
raise ValueError, \
'Object "%s" does not implement lexicon interface' \
% lexicon.getId()
self.lexicon = lexicon self.lexicon = lexicon
self.index = Index(self.lexicon) self.index = Index(self.lexicon)
self.parser = QueryParser() self.parser = QueryParser()
## Pluggable Index APIs ##
def index_object(self, docid, obj): def index_object(self, docid, obj, threshold=None):
self.index.index_doc(docid, self._get_object_text(obj)) # XXX We currently ignore subtransaction threshold
count = self.index.index_doc(docid, self._get_object_text(obj))
self._p_changed = 1 # XXX self._p_changed = 1 # XXX
return count
def unindex_object(self, docid): def unindex_object(self, docid):
self.index.unindex_doc(docid) self.index.unindex_doc(docid)
self._p_changed = 1 # XXX self._p_changed = 1 # XXX
def _apply_index(self, req): def _apply_index(self, request, cid=''):
pass # XXX """Apply the query specified by request which is a mapping
containing the query
Returns two objects on success, the resultSet containing the
matching record numbers and a tuple containing the names of the
fields used
Returns None if request is not valid for this index.
"""
record = parseIndexRequest(request, self.id, self.query_options)
if record.keys==None:
return None
query_str = ' '.join(record.keys)
tree = self.parser.parseQuery(query_str)
results = tree.executeQuery(self.index)
return results, (self._fieldname,)
def query(self, query, nbest=10): def query(self, query, nbest=10):
# returns a mapping from docids to scores # returns a mapping from docids to scores
...@@ -70,7 +101,20 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem): ...@@ -70,7 +101,20 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
chooser = NBest(nbest) chooser = NBest(nbest)
chooser.addmany(results.items()) chooser.addmany(results.items())
return chooser.getbest() return chooser.getbest()
def numObjects(self):
"""Return number of objects indexed"""
return self.index.length()
def getEntryForObject(self, documentId, default=None):
"""Return the list of words indexed for documentId"""
try:
word_ids = self.index.get_words(documentId)
except KeyError:
return default
get_word = self.lexicon.get_word
return [get_word(wid) for wid in word_ids]
def _get_object_text(self, obj): def _get_object_text(self, obj):
x = getattr(obj, self._fieldname) x = getattr(obj, self._fieldname)
if callable(x): if callable(x):
...@@ -82,6 +126,8 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem): ...@@ -82,6 +126,8 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
manage_main = DTMLFile('dtml/manageZCTextIndex', globals()) manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
InitializeClass(ZCTextIndex)
def manage_addZCTextIndex(self, id, extra=None, REQUEST=None, def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
RESPONSE=None): RESPONSE=None):
"""Add a text index""" """Add a text index"""
...@@ -93,15 +139,30 @@ manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals()) ...@@ -93,15 +139,30 @@ manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals()) manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
def manage_addLexicon(self, id, title, splitter=None, normalizer=None, def manage_addLexicon(self, id, title, splitter=None, normalizer=None,
stopword=None, REQUEST=None): stopwords=None, REQUEST=None):
"""Add ZCTextIndex Lexicon"""
elements = [] elements = []
if splitter: if splitter:
elements.append(Lexicon.Splitter()) elements.append(Splitter())
if normalizer: if normalizer:
elements.append(CaseNormalizer()) elements.append(CaseNormalizer())
if stopwords: if stopwords:
elements.append(StopWordRemover()) elements.append(StopWordRemover())
lexicon = Lexicon(*elements) lexicon = PLexicon(id, title, *elements)
self._setObject(id, lexicon) self._setObject(id, lexicon)
if REQUEST is not None: if REQUEST is not None:
return self.manage_main(self, REQUEST, update_menu=1) return self.manage_main(self, REQUEST, update_menu=1)
class PLexicon(Lexicon, Persistent, Acquisition.Implicit, SimpleItem):
"""Persistent Lexicon for ZCTextIndex"""
meta_type = 'ZCTextIndex Lexicon'
def __init__(self, id, title='', *pipeline):
self.id = str(id)
self.title = str(title)
PLexicon.inheritedAttribute('__init__')(self, *pipeline)
InitializeClass(PLexicon)
...@@ -21,8 +21,15 @@ def initialize(context): ...@@ -21,8 +21,15 @@ def initialize(context):
context.registerClass( context.registerClass(
ZCTextIndex.ZCTextIndex, ZCTextIndex.ZCTextIndex,
permission='Add Pluggable Index', permission = 'Add Pluggable Index',
constructors=(ZCTextIndex.manage_addZCTextIndexForm, constructors = (ZCTextIndex.manage_addZCTextIndexForm,
ZCTextIndex.manage_addZCTextIndex), ZCTextIndex.manage_addZCTextIndex),
visibility=None visibility=None
) )
context.registerClass(
ZCTextIndex.PLexicon,
permission = 'Add Vocabularies',
constructors = (ZCTextIndex.manage_addLexiconForm,
ZCTextIndex.manage_addLexicon),
)
...@@ -33,7 +33,7 @@ ...@@ -33,7 +33,7 @@
splitter? splitter?
</td> </td>
<td align="left" valign="top"> <td align="left" valign="top">
<input type="checkbox" name="splitter" /> <input type="checkbox" name="splitter" checked />
</td> </td>
</tr> </tr>
...@@ -43,7 +43,7 @@ ...@@ -43,7 +43,7 @@
case normalizer? case normalizer?
</td> </td>
<td align="left" valign="top"> <td align="left" valign="top">
<input type="checkbox" name="normalizer" /> <input type="checkbox" name="normalizer" checked />
</td> </td>
</tr> </tr>
...@@ -53,7 +53,7 @@ ...@@ -53,7 +53,7 @@
remove stop words? remove stop words?
</td> </td>
<td align="left" valign="top"> <td align="left" valign="top">
<input type="checkbox" name="stopword" /> <input type="checkbox" name="stopwords" checked />
</td> </td>
</tr> </tr>
......
...@@ -27,23 +27,6 @@ from the most relevant to the least relevant. ...@@ -27,23 +27,6 @@ from the most relevant to the least relevant.
</td> </td>
</tr> </tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Vocabulary
</div>
</td>
<td>
<select name="extra.vocabulary:record">
<dtml-in "this().aq_parent.objectItems('Vocabulary')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
</td>
</tr>
<tr> <tr>
<td align="left" valign="top"> <td align="left" valign="top">
<div class="form-label"> <div class="form-label">
...@@ -60,12 +43,19 @@ from the most relevant to the least relevant. ...@@ -60,12 +43,19 @@ from the most relevant to the least relevant.
Lexicon Lexicon
</div></td> </div></td>
<td> <td>
<select name="extra.lexicon_id:record"> <dtml-in expr="superValues('ZCTextIndex Lexicon')">
<dtml-in "this().aq_parent.objectItems('Lexicon')"> <dtml-if sequence-start>
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">) <select name="extra.lexicon_id:record">
</dtml-in> </dtml-if>
</select> <option value="&dtml-id;">
&dtml-id; <dtml-var title fmt="(%s)" missing>
</option>
<dtml-if sequence-end>
</select>
</dtml-if>
<dtml-else>
<em>You must create a ZCTextIndex Lexicon first.</em>
</dtml-in>
</td> </td>
</tr> </tr>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment