Commit f6a8b104 authored by Casey Duncan's avatar Casey Duncan

Improved Zope integration

  * A pipeline factory registry now allows registration of possible
    pipeline elements for use by Zope lexicons.

  * ZMI constructor form for lexicon uses pipeline registry to generate form
    fields

  * ZMI constructor form for ZCTextIndex allows you to choose between
    Okapi and Cosine relevance algorithms
parent 4b2ced78
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
############################################################################## ##############################################################################
from Products.ZCTextIndex.ISplitter import ISplitter from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import splitter_factory
import re import re
...@@ -43,6 +44,8 @@ class HTMLWordSplitter: ...@@ -43,6 +44,8 @@ class HTMLWordSplitter:
rx = re.compile("[A-Za-z]") rx = re.compile("[A-Za-z]")
return [word for word in text.split() return [word for word in text.split()
if len(word) > 1 and rx.search(word)] if len(word) > 1 and rx.search(word)]
splitter_factory.registerFactory('HTML Word Splitter', HTMLWordSplitter)
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys
......
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Base as Interface
class IPipelineElementFactory(Interface):
    """Interface for a registry that creates pipeline elements by name."""

    def registerFactory(name, factory):
        """Register a pipeline element factory under the given name.

        Each name can be registered only once; a duplicate registration
        raises a ValueError.
        """

    def getFactoryNames():
        """Return a sorted list of the registered pipeline factory names."""

    def instantiate(name):
        """Instantiate and return the pipeline element registered as name.

        Raises a KeyError if name is not registered.
        """
...@@ -18,6 +18,7 @@ from BTrees.IOBTree import IOBTree ...@@ -18,6 +18,7 @@ from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree from BTrees.OIBTree import OIBTree
from Products.ZCTextIndex.ILexicon import ILexicon from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.StopDict import get_stopdict from Products.ZCTextIndex.StopDict import get_stopdict
from PipelineFactory import splitter_factory, element_factory
class Lexicon: class Lexicon:
...@@ -140,11 +141,15 @@ class Splitter: ...@@ -140,11 +141,15 @@ class Splitter:
for s in lst: for s in lst:
result += self.rxGlob.findall(s) result += self.rxGlob.findall(s)
return result return result
splitter_factory.registerFactory('Regex Splitter', Splitter)
class CaseNormalizer: class CaseNormalizer:
def process(self, lst): def process(self, lst):
return [w.lower() for w in lst] return [w.lower() for w in lst]
element_factory.registerFactory('Case Normalizer', CaseNormalizer)
class StopWordRemover: class StopWordRemover:
...@@ -161,3 +166,6 @@ class StopWordRemover: ...@@ -161,3 +166,6 @@ class StopWordRemover:
else: else:
def process(self, lst): def process(self, lst):
return self._process(self.dict, lst) return self._process(self.dict, lst)
element_factory.registerFactory('Stop Word Remover', StopWordRemover)
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Products.ZCTextIndex.IPipelineElementFactory \
import IPipelineElementFactory
class PipelineElementFactory:
    """Registry mapping unique names to pipeline element factories.

    A factory is any callable that can be invoked with no arguments and
    returns a pipeline element (typically the element's class).
    """

    __implements__ = IPipelineElementFactory

    def __init__(self):
        # Maps registered name -> zero-argument factory callable.
        self._elements = {}

    def registerFactory(self, name, factory):
        """Register factory under name.

        Raises ValueError if name has already been registered.
        """
        if name in self._elements:
            # Interpolate the offending name; the message previously
            # contained a literal, unformatted "%s".  The wording says
            # "pipeline element" because this class backs both the
            # splitter and the general element registries.
            raise ValueError(
                'ZCTextIndex pipeline element named "%s" already registered'
                % name)
        self._elements[name] = factory

    def getFactoryNames(self):
        """Return the registered names as a new, sorted list."""
        # Copy into a real list first so that sorting works whether keys()
        # returns a list or a view.
        names = list(self._elements.keys())
        names.sort()
        return names

    def instantiate(self, name):
        """Call the factory registered under name and return its result.

        Raises KeyError if name is not registered.
        """
        return self._elements[name]()
# Module-level registry of word splitter factories; a lexicon's chosen
# splitter is instantiated from here and placed first in its pipeline.
splitter_factory = PipelineElementFactory()
# Module-level registry of the other pipeline element factories (for
# example the case normalizer and the stop word remover).
element_factory = PipelineElementFactory()
...@@ -26,12 +26,17 @@ from Products.PluginIndexes.common.PluggableIndex import \ ...@@ -26,12 +26,17 @@ from Products.PluginIndexes.common.PluggableIndex import \
PluggableIndexInterface PluggableIndexInterface
from Products.PluginIndexes.common.util import parseIndexRequest from Products.PluginIndexes.common.util import parseIndexRequest
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
from Products.ZCTextIndex.ILexicon import ILexicon from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.Lexicon import \ from Products.ZCTextIndex.Lexicon import \
Lexicon, Splitter, CaseNormalizer, StopWordRemover Lexicon, Splitter, CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.NBest import NBest from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser from Products.ZCTextIndex.QueryParser import QueryParser
from PipelineFactory import splitter_factory, element_factory
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
index_types = {'Okapi BM25 Rank':OkapiIndex,
'Cosine Measure':CosineIndex}
class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem): class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
"""Persistent TextIndex""" """Persistent TextIndex"""
...@@ -50,7 +55,7 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem): ...@@ -50,7 +55,7 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
## Constructor ## ## Constructor ##
def __init__(self, id, extra, caller, index_factory=OkapiIndex): def __init__(self, id, extra, caller, index_factory=None):
self.id = id self.id = id
self._fieldname = extra.doc_attr self._fieldname = extra.doc_attr
lexicon = getattr(caller, extra.lexicon_id, None) lexicon = getattr(caller, extra.lexicon_id, None)
...@@ -64,7 +69,15 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem): ...@@ -64,7 +69,15 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
% lexicon.getId() % lexicon.getId()
self.lexicon = lexicon self.lexicon = lexicon
self._index_factory = index_factory
if index_factory is None:
if extra.index_type not in index_types.keys():
raise ValueError, 'Invalid index type "%s"' % extra.index_type
self._index_factory = index_types[extra.index_type]
self._index_type = extra.index_type
else:
self._index_factory = index_factory
self.clear() self.clear()
## External methods not in the Pluggable Index API ## ## External methods not in the Pluggable Index API ##
...@@ -144,6 +157,10 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem): ...@@ -144,6 +157,10 @@ class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
## User Interface Methods ## ## User Interface Methods ##
manage_main = DTMLFile('dtml/manageZCTextIndex', globals()) manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
def getIndexType(self):
"""Return index type string"""
return getattr(self, '_index_type', self._index_factory.__name__)
InitializeClass(ZCTextIndex) InitializeClass(ZCTextIndex)
...@@ -157,29 +174,39 @@ manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals()) ...@@ -157,29 +174,39 @@ manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals()) manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
def manage_addLexicon(self, id, title, splitter=None, normalizer=None, def manage_addLexicon(self, id, title='', splitter_name=None,
stopwords=None, REQUEST=None): element_names=None, REQUEST=None):
"""Add ZCTextIndex Lexicon""" """Add ZCTextIndex Lexicon"""
elements = []
if splitter: elements = [element_factory.instantiate(name) for name in element_names]
elements.append(Splitter())
if normalizer: if splitter_name:
elements.append(CaseNormalizer()) elements.insert(0, splitter_factory.instantiate(splitter_name))
if stopwords:
elements.append(StopWordRemover())
lexicon = PLexicon(id, title, *elements) lexicon = PLexicon(id, title, *elements)
self._setObject(id, lexicon) self._setObject(id, lexicon)
if REQUEST is not None: if REQUEST is not None:
return self.manage_main(self, REQUEST, update_menu=1) return self.manage_main(self, REQUEST, update_menu=1)
class PLexicon(Lexicon, Persistent, Acquisition.Implicit, SimpleItem): class PLexicon(Lexicon, Persistent, Acquisition.Implicit, SimpleItem):
"""Persistent Lexcion for ZCTextIndex""" """Persistent Lexicon for ZCTextIndex"""
meta_type = 'ZCTextIndex Lexicon' meta_type = 'ZCTextIndex Lexicon'
manage_options = ({'label':'Overview', 'action':'manage_main'},) + \
SimpleItem.manage_options
def __init__(self, id, title='', *pipeline): def __init__(self, id, title='', *pipeline):
self.id = str(id) self.id = str(id)
self.title = str(title) self.title = str(title)
PLexicon.inheritedAttribute('__init__')(self, *pipeline) PLexicon.inheritedAttribute('__init__')(self, *pipeline)
## User Interface Methods ##
def getPipelineNames(self):
"""Return list of names of pipeline element classes"""
return [element.__class__.__name__ for element in self._pipeline]
manage_main = DTMLFile('dtml/manageLexicon', globals())
InitializeClass(PLexicon) InitializeClass(PLexicon)
...@@ -16,14 +16,17 @@ ...@@ -16,14 +16,17 @@
Experimental plugin text index for ZCatalog. Experimental plugin text index for ZCatalog.
""" """
from PipelineFactory import splitter_factory, element_factory
from Products.ZCTextIndex import ZCTextIndex, HTMLSplitter
def initialize(context): def initialize(context):
from Products.ZCTextIndex import ZCTextIndex
context.registerClass( context.registerClass(
ZCTextIndex.ZCTextIndex, ZCTextIndex.ZCTextIndex,
permission = 'Add Pluggable Index', permission = 'Add Pluggable Index',
constructors = (ZCTextIndex.manage_addZCTextIndexForm, constructors = (ZCTextIndex.manage_addZCTextIndexForm,
ZCTextIndex.manage_addZCTextIndex), ZCTextIndex.manage_addZCTextIndex,
getIndexTypes),
icon='www/index.gif', icon='www/index.gif',
visibility=None visibility=None
) )
...@@ -32,6 +35,19 @@ def initialize(context): ...@@ -32,6 +35,19 @@ def initialize(context):
ZCTextIndex.PLexicon, ZCTextIndex.PLexicon,
permission = 'Add Vocabularies', permission = 'Add Vocabularies',
constructors = (ZCTextIndex.manage_addLexiconForm, constructors = (ZCTextIndex.manage_addLexiconForm,
ZCTextIndex.manage_addLexicon), ZCTextIndex.manage_addLexicon,
getSplitterNames, getElementNames),
icon='www/lexicon.gif' icon='www/lexicon.gif'
) )
## Functions below are for use in the ZMI constructor forms ##
def getSplitterNames(self):
    # Sorted names of the registered splitter factories; the add-lexicon
    # form iterates over these to fill its splitter <select>.
    return splitter_factory.getFactoryNames()
def getElementNames(self):
    # Sorted names of the registered non-splitter pipeline element
    # factories; the add-lexicon form renders one checkbox per name.
    return element_factory.getFactoryNames()
def getIndexTypes(self):
    # Names of the available ranking algorithms (the keys of the
    # index_types mapping), in whatever order the dictionary yields them;
    # the add-index form lists them in its "Ranking Strategy" <select>.
    return ZCTextIndex.index_types.keys()
<dtml-var manage_page_header> <dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _, <dtml-var "manage_form_title(this(), _,
form_title='Add Lexicon', form_title='Add ZCTextIndex Lexicon',
)"> )">
<p class="form-help">
A ZCTextIndex Lexicon processes and stores the words of documents indexed
with a ZCTextIndex. Multiple ZCTextIndexes can share the same lexicon.
</p>
<FORM ACTION="manage_addLexicon" METHOD="POST"> <form action="manage_addLexicon" method="POST">
<table cellspacing="0" cellpadding="2" border="0"> <table cellspacing="0" cellpadding="2" border="0">
<tr> <tr>
<td align="left" valign="top"> <td align="left" valign="top">
...@@ -16,6 +21,7 @@ ...@@ -16,6 +21,7 @@
<input type="text" name="id" size="40" /> <input type="text" name="id" size="40" />
</td> </td>
</tr> </tr>
<tr> <tr>
<td align="left" valign="top"> <td align="left" valign="top">
<div class="form-optional"> <div class="form-optional">
...@@ -29,33 +35,28 @@ ...@@ -29,33 +35,28 @@
<tr> <tr>
<td align="left" valign="top"> <td align="left" valign="top">
<div class="form-label"> <div class="form-label">Word Splitter</div>
splitter?
</td>
<td align="left" valign="top">
<input type="checkbox" name="splitter" checked />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
case normalizer?
</td> </td>
<td align="left" valign="top"> <td align="left" valign="top">
<input type="checkbox" name="normalizer" checked /> <select name="splitter_name">
<dtml-in name="getSplitterNames">
<option value="&dtml-sequence-item;">&dtml-sequence-item;</option>
</dtml-in>
</select>
</td> </td>
</tr> </tr>
<tr> <dtml-in name="getElementNames">
<td align="left" valign="top"> <tr>
<div class="form-label"> <td align="left" valign="top">
remove stop words? <div class="form-label">&dtml-sequence-item;</div>
</td> </td>
<td align="left" valign="top"> <td align="left" valign="top">
<input type="checkbox" name="stopwords" checked /> <input type="checkbox" name="element_names:list"
</td> value="&dtml-sequence-item;" checked />
</tr> </td>
</tr>
</dtml-in>
<tr> <tr>
<td align="left" valign="top"> <td align="left" valign="top">
......
...@@ -37,6 +37,21 @@ from the most relevant to the least relevant. ...@@ -37,6 +37,21 @@ from the most relevant to the least relevant.
</td> </td>
</tr> </tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Ranking Strategy
</div>
</td>
<td align="left" valign="top">
<select name="extra.index_type:record">
<dtml-in name="getIndexTypes">
<option value="&dtml-sequence-item;">&dtml-sequence-item;</option>
</dtml-in>
</select>
</td>
</tr>
<tr> <tr>
<td align="left" valign"top"> <td align="left" valign"top">
<div class="form-label"> <div class="form-label">
...@@ -48,7 +63,7 @@ from the most relevant to the least relevant. ...@@ -48,7 +63,7 @@ from the most relevant to the least relevant.
<select name="extra.lexicon_id:record"> <select name="extra.lexicon_id:record">
</dtml-if> </dtml-if>
<option value="&dtml-id;"> <option value="&dtml-id;">
&dtml-id; <dtml-var title fmt="(%s)" missing> &dtml-id; <dtml-var name="title" fmt="(%s)" null>
</option> </option>
<dtml-if sequence-end> <dtml-if sequence-end>
</select> </select>
...@@ -59,18 +74,6 @@ from the most relevant to the least relevant. ...@@ -59,18 +74,6 @@ from the most relevant to the least relevant.
</td> </td>
</tr> </tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Type
</div>
</td>
<td align="left" valign="top">
ZCTextIndex
</td>
</tr>
<tr> <tr>
<td align="left" valign="top"> <td align="left" valign="top">
</td> </td>
......
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
The lexicon processes and stores the words found in objects indexed by one
or more ZCTextIndexes.
</p>
<p class="section-bar">
<span class="form-label">Input Pipeline Stages</span>
</p>
<p>
Text indexed through this lexicon is processed by the following pipeline
stages:
</p>
<ol>
<dtml-in name="getPipelineNames">
<li>&dtml-sequence-item;</li>
</dtml-in>
</ol>
<dtml-var manage_page_footer>
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.IPipelineElement import IPipelineElement
from Products.ZCTextIndex.PipelineFactory import PipelineElementFactory
class NullPipelineElement:
    """Do-nothing pipeline element used as a placeholder in these tests."""

    __implements__ = IPipelineElement

    def process(self, source):
        # The interface declares process(source) without self, but a
        # concrete method needs the explicit instance argument; without
        # it, element.process(words) raises TypeError.  The element
        # performs no processing and returns None.
        pass
class PipelineFactoryTest(TestCase):
    """Exercise name registration on a fresh PipelineElementFactory."""

    def setUp(self):
        # Three independent placeholder elements to register.
        self.huey, self.dooey, self.louie = [
            NullPipelineElement() for _ in range(3)]

    def testPipeline(self):
        registry = PipelineElementFactory()
        for label in ('huey', 'dooey', 'louie'):
            registry.registerFactory(label, getattr(self, label))

        # Registering an already-used name must be rejected.
        self.assertRaises(
            ValueError, registry.registerFactory, 'huey', self.huey)

        # Names come back sorted, not in registration order.
        expected = ['dooey', 'huey', 'louie']
        self.assertEqual(registry.getFactoryNames(), expected)
def test_suite():
    """Return a suite built from the tests in PipelineFactoryTest."""
    return makeSuite(PipelineFactoryTest)
# When executed directly as a script, run the suite returned by test_suite().
if __name__=='__main__':
    main(defaultTest='test_suite')
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment