Commit fbd41e2f authored by Casey Duncan's avatar Casey Duncan

Enhanced pipeline element factory so that you can group elements that must be

selected in a mutually exclusive manner (such as splitters).

Existing pipeline elements have been grouped appropriately.

Added a stop word remover that does not remove single char words.

Modified ZMI lexicon add form to use pipeline element groups to render form.
Groups with multiple elements are rendered as selects, singletons are rendered
as checkboxes.
parent 5402d08f
......@@ -13,7 +13,7 @@
##############################################################################
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import splitter_factory
from Products.ZCTextIndex.PipelineFactory import element_factory
import re
......@@ -45,7 +45,9 @@ class HTMLWordSplitter:
return [word for word in text.split()
if len(word) > 1 and rx.search(word)]
splitter_factory.registerFactory('HTML Word Splitter', HTMLWordSplitter)
element_factory.registerFactory('Word Splitter',
'HTML aware splitter',
HTMLWordSplitter)
if __name__ == "__main__":
import sys
......
......@@ -17,18 +17,23 @@ from Interface import Base as Interface
class IPipelineElementFactory(Interface):
"""Class for creating pipeline elements by name"""
def registerFactory(name, factory):
"""Registers a pipeline factory by name.
def registerFactory(group, name, factory):
"""Registers a pipeline factory by name and element group.
Each name can be registered only once. Duplicate registrations
will raise a ValueError
Each name can be registered only once for a given group. Duplicate
registrations will raise a ValueError
"""
def getFactoryNames():
def getFactoryGroups():
"""Returns a sorted list of element group names
"""
def getFactoryNames(group):
"""Returns a sorted list of registered pipeline factory names
in the specified element group
"""
def instantiate(name):
"""Instantiates a pipeline element by name. If name is not registered
raise a KeyError.
def instantiate(group, name):
"""Instantiates a pipeline element by group and name. If name is not
registered raise a KeyError.
"""
......@@ -20,8 +20,7 @@ from BTrees.OIBTree import OIBTree
from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import QueryError
from Products.ZCTextIndex.PipelineFactory import \
splitter_factory, element_factory
from Products.ZCTextIndex.PipelineFactory import element_factory
class Lexicon:
......@@ -169,20 +168,26 @@ class Splitter:
result += self.rxGlob.findall(s)
return result
splitter_factory.registerFactory('Regex Splitter', Splitter)
element_factory.registerFactory('Word Splitter',
'Whitespace splitter',
Splitter)
class CaseNormalizer:
    """Pipeline element that lower-cases every word passed through it."""

    def process(self, lst):
        """Return a new list with the lower-cased form of each word in lst.

        The input sequence itself is left unmodified.
        """
        lowered = []
        for word in lst:
            lowered.append(word.lower())
        return lowered
element_factory.registerFactory('Case Normalizer', CaseNormalizer)
element_factory.registerFactory('Case Normalizer',
'Case Normalizer',
CaseNormalizer)
element_factory.registerFactory('Stop Words',
' Don\'t remove stop words',
None)
class StopWordRemover:
dict = get_stopdict().copy()
for c in range(255):
dict[chr(c)] = None
try:
from Products.ZCTextIndex.stopper import process as _process
......@@ -194,5 +199,16 @@ class StopWordRemover:
def process(self, lst):
return self._process(self.dict, lst)
element_factory.registerFactory('Stop Words',
'Remove listed stop words only',
StopWordRemover)
class StopWordAndSingleCharRemover(StopWordRemover):
    """Stop word remover whose stop table also lists every single character.

    The filtering itself is inherited from StopWordRemover, which hands
    this class-level ``dict`` to its ``_process`` helper; only the table
    differs here.
    """

    # Work on a private copy so that adding the single-character entries
    # does not alter the table returned by get_stopdict().
    # NOTE: the attribute must stay named ``dict`` (even though it shadows
    # the builtin inside this class body) because the inherited process()
    # reads ``self.dict``.
    dict = get_stopdict().copy()

    # range(256), not range(255): chr() accepts 0..255 for one-byte
    # characters, and the previous upper bound silently left chr(255)
    # out of the stop table.
    for c in range(256):
        dict[chr(c)] = None
element_factory.registerFactory('Stop Word Remover', StopWordRemover)
element_factory.registerFactory('Stop Words',
'Remove listed and single char words',
StopWordAndSingleCharRemover)
......@@ -20,24 +20,33 @@ class PipelineElementFactory:
__implements__ = IPipelineElementFactory
def __init__(self):
self._elements = {}
def registerFactory(self, name, factory):
if self._elements.has_key(name):
raise ValueError, 'ZCTextIndex splitter named' + \
'"%s" already registered'
self._elements[name] = factory
def getFactoryNames(self):
names = self._elements.keys()
self._groups = {}
def registerFactory(self, group, name, factory):
    """Register ``factory`` under ``name`` inside the element group ``group``.

    A group is created the first time something is registered in it.
    ``factory`` is stored as given; it may be None, which instantiate()
    treats as "produce no element".

    Raises ValueError if ``name`` is already registered in ``group``; the
    existing registration is left untouched in that case.
    """
    # setdefault fetches the group's name -> factory mapping, creating it
    # on first use. A freshly created mapping is empty, so the duplicate
    # check below can only fire for a group that already existed.
    # (``in`` replaces dict.has_key, which no longer exists on Python 3.)
    elements = self._groups.setdefault(group, {})
    if name in elements:
        raise ValueError('ZCTextIndex lexicon element "%s" '
                         'already registered in group "%s"'
                         % (name, group))
    elements[name] = factory
def getFactoryGroups(self):
    """Return a new, sorted list of the registered element group names."""
    # Copy the keys into a real list before sorting: on Python 3 keys()
    # returns a view object that has no sort() method.
    groups = list(self._groups.keys())
    groups.sort()
    return groups
def getFactoryNames(self, group):
    """Return a new, sorted list of the factory names registered in ``group``.

    Raises KeyError if nothing has been registered under ``group``.
    """
    # Copy the keys into a real list before sorting: on Python 3 keys()
    # returns a view object that has no sort() method.
    names = list(self._groups[group].keys())
    names.sort()
    return names
def instantiate(self, name):
return self._elements[name]()
splitter_factory = PipelineElementFactory()
def instantiate(self, group, name):
    """Create the pipeline element registered as ``name`` in ``group``.

    Returns the result of calling the registered factory with no
    arguments, or None when the registered factory is itself None.

    Raises KeyError if ``group`` or ``name`` is not registered.
    """
    element_factory_for_name = self._groups[group][name]
    if element_factory_for_name is None:
        # A None registration is a placeholder with nothing to build.
        return None
    return element_factory_for_name()
element_factory = PipelineElementFactory()
......@@ -31,7 +31,7 @@ from Products.ZCTextIndex.Lexicon import \
Lexicon, Splitter, CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from PipelineFactory import splitter_factory, element_factory
from PipelineFactory import element_factory
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
......@@ -174,16 +174,23 @@ manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
def manage_addLexicon(self, id, title='', splitter_name=None,
element_names=None, REQUEST=None):
def manage_addLexicon(self, id, title='', elements=[], REQUEST=None):
"""Add ZCTextIndex Lexicon"""
elements = [element_factory.instantiate(name) for name in element_names]
if splitter_name:
elements.insert(0, splitter_factory.instantiate(splitter_name))
pipeline = []
for el_record in elements:
if not hasattr(el_record, 'name'):
continue # Skip over records that only specify element group
element = element_factory.instantiate(el_record.group, el_record.name)
if element is not None:
if el_record.group == 'Word Splitter':
# I don't like hardcoding this, but it's a simple solution
# to get the splitter element first in the pipeline
pipeline.insert(0, element)
else:
pipeline.append(element)
lexicon = PLexicon(id, title, *elements)
lexicon = PLexicon(id, title, *pipeline)
self._setObject(id, lexicon)
if REQUEST is not None:
return self.manage_main(self, REQUEST, update_menu=1)
......
......@@ -16,7 +16,7 @@
Experimental plugin text index for ZCatalog.
"""
from PipelineFactory import splitter_factory, element_factory
from PipelineFactory import element_factory
from Products.ZCTextIndex import ZCTextIndex, HTMLSplitter
def initialize(context):
......@@ -36,17 +36,17 @@ def initialize(context):
permission = 'Add Vocabularies',
constructors = (ZCTextIndex.manage_addLexiconForm,
ZCTextIndex.manage_addLexicon,
getSplitterNames, getElementNames),
getElementGroups, getElementNames),
icon='www/lexicon.gif'
)
## Functions below are for use in the ZMI constructor forms ##
def getSplitterNames(self):
return splitter_factory.getFactoryNames()
def getElementGroups(self):
    """Return the element group names reported by element_factory.

    Passed as a constructor in initialize() so the ZMI add form can call it.
    """
    group_names = element_factory.getFactoryGroups()
    return group_names
def getElementNames(self):
return element_factory.getFactoryNames()
def getElementNames(self, group):
    """Return the element names element_factory reports for ``group``.

    Passed as a constructor in initialize() so the ZMI add form can call it.
    """
    element_names = element_factory.getFactoryNames(group)
    return element_names
def getIndexTypes(self):
    """Return the keys of the ZCTextIndex.index_types mapping."""
    index_types = ZCTextIndex.index_types
    return index_types.keys()
......
......@@ -33,29 +33,29 @@
</td>
</tr>
<dtml-in name="getElementGroups" prefix="group">
<dtml-let elements="getElementNames(group_item)">
<tr>
<td align="left" valign="top">
<div class="form-label">Word Splitter</div>
<div class="form-label">&dtml-group_item;</div>
</td>
<td align="left" valign="top">
<select name="splitter_name">
<dtml-in name="getSplitterNames">
<option value="&dtml-sequence-item;">&dtml-sequence-item;</option>
<input type="hidden" name="elements.group:records"
value="&dtml-group_item;" />
<dtml-if expr="_.len(elements) > 1">
<select name="elements.name:records">
<dtml-in name="elements">
<option value="&dtml-sequence-item;"
>&dtml-sequence-item;</option>
</dtml-in>
</select>
<dtml-else>
<input type="checkbox" name="elements.name:records"
value="<dtml-var expr="elements[0]">" checked />
</dtml-if>
</td>
</tr>
<dtml-in name="getElementNames">
<tr>
<td align="left" valign="top">
<div class="form-label">&dtml-sequence-item;</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="element_names:list"
value="&dtml-sequence-item;" checked />
</td>
</tr>
</dtml-let>
</dtml-in>
<tr>
......
......@@ -29,14 +29,19 @@ class PipelineFactoryTest(TestCase):
self.huey = NullPipelineElement()
self.dooey = NullPipelineElement()
self.louie = NullPipelineElement()
self.daffy = NullPipelineElement()
def testPipeline(self):
pf = PipelineElementFactory()
pf.registerFactory('huey', self.huey)
pf.registerFactory('dooey', self.dooey)
pf.registerFactory('louie', self.louie)
self.assertRaises(ValueError, pf.registerFactory, 'huey', self.huey)
self.assertEqual(pf.getFactoryNames(), ['dooey', 'huey', 'louie'])
pf.registerFactory('donald', 'huey', self.huey)
pf.registerFactory('donald', 'dooey', self.dooey)
pf.registerFactory('donald', 'louie', self.louie)
pf.registerFactory('looney', 'daffy', self.daffy)
self.assertRaises(ValueError, pf.registerFactory,'donald', 'huey',
self.huey)
self.assertEqual(pf.getFactoryGroups(), ['donald', 'looney'])
self.assertEqual(pf.getFactoryNames('donald'),
['dooey', 'huey', 'louie'])
def test_suite():
return makeSuite(PipelineFactoryTest)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment