Commit b8c3a39b authored by Michel Pelletier's avatar Michel Pelletier

Added stop word interface to Lexicon.

parent 8e6e5acb
...@@ -91,7 +91,7 @@ from Persistence import Persistent ...@@ -91,7 +91,7 @@ from Persistence import Persistent
from OFS.SimpleItem import Item from OFS.SimpleItem import Item
from SearchIndex import Lexicon, GlobbingLexicon from SearchIndex import Lexicon, GlobbingLexicon
from VocabularyInterface import VocabularyInterface from SearchIndex.Lexicon import stop_word_dict
manage_addVocabularyForm=HTMLFile('addVocabulary',globals()) manage_addVocabularyForm=HTMLFile('addVocabulary',globals())
...@@ -113,7 +113,6 @@ class Vocabulary(Item, Persistent, Implicit): ...@@ -113,7 +113,6 @@ class Vocabulary(Item, Persistent, Implicit):
meta_type = "Vocabulary" meta_type = "Vocabulary"
_isAVocabulary = 1 _isAVocabulary = 1
__extends__=(VocabularyInterface,)
manage_options=( manage_options=(
...@@ -137,6 +136,7 @@ class Vocabulary(Item, Persistent, Implicit): ...@@ -137,6 +136,7 @@ class Vocabulary(Item, Persistent, Implicit):
['Anonymous', 'Manager']), ['Anonymous', 'Manager']),
) )
## manage_main = HTMLFile('vocab_manage_main', globals()) ## manage_main = HTMLFile('vocab_manage_main', globals())
manage_vocabulary = HTMLFile('manage_vocab', globals()) manage_vocabulary = HTMLFile('manage_vocab', globals())
...@@ -151,7 +151,7 @@ class Vocabulary(Item, Persistent, Implicit): ...@@ -151,7 +151,7 @@ class Vocabulary(Item, Persistent, Implicit):
if globbing: if globbing:
self.lexicon = GlobbingLexicon.GlobbingLexicon() self.lexicon = GlobbingLexicon.GlobbingLexicon()
else: else:
self.lexicon = Lexicon.Lexicon() self.lexicon = Lexicon.Lexicon(stop_word_dict)
def query(self, pattern): def query(self, pattern):
""" """ """ """
...@@ -171,6 +171,11 @@ class Vocabulary(Item, Persistent, Implicit): ...@@ -171,6 +171,11 @@ class Vocabulary(Item, Persistent, Implicit):
if RESPONSE: if RESPONSE:
RESPONSE.redirect(URL1 + '/manage_vocabulary') RESPONSE.redirect(URL1 + '/manage_vocabulary')
def manage_stop_syn(self, stop_syn, REQUEST=None):
    """Accept the stop-word/synonym list posted by the manage_vocab form.

    stop_syn -- list of lines from the 'stop_syn:lines' textarea
    REQUEST  -- the Zope request object, when invoked from the web
    """
    # TODO(review): stub — the manage_vocab form posts 'stop_syn:lines'
    # here, but nothing is stored yet; presumably this should hand the
    # mapping to the lexicon via set_stop_syn. Confirm intent.
    pass
def insert(self, word=''): def insert(self, word=''):
self.lexicon.set(word) self.lexicon.set(word)
......
<html>
<head>
<title>Edit <dtml-var title_or_id></title>
</head>
<body bgcolor="#ffffff" link="#000099" vlink="#555555" alink="#77003b">
<dtml-var manage_tabs>
<!-- Stop-word / synonym editing form: posts one entry per line to
     manage_stop_syn via Zope's ':lines' form-marshalling converter. -->
<form action="manage_stop_syn" method="POST">
<textarea name="stop_syn:lines" rows="20" cols="60">
</textarea>
<br>
<!-- Bug fix: the form had no submit control, so it could never be
     submitted from a browser. -->
<input type="submit" value="Save Changes">
</form>
<br>
</body>
</html>
...@@ -239,6 +239,14 @@ class GlobbingLexicon(Lexicon): ...@@ -239,6 +239,14 @@ class GlobbingLexicon(Lexicon):
return words return words
def Splitter(self, astring, words=None):
    """ wrap the splitter

    Note: 'words' (a stop-word/synonym mapping) is accepted for
    signature compatibility with Lexicon.Splitter but deliberately
    ignored here.
    """
    ## don't do anything, less efficient but there's not much
    ## sense in stemming a globbing lexicon.
    return Splitter(astring)
def translate(self, pat): def translate(self, pat):
"""Translate a PATTERN to a regular expression. """Translate a PATTERN to a regular expression.
......
...@@ -113,11 +113,27 @@ class Lexicon(Persistent, Implicit): ...@@ -113,11 +113,27 @@ class Lexicon(Persistent, Implicit):
""" """
counter = 0
def __init__(self): def __init__(self, stop_syn=None):
self._lexicon = OIBTree() self._lexicon = OIBTree()
self.counter = 0 self.counter = 0
if stop_syn is None:
self.stop_syn = {}
else:
self.stop_syn = {}
def set_stop_syn(self, stop_syn):
    """ pass in a mapping of stop words and synonyms.  Format is:

    {'word' : [syn1, syn2, ..., synx]}

    Vocabularies do not necessarily need to implement this if their
    splitters do not support stemming or stopping.
    """
    # Bug fix: the first parameter was misspelled 'selfb', so the
    # body's reference to 'self' raised NameError on every call.
    self.stop_syn = stop_syn
def set(self, word): def set(self, word):
""" return the word id of 'word' """ """ return the word id of 'word' """
...@@ -142,8 +158,11 @@ class Lexicon(Persistent, Implicit): ...@@ -142,8 +158,11 @@ class Lexicon(Persistent, Implicit):
def __len__(self): def __len__(self):
return len(self._lexicon) return len(self._lexicon)
def Splitter(self, astring, words=None):
    """ wrap the splitter

    astring -- the text to split
    words   -- optional stop-word/synonym mapping; defaults to this
    lexicon's own stop_syn mapping.
    """
    # Bug fix: the original assigned to 'word' (typo), so the default
    # stop_syn mapping was never actually passed to the splitter and
    # stop words were not stripped.
    if words is None:
        words = self.stop_syn
    return Splitter(astring, words)
def grep(self, query): def grep(self, query):
......
...@@ -92,7 +92,7 @@ is no longer known. ...@@ -92,7 +92,7 @@ is no longer known.
""" """
__version__='$Revision: 1.21 $'[11:-2] __version__='$Revision: 1.22 $'[11:-2]
from Globals import Persistent from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree import BTree, IIBTree, IOBTree, OIBTree
...@@ -164,7 +164,6 @@ class UnTextIndex(Persistent, Implicit): ...@@ -164,7 +164,6 @@ class UnTextIndex(Persistent, Implicit):
self.call_methods=call_methods self.call_methods=call_methods
self._index=IOBTree() self._index=IOBTree()
self._unindex=IOBTree() self._unindex=IOBTree()
self._syn=stop_word_dict
else: else:
pass pass
...@@ -177,6 +176,11 @@ class UnTextIndex(Persistent, Implicit): ...@@ -177,6 +176,11 @@ class UnTextIndex(Persistent, Implicit):
self._lexicon = lexicon self._lexicon = lexicon
def __setstate__(self, state):
    """Restore pickled state and drop the obsolete '_syn' attribute
    left over from instances created before stop words moved into the
    Lexicon.

    Bug fix: the method was named '__setstate' (missing trailing
    underscores, and subject to class-private name mangling), so the
    persistence machinery never invoked it and stale '_syn'
    attributes were never cleaned up.
    """
    Persistent.__setstate__(self, state)
    if hasattr(self, '_syn'):
        del self._syn
def getLexicon(self, vocab_id): def getLexicon(self, vocab_id):
""" bit of a hack, indexes have been made acquirers so that """ bit of a hack, indexes have been made acquirers so that
...@@ -194,10 +198,10 @@ class UnTextIndex(Persistent, Implicit): ...@@ -194,10 +198,10 @@ class UnTextIndex(Persistent, Implicit):
def __len__(self): def __len__(self):
return len(self._unindex) return len(self._unindex)
def __setstate__(self, state): ## def __setstate__(self, state):
Persistent.__setstate__(self, state) ## Persistent.__setstate__(self, state)
if not hasattr(self, '_lexicon'): ## if not hasattr(self, '_lexicon'):
self._lexicon = Lexicon() ## self._lexicon = Lexicon()
def clear(self): def clear(self):
...@@ -240,7 +244,11 @@ class UnTextIndex(Persistent, Implicit): ...@@ -240,7 +244,11 @@ class UnTextIndex(Persistent, Implicit):
## The Splitter should now be european compliant at least. ## The Splitter should now be european compliant at least.
## Someone should test this. ## Someone should test this.
src = self.getLexicon(self._lexicon).Splitter(k, self._syn)
## import pdb
## pdb.set_trace()
src = self.getLexicon(self._lexicon).Splitter(k)
## This returns a tuple of stemmed words. Stopwords have been ## This returns a tuple of stemmed words. Stopwords have been
## stripped. ## stripped.
...@@ -324,7 +332,7 @@ class UnTextIndex(Persistent, Implicit): ...@@ -324,7 +332,7 @@ class UnTextIndex(Persistent, Implicit):
def __getitem__(self, word): def __getitem__(self, word):
"""Return an InvertedIndex-style result "list" """Return an InvertedIndex-style result "list"
""" """
src = tuple(self.getLexicon(self._lexicon).Splitter(word, self._syn)) src = tuple(self.getLexicon(self._lexicon).Splitter(word))
if not src: return ResultList({}, (word,), self) if not src: return ResultList({}, (word,), self)
if len(src) == 1: if len(src) == 1:
src=src[0] src=src[0]
...@@ -412,13 +420,13 @@ class UnTextIndex(Persistent, Implicit): ...@@ -412,13 +420,13 @@ class UnTextIndex(Persistent, Implicit):
r = [] r = []
for word in words: for word in words:
r = r+self.getLexicon(self._lexicon).Splitter(doc, self._syn).indexes(word) r = r+self.getLexicon(self._lexicon).Splitter(doc).indexes(word)
return r return r
def _subindex(self, isrc, d, old, last): def _subindex(self, isrc, d, old, last):
src = self.getLexicon(self._lexicon).Splitter(isrc, self._syn) src = self.getLexicon(self._lexicon).Splitter(isrc)
for s in src: for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last) if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment