Commit 63651bb3 authored by Michel Pelletier's avatar Michel Pelletier

partial searching

parent d03616e0
...@@ -99,17 +99,46 @@ from DocumentTemplate.DT_Util import Eval, expr_globals ...@@ -99,17 +99,46 @@ from DocumentTemplate.DT_Util import Eval, expr_globals
from AccessControl.Permission import name_trans from AccessControl.Permission import name_trans
from Catalog import Catalog, orify from Catalog import Catalog, orify
from SearchIndex import UnIndex, UnTextIndex from SearchIndex import UnIndex, UnTextIndex
from Vocabulary import Vocabulary
import IOBTree import IOBTree
manage_addZCatalogForm=HTMLFile('addZCatalog',globals()) manage_addZCatalogForm=HTMLFile('addZCatalog',globals())
def manage_addZCatalog(self,id,title,REQUEST=None): def manage_addZCatalog(self, id, title, vocab='', vocab_id='', REQUEST=None):
"""Add a ZCatalog object """Add a ZCatalog object
""" """
c=ZCatalog(id,title) c=ZCatalog(id, title, vocab, vocab_id, self)
self._setObject(id,c) self._setObject(id, c)
if REQUEST is not None: if REQUEST is not None:
return self.manage_main(self,REQUEST) return self.manage_main(self, REQUEST)
def VocabularyIDs(self):
    """Return the ids of the vocabularies that can be acquired from 'self'.

    Starting at 'self' and following 'aq_parent' upwards, every
    container that offers 'objectValues()' is scanned.  An object is
    treated as a vocabulary when it has a true '_isAVocabulary'
    attribute and an 'id' (either a string or a callable returning one).

    Returns a sorted list of (id, id) tuples, one per distinct id.
    Both members of each tuple are the id: the original code computed a
    'title_and_id()' label as well but never stored it, so that dead
    computation has been dropped without changing the result.
    """
    StringType = type('')
    found = {}
    container = self
    while container is not None:
        if hasattr(container, 'objectValues'):
            for ob in container.objectValues():
                if (hasattr(ob, '_isAVocabulary') and ob._isAVocabulary
                        and hasattr(ob, 'id')):
                    ob_id = ob.id
                    # 'id' may be a method instead of a plain string
                    if type(ob_id) is not StringType:
                        ob_id = ob_id()
                    # Storing the same value twice is harmless, so no
                    # "already seen" test is needed.
                    found[ob_id] = ob_id
        # climb to the acquisition parent, if there is one
        if hasattr(container, 'aq_parent'):
            container = container.aq_parent
        else:
            container = None

    pairs = []
    for ob_id in found.keys():
        pairs.append((ob_id, ob_id))
    pairs.sort()
    return pairs
class ZCatalog(Folder, Persistent, Implicit): class ZCatalog(Folder, Persistent, Implicit):
...@@ -191,13 +220,22 @@ class ZCatalog(Folder, Persistent, Implicit): ...@@ -191,13 +220,22 @@ class ZCatalog(Folder, Persistent, Implicit):
threshold=10000 threshold=10000
_v_total=0 _v_total=0
def __init__(self, id, title='', vocab=0, vocab_id='', container=None):
def __init__(self,id,title=''):
self.id=id self.id=id
self.title=title self.title=title
self.vocab_id = vocab_id
self.threshold = 10000 self.threshold = 10000
self._v_total = 0 self._v_total = 0
self._catalog = Catalog()
if not vocab:
v = Vocabulary('Vocabulary', 'Vocabulary', globbing=1)
self._setObject('Vocabulary', v)
v = 'Vocabulary'
else:
v = vocab_id
self._catalog = Catalog(vocabulary=v)
self._catalog.addColumn('id') self._catalog.addColumn('id')
self._catalog.addIndex('id', 'FieldIndex') self._catalog.addIndex('id', 'FieldIndex')
...@@ -213,7 +251,12 @@ class ZCatalog(Folder, Persistent, Implicit): ...@@ -213,7 +251,12 @@ class ZCatalog(Folder, Persistent, Implicit):
self._catalog.addColumn('summary') self._catalog.addColumn('summary')
self._catalog.addIndex('PrincipiaSearchSource', 'TextIndex') self._catalog.addIndex('PrincipiaSearchSource', 'TextIndex')
def getVocabulary(self):
    """Return the vocabulary object used by this catalog.

    The object is looked up by name with getattr() on the catalog.
    When no 'vocab_id' was given, the constructor stores the catalog's
    own vocabulary under the name 'Vocabulary' but leaves 'vocab_id'
    empty, so an empty id falls back to that name instead of failing
    on getattr(self, '').
    """
    return getattr(self, self.vocab_id or 'Vocabulary')
def manage_edit(self, RESPONSE, URL1, threshold=1000, REQUEST=None): def manage_edit(self, RESPONSE, URL1, threshold=1000, REQUEST=None):
""" edit the catalog """ """ edit the catalog """
...@@ -359,7 +402,7 @@ class ZCatalog(Folder, Persistent, Implicit): ...@@ -359,7 +402,7 @@ class ZCatalog(Folder, Persistent, Implicit):
if self._v_total > self.threshold: if self._v_total > self.threshold:
# commit a subtransaction # commit a subtransaction
get_transaction().commit(1) get_transaction().commit(1)
# kick the chache # kick the chache, this may be overkill but ya never know
self._p_jar.cacheFullSweep(1) self._p_jar.cacheFullSweep(1)
self._v_total = 0 self._v_total = 0
...@@ -545,10 +588,7 @@ class ZCatalog(Folder, Persistent, Implicit): ...@@ -545,10 +588,7 @@ class ZCatalog(Folder, Persistent, Implicit):
) )
): ):
if apply_func: if apply_func:
if apply_path: apply_func(ob, (apply_path+'/'+p))
apply_func(ob, (apply_path+'/'+p))
else:
apply_func(ob, p)
else: else:
add_result((p, ob)) add_result((p, ob))
dflag=0 dflag=0
......
##############################################################################
#
# Zope Public License (ZPL) Version 1.0
# -------------------------------------
#
# Copyright (c) Digital Creations. All rights reserved.
#
# This license has been certified as Open Source(tm).
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions in source code must retain the above copyright
# notice, this list of conditions, and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions, and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# 3. Digital Creations requests that attribution be given to Zope
# in any manner possible. Zope includes a "Powered by Zope"
# button that is installed by default. While it is not a license
# violation to remove this button, it is requested that the
# attribution remain. A significant investment has been put
# into Zope, and this effort will continue if the Zope community
# continues to grow. This is one way to assure that growth.
#
# 4. All advertising materials and documentation mentioning
# features derived from or use of this software must display
# the following acknowledgement:
#
# "This product includes software developed by Digital Creations
# for use in the Z Object Publishing Environment
# (http://www.zope.org/)."
#
# In the event that the product being advertised includes an
# intact Zope distribution (with copyright and license included)
# then this clause is waived.
#
# 5. Names associated with Zope or Digital Creations must not be used to
# endorse or promote products derived from this software without
# prior written permission from Digital Creations.
#
# 6. Modified redistributions of any form whatsoever must retain
# the following acknowledgment:
#
# "This product includes software developed by Digital Creations
# for use in the Z Object Publishing Environment
# (http://www.zope.org/)."
#
# Intact (re-)distributions of any official Zope release do not
# require an external acknowledgement.
#
# 7. Modifications are encouraged but must be packaged separately as
# patches to official Zope releases. Distributions that do not
# clearly separate the patches from the original work must be clearly
# labeled as unofficial distributions. Modifications which do not
# carry the name Zope may be packaged in any form, as long as they
# conform to all of the clauses above.
#
#
# Disclaimer
#
# THIS SOFTWARE IS PROVIDED BY DIGITAL CREATIONS ``AS IS'' AND ANY
# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DIGITAL CREATIONS OR ITS
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
#
# This software consists of contributions made by Digital Creations and
# many individuals on behalf of Digital Creations. Specific
# attributions are listed in the accompanying credits file.
#
##############################################################################
import string, regex, ts_regex
import regsub
from Lexicon import Lexicon
__doc__=""" Lexicon object that supports wildcard (globbing) searches
"""
from Splitter import Splitter
from Persistence import Persistent
from Acquisition import Implicit
import OIBTree, BTree, IOBTree
from intSet import intSet
OIBTree=OIBTree.BTree
OOBTree=BTree.BTree
IOBTree=IOBTree.BTree
import re
class GlobbingLexicon(Lexicon):
    """Lexicon that supports wildcard ("globbing") word lookups.

    Besides the word -> id mapping ('_lexicon') and its inverse
    ('_inverseLex'), every word is broken into digrams (adjacent
    character pairs).  The first and last character of a word are also
    recorded together with the 'eow' marker.  '_digrams' maps each
    digram to an intSet holding the ids of the words that contain it.
    query() uses those sets to narrow the candidates before confirming
    each one with a regular expression.
    """

    multi_wc = '*'     # translated to '.*' by translate()
    single_wc = '?'    # translated to '.' by translate()
    eow = '$'          # marks the beginning and the end of a word

    def __init__(self):
        self.counter = 0
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _word_digrams(self, word):
        """Return the list of digrams under which 'word' is indexed."""
        if not word:
            # an empty word has no characters to pair up
            return []
        digrams = [self.eow + word[0]]              # mark the beginning
        for i in range(len(word) - 1):
            digrams.append(word[i:i+2])
        digrams.append(word[-1] + self.eow)         # mark the end
        return digrams

    def _pattern_digrams(self, pattern):
        """Return the digrams every word matching 'pattern' must contain.

        Only pairs of adjacent literal characters are used; a digram
        never spans or contains a wildcard.
        """
        wildcards = (self.multi_wc, self.single_wc)
        last = len(pattern) - 1
        digrams = []
        for i in range(len(pattern)):
            c = pattern[i]
            if c in wildcards:
                continue
            if i == 0:
                digrams.append(self.eow + c)
            if i == last:
                digrams.append(c + self.eow)
            elif pattern[i+1] not in wildcards:
                digrams.append(c + pattern[i+1])
        return digrams

    def set(self, word):
        """Return the word id of 'word', adding the word if it is new.

        The id returned for a new word is the id it was stored under,
        i.e. the same value a later call for that word returns.
        """
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        word = intern(word)
        wid = self.counter
        self._lexicon[word] = wid
        self._inverseLex[wid] = word

        _digrams = self._digrams
        for digram in self._word_digrams(word):
            ids = _digrams.get(digram)
            if ids is None:
                _digrams[digram] = ids = intSet()
            ids.insert(wid)
        # Kept from the original code; presumably re-assigning the
        # attribute flags this object as changed -- TODO confirm.
        self._digrams = _digrams

        self.counter = wid + 1
        return wid

    def query(self, pattern):
        """Query the lexicon for words matching a pattern.

        Returns a list of the ids of the matching words, or an empty
        tuple when the digram index rules out every word.  A pattern
        that yields no digrams (for instance one made of wildcards
        only) also returns an empty tuple.
        """
        result = None
        for digram in self._pattern_digrams(pattern):
            ids = self._digrams.get(digram)
            if ids is None:
                # a required digram occurs in no word: nothing can match
                return ()
            if result is None:
                result = ids
            else:
                # keep the narrowed set; the stored sets stay untouched
                result = result.intersection(ids)

        if result is None:
            return ()

        ## The candidates contain all of the digrams, but possibly in
        ## the wrong order or position, so each one is checked against
        ## the whole pattern.  match() anchors the start of the word
        ## and translate() anchors the end.
        expr = re.compile(self.translate(pattern))
        hits = []
        for wid in result:
            if expr.match(self._inverseLex[wid]):
                hits.append(wid)
        return hits

    def __getitem__(self, word):
        """Mapping-style access; same as query(word)."""
        return self.query(word)

    def translate(self, pat):
        """Translate a PATTERN to a regular expression.

        'multi_wc' becomes '.*', 'single_wc' becomes '.', and every
        other character is escaped.  The expression is terminated with
        '$'.  There is no way to quote meta-characters.
        """
        res = ''
        for c in pat:
            if c == self.multi_wc:
                res = res + '.*'
            elif c == self.single_wc:
                res = res + '.'
            else:
                res = res + re.escape(c)
        return res + "$"
...@@ -113,15 +113,8 @@ class Lexicon(Persistent, Implicit): ...@@ -113,15 +113,8 @@ class Lexicon(Persistent, Implicit):
""" """
def __init__(self, globbish=None): def __init__(self):
self._lexicon = OIBTree() self._lexicon = OIBTree()
if globbish:
self._ngrams = OOBTree()
self.counter = 0
def __getitem__(self, key):
""" overload mapping behavior """
return self._lexicon[key]
def set(self, word): def set(self, word):
""" return the word id of 'word' """ """ return the word id of 'word' """
...@@ -134,19 +127,23 @@ class Lexicon(Persistent, Implicit): ...@@ -134,19 +127,23 @@ class Lexicon(Persistent, Implicit):
self.counter = self.counter + 1 self.counter = self.counter + 1
return self.counter return self.counter
def get(self, key):
    """Return the value stored in the lexicon under 'key'.

    This is a plain subscript on '_lexicon'; there is no default for
    a missing key.
    """
    return self._lexicon[key]
def __len__(self): def __len__(self):
return len(self._lexicon) return len(self._lexicon)
def Splitter(self, astring, words): def Splitter(self, astring, words):
""" wrap the splitter """ """ wrap the splitter """
return Splitter(astring, words) return Splitter(astring, words)
def grep(self, query): def grep(self, query):
""" """
regular expression search through the lexicon regular expression search through the lexicon
he he. he he.
Do not use unless you know what you're doing!!!
""" """
expr = re.compile(query) expr = re.compile(query)
hits = [] hits = []
...@@ -155,6 +152,12 @@ class Lexicon(Persistent, Implicit): ...@@ -155,6 +152,12 @@ class Lexicon(Persistent, Implicit):
hits.append(x) hits.append(x)
return hits return hits
AndNot = 'andnot' AndNot = 'andnot'
And = 'and' And = 'and'
Or = 'or' Or = 'or'
...@@ -166,9 +169,27 @@ def query(s, index, default_operator = Or, ...@@ -166,9 +169,27 @@ def query(s, index, default_operator = Or,
# First replace any occurences of " and not " with " andnot " # First replace any occurences of " and not " with " andnot "
s = ts_regex.gsub('[%s]+and[%s]*not[%s]+' % (ws * 3), ' andnot ', s) s = ts_regex.gsub('[%s]+and[%s]*not[%s]+' % (ws * 3), ' andnot ', s)
q = parse(s) q = parse(s)
q = parse_wc(q, index)
q = parse2(q, default_operator) q = parse2(q, default_operator)
return evaluate(q, index) return evaluate(q, index)
def parse_wc(q, index):
    '''Expand wildcard terms of a parsed query into the matching words.

    Terms without a wildcard are copied through unchanged.  A term
    containing one of the lexicon's wildcard characters is replaced by
    every word the lexicon returns for it; an Or is put in front of
    each such word whenever the output is not empty at that point.
    '''
    lexicon = index.getLexicon(index._lexicon)
    expanded = []
    for term in q:
        if not ((lexicon.multi_wc in term) or (lexicon.single_wc in term)):
            expanded.append(term)
            continue
        for word_id in lexicon.query(term):
            if expanded:
                expanded.append(Or)
            expanded.append(lexicon._inverseLex[word_id])
    return expanded
def parse(s): def parse(s):
'''Parse parentheses and quotes''' '''Parse parentheses and quotes'''
l = [] l = []
......
...@@ -84,9 +84,10 @@ ...@@ -84,9 +84,10 @@
############################################################################## ##############################################################################
"""Simple column indices""" """Simple column indices"""
__version__='$Revision: 1.9 $'[11:-2] __version__='$Revision: 1.10 $'[11:-2]
from Globals import Persistent from Globals import Persistent
from Acquisition import Implicit
import BTree import BTree
import IOBTree import IOBTree
from intSet import intSet from intSet import intSet
...@@ -107,7 +108,7 @@ def nonEmpty(s): ...@@ -107,7 +108,7 @@ def nonEmpty(s):
return 1 return 1
class UnIndex(Persistent): class UnIndex(Persistent, Implicit):
"""UnIndex object interface""" """UnIndex object interface"""
def __init__(self, id=None, ignore_ex=None, call_methods=None): def __init__(self, id=None, ignore_ex=None, call_methods=None):
......
...@@ -92,10 +92,11 @@ is no longer known. ...@@ -92,10 +92,11 @@ is no longer known.
""" """
__version__='$Revision: 1.18 $'[11:-2] __version__='$Revision: 1.19 $'[11:-2]
from Globals import Persistent from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree import BTree, IIBTree, IOBTree, OIBTree
from Acquisition import Implicit
BTree=BTree.BTree BTree=BTree.BTree
IOBTree=IOBTree.BTree IOBTree=IOBTree.BTree
IIBucket=IIBTree.Bucket IIBucket=IIBTree.Bucket
...@@ -110,7 +111,7 @@ import string, regex, regsub, pdb ...@@ -110,7 +111,7 @@ import string, regex, regsub, pdb
from Lexicon import Lexicon, query, stop_word_dict from Lexicon import Lexicon, query, stop_word_dict
from ResultList import ResultList from ResultList import ResultList
class UnTextIndex(Persistent): class UnTextIndex(Persistent, Implicit):
def __init__(self, id=None, ignore_ex=None, def __init__(self, id=None, ignore_ex=None,
call_methods=None, lexicon=None): call_methods=None, lexicon=None):
...@@ -161,9 +162,20 @@ class UnTextIndex(Persistent): ...@@ -161,9 +162,20 @@ class UnTextIndex(Persistent):
if lexicon is None: if lexicon is None:
self._lexicon=Lexicon() self._lexicon=Lexicon()
else: else:
self._lexicon=lexicon self._lexicon = lexicon
def getLexicon(self, vocab_id):
    """Return the lexicon identified by 'vocab_id'.

    Bit of a hack: indexes have been made acquirers so that they can
    acquire a vocabulary object from the object system in Zope.  When
    'vocab_id' is a string it is looked up with getattr() on the index
    and the 'lexicon' attribute of the object found is returned.

    The constructor may also store a lexicon object, not a name, in
    '_lexicon' (it creates a Lexicon when none is given).  Such a
    non-string argument is returned as is, because getattr() with a
    non-string name would fail.
    """
    if type(vocab_id) is not type(''):
        return vocab_id
    vocab = getattr(self, vocab_id)
    return vocab.lexicon
def __len__(self): def __len__(self):
return len(self._unindex) return len(self._unindex)
...@@ -213,7 +225,7 @@ class UnTextIndex(Persistent): ...@@ -213,7 +225,7 @@ class UnTextIndex(Persistent):
## The Splitter should now be european compliant at least. ## The Splitter should now be european compliant at least.
## Someone should test this. ## Someone should test this.
src = self._lexicon.Splitter(k, self._syn) src = self.getLexicon(self._lexicon).Splitter(k, self._syn)
## This returns a tuple of stemmed words. Stopwords have been ## This returns a tuple of stemmed words. Stopwords have been
## stripped. ## stripped.
...@@ -226,7 +238,7 @@ class UnTextIndex(Persistent): ...@@ -226,7 +238,7 @@ class UnTextIndex(Persistent):
index = self._index index = self._index
unindex = self._unindex unindex = self._unindex
lexicon = self._lexicon lexicon = self.getLexicon(self._lexicon)
get = index.get get = index.get
unindex[i] = [] unindex[i] = []
times = 0 times = 0
...@@ -297,28 +309,20 @@ class UnTextIndex(Persistent): ...@@ -297,28 +309,20 @@ class UnTextIndex(Persistent):
def __getitem__(self, word): def __getitem__(self, word):
"""Return an InvertedIndex-style result "list" """Return an InvertedIndex-style result "list"
""" """
src = tuple(self._lexicon.Splitter(word, self._syn)) src = tuple(self.getLexicon(self._lexicon).Splitter(word, self._syn))
if not src: if not src: return ResultList({}, (word,), self)
return ResultList({}, (word,), self)
if len(src) == 1: if len(src) == 1:
src=src[0] src=src[0]
if src[:1]=='"' and src[-1:]=='"': if src[:1]=='"' and src[-1:]=='"': return self[src]
return self[src] r = self._index.get(self.getLexicon(self._lexicon)[word][0],None)
if r is None: r = {}
r = self._index.get(self._lexicon[word], None)
if r is None:
r = {}
return ResultList(r, (word,), self) return ResultList(r, (word,), self)
r = None r = None
for word in src: for word in src:
rr = self[word] rr = self[word]
if r is None: r = rr
if r is None: else: r = r.near(rr)
r = rr
else:
r = r.near(rr)
return r return r
...@@ -393,13 +397,13 @@ class UnTextIndex(Persistent): ...@@ -393,13 +397,13 @@ class UnTextIndex(Persistent):
r = [] r = []
for word in words: for word in words:
r = r+self._lexicon.Splitter(doc, self._syn).indexes(word) r = r+self.getLexicon(self._lexicon).Splitter(doc, self._syn).indexes(word)
return r return r
def _subindex(self, isrc, d, old, last): def _subindex(self, isrc, d, old, last):
src = self._lexicon.Splitter(isrc, self._syn) src = self.getLexicon.Splitter(isrc, self._syn)
for s in src: for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last) if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment