Commit 1e40bbb9 authored by Michel Pelletier

Added ability to index an object

parent 89721d84
...@@ -84,7 +84,7 @@ ...@@ -84,7 +84,7 @@
############################################################################## ##############################################################################
"""Simple column indices""" """Simple column indices"""
__version__='$Revision: 1.22 $'[11:-2] __version__='$Revision: 1.23 $'[11:-2]
from Globals import Persistent from Globals import Persistent
from BTree import BTree from BTree import BTree
...@@ -96,6 +96,7 @@ import string ...@@ -96,6 +96,7 @@ import string
ListType=type([]) ListType=type([])
StringType=type('s') StringType=type('s')
def nonEmpty(s): def nonEmpty(s):
"returns true if a non-empty string or any other (nonstring) type" "returns true if a non-empty string or any other (nonstring) type"
if type(s) is StringType: if type(s) is StringType:
...@@ -104,29 +105,37 @@ def nonEmpty(s): ...@@ -104,29 +105,37 @@ def nonEmpty(s):
else: else:
return 1 return 1
class Index(Persistent): class Index(Persistent):
"""Index object interface""" """Index object interface"""
def __init__(self,data=None,schema=None,id=None): def __init__(self, data=None, schema=None, id=None,
ignore_ex=None, call_methods=None):
"""Create an index """Create an index
The arguments are: The arguments are:
'data' -- a mapping from integer object ids to objects or records, 'data' -- a mapping from integer object ids to objects or
records,
'schema' -- a mapping from item name to index into data
records. If 'data' is a mapping to objects, then schema
should ne 'None'.
'schema' -- a mapping from item name to index into data records. 'id' -- the name of the item attribute to index. This is
If 'data' is a mapping to objects, then schema should ne 'None'. either an attribute name or a record key.
'id' -- the name of the item attribute to index. This is either
an attribute name or a record key.
""" """
###################################################################### ######################################################################
# For b/w compatability, have to allow __init__ calls with zero args # For b/w compatability, have to allow __init__ calls with zero args
if not data==schema==id==None:
self._data=data if not data==schema==id==ignore_ex==call_methods==None:
self._schema=schema self._data = data
self.id=id self._schema = schema
self._index=BTree() self.id = id
self.ignore_ex=ignore_ex
self.call_methods=call_methods
self._index = BTree()
self._reindex() self._reindex()
else: else:
...@@ -135,6 +144,7 @@ class Index(Persistent): ...@@ -135,6 +144,7 @@ class Index(Persistent):
# for b/w compatability # for b/w compatability
_init = __init__ _init = __init__
def dpHasUniqueValuesFor(self, name): def dpHasUniqueValuesFor(self, name):
' has unique values for column NAME ' ' has unique values for column NAME '
if name == self.id: if name == self.id:
...@@ -142,6 +152,7 @@ class Index(Persistent): ...@@ -142,6 +152,7 @@ class Index(Persistent):
else: else:
return 0 return 0
def dpUniqueValues(self, name=None, withLengths=0): def dpUniqueValues(self, name=None, withLengths=0):
"""\ """\
returns the unique values for name returns the unique values for name
...@@ -163,10 +174,12 @@ class Index(Persistent): ...@@ -163,10 +174,12 @@ class Index(Persistent):
else: rl.append((i, len(self._index[i]))) else: rl.append((i, len(self._index[i])))
return tuple(rl) return tuple(rl)
def clear(self):
    """Forget every indexed value by installing a new, empty BTree."""
    empty = BTree()
    self._index = empty
def _reindex(self,start=0): def _reindex(self, start=0):
"""Recompute index data for data with ids >= start.""" """Recompute index data for data with ids >= start."""
index=self._index index=self._index
...@@ -174,12 +187,12 @@ class Index(Persistent): ...@@ -174,12 +187,12 @@ class Index(Persistent):
if not start: index.clear() if not start: index.clear()
id=self.id id = self.id
if self._schema is None: if self._schema is None:
f=getattr f=getattr
else: else:
f=operator.__getitem__ f = operator.__getitem__
id=self._schema[id] id = self._schema[id]
for i,row in self._data.items(start): for i,row in self._data.items(start):
k=f(row,id) k=f(row,id)
...@@ -187,59 +200,73 @@ class Index(Persistent): ...@@ -187,59 +200,73 @@ class Index(Persistent):
if k is None or k == MV: continue if k is None or k == MV: continue
set=get(k) set=get(k)
if set is None: index[k]=set=intSet() if set is None: index[k] = set = intSet()
set.insert(i) set.insert(i)
def index_item(self, i, obj=None):
    """Add one item to the index.

    The arguments are:

      'i' -- the integer id under which the item is recorded.

      'obj' -- optional object or record to read the indexed value
         from.  When it is None the item is fetched as 'self._data[i]'.

    Without a schema the value is read with getattr using 'self.id';
    with a schema it is read by position, using the column number the
    schema maps 'self.id' to.  When 'call_methods' is true the value
    found is called and its result is indexed instead.

    Items whose value is None or equal to MV are not indexed.
    """
    index = self._index
    key = self.id
    if self._schema is None:
        fetch = getattr
    else:
        fetch = operator.__getitem__
        key = self._schema[key]

    if obj is None:
        obj = self._data[i]

    k = fetch(obj, key)
    # 'call_methods' is a later addition to this class; an instance
    # whose stored state predates it has no such attribute, so a
    # missing attribute is treated as "do not call".
    if getattr(self, 'call_methods', None):
        k = k()

    if k is None or k == MV: return

    ids = index.get(k)
    if ids is None: index[k] = ids = intSet()
    ids.insert(i)
def unindex_item(self, i, obj=None):
    """Remove one item from the index.

    The arguments are:

      'i' -- the integer id to remove.

      'obj' -- optional object or record to read the indexed value
         from.  When it is None the item is fetched as 'self._data[i]'.

    The value is read exactly as index_item reads it (getattr without a
    schema, positional lookup with one, called when 'call_methods' is
    true).  'i' is then removed from the set stored under that value;
    if no set is stored under it, nothing happens.
    """
    index = self._index
    key = self.id
    if self._schema is None:
        fetch = getattr
    else:
        fetch = operator.__getitem__
        key = self._schema[key]

    if obj is None:
        obj = self._data[i]

    k = fetch(obj, key)
    # 'call_methods' is a later addition to this class; an instance
    # whose stored state predates it has no such attribute, so a
    # missing attribute is treated as "do not call".
    if getattr(self, 'call_methods', None):
        k = k()

    ids = index.get(k)
    if ids is not None: ids.remove(i)
def _apply_index(self, request, cid=''): def _apply_index(self, request, cid=''):
"""Apply the index to query parameters given in the argument, request """Apply the index to query parameters given in the argument,
request
The argument should be a mapping object. The argument should be a mapping object.
If the request does not contain the needed parameters, then None is If the request does not contain the needed parameters, then
returned. None is returned.
If the request contains a parameter with the name of the column If the request contains a parameter with the name of the
+ '_usage', it is sniffed for information on how to handle applying column + '_usage', it is sniffed for information on how to
the index. handle applying the index.
Otherwise two objects are returned. The first object is a Otherwise two objects are returned. The first object is a
ResultSet containing the record numbers of the matching ResultSet containing the record numbers of the matching
...@@ -247,19 +274,19 @@ class Index(Persistent): ...@@ -247,19 +274,19 @@ class Index(Persistent):
all data fields used. all data fields used.
""" """
id=self.id #name of the column id = self.id #name of the column
cidid="%s/%s" % (cid,id) cidid = "%s/%s" % (cid,id)
has_key=request.has_key has_key = request.has_key
if has_key(cidid): keys=request[cidid] if has_key(cidid): keys = request[cidid]
elif has_key(id): keys=request[id] elif has_key(id): keys = request[id]
else: return None else: return None
if type(keys) is not ListType: keys=[keys] if type(keys) is not ListType: keys=[keys]
index=self._index index = self._index
r=None r = None
anyTrue=0 anyTrue = 0
opr=None opr = None
if request.has_key(id+'_usage'): if request.has_key(id+'_usage'):
# see if any usage params are sent to field # see if any usage params are sent to field
...@@ -267,26 +294,26 @@ class Index(Persistent): ...@@ -267,26 +294,26 @@ class Index(Persistent):
opr, opr_args=opr[0], opr[1:] opr, opr_args=opr[0], opr[1:]
if opr=="range": if opr=="range":
if 'min' in opr_args: lo=min(keys) if 'min' in opr_args: lo = min(keys)
else: lo=None else: lo = None
if 'max' in opr_args: hi=max(keys) if 'max' in opr_args: hi = max(keys)
else: hi=None else: hi = None
anyTrue=1 anyTrue=1
try: try:
if hi: setlist=index.items(lo,hi) if hi: setlist = index.items(lo,hi)
else: setlist=index.items(lo) else: setlist = index.items(lo)
for k,set in setlist: for k,set in setlist:
if r is None: r=set if r is None: r = set
else: r=r.union(set) else: r = r.union(set)
except KeyError: pass except KeyError: pass
else: #not a range else: #not a range
get=index.get get = index.get
for key in keys: for key in keys:
if key: anyTrue=1 if key: anyTrue = 1
set=get(key) set=get(key)
if set is not None: if set is not None:
if r is None: r=set if r is None: r = set
else: r = r.union(set) else: r = r.union(set)
if r is None: if r is None:
...@@ -294,3 +321,19 @@ class Index(Persistent): ...@@ -294,3 +321,19 @@ class Index(Persistent):
else: return None else: return None
return r, (id,) return r, (id,)
...@@ -202,7 +202,7 @@ Notes on a new text index design ...@@ -202,7 +202,7 @@ Notes on a new text index design
space. space.
""" """
__version__='$Revision: 1.18 $'[11:-2] __version__='$Revision: 1.19 $'[11:-2]
from Globals import Persistent from Globals import Persistent
import BTree, IIBTree import BTree, IIBTree
...@@ -210,32 +210,43 @@ BTree=BTree.BTree ...@@ -210,32 +210,43 @@ BTree=BTree.BTree
IIBTree=IIBTree.Bucket IIBTree=IIBTree.Bucket
from intSet import intSet from intSet import intSet
import operator import operator
getitem=operator.__getitem__
from Splitter import Splitter from Splitter import Splitter
from string import strip from string import strip
import string, regex, regsub import string, regex, regsub
class TextIndex(Persistent): class TextIndex(Persistent):
def __init__(self,data=None,schema=None,id=None): def __init__(self, data=None, schema=None, id=None,
ignore_ex=None, call_methods=None):
"""Create an index """Create an index
The arguments are: The arguments are:
'data' -- a mapping from integer object ids to objects or records, 'data' -- a mapping from integer object ids to objects or
records,
'schema' -- a mapping from item name to index into data records. 'schema' -- a mapping from item name to index into data
If 'data' is a mapping to objects, then schema should ne 'None'. records. If 'data' is a mapping to objects, then schema
should ne 'None'.
'id' -- the name of the item attribute to index. This is
either an attribute name or a record key.
'ignore_ex' -- Tells the indexer to ignore exceptions that
are rasied when indexing an object.
'call_methods' -- Tells the indexer to call methods instead
of getattr or getitem to get an attribute.
'id' -- the name of the item attribute to index. This is either
an attribute name or a record key.
""" """
###################################################################### ######################################################################
# For b/w compatability, have to allow __init__ calls with zero args # For b/w compatability, have to allow __init__ calls with zero args
if not data==schema==id==None: if not data==schema==id==ignore_ex==call_methods==None:
self._data=data self._data=data
self._schema=schema self._schema=schema
self.id=id self.id=id
self.ignore_ex=ignore_ex
self.call_methods=call_methods
self._index=BTree() self._index=BTree()
self._syn=stop_word_dict self._syn=stop_word_dict
self._reindex() self._reindex()
...@@ -245,47 +256,67 @@ class TextIndex(Persistent): ...@@ -245,47 +256,67 @@ class TextIndex(Persistent):
# for backwards compatability # for backwards compatability
_init = __init__ _init = __init__
def clear(self):
    """Drop all indexed words by rebinding the index to an empty BTree."""
    fresh = BTree()
    self._index = fresh
def positions(self, docid, words):
    """Return the positions of the given words in one document.

    The arguments are:

      'docid' -- key of the document in 'self._data'.

      'words' -- sequence of words to look for.

    The document text is read from 'self._data[docid]' (getattr without
    a schema, positional lookup with one, called when 'call_methods' is
    true) and converted with str.  For each word the result of
    Splitter(text, self._syn).indexes(word) is appended to the list that
    is returned.
    """
    key = self.id
    if self._schema is None:
        fetch = getattr
    else:
        fetch = operator.__getitem__
        key = self._schema[key]

    row = self._data[docid]

    value = fetch(row, key)
    # 'call_methods' is a later addition to this class; an instance
    # whose stored state predates it has no such attribute, so a
    # missing attribute is treated as "do not call".
    if getattr(self, 'call_methods', None):
        value = value()
    doc = str(value)

    r = []
    for word in words:
        r = r + Splitter(doc, self._syn).indexes(word)
    return r
def index_item(self, i, obj=None, un=0):
    """Index (or unindex) the text of one item.

    The arguments are:

      'i' -- the integer id of the item.

      'obj' -- optional object or record to read the text from.  When
         it is None the item is fetched as 'self._data[i]'.

      'un' -- passed through to _index_document; unindex_item passes 1
         here to have the item's words removed rather than added.

    The text is read with getattr when there is no schema, otherwise by
    position using the schema's column number; when 'call_methods' is
    true the value found is called.  The result is converted with str
    and handed to _index_document.
    """
    key = self.id
    if self._schema is None:
        fetch = getattr
    else:
        fetch = operator.__getitem__
        key = self._schema[key]

    if obj is None:
        obj = self._data[i]

    value = fetch(obj, key)
    # 'call_methods' is a later addition to this class; an instance
    # whose stored state predates it has no such attribute, so a
    # missing attribute is treated as "do not call".
    if getattr(self, 'call_methods', None):
        value = value()

    self._index_document(str(value), i, un)
def unindex_item(self, i, obj=None):
    """Unindex item 'i' by delegating to index_item with the 'un' flag set.

    'obj', when given, is forwarded unchanged to index_item.
    """
    removing = 1
    return self.index_item(i, obj, removing)
def _reindex(self, start=0):
    """Call index_item for every key returned by self._data.keys(start)."""
    pending = self._data.keys(start)
    for item_id in pending:
        self.index_item(item_id)
def _index_document(self, document_text, id, un=0, def _index_document(self, document_text, id, un=0,
tupleType=type(()), tupleType=type(()),
dictType=type({}), dictType=type({}),
...@@ -293,21 +324,21 @@ class TextIndex(Persistent): ...@@ -293,21 +324,21 @@ class TextIndex(Persistent):
src = Splitter(document_text, self._syn) src = Splitter(document_text, self._syn)
d = {} d = {}
old=d.has_key old = d.has_key
last=None last = None
for s in src: for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last) if s[0] == '\"': last=self.subindex(s[1:-1], d, old, last)
else: else:
if old(s): if old(s):
if s != last: d[s]=d[s]+1 if s != last: d[s] = d[s]+1
else: d[s]=1 else: d[s] = 1
index=self._index index = self._index
get=index.get get = index.get
if un: if un:
for word,score in d.items(): for word,score in d.items():
r=get(word) r = get(word)
if r is not None: if r is not None:
if type(r) is tupleType: del index[word] if type(r) is tupleType: del index[word]
else: else:
...@@ -315,27 +346,28 @@ class TextIndex(Persistent): ...@@ -315,27 +346,28 @@ class TextIndex(Persistent):
if type(r) is dictType: if type(r) is dictType:
if len(r) < 2: if len(r) < 2:
if r: if r:
for k, v in r.items(): index[word]=k,v for k, v in r.items(): index[word] = k,v
else: del index[word] else: del index[word]
else: index[word]=r else: index[word] = r
else: else:
for word,score in d.items(): for word,score in d.items():
r=get(word) r = get(word)
if r is not None: if r is not None:
r=index[word] r = index[word]
if type(r) is tupleType: if type(r) is tupleType:
r={r[0]:r[1]} r = {r[0]:r[1]}
r[id]=score r[id] = score
index[word]=r index[word] = r
elif type(r) is dictType: elif type(r) is dictType:
if len(r) > 4: if len(r) > 4:
b=IIBTree() b = IIBTree()
for k, v in r.items(): b[k]=v for k, v in r.items(): b[k] = v
r=b r = b
r[id]=score r[id] = score
index[word]=r index[word] = r
else: r[id]=score else: r[id] = score
else: index[word]=id,score else: index[word] = id, score
def _subindex(self, isrc, d, old, last): def _subindex(self, isrc, d, old, last):
...@@ -345,38 +377,41 @@ class TextIndex(Persistent): ...@@ -345,38 +377,41 @@ class TextIndex(Persistent):
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last) if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
else: else:
if old(s): if old(s):
if s != last: d[s]=d[s]+1 if s != last: d[s] = d[s]+1
else: d[s]=1 else: d[s] = 1
return last return last
def __getitem__(self, word):
    """Return an InvertedIndex-style result "list" for 'word'.

    'word' is run through the Splitter first:

      - if nothing is left, an empty ResultList is returned;

      - if a single token is left, the entry stored for it in the
        index is wrapped in a ResultList (an empty one when there is
        no entry);

      - if several tokens are left, each one is looked up in turn and
        the results are combined left to right with ResultList.near.
    """
    src = tuple(Splitter(word, self._syn))
    if not src: return ResultList({}, (word,), self)

    if len(src) == 1:
        src = src[0]
        if src[:1] == '"' and src[-1:] == '"': return self[src]
        # _index_document keys the index by the tokens the Splitter
        # yields, so the lookup must use the token rather than the raw
        # input for the two to agree.
        r = self._index.get(src, None)
        if r is None: r = {}
        return ResultList(r, (word,), self)

    r = None
    for token in src:
        rr = self[token]
        if r is None: r = rr
        else: r = r.near(rr)

    return r
def _apply_index(self, request, cid='', ListType=[]): def _apply_index(self, request, cid='', ListType=[]):
"""Apply the index to query parameters given in the argument, request """ Apply the index to query parameters given in the argument,
request
The argument should be a mapping object. The argument should be a mapping object.
If the request does not contain the needed parameters, then None is If the request does not contain the needed parameters, then
returned. None is returned.
Otherwise two objects are returned. The first object is a Otherwise two objects are returned. The first object is a
ResultSet containing the record numbers of the matching ResultSet containing the record numbers of the matching
...@@ -384,30 +419,30 @@ class TextIndex(Persistent): ...@@ -384,30 +419,30 @@ class TextIndex(Persistent):
all data fields used. all data fields used.
""" """
id=self.id id = self.id
cidid="%s/%s" % (cid,id) cidid = "%s/%s" % (cid, id)
has_key=request.has_key has_key = request.has_key
if has_key(cidid): keys=request[cidid] if has_key(cidid): keys = request[cidid]
elif has_key(id): keys=request[id] elif has_key(id): keys =request[id]
else: return None else: return None
if type(keys) is type(''): if type(keys) is type(''):
if not keys or not strip(keys): return None if not keys or not strip(keys): return None
keys=[keys] keys = [keys]
r=None r = None
for key in keys: for key in keys:
key=strip(key) key = strip(key)
if not key: continue if not key: continue
rr=intSet() rr = intSet()
try: try:
for i,score in query(key,self).items(): for i,score in query(key,self).items():
if score: rr.insert(i) if score: rr.insert(i)
except KeyError: pass except KeyError: pass
if r is None: r=rr if r is None: r = rr
else: else:
# Note that we *and*/*narrow* multiple search terms. # Note that we *and*/*narrow* multiple search terms.
r=r.intersection(rr) r = r.intersection(rr)
if r is not None: return r, (id,) if r is not None: return r, (id,)
return intSet(), (id,) return intSet(), (id,)
...@@ -415,74 +450,78 @@ class TextIndex(Persistent): ...@@ -415,74 +450,78 @@ class TextIndex(Persistent):
class ResultList:
    """Scored result of a text-index lookup.

    Holds a mapping from document id to score ('_dict'), the words the
    result was produced for ('_words') and the index it came from
    ('_index').  Results can be combined with '&', '|', and_not() and
    near(); each combination returns a new instance of the same class
    and leaves its operands untouched.
    """

    def __init__(self, d, words, index, TupleType=type(())):
        """Wrap 'd', which is either a mapping from id to score or a
        single (id, score) tuple; a tuple is expanded into a one-entry
        dictionary."""
        self._index = index
        self._words = words

        if type(d) is TupleType: self._dict = {d[0]: d[1]}
        else: self._dict = d

    def __len__(self): return len(self._dict)

    def __getitem__(self, key): return self._dict[key]

    def keys(self): return self._dict.keys()

    def has_key(self, key): return self._dict.has_key(key)

    def items(self): return self._dict.items()

    def __and__(self, x):
        """Keep the ids present in both results; scores are summed."""
        result = {}
        xdict = x._dict
        xhas = xdict.has_key
        for docid, score in self._dict.items():
            if xhas(docid): result[docid] = xdict[docid] + score

        return self.__class__(result, self._words + x._words, self._index)

    def and_not(self, x):
        """Keep the ids present here but absent from 'x'.

        Each surviving id keeps its own score: 'x' has no score for it,
        so there is nothing to add.
        """
        result = {}
        xhas = x._dict.has_key
        for docid, score in self._dict.items():
            if not xhas(docid): result[docid] = score

        return self.__class__(result, self._words, self._index)

    def __or__(self, x):
        """Keep the ids present in either result; an id present in both
        gets the sum of its two scores."""
        result = {}
        mine = self._dict
        has = mine.has_key
        xdict = x._dict
        xhas = xdict.has_key
        for docid, score in mine.items():
            if xhas(docid): result[docid] = xdict[docid] + score
            else: result[docid] = score

        for docid, score in xdict.items():
            if not has(docid): result[docid] = score

        return self.__class__(result, self._words + x._words, self._index)

    def near(self, x):
        """Keep the ids present in both results, scored by proximity.

        For each shared id the word positions of both operands are
        fetched from the index, and the smallest gap 'd' between two
        neighbouring positions that differ and come from different
        operands is found.  The score is (score + x's score) / d; if no
        such gap exists the smaller of the two scores is used.
        """
        result = {}
        xdict = x._dict
        xhas = xdict.has_key
        positions = self._index.positions
        for docid, score in self._dict.items():
            if not xhas(docid): continue

            # Tag every position with the operand it belongs to
            # (0 = self, 1 = x) so neighbours can be told apart.
            p = []
            for pos in positions(docid, self._words): p.append((pos, 0))
            for pos in positions(docid, x._words): p.append((pos, 1))
            p.sort()

            d = lp = 9999      # lp is the "no gap found" sentinel
            li = None
            lsrc = None
            for i, src in p:
                # Positions are compared by value: an identity test lets
                # two equal positions through and gives a gap of 0.
                if li is not None and i != li and src != lsrc:
                    d = min(d, i - li)
                li = i
                lsrc = src
            if d == lp: score = min(score, xdict[docid])  # synonyms
            else: score = (score + xdict[docid]) / d
            result[docid] = score

        return self.__class__(result, self._words + x._words, self._index)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment