Commit 1e40bbb9 authored by Michel Pelletier

Added ability to index an object

parent 89721d84
......@@ -84,7 +84,7 @@
##############################################################################
"""Simple column indices"""
__version__='$Revision: 1.22 $'[11:-2]
__version__='$Revision: 1.23 $'[11:-2]
from Globals import Persistent
from BTree import BTree
......@@ -96,6 +96,7 @@ import string
ListType=type([])
StringType=type('s')
def nonEmpty(s):
"returns true if a non-empty string or any other (nonstring) type"
if type(s) is StringType:
......@@ -104,29 +105,37 @@ def nonEmpty(s):
else:
return 1
class Index(Persistent):
"""Index object interface"""
def __init__(self,data=None,schema=None,id=None):
def __init__(self, data=None, schema=None, id=None,
ignore_ex=None, call_methods=None):
"""Create an index
The arguments are:
'data' -- a mapping from integer object ids to objects or records,
'data' -- a mapping from integer object ids to objects or
records,
'schema' -- a mapping from item name to index into data
records. If 'data' is a mapping to objects, then schema
should be 'None'.
'schema' -- a mapping from item name to index into data records.
If 'data' is a mapping to objects, then schema should be 'None'.
'id' -- the name of the item attribute to index. This is
either an attribute name or a record key.
'id' -- the name of the item attribute to index. This is either
an attribute name or a record key.
"""
######################################################################
# For b/w compatibility, have to allow __init__ calls with zero args
if not data==schema==id==None:
self._data=data
self._schema=schema
self.id=id
self._index=BTree()
if not data==schema==id==ignore_ex==call_methods==None:
self._data = data
self._schema = schema
self.id = id
self.ignore_ex=ignore_ex
self.call_methods=call_methods
self._index = BTree()
self._reindex()
else:
......@@ -135,6 +144,7 @@ class Index(Persistent):
# for b/w compatibility
_init = __init__
def dpHasUniqueValuesFor(self, name):
' has unique values for column NAME '
if name == self.id:
......@@ -142,6 +152,7 @@ class Index(Persistent):
else:
return 0
def dpUniqueValues(self, name=None, withLengths=0):
"""\
returns the unique values for name
......@@ -163,10 +174,12 @@ class Index(Persistent):
else: rl.append((i, len(self._index[i])))
return tuple(rl)
def clear(self):
self._index=BTree()
self._index = BTree()
def _reindex(self,start=0):
def _reindex(self, start=0):
"""Recompute index data for data with ids >= start."""
index=self._index
......@@ -174,12 +187,12 @@ class Index(Persistent):
if not start: index.clear()
id=self.id
id = self.id
if self._schema is None:
f=getattr
else:
f=operator.__getitem__
id=self._schema[id]
f = operator.__getitem__
id = self._schema[id]
for i,row in self._data.items(start):
k=f(row,id)
......@@ -187,59 +200,73 @@ class Index(Persistent):
if k is None or k == MV: continue
set=get(k)
if set is None: index[k]=set=intSet()
if set is None: index[k] = set = intSet()
set.insert(i)
def index_item(self,i):
def index_item(self, i, obj=None):
"""Add the item with id 'i' to the index."""
index=self._index
index = self._index
id=self.id
id = self.id
if self._schema is None:
f=getattr
f = getattr
else:
f=operator.__getitem__
id=self._schema[id]
f = operator.__getitem__
id = self._schema[id]
row=self._data[i]
k=f(row,id)
if obj is None:
obj = self._data[i]
if self.call_methods:
k = f(obj, id)()
else:
k = f(obj, id)
if k is None or k == MV: return
set=index.get(k)
if set is None: index[k]=set=intSet()
set = index.get(k)
if set is None: index[k] = set = intSet()
set.insert(i)
def unindex_item(self,i):
def unindex_item(self, i, obj=None):
"""Remove the item with id 'i' from the index."""
index=self._index
index = self._index
id=self.id
id = self.id
if self._schema is None:
f=getattr
f = getattr
else:
f=operator.__getitem__
id=self._schema[id]
f = operator.__getitem__
id = self._schema[id]
row=self._data[i]
k=f(row,id)
if obj is None:
obj = self._data[i]
if self.call_methods:
k = f(obj, id)()
else:
k = f(obj, id)
set=index.get(k)
set = index.get(k)
if set is not None: set.remove(i)
def _apply_index(self, request, cid=''):
"""Apply the index to query parameters given in the argument, request
"""Apply the index to query parameters given in the argument,
request
The argument should be a mapping object.
If the request does not contain the needed parameters, then None is
returned.
If the request does not contain the needed parameters, then
None is returned.
If the request contains a parameter with the name of the column
+ '_usage', it is sniffed for information on how to handle applying
the index.
If the request contains a parameter with the name of the
column + '_usage', it is sniffed for information on how to
handle applying the index.
Otherwise two objects are returned. The first object is a
ResultSet containing the record numbers of the matching
......@@ -247,19 +274,19 @@ class Index(Persistent):
all data fields used.
"""
id=self.id #name of the column
id = self.id #name of the column
cidid="%s/%s" % (cid,id)
has_key=request.has_key
if has_key(cidid): keys=request[cidid]
elif has_key(id): keys=request[id]
cidid = "%s/%s" % (cid,id)
has_key = request.has_key
if has_key(cidid): keys = request[cidid]
elif has_key(id): keys = request[id]
else: return None
if type(keys) is not ListType: keys=[keys]
index=self._index
r=None
anyTrue=0
opr=None
index = self._index
r = None
anyTrue = 0
opr = None
if request.has_key(id+'_usage'):
# see if any usage params are sent to field
......@@ -267,26 +294,26 @@ class Index(Persistent):
opr, opr_args=opr[0], opr[1:]
if opr=="range":
if 'min' in opr_args: lo=min(keys)
else: lo=None
if 'max' in opr_args: hi=max(keys)
else: hi=None
if 'min' in opr_args: lo = min(keys)
else: lo = None
if 'max' in opr_args: hi = max(keys)
else: hi = None
anyTrue=1
try:
if hi: setlist=index.items(lo,hi)
else: setlist=index.items(lo)
if hi: setlist = index.items(lo,hi)
else: setlist = index.items(lo)
for k,set in setlist:
if r is None: r=set
else: r=r.union(set)
if r is None: r = set
else: r = r.union(set)
except KeyError: pass
else: #not a range
get=index.get
get = index.get
for key in keys:
if key: anyTrue=1
if key: anyTrue = 1
set=get(key)
if set is not None:
if r is None: r=set
if r is None: r = set
else: r = r.union(set)
if r is None:
......@@ -294,3 +321,19 @@ class Index(Persistent):
else: return None
return r, (id,)
......@@ -202,7 +202,7 @@ Notes on a new text index design
space.
"""
__version__='$Revision: 1.18 $'[11:-2]
__version__='$Revision: 1.19 $'[11:-2]
from Globals import Persistent
import BTree, IIBTree
......@@ -210,32 +210,43 @@ BTree=BTree.BTree
IIBTree=IIBTree.Bucket
from intSet import intSet
import operator
getitem=operator.__getitem__
from Splitter import Splitter
from string import strip
import string, regex, regsub
class TextIndex(Persistent):
def __init__(self,data=None,schema=None,id=None):
def __init__(self, data=None, schema=None, id=None,
ignore_ex=None, call_methods=None):
"""Create an index
The arguments are:
'data' -- a mapping from integer object ids to objects or records,
'data' -- a mapping from integer object ids to objects or
records,
'schema' -- a mapping from item name to index into data records.
If 'data' is a mapping to objects, then schema should be 'None'.
'schema' -- a mapping from item name to index into data
records. If 'data' is a mapping to objects, then schema
should be 'None'.
'id' -- the name of the item attribute to index. This is
either an attribute name or a record key.
'ignore_ex' -- Tells the indexer to ignore exceptions that
are raised when indexing an object.
'call_methods' -- Tells the indexer to call methods instead
of getattr or getitem to get an attribute.
'id' -- the name of the item attribute to index. This is either
an attribute name or a record key.
"""
######################################################################
# For b/w compatibility, have to allow __init__ calls with zero args
if not data==schema==id==None:
if not data==schema==id==ignore_ex==call_methods==None:
self._data=data
self._schema=schema
self.id=id
self.ignore_ex=ignore_ex
self.call_methods=call_methods
self._index=BTree()
self._syn=stop_word_dict
self._reindex()
......@@ -245,47 +256,67 @@ class TextIndex(Persistent):
# for backwards compatibility
_init = __init__
def clear(self):
self._index=BTree()
self._index = BTree()
def positions(self, docid, words):
"""Return the positions of the given words in the document
with the given document id."""
id=self.id
id = self.id
if self._schema is None:
f=getattr
f = getattr
else:
f=getitem
id=self._schema[id]
f = operator.__getitem__
id = self._schema[id]
row=self._data[docid]
doc=str(f(row,id))
r=[]
row = self._data[docid]
if self.call_methods:
doc = str(f(row, id)())
else:
doc = str(f(row, id))
r = []
for word in words:
r=r+Splitter(doc, self._syn).indexes(word)
r = r+Splitter(doc, self._syn).indexes(word)
return r
def index_item(self,i,un=0):
"""Recompute index data for data with ids >= start."""
id=self.id
def index_item(self, i, obj=None, un=0):
"""Recompute index data for data with ids >= start.
if 'obj' is passed in, it is indexed instead of _data[i]"""
id = self.id
if self._schema is None:
f=getattr
f = getattr
else:
f=getitem
id=self._schema[id]
f = operator.__getitem__
id = self._schema[id]
row=self._data[i]
k=str(f(row,id))
if obj is None:
obj = self._data[i]
self._index_document(k,i,un)
if self.call_methods:
k = str(f(obj, id)())
else:
k = str(f(obj, id))
self._index_document(k, i ,un)
def unindex_item(self, i, obj=None):
return self.index_item(i, obj, 1)
def unindex_item(self, i): return self.index_item(i,1)
def _reindex(self,start=0):
def _reindex(self, start=0):
"""Recompute index data for data with ids >= start."""
for i in self._data.keys(start): self.index_item(i)
def _index_document(self, document_text, id, un=0,
tupleType=type(()),
dictType=type({}),
......@@ -293,21 +324,21 @@ class TextIndex(Persistent):
src = Splitter(document_text, self._syn)
d = {}
old=d.has_key
last=None
old = d.has_key
last = None
for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
if s[0] == '\"': last=self.subindex(s[1:-1], d, old, last)
else:
if old(s):
if s != last: d[s]=d[s]+1
else: d[s]=1
if s != last: d[s] = d[s]+1
else: d[s] = 1
index=self._index
get=index.get
index = self._index
get = index.get
if un:
for word,score in d.items():
r=get(word)
r = get(word)
if r is not None:
if type(r) is tupleType: del index[word]
else:
......@@ -315,27 +346,28 @@ class TextIndex(Persistent):
if type(r) is dictType:
if len(r) < 2:
if r:
for k, v in r.items(): index[word]=k,v
for k, v in r.items(): index[word] = k,v
else: del index[word]
else: index[word]=r
else: index[word] = r
else:
for word,score in d.items():
r=get(word)
r = get(word)
if r is not None:
r=index[word]
r = index[word]
if type(r) is tupleType:
r={r[0]:r[1]}
r[id]=score
index[word]=r
r = {r[0]:r[1]}
r[id] = score
index[word] = r
elif type(r) is dictType:
if len(r) > 4:
b=IIBTree()
for k, v in r.items(): b[k]=v
r=b
r[id]=score
index[word]=r
else: r[id]=score
else: index[word]=id,score
b = IIBTree()
for k, v in r.items(): b[k] = v
r = b
r[id] = score
index[word] = r
else: r[id] = score
else: index[word] = id, score
def _subindex(self, isrc, d, old, last):
......@@ -345,38 +377,41 @@ class TextIndex(Persistent):
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
else:
if old(s):
if s != last: d[s]=d[s]+1
else: d[s]=1
if s != last: d[s] = d[s]+1
else: d[s] = 1
return last
def __getitem__(self, word):
"""Return an InvertedIndex-style result "list"
"""
src = tuple(Splitter(word, self._syn))
if not src: return ResultList({},(word,),self)
if not src: return ResultList({}, (word,), self)
if len(src) == 1:
src=src[0]
if src[:1]=='"' and src[-1:]=='"': return self[src]
r=self._index.get(word,None)
if r is None: r={}
return ResultList(r,(word,),self)
r = self._index.get(word,None)
if r is None: r = {}
return ResultList(r, (word,), self)
r=None
r = None
for word in src:
rr=self[word]
if r is None: r=rr
else: r=r.near(rr)
rr = self[word]
if r is None: r = rr
else: r = r.near(rr)
return r
def _apply_index(self, request, cid='', ListType=[]):
"""Apply the index to query parameters given in the argument, request
""" Apply the index to query parameters given in the argument,
request
The argument should be a mapping object.
If the request does not contain the needed parameters, then None is
returned.
If the request does not contain the needed parameters, then
None is returned.
Otherwise two objects are returned. The first object is a
ResultSet containing the record numbers of the matching
......@@ -384,30 +419,30 @@ class TextIndex(Persistent):
all data fields used.
"""
id=self.id
id = self.id
cidid="%s/%s" % (cid,id)
has_key=request.has_key
if has_key(cidid): keys=request[cidid]
elif has_key(id): keys=request[id]
cidid = "%s/%s" % (cid, id)
has_key = request.has_key
if has_key(cidid): keys = request[cidid]
elif has_key(id): keys =request[id]
else: return None
if type(keys) is type(''):
if not keys or not strip(keys): return None
keys=[keys]
r=None
keys = [keys]
r = None
for key in keys:
key=strip(key)
key = strip(key)
if not key: continue
rr=intSet()
rr = intSet()
try:
for i,score in query(key,self).items():
if score: rr.insert(i)
except KeyError: pass
if r is None: r=rr
if r is None: r = rr
else:
# Note that we *and*/*narrow* multiple search terms.
r=r.intersection(rr)
r = r.intersection(rr)
if r is not None: return r, (id,)
return intSet(), (id,)
......@@ -415,74 +450,78 @@ class TextIndex(Persistent):
class ResultList:
def __init__(self, d, words, index, TupleType=type(())):
self._index=index
self._words=words
self._index = index
self._words = words
if (type(d) is TupleType): self._dict = { d[0] : d[1] }
else: self._dict = d
def __len__(self): return len(self._dict)
def __getitem__(self, key): return self._dict[key]
def keys(self): return self._dict.keys()
def has_key(self, key): return self._dict.has_key(key)
def items(self): return self._dict.items()
def __and__(self, x):
result = {}
dict=self._dict
xdict=x._dict
xhas=xdict.has_key
dict = self._dict
xdict = x._dict
xhas = xdict.has_key
for id, score in dict.items():
if xhas(id): result[id]=xdict[id]+score
if xhas(id): result[id] = xdict[id]+score
return self.__class__(result, self._words+x._words, self._index)
def and_not(self, x):
result = {}
dict=self._dict
xdict=x._dict
xhas=xdict.has_key
dict = self._dict
xdict = x._dict
xhas = xdict.has_key
for id, score in dict.items():
if not xhas(id): result[id]=xdict[id]+score
if not xhas(id): result[id] = xdict[id]+score
return self.__class__(result, self._words, self._index)
def __or__(self, x):
result = {}
dict=self._dict
has=dict.has_key
xdict=x._dict
xhas=xdict.has_key
dict = self._dict
has = dict.has_key
xdict = x._dict
xhas = xdict.has_key
for id, score in dict.items():
if xhas(id): result[id]=xdict[id]+score
else: result[id]=score
if xhas(id): result[id] = xdict[id]+score
else: result[id] = score
for id, score in xdict.items():
if not has(id): result[id]=score
if not has(id): result[id] = score
return self.__class__(result, self._words+x._words, self._index)
def near(self, x):
result = {}
dict=self._dict
xdict=x._dict
xhas=xdict.has_key
positions=self._index.positions
dict = self._dict
xdict = x._dict
xhas = xdict.has_key
positions = self._index.positions
for id, score in dict.items():
if not xhas(id): continue
p=(map(lambda i: (i,0), positions(id,self._words))+
map(lambda i: (i,1), positions(id,x._words)))
p.sort()
d=lp=9999
li=None
lsrc=None
d = lp = 9999
li = None
lsrc = None
for i,src in p:
if i is not li and src is not lsrc and li is not None:
d=min(d,i-li)
li=i
lsrc=src
if d==lp: score=min(score,xdict[id]) # synonyms
else: score=(score+xdict[id])/d
result[id]=score
d = min(d,i-li)
li = i
lsrc = src
if d==lp: score = min(score,xdict[id]) # synonyms
else: score = (score+xdict[id])/d
result[id] = score
return self.__class__(result, self._words+x._words, self._index)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment