Commit 169f5a22 authored by 's avatar

Added copyrights

parent 7b598491
##############################################################################
##############################################################################
#
# Copyright
#
# Copyright 1996 Digital Creations, L.C., 910 Princess Anne
# Street, Suite 300, Fredericksburg, Virginia 22401 U.S.A. All
# rights reserved.
#
##############################################################################
__doc__='''Simple column indexes
$Id: Index.py,v 1.16 1998/12/14 16:32:55 jeffrey Exp $'''
__version__='$Revision: 1.16 $'[11:-2]
# Zope Public License (ZPL) Version 0.9.4
# ---------------------------------------
#
# Copyright (c) Digital Creations. All rights reserved.
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# 1. Redistributions in source code must retain the above
# copyright notice, this list of conditions, and the following
# disclaimer.
#
# 2. Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions, and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# 3. Any use, including use of the Zope software to operate a
# website, must either comply with the terms described below
# under "Attribution" or alternatively secure a separate
# license from Digital Creations.
#
# 4. All advertising materials, documentation, or technical papers
# mentioning features derived from or use of this software must
# display the following acknowledgement:
#
# "This product includes software developed by Digital
# Creations for use in the Z Object Publishing Environment
# (http://www.zope.org/)."
#
# 5. Names associated with Zope or Digital Creations must not be
# used to endorse or promote products derived from this
# software without prior written permission from Digital
# Creations.
#
# 6. Redistributions of any form whatsoever must retain the
# following acknowledgment:
#
# "This product includes software developed by Digital
# Creations for use in the Z Object Publishing Environment
# (http://www.zope.org/)."
#
# 7. Modifications are encouraged but must be packaged separately
# as patches to official Zope releases. Distributions that do
# not clearly separate the patches from the original work must
# be clearly labeled as unofficial distributions.
#
# Disclaimer
#
# THIS SOFTWARE IS PROVIDED BY DIGITAL CREATIONS ``AS IS'' AND
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
# SHALL DIGITAL CREATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
#
# Attribution
#
# Individuals or organizations using this software as a web site
# must provide attribution by placing the accompanying "button"
# and a link to the accompanying "credits page" on the website's
# main entry point. In cases where this placement of
# attribution is not feasible, a separate arrangment must be
# concluded with Digital Creations. Those using the software
# for purposes other than web sites must provide a corresponding
# attribution in locations that include a copyright using a
# manner best suited to the application environment.
#
# This software consists of contributions made by Digital
# Creations and many individuals on behalf of Digital Creations.
# Specific attributions are listed in the accompanying credits
# file.
#
##############################################################################
"""Simple column indices"""
__version__='$Revision: 1.17 $'[11:-2]
from Globals import Persistent
from BTree import BTree
......@@ -26,244 +99,190 @@ StringType=type('s')
def nonEmpty(s):
    """Return 1 unless 's' is an empty string.

    Any value that is not a string (including false values such as 0 or
    None) counts as non-empty; only the empty string yields 0.
    """
    if type(s) is not StringType:
        return 1
    if s:
        return 1
    return 0
class Index(Persistent):
    """Index object interface

    Maintains 'self._index', a BTree mapping each distinct value of one
    item attribute (or record column) to the set of integer ids of the
    items carrying that value.
    """

    def _init(self,data,schema,id):
        """Create an index

        The arguments are:

          'data' -- a mapping from integer object ids to objects or records,

          'schema' -- a mapping from item name to index into data records.
              If 'data' is a mapping to objects, then schema should be 'None'.

          'id' -- the name of the item attribute to index.  This is either
              an attribute name or a record key.
        """
        self._data=data
        self._schema=schema
        self.id=id
        self._index=BTree()
        self._reindex()

    def _accessor(self):
        """Return '(getter, key)' for reading the indexed value of a row.

        Without a schema the value is read as 'getattr(row, self.id)';
        with a schema it is read as 'row[self._schema[self.id]]'.
        """
        if self._schema is None:
            return getattr, self.id
        return operator.__getitem__, self._schema[self.id]

    def dpHasUniqueValuesFor(self, name):
        """Return 1 if 'name' is the column handled by this index, else 0."""
        if name == self.id:
            return 1
        else:
            return 0

    def dpUniqueValues(self, name=None, withLengths=0):
        """\
        returns the unique values for name

        'name' defaults to the indexed column; for any other name an
        empty list is returned.  Empty-string values are left out.

        if withLengths is true, returns a sequence of
        tuples of (value, length)
        """
        if name is None:
            name = self.id
        elif name != self.id:
            return []
        if not withLengths:
            return tuple(filter(nonEmpty,self._index.keys()))
        rl=[]
        for i in self._index.keys():
            if not nonEmpty(i): continue
            rl.append((i, len(self._index[i])))
        return tuple(rl)

    def clear(self):
        """Discard all index data by installing a fresh, empty BTree."""
        self._index=BTree()

    def _reindex(self,start=0):
        """Recompute index data for data with ids >= start."""
        index=self._index
        get=index.get
        # A full rebuild (start is false) begins from an empty index.
        if not start: index.clear()

        f, id = self._accessor()

        for i,row in self._data.items(start):
            k=f(row,id)
            # Missing values are not indexed.
            if k is None or k == MV: continue
            set=get(k)
            if set is None: index[k]=set=intSet()
            set.insert(i)

    def index_item(self,i):
        """Add the item with id 'i' to the index.

        The item is looked up in 'self._data'; nothing is recorded when
        its indexed value is None or the missing value 'MV'.
        """
        index=self._index

        f, id = self._accessor()

        row=self._data[i]
        k=f(row,id)

        if k is None or k == MV: return

        set=index.get(k)
        if set is None: index[k]=set=intSet()
        set.insert(i)

    def unindex_item(self,i):
        """Remove the item with id 'i' from the index.

        The value is read from the item as currently stored in
        'self._data', so this must be called before the item changes.
        """
        index=self._index

        f, id = self._accessor()

        row=self._data[i]
        k=f(row,id)
        set=index.get(k)
        if set is not None: set.remove(i)

    def _apply_index(self, request, cid=''):
        """Apply the index to query parameters given in the argument, request

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then None is
        returned.  The parameter is looked up first as 'cid/id' and then
        as plain 'id'.

        If the request contains a parameter with the name of the column
        + '_usage', it is sniffed for information on how to handle applying
        the index.  The value has the form 'operator:arg:...'; the only
        operator handled here is 'range', with the optional arguments
        'min' and 'max'.

        Otherwise two objects are returned.  The first object is a
        set containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """
        id=self.id              #name of the column

        cidid="%s/%s" % (cid,id)
        has_key=request.has_key
        if has_key(cidid): keys=request[cidid]
        elif has_key(id): keys=request[id]
        else: return None

        if type(keys) is not ListType: keys=[keys]
        index=self._index
        r=None
        anyTrue=0
        opr=None

        if request.has_key(id+'_usage'):
            # see if any usage params are sent to field
            opr=string.split(string.lower(request[id+"_usage"]),':')
            opr, opr_args=opr[0], opr[1:]

        if opr=="range":
            if 'min' in opr_args: lo=min(keys)
            else: lo=None
            if 'max' in opr_args: hi=max(keys)
            else: hi=None

            anyTrue=1
            try:
                # NOTE(review): a false upper bound (None, '' or 0) is
                # treated as "no upper bound"; presumably kept so blank
                # form inputs work -- confirm before tightening.
                if hi: setlist=index.items(lo,hi)
                else:  setlist=index.items(lo)
                for k,set in setlist:
                    if r is None: r=set
                    else: r=r.union(set)
            except KeyError: pass
        else:           #not a range
            get=index.get
            for key in keys:
                if key: anyTrue=1
                set=get(key)
                if set is not None:
                    if r is None: r=set
                    else: r = r.union(set)

        if r is None:
            # Only blank keys were given: behave as if no query was made.
            if anyTrue: r=intSet()
            else: return None

        return r, (id,)

##############################################################################
#
# $Log: Index.py,v $
# Revision 1.16 1998/12/14 16:32:55 jeffrey
# unique values listing now won't return empty strings
#
# Revision 1.15 1998/10/13 21:07:17 jeffrey
# added dpUniqueValues and dpHasUniqueValuesFor methods
#
# Revision 1.14 1998/02/25 22:38:34 jeffrey
# made the Index persistent, just as it should be
#
# Revision 1.13 1998/02/05 19:02:37 jim
# Replaced try/except with get
#
# Revision 1.12 1997/12/02 19:34:39 jeffrey
# fixed buglet in .clear() method
#
# Revision 1.11 1997/10/10 19:25:03 jeffrey
# fixed min:max buglet
#
# Revision 1.10 1997/10/10 18:34:56 jeffrey
# Added range searching/indexing
#
# Revision 1.9 1997/09/26 22:21:43 jim
# added protocol needed by searchable objects
#
# Revision 1.8 1997/09/23 16:46:48 jim
# Added logic to handle missing data.
#
# Revision 1.7 1997/09/17 18:58:08 brian
# Fixed a booboo in unindex_item
#
# Revision 1.6 1997/09/12 14:46:51 jim
# *** empty log message ***
#
# Revision 1.5 1997/09/12 14:18:04 jim
# Added logic to allow "blank" inputs.
#
# Revision 1.4 1997/09/10 21:46:18 jim
# Fixed bug that caused return of None when there were no matches.
#
# Revision 1.3 1997/09/10 17:25:26 jim
# Changed to use regular old BTree.
#
# Revision 1.2 1997/09/08 18:53:24 jim
# *** empty log message ***
#
# Revision 1.1 1997/09/08 18:52:04 jim
# *** empty log message ***
#
#
##############################################################################
##############################################################################
#
# Copyright
#
# Copyright 1997 Digital Creations, L.C., 910 Princess Anne
# Street, Suite 300, Fredericksburg, Virginia 22401 U.S.A. All
# rights reserved.
#
##############################################################################
__doc__='''Text Index
# Zope Public License (ZPL) Version 0.9.4
# ---------------------------------------
#
# Copyright (c) Digital Creations. All rights reserved.
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# 1. Redistributions in source code must retain the above
# copyright notice, this list of conditions, and the following
# disclaimer.
#
# 2. Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions, and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# 3. Any use, including use of the Zope software to operate a
# website, must either comply with the terms described below
# under "Attribution" or alternatively secure a separate
# license from Digital Creations.
#
# 4. All advertising materials, documentation, or technical papers
# mentioning features derived from or use of this software must
# display the following acknowledgement:
#
# "This product includes software developed by Digital
# Creations for use in the Z Object Publishing Environment
# (http://www.zope.org/)."
#
# 5. Names associated with Zope or Digital Creations must not be
# used to endorse or promote products derived from this
# software without prior written permission from Digital
# Creations.
#
# 6. Redistributions of any form whatsoever must retain the
# following acknowledgment:
#
# "This product includes software developed by Digital
# Creations for use in the Z Object Publishing Environment
# (http://www.zope.org/)."
#
# 7. Modifications are encouraged but must be packaged separately
# as patches to official Zope releases. Distributions that do
# not clearly separate the patches from the original work must
# be clearly labeled as unofficial distributions.
#
# Disclaimer
#
# THIS SOFTWARE IS PROVIDED BY DIGITAL CREATIONS ``AS IS'' AND
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
# SHALL DIGITAL CREATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
#
# Attribution
#
# Individuals or organizations using this software as a web site
# must provide attribution by placing the accompanying "button"
# and a link to the accompanying "credits page" on the website's
# main entry point. In cases where this placement of
# attribution is not feasible, a separate arrangment must be
# concluded with Digital Creations. Those using the software
# for purposes other than web sites must provide a corresponding
# attribution in locations that include a copyright using a
# manner best suited to the application environment.
#
# This software consists of contributions made by Digital
# Creations and many individuals on behalf of Digital Creations.
# Specific attributions are listed in the accompanying credits
# file.
#
##############################################################################
"""Text Index
Notes on a new text index design
......@@ -40,9 +116,9 @@ Notes on a new text index design
- Does the system save word positions as we do?
- What is the index indexing?
- What is the index indexing?
- What was the vocabulary of the system?
- What was the vocabulary of the system?
Let\'s see. Assume a 10,000 word vocabulary. Then we use
25-bytes per entry. Hm.....
......@@ -98,26 +174,26 @@ Notes on a new text index design
InvertedIndex -- word -> idSet
ResultIndex -- id -> docData
ResultIndex -- id -> docData
where:
word -- is a token, typically a word, but could be a name or a
number
word -- is a token, typically a word, but could be a name or a
number
textSearchResult -- id -> (score, positions)
textSearchResult -- id -> (score, positions)
id -- integer, say 4-byte.
positions -- sequence of integers.
id -- integer, say 4-byte.
positions -- sequence of integers.
score -- numeric measure of relevance, f(numberOfWords, positions)
score -- numeric measure of relevance, f(numberOfWords, positions)
numberOfWords -- number of words in source document.
numberOfWords -- number of words in source document.
idSet -- set of ids
idSet -- set of ids
docData -- numberOfWords, word->positions
docData -- numberOfWords, word->positions
Note that ids and positions are ints. We will build C
extensions for efficiently storing and pickling structures
......@@ -125,10 +201,8 @@ Notes on a new text index design
overhead and storage/retrieval times, as well as storage
space.
$Id: TextIndex.py,v 1.11 1998/09/28 20:43:22 jim Exp $'''
__version__='$Revision: 1.11 $'[11:-2]
"""
__version__='$Revision: 1.12 $'[11:-2]
from Globals import Persistent
import BTree, IIBTree
......@@ -144,197 +218,197 @@ import string, regex, regsub
class TextIndex(Persistent):
    """Text index over one attribute (or record column) of stored items.

    'self._index' maps each word to its postings, stored in one of three
    shapes depending on size: a '(docid, score)' tuple for a single
    document, a dictionary 'docid -> score' for a few documents, or an
    IIBTree once a dictionary has grown past four entries.
    """

    def _init(self,data,schema,id):
        """Create an index

        The arguments are:

          'data' -- a mapping from integer object ids to objects or records,

          'schema' -- a mapping from item name to index into data records.
              If 'data' is a mapping to objects, then schema should be 'None'.

          'id' -- the name of the item attribute to index.  This is either
              an attribute name or a record key.
        """
        self._data=data
        self._schema=schema
        self.id=id
        self._index=BTree()
        self._syn=stop_word_dict
        self._reindex()

    def _accessor(self):
        """Return '(getter, key)' for reading the indexed value of a row.

        Without a schema the value is read as 'getattr(row, self.id)';
        with a schema it is read as 'row[self._schema[self.id]]'.
        """
        if self._schema is None:
            return getattr, self.id
        return getitem, self._schema[self.id]

    def clear(self):
        """Discard all index data by installing a fresh, empty BTree."""
        self._index=BTree()

    def positions(self, docid, words):
        """Return the positions, in the document with id 'docid', of
        every word in 'words'.

        Positions are not stored in the index; the document text is
        split again on each call.
        """
        f, id = self._accessor()

        row=self._data[docid]
        doc=str(f(row,id))
        r=[]
        for word in words:
            r=r+Splitter(doc, self._syn).indexes(word)
        return r

    def index_item(self,i,un=0):
        """Index the item with id 'i', or unindex it when 'un' is true.

        The indexed value is converted with 'str', so non-text fields
        can be indexed as well.
        """
        f, id = self._accessor()

        row=self._data[i]
        k=str(f(row,id))

        self._index_document(k,i,un)

    def unindex_item(self, i):
        """Remove the item with id 'i' from the index."""
        return self.index_item(i,1)

    def _reindex(self,start=0):
        """Recompute index data for data with ids >= start."""
        for i in self._data.keys(start): self.index_item(i)

    def _index_document(self, document_text, id, un=0,
                        tupleType=type(()),
                        dictType=type({}),
                        ):
        """Add the words of 'document_text' to the index under document
        id 'id', or remove them when 'un' is true.

        The score recorded for a word is the count computed by
        '_subindex'.
        """
        d = {}
        # The counting loop is shared with quoted sub-phrases.
        self._subindex(document_text, d, d.has_key, None)

        index=self._index
        get=index.get
        if un:
            for word,score in d.items():
                r=get(word)
                if r is None: continue
                if type(r) is tupleType:
                    # Drop the single posting only if it really belongs
                    # to this document; never discard another
                    # document's entry.
                    if r[0] == id: del index[word]
                else:
                    if r.has_key(id): del r[id]
                    if type(r) is dictType:
                        if len(r) < 2:
                            if r:
                                # One posting left: collapse to a tuple.
                                for k, v in r.items(): index[word]=k,v
                            else: del index[word]
                        else: index[word]=r
        else:
            for word,score in d.items():
                r=get(word)
                if r is not None:
                    r=index[word]
                    if type(r) is tupleType:
                        # Second document for this word: tuple -> dict.
                        r={r[0]:r[1]}
                        r[id]=score
                        index[word]=r
                    elif type(r) is dictType:
                        if len(r) > 4:
                            # Dictionary is getting large: move to a BTree.
                            b=IIBTree()
                            for k, v in r.items(): b[k]=v
                            r=b
                        r[id]=score
                        index[word]=r
                    else: r[id]=score
                else: index[word]=id,score

    def _subindex(self, isrc, d, old, last):
        """Count the words of 'isrc' into the dictionary 'd'.

        'old' is 'd.has_key'.  A token that starts with a double quote
        is split and counted recursively.  A word already in 'd' is
        incremented only when it differs from 'last'.  Returns 'last'.
        """
        src = Splitter(isrc, self._syn)

        for s in src:
            if s[0] == '\"': last=self._subindex(s[1:-1],d,old,last)
            else:
                if old(s):
                    if s != last: d[s]=d[s]+1
                else: d[s]=1

        return last

    def __getitem__(self, word):
        """Return an InvertedIndex-style result "list"
        """
        src = tuple(Splitter(word, self._syn))
        # Nothing left after splitting (e.g. a stop word): empty result.
        if not src: return ResultList({},(word,),self)
        if len(src) == 1:
            src=src[0]
            if src[:1]=='"' and src[-1:]=='"': return self[src]
            # NOTE(review): the lookup uses the original 'word', not the
            # token returned by the splitter -- confirm this is intended.
            r=self._index.get(word,None)
            if r is None: r={}
            return ResultList(r,(word,),self)

        # Several words: combine the individual results by proximity.
        r=None
        for word in src:
            rr=self[word]
            if r is None: r=rr
            else: r=r.near(rr)

        return r

    def _apply_index(self, request, cid='', ListType=[]):
        """Apply the index to query parameters given in the argument, request

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then None is
        returned.  The parameter is looked up first as 'cid/id' and then
        as plain 'id'; a blank string value also yields None.

        Otherwise two objects are returned.  The first object is an
        intSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """
        id=self.id

        cidid="%s/%s" % (cid,id)
        has_key=request.has_key
        if has_key(cidid): keys=request[cidid]
        elif has_key(id): keys=request[id]
        else: return None

        if type(keys) is type(''):
            if not keys or not strip(keys): return None
            keys=[keys]
        r=None
        for key in keys:
            key=strip(key)
            if not key: continue
            rr=intSet()
            try:
                # Keep only the documents with a true (non-zero) score.
                for i,score in query(key,self).items():
                    if score: rr.insert(i)
            except KeyError: pass
            if r is None: r=rr
            else:
                # Note that we *and*/*narrow* multiple search terms.
                r=r.intersection(rr)

        if r is not None: return r, (id,)
        return intSet(), (id,)
class ResultList:
def __init__(self, d, words, index, TupleType=type(())):
self._index=index
self._words=words
self._index=index
self._words=words
if (type(d) is TupleType): self._dict = { d[0] : d[1] }
else: self._dict = d
......@@ -346,61 +420,61 @@ class ResultList:
def __and__(self, x):
result = {}
dict=self._dict
xdict=x._dict
xhas=xdict.has_key
dict=self._dict
xdict=x._dict
xhas=xdict.has_key
for id, score in dict.items():
if xhas(id): result[id]=xdict[id]+score
if xhas(id): result[id]=xdict[id]+score
return self.__class__(result, self._words+x._words, self._index)
def and_not(self, x):
result = {}
dict=self._dict
xdict=x._dict
xhas=xdict.has_key
dict=self._dict
xdict=x._dict
xhas=xdict.has_key
for id, score in dict.items():
if not xhas(id): result[id]=xdict[id]+score
if not xhas(id): result[id]=xdict[id]+score
return self.__class__(result, self._words, self._index)
def __or__(self, x):
result = {}
dict=self._dict
has=dict.has_key
xdict=x._dict
xhas=xdict.has_key
dict=self._dict
has=dict.has_key
xdict=x._dict
xhas=xdict.has_key
for id, score in dict.items():
if xhas(id): result[id]=xdict[id]+score
else: result[id]=score
if xhas(id): result[id]=xdict[id]+score
else: result[id]=score
for id, score in xdict.items():
if not has(id): result[id]=score
for id, score in xdict.items():
if not has(id): result[id]=score
return self.__class__(result, self._words+x._words, self._index)
def near(self, x):
result = {}
dict=self._dict
xdict=x._dict
xhas=xdict.has_key
positions=self._index.positions
dict=self._dict
xdict=x._dict
xhas=xdict.has_key
positions=self._index.positions
for id, score in dict.items():
if not xhas(id): continue
p=(map(lambda i: (i,0), positions(id,self._words))+
map(lambda i: (i,1), positions(id,x._words)))
p.sort()
d=lp=9999
li=None
lsrc=None
for i,src in p:
if i is not li and src is not lsrc and li is not None:
d=min(d,i-li)
li=i
lsrc=src
if d==lp: score=min(score,xdict[id]) # synonyms
else: score=(score+xdict[id])/d
result[id]=score
if not xhas(id): continue
p=(map(lambda i: (i,0), positions(id,self._words))+
map(lambda i: (i,1), positions(id,x._words)))
p.sort()
d=lp=9999
li=None
lsrc=None
for i,src in p:
if i is not li and src is not lsrc and li is not None:
d=min(d,i-li)
li=i
lsrc=src
if d==lp: score=min(score,xdict[id]) # synonyms
else: score=(score+xdict[id])/d
result[id]=score
return self.__class__(result, self._words+x._words, self._index)
......@@ -413,7 +487,7 @@ Near = '...'
QueryError='TextIndex.QueryError'
def query(s, index, default_operator = Or,
ws = (string.whitespace,)):
ws = (string.whitespace,)):
# First replace any occurences of " and not " with " andnot "
s = regsub.gsub('[%s]+and[%s]*not[%s]+' % (ws * 3), ' andnot ', s)
q = parse(s)
......@@ -444,9 +518,9 @@ def parse(s):
return l
def parse2(q, default_operator,
operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near},
ListType=type([]),
):
operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near},
ListType=type([]),
):
'''Find operators and operands'''
i = 0
isop=operator_dict.has_key
......@@ -457,8 +531,8 @@ def parse2(q, default_operator,
if ((i % 2) != 0):
# This word should be an operator; if it is not, splice in
# the default operator.
if isop(q[i]): q[i] = operator_dict[q[i]]
if isop(q[i]): q[i] = operator_dict[q[i]]
else: q[i : i] = [ default_operator ]
i = i + 1
......@@ -471,17 +545,17 @@ def parens(s, parens_regex = regex.compile("(\|)")):
if (parens_regex.search(s) < 0): return None
if (parens_regex.group(0) == ")"):
raise QueryError, "Mismatched parentheses"
raise QueryError, "Mismatched parentheses"
open = parens_regex.regs[0][0] + 1
start = parens_regex.regs[0][1]
p = 1
while (parens_regex.search(s, start) >= 0):
if (parens_regex.group(0) == ")"): p = p - 1
if (parens_regex.group(0) == ")"): p = p - 1
else: p = p + 1
start = parens_regex.regs[0][1]
start = parens_regex.regs[0][1]
if (p == 0): return (open, parens_regex.regs[0][0])
......@@ -496,16 +570,16 @@ def quotes(s, ws = (string.whitespace,)):
if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
for i in range(1,len(splitted),2):
# split the quoted region into words
splitted[i] = filter(None, split(splitted[i]))
# split the quoted region into words
splitted[i] = filter(None, split(splitted[i]))
# put the Proxmity operator in between quoted words
for j in range(1, len(splitted[i])):
splitted[i][j : j] = [ Near ]
# put the Proxmity operator in between quoted words
for j in range(1, len(splitted[i])):
splitted[i][j : j] = [ Near ]
for i in range(len(splitted)-1,-1,-2):
# split the non-quoted region into words
splitted[i:i+1] = filter(None, split(splitted[i]))
# split the non-quoted region into words
splitted[i:i+1] = filter(None, split(splitted[i]))
splitted = filter(None, splitted)
else:
......@@ -535,8 +609,8 @@ def evaluate(q, index,ListType=type([])):
'''Evaluate a parsed query'''
if (len(q) == 1):
if (type(q[0]) is ListType):
return evaluate(q[0], index)
if (type(q[0]) is ListType):
return evaluate(q[0], index)
return index[q[0]]
......@@ -551,25 +625,25 @@ def evaluate(q, index,ListType=type([])):
i = 0
while (i < len(q)):
if q[i] is And:
left, right = get_operands(q, i, index)
val = left & right
q[(i - 1) : (i + 2)] = [ val ]
left, right = get_operands(q, i, index)
val = left & right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Or:
left, right = get_operands(q, i, index)
val = left | right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
left, right = get_operands(q, i, index)
val = left | right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Near:
left, right = get_operands(q, i, index)
val = left.near(right)
q[(i - 1) : (i + 2)] = [ val ]
left, right = get_operands(q, i, index)
val = left.near(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
if (len(q) != 1): raise QueryError, "Malformed query"
......@@ -624,46 +698,3 @@ stop_words=(
)
stop_word_dict={}
for word in stop_words: stop_word_dict[word]=None
##############################################################################
#
# $Log: TextIndex.py,v $
# Revision 1.11 1998/09/28 20:43:22 jim
# Fixed bug in searches on stop words.
#
# Revision 1.10 1998/02/05 19:02:09 jim
# Changed to use get method.
#
# Revision 1.9 1998/02/05 15:24:22 jim
# Got rid of most try/excepts.
#
# Revision 1.8 1997/12/02 19:36:19 jeffrey
# fixed bug in .clear() method
#
# Revision 1.7 1997/12/01 22:58:48 jeffrey
# Allow indexing of non-text fields
#
# Revision 1.6 1997/11/03 18:59:59 jim
# Fixed several bugs in handling query parsing and proximity search.
#
# Revision 1.5 1997/11/03 15:17:12 jim
# Updated to use new indexing strategy. Now, no longer store positions
# in index, but get them on demand from doc.
#
# Removed vestiges of InvertedIndex.
#
# Revision 1.4 1997/09/26 22:21:44 jim
# added protocol needed by searchable objects
#
# Revision 1.3 1997/09/17 17:53:32 jim
# Added unindex_item.
# This thing needs an overhaul; already. :-(
#
# Revision 1.2 1997/09/12 14:25:40 jim
# Added logic to allow "blank" inputs.
#
# Revision 1.1 1997/09/11 22:19:09 jim
# *** empty log message ***
#
#
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment