Commit b0a8f678 authored by chris

added list_class argument to Index __init__

parent 2813712a
...@@ -32,7 +32,7 @@ Example usage: ...@@ -32,7 +32,7 @@ Example usage:
InvertedIndex provides three types of indexes: one non-persistent InvertedIndex provides three types of indexes: one non-persistent
index, Index, and two persistent indexes, Persistent and Transactional. index, Index, and two persistent indexes, Persistent and Transactional.
$Id: InvertedIndex.py,v 1.9 1996/12/23 21:54:10 chris Exp $''' $Id: InvertedIndex.py,v 1.10 1997/01/29 16:48:40 chris Exp $'''
# Copyright # Copyright
# #
# Copyright 1996 Digital Creations, L.C., 910 Princess Anne # Copyright 1996 Digital Creations, L.C., 910 Princess Anne
...@@ -66,7 +66,7 @@ $Id: InvertedIndex.py,v 1.9 1996/12/23 21:54:10 chris Exp $''' ...@@ -66,7 +66,7 @@ $Id: InvertedIndex.py,v 1.9 1996/12/23 21:54:10 chris Exp $'''
# #
# Limitation Of Liability # Limitation Of Liability
# #
# In no event will DCLC be liable for direct, indirect, special, # In no event will DCLC be liable for direct, indirect, special,
# incidental, economic, cover, or consequential damages arising # incidental, economic, cover, or consequential damages arising
# out of the use of or inability to use this software even if # out of the use of or inability to use this software even if
# advised of the possibility of such damages. Some states do not # advised of the possibility of such damages. Some states do not
...@@ -84,6 +84,9 @@ $Id: InvertedIndex.py,v 1.9 1996/12/23 21:54:10 chris Exp $''' ...@@ -84,6 +84,9 @@ $Id: InvertedIndex.py,v 1.9 1996/12/23 21:54:10 chris Exp $'''
# (540) 371-6909 # (540) 371-6909
# #
# $Log: InvertedIndex.py,v $ # $Log: InvertedIndex.py,v $
# Revision 1.10 1997/01/29 16:48:40 chris
# added list_class argument to Index __init__
#
# Revision 1.9 1996/12/23 21:54:10 chris # Revision 1.9 1996/12/23 21:54:10 chris
# Checked out by Chris for testing/editing. # Checked out by Chris for testing/editing.
# #
...@@ -116,7 +119,7 @@ $Id: InvertedIndex.py,v 1.9 1996/12/23 21:54:10 chris Exp $''' ...@@ -116,7 +119,7 @@ $Id: InvertedIndex.py,v 1.9 1996/12/23 21:54:10 chris Exp $'''
# #
# #
# #
__version__='$Revision: 1.9 $'[11:-2] __version__='$Revision: 1.10 $'[11:-2]
import regex, regsub, string, marshal import regex, regsub, string, marshal
...@@ -126,44 +129,77 @@ from types import * ...@@ -126,44 +129,77 @@ from types import *
class ResultList: class ResultList:
'''\ '''\
This object holds the list of frequency/key pairs for a word This object holds the information for a word in an inverted index. It
in an inverted index. provides mapping behavior, mapping document keys to corresponding
document information, including the frequency value.
Union of two ResultList objects may be performed with the | operator. Union of two ResultList objects may be performed with the | operator.
Intersection of two ResultList objects may be performed with the & operator. Intersection of two ResultList objects may be performed with the & operator.
A "not" operation may be performed on a ResultList using its Not() method. Other methods:
ResultList frequency/key pairs may be sorted highest frequency to lowest Not()
using the sort() method. near()
keys()
items()
sorted_items()
''' '''
def __init__(self, freq_key_pairs = None): def __init__(self, d = {}):
if (freq_key_pairs is None): self._dict = d
self._list = []
else:
self._list = freq_key_pairs
def addentry(self, freq, key): def addentry(self, document_key, *info):
self._list.append((freq, key)) '''\
addentry(document_key, *info)
add a document and related information to this ResultList'''
self._dict[document_key] = info
def __str__(self): def __str__(self):
return `self._list` return `self._dict`
def __len__(self): def __len__(self):
return len(self._list) return len(self._dict)
def __getitem__(self, key):
return self._dict[key]
def __delitem__(self, key):
del self._dict[key]
def keys(self):
'''\
keys()
get the documents in this ResultList'''
return self._dict.keys()
def has_key(self, key):
return self._dict.has_key(key)
def items(self):
'''items()
get a list of document key/document information pairs'''
return self._dict.items()
def __getitem__(self, i): def sorted_items(self):
return self._list[i] '''sorted_items()
get a list of document key/document information pairs, sorted
from highest to lowest frequency'''
def __getslice__(self, i, j): items = self._dict.items()
return self._list[i : j] items.sort(lambda x, y: -cmp(x[1][0], y[1][0]))
return items
def __and__(self, x): def __and__(self, x):
...@@ -172,15 +208,12 @@ class ResultList: ...@@ -172,15 +208,12 @@ class ResultList:
by calculating the geometric mean of each pair of corresponding by calculating the geometric mean of each pair of corresponding
frequencies.''' frequencies.'''
result = [] result = {}
d = {}
for entry in self._list:
d[entry[1]] = entry[0]
for entry in x._list: for key in x.keys():
try: try:
result.append((pow(d[entry[1]] * entry[0], 0.5), entry[1])) result[key] = ( pow(self[key][0] * x[key][0], 0.5), None )
except: except KeyError:
pass pass
return ResultList(result) return ResultList(result)
...@@ -192,19 +225,16 @@ class ResultList: ...@@ -192,19 +225,16 @@ class ResultList:
combined by calculating the sum of each pair of corresponding combined by calculating the sum of each pair of corresponding
frequencies.''' frequencies.'''
result = [] result = {}
d = {}
for entry in self._list:
d[entry[1]] = entry[0]
for entry in x._list: for key in self.keys():
try: result[key] = ( self[key][0], None )
d[entry[1]] = d[entry[1]] + entry[0]
except:
d[entry[1]] = entry[0]
for key in d.keys(): for key in x.keys():
result.append((d[key], key)) try:
result[key] = (result[key][0] + x[key][0], None)
except KeyError:
result[key] = ( x[key][0], None )
return ResultList(result) return ResultList(result)
...@@ -220,64 +250,70 @@ class ResultList: ...@@ -220,64 +250,70 @@ class ResultList:
ResultList instance.''' ResultList instance.'''
index = index._index_object index = index._index_object
res = None
exclude = {}
for item in self._list:
exclude[item[1]] = 1
for key in index.keys(): for key in index.keys():
for item in index[key]._list: try:
if (not exclude.has_key(item[1])): keys = index[key].keys()
try: except KeyError:
res = res | ResultList([item]) continue
except:
res = ResultList([item]) index_val = index[key]
for key in keys:
if (not self.has_key(key)):
if (res):
res = res | { key : index_val[key] }
else:
res = ResultList({ key : index_val[key] })
try: if (res):
return res return res
except:
return ResultList()
return ResultList()
def __sub__(self, x):
pass
def near(self, x, distance = 1):
result = {}
def __add__(self, x): for key in self.keys():
return ResultList(self._list + x[:]) try:
value = x[key]
except KeyError:
continue
positions1 = self[key][1]
positions2 = value[1]
def sort(self): for position1 in positions1:
'''\ for position2 in positions2:
sort()
Sort the frequency/key pairs in the ResultList by highest to lowest if (position1 is None or position2 is None):
frequency''' break
self._list.sort() prox = position2 - position1
self._list.reverse() if ((prox > 0) and (prox <= distance)):
rel = pow(self[key][0] * value[0], 0.5)
try:
pos = result[key][1] + [ position2 ]
except KeyError:
pos = [ position2 ]
def __getstate__(self): result[key] = (rel, pos)
l = self._list else:
new_l = [] continue
for key, freq in l:
new_l = new_l + [ key, freq ] break
return marshal.dumps(new_l)
return ResultList(result)
def __setstate__(self, marshaled_state):
l = marshal.loads(marshaled_state)
if (len(l) and l[0] is not TupleType): def __getstate__(self):
new_l = [] return self._dict
for i in range(0, len(l), 2):
new_l.append(tuple(l[i : (i + 2)]))
l = new_l
self._list = l def __setstate__(self, state):
self._dict = state
RegexType = type(regex.compile('')) RegexType = type(regex.compile(''))
...@@ -326,12 +362,15 @@ class Index: ...@@ -326,12 +362,15 @@ class Index:
# perform a test search # perform a test search
print i['blah'] print i['blah']
''' #' '''
list_class = ResultList list_class = ResultList
def __init__(self, index_dictionary = None): def __init__(self, index_dictionary = None, list_class = None):
'Create an inverted index' 'Create an inverted index'
if (list_class is not None):
self.list_class = list_class
self.set_index(index_dictionary) self.set_index(index_dictionary)
...@@ -380,10 +419,12 @@ class Index: ...@@ -380,10 +419,12 @@ class Index:
nwords = math.log(len(src)) nwords = math.log(len(src))
i = {} d = {}
for s in src: for i in range(len(src)):
s = src[i]
s = string.lower(s) s = string.lower(s)
stopword_flag = 0 stopword_flag = 0
while (not stopword_flag): while (not stopword_flag):
try: try:
index_val = index[s] index_val = index[s]
...@@ -400,22 +441,21 @@ class Index: ...@@ -400,22 +441,21 @@ class Index:
continue continue
try: try:
i[s] = i[s] + 1 d[s].append(i)
except: except KeyError:
i[s] = 1 d[s] = [ i ]
for s in i.keys(): for s in d.keys():
freq = int(10000 * (i[s] / nwords)) freq = int(10000 * (len(d[s]) / nwords))
try: try:
index[s].addentry(freq, srckey) index[s].addentry(srckey, freq, d[s])
except: except:
index[s] = List([(freq, srckey)]) index[s] = List({srckey : (freq, d[s])})
def __getitem__(self, key): def __getitem__(self, key):
''' '''
Get the ResultList objects for the inverted key, key, sorted by Get the ResultList objects for the inverted key, key.
frequency.
The key may be a regular expression, in which case a regular The key may be a regular expression, in which case a regular
expression match is done. expression match is done.
...@@ -451,11 +491,6 @@ class Index: ...@@ -451,11 +491,6 @@ class Index:
key = string.lower(key) key = string.lower(key)
try:
key = index[key]
except KeyError:
return List()
while (type(key) == StringType): while (type(key) == StringType):
try: try:
key = index[key] key = index[key]
...@@ -475,6 +510,15 @@ class Index: ...@@ -475,6 +510,15 @@ class Index:
def __len__(self): def __len__(self):
return len(self._index_object) return len(self._index_object)
def remove_document(self, doc_key, s = None):
if (s is None):
for key in self.keys():
try:
del self[key][doc_key]
except:
continue
def get_stopwords(self): def get_stopwords(self):
index = self._index_object index = self._index_object
...@@ -498,12 +542,26 @@ class Index: ...@@ -498,12 +542,26 @@ class Index:
return synonyms return synonyms
def get_document_keys(self):
d = {}
for key in self.keys():
try:
doc_keys = self[key].keys()
except:
continue
for doc_key in doc_keys:
d[doc_key] = 1
return d.keys()
class PersistentResultList(ResultList, PickleDictionary.Persistent): class PersistentResultList(ResultList, PickleDictionary.Persistent):
def addentry(self, freq, key): def addentry(self, key, *info):
'''Add a frequency/key pair to this object''' '''Add a frequency/key pair to this object'''
self._list.append((freq, key)) ResultList.addentry(self, key, info)
self.__changed__(1) self.__changed__(1)
...@@ -512,7 +570,7 @@ class STPResultList(ResultList, SingleThreadedTransaction.Persistent): ...@@ -512,7 +570,7 @@ class STPResultList(ResultList, SingleThreadedTransaction.Persistent):
def addentry(self, freq, key): def addentry(self, freq, key):
'''Add a frequency/key pair to this object''' '''Add a frequency/key pair to this object'''
self._list.append((freq, key)) ResultList.addentry(self, key, info)
self.__changed__(1) self.__changed__(1)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment