Commit f0ee8a26 authored by Jim Fulton's avatar Jim Fulton

Chris' changes, I think....

parent 92b0ff07
......@@ -30,7 +30,7 @@ Example usage:
print i['blah']
$Id: InvertedIndex.py,v 1.28 1997/03/31 23:17:53 jim Exp $'''
$Id: InvertedIndex.py,v 1.29 1997/04/08 00:14:22 jim Exp $'''
# Copyright
#
# Copyright 1996 Digital Creations, L.C., 910 Princess Anne
......@@ -82,6 +82,9 @@ $Id: InvertedIndex.py,v 1.28 1997/03/31 23:17:53 jim Exp $'''
# (540) 371-6909
#
# $Log: InvertedIndex.py,v $
# Revision 1.29 1997/04/08 00:14:22 jim
# Chris' changes, I think....
#
# Revision 1.28 1997/03/31 23:17:53 jim
# I put back the list_class hook.
#
......@@ -181,7 +184,7 @@ $Id: InvertedIndex.py,v 1.28 1997/03/31 23:17:53 jim Exp $'''
#
#
#
__version__='$Revision: 1.28 $'[11:-2]
__version__='$Revision: 1.29 $'[11:-2]
import regex, regsub, string, copy
......@@ -190,208 +193,216 @@ from string import lower
from types import *
class ResultList:
'''\
This object holds the information for a word in an inverted index. It
provides mapping behavior, mapping document keys to corresponding
document information, including the frequency value.
Union of two ResultList objects may be performed with the | operator.
Intersection of two ResultList objects may be performed with the & operator.
Other methods:
Not()
near()
keys()
items()
sorted_items()
'''
def __init__(self, d = None):
self._dict = d or {}
def addentry(self, document_key, *info):
'''\
addentry(document_key, *info)
add a document and related information to this ResultList'''
self[document_key] = info
def __str__(self):
return `self._dict`
def __len__(self):
return len(self._dict)
def __getitem__(self, key):
return self._dict[key]
def __setitem__(self, key, v):
self._dict[key]=v
# Note self.__changed__(1) via the
def __delitem__(self, key):
del self._dict[key]
This object holds the information for a word in an inverted index. It
provides mapping behavior, mapping document keys to corresponding
document information, including the frequency value.
def keys(self):
'''\
keys()
get the documents in this ResultList'''
return self._dict.keys()
def has_key(self, key):
return self._dict.has_key(key)
def items(self):
'''items()
get a list of document key/document information pairs'''
return self._dict.items()
def sorted_items(self):
'''sorted_items()
get a
Sort the frequency/key pairs in the ResultList by highest to lowest
frequency'''
items = self._dict.items()
items.sort(lambda x, y: -cmp(x[1][0], y[1][0]))
return items
def __and__(self, x):
'''Allows intersection of two ResultList objects using the & operator.
When ResultLists are combined in this way, frequencies are combined
by calculating the geometric mean of each pair of corresponding
frequencies.'''
result = self.__class__()
for key,v in self.items():
try:
xv=x[key]
v=pow(v[0]*xv[0],0.5), v[1]+xv[1]
result[key] = v
except KeyError: pass
return result
def and_not(self, x):
'''Return items in the reciever that are not in the argument'''
result = self.__class__()
for key,v in self.items():
try: x[key]
except KeyError: result[key] = v
return result
Union of two ResultList objects may be performed with the | operator.
def __or__(self, x):
'''Allows union of two ResultList objects using the | operator.
When ResultLists are combined in this way, frequencies are
combined by calculating the sum of each pair of corresponding
frequencies.'''
result = self.__class__()
for key,v in self.items():
try:
xv = x[key]
v = v[0] + xv[0], v[1] + xv[1]
except: pass
result[key] = v
for key,v in x.items():
try:
self[key]
except:
result[key]=v
return result
def Not(self, index):
'''\
Not(index)
Perform a "not" operation on a ResultList object.
Not() returns the union of all ResultLists in the index that do
not contain a link to a document that is found in "self".
This method should be passed the Index object that returned the
ResultList instance.'''
index = index._index_object
res = None
for key in index.keys():
try:
keys = index[key].keys()
except KeyError:
continue
index_val = index[key]
for key in keys:
if (not self.has_key(key)):
if (res):
res = res | { key : index_val[key] }
else:
res = self.__class__({ key : index_val[key] })
if (res):
return res
return self.__class__()
def near(self, x, distance = 1):
'''\
near(rl, distance = 1)
Intersection of two ResultList objects may be performed with the & operator.
Returns a ResultList containing documents which contain'''
result = self.__class__()
for key,v in self.items():
try: value = x[key]
except KeyError: value=None
if value is None: continue
positions = v[1]+value[1]
positions.sort()
positionsr=[]
rel = pow(v[0] * value[0], 0.5)
pl=positions[0]
rl=-1
for i in range(1,len(positions)):
p=positions[i]
d=p-pl
if d > 0 and d <= distance:
if pl != rl: positionsr.append(pl)
positionsr.append(p)
rl=p
pl=p
result[key]=positionsr
return result
def __getstate__(self):
return self._dict
def __setstate__(self, state):
self._dict = state
Other methods:
Not()
near()
keys()
items()
sorted_items()
'''
def __init__(self, d = None):
self._dict = d or {}
def addentry(self, document_key, *info):
'''\
addentry(document_key, *info)
add a document and related information to this ResultList'''
self[document_key] = info
def __str__(self):
return `self._dict`
def __len__(self):
return len(self._dict)
def __getitem__(self, key):
return self._dict[key]
def __setitem__(self, key, v):
self._dict[key]=v
# Note self.__changed__(1) via the
def __delitem__(self, key):
del self._dict[key]
def keys(self):
'''\
keys()
get the documents in this ResultList'''
return self._dict.keys()
def has_key(self, key):
return self._dict.has_key(key)
def items(self):
'''items()
get a list of document key/document information pairs'''
return self._dict.items()
def sorted_items(self):
'''sorted_items()
get a
Sort the frequency/key pairs in the ResultList by highest to lowest
frequency'''
items = self._dict.items()
items.sort(lambda x, y: -cmp(x[1][0], y[1][0]))
return items
def __and__(self, x):
'''Allows intersection of two ResultList objects using the & operator.
When ResultLists are combined in this way, frequencies are combined
by calculating the geometric mean of each pair of corresponding
frequencies.'''
result = self.__class__()
for key,v in self.items():
try:
xv=x[key]
v=pow(v[0]*xv[0],0.5), v[1]+xv[1]
result[key] = v
except KeyError: pass
return result
def and_not(self, x):
'''Return items in the reciever that are not in the argument'''
result = self.__class__()
for key,v in self.items():
try: x[key]
except KeyError: result[key] = v
return result
def __or__(self, x):
'''Allows union of two ResultList objects using the | operator.
When ResultLists are combined in this way, frequencies are
combined by calculating the sum of each pair of corresponding
frequencies.'''
result = self.__class__()
for key,v in self.items():
try:
xv = x[key]
v = v[0] + xv[0], v[1] + xv[1]
except: pass
result[key] = v
for key,v in x.items():
try:
self[key]
except:
result[key]=v
return result
def Not(self, index):
'''\
Not(index)
Perform a "not" operation on a ResultList object.
Not() returns the union of all ResultLists in the index that do
not contain a link to a document that is found in "self".
This method should be passed the Index object that returned the
ResultList instance.'''
index = index._index_object
res = None
for key in index.keys():
try:
keys = index[key].keys()
except KeyError:
continue
index_val = index[key]
for key in keys:
if (not self.has_key(key)):
if (res):
res = res | { key : index_val[key] }
else:
res = self.__class__({ key : index_val[key] })
if (res):
return res
return self.__class__()
def near(self, x, distance = 1):
'''\
near(rl, distance = 1)
Returns a ResultList containing documents which contain'''
result = self.__class__()
for key, v in self.items():
try:
value = x[key]
except KeyError:
value = None
if value is None:
continue
positions = v[1] + value[1]
positions.sort()
positionsr = []
rel = pow(v[0] * value[0], 0.5)
pl = positions[0]
rl = -1
for i in range(1, len(positions)):
p = positions[i]
d = p - pl
if d > 0 and d <= distance:
if pl != rl:
positionsr.append(pl)
positionsr.append(p)
rl = p
pl = p
if (len(positionsr)):
result[key] = positionsr
return result
def __getstate__(self):
return self._dict
def __setstate__(self, state):
self._dict = state
RegexType = type(regex.compile(''))
......@@ -416,264 +427,264 @@ _default_stop_words = [
default_stop_words = {}
for w in _default_stop_words:
default_stop_words[w] = None
default_stop_words[w] = None
for w in string.letters:
default_stop_words[w] = None
default_stop_words[w] = None
class Index:
'''\
An inverted index.
This class handles indexing and searching.
An optional argument may be provided when instantiating
an Index object. This argument should be a dictionary
specifying stems, synonyms, and stopwords. The dictionary
may also be used to initialize the index with previously
indexed values. Within the dictionary, stopwords should
be keywords (string values) mapped to the Python value None;
stems and synonyms should be keywords mapped to their
corresponding keywords, and previously indexed values should
map a keyword to a ResultList object.
Indexing is performed using the index() method.
Searching is performed using the Index object\'s mapping
behaviour.
Example usage:
d = {
'and' : None, # Stopword
'or' : None, # Stopword
'not' : None, # Stopword
'running' : 'run', # Stem
}
doc = open('/usr/users/chris/doc.txt', 'r').read()
key = '/usr/users/chris/doc.txt'
# instantiate an Index object, passing it a dictionary
# containing stopwords and stems
i = InvertedIndex.Index(d)
# index the document, doc, with key, key.
i.index(doc, key)
# perform a test search
print i['blah']
'''
list_class=ResultList
def __init__(self, index_dictionary = None):
'Create an inverted index'
if (index_dictionary is None):
index_dictionary = copy.copy(default_stop_words)
self.set_index(index_dictionary)
def set_index(self, index_dictionary = None):
'Change the index dictionary for the index.'
if (index_dictionary is None):
index_dictionary = {}
self._index_object = index_dictionary
def split_words(self, s):
'split a string into separate words'
return regsub.split(s, '[^a-zA-Z]+')
def index(self, src, srckey):
'''\
index(src, srckey)
Update the index by indexing the words in src to the key, srckey
The source object, src, will be converted to a string and the
words in the string will be used as indexes to retrieve the objects
key, srckey. For simple objects, the srckey may be the object itself,
or it may be a key into some other data structure, such as a table.
'''
import math
index = self._index_object
src = regsub.gsub('-[ \t]*\n[ \t]*', '', str(src)) # de-hyphenate
src = map(lower,filter(None, self.split_words(src)))
if (len(src) < 2):
return
nwords = math.log(len(src))
d = {}
i = -1
for s in src:
i = i + 1
stopword_flag = 0
while (not stopword_flag):
try:
index_val = index[s]
except KeyError:
break
if (index_val is None):
stopword_flag = 1
elif (type(index_val) != StringType):
break
else:
s = index_val
else: # s is a stopword
continue
try:
d[s].append(i)
except KeyError:
d[s] = [ i ]
addentry = self.addentry
for word, positions in d.items():
freq = int(10000 * (len(positions) / nwords))
addentry(word,srckey,(freq, positions))
def addentry(self,word,key,data):
index = self._index_object
try:
rl = index[word]
except:
rl = {}
index[word] = rl
rl[key] = data
def __getitem__(self, key):
'''\
Get the ResultList objects for the inverted key, key.
The key may be a regular expression, in which case a regular
expression match is done.
The key may be a string, in which case an case-insensitive
match is done.
'''
index = self._index_object
List = self.list_class
if (type(key) == RegexType):
dict = {}
for k in index.keys():
if (key.search(k) >= 0):
try:
while (type(index[k]) == StringType):
k = index[k]
except KeyError:
continue
if (index[k] is None):
continue
dict[index[k]] = 1
Lists = dict.keys()
if (not len(Lists)):
return List()
return reduce(lambda x, y: x | y, Lists)
key = lower(key)
while (type(key) == StringType):
try:
key = index[key]
except KeyError:
return List()
if (key is None):
return List()
return List(key)
def keys(self):
return self._index_object.keys()
def __len__(self):
return len(self._index_object)
An inverted index.
def remove_document(self, doc_key, s = None):
'''\
remove_document(doc_key, s = None)
Remove a specified document from the index, given the document key.
Optionally, the document source may be provided. This helps to
speed up removal of documents from a large index.
This class handles indexing and searching.
An optional argument may be provided when instantiating
an Index object. This argument should be a dictionary
specifying stems, synonyms, and stopwords. The dictionary
may also be used to initialize the index with previously
indexed values. Within the dictionary, stopwords should
be keywords (string values) mapped to the Python value None;
stems and synonyms should be keywords mapped to their
corresponding keywords, and previously indexed values should
map a keyword to a ResultList object.
Indexing is performed using the index() method.
Searching is performed using the Index object\'s mapping
behaviour.
Example usage:
d = {
'and' : None, # Stopword
'or' : None, # Stopword
'not' : None, # Stopword
'running' : 'run', # Stem
}
doc = open('/usr/users/chris/doc.txt', 'r').read()
key = '/usr/users/chris/doc.txt'
# instantiate an Index object, passing it a dictionary
# containing stopwords and stems
i = InvertedIndex.Index(d)
# index the document, doc, with key, key.
i.index(doc, key)
# perform a test search
print i['blah']
'''
if (s is None):
for key in self.keys():
try:
del self[key][doc_key]
except KeyError:
continue
else:
s = regsub.gsub('-[ \t]*\n[ \t]*', '', str(s)) # de-hyphenate
s = filter(None, self.split_words(s))
for key in s:
list_class=ResultList
def __init__(self, index_dictionary = None):
'Create an inverted index'
if (index_dictionary is None):
index_dictionary = copy.copy(default_stop_words)
self.set_index(index_dictionary)
def set_index(self, index_dictionary = None):
'Change the index dictionary for the index.'
if (index_dictionary is None):
index_dictionary = {}
self._index_object = index_dictionary
def split_words(self, s):
'split a string into separate words'
return regsub.split(s, '[^a-zA-Z]+')
def index(self, src, srckey):
'''\
index(src, srckey)
Update the index by indexing the words in src to the key, srckey
The source object, src, will be converted to a string and the
words in the string will be used as indexes to retrieve the objects
key, srckey. For simple objects, the srckey may be the object itself,
or it may be a key into some other data structure, such as a table.
'''
import math
index = self._index_object
src = regsub.gsub('-[ \t]*\n[ \t]*', '', str(src)) # de-hyphenate
src = map(lower,filter(None, self.split_words(src)))
if (len(src) < 2):
return
nwords = math.log(len(src))
d = {}
i = -1
for s in src:
i = i + 1
stopword_flag = 0
while (not stopword_flag):
try:
index_val = index[s]
except KeyError:
break
if (index_val is None):
stopword_flag = 1
elif (type(index_val) != StringType):
break
else:
s = index_val
else: # s is a stopword
continue
try:
d[s].append(i)
except KeyError:
d[s] = [ i ]
addentry = self.addentry
for word, positions in d.items():
freq = int(10000 * (len(positions) / nwords))
addentry(word,srckey,(freq, positions))
def addentry(self,word,key,data):
index = self._index_object
try:
del self[key][doc_key]
except KeyError:
continue
def get_stopwords(self):
index = self._index_object
stopwords = []
for word in index.keys():
if (index[word] is None):
stopwords.append(word)
return stopwords
def get_synonyms(self):
index = self._index_object
synonyms = {}
for word in index.keys():
if (type(index[word]) == StringType):
synonyms[word] = index[word]
return synonyms
def get_document_keys(self):
d = {}
for key in self.keys():
try:
doc_keys = self[key].keys()
except:
continue
for doc_key in doc_keys:
d[doc_key] = 1
return d.keys()
rl = index[word]
except:
rl = {}
index[word] = rl
rl[key] = data
def __getitem__(self, key):
'''\
Get the ResultList objects for the inverted key, key.
The key may be a regular expression, in which case a regular
expression match is done.
The key may be a string, in which case an case-insensitive
match is done.
'''
index = self._index_object
List = self.list_class
if (type(key) == RegexType):
dict = {}
for k in index.keys():
if (key.search(k) >= 0):
try:
while (type(index[k]) == StringType):
k = index[k]
except KeyError:
continue
if (index[k] is None):
continue
dict[index[k]] = 1
Lists = dict.keys()
if (not len(Lists)):
return List()
return reduce(lambda x, y: x | y, Lists)
key = lower(key)
while (type(key) == StringType):
try:
key = index[key]
except KeyError:
return List()
if (key is None):
return List()
return List(key)
def keys(self):
return self._index_object.keys()
def __len__(self):
return len(self._index_object)
def remove_document(self, doc_key, s = None):
'''\
remove_document(doc_key, s = None)
Remove a specified document from the index, given the document key.
Optionally, the document source may be provided. This helps to
speed up removal of documents from a large index.
'''
if (s is None):
for key in self.keys():
try:
del self[key][doc_key]
except KeyError:
continue
else:
s = regsub.gsub('-[ \t]*\n[ \t]*', '', str(s)) # de-hyphenate
s = filter(None, self.split_words(s))
for key in s:
try:
del self[key][doc_key]
except KeyError:
continue
def get_stopwords(self):
index = self._index_object
stopwords = []
for word in index.keys():
if (index[word] is None):
stopwords.append(word)
return stopwords
def get_synonyms(self):
index = self._index_object
synonyms = {}
for word in index.keys():
if (type(index[word]) == StringType):
synonyms[word] = index[word]
return synonyms
def get_document_keys(self):
d = {}
for key in self.keys():
try:
doc_keys = self[key].keys()
except:
continue
for doc_key in doc_keys:
d[doc_key] = 1
return d.keys()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment