Commit 39eb5d7a authored by chris

indexed data now stored as dictionaries rather than ResultLists.

indexing documents with fewer than two keywords fails silently rather
than raising an exception.
parent 144c0bc0
...@@ -30,7 +30,7 @@ Example usage: ...@@ -30,7 +30,7 @@ Example usage:
print i['blah'] print i['blah']
$Id: InvertedIndex.py,v 1.24 1997/03/24 20:22:27 chris Exp $''' $Id: InvertedIndex.py,v 1.25 1997/03/28 16:53:50 chris Exp $'''
# Copyright # Copyright
# #
# Copyright 1996 Digital Creations, L.C., 910 Princess Anne # Copyright 1996 Digital Creations, L.C., 910 Princess Anne
...@@ -82,6 +82,11 @@ $Id: InvertedIndex.py,v 1.24 1997/03/24 20:22:27 chris Exp $''' ...@@ -82,6 +82,11 @@ $Id: InvertedIndex.py,v 1.24 1997/03/24 20:22:27 chris Exp $'''
# (540) 371-6909 # (540) 371-6909
# #
# $Log: InvertedIndex.py,v $ # $Log: InvertedIndex.py,v $
# Revision 1.25 1997/03/28 16:53:50 chris
# indexed data now stored as dictionaries rather than ResultLists.
# indexing documents with fewer than two keywords fails silently rather
# than raising an exception.
#
# Revision 1.24 1997/03/24 20:22:27 chris # Revision 1.24 1997/03/24 20:22:27 chris
# *** empty log message *** # *** empty log message ***
# #
...@@ -167,7 +172,7 @@ $Id: InvertedIndex.py,v 1.24 1997/03/24 20:22:27 chris Exp $''' ...@@ -167,7 +172,7 @@ $Id: InvertedIndex.py,v 1.24 1997/03/24 20:22:27 chris Exp $'''
# #
# #
# #
__version__='$Revision: 1.24 $'[11:-2] __version__='$Revision: 1.25 $'[11:-2]
import regex, regsub, string, copy import regex, regsub, string, copy
...@@ -293,14 +298,16 @@ class ResultList: ...@@ -293,14 +298,16 @@ class ResultList:
for key,v in self.items(): for key,v in self.items():
try: try:
xv=x[key] xv = x[key]
v=v[0]+xv[0], v[1]+xv[1] v = v[0] + xv[0], v[1] + xv[1]
except: pass except: pass
result[key] = v result[key] = v
for key,v in x.items(): for key,v in x.items():
try: self[key] try:
except: result[key]=v self[key]
except:
result[key]=v
return result return result
...@@ -383,16 +390,18 @@ RegexType = type(regex.compile('')) ...@@ -383,16 +390,18 @@ RegexType = type(regex.compile(''))
IndexingError = 'InvertedIndex.IndexingError' IndexingError = 'InvertedIndex.IndexingError'
_default_stop_words = [ _default_stop_words = [
'also', 'an', 'and', 'are', 'at', 'be', 'been', 'being', 'but', 'by', 'about', 'all', 'also', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
'can', 'cannot', 'did', 'do', 'doing', 'either', 'else', 'even', 'for', 'because', 'been', 'being', 'but', 'by', 'can', 'cannot', 'did', 'do',
'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'herself', 'doing', 'each', 'either', 'else', 'even', 'for', 'from', 'get', 'got',
'him', 'himself', 'his', 'if', 'in', 'it', 'its', 'me', 'my', 'myself', 'had', 'has', 'have', 'he', 'her', 'hers', 'herself', 'him', 'himself',
'no', 'not', 'of', 'on', 'only', 'onto', 'or', 'our', 'ourselves', 'she', 'so', 'some', 'his', 'how', 'if', 'in', 'into', 'is', 'it', 'its', 'me', 'my', 'myself',
'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'there', 'no', 'not', 'of', 'on', 'one', 'only', 'onto', 'or', 'our', 'ourselves',
'these', 'they', 'this', 'those', 'to', 'too', 'unless', 'until', 'us', 'she', 'since', 'so', 'some', 'take', 'than', 'that', 'the', 'their', 'them',
'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through',
'who', 'whoever', 'whom', 'whomever', 'whose', 'why', 'with', 'without', 'to', 'too', 'unless', 'until', 'upon', 'us', 'very', 'was', 'we', 'were',
'would', 'yes', 'your', 'yours', 'yourself', 'yourselves', 'what', 'when', 'where', 'which', 'while', 'who', 'whoever', 'whom',
'whomever', 'whose', 'why', 'will', 'with', 'without', 'would', 'yes',
'you', 'your', 'yours', 'yourself', 'yourselves',
] ]
default_stop_words = {} default_stop_words = {}
...@@ -448,95 +457,92 @@ class Index: ...@@ -448,95 +457,92 @@ class Index:
print i['blah'] print i['blah']
''' '''
list_class = ResultList def __init__(self, index_dictionary = None)
'Create an inverted index'
def __init__(self, index_dictionary = None, list_class = None): if (index_dictionary is None):
'Create an inverted index' index_dictionary = copy.copy(default_stop_words)
if (list_class is not None):
self.list_class = list_class
if (index_dictionary is None): self.set_index(index_dictionary)
index_dictionary = copy.copy(default_stop_words)
self.set_index(index_dictionary)
def set_index(self, index_dictionary = None): def set_index(self, index_dictionary = None):
'Change the index dictionary for the index.' 'Change the index dictionary for the index.'
if (index_dictionary is None): if (index_dictionary is None):
index_dictionary = {} index_dictionary = {}
self._index_object = index_dictionary self._index_object = index_dictionary
def split_words(self, s): def split_words(self, s):
'split a string into separate words' 'split a string into separate words'
return regsub.split(s, '[^a-zA-Z]+') return regsub.split(s, '[^a-zA-Z]+')
def index(self, src, srckey): def index(self, src, srckey):
'''\ '''\
index(src, srckey) index(src, srckey)
Update the index by indexing the words in src to the key, srckey Update the index by indexing the words in src to the key, srckey
The source object, src, will be converted to a string and the The source object, src, will be converted to a string and the
words in the string will be used as indexes to retrieve the objects words in the string will be used as indexes to retrieve the objects
key, srckey. For simple objects, the srckey may be the object itself, key, srckey. For simple objects, the srckey may be the object itself,
or it may be a key into some other data structure, such as a table. or it may be a key into some other data structure, such as a table.
''' '''
import math import math
index = self._index_object index = self._index_object
src = regsub.gsub('-[ \t]*\n[ \t]*', '', str(src)) # de-hyphenate src = regsub.gsub('-[ \t]*\n[ \t]*', '', str(src)) # de-hyphenate
src = map(lower,filter(None, self.split_words(src))) src = map(lower,filter(None, self.split_words(src)))
if (len(src) < 2): if (len(src) < 2):
raise IndexingError, 'cannot index document with fewer than two keywords' return
nwords = math.log(len(src)) nwords = math.log(len(src))
d = {} d = {}
i = -1 i = -1
for s in src: for s in src:
i = i + 1 i = i + 1
stopword_flag = 0 stopword_flag = 0
while (not stopword_flag): while (not stopword_flag):
try: try:
index_val = index[s] index_val = index[s]
except KeyError: except KeyError:
break break
if (index_val is None):
stopword_flag = 1
elif (type(index_val) != StringType):
break
else:
s = index_val
else: # s is a stopword
continue
try: if (index_val is None):
d[s].append(i) stopword_flag = 1
except KeyError: elif (type(index_val) != StringType):
d[s] = [ i ] break
else:
s = index_val
else: # s is a stopword
continue
addentry=self.addentry try:
for word,positions in d.items(): d[s].append(i)
freq = int(10000 * (len(positions) / nwords)) except KeyError:
addentry(word,srckey,(freq, positions)) d[s] = [ i ]
addentry = self.addentry
for word, positions in d.items():
freq = int(10000 * (len(positions) / nwords))
addentry(word,srckey,(freq, positions))
def addentry(self,word,key,data): def addentry(self,word,key,data):
index=self._index_object index = self._index_object
try: rl=index[word] try:
rl = index[word]
except: except:
rl=self.list_class() rl = self.list_class()
index[word]=rl index[word] = {}
rl[key]=data
rl[key] = data
def __getitem__(self, key): def __getitem__(self, key):
'''\ '''\
...@@ -550,7 +556,7 @@ class Index: ...@@ -550,7 +556,7 @@ class Index:
''' '''
index = self._index_object index = self._index_object
List = self.list_class List = ResultList
if (type(key) == RegexType): if (type(key) == RegexType):
dict = {} dict = {}
...@@ -585,7 +591,7 @@ class Index: ...@@ -585,7 +591,7 @@ class Index:
if (key is None): if (key is None):
return List() return List()
return key return List(key)
def keys(self): def keys(self):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment