Commit 4082910c authored by Jim Fulton

1.30 resurrected.

parent 2470ea4c
...@@ -30,7 +30,7 @@ Example usage: ...@@ -30,7 +30,7 @@ Example usage:
print i['blah'] print i['blah']
$Id: InvertedIndex.py,v 1.32 1997/04/22 15:18:01 jim Exp $''' $Id: InvertedIndex.py,v 1.33 1997/04/22 15:19:04 jim Exp $'''
# Copyright # Copyright
# #
# Copyright 1996 Digital Creations, L.C., 910 Princess Anne # Copyright 1996 Digital Creations, L.C., 910 Princess Anne
...@@ -82,11 +82,8 @@ $Id: InvertedIndex.py,v 1.32 1997/04/22 15:18:01 jim Exp $''' ...@@ -82,11 +82,8 @@ $Id: InvertedIndex.py,v 1.32 1997/04/22 15:18:01 jim Exp $'''
# (540) 371-6909 # (540) 371-6909
# #
# $Log: InvertedIndex.py,v $ # $Log: InvertedIndex.py,v $
# Revision 1.32 1997/04/22 15:18:01 jim # Revision 1.33 1997/04/22 15:19:04 jim
# Cris' changes. # 1.30 resurected.
#
# Revision 1.31 1997/04/18 18:32:46 chris
# *** empty log message ***
# #
# Revision 1.30 1997/04/14 12:03:17 jim # Revision 1.30 1997/04/14 12:03:17 jim
# Fixed bug in proximity searches. # Fixed bug in proximity searches.
...@@ -193,12 +190,12 @@ $Id: InvertedIndex.py,v 1.32 1997/04/22 15:18:01 jim Exp $''' ...@@ -193,12 +190,12 @@ $Id: InvertedIndex.py,v 1.32 1997/04/22 15:18:01 jim Exp $'''
# #
# #
# #
__version__='$Revision: 1.32 $'[11:-2] __version__='$Revision: 1.33 $'[11:-2]
import regex, string, copy import regex, regsub, string, copy
from string import lower from string import lower
from WordSequence import WordSequence
from types import * from types import *
class ResultList: class ResultList:
...@@ -221,12 +218,7 @@ class ResultList: ...@@ -221,12 +218,7 @@ class ResultList:
''' '''
def __init__(self, d = None): def __init__(self, d = None):
if (d is None): self._dict = d or {}
self._dict = {}
elif (type(d) is TupleType):
self._dict = { d[0] : d[1:] }
else:
self._dict = d
def addentry(self, document_key, *info): def addentry(self, document_key, *info):
...@@ -495,13 +487,11 @@ class Index: ...@@ -495,13 +487,11 @@ class Index:
list_class=ResultList list_class=ResultList
def __init__(self, index_dictionary = None, synstop = None): def __init__(self, index_dictionary = None):
'Create an inverted index' 'Create an inverted index'
if (synstop is None): if (index_dictionary is None):
synstop = copy.copy(default_stop_words) index_dictionary = copy.copy(default_stop_words)
self.synstop = synstop
self.set_index(index_dictionary) self.set_index(index_dictionary)
...@@ -514,6 +504,11 @@ class Index: ...@@ -514,6 +504,11 @@ class Index:
self._index_object = index_dictionary self._index_object = index_dictionary
def split_words(self, s):
'split a string into separate words'
return regsub.split(s, '[^a-zA-Z]+')
def index(self, src, srckey): def index(self, src, srckey):
'''\ '''\
index(src, srckey) index(src, srckey)
...@@ -525,44 +520,59 @@ class Index: ...@@ -525,44 +520,59 @@ class Index:
key, srckey. For simple objects, the srckey may be the object itself, key, srckey. For simple objects, the srckey may be the object itself,
or it may be a key into some other data structure, such as a table. or it may be a key into some other data structure, such as a table.
''' '''
src = WordSequence(src, self.synstop)
import math
index = self._index_object
src = regsub.gsub('-[ \t]*\n[ \t]*', '', str(src)) # de-hyphenate
src = map(lower,filter(None, self.split_words(src)))
if (len(src) < 2):
return
nwords = math.log(len(src))
d = {} d = {}
i = -1 i = -1
for s in src: for s in src:
i = i + 1 i = i + 1
stopword_flag = 0
while (not stopword_flag):
try:
index_val = index[s]
except KeyError:
break
if (index_val is None):
stopword_flag = 1
elif (type(index_val) != StringType):
break
else:
s = index_val
else: # s is a stopword
continue
try: try:
d[s].append(i) d[s].append(i)
except KeyError: except KeyError:
d[s] = [ i ] d[s] = [ i ]
if (i < 1):
return
import math
nwords = math.log(i + 1)
addentry = self.addentry addentry = self.addentry
for word, positions in d.items(): for word, positions in d.items():
freq = int(100 * (len(positions) / nwords)) freq = int(10000 * (len(positions) / nwords))
addentry(word,srckey,(freq, positions)) addentry(word,srckey,(freq, positions))
def addentry(self,word,key,data): def addentry(self,word,key,data):
index = self._index_object index = self._index_object
try: try:
rl = index[word] rl = index[word]
except: except:
rl = ( key, ) + data rl = {}
index[word] = rl index[word] = rl
return
if (type(rl) is TupleType):
rl = { rl[0] : rl[1:] }
rl[key] = data rl[key] = data
def __getitem__(self, key): def __getitem__(self, key):
'''\ '''\
...@@ -576,7 +586,6 @@ class Index: ...@@ -576,7 +586,6 @@ class Index:
''' '''
index = self._index_object index = self._index_object
synstop = self.synstop
List = self.list_class List = self.list_class
if (type(key) == RegexType): if (type(key) == RegexType):
...@@ -603,19 +612,16 @@ class Index: ...@@ -603,19 +612,16 @@ class Index:
key = lower(key) key = lower(key)
while (1): while (type(key) == StringType):
try: try:
key = synstop[key] key = index[key]
except KeyError: except KeyError:
break return List()
if (key is None): if (key is None):
return List() return List()
try: return List(key)
return index[key]
except KeyError:
return List()
def keys(self): def keys(self):
...@@ -642,35 +648,37 @@ class Index: ...@@ -642,35 +648,37 @@ class Index:
del self[key][doc_key] del self[key][doc_key]
except KeyError: except KeyError:
continue continue
# else: else:
# s = WordSequence(s) s = regsub.gsub('-[ \t]*\n[ \t]*', '', str(s)) # de-hyphenate
# for key in s: s = filter(None, self.split_words(s))
# try:
# del self[key][doc_key] for key in s:
# except KeyError: try:
# continue del self[key][doc_key]
except KeyError:
continue
def get_stopwords(self): def get_stopwords(self):
synstop = self.synstop index = self._index_object
stopwords = [] stopwords = []
for key, val in synstop.items(): for word in index.keys():
if (value is None): if (index[word] is None):
stopwords.append(key) stopwords.append(word)
return stopwords return stopwords
def get_synonyms(self): def get_synonyms(self):
synstop = self.synstop index = self._index_object
syns = [] synonyms = {}
for key, val in synstop.items(): for word in index.keys():
if (type(value) is StringType): if (type(index[word]) == StringType):
syns.append(key) synonyms[word] = index[word]
return syns return synonyms
def get_document_keys(self): def get_document_keys(self):
...@@ -687,17 +695,6 @@ class Index: ...@@ -687,17 +695,6 @@ class Index:
return d.keys() return d.keys()
def highlight(self, text, positions, before, after):
ws = WordSequence(text, self.synstop)
positions.sort()
positions.reverse()
for position in positions:
start, end = ws.pos(position)
text = text[:start] + before + text[start:end] + after + text[end:]
return text
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment