Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
b5ae0084
Commit
b5ae0084
authored
Apr 23, 1997
by
chris
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
*** empty log message ***
parent
4082910c
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
782 additions
and
74 deletions
+782
-74
lib/python/SearchIndex/InvertedIndex.py
lib/python/SearchIndex/InvertedIndex.py
+782
-74
No files found.
lib/python/SearchIndex/InvertedIndex.py
View file @
b5ae0084
...
...
@@ -30,7 +30,7 @@ Example usage:
print i['blah']
$Id: InvertedIndex.py,v 1.3
3 1997/04/22 15:19:04 jim
Exp $'''
$Id: InvertedIndex.py,v 1.3
4 1997/04/23 16:53:27 chris
Exp $'''
# Copyright
#
# Copyright 1996 Digital Creations, L.C., 910 Princess Anne
...
...
@@ -82,8 +82,14 @@ $Id: InvertedIndex.py,v 1.33 1997/04/22 15:19:04 jim Exp $'''
# (540) 371-6909
#
# $Log: InvertedIndex.py,v $
# Revision 1.33 1997/04/22 15:19:04 jim
# 1.30 resurected.
# Revision 1.34 1997/04/23 16:53:27 chris
# *** empty log message ***
#
# Revision 1.32 1997/04/22 15:18:01 jim
# Cris' changes.
#
# Revision 1.31 1997/04/18 18:32:46 chris
# *** empty log message ***
#
# Revision 1.30 1997/04/14 12:03:17 jim
# Fixed bug in proximity searches.
...
...
@@ -190,12 +196,12 @@ $Id: InvertedIndex.py,v 1.33 1997/04/22 15:19:04 jim Exp $'''
#
#
#
__version__
=
'$Revision: 1.3
3
$'
[
11
:
-
2
]
__version__
=
'$Revision: 1.3
4
$'
[
11
:
-
2
]
import
regex
,
regsub
,
string
,
copy
import
regex
,
string
,
copy
from
string
import
lower
from
WordSequence
import
WordSequence
from
types
import
*
class
ResultList
:
...
...
@@ -218,7 +224,12 @@ class ResultList:
'''
def
__init__
(
self
,
d
=
None
):
self
.
_dict
=
d
or
{}
if
(
d
is
None
):
self
.
_dict
=
{}
elif
(
type
(
d
)
is
TupleType
):
self
.
_dict
=
{
d
[
0
]
:
d
[
1
:]
}
else
:
self
.
_dict
=
d
def
addentry
(
self
,
document_key
,
*
info
):
...
...
@@ -487,10 +498,12 @@ class Index:
list_class
=
ResultList
def
__init__
(
self
,
index_dictionary
=
None
):
def
__init__
(
self
,
index_dictionary
=
None
,
synstop
=
None
):
'Create an inverted index'
if
(
index_dictionary
is
None
):
index_dictionary
=
copy
.
copy
(
default_stop_words
)
if
(
synstop
is
None
):
synstop
=
copy
.
copy
(
default_stop_words
)
self
.
synstop
=
synstop
self
.
set_index
(
index_dictionary
)
...
...
@@ -504,11 +517,6 @@ class Index:
self
.
_index_object
=
index_dictionary
def
split_words
(
self
,
s
):
'split a string into separate words'
return
regsub
.
split
(
s
,
'[^a-zA-Z]+'
)
def
index
(
self
,
src
,
srckey
):
'''
\
index(src, srckey)
...
...
@@ -520,60 +528,45 @@ class Index:
key, srckey. For simple objects, the srckey may be the object itself,
or it may be a key into some other data structure, such as a table.
'''
import
math
index
=
self
.
_index_object
src
=
regsub
.
gsub
(
'-[
\
t
]*
\
n
[
\
t
]*'
,
''
,
str
(
src
))
# de-hyphenate
src
=
map
(
lower
,
filter
(
None
,
self
.
split_words
(
src
)))
if
(
len
(
src
)
<
2
):
return
nwords
=
math
.
log
(
len
(
src
))
src
=
WordSequence
(
src
,
self
.
synstop
)
d
=
{}
i
=
-
1
for
s
in
src
:
i
=
i
+
1
stopword_flag
=
0
while
(
not
stopword_flag
):
try
:
index_val
=
index
[
s
]
except
KeyError
:
break
if
(
index_val
is
None
):
stopword_flag
=
1
elif
(
type
(
index_val
)
!=
StringType
):
break
else
:
s
=
index_val
else
:
# s is a stopword
continue
try
:
d
[
s
].
append
(
i
)
except
KeyError
:
d
[
s
]
=
[
i
]
if
(
i
<
1
):
return
import
math
nwords
=
math
.
log
(
i
+
1
)
addentry
=
self
.
addentry
for
word
,
positions
in
d
.
items
():
freq
=
int
(
100
00
*
(
len
(
positions
)
/
nwords
))
freq
=
int
(
100
*
(
len
(
positions
)
/
nwords
))
addentry
(
word
,
srckey
,(
freq
,
positions
))
def
addentry
(
self
,
word
,
key
,
data
):
index
=
self
.
_index_object
try
:
rl
=
index
[
word
]
except
:
rl
=
{}
rl
=
(
key
,
)
+
data
index
[
word
]
=
rl
return
if
(
type
(
rl
)
is
TupleType
):
rl
=
{
rl
[
0
]
:
rl
[
1
:]
}
rl
[
key
]
=
data
def
__getitem__
(
self
,
key
):
'''
\
Get the ResultList objects for the inverted key, key.
...
...
@@ -586,6 +579,7 @@ class Index:
'''
index
=
self
.
_index_object
synstop
=
self
.
synstop
List
=
self
.
list_class
if
(
type
(
key
)
==
RegexType
):
...
...
@@ -612,16 +606,19 @@ class Index:
key
=
lower
(
key
)
while
(
type
(
key
)
==
StringType
):
while
(
1
):
try
:
key
=
index
[
key
]
key
=
synstop
[
key
]
except
KeyError
:
return
List
()
break
if
(
key
is
None
):
return
List
()
return
List
(
key
)
try
:
return
index
[
key
]
except
KeyError
:
return
List
()
def
keys
(
self
):
...
...
@@ -648,37 +645,35 @@ class Index:
del
self
[
key
][
doc_key
]
except
KeyError
:
continue
else
:
s
=
regsub
.
gsub
(
'-[
\
t
]*
\
n
[
\
t
]*'
,
''
,
str
(
s
))
# de-hyphenate
s
=
filter
(
None
,
self
.
split_words
(
s
))
for
key
in
s
:
try
:
del
self
[
key
][
doc_key
]
except
KeyError
:
continue
# else:
# s = WordSequence(s)
# for key in s:
# try:
# del self[key][doc_key]
# except KeyError:
# continue
def
get_stopwords
(
self
):
index
=
self
.
_index_object
synstop
=
self
.
synstop
stopwords
=
[]
for
word
in
index
.
key
s
():
if
(
index
[
word
]
is
None
):
stopwords
.
append
(
word
)
for
key
,
val
in
synstop
.
item
s
():
if
(
value
is
None
):
stopwords
.
append
(
key
)
return
stopwords
def
get_synonyms
(
self
):
index
=
self
.
_index_object
synstop
=
self
.
synstop
synonyms
=
{}
for
word
in
index
.
key
s
():
if
(
type
(
index
[
word
])
==
StringType
):
synonyms
[
word
]
=
index
[
word
]
syns
=
[]
for
key
,
val
in
synstop
.
item
s
():
if
(
type
(
value
)
is
StringType
):
syns
.
append
(
key
)
return
synonym
s
return
syn
s
def
get_document_keys
(
self
):
...
...
@@ -695,6 +690,719 @@ class Index:
return
d
.
keys
()
def
highlight
(
self
,
text
,
positions
,
before
,
after
):
ws
=
WordSequence
(
text
,
self
.
synstop
)
positions
.
sort
()
positions
.
reverse
()
for
position
in
positions
:
start
,
end
=
ws
.
pos
(
position
)
text
=
text
[:
start
]
+
before
+
text
[
start
:
end
]
+
after
+
text
[
end
:]
return
text
#!/usr/local/bin/python
# $What$
__doc__
=
'''Simple Inverted Indexer
This module provides simple tools for creating and maintaining
inverted indexes. An inverted index indexes a collection of
objects on words in their textual representation.
Example usage:
d = {
'and' : None,
'or' : None,
'not' : None,
'running' : 'run',
}
doc = open('/usr/users/chris/doc.txt', 'r').read()
key = '/usr/users/chris/doc.txt'
# instantiate an Index object, passing it a dictionary
# containing stopwords and stems
i = InvertedIndex.Index(d)
# index the document, doc, with key, key.
i.index(doc, key)
# perform a test search
print i['blah']
$Id: InvertedIndex.py,v 1.34 1997/04/23 16:53:27 chris Exp $'''
# Copyright
#
# Copyright 1996 Digital Creations, L.C., 910 Princess Anne
# Street, Suite 300, Fredericksburg, Virginia 22401 U.S.A. All
# rights reserved. Copyright in this software is owned by DCLC,
# unless otherwise indicated. Permission to use, copy and
# distribute this software is hereby granted, provided that the
# above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear. Note that
# any product, process or technology described in this software
# may be the subject of other Intellectual Property rights
# reserved by Digital Creations, L.C. and are not licensed
# hereunder.
#
# Trademarks
#
# Digital Creations & DCLC, are trademarks of Digital Creations, L.C..
# All other trademarks are owned by their respective companies.
#
# No Warranty
#
# The software is provided "as is" without warranty of any kind,
# either express or implied, including, but not limited to, the
# implied warranties of merchantability, fitness for a particular
# purpose, or non-infringement. This software could include
# technical inaccuracies or typographical errors. Changes are
# periodically made to the software; these changes will be
# incorporated in new editions of the software. DCLC may make
# improvements and/or changes in this software at any time
# without notice.
#
# Limitation Of Liability
#
# In no event will DCLC be liable for direct, indirect, special,
# incidental, economic, cover, or consequential damages arising
# out of the use of or inability to use this software even if
# advised of the possibility of such damages. Some states do not
# allow the exclusion or limitation of implied warranties or
# limitation of liability for incidental or consequential
# damages, so the above limitation or exclusion may not apply to
# you.
#
#
# If you have questions regarding this software,
# contact:
#
# Jim Fulton, jim@digicool.com
#
# (540) 371-6909
#
# $Log: InvertedIndex.py,v $
# Revision 1.34 1997/04/23 16:53:27 chris
# *** empty log message ***
#
# Revision 1.32 1997/04/22 15:18:01 jim
# Cris' changes.
#
# Revision 1.31 1997/04/18 18:32:46 chris
# *** empty log message ***
#
# Revision 1.30 1997/04/14 12:03:17 jim
# Fixed bug in proximity searches.
#
# Revision 1.29 1997/04/08 00:14:22 jim
# Chris' changes, I think....
#
# Revision 1.28 1997/03/31 23:17:53 jim
# I put back the list_class hook.
#
# Revision 1.27 1997/03/28 17:11:38 chris
# *** empty log message ***
#
# Revision 1.26 1997/03/28 16:54:57 chris
# *** empty log message ***
#
# Revision 1.25 1997/03/28 16:53:50 chris
# indexed data now stored as dictionaries rather than ResultLists.
# indexing documents with few than two keywords fails silently rather
# than raising an exception.
#
# Revision 1.24 1997/03/24 20:22:27 chris
# *** empty log message ***
#
# Revision 1.23 1997/03/22 13:32:23 jim
# Rearranged index method to update result lists in a separate
# overridable method. This is needed to implement a clear method
# in a subclass that allows an inverted index to be "cleared" without
# actually updating data records.
#
# Made some slight optimizations.
#
# Revision 1.22 1997/03/22 13:02:17 jim
# Finish fixing bug in __or__ that Chris has started to fix.
#
# Revision 1.21 1997/03/20 21:51:01 jim
# Rearranged and, or, and near.
# Got rid of get/setstate.
# Made result-list-specific methods use mapping prootcol to make it
# easier to mix with other mapping types.
#
# Revision 1.20 1997/03/05 19:28:18 chris
# fixed typo
#
# Revision 1.19 1997/03/05 19:25:52 chris
# removed references to SingleThreadedTransaction
#
# Revision 1.18 1997/03/05 19:21:36 chris
# removed PersistentResultList, placing it in its own module
#
# Revision 1.17 1997/02/24 16:29:01 chris
# *** empty log message ***
#
# Revision 1.16 1997/02/21 19:37:01 cici
# *** empty log message ***
#
# Revision 1.15 1997/02/19 17:05:09 chris
# *** empty log message ***
#
# Revision 1.14 1997/02/19 16:37:39 chris
# Removed Transactional and Persistent classes
#
# Revision 1.13 1997/02/13 17:28:32 chris
# *** empty log message ***
#
# Revision 1.12 1997/02/12 18:35:21 cici
# added apply() to Transactional and Persistent addentry() methods.
#
# Revision 1.11 1997/02/12 18:11:54 cici
# *** empty log message ***
#
# Revision 1.10 1997/01/29 16:48:40 chris
# added list_class argument to Index __init__
#
# Revision 1.9 1996/12/23 21:54:10 chris
# Checked out by Chris for testing/editing.
#
# Revision 1.8 1996/12/13 13:53:11 jim
# Checked in so I could edit.
#
# Revision 1.7 1996/12/10 21:17:57 chris
# Experimenting....
#
# Revision 1.6 1996/12/09 15:50:15 jim
# Checked in so jim can hack.
#
# Revision 1.5 1996/12/03 18:15:07 chris
# Updated doc strings
#
# Revision 1.4 1996/12/03 18:11:57 chris
# Went back to returning empty ResultLists for failed searches.
#
# Revision 1.3 1996/12/03 17:44:21 chris
# Added pack() methods to Persistent and Transactional.
# Disabled autosave on Persistent.
# Failed searches now raise a KeyError rather than returning an
# empty ResultList.
#
# Revision 1.2 1996/11/18 18:50:16 chris
# Added doc strings
#
# Revision 1.1 1996/11/15 17:41:37 chris
# Initial version
#
#
#
__version__
=
'$Revision: 1.34 $'
[
11
:
-
2
]
import
regex
,
string
,
copy
from
string
import
lower
from
WordSequence
import
WordSequence
from
types
import
*
class
ResultList
:
'''
\
This object holds the information for a word in an inverted index. It
provides mapping behavior, mapping document keys to corresponding
document information, including the frequency value.
Union of two ResultList objects may be performed with the | operator.
Intersection of two ResultList objects may be performed with the & operator.
Other methods:
Not()
near()
keys()
items()
sorted_items()
'''
def
__init__
(
self
,
d
=
None
):
if
(
d
is
None
):
self
.
_dict
=
{}
elif
(
type
(
d
)
is
TupleType
):
self
.
_dict
=
{
d
[
0
]
:
d
[
1
:]
}
else
:
self
.
_dict
=
d
def
addentry
(
self
,
document_key
,
*
info
):
'''
\
addentry(document_key, *info)
add a document and related information to this ResultList'''
self
[
document_key
]
=
info
def
__str__
(
self
):
return
`self._dict`
def
__len__
(
self
):
return
len
(
self
.
_dict
)
def
__getitem__
(
self
,
key
):
return
self
.
_dict
[
key
]
def
__setitem__
(
self
,
key
,
v
):
self
.
_dict
[
key
]
=
v
# Note self.__changed__(1) via the
def
__delitem__
(
self
,
key
):
del
self
.
_dict
[
key
]
def
keys
(
self
):
'''
\
keys()
get the documents in this ResultList'''
return
self
.
_dict
.
keys
()
def
has_key
(
self
,
key
):
return
self
.
_dict
.
has_key
(
key
)
def
items
(
self
):
'''items()
get a list of document key/document information pairs'''
return
self
.
_dict
.
items
()
def
sorted_items
(
self
):
'''sorted_items()
get a
Sort the frequency/key pairs in the ResultList by highest to lowest
frequency'''
items
=
self
.
_dict
.
items
()
items
.
sort
(
lambda
x
,
y
:
-
cmp
(
x
[
1
][
0
],
y
[
1
][
0
]))
return
items
def
__and__
(
self
,
x
):
'''Allows intersection of two ResultList objects using the & operator.
When ResultLists are combined in this way, frequencies are combined
by calculating the geometric mean of each pair of corresponding
frequencies.'''
result
=
self
.
__class__
()
for
key
,
v
in
self
.
items
():
try
:
xv
=
x
[
key
]
v
=
pow
(
v
[
0
]
*
xv
[
0
],
0.5
),
v
[
1
]
+
xv
[
1
]
result
[
key
]
=
v
except
KeyError
:
pass
return
result
def
and_not
(
self
,
x
):
'''Return items in the receiver that are not in the argument'''
result
=
self
.
__class__
()
for
key
,
v
in
self
.
items
():
try
:
x
[
key
]
except
KeyError
:
result
[
key
]
=
v
return
result
def
__or__
(
self
,
x
):
'''Allows union of two ResultList objects using the | operator.
When ResultLists are combined in this way, frequencies are
combined by calculating the sum of each pair of corresponding
frequencies.'''
result
=
self
.
__class__
()
for
key
,
v
in
self
.
items
():
try
:
xv
=
x
[
key
]
v
=
v
[
0
]
+
xv
[
0
],
v
[
1
]
+
xv
[
1
]
except
:
pass
result
[
key
]
=
v
for
key
,
v
in
x
.
items
():
try
:
self
[
key
]
except
:
result
[
key
]
=
v
return
result
def
Not
(
self
,
index
):
'''
\
Not(index)
Perform a "not" operation on a ResultList object.
Not() returns the union of all ResultLists in the index that do
not contain a link to a document that is found in "self".
This method should be passed the Index object that returned the
ResultList instance.'''
index
=
index
.
_index_object
res
=
None
for
key
in
index
.
keys
():
try
:
keys
=
index
[
key
].
keys
()
except
KeyError
:
continue
index_val
=
index
[
key
]
for
key
in
keys
:
if
(
not
self
.
has_key
(
key
)):
if
(
res
):
res
=
res
|
{
key
:
index_val
[
key
]
}
else
:
res
=
self
.
__class__
({
key
:
index_val
[
key
]
})
if
(
res
):
return
res
return
self
.
__class__
()
def
near
(
self
,
x
,
distance
=
1
):
'''
\
near(rl, distance = 1)
Returns a ResultList containing documents which contain'''
result
=
self
.
__class__
()
for
key
,
v
in
self
.
items
():
try
:
value
=
x
[
key
]
except
KeyError
:
value
=
None
if
value
is
None
:
continue
score
=
pow
(
v
[
0
]
*
value
[
0
],
0.5
)
positions
=
v
[
1
]
+
value
[
1
]
positions
.
sort
()
positionsr
=
[]
rel
=
pow
(
v
[
0
]
*
value
[
0
],
0.5
)
pl
=
positions
[
0
]
rl
=
-
1
for
i
in
range
(
1
,
len
(
positions
)):
p
=
positions
[
i
]
d
=
p
-
pl
if
d
>
0
and
d
<=
distance
:
if
pl
!=
rl
:
positionsr
.
append
(
pl
)
positionsr
.
append
(
p
)
rl
=
p
pl
=
p
if
(
len
(
positionsr
)):
result
[
key
]
=
score
,
positionsr
return
result
def
__getstate__
(
self
):
return
self
.
_dict
def
__setstate__
(
self
,
state
):
self
.
_dict
=
state
RegexType
=
type
(
regex
.
compile
(
''
))
IndexingError
=
'InvertedIndex.IndexingError'
_default_stop_words
=
[
'about'
,
'all'
,
'also'
,
'an'
,
'and'
,
'any'
,
'are'
,
'as'
,
'at'
,
'be'
,
'because'
,
'been'
,
'being'
,
'but'
,
'by'
,
'can'
,
'cannot'
,
'did'
,
'do'
,
'doing'
,
'each'
,
'either'
,
'else'
,
'even'
,
'for'
,
'from'
,
'get'
,
'got'
,
'had'
,
'has'
,
'have'
,
'he'
,
'her'
,
'hers'
,
'herself'
,
'him'
,
'himself'
,
'his'
,
'how'
,
'if'
,
'in'
,
'into'
,
'is'
,
'it'
,
'its'
,
'me'
,
'my'
,
'myself'
,
'no'
,
'not'
,
'of'
,
'on'
,
'one'
,
'only'
,
'onto'
,
'or'
,
'our'
,
'ourselves'
,
'she'
,
'since'
,
'so'
,
'some'
,
'take'
,
'than'
,
'that'
,
'the'
,
'their'
,
'them'
,
'themselves'
,
'then'
,
'there'
,
'these'
,
'they'
,
'this'
,
'those'
,
'through'
,
'to'
,
'too'
,
'unless'
,
'until'
,
'upon'
,
'us'
,
'very'
,
'was'
,
'we'
,
'were'
,
'what'
,
'when'
,
'where'
,
'which'
,
'while'
,
'who'
,
'whoever'
,
'whom'
,
'whomever'
,
'whose'
,
'why'
,
'will'
,
'with'
,
'without'
,
'would'
,
'yes'
,
'you'
,
'your'
,
'yours'
,
'yourself'
,
'yourselves'
,
]
default_stop_words
=
{}
for
w
in
_default_stop_words
:
default_stop_words
[
w
]
=
None
for
w
in
string
.
letters
:
default_stop_words
[
w
]
=
None
class
Index
:
'''
\
An inverted index.
This class handles indexing and searching.
An optional argument may be provided when instantiating
an Index object. This argument should be a dictionary
specifying stems, synonyms, and stopwords. The dictionary
may also be used to initialize the index with previously
indexed values. Within the dictionary, stopwords should
be keywords (string values) mapped to the Python value None;
stems and synonyms should be keywords mapped to their
corresponding keywords, and previously indexed values should
map a keyword to a ResultList object.
Indexing is performed using the index() method.
Searching is performed using the Index object
\
'
s mapping
behaviour.
Example usage:
d = {
'and' : None, # Stopword
'or' : None, # Stopword
'not' : None, # Stopword
'running' : 'run', # Stem
}
doc = open('/usr/users/chris/doc.txt', 'r').read()
key = '/usr/users/chris/doc.txt'
# instantiate an Index object, passing it a dictionary
# containing stopwords and stems
i = InvertedIndex.Index(d)
# index the document, doc, with key, key.
i.index(doc, key)
# perform a test search
print i['blah']
'''
list_class
=
ResultList
def
__init__
(
self
,
index_dictionary
=
None
,
synstop
=
None
):
'Create an inverted index'
if
(
synstop
is
None
):
synstop
=
copy
.
copy
(
default_stop_words
)
self
.
synstop
=
synstop
self
.
set_index
(
index_dictionary
)
def
set_index
(
self
,
index_dictionary
=
None
):
'Change the index dictionary for the index.'
if
(
index_dictionary
is
None
):
index_dictionary
=
{}
self
.
_index_object
=
index_dictionary
def
index
(
self
,
src
,
srckey
):
'''
\
index(src, srckey)
Update the index by indexing the words in src to the key, srckey
The source object, src, will be converted to a string and the
words in the string will be used as indexes to retrieve the objects
key, srckey. For simple objects, the srckey may be the object itself,
or it may be a key into some other data structure, such as a table.
'''
src
=
WordSequence
(
src
,
self
.
synstop
)
d
=
{}
i
=
-
1
for
s
in
src
:
i
=
i
+
1
try
:
d
[
s
].
append
(
i
)
except
KeyError
:
d
[
s
]
=
[
i
]
if
(
i
<
1
):
return
import
math
nwords
=
math
.
log
(
i
+
1
)
addentry
=
self
.
addentry
for
word
,
positions
in
d
.
items
():
freq
=
int
(
100
*
(
len
(
positions
)
/
nwords
))
addentry
(
word
,
srckey
,(
freq
,
positions
))
def
addentry
(
self
,
word
,
key
,
data
):
index
=
self
.
_index_object
try
:
rl
=
index
[
word
]
except
:
rl
=
(
key
,
)
+
data
index
[
word
]
=
rl
return
if
(
type
(
rl
)
is
TupleType
):
rl
=
{
rl
[
0
]
:
rl
[
1
:]
}
rl
[
key
]
=
data
def
__getitem__
(
self
,
key
):
'''
\
Get the ResultList objects for the inverted key, key.
The key may be a regular expression, in which case a regular
expression match is done.
The key may be a string, in which case an case-insensitive
match is done.
'''
index
=
self
.
_index_object
synstop
=
self
.
synstop
List
=
self
.
list_class
if
(
type
(
key
)
==
RegexType
):
dict
=
{}
for
k
in
index
.
keys
():
if
(
key
.
search
(
k
)
>=
0
):
try
:
while
(
type
(
index
[
k
])
==
StringType
):
k
=
index
[
k
]
except
KeyError
:
continue
if
(
index
[
k
]
is
None
):
continue
dict
[
index
[
k
]]
=
1
Lists
=
dict
.
keys
()
if
(
not
len
(
Lists
)):
return
List
()
return
reduce
(
lambda
x
,
y
:
x
|
y
,
Lists
)
key
=
lower
(
key
)
while
(
1
):
try
:
key
=
synstop
[
key
]
except
KeyError
:
break
if
(
key
is
None
):
return
List
()
try
:
return
index
[
key
]
except
KeyError
:
return
List
()
def
keys
(
self
):
return
self
.
_index_object
.
keys
()
def
__len__
(
self
):
return
len
(
self
.
_index_object
)
def
remove_document
(
self
,
doc_key
,
s
=
None
):
'''
\
remove_document(doc_key, s = None)
Remove a specified document from the index, given the document key.
Optionally, the document source may be provided. This helps to
speed up removal of documents from a large index.
'''
if
(
s
is
None
):
for
key
in
self
.
keys
():
try
:
del
self
[
key
][
doc_key
]
except
KeyError
:
continue
# else:
# s = WordSequence(s)
# for key in s:
# try:
# del self[key][doc_key]
# except KeyError:
# continue
def
get_stopwords
(
self
):
synstop
=
self
.
synstop
stopwords
=
[]
for
key
,
val
in
synstop
.
items
():
if
(
value
is
None
):
stopwords
.
append
(
key
)
return
stopwords
def
get_synonyms
(
self
):
synstop
=
self
.
synstop
syns
=
[]
for
key
,
val
in
synstop
.
items
():
if
(
type
(
value
)
is
StringType
):
syns
.
append
(
key
)
return
syns
def
get_document_keys
(
self
):
d
=
{}
for
key
in
self
.
keys
():
try
:
doc_keys
=
self
[
key
].
keys
()
except
:
continue
for
doc_key
in
doc_keys
:
d
[
doc_key
]
=
1
return
d
.
keys
()
def
highlight
(
self
,
text
,
positions
,
before
,
after
):
ws
=
WordSequence
(
text
,
self
.
synstop
)
positions
.
sort
()
positions
.
reverse
()
for
position
in
positions
:
start
,
end
=
ws
.
pos
(
position
)
text
=
text
[:
start
]
+
before
+
text
[
start
:
end
]
+
after
+
text
[
end
:]
return
text
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment