Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
4082910c
Commit 4082910c authored Apr 22, 1997 by Jim Fulton
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
1.30 resurrected.
parent
2470ea4c
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing 1 changed file with 74 additions and 77 deletions
+74
-77
lib/python/SearchIndex/InvertedIndex.py
lib/python/SearchIndex/InvertedIndex.py
+74
-77
No files found.
lib/python/SearchIndex/InvertedIndex.py
View file @
4082910c
...
@@ -30,7 +30,7 @@ Example usage:
...
@@ -30,7 +30,7 @@ Example usage:
print i['blah']
print i['blah']
$Id: InvertedIndex.py,v 1.32 1997/04/22 15:18:01 jim Exp $'''
$Id: InvertedIndex.py,v 1.33 1997/04/22 15:19:04 jim Exp $'''
# Copyright
# Copyright
#
#
# Copyright 1996 Digital Creations, L.C., 910 Princess Anne
# Copyright 1996 Digital Creations, L.C., 910 Princess Anne
...
@@ -82,11 +82,8 @@ $Id: InvertedIndex.py,v 1.32 1997/04/22 15:18:01 jim Exp $'''
...
@@ -82,11 +82,8 @@ $Id: InvertedIndex.py,v 1.32 1997/04/22 15:18:01 jim Exp $'''
# (540) 371-6909
# (540) 371-6909
#
#
# $Log: InvertedIndex.py,v $
# $Log: InvertedIndex.py,v $
# Revision 1.32 1997/04/22 15:18:01 jim
# Revision 1.33 1997/04/22 15:19:04 jim
# Chris' changes.
# 1.30 resurrected.
#
# Revision 1.31 1997/04/18 18:32:46 chris
# *** empty log message ***
#
#
# Revision 1.30 1997/04/14 12:03:17 jim
# Revision 1.30 1997/04/14 12:03:17 jim
# Fixed bug in proximity searches.
# Fixed bug in proximity searches.
...
@@ -193,12 +190,12 @@ $Id: InvertedIndex.py,v 1.32 1997/04/22 15:18:01 jim Exp $'''
...
@@ -193,12 +190,12 @@ $Id: InvertedIndex.py,v 1.32 1997/04/22 15:18:01 jim Exp $'''
#
#
#
#
#
#
__version__
=
'$Revision: 1.3
2
$'
[
11
:
-
2
]
__version__
=
'$Revision: 1.3
3
$'
[
11
:
-
2
]
import
regex
,
string
,
copy
import
regex
,
regsub
,
string
,
copy
from
string
import
lower
from
string
import
lower
from
WordSequence
import
WordSequence
from
types
import
*
from
types
import
*
class
ResultList
:
class
ResultList
:
...
@@ -221,12 +218,7 @@ class ResultList:
...
@@ -221,12 +218,7 @@ class ResultList:
'''
'''
def
__init__
(
self
,
d
=
None
):
def
__init__
(
self
,
d
=
None
):
if
(
d
is
None
):
self
.
_dict
=
d
or
{}
self
.
_dict
=
{}
elif
(
type
(
d
)
is
TupleType
):
self
.
_dict
=
{
d
[
0
]
:
d
[
1
:]
}
else
:
self
.
_dict
=
d
def
addentry
(
self
,
document_key
,
*
info
):
def
addentry
(
self
,
document_key
,
*
info
):
...
@@ -495,12 +487,10 @@ class Index:
...
@@ -495,12 +487,10 @@ class Index:
list_class
=
ResultList
list_class
=
ResultList
def
__init__
(
self
,
index_dictionary
=
None
,
synstop
=
None
):
def
__init__
(
self
,
index_dictionary
=
None
):
'Create an inverted index'
'Create an inverted index'
if
(
synstop
is
None
):
if
(
index_dictionary
is
None
):
synstop
=
copy
.
copy
(
default_stop_words
)
index_dictionary
=
copy
.
copy
(
default_stop_words
)
self
.
synstop
=
synstop
self
.
set_index
(
index_dictionary
)
self
.
set_index
(
index_dictionary
)
...
@@ -514,6 +504,11 @@ class Index:
...
@@ -514,6 +504,11 @@ class Index:
self
.
_index_object
=
index_dictionary
self
.
_index_object
=
index_dictionary
def
split_words
(
self
,
s
):
'split a string into separate words'
return
regsub
.
split
(
s
,
'[^a-zA-Z]+'
)
def
index
(
self
,
src
,
srckey
):
def
index
(
self
,
src
,
srckey
):
'''
\
'''
\
index(src, srckey)
index(src, srckey)
...
@@ -525,45 +520,60 @@ class Index:
...
@@ -525,45 +520,60 @@ class Index:
key, srckey. For simple objects, the srckey may be the object itself,
key, srckey. For simple objects, the srckey may be the object itself,
or it may be a key into some other data structure, such as a table.
or it may be a key into some other data structure, such as a table.
'''
'''
src
=
WordSequence
(
src
,
self
.
synstop
)
import
math
index
=
self
.
_index_object
src
=
regsub
.
gsub
(
'-[
\
t
]*
\
n
[
\
t
]*'
,
''
,
str
(
src
))
# de-hyphenate
src
=
map
(
lower
,
filter
(
None
,
self
.
split_words
(
src
)))
if
(
len
(
src
)
<
2
):
return
nwords
=
math
.
log
(
len
(
src
))
d
=
{}
d
=
{}
i
=
-
1
i
=
-
1
for
s
in
src
:
for
s
in
src
:
i
=
i
+
1
i
=
i
+
1
stopword_flag
=
0
while
(
not
stopword_flag
):
try
:
try
:
d
[
s
].
append
(
i
)
index_val
=
index
[
s
]
except
KeyError
:
except
KeyError
:
d
[
s
]
=
[
i
]
break
if
(
i
<
1
):
if
(
index_val
is
None
):
return
stopword_flag
=
1
elif
(
type
(
index_val
)
!=
StringType
):
break
else
:
s
=
index_val
else
:
# s is a stopword
continue
import
math
try
:
nwords
=
math
.
log
(
i
+
1
)
d
[
s
].
append
(
i
)
except
KeyError
:
d
[
s
]
=
[
i
]
addentry
=
self
.
addentry
addentry
=
self
.
addentry
for
word
,
positions
in
d
.
items
():
for
word
,
positions
in
d
.
items
():
freq
=
int
(
100
*
(
len
(
positions
)
/
nwords
))
freq
=
int
(
100
00
*
(
len
(
positions
)
/
nwords
))
addentry
(
word
,
srckey
,(
freq
,
positions
))
addentry
(
word
,
srckey
,(
freq
,
positions
))
def
addentry
(
self
,
word
,
key
,
data
):
def
addentry
(
self
,
word
,
key
,
data
):
index
=
self
.
_index_object
index
=
self
.
_index_object
try
:
try
:
rl
=
index
[
word
]
rl
=
index
[
word
]
except
:
except
:
rl
=
(
key
,
)
+
data
rl
=
{}
index
[
word
]
=
rl
index
[
word
]
=
rl
return
if
(
type
(
rl
)
is
TupleType
):
rl
=
{
rl
[
0
]
:
rl
[
1
:]
}
rl
[
key
]
=
data
rl
[
key
]
=
data
def
__getitem__
(
self
,
key
):
def
__getitem__
(
self
,
key
):
'''
\
'''
\
Get the ResultList objects for the inverted key, key.
Get the ResultList objects for the inverted key, key.
...
@@ -576,7 +586,6 @@ class Index:
...
@@ -576,7 +586,6 @@ class Index:
'''
'''
index
=
self
.
_index_object
index
=
self
.
_index_object
synstop
=
self
.
synstop
List
=
self
.
list_class
List
=
self
.
list_class
if
(
type
(
key
)
==
RegexType
):
if
(
type
(
key
)
==
RegexType
):
...
@@ -603,19 +612,16 @@ class Index:
...
@@ -603,19 +612,16 @@ class Index:
key
=
lower
(
key
)
key
=
lower
(
key
)
while
(
1
):
while
(
type
(
key
)
==
StringType
):
try
:
try
:
key
=
synstop
[
key
]
key
=
index
[
key
]
except
KeyError
:
except
KeyError
:
break
return
List
()
if
(
key
is
None
):
if
(
key
is
None
):
return
List
()
return
List
()
try
:
return
List
(
key
)
return
index
[
key
]
except
KeyError
:
return
List
()
def
keys
(
self
):
def
keys
(
self
):
...
@@ -642,35 +648,37 @@ class Index:
...
@@ -642,35 +648,37 @@ class Index:
del
self
[
key
][
doc_key
]
del
self
[
key
][
doc_key
]
except
KeyError
:
except
KeyError
:
continue
continue
# else:
else
:
# s = WordSequence(s)
s
=
regsub
.
gsub
(
'-[
\
t
]*
\
n
[
\
t
]*'
,
''
,
str
(
s
))
# de-hyphenate
# for key in s:
s
=
filter
(
None
,
self
.
split_words
(
s
))
# try:
# del self[key][doc_key]
for
key
in
s
:
# except KeyError:
try
:
# continue
del
self
[
key
][
doc_key
]
except
KeyError
:
continue
def
get_stopwords
(
self
):
def
get_stopwords
(
self
):
synstop
=
self
.
synstop
index
=
self
.
_index_object
stopwords
=
[]
stopwords
=
[]
for
key
,
val
in
synstop
.
item
s
():
for
word
in
index
.
key
s
():
if
(
value
is
None
):
if
(
index
[
word
]
is
None
):
stopwords
.
append
(
key
)
stopwords
.
append
(
word
)
return
stopwords
return
stopwords
def
get_synonyms
(
self
):
def
get_synonyms
(
self
):
synstop
=
self
.
synstop
index
=
self
.
_index_object
syns
=
[]
synonyms
=
{}
for
key
,
val
in
synstop
.
item
s
():
for
word
in
index
.
key
s
():
if
(
type
(
value
)
is
StringType
):
if
(
type
(
index
[
word
])
==
StringType
):
syns
.
append
(
key
)
synonyms
[
word
]
=
index
[
word
]
return
syn
s
return
synonym
s
def
get_document_keys
(
self
):
def
get_document_keys
(
self
):
...
@@ -687,17 +695,6 @@ class Index:
...
@@ -687,17 +695,6 @@ class Index:
return
d
.
keys
()
return
d
.
keys
()
def
highlight
(
self
,
text
,
positions
,
before
,
after
):
ws
=
WordSequence
(
text
,
self
.
synstop
)
positions
.
sort
()
positions
.
reverse
()
for
position
in
positions
:
start
,
end
=
ws
.
pos
(
position
)
text
=
text
[:
start
]
+
before
+
text
[
start
:
end
]
+
after
+
text
[
end
:]
return
text
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment