Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
41eb6fe8
Commit
41eb6fe8
authored
Jan 26, 2001
by
Christopher Petrilli
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Merge of Lexicon cleanup and text index merging.
parent
78f3476f
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
261 additions
and
174 deletions
+261
-174
lib/python/SearchIndex/GlobbingLexicon.py
lib/python/SearchIndex/GlobbingLexicon.py
+85
-56
lib/python/SearchIndex/Lexicon.py
lib/python/SearchIndex/Lexicon.py
+19
-14
lib/python/SearchIndex/UnIndex.py
lib/python/SearchIndex/UnIndex.py
+9
-7
lib/python/SearchIndex/UnKeywordIndex.py
lib/python/SearchIndex/UnKeywordIndex.py
+2
-2
lib/python/SearchIndex/UnTextIndex.py
lib/python/SearchIndex/UnTextIndex.py
+146
-95
No files found.
lib/python/SearchIndex/GlobbingLexicon.py
View file @
41eb6fe8
...
@@ -81,82 +81,108 @@
...
@@ -81,82 +81,108 @@
# many individuals on behalf of Digital Creations. Specific
# many individuals on behalf of Digital Creations. Specific
# attributions are listed in the accompanying credits file.
# attributions are listed in the accompanying credits file.
#
#
##############################################################################
#############################################################################
__doc__
=
""" Lexicon object that supports
"""
from
Lexicon
import
Lexicon
from
Lexicon
import
Lexicon
from
Splitter
import
Splitter
from
Splitter
import
Splitter
from
intSet
import
intSet
from
intSet
import
intSet
from
UnTextIndex
import
Or
from
UnTextIndex
import
Or
import
re
,
time
import
re
,
string
import
OIBTree
,
BTree
,
IOBTree
,
IIBTree
import
OIBTree
,
BTree
,
IOBTree
,
IIBTree
# Short cuts for common data containers
OIBTree
=
OIBTree
.
BTree
# Object -> Integer
OIBTree
=
OIBTree
.
BTree
# Object -> Integer
OOBTree
=
BTree
.
BTree
# Object -> Object
OOBTree
=
BTree
.
BTree
# Object -> Object
IOBTree
=
IOBTree
.
BTree
# Integer -> Object
IOBTree
=
IOBTree
.
BTree
# Integer -> Object
IIBucket
=
IIBTree
.
Bucket
# Integer -> Integer
IIBucket
=
IIBTree
.
Bucket
# Integer -> Integer
import
pdb
class
GlobbingLexicon
(
Lexicon
):
class
GlobbingLexicon
(
Lexicon
):
"""
"""Lexicon which supports basic globbing function ('*' and '?').
This lexicon keeps several data structures around that are useful
for searching. They are:
'_lexicon' -- Contains the mapping from word => word_id
'_inverseLex' -- Contains the mapping from word_id => word
'_digrams' -- Contains a mapping from digram => word_id
Base class to support globbing lexicon object.
Before going further, it is necessary to understand what a digram is,
as it is a core component of the structure of this lexicon. A digram
is a two-letter sequence in a word. For example, the word 'zope'
would be converted into the digrams::
['$z', 'zo', 'op', 'pe', 'e$']
where the '$' is a word marker. It is used at the beginning and end
of the words. Those digrams are significant.
"""
"""
multi_wc
=
'*'
multi_wc
=
'*'
single_wc
=
'?'
single_wc
=
'?'
eow
=
'$'
eow
=
'$'
def
__init__
(
self
):
self
.
counter
=
0
def
__init__
(
self
):
self
.
counter
=
0
# word id counter XXX
self
.
_lexicon
=
OIBTree
()
self
.
_lexicon
=
OIBTree
()
self
.
_inverseLex
=
IOBTree
()
self
.
_inverseLex
=
IOBTree
()
self
.
_digrams
=
OOBTree
()
self
.
_digrams
=
OOBTree
()
def
set
(
self
,
word
):
""" """
def
createDigrams
(
self
,
word
):
"""Returns a list with the set of digrams in the word."""
digrams
=
[]
digrams
.
append
(
self
.
eow
+
word
[
0
])
# Mark the beginning
for
i
in
range
(
len
(
word
)):
digrams
.
append
(
word
[
i
:
i
+
2
])
digrams
[
-
1
]
=
digrams
[
-
1
]
+
self
.
eow
# Mark the end
return
digrams
def
getWordId
(
self
,
word
):
"""Provided 'word', return the matching integer word id."""
if
self
.
_lexicon
.
has_key
(
word
):
if
self
.
_lexicon
.
has_key
(
word
):
return
self
.
_lexicon
[
word
]
return
self
.
_lexicon
[
word
]
else
:
else
:
word
=
intern
(
word
)
return
self
.
assignWordId
(
word
)
self
.
_lexicon
[
word
]
=
self
.
counter
self
.
_inverseLex
[
self
.
counter
]
=
word
## now, split the word into digrams and insert references
set
=
getWordId
# Kludge for old code
## to 'word' into the digram object. The first and last
## digrams in the list are specially marked with $ to
## indicate the beginning and end of the word
digrams
=
[]
digrams
.
append
(
self
.
eow
+
word
[
0
])
# mark the beginning
for
i
in
range
(
len
(
word
)
):
def
assignWordId
(
self
,
word
):
digrams
.
append
(
word
[
i
:
i
+
2
])
"""Assigns a new word id to the provided word, and return it."""
digrams
[
-
1
]
=
digrams
[
-
1
]
+
self
.
eow
# mark the end
# Double check it's not in the lexicon already, and if it is, just
# return it.
_digrams
=
self
.
_digrams
if
self
.
_lexicon
.
has_key
(
word
):
return
self
.
_lexicon
[
word
]
for
digram
in
digrams
:
set
=
_digrams
.
get
(
digram
)
# First we go ahead and put the forward and reverse maps in.
if
set
is
None
:
self
.
_lexicon
[
word
]
=
self
.
counter
_digrams
[
digram
]
=
set
=
intSet
()
self
.
_inverseLex
[
self
.
counter
]
=
word
set
.
insert
(
self
.
counter
)
counter
=
self
.
counter
# Now take all the digrams and insert them into the digram map.
self
.
counter
=
self
.
counter
+
1
for
digram
in
self
.
createDigrams
(
word
):
return
counter
set
=
self
.
_digrams
.
get
(
digram
)
if
set
is
None
:
self
.
_digrams
[
digram
]
=
set
=
intSet
()
set
.
insert
(
self
.
counter
)
self
.
counter
=
self
.
counter
+
1
return
self
.
counter
-
1
# Adjust for the previous increment
def
get
(
self
,
pattern
):
def
get
(
self
,
pattern
):
""" Query the lexicon for words matching a pattern.
""" Query the lexicon for words matching a pattern."""
"""
wc_set
=
[
self
.
multi_wc
,
self
.
single_wc
]
wc_set
=
[
self
.
multi_wc
,
self
.
single_wc
]
digrams
=
[]
digrams
=
[]
...
@@ -199,22 +225,22 @@ class GlobbingLexicon(Lexicon):
...
@@ -199,22 +225,22 @@ class GlobbingLexicon(Lexicon):
## may contain all matching digrams, but in the wrong
## may contain all matching digrams, but in the wrong
## order.
## order.
expr
=
re
.
compile
(
self
.
translate
(
pattern
))
expr
=
re
.
compile
(
self
.
createRegex
(
pattern
))
words
=
[]
words
=
[]
hits
=
[]
hits
=
[]
for
x
in
result
.
keys
():
for
x
in
result
.
keys
():
if
expr
.
match
(
self
.
_inverseLex
[
x
]):
if
expr
.
match
(
self
.
_inverseLex
[
x
]):
hits
.
append
(
x
)
hits
.
append
(
x
)
return
hits
return
hits
def
__getitem__
(
self
,
word
):
def
__getitem__
(
self
,
word
):
""" """
""" """
return
self
.
get
(
word
)
return
self
.
get
(
word
)
def
query_hook
(
self
,
q
):
"""expand wildcards
"""
def
query_hook
(
self
,
q
):
"""expand wildcards"""
words
=
[]
words
=
[]
wids
=
[]
wids
=
[]
for
w
in
q
:
for
w
in
q
:
...
@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon):
...
@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon):
return
words
return
words
def
Splitter
(
self
,
astring
,
words
=
None
):
def
Splitter
(
self
,
astring
,
words
=
None
):
""" wrap the splitter """
""" wrap the splitter """
...
@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon):
...
@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon):
return
Splitter
(
astring
)
return
Splitter
(
astring
)
def
translate
(
self
,
pat
):
def
createRegex
(
self
,
pat
):
"""Translate a PATTERN to a regular expression.
"""Translate a PATTERN to a regular expression.
There is no way to quote meta-characters.
There is no way to quote meta-characters.
"""
"""
i
,
n
=
0
,
len
(
pat
)
transTable
=
string
.
maketrans
(
""
,
""
)
res
=
''
while
i
<
n
:
# First, deal with multi-character globbing
c
=
pat
[
i
]
result
=
string
.
replace
(
pat
,
'*'
,
'.*'
)
i
=
i
+
1
if
c
==
self
.
multi_wc
:
# Next, we need to deal with single-character globbing
res
=
res
+
'.*'
result
=
string
.
replace
(
result
,
'?'
,
'.?'
)
elif
c
==
self
.
single_wc
:
res
=
res
+
'.?'
# Now, we need to remove all of the characters that
else
:
# are forbidden.
res
=
res
+
re
.
escape
(
c
)
result
=
string
.
translate
(
result
,
transTable
,
return
res
+
'$'
r'()&|!@#$%^{}\
<>
')
return "%s$" % result
lib/python/SearchIndex/Lexicon.py
View file @
41eb6fe8
...
@@ -83,11 +83,6 @@
...
@@ -83,11 +83,6 @@
#
#
##############################################################################
##############################################################################
import
string
,
regex
,
ts_regex
import
regsub
__doc__
=
""" Module breaks out Zope specific methods and behavior. In
__doc__
=
""" Module breaks out Zope specific methods and behavior. In
addition, provides the Lexicon class which defines a word to integer
addition, provides the Lexicon class which defines a word to integer
mapping.
mapping.
...
@@ -137,23 +132,33 @@ class Lexicon(Persistent, Implicit):
...
@@ -137,23 +132,33 @@ class Lexicon(Persistent, Implicit):
self
.
stop_syn
=
stop_syn
self
.
stop_syn
=
stop_syn
def
set
(
self
,
word
):
def
getWordId
(
self
,
word
):
""" return the word id of 'word' """
""" return the word id of 'word' """
if
self
.
_lexicon
.
has_key
(
word
):
if
self
.
_lexicon
.
has_key
(
word
):
return
self
.
_lexicon
[
word
]
return
self
.
_lexicon
[
word
]
else
:
else
:
if
not
hasattr
(
self
,
'counter'
):
return
self
.
assignWordId
(
word
)
self
.
counter
=
0
self
.
_lexicon
[
intern
(
word
)]
=
self
.
counter
set
=
getWordId
self
.
counter
=
self
.
counter
+
1
return
self
.
counter
-
1
def
assignWordId
(
self
,
word
):
"""Assigns a new word id to the provided word and returns it."""
# First make sure it's not already in there
if
self
.
_lexicon
.
has_key
(
word
):
return
self
.
_lexicon
[
word
]
if
not
hasattr
(
self
,
'counter'
):
self
.
counter
=
0
self
.
_lexicon
[
intern
(
word
)]
=
self
.
counter
self
.
counter
=
self
.
counter
+
1
return
self
.
counter
-
1
def
get
(
self
,
key
,
default
=
None
):
def
get
(
self
,
key
,
default
=
None
):
"""
"""
"""
Return the matched word against the key.
"""
return
[
self
.
_lexicon
.
get
(
key
,
default
)]
return
[
self
.
_lexicon
.
get
WordId
(
key
,
default
)]
def
__getitem__
(
self
,
key
):
def
__getitem__
(
self
,
key
):
...
...
lib/python/SearchIndex/UnIndex.py
View file @
41eb6fe8
...
@@ -85,7 +85,7 @@
...
@@ -85,7 +85,7 @@
"""Simple column indices"""
"""Simple column indices"""
__version__
=
'$Revision: 1.2
3
$'
[
11
:
-
2
]
__version__
=
'$Revision: 1.2
4
$'
[
11
:
-
2
]
...
@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit):
...
@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit):
(
'unindex_object could not remove '
(
'unindex_object could not remove '
'integer id %s from index %s. This '
'integer id %s from index %s. This '
'should not happen.'
'should not happen.'
%
(
str
(
i
),
str
(
k
))))
%
(
str
(
documentId
),
str
(
self
.
id
))))
else
:
else
:
LOG
(
self
.
__class__
.
__name__
,
ERROR
,
LOG
(
self
.
__class__
.
__name__
,
ERROR
,
(
'unindex_object tried to retrieve set %s '
(
'unindex_object tried to retrieve set %s '
'from index %s but couldn
\
'
t. This '
'from index %s but couldn
\
'
t. This '
'should not happen.'
%
(
repr
(
set
),
str
(
k
))))
'should not happen.'
%
(
repr
(
entry
),
str
(
self
.
id
))))
def
insertForwardIndexEntry
(
self
,
entry
,
documentId
):
def
insertForwardIndexEntry
(
self
,
entry
,
documentId
):
...
@@ -212,7 +212,7 @@ class UnIndex(Persistent, Implicit):
...
@@ -212,7 +212,7 @@ class UnIndex(Persistent, Implicit):
This will also deal with creating the entire row if necessary."""
This will also deal with creating the entire row if necessary."""
indexRow
=
self
.
_index
.
get
(
entry
,
MV
)
indexRow
=
self
.
_index
.
get
(
entry
,
MV
)
# Make sure there's actually a row there already. If not, create
# Make sure there's actually a row there already. If not, create
# an IntSet and stuff it in first.
# an IntSet and stuff it in first.
if
indexRow
is
MV
:
if
indexRow
is
MV
:
...
@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit):
...
@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit):
datum
=
getattr
(
obj
,
self
.
id
)
datum
=
getattr
(
obj
,
self
.
id
)
if
callable
(
datum
):
if
callable
(
datum
):
datum
=
datum
()
datum
=
datum
()
except
:
except
AttributeError
:
datum
=
MV
datum
=
MV
# We don't want to do anything that we don't have to here, so we'll
# We don't want to do anything that we don't have to here, so we'll
# check to see if the new and existing information is the same.
# check to see if the new and existing information is the same.
if
not
(
datum
==
self
.
_unindex
.
get
(
documentId
,
MV
)):
oldDatum
=
self
.
_unindex
.
get
(
documentId
,
MV
)
if
not
datum
==
oldDatum
:
if
oldDatum
is
not
MV
:
self
.
removeForwardIndexEntry
(
oldDatum
,
documentId
)
self
.
insertForwardIndexEntry
(
datum
,
documentId
)
self
.
insertForwardIndexEntry
(
datum
,
documentId
)
self
.
_unindex
[
documentId
]
=
datum
self
.
_unindex
[
documentId
]
=
datum
returnStatus
=
1
returnStatus
=
1
self
.
_p_changed
=
1
# Tickle the transaction
return
returnStatus
return
returnStatus
...
...
lib/python/SearchIndex/UnKeywordIndex.py
View file @
41eb6fe8
...
@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex):
...
@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex):
newKeywords
=
getattr
(
obj
,
self
.
id
)
newKeywords
=
getattr
(
obj
,
self
.
id
)
if
callable
(
newKeywords
):
if
callable
(
newKeywords
):
newKeywords
=
newKeywords
()
newKeywords
=
newKeywords
()
except
:
except
Except
:
newKeywords
=
MV
newKeywords
=
MV
if
type
(
newKeywords
)
is
StringType
:
if
type
(
newKeywords
)
is
StringType
:
...
@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex):
...
@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex):
except
TypeError
:
except
TypeError
:
return
0
return
0
self
.
_unindex
[
documentId
]
=
newKeywords
self
.
_unindex
[
documentId
]
=
newKeywords
[:]
# Make a copy
return
1
return
1
...
...
lib/python/SearchIndex/UnTextIndex.py
View file @
41eb6fe8
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment