Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
41eb6fe8
Commit
41eb6fe8
authored
Jan 26, 2001
by
Christopher Petrilli
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Merge of Lexicon cleanup and text index merging.
parent
78f3476f
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
261 additions
and
174 deletions
+261
-174
lib/python/SearchIndex/GlobbingLexicon.py
lib/python/SearchIndex/GlobbingLexicon.py
+85
-56
lib/python/SearchIndex/Lexicon.py
lib/python/SearchIndex/Lexicon.py
+19
-14
lib/python/SearchIndex/UnIndex.py
lib/python/SearchIndex/UnIndex.py
+9
-7
lib/python/SearchIndex/UnKeywordIndex.py
lib/python/SearchIndex/UnKeywordIndex.py
+2
-2
lib/python/SearchIndex/UnTextIndex.py
lib/python/SearchIndex/UnTextIndex.py
+146
-95
No files found.
lib/python/SearchIndex/GlobbingLexicon.py
View file @
41eb6fe8
...
...
@@ -81,82 +81,108 @@
# many individuals on behalf of Digital Creations. Specific
# attributions are listed in the accompanying credits file.
#
##############################################################################
__doc__
=
""" Lexicon object that supports
#############################################################################
"""
from
Lexicon
import
Lexicon
from
Splitter
import
Splitter
from
intSet
import
intSet
from
UnTextIndex
import
Or
import
re
,
time
import
re
,
string
import
OIBTree
,
BTree
,
IOBTree
,
IIBTree
# Short cuts for common data containers
OIBTree
=
OIBTree
.
BTree
# Object -> Integer
OOBTree
=
BTree
.
BTree
# Object -> Object
IOBTree
=
IOBTree
.
BTree
# Integer -> Object
IIBucket
=
IIBTree
.
Bucket
# Integer -> Integer
import
pdb
class
GlobbingLexicon
(
Lexicon
):
"""
"""Lexicon which supports basic globbing function ('*' and '?').
This lexicon keeps several data structures around that are useful
for searching. They are:
'_lexicon' -- Contains the mapping from word => word_id
'_inverseLex' -- Contains the mapping from word_id => word
'_digrams' -- Contains a mapping from digram => word_id
Base class to support globbing lexicon object.
Before going further, it is necessary to understand what a digram is,
as it is a core component of the structure of this lexicon. A digram
is a two-letter sequence in a word. For example, the word 'zope'
would be converted into the digrams::
['$z', 'zo', 'op', 'pe', 'e$']
where the '$' is a word marker. It is used at the beginning and end
of the words. Those digrams are significant.
"""
multi_wc
=
'*'
single_wc
=
'?'
eow
=
'$'
def
__init__
(
self
):
self
.
counter
=
0
def
__init__
(
self
):
self
.
counter
=
0
# word id counter XXX
self
.
_lexicon
=
OIBTree
()
self
.
_inverseLex
=
IOBTree
()
self
.
_digrams
=
OOBTree
()
def
set
(
self
,
word
):
""" """
def
createDigrams
(
self
,
word
):
"""Returns a list with the set of digrams in the word."""
digrams
=
[]
digrams
.
append
(
self
.
eow
+
word
[
0
])
# Mark the beginning
for
i
in
range
(
len
(
word
)):
digrams
.
append
(
word
[
i
:
i
+
2
])
digrams
[
-
1
]
=
digrams
[
-
1
]
+
self
.
eow
# Mark the end
return
digrams
def
getWordId
(
self
,
word
):
"""Provided 'word', return the matching integer word id."""
if
self
.
_lexicon
.
has_key
(
word
):
return
self
.
_lexicon
[
word
]
else
:
word
=
intern
(
word
)
self
.
_lexicon
[
word
]
=
self
.
counter
self
.
_inverseLex
[
self
.
counter
]
=
word
return
self
.
assignWordId
(
word
)
## now, split the word into digrams and insert references
## to 'word' into the digram object. The first and last
## digrams in the list are specially marked with $ to
## indicate the beginning and end of the word
set
=
getWordId
# Kludge for old code
digrams
=
[]
digrams
.
append
(
self
.
eow
+
word
[
0
])
# mark the beginning
for
i
in
range
(
len
(
word
)
):
digrams
.
append
(
word
[
i
:
i
+
2
])
def
assignWordId
(
self
,
word
):
"""Assigns a new word id to the provided word, and return it."""
digrams
[
-
1
]
=
digrams
[
-
1
]
+
self
.
eow
# mark the end
_digrams
=
self
.
_digrams
for
digram
in
digrams
:
set
=
_digrams
.
get
(
digram
)
if
set
is
None
:
_digrams
[
digram
]
=
set
=
intSet
()
set
.
insert
(
self
.
counter
)
# Double check it's not in the lexicon already, and if it is, just
# return it.
if
self
.
_lexicon
.
has_key
(
word
):
return
self
.
_lexicon
[
word
]
# First we go ahead and put the forward and reverse maps in.
self
.
_lexicon
[
word
]
=
self
.
counter
self
.
_inverseLex
[
self
.
counter
]
=
word
counter
=
self
.
counter
self
.
counter
=
self
.
counter
+
1
return
counter
# Now take all the digrams and insert them into the digram map.
for
digram
in
self
.
createDigrams
(
word
):
set
=
self
.
_digrams
.
get
(
digram
)
if
set
is
None
:
self
.
_digrams
[
digram
]
=
set
=
intSet
()
set
.
insert
(
self
.
counter
)
self
.
counter
=
self
.
counter
+
1
return
self
.
counter
-
1
# Adjust for the previous increment
def
get
(
self
,
pattern
):
""" Query the lexicon for words matching a pattern.
"""
""" Query the lexicon for words matching a pattern."""
wc_set
=
[
self
.
multi_wc
,
self
.
single_wc
]
digrams
=
[]
...
...
@@ -199,22 +225,22 @@ class GlobbingLexicon(Lexicon):
## may contain all matching digrams, but in the wrong
## order.
expr
=
re
.
compile
(
self
.
translate
(
pattern
))
expr
=
re
.
compile
(
self
.
createRegex
(
pattern
))
words
=
[]
hits
=
[]
for
x
in
result
.
keys
():
if
expr
.
match
(
self
.
_inverseLex
[
x
]):
hits
.
append
(
x
)
return
hits
def
__getitem__
(
self
,
word
):
""" """
return
self
.
get
(
word
)
def
query_hook
(
self
,
q
):
"""expand wildcards
"""
def
query_hook
(
self
,
q
):
"""expand wildcards"""
words
=
[]
wids
=
[]
for
w
in
q
:
...
...
@@ -230,6 +256,7 @@ class GlobbingLexicon(Lexicon):
return
words
def
Splitter
(
self
,
astring
,
words
=
None
):
""" wrap the splitter """
...
...
@@ -239,21 +266,23 @@ class GlobbingLexicon(Lexicon):
return
Splitter
(
astring
)
def
translate
(
self
,
pat
):
def
createRegex
(
self
,
pat
):
"""Translate a PATTERN to a regular expression.
There is no way to quote meta-characters.
"""
i
,
n
=
0
,
len
(
pat
)
res
=
''
while
i
<
n
:
c
=
pat
[
i
]
i
=
i
+
1
if
c
==
self
.
multi_wc
:
res
=
res
+
'.*'
elif
c
==
self
.
single_wc
:
res
=
res
+
'.?'
else
:
res
=
res
+
re
.
escape
(
c
)
return
res
+
'$'
transTable
=
string
.
maketrans
(
""
,
""
)
# First, deal with multi-character globbing
result
=
string
.
replace
(
pat
,
'*'
,
'.*'
)
# Next, we need to deal with single-character globbing
result
=
string
.
replace
(
result
,
'?'
,
'.?'
)
# Now, we need to remove all of the characters that
# are forbidden.
result
=
string
.
translate
(
result
,
transTable
,
r'()&|!@#$%^{}\
<>
')
return "%s$" % result
lib/python/SearchIndex/Lexicon.py
View file @
41eb6fe8
...
...
@@ -83,11 +83,6 @@
#
##############################################################################
import
string
,
regex
,
ts_regex
import
regsub
__doc__
=
""" Module breaks out Zope specific methods and behavior. In
addition, provides the Lexicon class which defines a word to integer
mapping.
...
...
@@ -137,23 +132,33 @@ class Lexicon(Persistent, Implicit):
self
.
stop_syn
=
stop_syn
def
set
(
self
,
word
):
def
getWordId
(
self
,
word
):
""" return the word id of 'word' """
if
self
.
_lexicon
.
has_key
(
word
):
return
self
.
_lexicon
[
word
]
else
:
if
not
hasattr
(
self
,
'counter'
):
self
.
counter
=
0
self
.
_lexicon
[
intern
(
word
)]
=
self
.
counter
self
.
counter
=
self
.
counter
+
1
return
self
.
counter
-
1
return
self
.
assignWordId
(
word
)
set
=
getWordId
def
assignWordId
(
self
,
word
):
"""Assigns a new word id to the provided word and returns it."""
# First make sure it's not already in there
if
self
.
_lexicon
.
has_key
(
word
):
return
self
.
_lexicon
[
word
]
if
not
hasattr
(
self
,
'counter'
):
self
.
counter
=
0
self
.
_lexicon
[
intern
(
word
)]
=
self
.
counter
self
.
counter
=
self
.
counter
+
1
return
self
.
counter
-
1
def
get
(
self
,
key
,
default
=
None
):
"""
"""
return
[
self
.
_lexicon
.
get
(
key
,
default
)]
"""
Return the matched word against the key.
"""
return
[
self
.
_lexicon
.
get
WordId
(
key
,
default
)]
def
__getitem__
(
self
,
key
):
...
...
lib/python/SearchIndex/UnIndex.py
View file @
41eb6fe8
...
...
@@ -85,7 +85,7 @@
"""Simple column indices"""
__version__
=
'$Revision: 1.2
3
$'
[
11
:
-
2
]
__version__
=
'$Revision: 1.2
4
$'
[
11
:
-
2
]
...
...
@@ -197,12 +197,12 @@ class UnIndex(Persistent, Implicit):
(
'unindex_object could not remove '
'integer id %s from index %s. This '
'should not happen.'
%
(
str
(
i
),
str
(
k
))))
%
(
str
(
documentId
),
str
(
self
.
id
))))
else
:
LOG
(
self
.
__class__
.
__name__
,
ERROR
,
(
'unindex_object tried to retrieve set %s '
'from index %s but couldn
\
'
t. This '
'should not happen.'
%
(
repr
(
set
),
str
(
k
))))
'should not happen.'
%
(
repr
(
entry
),
str
(
self
.
id
))))
def
insertForwardIndexEntry
(
self
,
entry
,
documentId
):
...
...
@@ -212,7 +212,7 @@ class UnIndex(Persistent, Implicit):
This will also deal with creating the entire row if necessary."""
indexRow
=
self
.
_index
.
get
(
entry
,
MV
)
# Make sure there's actually a row there already. If not, create
# an IntSet and stuff it in first.
if
indexRow
is
MV
:
...
...
@@ -234,17 +234,19 @@ class UnIndex(Persistent, Implicit):
datum
=
getattr
(
obj
,
self
.
id
)
if
callable
(
datum
):
datum
=
datum
()
except
:
except
AttributeError
:
datum
=
MV
# We don't want to do anything that we don't have to here, so we'll
# check to see if the new and existing information is the same.
if
not
(
datum
==
self
.
_unindex
.
get
(
documentId
,
MV
)):
oldDatum
=
self
.
_unindex
.
get
(
documentId
,
MV
)
if
not
datum
==
oldDatum
:
if
oldDatum
is
not
MV
:
self
.
removeForwardIndexEntry
(
oldDatum
,
documentId
)
self
.
insertForwardIndexEntry
(
datum
,
documentId
)
self
.
_unindex
[
documentId
]
=
datum
returnStatus
=
1
self
.
_p_changed
=
1
# Tickle the transaction
return
returnStatus
...
...
lib/python/SearchIndex/UnKeywordIndex.py
View file @
41eb6fe8
...
...
@@ -115,7 +115,7 @@ class UnKeywordIndex(UnIndex):
newKeywords
=
getattr
(
obj
,
self
.
id
)
if
callable
(
newKeywords
):
newKeywords
=
newKeywords
()
except
:
except
Except
:
newKeywords
=
MV
if
type
(
newKeywords
)
is
StringType
:
...
...
@@ -162,7 +162,7 @@ class UnKeywordIndex(UnIndex):
except
TypeError
:
return
0
self
.
_unindex
[
documentId
]
=
newKeywords
self
.
_unindex
[
documentId
]
=
newKeywords
[:]
# Make a copy
return
1
...
...
lib/python/SearchIndex/UnTextIndex.py
View file @
41eb6fe8
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment