Commit f2a03547 authored by Tim Peters

testDocUpdate(): Thanks to stop-word removal, there weren't actually
*any* words in common across the versions.  Helped Will along by adding
a pragmatic comment to his "knocking indeed" rant.  Reworked to use
the inscrutable magic of dict.setdefault.
parent 35879b41
...@@ -32,11 +32,12 @@ def eq(scaled1, scaled2, epsilon=scaled_int(0.01)): ...@@ -32,11 +32,12 @@ def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
if abs(scaled1 - scaled2) > epsilon: if abs(scaled1 - scaled2) > epsilon:
raise AssertionError, "%s != %s" % (scaled1, scaled2) raise AssertionError, "%s != %s" % (scaled1, scaled2)
# a series of text chunks to use for the re-index tests # A series of text chunks to use for the re-index tests (testDocUpdate).
text = [ text = [
"""Here's a knocking indeed! If a """Here's a knocking indeed! If a
man were porter of hell-gate, he should have man were porter of hell-gate, he should have
old turning the key.""", old turning the key. knock (that made sure
sure there's at least one word in common)."""
"""Knock, """Knock,
knock, knock! Who's there, i' the name of knock, knock! Who's there, i' the name of
...@@ -96,26 +97,27 @@ class ZCIndexTestsBase: ...@@ -96,26 +97,27 @@ class ZCIndexTestsBase:
def testDocUpdate(self): def testDocUpdate(self):
docid = 1 docid = 1
stop = get_stopdict()
unique = {} # compute a set of unique words for each version
d = {} # find some common words
common = []
N = len(text) N = len(text)
stop = get_stopdict()
d = {} # word -> list of version numbers containing that word
for version, i in zip(text, range(N)): for version, i in zip(text, range(N)):
# use a simple splitter rather than an official one # use a simple splitter rather than an official one
words = [w for w in re.split("\W+", version.lower()) words = [w for w in re.split("\W+", version.lower())
if len(w) > 1 and not stop.has_key(w)] if len(w) > 1 and not stop.has_key(w)]
# count occurences of each word word_seen = {}
for w in words: for w in words:
l = d[w] = d.get(w, []) if not word_seen.has_key(w):
l.append(i) d.setdefault(w, []).append(i)
for k, v in d.items(): word_seen[w] = 1
if len(v) == 1:
v = v[0] unique = {} # version number -> list of words unique to that version
l = unique[v] = unique.get(v, []) common = [] # list of words common to all versions
l.append(k) for w, versionlist in d.items():
elif len(v) == N: if len(versionlist) == 1:
common.append(k) unique.setdefault(versionlist[0], []).append(w)
elif len(versionlist) == N:
common.append(w)
for version, i in zip(text, range(N)): for version, i in zip(text, range(N)):
doc = Indexable(version) doc = Indexable(version)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment