Commit c470e7d5 authored by Guido van Rossum

Keep some statistics about indexing: the total number of bytes and words
indexed (where the bytes are counted before entry into the pipeline,
and the words are counted after the pipeline is done).  To get the
numbers, use the _nbytes and _nwords instance variables directly.
parent 79b99fbb
......@@ -30,6 +30,10 @@ class Lexicon:
self._nextwid = 1
self._pipeline = pipeline
# Keep some statistics about indexing
self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
self._nwords = 0 # Number of words indexed (after pipeline)
def length(self):
    """Return the number of unique terms in the lexicon.

    The count is derived from the ``_nextwid`` counter rather than by
    measuring a container.
    """
    # Presumably _nextwid is the id the next new term will receive, so
    # the number of ids handed out so far is one less -- confirm against
    # the code that increments it.
    return self._nextwid - 1
......@@ -45,8 +49,11 @@ class Lexicon:
def sourceToWordIds(self, text):
    """Run *text* through the pipeline and return one id per word.

    The text is first split by ``_text2list``; the result is then
    handed to the ``process`` method of each pipeline element in
    order.  Every word that comes out is passed to
    ``_getWordIdCreate`` (which, judging by its name, assigns a fresh
    id to a word not seen before -- confirm against its definition).

    As a side effect the indexing statistics are updated: ``_nbytes``
    grows by the summed ``len()`` of the chunks entering the pipeline
    and ``_nwords`` by the number of words leaving it.

    Returns a list holding the ``_getWordIdCreate`` result for each
    word, in pipeline output order.
    """
    last = _text2list(text)
    # Measure the input before any pipeline stage can alter it.
    for chunk in last:
        self._nbytes += len(chunk)
    for element in self._pipeline:
        last = element.process(last)
    # Words are counted only after every pipeline stage has run.
    self._nwords += len(last)
    # Build a real list: on Python 3 a bare map() is lazy, which would
    # defer the _getWordIdCreate calls until the caller iterates.
    return [self._getWordIdCreate(word) for word in last]
def termToWordIds(self, text):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment