Commit 80cc084f authored by Guido van Rossum

Dump word frequencies as well.

parent c892b4bd
...@@ -64,7 +64,7 @@ MAXLINES = 3 ...@@ -64,7 +64,7 @@ MAXLINES = 3
def main(): def main():
try: try:
opts, args = getopt.getopt(sys.argv[1:], "bd:hm:n:Op:t:uwW") opts, args = getopt.getopt(sys.argv[1:], "bd:fhm:n:Op:t:uwW")
except getopt.error, msg: except getopt.error, msg:
print msg print msg
print "use -h for help" print "use -h for help"
...@@ -77,12 +77,14 @@ def main(): ...@@ -77,12 +77,14 @@ def main():
datafs = os.path.expanduser(DATAFS) datafs = os.path.expanduser(DATAFS)
pack = 0 pack = 0
trans = 20000 trans = 20000
dumpwords = dumpwids = 0 dumpwords = dumpwids = dumpfreqs = 0
for o, a in opts: for o, a in opts:
if o == "-b": if o == "-b":
bulk = 1 bulk = 1
if o == "-d": if o == "-d":
datafs = a datafs = a
if o == "-f":
dumpfreqs = 1
if o == "-h": if o == "-h":
print __doc__ print __doc__
return return
...@@ -103,11 +105,13 @@ def main(): ...@@ -103,11 +105,13 @@ def main():
if o == "-W": if o == "-W":
dumpwids = 1 dumpwids = 1
ix = Indexer(datafs, writable=update or bulk, trans=trans, pack=pack) ix = Indexer(datafs, writable=update or bulk, trans=trans, pack=pack)
if dumpfreqs:
ix.dumpfreqs()
if dumpwords: if dumpwords:
ix.dumpwords() ix.dumpwords()
if dumpwids: if dumpwids:
ix.dumpwids() ix.dumpwids()
if dumpwords or dumpwids: if dumpwords or dumpwids or dumpfreqs:
return return
if bulk: if bulk:
if optimize: if optimize:
...@@ -172,15 +176,41 @@ class Indexer: ...@@ -172,15 +176,41 @@ class Indexer:
print len(self.path2docid), "Pathnames" print len(self.path2docid), "Pathnames"
print self.index.lexicon.length(), "Words" print self.index.lexicon.length(), "Words"
def dumpfreqs(self):
    """Print every lexicon entry with its total frequency, highest first.

    For each wid returned by the lexicon, the frequency is the sum of the
    values stored under that wid in the index's ``_wordinfo`` mapping
    (0 when the wid has no entry).  One line is printed per wid -- wid,
    frequency, word -- in descending (frequency, wid, word) order.
    """
    lexicon = self.index.lexicon
    okapi = self.index.index
    # The loop below reads the private _wordinfo attribute; check the
    # index type before relying on it.
    assert isinstance(okapi, OkapiIndex)
    rows = []
    for wid in lexicon.wids():
        total = 0
        # presumably the values are per-document counts -- confirm
        for count in okapi._wordinfo.get(wid, {}).values():
            total += count
        rows.append((total, wid, lexicon.get_word(wid)))
    # Ascending sort followed by reverse yields descending tuple order.
    rows.sort()
    rows.reverse()
    for total, wid, word in rows:
        # A single parenthesised argument prints the same text as the
        # bare print statement.
        print("%10d %10d %s" % (wid, total, word))
def dumpwids(self): def dumpwids(self):
lexicon = self.index.lexicon lexicon = self.index.lexicon
index = self.index.index
assert isinstance(index, OkapiIndex)
for wid in lexicon.wids(): for wid in lexicon.wids():
print "%10d %s" % (wid, lexicon.get_word(wid)) freq = 0
for f in index._wordinfo.get(wid, {}).values():
freq += f
print "%10d %10d %s" % (wid, freq, lexicon.get_word(wid))
def dumpwords(self): def dumpwords(self):
lexicon = self.index.lexicon lexicon = self.index.lexicon
index = self.index.index
assert isinstance(index, OkapiIndex)
for word in lexicon.words(): for word in lexicon.words():
print "%10d %s" % (lexicon.get_wid(word), word) wid = lexicon.get_wid(word)
freq = 0
for f in index._wordinfo.get(wid, {}).values():
freq += f
print "%10d %10d %s" % (wid, freq, word)
def close(self): def close(self):
self.root = None self.root = None
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment