Commit 830d8ea9 authored by Guido van Rossum's avatar Guido van Rossum

Buncha updates:

- Use slightly more portable values for the Data.fs and
  Zope/lib/python.

- Add -t NNN option to specify how often to commit a transaction;
  default 20,000.

- Change -p into -p NNN to specify how often (counted in commits) to
  pack (default 0 -- never pack).

- Reworked the commit and pack logic to maintain the various counters
  across folders.

- Store relative paths (e.g. "inbox/1").

- Store the mtime of indexed messages in doctimes[docid].

- Store the mtime of indexed folders in watchfolders[folder] (unused).

- Refactor updatefolder() to:

  (a) Avoid indexing messages it's already indexed and whose mtime
      hasn't changed.  (This probably needs an override just in case.)

  (b) Unindex messages that no longer exist in the folder.

- Include the folder name and the message header fields from, to, cc,
  bcc, and subject in the text to be indexed.
parent fc0b69d5
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
"""MH mail indexer.""" """MH mail indexer."""
import os
import re import re
import sys import sys
import time import time
...@@ -9,17 +10,19 @@ import mhlib ...@@ -9,17 +10,19 @@ import mhlib
import getopt import getopt
import traceback import traceback
from StringIO import StringIO from StringIO import StringIO
from stat import ST_MTIME
DATAFS = "/home/guido/.Data.fs" DATAFS = "~/.Data.fs"
ZOPECODE = "/home/guido/projects/ds9/lib/python" ZOPECODE = "~/projects/Zope/lib/python"
sys.path.append(ZOPECODE) sys.path.append(os.path.expanduser(ZOPECODE))
from ZODB import DB from ZODB import DB
from ZODB.FileStorage import FileStorage from ZODB.FileStorage import FileStorage
from Persistence import Persistent from Persistence import Persistent
from BTrees.IOBTree import IOBTree from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree from BTrees.OIBTree import OIBTree
from BTrees.IIBTree import IIBTree
from Products.ZCTextIndex.NBest import NBest from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.OkapiIndex import OkapiIndex from Products.ZCTextIndex.OkapiIndex import OkapiIndex
...@@ -33,7 +36,7 @@ MAXLINES = 3 ...@@ -33,7 +36,7 @@ MAXLINES = 3
def main(): def main():
try: try:
opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Opu") opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Op:t:u")
except getopt.error, msg: except getopt.error, msg:
print msg print msg
sys.exit(2) sys.exit(2)
...@@ -42,8 +45,9 @@ def main(): ...@@ -42,8 +45,9 @@ def main():
optimize = 0 optimize = 0
nbest = NBEST nbest = NBEST
maxlines = MAXLINES maxlines = MAXLINES
datafs = DATAFS datafs = os.path.expanduser(DATAFS)
pack = 0 pack = 0
trans = 20000
for o, a in opts: for o, a in opts:
if o == "-b": if o == "-b":
bulk = 1 bulk = 1
...@@ -56,18 +60,18 @@ def main(): ...@@ -56,18 +60,18 @@ def main():
if o == "-O": if o == "-O":
optimize = 1 optimize = 1
if o == "-p": if o == "-p":
pack = 1 pack = int(a)
if o == "-t":
                trans = int(a)
if o == "-u": if o == "-u":
update = 1 update = 1
ix = Indexer(datafs, update or bulk) ix = Indexer(datafs, writable=update or bulk, trans=trans, pack=pack)
if bulk: if bulk:
if optimize: if optimize:
ix.optimize(args) ix.optimize(args)
ix.bulkupdate(args) ix.bulkupdate(args)
elif update: elif update:
ix.update(args) ix.update(args)
if pack:
ix.pack()
elif args: elif args:
for i in range(len(args)): for i in range(len(args)):
a = args[i] a = args[i]
...@@ -79,12 +83,18 @@ def main(): ...@@ -79,12 +83,18 @@ def main():
ix.query(" ".join(args), nbest, maxlines) ix.query(" ".join(args), nbest, maxlines)
else: else:
ix.interact(nbest) ix.interact(nbest)
if pack:
ix.pack()
class Indexer: class Indexer:
filestorage = database = connection = root = None filestorage = database = connection = root = None
def __init__(self, datafs, writable=0): def __init__(self, datafs, writable=0, trans=0, pack=0):
self.trans_limit = trans
self.pack_limit = pack
self.trans_count = 0
self.pack_count = 0
self.stopdict = get_stopdict() self.stopdict = get_stopdict()
self.mh = mhlib.MH() self.mh = mhlib.MH()
self.filestorage = FileStorage(datafs, read_only=(not writable)) self.filestorage = FileStorage(datafs, read_only=(not writable))
...@@ -99,6 +109,14 @@ class Indexer: ...@@ -99,6 +109,14 @@ class Indexer:
self.docpaths = self.root["docpaths"] self.docpaths = self.root["docpaths"]
except KeyError: except KeyError:
self.docpaths = self.root["docpaths"] = IOBTree() self.docpaths = self.root["docpaths"] = IOBTree()
try:
self.doctimes = self.root["doctimes"]
except KeyError:
self.doctimes = self.root["doctimes"] = IIBTree()
try:
self.watchfolders = self.root["watchfolders"]
except KeyError:
self.watchfolders = self.root["watchfolders"] = {}
self.path2docid = OIBTree() self.path2docid = OIBTree()
for docid in self.docpaths.keys(): for docid in self.docpaths.keys():
path = self.docpaths[docid] path = self.docpaths[docid]
...@@ -195,6 +213,7 @@ class Indexer: ...@@ -195,6 +213,7 @@ class Indexer:
path = self.docpaths[docid] path = self.docpaths[docid]
score = min(100, int(score * factor)) score = min(100, int(score * factor))
print "Rank: %d Score: %d%% File: %s" % (rank, score, path) print "Rank: %d Score: %d%% File: %s" % (rank, score, path)
path = os.path.join(self.mh.getpath(), path)
fp = open(path) fp = open(path)
msg = mhlib.Message("<folder>", 0, fp) msg = mhlib.Message("<folder>", 0, fp)
for header in "From", "To", "Cc", "Bcc", "Subject", "Date": for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
...@@ -254,6 +273,7 @@ class Indexer: ...@@ -254,6 +273,7 @@ class Indexer:
msgs.sort() msgs.sort()
self.updatefolder(f, msgs) self.updatefolder(f, msgs)
self.commit()
def optimize(self, args): def optimize(self, args):
uniqwords = {} uniqwords = {}
...@@ -279,19 +299,14 @@ class Indexer: ...@@ -279,19 +299,14 @@ class Indexer:
for n in msgs: for n in msgs:
print "prescanning", n print "prescanning", n
m = f.openmessage(n) m = f.openmessage(n)
text = self.getmessagetext(m) text = self.getmessagetext(m, f.name)
for p in pipeline: for p in pipeline:
text = p.process(text) text = p.process(text)
for word in text: for word in text:
uniqwords[word] = uniqwords.get(word, 0) + 1 uniqwords[word] = uniqwords.get(word, 0) + 1
def bulkupdate(self, args): def bulkupdate(self, args):
chunk = 5000
target = len(self.docpaths) + chunk
for folder in args: for folder in args:
if len(self.docpaths) >= target:
self.pack()
target = len(self.docpaths) + chunk
if folder.startswith("+"): if folder.startswith("+"):
folder = folder[1:] folder = folder[1:]
print "\nFOLDER", folder print "\nFOLDER", folder
...@@ -302,31 +317,34 @@ class Indexer: ...@@ -302,31 +317,34 @@ class Indexer:
continue continue
self.updatefolder(f, f.listmessages()) self.updatefolder(f, f.listmessages())
print "Total", len(self.docpaths) print "Total", len(self.docpaths)
self.pack() self.commit()
print "Indexed", self.index.lexicon._nbytes, "bytes and", print "Indexed", self.index.lexicon._nbytes, "bytes and",
print self.index.lexicon._nwords, "words;", print self.index.lexicon._nwords, "words;",
print len(self.index.lexicon._words), "unique words." print len(self.index.lexicon._words), "unique words."
def updatefolder(self, f, msgs): def updatefolder(self, f, msgs):
done = 0 self.watchfolders[f.name] = self.getmtime(f.name)
new = 0
for n in msgs: for n in msgs:
print "indexing", n path = "%s/%s" % (f.name, n)
docid = self.path2docid.get(path, 0)
if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
print "unchanged", docid, path
continue
docid = self.newdocid(path)
m = f.openmessage(n) m = f.openmessage(n)
text = self.getmessagetext(m) text = self.getmessagetext(m, f.name)
path = f.getmessagefilename(n)
self.unindexpath(path)
if not text: if not text:
self.unindexpath(path)
continue continue
docid = self.newdocid(path) print "indexing", docid, path
self.index.index_text(docid, text) self.index.index_text(docid, text)
done += 1 self.maycommit()
new = 1 # Remove messages from the folder that no longer exist
if done%500 == 0: for path in self.path2docid.keys(f.name):
self.commit() if not path.startswith(f.name + "/"):
new = 0 break
if new: if self.getmtime(path) == 0:
self.commit() self.unindexpath(path)
print "done." print "done."
def unindexpath(self, path): def unindexpath(self, path):
...@@ -334,14 +352,19 @@ class Indexer: ...@@ -334,14 +352,19 @@ class Indexer:
docid = self.path2docid[path] docid = self.path2docid[path]
print "unindexing", docid, path print "unindexing", docid, path
del self.docpaths[docid] del self.docpaths[docid]
del self.doctimes[docid]
del self.path2docid[path] del self.path2docid[path]
try: try:
self.index.unindex(docid) self.index.unindex(docid)
except KeyError, msg: except KeyError, msg:
print "KeyError", msg print "KeyError", msg
self.maycommit()
def getmessagetext(self, m): def getmessagetext(self, m, name=None):
L = [] L = []
if name:
L.append("_folder " + name) # To restrict search to a folder
self.getheaders(m, L)
try: try:
self.getmsgparts(m, L, 0) self.getmsgparts(m, L, 0)
except: except:
...@@ -361,22 +384,57 @@ class Indexer: ...@@ -361,22 +384,57 @@ class Indexer:
elif ctype == "message/rfc822": elif ctype == "message/rfc822":
f = StringIO(m.getbodytext()) f = StringIO(m.getbodytext())
m = mhlib.Message("<folder>", 0, f) m = mhlib.Message("<folder>", 0, f)
self.getheaders(m, L)
self.getmsgparts(m, L, level+1) self.getmsgparts(m, L, level+1)
def getheaders(self, m, L):
H = []
for key in "from", "to", "cc", "bcc", "subject":
value = m.get(key)
if value:
H.append(value)
if H:
L.append("\n".join(H))
def newdocid(self, path): def newdocid(self, path):
docid = self.path2docid.get(path)
if docid is not None:
self.doctimes[docid] = self.getmtime(path)
return docid
docid = self.maxdocid + 1 docid = self.maxdocid + 1
self.maxdocid = docid self.maxdocid = docid
self.docpaths[docid] = path self.docpaths[docid] = path
self.doctimes[docid] = self.getmtime(path)
self.path2docid[path] = docid self.path2docid[path] = docid
return docid return docid
def getmtime(self, path):
path = os.path.join(self.mh.getpath(), path)
try:
st = os.stat(path)
except os.error, msg:
return 0
return st[ST_MTIME]
def maycommit(self):
self.trans_count += 1
if self.trans_count >= self.trans_limit > 0:
self.commit()
def commit(self): def commit(self):
if self.trans_count > 0:
print "committing..." print "committing..."
get_transaction().commit() get_transaction().commit()
self.trans_count = 0
self.pack_count += 1
if self.pack_count >= self.pack_limit > 0:
self.pack()
def pack(self): def pack(self):
if self.pack_count > 0:
print "packing..." print "packing..."
self.database.pack() self.database.pack()
self.pack_count = 0
class TextIndex(Persistent): class TextIndex(Persistent):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment