Buncha updates:

- Use slightly more portable values for the Data.fs and Zope/lib/python.
- Add -t NNN option to specify how often to commit a transaction; default 20,000.
- Change -p into -p NNN to specify how often (counted in commits) to pack (default 0 -- never pack).
- Reworked the commit and pack logic to maintain the various counters across folders.
- Store relative paths (e.g. "inbox/1").
- Store the mtime of indexed messages in doctimes[docid].
- Store the mtime of indexed folders in watchfolders[folder] (unused).
- Refactor updatefolder() to:
  (a) Avoid indexing messages it's already indexed and whose mtime hasn't changed. (This probably needs an override just in case.)
  (b) Unindex messages that no longer exist in the folder.
- Include the folder name and the message header fields from, to, cc, bcc, and subject in the text to be indexed.

Buncha updates:
- Use slightly more portable values for the Data.fs and Zope/lib/python.
- Add -t NNN option to specify how often to commit a transaction; default 20,000.
- Change -p into -p NNN to specify how often (counted in commits) to pack (default 0 -- never pack).
- Reworked the commit and pack logic to maintain the various counters across folders.
- Store relative paths (e.g. "inbox/1").
- Store the mtime of indexed messages in doctimes[docid].
- Store the mtime of indexed folders in watchfolders[folder] (unused).
- Refactor updatefolder() to:
  (a) Avoid indexing messages it's already indexed and whose mtime hasn't changed. (This probably needs an override just in case.)
  (b) Unindex messages that no longer exist in the folder.
- Include the folder name and the message header fields from, to, cc, bcc, and subject in the text to be indexed.
830d8ea9 · Guido van Rossum · fc0b69d5 · 830d8ea9
Commit 830d8ea9 authored May 23, 2002 by Guido van Rossum
Hide whitespace changes
Inline Side-by-side

Showing with 94 additions and 36 deletions

lib/python/Products/ZCTextIndex/tests/mhindex.py lib/python/Products/ZCTextIndex/tests/mhindex.py +94 -36

No files found.
--- a/lib/python/Products/ZCTextIndex/tests/mhindex.py
+++ b/lib/python/Products/ZCTextIndex/tests/mhindex.py
@@ -2,6 +2,7 @@

 """MH mail indexer."""

+import os
 import re
 import sys
 import time
@@ -9,17 +10,19 @@ import mhlib
 import getopt
 import traceback
 from StringIO import StringIO
+from stat import ST_MTIME

-DATAFS = "/home/guido/.Data.fs"
-ZOPECODE = "/home/guido/projects/ds9/lib/python"
+DATAFS = "~/.Data.fs"
+ZOPECODE = "~/projects/Zope/lib/python"

-sys.path.append(ZOPECODE)
+sys.path.append(os.path.expanduser(ZOPECODE))

 from ZODB import DB
 from ZODB.FileStorage import FileStorage
 from Persistence import Persistent
 from BTrees.IOBTree import IOBTree
 from BTrees.OIBTree import OIBTree
+from BTrees.IIBTree import IIBTree

 from Products.ZCTextIndex.NBest import NBest
 from Products.ZCTextIndex.OkapiIndex import OkapiIndex
@@ -33,7 +36,7 @@ MAXLINES = 3

 def main():
    try:
-        opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Opu")
+        opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Op:t:u")
    except getopt.error, msg:
        print msg
        sys.exit(2)
@@ -42,8 +45,9 @@ def main():
    optimize = 0
    nbest = NBEST
    maxlines = MAXLINES
-    datafs = DATAFS
+    datafs = os.path.expanduser(DATAFS)
    pack = 0
+    trans = 20000
    for o, a in opts:
        if o == "-b":
            bulk = 1
@@ -56,18 +60,18 @@ def main():
        if o == "-O":
            optimize = 1
        if o == "-p":
-            pack = 1
+            pack = int(a)
+        if o == "-t":
+            trans = int(a)
        if o == "-u":
            update = 1
-    ix = Indexer(datafs, update or bulk)
+    ix = Indexer(datafs, writable=update or bulk, trans=trans, pack=pack)
    if bulk:
        if optimize:
            ix.optimize(args)
        ix.bulkupdate(args)
    elif update:
        ix.update(args)
-        if pack:
-            ix.pack()
    elif args:
        for i in range(len(args)):
            a = args[i]
@@ -79,12 +83,18 @@ def main():
        ix.query(" ".join(args), nbest, maxlines)
    else:
        ix.interact(nbest)
+    if pack:
+        ix.pack()

 class Indexer:

    filestorage = database = connection = root = None

-    def __init__(self, datafs, writable=0):
+    def __init__(self, datafs, writable=0, trans=0, pack=0):
+        self.trans_limit = trans
+        self.pack_limit = pack
+        self.trans_count = 0
+        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
@@ -99,6 +109,14 @@ class Indexer:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
+        try:
+            self.doctimes = self.root["doctimes"]
+        except KeyError:
+            self.doctimes = self.root["doctimes"] = IIBTree()
+        try:
+            self.watchfolders = self.root["watchfolders"]
+        except KeyError:
+            self.watchfolders = self.root["watchfolders"] = {}
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
@@ -195,6 +213,7 @@ class Indexer:
            path = self.docpaths[docid]
            score = min(100, int(score * factor))
            print "Rank:    %d   Score: %d%%   File: %s" % (rank, score, path)
+            path = os.path.join(self.mh.getpath(), path)
            fp = open(path)
            msg = mhlib.Message("<folder>", 0, fp)
            for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
@@ -254,6 +273,7 @@ class Indexer:
        msgs.sort()

        self.updatefolder(f, msgs)
+        self.commit()

    def optimize(self, args):
        uniqwords = {}
@@ -279,19 +299,14 @@ class Indexer:
        for n in msgs:
            print "prescanning", n
            m = f.openmessage(n)
-            text = self.getmessagetext(m)
+            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
-        chunk = 5000
-        target = len(self.docpaths) + chunk
        for folder in args:
-            if len(self.docpaths) >= target:
-                self.pack()
-                target = len(self.docpaths) + chunk
            if folder.startswith("+"):
                folder = folder[1:]
            print "\nFOLDER", folder
@@ -302,31 +317,34 @@ class Indexer:
                continue
            self.updatefolder(f, f.listmessages())
            print "Total", len(self.docpaths)
-        self.pack()
+        self.commit()
        print "Indexed", self.index.lexicon._nbytes, "bytes and",
        print self.index.lexicon._nwords, "words;",
        print len(self.index.lexicon._words), "unique words."

    def updatefolder(self, f, msgs):
-        done = 0
-        new = 0
+        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
-            print "indexing", n
+            path = "%s/%s" % (f.name, n)
+            docid = self.path2docid.get(path, 0)
+            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
+                print "unchanged", docid, path
+                continue
+            docid = self.newdocid(path)
            m = f.openmessage(n)
-            text = self.getmessagetext(m)
-            path = f.getmessagefilename(n)
-            self.unindexpath(path)
+            text = self.getmessagetext(m, f.name)
            if not text:
+                self.unindexpath(path)
                continue
-            docid = self.newdocid(path)
+            print "indexing", docid, path
            self.index.index_text(docid, text)
-            done += 1
-            new = 1
-            if done%500 == 0:
-                self.commit()
-                new = 0
-        if new:
-            self.commit()
+            self.maycommit()
+        # Remove messages from the folder that no longer exist
+        for path in self.path2docid.keys(f.name):
+            if not path.startswith(f.name + "/"):
+                break
+            if self.getmtime(path) == 0:
+                self.unindexpath(path)
        print "done."

    def unindexpath(self, path):
@@ -334,14 +352,19 @@ class Indexer:
            docid = self.path2docid[path]
            print "unindexing", docid, path
            del self.docpaths[docid]
+            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex(docid)
            except KeyError, msg:
                print "KeyError", msg
+            self.maycommit()

-    def getmessagetext(self, m):
+    def getmessagetext(self, m, name=None):
        L = []
+        if name:
+            L.append("_folder " + name) # To restrict search to a folder
+            self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except:
@@ -361,22 +384,57 @@ class Indexer:
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
+            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

+    def getheaders(self, m, L):
+        H = []
+        for key in "from", "to", "cc", "bcc", "subject":
+            value = m.get(key)
+            if value:
+                H.append(value)
+        if H:
+            L.append("\n".join(H))
+
    def newdocid(self, path):
+        docid = self.path2docid.get(path)
+        if docid is not None:
+            self.doctimes[docid] = self.getmtime(path)
+            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
+        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

+    def getmtime(self, path):
+        path = os.path.join(self.mh.getpath(), path)
+        try:
+            st = os.stat(path)
+        except os.error, msg:
+            return 0
+        return st[ST_MTIME]
+
+    def maycommit(self):
+        self.trans_count += 1
+        if self.trans_count >= self.trans_limit > 0:
+            self.commit()
+
    def commit(self):
-        print "committing..."
-        get_transaction().commit()
+        if self.trans_count > 0:
+            print "committing..."
+            get_transaction().commit()
+            self.trans_count = 0
+            self.pack_count += 1
+            if self.pack_count >= self.pack_limit > 0:
+                self.pack()

    def pack(self):
-        print "packing..."
-        self.database.pack()
+        if self.pack_count > 0:
+            print "packing..."
+            self.database.pack()
+            self.pack_count = 0

 class TextIndex(Persistent):