wcfs: tests: Move client to be pinkill'ed into separate process

If we don't, the whole testing process will be killed once wcfs is taught to kill clients that do not handle pin notifications well. Use multiprocessing to do so and to be able to interoperate with the spawned test process by sending/receiving objects to/from it. Preliminary history: levin.zimmermann/wendelin.core@aef0f0e1 Co-authored-by: Levin Zimmermann <levin.zimmermann@nexedi.com> /discussed-on !18

wcfs: tests: Move client to be pinkill'ed into separate process
If we don't, the whole testing process will be killed once wcfs is taught to kill clients that do not handle pin notifications well. Use multiprocessing to do so and to be able to interoperate with the spawned test process by sending/receiving objects to/from it. Preliminary history: levin.zimmermann/wendelin.core@aef0f0e1 Co-authored-by: Levin Zimmermann <levin.zimmermann@nexedi.com> /discussed-on !18
33ea7769 · Kirill Smelkov · Levin Zimmermann · 1303799e · 33ea7769 · 33ea7769
Commit 33ea7769 authored Sep 16, 2024 by Kirill Smelkov Committed by Levin Zimmermann Sep 17, 2024
Show whitespace changes
Inline Side-by-side

Showing with 229 additions and 25 deletions

wcfs/wcfs_faultyprot_test.py wcfs/wcfs_faultyprot_test.py +212 -20

wcfs/wcfs_test.py wcfs/wcfs_test.py +17 -5

No files found.
--- a/wcfs/wcfs_faultyprot_test.py
+++ b/wcfs/wcfs_faultyprot_test.py
@@ -22,14 +22,24 @@ protection against slow/faulty clients in isolation protocol."""

 from __future__ import print_function, absolute_import

+from wendelin.lib.zodb import zstor_2zurl
+from wendelin import wcfs
+
+import sys, os, subprocess, traceback
+import six
 from golang import select, func, defer
 from golang import context, sync, time

 import pytest; xfail = pytest.mark.xfail
-from pytest import fail, fixture
-from wendelin.wcfs.wcfs_test import tDB, h, \
+from pytest import fixture
+from wendelin.wcfs.wcfs_test import tDB, h, tAt, eprint, \
        setup_module, teardown_module, setup_function, teardown_function

+if six.PY2:
+    from _multiprocessing import Connection as MPConnection
+else:
+    from multiprocessing.connection import Connection as MPConnection
+

 # tests in this module require WCFS to promptly react to pin handler
 # timeouts so that verifying WCFS killing logic does not take a lot of time.
@@ -39,40 +49,222 @@ def with_prompt_pintimeout(monkeypatch):
    return monkeypatch.setenv("WENDELIN_CORE_WCFS_OPTIONS", "-pintimeout %.1fs" % tkill, prepend=" ")


-# verify that wcfs kills slow/faulty client who does not reply to pin in time.
-@xfail  # protection against faulty/slow clients
+# tSubProcess provides infrastructure to run a function in separate process.
+#
+# It runs f(cin, cout, *argv, **kw) in subprocess with cin and cout
+# connected to parent via multiprocessing.Connection .
+#
+# It is similar to multiprocessing.Process in spawn mode that is available on py3.
+# We need to use spawn mode - not fork - because fork does not work well when
+# parent process is multithreaded, as many things that rely on the
+# additional threads in the original process stop functioning in the forked
+# child without additional care. For example pygolang timers and signals
+# currently stop working after the fork, and in general it is believed that in
+# multithreaded programs the only safe thing to do after the fork is exec.
+# Please see section "NOTES" in
+#
+#   https://man7.org/linux/man-pages/man3/pthread_atfork.3.html
+#
+# for details about this issue.
+class tSubProcess(object):
+    def __init__(proc, f, *argv, **kw):
+        exev = [sys.executable, '-c', 'from wendelin.wcfs import wcfs_faultyprot_test as t; '
+                                      't.tSubProcess._start(%r)' % f.__name__]
+        proc.popen = subprocess.Popen(exev, stdin=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True)
+        try:
+            proc.cin  = MPConnection(proc.popen.stdin.fileno(),  readable=False)
+            proc.cout = MPConnection(proc.popen.stdout.fileno(), writable=False)
+            proc.send(argv)
+            proc.send(kw)
+        except:
+            proc.popen.kill()
+            raise
+
+    # _start is the trampoline run in the subprocess to launch the user function.
+    @staticmethod
+    def _start(funcname):
+        cin  = MPConnection(sys.stdin.fileno(),  writable=False)
+        cout = MPConnection(sys.stdout.fileno(), readable=False)
+        argv = cin.recv()
+        kw   = cin.recv()
+        f = globals()[funcname]
+        procname = kw.pop('_procname', f.__name__)
+        try:
+            f(cin, cout, *argv, **kw)
+            _ = 'END'
+        except BaseException as exc:
+            # dump traceback so it appears in the log because Traceback objects are not picklable
+            eprint("\nException in subprocess %s (pid%d):" % (procname, os.getpid()))
+            traceback.print_exc()
+            _ = exc
+        cout.send(_)
+        cout.close()
+
+    # close releases resources associated with subprocess.
+    def close(proc):
+        if proc.popen.returncode is None:
+            proc.popen.kill()
+
+    # exitcode returns subprocess exit code or None if subprocess has not yet terminated.
+    @property
+    def exitcode(proc):
+        return proc.popen.returncode
+
+    # join waits for the subprocess to end.
+    def join(proc, ctx):
+        gotend = False
+        goteof = False
+        joined = False
+        while not (goteof and joined):
+            if ctx.err() is not None:
+                raise ctx.err()
+
+            if not joined:
+                joined = (proc.popen.poll() is not None)
+
+            # recv from proc to see if it was END or exception
+            # make sure to recv at least once after joined to read buffered messages / exception
+            if goteof:
+                time.sleep(0.1*time.second)
+            else:
+                try:
+                    _, ok = proc.tryrecv()
+                except EOFError:
+                    goteof = True
+                else:
+                    if ok:
+                        if not gotend:
+                            assert _ == 'END'
+                            gotend = True
+                        else:
+                            raise AssertionError("got %r after END" % (_,))
+
+    # send sends object to subprocess input.
+    def send(proc, obj):
+        proc.cin.send(obj)
+
+    # recv receives object/exception from subprocess output.
+    def recv(proc, ctx): # -> obj | raise exception | EOFError
+        while 1:
+            if ctx.err() is not None:
+                raise ctx.err()
+            _, ok = proc.tryrecv()
+            if ok:
+                return _
+
+    # tryrecv tries to receive an object/exception from subprocess output.
+    # It does so without blocking.
+    def tryrecv(proc): # -> (obj, ok) | raise exception | EOFError
+        _ = proc.cout.poll(0.1*time.second)
+        if not _:
+            return None, False
+        _ = proc.cout.recv()
+        if isinstance(_, BaseException):
+            raise _
+        return _, True
+
+
+# tFaultySubProcess runs f(ctx, tFaultyClient, **kw) in subprocess.
+# It's a small convenience wrapper over tSubProcess - please see its documentation for details.
+class tFaultySubProcess(tSubProcess):
+    def __init__(fproc, t, f, *argv, **kw):
+        kw.setdefault('zurl',       zstor_2zurl(t.root._p_jar.db().storage))
+        kw.setdefault('zfile_oid',  t.zfile._p_oid)
+        kw.setdefault('_procname',  f.__name__)
+
+        kw.setdefault('pintimeout', t.pintimeout)
+        tremain = t.ctx.deadline() - time.now()
+        assert t.pintimeout < tremain/3 # 2·pintimeout is needed to reliably detect wcfs kill reaction
+
+        for k,v in list(kw.items()):
+            if isinstance(v, tAt):  # tAt is not picklable
+                kw[k] = v.raw
+
+        super(tFaultySubProcess, fproc).__init__(_tFaultySubProcess_start, f.__name__, *argv, **kw)
+        assert fproc.cout.recv() == "f: start"
+
 @func
-def test_wcfs_pintimeout_kill(with_prompt_pintimeout):
-    t = tDB(); zf = t.zfile
-    defer(t.close)
+def _tFaultySubProcess_start(cin, cout, funcname, **kw):
+    f = tFaultyClient()
+    f.cin  = cin
+    f.cout = cout
+    f.zurl = kw.pop('zurl')
+    f.zfile_oid = kw.pop('zfile_oid')
+    f.pintimeout = kw.pop('pintimeout')
+    f.wc = wcfs.join(f.zurl, autostart=False);  defer(f.wc.close)
+    # we do not need to implement timeouts precisely in the child process
+    # because parent will kill us on its timeout anyway.
+    ctx = context.background()
+    f.cout.send("f: start")
+    testf = globals()[funcname]
+    testf(ctx, f, **kw)

-    at1 = t.commit(zf, {2:'c1'})
-    at2 = t.commit(zf, {2:'c2'})
-    f = t.open(zf)
-    f.assertData(['','','c2'])
+# tFaultyClient is placeholder for arguments + WCFS connection for running test
+# function inside tFaultySubProcess.
+class tFaultyClient:
+    # .cin
+    # .cout
+    # .zurl
+    # .zfile_oid
+    # .wc
+    # .pintimeout
+    pass
+
+
+# ---- tests ----

-    # XXX move into subprocess not to kill whole testing
-    ctx, _ = context.with_timeout(context.background(), 2*t.pintimeout)

-    wl = t.openwatch()
+# verify that wcfs kills slow/faulty client who does not reply to pin in time.
+
+@func
+def _bad_watch_no_pin_reply(ctx, f, at):
+    wl = wcfs.WatchLink(f.wc)    ; defer(wl.close)
+
+    # wait for command to start watching
+    _ = f.cin.recv()
+    assert _ == "start watch", _
+
    wg = sync.WorkGroup(ctx)
    def _(ctx):
        # send watch. The pin handler won't be replying -> we should never get reply here.
-        wl.sendReq(ctx, b"watch %s @%s" % (h(zf._p_oid), h(at1)))
-        fail("watch request completed (should not as pin handler is stuck)")
+        wl.sendReq(ctx, b"watch %s @%s" % (h(f.zfile_oid), h(at)))
+        raise AssertionError("watch request completed (should not as pin handler is stuck)")
    wg.go(_)
    def _(ctx):
        req = wl.recvReq(ctx)
        assert req is not None
-        assert req.msg == b"pin %s #%d @%s" % (h(zf._p_oid), 2, h(at1))
+        f.cout.send(req.msg)

        # sleep > wcfs pin timeout - wcfs must kill us
        _, _rx = select(
            ctx.done().recv,                    # 0
-            time.after(2*t.pintimeout).recv, # 1
+            time.after(2*f.pintimeout).recv,    # 1
        )
        if _ == 0:
            raise ctx.err()
-        fail("wcfs did not killed stuck client")
+        raise AssertionError("wcfs did not kill stuck client")
    wg.go(_)
    wg.wait()
+
+@xfail  # protection against faulty/slow clients
+@func
+def test_wcfs_pintimeout_kill(with_prompt_pintimeout):
+    t = tDB(multiproc=True); zf = t.zfile
+    defer(t.close)
+
+    at1 = t.commit(zf, {2:'c1'})
+    at2 = t.commit(zf, {2:'c2'})
+    f = t.open(zf)
+    f.assertData(['','','c2'])
+
+    # launch faulty process that should be killed by wcfs on problematic pin during watch setup
+    p = tFaultySubProcess(t, _bad_watch_no_pin_reply, at=at1)
+    defer(p.close)
+
+    # wait till faulty client issues its watch, receives pin and pauses/misbehaves
+    p.send("start watch")
+    assert p.recv(t.ctx) == b"pin %s #%d @%s" % (h(zf._p_oid), 2, h(at1))
+
+    # the faulty client must become killed by wcfs
+    p.join(t.ctx)
+    assert p.exitcode is not None
--- a/wcfs/wcfs_test.py
+++ b/wcfs/wcfs_test.py
@@ -350,7 +350,7 @@ class DFile:
 # TODO(?) print -> t.trace/debug() + t.verbose depending on py.test -v -v ?
 class tWCFS(_tWCFS):
    @func
-    def __init__(t):
+    def __init__(t, multiproc=False):
        assert not os.path.exists(testmntpt)
        wc = wcfs.join(testzurl, autostart=True)
        assert wc.mountpoint == testmntpt
@@ -359,6 +359,9 @@ class tWCFS(_tWCFS):
        t.wc = wc
        t.pintimeout = float(t.wc._read(".wcfs/pintimeout"))

+        # multiproc=True indicates that wcfs server will be used by multiple client processes
+        t.multiproc=multiproc
+
        # the whole test is limited in time to detect deadlocks
        # NOTE with_timeout must be << timeout
        # NOTE pintimeout can be either
@@ -488,7 +491,7 @@ class tDB(tWCFS):
    # create before wcfs startup. old_data is []changeDelta - see .commit
    # and .change for details.
    @func
-    def __init__(t, old_data=[]):
+    def __init__(t, old_data=[], **kw):
        t.root = testdb.dbopen()
        def _(): # close/unlock db if __init__ fails
            exc = sys.exc_info()[1]
@@ -518,7 +521,7 @@ class tDB(tWCFS):
            t._commit(t.zfile, changeDelta)

        # start wcfs after testdb is created and initial data is committed
-        super(tDB, t).__init__()
+        super(tDB, t).__init__(**kw)

        # fh(.wcfs/zhead) + history of zhead read from there
        t._wc_zheadfh = open(t.wc.mountpoint + "/.wcfs/zhead")
@@ -968,6 +971,7 @@ class tWatchLink(wcfs.WatchLink):
        # this tWatchLink currently watches the following files at particular state.
        t._watching = {}    # {} foid -> tWatch

+        if not tdb.multiproc:
            tdb.assertStats({'WatchLink': len(tdb._wlinks)})

    def close(t):
@@ -981,6 +985,7 @@ class tWatchLink(wcfs.WatchLink):
            w.pinned = {}
        t._watching = {}

+        if not tdb.multiproc:
            tdb.assertStats({'WatchLink': len(tdb._wlinks)})


@@ -2057,6 +2062,13 @@ class tAt(bytes):
        return "@" + h(at)
    __str__ = __repr__

+    # raw returns raw bytes form of at.
+    # It should be used in contexts where at needs to be pickled, because tAt
+    # is unpicklable due to .tdb being unpicklable.
+    @property
+    def raw(at):
+        return fromhex(h(at))
+
 # hpin returns human-readable representation for {}blk->rev.
 @func(tDB)
 def hpin(t, pin):