Commit fac9a8fa authored by Kirill Smelkov's avatar Kirill Smelkov

Fix teeing not to trim trun's output if test run is cancelled

I was running nxdtest test locally and on py3.11 it started to fail
reliably for me in test_cancel_from_master:

    nxdtest$ python -m pytest -vvsx -k test_cancel_from_master
    ===================== test session starts ======================
    platform linux -- Python 3.11.9+, pytest-7.4.4, pluggy-1.4.0 -- /home/kirr/src/tools/go/py311-venv/bin/python
    cachedir: .pytest_cache
    rootdir: /home/kirr/src/wendelin/nxdtest
    configfile: pytest.ini
    plugins: timeout-2.3.1, mock-3.14.0
    collected 46 items / 44 deselected / 2 selected

    nxdtest/nxdtest_test.py::test_cancel_from_master[userns_disabled] FAILED

    =========================== FAILURES ===========================
    ___________ test_cancel_from_master[userns_disabled] ___________

    run_nxdtest = <function run_nxdtest.<locals>._run_nxdtest at 0x7fb680ea13a0>, capsys = <_pytest.capture.CaptureFixture object at 0x7fb680e7b6d0>
    distributor_with_cancelled_test = None, mocker = <pytest_mock.plugin.MockerFixture object at 0x7fb680e7ac50>

        @pytest.mark.timeout(timeout=10)
        def test_cancel_from_master(run_nxdtest, capsys, distributor_with_cancelled_test, mocker):
            # nxdtest polls every 5 minutes, but in test we don't want to wait so long.
            # set master poll interval to small, but enough time for spawned hang to
            # setup its signal handler.
            mocker.patch('nxdtest._tmasterpoll', 1*time.second)

            hang = "%s/testprog/hang" % (dirname(__file__),)
            run_nxdtest(
                """\
        TestCase('TEST1', ['%s'])
        """ % (hang),
                argv=[
                    "nxdtest",
                    "--master_url", "http://localhost",
                ],
            )
            captured = capsys.readouterr()
            assert "TEST1" in captured.out
            assert "# master asks to cancel test run" in captured.out
            assert "# test run canceled" in captured.out
    >       assert "hang: terminating" in captured.out
    E       assert 'hang: terminating' in "date:\tWed, 19 Feb 2025 18:13:51 MSK\nxnode:\tkirr@deca.navytux.spb.ru\nuname:\tLinux deca 6.1.0-18-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.1.76-1 (2024-02-01) x86_64\ncpu:\tIntel(R) Core(TM) i7-7600U CPU @ 2.80GHz\n# running for <MagicMock name='TestResultProxy.test_result_path' id='140421774003536'>\n\n>>> TEST1\n$ /home/kirr/src/wendelin/nxdtest/nxdtest/testprog/hang\n# user namespaces not available (-userns -userns/fuse).\n# isolation and many checks will be deactivated.\nhang: hanging ...\nhang: hanging ...\nhang: hanging ...\n# master asks to cancel test run\n# stopping due to cancel\nhang: hanging ...\nerror\tTEST1\t3.306s\t# 1t 1e 0f 0s\n# test run canceled\n# ran 1 test case:  1·error\n"
    E        +  where "date:\tWed, 19 Feb 2025 18:13:51 MSK\nxnode:\tkirr@deca.navytux.spb.ru\nuname:\tLinux deca 6.1.0-18-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.1.76-1 (2024-02-01) x86_64\ncpu:\tIntel(R) Core(TM) i7-7600U CPU @ 2.80GHz\n# running for <MagicMock name='TestResultProxy.test_result_path' id='140421774003536'>\n\n>>> TEST1\n$ /home/kirr/src/wendelin/nxdtest/nxdtest/testprog/hang\n# user namespaces not available (-userns -userns/fuse).\n# isolation and many checks will be deactivated.\nhang: hanging ...\nhang: hanging ...\nhang: hanging ...\n# master asks to cancel test run\n# stopping due to cancel\nhang: hanging ...\nerror\tTEST1\t3.306s\t# 1t 1e 0f 0s\n# test run canceled\n# ran 1 test case:  1·error\n" = CaptureResult(out="date:\tWed, 19 Feb 2025 18:13:51 MSK\nxnode:\tkirr@deca.navytux.spb.ru\nuname:\tLinux deca 6.1.0-18-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.1.76-1 (2024-02-01) x86_64\ncpu:\tIntel(R) Core(TM) i7-7600U CPU @ 2.80GHz\n# running for <MagicMock name='TestResultProxy.test_result_path' id='140421774003536'>\n\n>>> TEST1\n$ /home/kirr/src/wendelin/nxdtest/nxdtest/testprog/hang\n# user namespaces not available (-userns -userns/fuse).\n# isolation and many checks will be deactivated.\nhang: hanging ...\nhang: hanging ...\nhang: hanging ...\n# master asks to cancel test run\n# stopping due to cancel\nhang: hanging ...\nerror\tTEST1\t3.306s\t# 1t 1e 0f 0s\n# test run canceled\n# ran 1 test case:  1·error\n", err='').out

    nxdtest/nxdtest_test.py:361: AssertionError

Here the final bit of nxdtest/testprog/hang output, that is emitting
"hang: terminating" upon receiving SIGTERM, was not discovered in the
output of nxdtest.

Going through a bit of debugging I saw that we spawn tee for test stdout
and stderr under the same context, that is driving the main test, and so
when that test is canceled to be run, it means that that main context
becomes canceled, and so both main waiter function sees this ctx
becoming canceled _and_ spawned tees also see that context to become
canceled and so, depending on scheduling, can react by prematurely
exiting. That's why we do not see the tail of test output in nxdtest log.

-> Fix it by running tees in a context, that is not directly related to
main context - so that tees can do their work till receiving EOF. But
organize it all that way that if there is unexpected exception in the
main driver, the tees are canceled. And that if there is unexpected
exception in any of the spawned tees, then the main driver becomes
canceled due to that as well.

/cc @jerome
parent 19c7022c
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Copyright (C) 2018-2025 Nexedi SA and Contributors.
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
......@@ -308,14 +308,19 @@ def main():
# (explicit teeing instead of p.communicate() to be able to see incremental progress)
buf_out = []
buf_err = []
wg = sync.WorkGroup(ctx)
waitCtx, waitCancel = context.with_cancel(context.background())
defer(waitCancel)
wg = sync.WorkGroup(waitCtx)
wg.go(tee, p.stdout, bstdout, buf_out)
wg.go(tee, p.stderr, bstderr, buf_err)
# wait for trun to exit; terminate it on cancel
# let tees to run until they read everything from trun's output unless there is unexpected exception here
mainCtx = ctx
@func
def _(ctx):
ctx, cancel = context.merge(mainCtx, ctx)
defer(cancel)
defer(p.wait)
err = None
while 1:
done = p.poll()
if done is not None:
......@@ -329,22 +334,13 @@ def main():
if _ == 1:
emit("# stopping due to cancel")
p.terminate()
err = ctx.err()
# do not raise ctx.err() not to cancel tees prematurely
break
time.sleep(0.1)
if err is not None:
raise err
wg.go(_)
try:
wg.wait()
except Exception as e:
if errors.Is(e, context.canceled):
pass # ok, finish current test_result_line
else:
raise
wg.wait()
stdout = b''.join(buf_out)
stderr = b''.join(buf_err)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment