Commit c37a989d authored by Kirill Smelkov's avatar Kirill Smelkov

lib/zodb: Add tests for critical ZODB properties that Wendelin.core 2 will depend on

The tests verify that there is no concurrency bugs around load,
Connection.open and invalidations. See e.g.

https://github.com/zopefoundation/ZODB/issues/290
https://github.com/zopefoundation/ZEO/issues/155

By including the tests into wendelin.core, we will have CI coverage for
all supported storages (FileStorage, ZEO, NEO), and for all supported
ZODB (currently ZODB4, ZODB4-wc2 and ZODB5).

ZEO5 is know to currently fail zloadrace.
However, even though ZODB#290 was fixed, ZEO5 turned out to also fail on zopenrace:

        def test_zodb_zopenrace():
            # exercises ZODB.Connection + particular storage implementation
    >       zopenrace.main()

    lib/tests/test_zodb.py:382:
    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
    <decorator-gen-1>:2: in main
        ???
    ../../tools/go/pygolang/golang/__init__.py:103: in _
        return f(*argv, **kw)
    lib/tests/testprog/zopenrace.py:115: in main
        test(zstor)
    <decorator-gen-2>:2: in test
        ???
    ../../tools/go/pygolang/golang/__init__.py:103: in _
        return f(*argv, **kw)
    lib/tests/testprog/zopenrace.py:201: in test
        wg.wait()
    golang/_sync.pyx:246: in golang._sync.PyWorkGroup.wait
        ???
    golang/_sync.pyx:226: in golang._sync.PyWorkGroup.go.pyrunf
        ???
    lib/tests/testprog/zopenrace.py:165: in T1
        t1()
    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

        def t1():
            transaction.begin()
            zconn = db.open()

            root = zconn.root()
            obj1 = root['obj1']
            obj2 = root['obj2']

            # obj1 - reload it from zstor
            # obj2 - get it from zconn cache
            obj1._p_invalidate()

            # both objects must have the same values
            i1 = obj1.i
            i2 = obj2.i
            if i1 != i2:
    >           raise AssertionError("T1: obj1.i (%d)  !=  obj2.i (%d)" % (i1, i2))
    E           AssertionError: T1: obj1.i (3)  !=  obj2.i (2)

    lib/tests/testprog/zopenrace.py:156: AssertionError
parent 08e0c9fb
# Wendelin.core.bigfile | Tests for ZODB utilities # Wendelin.core.bigfile | Tests for ZODB utilities and critical properties of ZODB itself
# Copyright (C) 2014-2021 Nexedi SA and Contributors. # Copyright (C) 2014-2021 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
...@@ -22,6 +22,8 @@ from wendelin.lib.testing import getTestDB ...@@ -22,6 +22,8 @@ from wendelin.lib.testing import getTestDB
from wendelin.lib import testing from wendelin.lib import testing
from persistent import Persistent, UPTODATE, GHOST, CHANGED from persistent import Persistent, UPTODATE, GHOST, CHANGED
from ZODB import DB, POSException from ZODB import DB, POSException
from ZODB.FileStorage import FileStorage
from ZODB.MappingStorage import MappingStorage
from BTrees.IOBTree import IOBTree from BTrees.IOBTree import IOBTree
import transaction import transaction
from transaction import TransactionManager from transaction import TransactionManager
...@@ -29,6 +31,8 @@ from golang import defer, func ...@@ -29,6 +31,8 @@ from golang import defer, func
from pytest import raises from pytest import raises
import pytest; xfail = pytest.mark.xfail import pytest; xfail = pytest.mark.xfail
from wendelin.lib.tests.testprog import zopenrace, zloadrace
testdb = None testdb = None
def dbopen(): def dbopen():
...@@ -367,6 +371,28 @@ def test_zurlstable(): ...@@ -367,6 +371,28 @@ def test_zurlstable():
assert zurl == zurl0 assert zurl == zurl0
# ---- tests for critical properties of ZODB ----
# verify race in between Connection.open and invalidations.
def test_zodb_zopenrace_basic():
# exercises mostly logic inside ZODB around ZODB.Connection
zopenrace.test(MappingStorage())
def test_zodb_zopenrace():
# exercises ZODB.Connection + particular storage implementation
zopenrace.main()
# verify race in between loading and invalidations.
def test_zodb_zloadrace():
# skip testing with FileStorage - in ZODB/py opening simultaneous read-write
# connections to the same file is not supported and will raise LockError.
_ = testdb.getZODBStorage()
_.close()
if isinstance(_, FileStorage):
pytest.skip("skipping on FileStorage")
zloadrace.main()
# ---- misc ---- # ---- misc ----
# zsync syncs ZODB storage. # zsync syncs ZODB storage.
......
#!/usr/bin/env python
# originally reported to https://github.com/zopefoundation/ZEO/issues/155
"""Program zloadrace.py demonstrates concurrency bug in ZODB5/ZEO5 that leads
to data corruption.
The bug was not fully analyzed, but offhand it looks like ZEO5 does not
properly synchronize on-client loads with invalidations which lead to stale
live cache in ZODB connection and corrupt data provided by ZODB to application
similarly to https://github.com/zopefoundation/ZODB/issues/290.
The program simulates eight clients: every client once in a while modifies two
integer objects preserving invariant that their values stay equal. At every
iteration each client also verifies the invariant with access to one of the
objects always going through loading from the database. This way if live cache
becomes stale the bug is observed as invariant breakage.
Here is example failure:
$ ./zloadrace5.py
No handlers could be found for logger "ZEO.asyncio.server"
Traceback (most recent call last):
File "./zloadrace5.py", line 139, in <module>
main()
File "</home/kirr/src/tools/py/decorator/src/decorator.pyc:decorator-gen-1>", line 2, in main
File "/home/kirr/src/tools/go/pygolang/golang/__init__.py", line 100, in _
return f(*argv, **kw)
File "./zloadrace5.py", line 123, in main
wg.wait()
File "golang/_sync.pyx", line 198, in golang._sync.PyWorkGroup.wait
pyerr_reraise(pyerr)
File "golang/_sync.pyx", line 178, in golang._sync.PyWorkGroup.go.pyrunf
f(pywg._pyctx, *argv, **kw)
File "</home/kirr/src/tools/py/decorator/src/decorator.pyc:decorator-gen-3>", line 2, in T
File "/home/kirr/src/tools/go/pygolang/golang/__init__.py", line 100, in _
return f(*argv, **kw)
File "./zloadrace5.py", line 112, in T
t1()
File "</home/kirr/src/tools/py/decorator/src/decorator.pyc:decorator-gen-8>", line 2, in t1
File "/home/kirr/src/tools/go/pygolang/golang/__init__.py", line 100, in _
return f(*argv, **kw)
File "./zloadrace5.py", line 94, in t1
raise AssertionError("T%s: obj1.i (%d) != obj2.i (%d)" % (name, i1, i2))
AssertionError: T7: obj1.i (1) != obj2.i (0)
--------
NOTE ZODB4/ZEO4 do not expose this **load vs invalidation** race because there
the issue was specially cared about. Here is relevant part of
`Connection._setstate()` on ZODB4:
https://github.com/zopefoundation/ZODB/blob/4.4.5-4-g7a1a49111/src/ZODB/Connection.py#L949-L964
Basically it is
1. check if oid was already invalidated and, if yes, load from storage with
`before=._txn_time` i.e. transaction ID of first invalidated transaction this
connection received after its transaction begin;
2. if oid was not invalidated - call `zstor.load(oid)` with **before=None** -
i.e. load latest data for that oid;
3. check again after load whether oid was invalidated, and if yes - reload data
again with `before=._txn_time`.
One can suppose that there can be a race window in that a new transaction was
committed before load in 2, but corresponding invalidation messages were not
yet sent by server, or not yet processed on client. If any of that were true,
it would result in breakage of Isolation property so that in-progress
transaction would observe changes from transactions going simultaneously to it.
After some analysis it looks like it should not be the case:
ZEO4 server explicitly guarantees that it does not mix processing load requests
inside tpc_finish + send invalidations. This way if load is processed after new
commit, load reply is guaranteed to come to client after invalidation message.
This was explicitly fixed by
https://github.com/zopefoundation/ZEO/commit/71eb1456 (search for callAsyncNoPoll there)
and later again by https://github.com/zopefoundation/ZEO/commit/94f275c3.
HOWEVER
ZODB5 shifts MVCC handling into storage layer and this way there is no 1-2-3
from the above in `Connection.setstate()`. ZEO5 server was also significantly
reworked compared to ZEO4 and it looks like that rework reintroduced some
concurrency bugs that lead to corrupt data.
"""
from __future__ import print_function
from ZODB import DB
from ZODB.FileStorage import FileStorage
from ZODB.POSException import ConflictError
import transaction
from persistent import Persistent
from random import randint
from wendelin.lib.testing import getTestDB
from golang import func, defer, select, default
from golang import sync, context
# PInt is persistent integer.
class PInt(Persistent):
def __init__(self, i):
self.i = i
@func
def main():
tdb = getTestDB()
tdb.setup()
defer(tdb.teardown)
def dbopen():
zstor = tdb.getZODBStorage()
db = DB(zstor)
return db
# init initializes the database with two integer objects - obj1/obj2 that are set to 0.
@func
def init():
db = dbopen()
defer(db.close)
transaction.begin()
zconn = db.open()
root = zconn.root()
root['obj1'] = PInt(0)
root['obj2'] = PInt(0)
transaction.commit()
zconn.close()
# T is a worker that accesses obj1/obj2 in a loop and verifies
# `obj1.i == obj2.i` invariant.
#
# access to obj1 is organized to always trigger loading from zstor.
# access to obj2 goes through zconn cache and so verifies whether the cache is not stale.
#
# Once in a while T tries to modify obj{1,2}.i maintaining the invariant as
# test source of changes for other workers.
@func
def T(ctx, name, N):
db = dbopen()
defer(db.close)
@func
def t1():
transaction.begin()
zconn = db.open()
defer(zconn.close)
root = zconn.root()
obj1 = root['obj1']
obj2 = root['obj2']
# obj1 - reload it from zstor
# obj2 - get it from zconn cache
obj1._p_invalidate()
# both objects must have the same values
i1 = obj1.i
i2 = obj2.i
if i1 != i2:
#print('FAIL')
raise AssertionError("T%s: obj1.i (%d) != obj2.i (%d)" % (name, i1, i2))
# change objects once in a while
if randint(0,4) == 0:
#print("T%s: modify" % name)
obj1.i += 1
obj2.i += 1
try:
transaction.commit()
except ConflictError:
#print('conflict -> ignore')
transaction.abort()
for i in range(N):
if ready(ctx.done()):
break
#print('T%s.%d' % (name, i))
t1()
# run 8 T workers concurrently. As of 20200123, likely due to race conditions
# in ZEO, it triggers the bug where T sees stale obj2 with obj1.i != obj2.i
init()
N = 1000
wg = sync.WorkGroup(context.background())
for x in range(8):
wg.go(T, x, N)
wg.wait()
print('OK')
# ready returns whether channel ch is ready.
def ready(ch):
_, _rx = select(
default, # 0
ch.recv, # 1
)
if _ == 0:
return False
return True
if __name__ == '__main__':
main()
#!/usr/bin/env python
# originally reported to https://github.com/zopefoundation/ZODB/issues/290
# fixed in https://github.com/zopefoundation/ZODB/commit/b5895a5c
"""Program zopenrace.py demonstrates concurrency bug in ZODB Connection.open()
that leads to stale live cache and wrong data provided by database to users.
The bug is that when a connection is opened, it syncs to storage and processes
invalidations received from the storage in two _separate_ steps, potentially
leading to situation where invalidations for transactions _past_ opened
connection's view of the database are included into opened connection's cache
invalidation. This leads to stale connection cache and old data provided by
ZODB.Connection when it is reopened next time.
That in turn can lead to loose of Consistency of the database if mix of current
and old data is used to process a transaction. A classic example would be bank
accounts A, B and C with A<-B and A<-C transfer transactions. If transaction
that handles A<-C sees stale data for A when starting its processing, it
results in A loosing what it should have received from B.
Below is timing diagram on how the bug happens on ZODB5:
Client1 or Thread1 Client2 or Thread2
# T1 begins transaction and opens zodb connection
newTransaction():
# implementation in Connection.py[1]
._storage.sync()
invalidated = ._storage.poll_invalidations():
# implementation in MVCCAdapterInstance [2]
# T1 settles on as of which particular database state it will be
# viewing the database.
._storage._start = ._storage._storage.lastTrasaction() + 1:
s = ._storage._storage
s._lock.acquire()
head = s._ltid
s._lock.release()
return head
# T2 commits here.
# Time goes by and storage server sends
# corresponding invalidation message to T1,
# which T1 queues in its _storage._invalidations
# T1 retrieves queued invalidations which _includes_
# invalidation for transaction that T2 just has committed past @head.
._storage._lock.acquire()
r = _storage._invalidations
._storage._lock.release()
return r
# T1 processes invalidations for [... head] _and_ invalidations for past-@head transaction.
# T1 thus will _not_ process invalidations for that next transaction when
# opening zconn _next_ time. The next opened zconn will thus see _stale_ data.
._cache.invalidate(invalidated)
The program simulates two clients: one (T2) constantly modifies two integer
objects preserving invariant that their values stay equal. The other client
(T1) constantly opens the database and verifies the invariant. T1 forces access
to one of the object to always go through loading from the database, and this
way if live cache becomes stale the bug is observed as invariant breakage.
Here is example failure:
$ taskset -c 1,2 ./zopenrace.py
Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 754, in run
self.__target(*self.__args, **self.__kwargs)
File "./zopenrace.py", line 136, in T1
t1()
File "./zopenrace.py", line 130, in t1
raise AssertionError("t1: obj1.i (%d) != obj2.i (%d)" % (i1, i2))
AssertionError: t1: obj1.i (147) != obj2.i (146)
Traceback (most recent call last):
File "./zopenrace.py", line 179, in <module>
main()
File "./zopenrace.py", line 174, in main
raise AssertionError('FAIL')
AssertionError: FAIL
NOTE ZODB4 and ZODB3 do not have this particular open vs invalidation race.
[1] https://github.com/zopefoundation/ZODB/blob/5.5.1-29-g0b3db5aee/src/ZODB/Connection.py#L734-L742
[2] https://github.com/zopefoundation/ZODB/blob/5.5.1-29-g0b3db5aee/src/ZODB/mvccadapter.py#L130-L139
"""
from __future__ import print_function
from ZODB import DB
import transaction
from persistent import Persistent
from wendelin.lib.testing import getTestDB
from golang import func, defer, select, default
from golang import context, sync
# PInt is persistent integer.
class PInt(Persistent):
def __init__(self, i):
self.i = i
@func
def main():
tdb = getTestDB()
tdb.setup()
defer(tdb.teardown)
zstor = tdb.getZODBStorage()
defer(zstor.close)
test(zstor)
@func
def test(zstor):
db = DB(zstor)
defer(db.close)
# init initializes the database with two integer objects - obj1/obj2 that are set to 0.
def init():
transaction.begin()
zconn = db.open()
root = zconn.root()
root['obj1'] = PInt(0)
root['obj2'] = PInt(0)
transaction.commit()
zconn.close()
# T1 accesses obj1/obj2 in a loop and verifies that obj1.i == obj2.i
#
# access to obj1 is organized to always trigger loading from zstor.
# access to obj2 goes through zconn cache and so verifies whether the cache is not stale.
def T1(ctx, N):
def t1():
transaction.begin()
zconn = db.open()
root = zconn.root()
obj1 = root['obj1']
obj2 = root['obj2']
# obj1 - reload it from zstor
# obj2 - get it from zconn cache
obj1._p_invalidate()
# both objects must have the same values
i1 = obj1.i
i2 = obj2.i
if i1 != i2:
raise AssertionError("T1: obj1.i (%d) != obj2.i (%d)" % (i1, i2))
transaction.abort() # we did not changed anything; also fails with commit
zconn.close()
for i in range(N):
#print('T1.%d' % i)
if ready(ctx.done()):
raise ctx.err()
t1()
# T2 changes obj1/obj2 in a loop by doing `objX.i += 1`.
#
# Since both objects start from 0, the invariant that `obj1.i == obj2.i` is always preserved.
def T2(ctx, N):
def t2():
transaction.begin()
zconn = db.open()
root = zconn.root()
obj1 = root['obj1']
obj2 = root['obj2']
obj1.i += 1
obj2.i += 1
assert obj1.i == obj2.i
transaction.commit()
zconn.close()
for i in range(N):
#print('T2.%d' % i)
if ready(ctx.done()):
raise ctx.err()
t2()
# run T1 and T2 concurrently. As of 20191210, due to race condition in
# Connection.open, it triggers the bug where T1 sees stale obj2 with obj1.i != obj2.i
init()
N = 1000
wg = sync.WorkGroup(context.background())
wg.go(T1, N)
wg.go(T2, N)
wg.wait()
print('OK')
# ready returns whether channel ch is ready.
def ready(ch):
_, _rx = select(
default, # 0
ch.recv, # 1
)
if _ == 0:
return False
return True
if __name__ == '__main__':
main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment