Commit 46c36465 authored by Julien Muchembled

client: fix possible data corruption after conflict resolutions with replicas

This really fixes the bug described in
commit 40bac312,
which only reduced the probability of failure and could probably be reverted.

What happened is that the second conflict on 'a' for t3 was first reported by
an answer to the first store, with:
- a base serial at which a=0
- a conflict serial at which a=7
However, the cached data was no longer 8 but 12, since a second store had
already occurred after the first conflict (reported by the other storage node).

When this conflict was resolved before the conflict for the second store was
received, it gave:

  resolve(old=0, saved=7, new=12) -> 19

instead of:

  resolve(old=4, saved=7, new=12) -> 15

(if we still had the data of the first store, we could also do
  resolve(old=0, saved=7, new=8)
 but that would be inefficient from a memory point of view)
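
To make the arithmetic concrete, here is a minimal sketch (not NEO code) of
the two resolutions above. It assumes counter-like objects that resolve
conflicts by replaying their delta, similar to the PCounterWithResolution
class used by NEO's threaded tests:

  def resolve(old, saved, new):
      # 3-way merge: replay the delta (new - old) on top of 'saved'
      return saved + (new - old)

  resolve(old=4, saved=7, new=12)  # -> 15, 'old' matches the cached data
  resolve(old=0, saved=7, new=12)  # -> 19, stale 'old' corrupts the result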

The bug was difficult to reproduce. testNotifyReplicated had to be run many
times before the race conditions triggered it. The test was changed to
enforce some of them, and the above scenario now happens almost always.
parent 3dabba3a
@@ -442,8 +442,8 @@ class Application(ThreadedApplication):
         # Store object in tmp cache
         packet = Packets.AskStoreObject(oid, serial, compression,
             checksum, compressed_data, data_serial, ttid)
-        txn_context.data_dict[oid] = data, txn_context.write(
-            self, packet, oid, oid=oid, serial=serial)
+        txn_context.data_dict[oid] = data, serial, txn_context.write(
+            self, packet, oid, oid=oid)
         while txn_context.data_size >= self._cache._max_size:
             self._waitAnyTransactionMessage(txn_context)
@@ -460,13 +460,13 @@ class Application(ThreadedApplication):
             # This is also done atomically, to avoid race conditions
             # with PrimaryNotificationsHandler.notifyDeadlock
             try:
-                oid, (serial, conflict_serial) = pop_conflict()
+                oid, serial = pop_conflict()
             except KeyError:
                 return
             try:
-                data = data_dict.pop(oid)[0]
+                data, old_serial, _ = data_dict.pop(oid)
             except KeyError:
-                assert oid is conflict_serial is None, (oid, conflict_serial)
+                assert oid is None, (oid, serial)
                 # Storage refused us from taking object lock, to avoid a
                 # possible deadlock. TID is actually used for some kind of
                 # "locking priority": when a higher value has the lock,
@@ -485,33 +485,32 @@ class Application(ThreadedApplication):
                 self._askStorageForWrite(txn_context, uuid, packet)
             else:
                 if data is CHECKED_SERIAL:
-                    raise ReadConflictError(oid=oid, serials=(conflict_serial,
-                        serial))
+                    raise ReadConflictError(oid=oid,
+                        serials=(serial, old_serial))
                 # TODO: data can be None if a conflict happens during undo
                 if data:
                     txn_context.data_size -= len(data)
-                if self.last_tid < conflict_serial:
+                if self.last_tid < serial:
                     self.sync() # possible late invalidation (very rare)
                 try:
-                    data = tryToResolveConflict(oid, conflict_serial,
-                        serial, data)
+                    data = tryToResolveConflict(oid, serial, old_serial, data)
                 except ConflictError:
                     logging.info(
                         'Conflict resolution failed for %s@%s with %s',
-                        dump(oid), dump(serial), dump(conflict_serial))
+                        dump(oid), dump(old_serial), dump(serial))
                     # With recent ZODB, get_pickle_metadata (from ZODB.utils)
                     # does not support empty values, so do not pass 'data'
                     # in this case.
-                    raise ConflictError(oid=oid, serials=(conflict_serial,
-                        serial), data=data or None)
+                    raise ConflictError(oid=oid, serials=(serial, old_serial),
+                        data=data or None)
                 else:
                     logging.info(
                         'Conflict resolution succeeded for %s@%s with %s',
-                        dump(oid), dump(serial), dump(conflict_serial))
+                        dump(oid), dump(old_serial), dump(serial))
                     # Mark this conflict as resolved
-                    resolved_dict[oid] = conflict_serial
+                    resolved_dict[oid] = serial
                     # Try to store again
-                    self._store(txn_context, oid, conflict_serial, data)
+                    self._store(txn_context, oid, serial, data)

     def _askStorageForWrite(self, txn_context, uuid, packet):
         node = self.nm.getByUUID(uuid)
@@ -927,7 +926,7 @@ class Application(ThreadedApplication):
         assert oid not in txn_context.cache_dict, oid
         assert oid not in txn_context.data_dict, oid
         packet = Packets.AskCheckCurrentSerial(ttid, oid, serial)
-        txn_context.data_dict[oid] = CHECKED_SERIAL, txn_context.write(
-            self, packet, oid, 0, oid=oid, serial=serial)
+        txn_context.data_dict[oid] = CHECKED_SERIAL, serial, txn_context.write(
+            self, packet, oid, 0, oid=oid)
         self._waitAnyTransactionMessage(txn_context, False)
@@ -154,7 +154,7 @@ class PrimaryNotificationsHandler(MTEventHandler):
     def notifyDeadlock(self, conn, ttid, locking_tid):
         for txn_context in self.app.txn_contexts():
             if txn_context.ttid == ttid:
-                txn_context.conflict_dict[None] = locking_tid, None
+                txn_context.conflict_dict[None] = locking_tid
                 txn_context.wakeup(conn)
                 break
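
As a reading aid, here is my paraphrase (not repo code) of the sentinel
convention that ties this hunk to Application._handleConflicts above:

  # queued by notifyDeadlock, under the None key since no real oid is involved
  txn_context.conflict_dict[None] = locking_tid
  # later, popped like any other conflict in _handleConflicts
  oid, serial = pop_conflict()
  try:
      data, old_serial, _ = data_dict.pop(oid)
  except KeyError:
      # no data for None: deadlock notification, not a real conflict
      assert oid is None, (oid, serial)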
@@ -58,7 +58,7 @@ class StorageAnswersHandler(AnswerBaseHandler):
     def answerObject(self, conn, oid, *args):
         self.app.setHandlerData(args)

-    def answerStoreObject(self, conn, conflict, oid, serial):
+    def answerStoreObject(self, conn, conflict, oid):
         txn_context = self.app.getHandlerData()
         if conflict:
             # Conflicts can not be resolved now because 'conn' is locked.
@@ -75,7 +75,7 @@ class StorageAnswersHandler(AnswerBaseHandler):
             # If this conflict is not already resolved, mark it for
             # resolution.
             if txn_context.resolved_dict.get(oid, '') < conflict:
-                txn_context.conflict_dict[oid] = serial, conflict
+                txn_context.conflict_dict[oid] = conflict
         else:
             txn_context.written(self.app, conn.getUUID(), oid)
@@ -107,10 +107,10 @@ class StorageAnswersHandler(AnswerBaseHandler):
         except KeyError:
             if resolved:
                 # We should still be waiting for an answer from this node.
-                assert conn.uuid in txn_context.data_dict[oid][1]
+                assert conn.uuid in txn_context.data_dict[oid][2]
                 return
             assert oid in txn_context.data_dict
-        if serial <= txn_context.conflict_dict.get(oid, ('',))[0]:
+        if serial <= txn_context.conflict_dict.get(oid, ''):
             # Another node already reported this conflict or a newer,
             # by answering to this rebase or to the previous store.
             return
@@ -136,8 +136,8 @@ class StorageAnswersHandler(AnswerBaseHandler):
         if cached:
             assert cached == data
             txn_context.cache_size -= size
-        txn_context.data_dict[oid] = data, None
-        txn_context.conflict_dict[oid] = serial, conflict
+        txn_context.data_dict[oid] = data, serial, None
+        txn_context.conflict_dict[oid] = conflict

     def answerStoreTransaction(self, conn):
         pass
@@ -40,11 +40,11 @@ class Transaction(object):
         self.queue = SimpleQueue()
         self.txn = txn
         # data being stored
-        self.data_dict = {}  # {oid: (value, [node_id])}
+        self.data_dict = {}  # {oid: (value, serial, [node_id])}
         # data stored: this will go to the cache on tpc_finish
         self.cache_dict = {}  # {oid: value}
         # conflicts to resolve
-        self.conflict_dict = {}  # {oid: (base_serial, serial)}
+        self.conflict_dict = {}  # {oid: serial}
         # resolved conflicts
         self.resolved_dict = {}  # {oid: serial}
         # Keys are node ids instead of Node objects because a node may
@@ -98,7 +98,7 @@ class Transaction(object):
         # the data in self.data_dict until all nodes have answered so we remain
         # able to resolve conflicts.
         try:
-            data, uuid_list = self.data_dict[oid]
+            data, serial, uuid_list = self.data_dict[oid]
             uuid_list.remove(uuid)
         except KeyError:
             # 1. store to S1 and S2
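
Summarizing the data-structure change (illustrative shapes, not repo code):
the base serial now travels with the stored data instead of with the
reported conflict:

  # before: data_dict     = {oid: (value, [node_id])}
  #         conflict_dict = {oid: (base_serial, serial)}
  # after:
  data_dict     = {oid: (value, serial, [node_id])}  # serial: base of 'value'
  conflict_dict = {oid: serial}                      # serial: conflicting txn

This is what guarantees that tryToResolveConflict is always called with the
base serial of the data the client actually holds, even if a second store
happened between the conflict report and its resolution.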
@@ -320,6 +320,10 @@ class EventQueue(object):
         self._event_queue = []
         self._executing_event = -1

+    # Stable sort when 2 keys are equal.
+    # XXX: Is it really useful to keep events with same key ordered
+    #      chronologically ? The caller could use more specific keys. For
+    #      write-locks (by the storage node), the locking tid seems enough.
     sortQueuedEvents = (lambda key=itemgetter(0): lambda self:
         self._event_queue.sort(key=key))()
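
The new comment relies on Python's list.sort being stable: events queued
with equal keys keep their insertion (i.e. chronological) order. A generic
illustration, unrelated to NEO's actual event types:

  from operator import itemgetter
  events = [(1, 'first'), (0, 'other'), (1, 'second')]
  events.sort(key=itemgetter(0))
  # [(0, 'other'), (1, 'first'), (1, 'second')]: 'first' stays before 'second'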
@@ -28,6 +28,8 @@
 import weakref
 import MySQLdb
 import transaction
+from cStringIO import StringIO
+from cPickle import Unpickler
 from functools import wraps
 from inspect import isclass
 from .mock import Mock
@@ -40,6 +42,7 @@ from unittest.case import _ExpectedFailure, _UnexpectedSuccess
 try:
     from transaction.interfaces import IDataManager
     from ZODB.utils import newTid
+    from ZODB.ConflictResolution import PersistentReferenceFactory
 except ImportError:
     pass
@@ -492,5 +495,11 @@ class Patch(object):
         self.__del__()

+def unpickle_state(data):
+    unpickler = Unpickler(StringIO(data))
+    unpickler.persistent_load = PersistentReferenceFactory().persistent_load
+    unpickler.load() # skip the class tuple
+    return unpickler.load()
+
 __builtin__.pdb = lambda depth=0: \
     debug.getPdb().set_trace(sys._getframe(depth+1))
@@ -36,7 +36,7 @@ from neo.lib.handler import DelayEvent
 from neo.lib import logging
 from neo.lib.protocol import (CellStates, ClusterStates, NodeStates, NodeTypes,
     Packets, Packet, uuid_str, ZERO_OID, ZERO_TID)
-from .. import expectedFailure, Patch, TransactionalResource
+from .. import expectedFailure, unpickle_state, Patch, TransactionalResource
 from . import ClientApplication, ConnectionFilter, LockLock, NEOThreadedTest, \
     RandomConflictDict, ThreadId, with_cluster
 from neo.lib.util import add64, makeChecksum, p64, u64
@@ -1481,11 +1481,11 @@ class Test(NEOThreadedTest):
         reports a conflict after that this conflict was fully resolved with
         another node.
         """
-        def answerStoreObject(orig, conn, conflict, oid, serial):
+        def answerStoreObject(orig, conn, conflict, oid):
             if not conflict:
                 p.revert()
                 ll()
-            orig(conn, conflict, oid, serial)
+            orig(conn, conflict, oid)
         if 1:
             s0, s1 = cluster.storage_list
             t1, c1 = cluster.getTransaction()
@@ -1975,6 +1975,35 @@ class Test(NEOThreadedTest):
     @with_cluster(replicas=1, partitions=4)
     def testNotifyReplicated(self, cluster):
+        """
+        Check replication while several concurrent transactions lead to
+        conflict resolutions and deadlock avoidance, and in particular the
+        handling of write-locks when the storage node is about to notify the
+        master that partitions are replicated.
+        Transactions are committed in the following order:
+        - t2
+        - t4, conflict on 'd'
+        - t1, deadlock on 'a'
+        - t3, deadlock on 'b', and 2 conflicts on 'a'
+        Special care is also taken for the change done by t3 on 'a', to check
+        that the client resolves conflicts with the correct oldSerial:
+        1. The initial store (a=8) is first delayed by t2.
+        2. It is then kept aside by the deadlock.
+        3. On s1, deadlock avoidance happens after t1 stores a=7, and the
+           store is delayed again. On s0, however, it's the contrary and a
+           conflict is reported to the client.
+        4. The second store (a=12) is based on t2.
+        5. t1 finishes and s1 reports the conflict for the first store (with
+           t1). At that point, the base serial of this store is meaningless:
+           the client only has data for the last store (based on t2), and it
+           is its base serial that must be used. t3 writes 15 (and not 19!).
+        6. Conflicts for the second store are with t2 and they're ignored
+           because they're already resolved.
+        Note that this test method lacks code to enforce some events to happen
+        in the expected order. Sometimes, the above scenario is not reproduced
+        entirely, but it's so rare that there's no point in making the code
+        more complicated.
+        """
         s0, s1 = cluster.storage_list
         s1.stop()
         cluster.join((s1,))
@@ -2020,14 +2049,33 @@ class Test(NEOThreadedTest):
                 yield 1
                 self.tic()
                 self.assertPartitionTable(cluster, 'UO|UU|UU|UU')
-            def t4_vote(*args, **kw):
+            def t4_d(*args, **kw):
                 self.tic()
                 self.assertPartitionTable(cluster, 'UU|UU|UU|UU')
-                yield 0
+                yield 2
+            # Delay the conflict for the second store of 'a' by t3.
+            delay_conflict = {s0.uuid: [1], s1.uuid: [1, 0]}
+            def delayConflict(conn, packet):
+                app = conn.getHandler().app
+                if (isinstance(packet, Packets.AnswerStoreObject)
+                        and packet.decode()[0]):
+                    conn, = cluster.client.getConnectionList(app)
+                    kw = conn._handlers._pending[0][0][packet._id][3]
+                    return 1 == u64(kw['oid']) and delay_conflict[app.uuid].pop()
+            def writeA(orig, txn_context, oid, serial, data):
+                if u64(oid) == 1:
+                    value = unpickle_state(data)['value']
+                    if value > 12:
+                        f.remove(delayConflict)
+                    elif value == 12:
+                        f.add(delayConflict)
+                return orig(txn_context, oid, serial, data)
+            ###
             with ConnectionFilter() as f, \
+                    Patch(cluster.client, _store=writeA), \
                     self.thread_switcher(threads,
                 (1, 2, 3, 0, 1, 0, 2, t3_c, 1, 3, 2, t3_resolve, 0, 0, 0,
-                 t1_rebase, 2, t3_b, 3, t4_vote),
+                 t1_rebase, 2, t3_b, 3, t4_d, 0, 2, 2),
                 ('tpc_begin', 'tpc_begin', 'tpc_begin', 'tpc_begin', 2, 1, 1,
                  3, 3, 4, 4, 3, 1, 'RebaseTransaction', 'RebaseTransaction',
                  'AnswerRebaseTransaction', 'AnswerRebaseTransaction', 2