Commit 8e3c7b01 authored by Julien Muchembled's avatar Julien Muchembled

Implements backup using specialised storage nodes and relying on replication

Replication is also fully reimplemented:
- It is not done anymore on whole partitions.
- It runs at lowest priority not to degrades performance for client nodes.

Schema of MySQL table is changed to optimize storage layout: rows are now
grouped by age, for good partial replication performance.
This certainly also speeds up simple loads/stores.
parent 75d83690
......@@ -111,42 +111,17 @@ RC - Review output of pylint (CODE)
consider using query(request, args) instead of query(request % args)
- Make listening address and port optionnal, and if they are not provided
listen on all interfaces on any available port.
- Replication throttling (HIGH AVAILABILITY)
In its current implementation, replication runs at full speed, which
degrades performance for client nodes. Replication should allow
throttling, and that throttling should be configurable.
See "Replication pipelining".
- Make replication speed configurable (HIGH AVAILABILITY)
In its current implementation, replication runs at lowest priority, not to
degrades performance for client nodes. But when there's only 1 storage
left for a partition, it may be wanted to guarantee a minimum speed to
avoid complete data loss if another failure happens too early.
- Pack segmentation & throttling (HIGH AVAILABILITY)
In its current implementation, pack runs in one call on all storage nodes
at the same time, which lcoks down the whole cluster. This task should
be split in chunks and processed in "background" on storage nodes.
Packing throttling should probably be at the lowest possible priority
(below interactive use and below replication).
- Replication pipelining (SPEED)
Replication work currently with too many exchanges between replicating
storage, and network latency can become a significant limit.
This should be changed to have just one initial request from
replicating storage, and multiple packets from reference storage with
database range checksums. When receiving these checksums, replicating
storage must compare with what it has, and ask row lists (might not even
be required) and data when there are differences. Quick fetching from
network with asynchronous checking (=queueing) + congestion control
(asking reference storage's to pause its packet flow) will probably be
required.
This should make it easier to throttle replication workload on reference
storage node, as it can decide to postpone replication-related packets on
its own.
- Partial replication (SPEED)
In its current implementation, replication always happens on a whole
partition. In typical use, only a few last transactions will have been
missed, so replicating only past a given TID would be much faster.
To achieve this, storage nodes must store 2 values:
- a pack identifier, which must be different each time a pack occurs
(increasing number sequence, TID-ish, etc) to trigger a
whole-partition replication when a pack happened (this could be
improved too, later)
- the latest (-ish) transaction committed locally, to use as a lower
replication boundary
- tpc_finish failures propagation to master (FUNCTIONALITY)
When asked to lock transaction data, if something goes wrong the master
node must be informed.
......
......@@ -9,7 +9,7 @@ SQL commands to migrate each storage from NEO 0.10.x::
CREATE TABLE new_data (id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, hash BINARY(20) NOT NULL UNIQUE, compression TINYINT UNSIGNED NULL, value LONGBLOB NULL) ENGINE = InnoDB SELECT DISTINCT obj.hash as hash, compression, value FROM obj, data WHERE obj.hash=data.hash ORDER BY serial;
DROP TABLE data;
RENAME TABLE new_data TO data;
CREATE TABLE new_obj (partition SMALLINT UNSIGNED NOT NULL, oid BIGINT UNSIGNED NOT NULL, serial BIGINT UNSIGNED NOT NULL, data_id BIGINT UNSIGNED NULL, value_serial BIGINT UNSIGNED NULL, PRIMARY KEY (partition, oid, serial), KEY (data_id)) ENGINE = InnoDB SELECT partition, oid, serial, data.id as data_id, value_serial FROM obj LEFT JOIN data ON (obj.hash=data.hash);
CREATE TABLE new_obj (partition SMALLINT UNSIGNED NOT NULL, oid BIGINT UNSIGNED NOT NULL, serial BIGINT UNSIGNED NOT NULL, data_id BIGINT UNSIGNED NULL, value_serial BIGINT UNSIGNED NULL, PRIMARY KEY (partition, serial, oid), KEY (partition, oid, serial), KEY (data_id)) ENGINE = InnoDB SELECT partition, oid, serial, data.id as data_id, value_serial FROM obj LEFT JOIN data ON (obj.hash=data.hash);
DROP TABLE obj;
RENAME TABLE new_obj TO obj;
ALTER TABLE tobj CHANGE hash data_id BIGINT UNSIGNED NULL;
......
......@@ -959,7 +959,7 @@ class Application(object):
tid_list = []
# request a tid list for each partition
for offset in xrange(self.pt.getPartitions()):
p = Packets.AskTIDsFrom(start, stop, limit, [offset])
p = Packets.AskTIDsFrom(start, stop, limit, offset)
for node, conn in self.cp.iterateForObject(offset, readable=True):
try:
r = self._askStorage(conn, p)
......
......@@ -90,3 +90,8 @@ class ConfigurationManager(object):
# only from command line
return util.bin(self.argument_list.get('uuid', None))
def getUpstreamCluster(self):
return self.__get('upstream_cluster', True)
def getUpstreamMasters(self):
return util.parseMasterList(self.__get('upstream_masters'))
......@@ -79,6 +79,9 @@ class EpollEventManager(object):
self.epoll.unregister(fd)
del self.connection_dict[fd]
def isIdle(self):
return not (self._pending_processing or self.writer_set)
def _addPendingConnection(self, conn):
pending_processing = self._pending_processing
if conn not in pending_processing:
......
......@@ -48,6 +48,7 @@ class ErrorCodes(Enum):
PROTOCOL_ERROR = Enum.Item(4)
BROKEN_NODE = Enum.Item(5)
ALREADY_PENDING = Enum.Item(7)
REPLICATION_ERROR = Enum.Item(8)
ErrorCodes = ErrorCodes()
class ClusterStates(Enum):
......@@ -55,6 +56,9 @@ class ClusterStates(Enum):
VERIFYING = Enum.Item(2)
RUNNING = Enum.Item(3)
STOPPING = Enum.Item(4)
STARTING_BACKUP = Enum.Item(5)
BACKINGUP = Enum.Item(6)
STOPPING_BACKUP = Enum.Item(7)
ClusterStates = ClusterStates()
class NodeTypes(Enum):
......@@ -117,6 +121,7 @@ ZERO_TID = '\0' * 8
ZERO_OID = '\0' * 8
OID_LEN = len(INVALID_OID)
TID_LEN = len(INVALID_TID)
MAX_TID = '\x7f' + '\xff' * 7 # SQLite does not accept numbers above 2^63-1
UUID_NAMESPACES = {
NodeTypes.STORAGE: 'S',
......@@ -723,6 +728,7 @@ class LastIDs(Packet):
POID('last_oid'),
PTID('last_tid'),
PPTID('last_ptid'),
PTID('backup_tid'),
)
class PartitionTable(Packet):
......@@ -760,16 +766,6 @@ class PartitionChanges(Packet):
),
)
class ReplicationDone(Packet):
"""
Notify the master node that a partition has been successully replicated from
a storage to another.
S -> M
"""
_fmt = PStruct('notify_replication_done',
PNumber('offset'),
)
class StartOperation(Packet):
"""
Tell a storage nodes to start an operation. Until a storage node receives
......@@ -965,7 +961,7 @@ class GetObject(Packet):
"""
Ask a stored object by its OID and a serial or a TID if given. If a serial
is specified, the specified revision of an object will be returned. If
a TID is specified, an object right before the TID will be returned. S,C -> S.
a TID is specified, an object right before the TID will be returned. C -> S.
Answer the requested object. S -> C.
"""
_fmt = PStruct('ask_object',
......@@ -1003,16 +999,14 @@ class TIDList(Packet):
class TIDListFrom(Packet):
"""
Ask for length TIDs starting at min_tid. The order of TIDs is ascending.
S -> S.
Answer the requested TIDs. S -> S
C -> S.
Answer the requested TIDs. S -> C
"""
_fmt = PStruct('tid_list_from',
PTID('min_tid'),
PTID('max_tid'),
PNumber('length'),
PList('partition_list',
PNumber('partition'),
),
PNumber('partition'),
)
_answer = PStruct('answer_tids',
......@@ -1054,27 +1048,6 @@ class ObjectHistory(Packet):
PFHistoryList,
)
class ObjectHistoryFrom(Packet):
"""
Ask history information for a given object. The order of serials is
ascending, and starts at (or above) min_serial for min_oid. S -> S.
Answer the requested serials. S -> S.
"""
_fmt = PStruct('ask_object_history',
POID('min_oid'),
PTID('min_serial'),
PTID('max_serial'),
PNumber('length'),
PNumber('partition'),
)
_answer = PStruct('ask_finish_transaction',
PDict('object_dict',
POID('oid'),
PFTidList,
),
)
class PartitionList(Packet):
"""
All the following messages are for neoctl to admin node
......@@ -1341,6 +1314,110 @@ class NotifyReady(Packet):
"""
pass
# replication
class FetchTransactions(Packet):
"""
S -> S
"""
_fmt = PStruct('ask_transaction_list',
PNumber('partition'),
PNumber('length'),
PTID('min_tid'),
PTID('max_tid'),
PFTidList, # already known transactions
)
_answer = PStruct('answer_transaction_list',
PTID('pack_tid'),
PTID('next_tid'),
PFTidList, # transactions to delete
)
class AddTransaction(Packet):
"""
S -> S
"""
_fmt = PStruct('add_transaction',
PTID('tid'),
PString('user'),
PString('description'),
PString('extension'),
PBoolean('packed'),
PFOidList,
)
class FetchObjects(Packet):
"""
S -> S
"""
_fmt = PStruct('ask_object_list',
PNumber('partition'),
PNumber('length'),
PTID('min_tid'),
PTID('max_tid'),
POID('min_oid'),
PDict('object_dict', # already known objects
PTID('serial'),
PFOidList,
),
)
_answer = PStruct('answer_object_list',
PTID('pack_tid'),
PTID('next_tid'),
POID('next_oid'),
PDict('object_dict', # objects to delete
PTID('serial'),
PFOidList,
),
)
class AddObject(Packet):
"""
S -> S
"""
_fmt = PStruct('add_object',
POID('oid'),
PTID('serial'),
PBoolean('compression'),
PChecksum('checksum'),
PString('data'),
PTID('data_serial'),
)
class Replicate(Packet):
"""
M -> S
"""
_fmt = PStruct('replicate',
PTID('tid'),
PString('upstream_name'),
PDict('source_dict',
PNumber('partition'),
PAddress('address'),
)
)
class ReplicationDone(Packet):
"""
Notify the master node that a partition has been successully replicated from
a storage to another.
S -> M
"""
_fmt = PStruct('notify_replication_done',
PNumber('offset'),
PTID('tid'),
)
class Truncate(Packet):
"""
M -> S
"""
_fmt = PStruct('ask_truncate',
PTID('tid'),
)
_answer = PFEmpty
StaticRegistry = {}
def register(request, ignore_when_closed=None):
""" Register a packet in the packet registry """
......@@ -1516,16 +1593,12 @@ class Packets(dict):
ClusterState)
NotifyLastOID = register(
NotifyLastOID)
NotifyReplicationDone = register(
ReplicationDone)
AskObjectUndoSerial, AnswerObjectUndoSerial = register(
ObjectUndoSerial)
AskHasLock, AnswerHasLock = register(
HasLock)
AskTIDsFrom, AnswerTIDsFrom = register(
TIDListFrom)
AskObjectHistoryFrom, AnswerObjectHistoryFrom = register(
ObjectHistoryFrom)
AskPack, AnswerPack = register(
Pack, ignore_when_closed=False)
AskCheckTIDRange, AnswerCheckTIDRange = register(
......@@ -1540,6 +1613,20 @@ class Packets(dict):
CheckCurrentSerial)
NotifyTransactionFinished = register(
NotifyTransactionFinished)
Replicate = register(
Replicate)
NotifyReplicationDone = register(
ReplicationDone)
AskFetchTransactions, AnswerFetchTransactions = register(
FetchTransactions)
AskFetchObjects, AnswerFetchObjects = register(
FetchObjects)
AddTransaction = register(
AddTransaction)
AddObject = register(
AddObject)
AskTruncate, AnswerTruncate = register(
Truncate)
def Errors():
registry_dict = {}
......
......@@ -150,6 +150,11 @@ class PartitionTable(object):
return True
return False
def getCell(self, offset, uuid):
for cell in self.partition_list[offset]:
if cell.getUUID() == uuid:
return cell
def setCell(self, offset, node, state):
if state == CellStates.DISCARDED:
return self.removeCell(offset, node)
......@@ -157,28 +162,19 @@ class PartitionTable(object):
raise PartitionTableException('Invalid node state')
self.count_dict.setdefault(node, 0)
row = self.partition_list[offset]
if len(row) == 0:
# Create a new row.
row = [Cell(node, state), ]
if state != CellStates.FEEDING:
self.count_dict[node] += 1
self.partition_list[offset] = row
self.num_filled_rows += 1
for cell in self.partition_list[offset]:
if cell.getNode() is node:
if not cell.isFeeding():
self.count_dict[node] -= 1
cell.setState(state)
break
else:
# XXX this can be slow, but it is necessary to remove a duplicate,
# if any.
for cell in row:
if cell.getNode() == node:
row.remove(cell)
if not cell.isFeeding():
self.count_dict[node] -= 1
break
row = self.partition_list[offset]
self.num_filled_rows += not row
row.append(Cell(node, state))
if state != CellStates.FEEDING:
self.count_dict[node] += 1
return (offset, node.getUUID(), state)
if state != CellStates.FEEDING:
self.count_dict[node] += 1
return offset, node.getUUID(), state
def removeCell(self, offset, node):
row = self.partition_list[offset]
......
......@@ -28,6 +28,10 @@ from neo.lib.event import EventManager
from neo.lib.connection import ListeningConnection, ClientConnection
from neo.lib.exception import ElectionFailure, PrimaryFailure, OperationFailure
from neo.lib.util import dump
class StateChangedException(Exception): pass
from .backup_app import BackupApplication
from .handlers import election, identification, secondary
from .handlers import administration, client, storage, shutdown
from .pt import PartitionTable
......@@ -41,6 +45,8 @@ class Application(object):
packing = None
# Latest completely commited TID
last_transaction = ZERO_TID
backup_tid = None
backup_app = None
def __init__(self, config):
# Internal attributes.
......@@ -90,16 +96,29 @@ class Application(object):
self._current_manager = None
# backup
upstream_cluster = config.getUpstreamCluster()
if upstream_cluster:
if upstream_cluster == self.name:
raise ValueError("upstream cluster name must be"
" different from cluster name")
self.backup_app = BackupApplication(self, upstream_cluster,
*config.getUpstreamMasters())
registerLiveDebugger(on_log=self.log)
def close(self):
self.listening_conn = None
if self.backup_app is not None:
self.backup_app.close()
self.nm.close()
self.em.close()
del self.__dict__
def log(self):
self.em.log()
if self.backup_app is not None:
self.backup_app.log()
self.nm.log()
self.tm.log()
if self.pt is not None:
......@@ -257,27 +276,29 @@ class Application(object):
a shutdown.
"""
neo.lib.logging.info('provide service')
em = self.em
poll = self.em.poll
self.tm.reset()
self.changeClusterState(ClusterStates.RUNNING)
# Now everything is passive.
while True:
try:
em.poll(1)
except OperationFailure:
# If not operational, send Stop Operation packets to storage
# nodes and client nodes. Abort connections to client nodes.
neo.lib.logging.critical('No longer operational')
for node in self.nm.getIdentifiedList():
if node.isStorage() or node.isClient():
node.notify(Packets.StopOperation())
if node.isClient():
node.getConnection().abort()
# Then, go back, and restart.
return
try:
while True:
poll(1)
except OperationFailure:
# If not operational, send Stop Operation packets to storage
# nodes and client nodes. Abort connections to client nodes.
neo.lib.logging.critical('No longer operational')
except StateChangedException, e:
assert e.args[0] == ClusterStates.STARTING_BACKUP
self.backup_tid = tid = self.getLastTransaction()
self.pt.setBackupTidDict(dict((node.getUUID(), tid)
for node in self.nm.getStorageList(only_identified=True)))
for node in self.nm.getIdentifiedList():
if node.isStorage() or node.isClient():
node.notify(Packets.StopOperation())
if node.isClient():
node.getConnection().abort()
def playPrimaryRole(self):
neo.lib.logging.info(
......@@ -314,7 +335,13 @@ class Application(object):
self.runManager(RecoveryManager)
while True:
self.runManager(VerificationManager)
self.provideService()
if self.backup_tid:
if self.backup_app is None:
raise RuntimeError("No upstream cluster to backup"
" defined in configuration")
self.backup_app.provideService()
else:
self.provideService()
def playSecondaryRole(self):
"""
......@@ -364,7 +391,8 @@ class Application(object):
# select the storage handler
client_handler = client.ClientServiceHandler(self)
if state == ClusterStates.RUNNING:
if state in (ClusterStates.RUNNING, ClusterStates.STARTING_BACKUP,
ClusterStates.BACKINGUP, ClusterStates.STOPPING_BACKUP):
storage_handler = storage.StorageServiceHandler(self)
elif self._current_manager is not None:
storage_handler = self._current_manager.getHandler()
......@@ -389,8 +417,9 @@ class Application(object):
handler = storage_handler
else:
continue # keep handler
conn.setHandler(handler)
handler.connectionCompleted(conn)
if type(handler) is not type(conn.getLastHandler()):
conn.setHandler(handler)
handler.connectionCompleted(conn)
self.cluster_state = state
def getNewUUID(self, node_type):
......@@ -437,19 +466,13 @@ class Application(object):
sys.exit()
def identifyStorageNode(self, uuid, node):
state = NodeStates.RUNNING
handler = None
if self.cluster_state == ClusterStates.RUNNING:
if uuid is None or node is None:
# same as for verification
state = NodeStates.PENDING
handler = storage.StorageServiceHandler(self)
elif self.cluster_state == ClusterStates.STOPPING:
if self.cluster_state == ClusterStates.STOPPING:
raise NotReadyError
else:
raise RuntimeError('unhandled cluster state: %s' %
(self.cluster_state, ))
return (uuid, state, handler)
state = NodeStates.RUNNING
if uuid is None or node is None:
# same as for verification
state = NodeStates.PENDING
return uuid, state, storage.StorageServiceHandler(self)
def identifyNode(self, node_type, uuid, node):
......
This diff is collapsed.
......@@ -18,15 +18,18 @@
import neo
from . import MasterHandler
from ..app import StateChangedException
from neo.lib.protocol import ClusterStates, NodeStates, Packets, ProtocolError
from neo.lib.protocol import Errors
from neo.lib.util import dump
CLUSTER_STATE_WORKFLOW = {
# destination: sources
ClusterStates.VERIFYING: set([ClusterStates.RECOVERING]),
ClusterStates.STOPPING: set([ClusterStates.RECOVERING,
ClusterStates.VERIFYING, ClusterStates.RUNNING]),
ClusterStates.VERIFYING: (ClusterStates.RECOVERING,),
ClusterStates.STARTING_BACKUP: (ClusterStates.RUNNING,
ClusterStates.STOPPING_BACKUP),
ClusterStates.STOPPING_BACKUP: (ClusterStates.BACKINGUP,
ClusterStates.STARTING_BACKUP),
}
class AdministrationHandler(MasterHandler):
......@@ -42,16 +45,17 @@ class AdministrationHandler(MasterHandler):
conn.answer(Packets.AnswerPrimary(app.uuid, []))
def setClusterState(self, conn, state):
app = self.app
# check request
if state not in CLUSTER_STATE_WORKFLOW:
try:
if app.cluster_state not in CLUSTER_STATE_WORKFLOW[state]:
raise ProtocolError('Can not switch to this state')
except KeyError:
raise ProtocolError('Invalid state requested')
valid_current_states = CLUSTER_STATE_WORKFLOW[state]
if self.app.cluster_state not in valid_current_states:
raise ProtocolError('Cannot switch to this state')
# change state
if state == ClusterStates.VERIFYING:
storage_list = self.app.nm.getStorageList(only_identified=True)
storage_list = app.nm.getStorageList(only_identified=True)
if not storage_list:
raise ProtocolError('Cannot exit recovery without any '
'storage node')
......@@ -60,15 +64,18 @@ class AdministrationHandler(MasterHandler):
if node.getConnection().isPending():
raise ProtocolError('Cannot exit recovery now: node %r is '
'entering cluster' % (node, ))
self.app._startup_allowed = True
else:
self.app.changeClusterState(state)
app._startup_allowed = True
state = app.cluster_state
elif state == ClusterStates.STARTING_BACKUP:
if app.tm.hasPending() or app.nm.getClientList(True):
raise ProtocolError("Can not switch to %s state with pending"
" transactions or connected clients" % state)
elif state != ClusterStates.STOPPING_BACKUP:
app.changeClusterState(state)
# answer
conn.answer(Errors.Ack('Cluster state changed'))
if state == ClusterStates.STOPPING:
self.app.cluster_state = state
self.app.shutdown()
if state != app.cluster_state:
raise StateChangedException(state)
def setNodeState(self, conn, uuid, state, modify_partition_table):
neo.lib.logging.info("set node state for %s-%s : %s" %
......
##############################################################################
#
# Copyright (c) 2011 Nexedi SARL and Contributors. All Rights Reserved.
# Julien Muchembled <jm@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
##############################################################################
from neo.lib.exception import PrimaryFailure
from neo.lib.handler import EventHandler
from neo.lib.protocol import CellStates
class BackupHandler(EventHandler):
"""Handler dedicated to upstream master during BACKINGUP state"""
def connectionLost(self, conn, new_state):
if self.app.app.listening_conn: # if running
raise PrimaryFailure('connection lost')
def answerPartitionTable(self, conn, ptid, row_list):
self.app.pt.load(ptid, row_list, self.app.nm)
def notifyPartitionChanges(self, conn, ptid, cell_list):
self.app.pt.update(ptid, cell_list, self.app.nm)
def answerNodeInformation(self, conn):
pass
def notifyNodeInformation(self, conn, node_list):
self.app.nm.update(node_list)
def answerLastTransaction(self, conn, tid):
app = self.app
app.invalidatePartitions(tid, set(xrange(app.pt.getPartitions())))
def invalidateObjects(self, conn, tid, oid_list):
app = self.app
getPartition = app.app.pt.getPartition
partition_set = set(map(getPartition, oid_list))
partition_set.add(getPartition(tid))
app.invalidatePartitions(tid, partition_set)
......@@ -16,7 +16,7 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import neo.lib
from neo.lib.protocol import Packets, ProtocolError
from neo.lib.protocol import ClusterStates, Packets, ProtocolError
from neo.lib.exception import OperationFailure
from neo.lib.util import dump
from neo.lib.connector import ConnectorConnectionClosedException
......@@ -45,14 +45,18 @@ class StorageServiceHandler(BaseServiceHandler):
if not app.pt.operational():
raise OperationFailure, 'cannot continue operation'
app.tm.forget(conn.getUUID())
if app.getClusterState() == ClusterStates.BACKINGUP:
app.backup_app.nodeLost(node)
if app.packing is not None:
self.answerPack(conn, False)
def askLastIDs(self, conn):
app = self.app
loid = app.tm.getLastOID()
ltid = app.tm.getLastTID()
conn.answer(Packets.AnswerLastIDs(loid, ltid, app.pt.getID()))
conn.answer(Packets.AnswerLastIDs(
app.tm.getLastOID(),
app.tm.getLastTID(),
app.pt.getID(),
app.backup_tid))
def askUnfinishedTransactions(self, conn):
tm = self.app.tm
......@@ -68,15 +72,26 @@ class StorageServiceHandler(BaseServiceHandler):
# transaction locked on this storage node
self.app.tm.lock(ttid, conn.getUUID())
def notifyReplicationDone(self, conn, offset):
node = self.app.nm.getByUUID(conn.getUUID())
neo.lib.logging.debug("%s is up for offset %s" % (node, offset))
try:
cell_list = self.app.pt.setUpToDate(node, offset)
except PartitionTableException, e:
raise ProtocolError(str(e))
def notifyReplicationDone(self, conn, offset, tid):
app = self.app
node = app.nm.getByUUID(conn.getUUID())
if app.backup_tid:
cell_list = app.backup_app.notifyReplicationDone(node, offset, tid)
if not cell_list:
return
else:
try:
cell_list = self.app.pt.setUpToDate(node, offset)
if not cell_list:
raise ProtocolError('Non-oudated partition')
except PartitionTableException, e:
raise ProtocolError(str(e))
neo.lib.logging.debug("%s is up for offset %s", node, offset)
self.app.broadcastPartitionChanges(cell_list)
def answerTruncate(self, conn):
pass
def answerPack(self, conn, status):
app = self.app
if app.packing is not None:
......
......@@ -17,11 +17,25 @@
import neo.lib.pt
from struct import pack, unpack
from neo.lib.protocol import CellStates
from neo.lib.pt import PartitionTableException
from neo.lib.pt import PartitionTable
from neo.lib.protocol import CellStates, ZERO_TID
class PartitionTable(PartitionTable):
class Cell(neo.lib.pt.Cell):
replicating = ZERO_TID
def setState(self, state):
try:
if CellStates.OUT_OF_DATE == state != self.state:
del self.backup_tid, self.replicating
except AttributeError:
pass
return super(Cell, self).setState(state)
neo.lib.pt.Cell = Cell
class PartitionTable(neo.lib.pt.PartitionTable):
"""This class manages a partition table for the primary master node"""
def setID(self, id):
......@@ -54,7 +68,7 @@ class PartitionTable(PartitionTable):
row = []
for _ in xrange(repeats):
node = node_list[index]
row.append(neo.lib.pt.Cell(node))
row.append(Cell(node))
self.count_dict[node] = self.count_dict.get(node, 0) + 1
index += 1
if index == len(node_list):
......@@ -88,7 +102,7 @@ class PartitionTable(PartitionTable):
node_list = [c.getNode() for c in row]
n = self.findLeastUsedNode(node_list)
if n is not None:
row.append(neo.lib.pt.Cell(n,
row.append(Cell(n,
CellStates.OUT_OF_DATE))
self.count_dict[n] += 1
cell_list.append((offset, n.getUUID(),
......@@ -132,11 +146,11 @@ class PartitionTable(PartitionTable):
# check the partition is assigned and known as outdated
for cell in self.getCellList(offset):
if cell.getUUID() == uuid:
if not cell.isOutOfDate():
raise PartitionTableException('Non-oudated partition')
break
if cell.isOutOfDate():
break
return
else:
raise PartitionTableException('Non-assigned partition')
raise neo.lib.pt.PartitionTableException('Non-assigned partition')
# update the partition table
cell_list = [self.setCell(offset, node, CellStates.UP_TO_DATE)]
......@@ -177,7 +191,7 @@ class PartitionTable(PartitionTable):
else:
if num_cells <= self.nr:
row.append(neo.lib.pt.Cell(node, CellStates.OUT_OF_DATE))
row.append(Cell(node, CellStates.OUT_OF_DATE))
cell_list.append((offset, node.getUUID(),
CellStates.OUT_OF_DATE))
node_count += 1
......@@ -196,7 +210,7 @@ class PartitionTable(PartitionTable):
CellStates.FEEDING))
# Don't count a feeding cell.
self.count_dict[max_cell.getNode()] -= 1
row.append(neo.lib.pt.Cell(node, CellStates.OUT_OF_DATE))
row.append(Cell(node, CellStates.OUT_OF_DATE))
cell_list.append((offset, node.getUUID(),
CellStates.OUT_OF_DATE))
node_count += 1
......@@ -277,7 +291,7 @@ class PartitionTable(PartitionTable):
node = self.findLeastUsedNode([cell.getNode() for cell in row])
if node is None:
break
row.append(neo.lib.pt.Cell(node, CellStates.OUT_OF_DATE))
row.append(Cell(node, CellStates.OUT_OF_DATE))
changed_cell_list.append((offset, node.getUUID(),
CellStates.OUT_OF_DATE))
self.count_dict[node] += 1
......@@ -309,6 +323,13 @@ class PartitionTable(PartitionTable):
CellStates.OUT_OF_DATE))
return change_list
def iterNodeCell(self, node):
for offset, row in enumerate(self.partition_list):
for cell in row:
if cell.getNode() is node:
yield offset, cell
break
def getUpToDateCellNodeSet(self):
"""
Return a set of all nodes which are part of at least one UP TO DATE
......@@ -329,3 +350,16 @@ class PartitionTable(PartitionTable):
for cell in row
if cell.isOutOfDate())
def setBackupTidDict(self, backup_tid_dict):
for row in self.partition_list:
for cell in row:
cell.backup_tid = backup_tid_dict.get(cell.getUUID(),
ZERO_TID)
def getBackupTid(self):
try:
return min(max(cell.backup_tid for cell in row
if not cell.isOutOfDate())
for row in self.partition_list)
except ValueError:
return ZERO_TID
......@@ -33,6 +33,7 @@ class RecoveryManager(MasterHandler):
super(RecoveryManager, self).__init__(app)
# The target node's uuid to request next.
self.target_ptid = None
self.backup_tid_dict = {}
def getHandler(self):
return self
......@@ -98,6 +99,9 @@ class RecoveryManager(MasterHandler):
app.tm.setLastOID(ZERO_OID)
pt.make(allowed_node_set)
self._broadcastPartitionTable(pt.getID(), pt.getRowList())
elif app.backup_tid:
pt.setBackupTidDict(self.backup_tid_dict)
app.backup_tid = pt.getBackupTid()
app.setLastTransaction(app.tm.getLastTID())
neo.lib.logging.debug(
......@@ -118,7 +122,7 @@ class RecoveryManager(MasterHandler):
# ask the last IDs to perform the recovery
conn.ask(Packets.AskLastIDs())
def answerLastIDs(self, conn, loid, ltid, lptid):
def answerLastIDs(self, conn, loid, ltid, lptid, backup_tid):
# Get max values.
if loid is not None:
self.app.tm.setLastOID(loid)
......@@ -128,6 +132,7 @@ class RecoveryManager(MasterHandler):
# something newer
self.target_ptid = lptid
conn.ask(Packets.AskPartitionTable())
self.backup_tid_dict[conn.getUUID()] = backup_tid
def answerPartitionTable(self, conn, ptid, row_list):
if ptid != self.target_ptid:
......@@ -136,6 +141,7 @@ class RecoveryManager(MasterHandler):
dump(self.target_ptid))
else:
self._broadcastPartitionTable(ptid, row_list)
self.app.backup_tid = self.backup_tid_dict[conn.getUUID()]
def _broadcastPartitionTable(self, ptid, row_list):
try:
......
......@@ -113,19 +113,21 @@ class VerificationManager(BaseServiceHandler):
def verifyData(self):
"""Verify the data in storage nodes and clean them up, if necessary."""
em, nm = self.app.em, self.app.nm
app = self.app
# wait for any missing node
neo.lib.logging.debug('waiting for the cluster to be operational')
while not self.app.pt.operational():
em.poll(1)
while not app.pt.operational():
app.em.poll(1)
if app.backup_tid:
return
neo.lib.logging.info('start to verify data')
getIdentifiedList = app.nm.getIdentifiedList
# Gather all unfinished transactions.
self._askStorageNodesAndWait(Packets.AskUnfinishedTransactions(),
[x for x in self.app.nm.getIdentifiedList() if x.isStorage()])
[x for x in getIdentifiedList() if x.isStorage()])
# Gather OIDs for each unfinished TID, and verify whether the
# transaction can be finished or must be aborted. This could be
......@@ -136,17 +138,16 @@ class VerificationManager(BaseServiceHandler):
if uuid_set is None:
packet = Packets.DeleteTransaction(tid, self._oid_set or [])
# Make sure that no node has this transaction.
for node in self.app.nm.getIdentifiedList():
for node in getIdentifiedList():
if node.isStorage():
node.notify(packet)
else:
packet = Packets.CommitTransaction(tid)
for node in self.app.nm.getIdentifiedList(pool_set=uuid_set):
for node in getIdentifiedList(pool_set=uuid_set):
node.notify(packet)
self._oid_set = set()
# If possible, send the packets now.
em.poll(0)
app.em.poll(0)
def verifyTransaction(self, tid):
em = self.app.em
......@@ -189,11 +190,11 @@ class VerificationManager(BaseServiceHandler):
return uuid_set
def answerLastIDs(self, conn, loid, ltid, lptid):
def answerLastIDs(self, conn, loid, ltid, lptid, backup_tid):
# FIXME: this packet should not allowed here, the master already
# accepted the current partition table end IDs. As there were manually
# approved during recovery, there is no need to check them here.
pass
raise RuntimeError
def answerUnfinishedTransactions(self, conn, max_tid, tid_list):
uuid = conn.getUUID()
......
......@@ -54,15 +54,10 @@ UNIT_TEST_MODULES = [
'neo.tests.storage.testInitializationHandler',
'neo.tests.storage.testMasterHandler',
'neo.tests.storage.testStorageApp',
'neo.tests.storage.testStorageHandler',
'neo.tests.storage.testStorageMySQLdb',
'neo.tests.storage.testStorageBTree',
'neo.tests.storage.testStorage' + os.getenv('NEO_TESTS_ADAPTER', 'SQLite'),
'neo.tests.storage.testVerificationHandler',
'neo.tests.storage.testIdentificationHandler',
'neo.tests.storage.testTransactions',
'neo.tests.storage.testReplicationHandler',
'neo.tests.storage.testReplicator',
'neo.tests.storage.testReplication',
# client application
'neo.tests.client.testClientApp',
'neo.tests.client.testMasterHandler',
......@@ -70,6 +65,7 @@ UNIT_TEST_MODULES = [
'neo.tests.client.testConnectionPool',
# light functional tests
'neo.tests.threaded.test',
'neo.tests.threaded.testReplication',
]
FUNC_TEST_MODULES = [
......
......@@ -113,28 +113,21 @@ class Application(object):
"""Load persistent configuration data from the database.
If data is not present, generate it."""
def NoneOnKeyError(getter):
try:
return getter()
except KeyError:
return None
dm = self.dm
# check cluster name
try:
dm_name = dm.getName()
except KeyError:
name = dm.getName()
if name is None:
dm.setName(self.name)
else:
if dm_name != self.name:
raise RuntimeError('name %r does not match with the '
'database: %r' % (self.name, dm_name))
elif name != self.name:
raise RuntimeError('name %r does not match with the database: %r'
% (self.name, dm_name))
# load configuration
self.uuid = NoneOnKeyError(dm.getUUID)
num_partitions = NoneOnKeyError(dm.getNumPartitions)
num_replicas = NoneOnKeyError(dm.getNumReplicas)
ptid = NoneOnKeyError(dm.getPTID)
self.uuid = dm.getUUID()
num_partitions = dm.getNumPartitions()
num_replicas = dm.getNumReplicas()
ptid = dm.getPTID()
# check partition table configuration
if num_partitions is not None and num_replicas is not None:
......@@ -152,10 +145,7 @@ class Application(object):
def loadPartitionTable(self):
"""Load a partition table from the database."""
try:
ptid = self.dm.getPTID()
except KeyError:
ptid = None
ptid = self.dm.getPTID()
cell_list = self.dm.getPartitionTable()
new_cell_list = []
for offset, uuid, state in cell_list:
......@@ -216,9 +206,7 @@ class Application(object):
except OperationFailure, msg:
neo.lib.logging.error('operation stopped: %s', msg)
except PrimaryFailure, msg:
self.replicator.masterLost()
neo.lib.logging.error('primary master is down: %s', msg)
self.master_node = None
def connectToPrimary(self):
"""Find a primary master node, and connect to it.
......@@ -296,6 +284,7 @@ class Application(object):
neo.lib.logging.info('doing operation')
_poll = self._poll
isIdle = self.em.isIdle
handler = master.MasterOperationHandler(self)
self.master_conn.setHandler(handler)
......@@ -304,16 +293,21 @@ class Application(object):
self.dm.dropUnfinishedData()
self.tm.reset()
while True:
_poll()
if self.replicator.pending():
# Call processDelayedTasks before act, so tasks added in the
# act call are executed after one poll call, so that sent
# packets are already on the network and delayed task
# processing happens in parallel with the same task on the
# other storage node.
self.replicator.processDelayedTasks()
self.replicator.act()
self.task_queue = task_queue = deque()
try:
while True:
while task_queue and isIdle():
try:
task_queue[-1].next()
task_queue.rotate()
except StopIteration:
task_queue.pop()
_poll()
finally:
del self.task_queue
# Abort any replication, whether we are feeding or out-of-date.
for node in self.nm.getStorageList(only_identified=True):
node.getConnection().close()
def wait(self):
# change handler
......@@ -368,6 +362,13 @@ class Application(object):
neo.lib.logging.info(' %r:%r: %r:%r %r %r', key, event.__name__,
_msg_id, _conn, args)
def newTask(self, iterator):
try:
iterator.next()
except StopIteration:
return
self.task_queue.appendleft(iterator)
def shutdown(self, erase=False):
"""Close all connections and exit"""
for c in self.em.getConnectionList():
......
......@@ -15,10 +15,13 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
LOG_QUERIES = False
from neo.lib.exception import DatabaseFailure
from .manager import DatabaseManager
from .sqlite import SQLiteDatabaseManager
DATABASE_MANAGER_DICT = {}
DATABASE_MANAGER_DICT = {'SQLite': SQLiteDatabaseManager}
try:
from .mysqldb import MySQLDatabaseManager
......@@ -27,17 +30,6 @@ except ImportError:
else:
DATABASE_MANAGER_DICT['MySQL'] = MySQLDatabaseManager
try:
from .btree import BTreeDatabaseManager
except ImportError:
pass
else:
# XXX: warning: name might change in the future.
DATABASE_MANAGER_DICT['BTree'] = BTreeDatabaseManager
if not DATABASE_MANAGER_DICT:
raise ImportError('No database back-end available.')
def buildDatabaseManager(name, args=(), kw={}):
if name is None:
name = DATABASE_MANAGER_DICT.keys()[0]
......
This diff is collapsed.
......@@ -18,6 +18,7 @@
import neo.lib
from neo.lib import util
from neo.lib.exception import DatabaseFailure
from neo.lib.protocol import ZERO_TID
class CreationUndone(Exception):
pass
......@@ -37,34 +38,6 @@ class DatabaseManager(object):
"""Called during instanciation, to process database parameter."""
pass
def isUnderTransaction(self):
return self._under_transaction
def begin(self):
"""
Begin a transaction
"""
if self._under_transaction:
raise DatabaseFailure('A transaction has already begun')
self._begin()
self._under_transaction = True
def commit(self):
"""
Commit the current transaction
"""
if not self._under_transaction:
raise DatabaseFailure('The transaction has not begun')
self._commit()
self._under_transaction = False
def rollback(self):
"""
Rollback the current transaction
"""
self._rollback()
self._under_transaction = False
def setup(self, reset = 0):
"""Set up a database
......@@ -79,14 +52,33 @@ class DatabaseManager(object):
"""
raise NotImplementedError
def _begin(self):
raise NotImplementedError
def __enter__(self):
"""
Begin a transaction
"""
if self._under_transaction:
raise DatabaseFailure('A transaction has already begun')
r = self.begin()
self._under_transaction = True
return r
def _commit(self):
raise NotImplementedError
def __exit__(self, exc_type, exc_value, tb):
if not self._under_transaction:
raise DatabaseFailure('The transaction has not begun')
self._under_transaction = False
if exc_type is None:
self.commit()
else:
self.rollback()
def _rollback(self):
raise NotImplementedError
def begin(self):
pass
def commit(self):
pass
def rollback(self):
pass
def _getPartition(self, oid_or_tid):
return oid_or_tid % self.getNumPartitions()
......@@ -104,13 +96,8 @@ class DatabaseManager(object):
if self._under_transaction:
self._setConfiguration(key, value)
else:
self.begin()
try:
with self:
self._setConfiguration(key, value)
except:
self.rollback()
raise
self.commit()
def _setConfiguration(self, key, value):
raise NotImplementedError
......@@ -171,7 +158,9 @@ class DatabaseManager(object):
"""
Load a Partition Table ID from a database.
"""
return long(self.getConfiguration('ptid'))
ptid = self.getConfiguration('ptid')
if ptid is not None:
return long(ptid)
def setPTID(self, ptid):
"""
......@@ -194,18 +183,31 @@ class DatabaseManager(object):
"""
self.setConfiguration('loid', util.dump(loid))
def getBackupTID(self):
return util.bin(self.getConfiguration('backup_tid'))
def setBackupTID(self, backup_tid):
return self.setConfiguration('backup_tid', util.dump(backup_tid))
def getPartitionTable(self):
"""Return a whole partition table as a tuple of rows. Each row
is again a tuple of an offset (row ID), an UUID of a storage
node, and a cell state."""
raise NotImplementedError
def getLastTID(self, all = True):
"""Return the last TID in a database. If all is true,
unfinished transactions must be taken account into. If there
is no TID in the database, return None."""
def _getLastTIDs(self, all=True):
raise NotImplementedError
def getLastTIDs(self, all=True):
trans, obj = self._getLastTIDs()
if trans:
tid = max(trans.itervalues())
if obj:
tid = max(tid, max(obj.itervalues()))
else:
tid = max(obj.itervalues()) if obj else None
return tid, trans, obj
def getUnfinishedTIDList(self):
"""Return a list of unfinished transaction's IDs."""
raise NotImplementedError
......@@ -352,13 +354,8 @@ class DatabaseManager(object):
else:
del refcount[data_id]
if prune:
self.begin()
try:
with self:
self._pruneData(data_id_list)
except:
self.rollback()
raise
self.commit()
__getDataTID = set()
def _getDataTID(self, oid, tid=None, before_tid=None):
......@@ -466,23 +463,24 @@ class DatabaseManager(object):
an oid list"""
raise NotImplementedError
def deleteTransactionsAbove(self, partition, tid, max_tid):
"""Delete all transactions above given TID (inclued) in given
partition, but never above max_tid (in case transactions are committed
during replication)."""
raise NotImplementedError
def deleteObject(self, oid, serial=None):
"""Delete given object. If serial is given, only delete that serial for
given oid."""
raise NotImplementedError
def deleteObjectsAbove(self, partition, oid, serial, max_tid):
"""Delete all objects above given OID and serial (inclued) in given
partition, but never above max_tid (in case objects are stored during
replication)"""
def _deleteRange(self, partition, min_tid=None, max_tid=None):
"""Delete all objects and transactions between given min_tid (excluded)
and max_tid (included)"""
raise NotImplementedError
def truncate(self, tid):
assert tid not in (None, ZERO_TID), tid
with self:
assert self.getBackupTID()
self.setBackupTID(tid)
for partition in xrange(self.getNumPartitions()):
self._deleteRange(partition, tid)
def getTransaction(self, tid, all = False):
"""Return a tuple of the list of OIDs, user information,
a description, and extension information, for a given transaction
......@@ -498,10 +496,10 @@ class DatabaseManager(object):
If there is no such object ID in a database, return None."""
raise NotImplementedError
def getObjectHistoryFrom(self, oid, min_serial, max_serial, length,
partition):
"""Return a dict of length serials grouped by oid at (or above)
min_oid and min_serial and below max_serial, for given partition,
def getReplicationObjectList(self, min_tid, max_tid, length, partition,
min_oid):
"""Return a dict of length oids grouped by serial at (or above)
min_tid and min_oid and below max_tid, for given partition,
sorted in ascending order."""
raise NotImplementedError
......
This diff is collapsed.
This diff is collapsed.
......@@ -18,15 +18,15 @@
import neo
from neo.lib.handler import EventHandler
from neo.lib import protocol
from neo.lib.util import dump
from neo.lib.exception import PrimaryFailure, OperationFailure
from neo.lib.protocol import NodeStates, NodeTypes, Packets, Errors, ZERO_HASH
from neo.lib.protocol import NodeStates, NodeTypes
class BaseMasterHandler(EventHandler):
def connectionLost(self, conn, new_state):
if self.app.listening_conn: # if running
self.app.master_node = None
raise PrimaryFailure('connection lost')
def stopOperation(self, conn):
......@@ -62,44 +62,5 @@ class BaseMasterHandler(EventHandler):
dump(uuid))
self.app.tm.abortFor(uuid)
class BaseClientAndStorageOperationHandler(EventHandler):
""" Accept requests common to client and storage nodes """
def askTransactionInformation(self, conn, tid):
app = self.app
t = app.dm.getTransaction(tid)
if t is None:
p = Errors.TidNotFound('%s does not exist' % dump(tid))
else:
p = Packets.AnswerTransactionInformation(tid, t[1], t[2], t[3],
t[4], t[0])
conn.answer(p)
def _askObject(self, oid, serial, tid):
raise NotImplementedError
def askObject(self, conn, oid, serial, tid):
app = self.app
if self.app.tm.loadLocked(oid):
# Delay the response.
app.queueEvent(self.askObject, conn, (oid, serial, tid))
return
o = self._askObject(oid, serial, tid)
if o is None:
neo.lib.logging.debug('oid = %s does not exist', dump(oid))
p = Errors.OidDoesNotExist(dump(oid))
elif o is False:
neo.lib.logging.debug('oid = %s not found', dump(oid))
p = Errors.OidNotFound(dump(oid))
else:
serial, next_serial, compression, checksum, data, data_serial = o
neo.lib.logging.debug('oid = %s, serial = %s, next_serial = %s',
dump(oid), dump(serial), dump(next_serial))
if checksum is None:
checksum = ZERO_HASH
data = ''
p = Packets.AnswerObject(oid, serial, next_serial,
compression, checksum, data, data_serial)
conn.answer(p)
def answerUnfinishedTransactions(self, conn, *args, **kw):
self.app.replicator.setUnfinishedTIDList(*args, **kw)
......@@ -16,10 +16,10 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import neo.lib
from neo.lib import protocol
from neo.lib.handler import EventHandler
from neo.lib.util import dump, makeChecksum
from neo.lib.protocol import Packets, LockState, Errors, ZERO_HASH
from . import BaseClientAndStorageOperationHandler
from neo.lib.protocol import Packets, LockState, Errors, ProtocolError, \
ZERO_HASH, INVALID_PARTITION
from ..transactions import ConflictError, DelayedError
from ..exception import AlreadyPendingError
import time
......@@ -28,10 +28,40 @@ import time
# Set to None to disable.
SLOW_STORE = 2
class ClientOperationHandler(BaseClientAndStorageOperationHandler):
class ClientOperationHandler(EventHandler):
def _askObject(self, oid, serial, ttid):
return self.app.dm.getObject(oid, serial, ttid)
def askTransactionInformation(self, conn, tid):
t = self.app.dm.getTransaction(tid)
if t is None:
p = Errors.TidNotFound('%s does not exist' % dump(tid))
else:
p = Packets.AnswerTransactionInformation(tid, t[1], t[2], t[3],
t[4], t[0])
conn.answer(p)
def askObject(self, conn, oid, serial, tid):
app = self.app
if app.tm.loadLocked(oid):
# Delay the response.
app.queueEvent(self.askObject, conn, (oid, serial, tid))
return
o = app.dm.getObject(oid, serial, tid)
if o is None:
neo.lib.logging.debug('oid = %s does not exist', dump(oid))
p = Errors.OidDoesNotExist(dump(oid))
elif o is False:
neo.lib.logging.debug('oid = %s not found', dump(oid))
p = Errors.OidNotFound(dump(oid))
else:
serial, next_serial, compression, checksum, data, data_serial = o
neo.lib.logging.debug('oid = %s, serial = %s, next_serial = %s',
dump(oid), dump(serial), dump(next_serial))
if checksum is None:
checksum = ZERO_HASH
data = ''
p = Packets.AnswerObject(oid, serial, next_serial,
compression, checksum, data, data_serial)
conn.answer(p)
def connectionLost(self, conn, new_state):
uuid = conn.getUUID()
......@@ -96,22 +126,18 @@ class ClientOperationHandler(BaseClientAndStorageOperationHandler):
self._askStoreObject(conn, oid, serial, compression, checksum, data,
data_serial, ttid, unlock, time.time())
def askTIDsFrom(self, conn, min_tid, max_tid, length, partition_list):
getReplicationTIDList = self.app.dm.getReplicationTIDList
tid_list = []
extend = tid_list.extend
for partition in partition_list:
extend(getReplicationTIDList(min_tid, max_tid, length, partition))
conn.answer(Packets.AnswerTIDsFrom(tid_list))
def askTIDsFrom(self, conn, min_tid, max_tid, length, partition):
conn.answer(Packets.AnswerTIDsFrom(self.app.dm.getReplicationTIDList(
min_tid, max_tid, length, partition)))
def askTIDs(self, conn, first, last, partition):
# This method is complicated, because I must return TIDs only
# about usable partitions assigned to me.
if first >= last:
raise protocol.ProtocolError('invalid offsets')
raise ProtocolError('invalid offsets')
app = self.app
if partition == protocol.INVALID_PARTITION:
if partition == INVALID_PARTITION:
partition_list = app.pt.getAssignedPartitionList(app.uuid)
else:
partition_list = [partition]
......@@ -149,7 +175,7 @@ class ClientOperationHandler(BaseClientAndStorageOperationHandler):
def askObjectHistory(self, conn, oid, first, last):
if first >= last:
raise protocol.ProtocolError( 'invalid offsets')
raise ProtocolError('invalid offsets')
app = self.app
history_list = app.dm.getObjectHistory(oid, first, last - first)
......
......@@ -21,6 +21,7 @@ from neo.lib.handler import EventHandler
from neo.lib.protocol import NodeTypes, Packets, NotReadyError
from neo.lib.protocol import ProtocolError, BrokenNodeDisallowedError
from neo.lib.util import dump
from .storage import StorageOperationHandler
class IdentificationHandler(EventHandler):
""" Handler used for incoming connections during operation state """
......@@ -35,37 +36,42 @@ class IdentificationHandler(EventHandler):
if not self.app.ready:
raise NotReadyError
app = self.app
node = app.nm.getByUUID(uuid)
# If this node is broken, reject it.
if node is not None and node.isBroken():
raise BrokenNodeDisallowedError
# choose the handler according to the node type
if node_type == NodeTypes.CLIENT:
from .client import ClientOperationHandler
handler = ClientOperationHandler
if node is None:
node = app.nm.createClient(uuid=uuid)
elif node.isConnected():
# cut previous connection
node.getConnection().close()
assert not node.isConnected()
node.setRunning()
elif node_type == NodeTypes.STORAGE:
from .storage import StorageOperationHandler
handler = StorageOperationHandler
if node is None:
neo.lib.logging.error('reject an unknown storage node %s',
dump(uuid))
raise NotReadyError
if uuid is None:
if node_type != NodeTypes.STORAGE:
raise ProtocolError('reject anonymous non-storage node')
handler = StorageOperationHandler(self.app)
conn.setHandler(handler)
else:
raise ProtocolError('reject non-client-or-storage node')
# apply the handler and set up the connection
handler = handler(self.app)
conn.setHandler(handler)
node.setConnection(conn)
args = (NodeTypes.STORAGE, app.uuid, app.pt.getPartitions(),
app.pt.getReplicas(), uuid)
if uuid == app.uuid:
raise ProtocolError("uuid conflict or loopback connection")
node = app.nm.getByUUID(uuid)
# If this node is broken, reject it.
if node is not None and node.isBroken():
raise BrokenNodeDisallowedError
# choose the handler according to the node type
if node_type == NodeTypes.CLIENT:
from .client import ClientOperationHandler
handler = ClientOperationHandler
if node is None:
node = app.nm.createClient(uuid=uuid)
elif node.isConnected():
# cut previous connection
node.getConnection().close()
assert not node.isConnected()
node.setRunning()
elif node_type == NodeTypes.STORAGE:
if node is None:
neo.lib.logging.error('reject an unknown storage node %s',
dump(uuid))
raise NotReadyError
handler = StorageOperationHandler
else:
raise ProtocolError('reject non-client-or-storage node')
# apply the handler and set up the connection
handler = handler(self.app)
conn.setHandler(handler)
node.setConnection(conn, app.uuid < uuid)
# accept the identification and trigger an event
conn.answer(Packets.AcceptIdentification(*args))
conn.answer(Packets.AcceptIdentification(NodeTypes.STORAGE, uuid and
app.uuid, app.pt.getPartitions(), app.pt.getReplicas(), uuid))
handler.connectionCompleted(conn)
......@@ -25,10 +25,6 @@ class InitializationHandler(BaseMasterHandler):
def answerNodeInformation(self, conn):
pass
def notifyNodeInformation(self, conn, node_list):
# the whole node list is received here
BaseMasterHandler.notifyNodeInformation(self, conn, node_list)
def answerPartitionTable(self, conn, ptid, row_list):
app = self.app
pt = app.pt
......@@ -53,8 +49,9 @@ class InitializationHandler(BaseMasterHandler):
app.dm.setPartitionTable(ptid, cell_list)
def answerLastIDs(self, conn, loid, ltid, lptid):
def answerLastIDs(self, conn, loid, ltid, lptid, backup_tid):
self.app.dm.setLastOID(loid)
self.app.dm.setBackupTID(backup_tid)
def notifyPartitionChanges(self, conn, ptid, cell_list):
# XXX: This is safe to ignore those notifications because all of the
......
......@@ -24,11 +24,8 @@ from . import BaseMasterHandler
class MasterOperationHandler(BaseMasterHandler):
""" This handler is used for the primary master """
def answerUnfinishedTransactions(self, conn, max_tid, ttid_list):
self.app.replicator.setUnfinishedTIDList(max_tid, ttid_list)
def notifyTransactionFinished(self, conn, ttid, max_tid):
self.app.replicator.transactionFinished(ttid, max_tid)
def notifyTransactionFinished(self, conn, *args, **kw):
self.app.replicator.transactionFinished(*args, **kw)
def notifyPartitionChanges(self, conn, ptid, cell_list):
"""This is very similar to Send Partition Table, except that
......@@ -44,14 +41,7 @@ class MasterOperationHandler(BaseMasterHandler):
app.dm.changePartitionTable(ptid, cell_list)
# Check changes for replications
if app.replicator is not None:
for offset, uuid, state in cell_list:
if uuid == app.uuid:
# If this is for myself, this can affect replications.
if state == CellStates.DISCARDED:
app.replicator.removePartition(offset)
elif state == CellStates.OUT_OF_DATE:
app.replicator.addPartition(offset)
app.replicator.notifyPartitionChanges(cell_list)
def askLockInformation(self, conn, ttid, tid, oid_list):
if not ttid in self.app.tm:
......@@ -74,3 +64,11 @@ class MasterOperationHandler(BaseMasterHandler):
if not conn.isClosed():
conn.answer(Packets.AnswerPack(True))
def replicate(self, conn, tid, upstream_name, source_dict):
self.app.replicator.backup(tid,
dict((p, (a, upstream_name))
for p, a in source_dict.iteritems()))
def askTruncate(self, conn, tid):
self.app.dm.truncate(tid)
conn.answer(Packets.AnswerTruncate())
This diff is collapsed.
......@@ -15,36 +15,101 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from . import BaseClientAndStorageOperationHandler
from neo.lib.protocol import Packets
import weakref
from functools import wraps
import neo.lib
from neo.lib.connector import ConnectorConnectionClosedException
from neo.lib.handler import EventHandler
from neo.lib.protocol import Errors, NodeStates, Packets, \
ZERO_HASH, ZERO_TID, ZERO_OID
from neo.lib.util import add64, u64
class StorageOperationHandler(BaseClientAndStorageOperationHandler):
def checkConnectionIsReplicatorConnection(func):
def decorator(self, conn, *args, **kw):
assert self.app.replicator.getCurrentConnection() is conn
return func(self, conn, *args, **kw)
return wraps(func)(decorator)
def _askObject(self, oid, serial, tid):
result = self.app.dm.getObject(oid, serial, tid)
if result and result[5]:
return result[:2] + (None, None, None) + result[4:]
return result
class StorageOperationHandler(EventHandler):
"""This class handles events for replications."""
def askLastIDs(self, conn):
app = self.app
oid = app.dm.getLastOID()
tid = app.dm.getLastTID()
conn.answer(Packets.AnswerLastIDs(oid, tid, app.pt.getID()))
def askTIDsFrom(self, conn, min_tid, max_tid, length, partition_list):
assert len(partition_list) == 1, partition_list
tid_list = self.app.dm.getReplicationTIDList(min_tid, max_tid, length,
partition_list[0])
conn.answer(Packets.AnswerTIDsFrom(tid_list))
def askObjectHistoryFrom(self, conn, min_oid, min_serial, max_serial,
length, partition):
object_dict = self.app.dm.getObjectHistoryFrom(min_oid, min_serial,
max_serial, length, partition)
conn.answer(Packets.AnswerObjectHistoryFrom(object_dict))
def connectionLost(self, conn, new_state):
if self.app.listening_conn and conn.isClient():
# XXX: Connection and Node should merged.
uuid = conn.getUUID()
if uuid:
node = self.app.nm.getByUUID(uuid)
else:
node = self.app.nm.getByAddress(conn.getAddress())
node.setState(NodeStates.DOWN)
replicator = self.app.replicator
if replicator.current_node is node:
replicator.abort()
# Client
def connectionFailed(self, conn):
if self.app.listening_conn:
self.app.replicator.abort()
@checkConnectionIsReplicatorConnection
def acceptIdentification(self, conn, node_type,
uuid, num_partitions, num_replicas, your_uuid):
self.app.replicator.fetchTransactions()
@checkConnectionIsReplicatorConnection
def answerFetchTransactions(self, conn, pack_tid, next_tid, tid_list):
if tid_list:
deleteTransaction = self.app.dm.deleteTransaction
for tid in tid_list:
deleteTransaction(tid)
assert not pack_tid, "TODO"
if next_tid:
self.app.replicator.fetchTransactions(next_tid)
else:
self.app.replicator.fetchObjects()
@checkConnectionIsReplicatorConnection
def addTransaction(self, conn, tid, user, desc, ext, packed, oid_list):
# Directly store the transaction.
self.app.dm.storeTransaction(tid, (),
(oid_list, user, desc, ext, packed), False)
@checkConnectionIsReplicatorConnection
def answerFetchObjects(self, conn, pack_tid, next_tid,
next_oid, object_dict):
if object_dict:
deleteObject = self.app.dm.deleteObject
for serial, oid_list in object_dict.iteritems():
for oid in oid_list:
delObject(oid, serial)
assert not pack_tid, "TODO"
if next_tid:
self.app.replicator.fetchObjects(next_tid, next_oid)
else:
self.app.replicator.finish()
@checkConnectionIsReplicatorConnection
def addObject(self, conn, oid, serial, compression,
checksum, data, data_serial):
dm = self.app.dm
if data or checksum != ZERO_HASH:
data_id = dm.storeData(checksum, data, compression)
else:
data_id = None
# Directly store the transaction.
obj = oid, data_id, data_serial
dm.storeTransaction(serial, (obj,), None, False)
@checkConnectionIsReplicatorConnection
def replicationError(self, conn, message):
self.app.replicator.abort('source message: ' + message)
# Server (all methods must set connection as server so that it isn't closed
# if client tasks are finished)
def askCheckTIDRange(self, conn, min_tid, max_tid, length, partition):
conn.asServer()
count, tid_checksum, max_tid = self.app.dm.checkTIDRange(min_tid,
max_tid, length, partition)
conn.answer(Packets.AnswerCheckTIDRange(min_tid, length,
......@@ -52,9 +117,91 @@ class StorageOperationHandler(BaseClientAndStorageOperationHandler):
def askCheckSerialRange(self, conn, min_oid, min_serial, max_tid, length,
partition):
conn.asServer()
count, oid_checksum, max_oid, serial_checksum, max_serial = \
self.app.dm.checkSerialRange(min_oid, min_serial, max_tid, length,
partition)
conn.answer(Packets.AnswerCheckSerialRange(min_oid, min_serial, length,
count, oid_checksum, max_oid, serial_checksum, max_serial))
def askFetchTransactions(self, conn, partition, length, min_tid, max_tid,
tid_list):
app = self.app
cell = app.pt.getCell(partition, app.uuid)
if cell is None or cell.isOutOfDate():
return conn.answer(Errors.ReplicationError(
"partition %u not readable" % partition))
conn.asServer()
msg_id = conn.getPeerId()
conn = weakref.proxy(conn)
peer_tid_set = set(tid_list)
dm = app.dm
tid_list = dm.getReplicationTIDList(min_tid, max_tid, length + 1,
partition)
next_tid = tid_list.pop() if length < len(tid_list) else None
def push():
try:
pack_tid = None # TODO
for tid in tid_list:
if tid in peer_tid_set:
peer_tid_set.remove(tid)
else:
t = dm.getTransaction(tid)
if t is None:
conn.answer(Errors.ReplicationError(
"partition %u dropped" % partition))
return
oid_list, user, desc, ext, packed = t
conn.notify(Packets.AddTransaction(
tid, user, desc, ext, packed, oid_list))
yield
conn.answer(Packets.AnswerFetchTransactions(
pack_tid, next_tid, peer_tid_set), msg_id)
yield
except (weakref.ReferenceError, ConnectorConnectionClosedException):
pass
app.newTask(push())
def askFetchObjects(self, conn, partition, length, min_tid, max_tid,
min_oid, object_dict):
app = self.app
cell = app.pt.getCell(partition, app.uuid)
if cell is None or cell.isOutOfDate():
return conn.answer(Errors.ReplicationError(
"partition %u not readable" % partition))
conn.asServer()
msg_id = conn.getPeerId()
conn = weakref.proxy(conn)
dm = app.dm
object_list = dm.getReplicationObjectList(min_tid, max_tid, length,
partition, min_oid)
if length < len(object_list):
next_tid, next_oid = object_list.pop()
else:
next_tid = next_oid = None
def push():
try:
pack_tid = None # TODO
for serial, oid in object_list:
oid_set = object_dict.get(serial)
if oid_set:
if type(oid_set) is list:
object_dict[serial] = oid_set = set(oid_set)
if oid in oid_set:
oid_set.remove(oid)
if not oid_set:
del object_dict[serial]
continue
object = dm.getObject(oid, serial)
if object is None:
conn.answer(Errors.ReplicationError(
"partition %u dropped" % partition))
return
conn.notify(Packets.AddObject(oid, serial, *object[2:]))
yield
conn.answer(Packets.AnswerFetchObjects(
pack_tid, next_tid, next_oid, object_dict), msg_id)
yield
except (weakref.ReferenceError, ConnectorConnectionClosedException):
pass
app.newTask(push())
......@@ -27,15 +27,11 @@ class VerificationHandler(BaseMasterHandler):
def askLastIDs(self, conn):
app = self.app
try:
oid = app.dm.getLastOID()
except KeyError:
oid = None
try:
tid = app.dm.getLastTID()
except KeyError:
tid = None
conn.answer(Packets.AnswerLastIDs(oid, tid, app.pt.getID()))
conn.answer(Packets.AnswerLastIDs(
app.dm.getLastOID(),
app.dm.getLastTIDs()[0],
app.pt.getID(),
app.dm.getBackupTID()))
def askPartitionTable(self, conn):
pt = self.app.pt
......
This diff is collapsed.
......@@ -131,6 +131,11 @@ class NeoTestBase(unittest.TestCase):
sys.stdout.write('\n')
sys.stdout.flush()
class failureException(AssertionError):
def __init__(self, msg=None):
neo.lib.logging.error(msg)
AssertionError.__init__(self, msg)
failIfEqual = failUnlessEqual = assertEquals = assertNotEquals = None
def assertNotEqual(self, first, second, msg=None):
......
This diff is collapsed.
......@@ -234,6 +234,9 @@ class ClientTests(NEOFunctionalTest):
temp_dir=self.getTempDirectory())
neoctl = self.neo.getNEOCTL()
self.neo.start()
# BUG: The following 2 lines creates 2 app, i.e. 2 TCP connections
# to the storage, so there may be a race condition at network
# level and 'st2.store' may be effective before 'st1.store'.
db1, conn1 = self.neo.getZODBConnection()
db2, conn2 = self.neo.getZODBConnection()
st1, st2 = conn1._storage, conn2._storage
......
......@@ -35,7 +35,7 @@ class ClusterTests(NEOFunctionalTest):
def testClusterStartup(self):
neo = NEOCluster(['test_neo1', 'test_neo2'], replicas=1,
adapter='MySQL', temp_dir=self.getTempDirectory())
temp_dir=self.getTempDirectory())
neoctl = neo.getNEOCTL()
neo.run()
# Runing a new cluster doesn't exit Recovery state.
......
This diff is collapsed.
......@@ -85,7 +85,7 @@ class MasterRecoveryTests(NeoUnitTestBase):
self.assertTrue(ptid2 > self.app.pt.getID())
self.assertTrue(oid2 > self.app.tm.getLastOID())
self.assertTrue(tid2 > self.app.tm.getLastTID())
recovery.answerLastIDs(conn, oid2, tid2, ptid2)
recovery.answerLastIDs(conn, oid2, tid2, ptid2, None)
self.assertEqual(oid2, self.app.tm.getLastOID())
self.assertEqual(tid2, self.app.tm.getLastTID())
self.assertEqual(ptid2, recovery.target_ptid)
......
......@@ -130,10 +130,11 @@ class MasterStorageHandlerTests(NeoUnitTestBase):
self.app.tm.setLastTID(tid)
service.askLastIDs(conn)
packet = self.checkAnswerLastIDs(conn)
loid, ltid, lptid = packet.decode()
loid, ltid, lptid, backup_tid = packet.decode()
self.assertEqual(loid, oid)
self.assertEqual(ltid, tid)
self.assertEqual(lptid, ptid)
self.assertEqual(backup_tid, None)
def test_13_askUnfinishedTransactions(self):
service = self.service
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -26,8 +26,7 @@ from neo.storage.transactions import TransactionManager, \
DelayedError, ConflictError
from neo.lib.connection import MTClientConnection
from neo.lib.protocol import NodeStates, Packets, ZERO_TID
from . import NEOCluster, NEOThreadedTest, \
Patch, ConnectionFilter
from . import NEOCluster, NEOThreadedTest, Patch
from neo.lib.util import makeChecksum
from neo.client.pool import CELL_CONNECTED, CELL_GOOD
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment