Commit fb5d9b99 authored by Kirill Smelkov's avatar Kirill Smelkov

.

parent f1486cb0
...@@ -11,3 +11,29 @@ func (e *Error) Error() string { ...@@ -11,3 +11,29 @@ func (e *Error) Error() string {
} }
return s return s
} }
// nodeTypeChar maps the node type code stored in bits 6..4 of a NodeID's
// high-order byte to the one-character prefix used by NodeID.String.
// Codes without a dedicated letter are printed as their digit ('4'..'7').
const nodeTypeChar = "MSCA4567"

// String returns a short human-readable form of nid, e.g. "S1" or "M2":
// a one-character node type prefix followed by the node number taken from
// the three low-order bytes.
//
// Temporary node IDs (bit 7 of the high-order byte set, i.e. negative
// values) are printed in lower case, e.g. "s1" or "m2".
// The zero NodeID is printed as "?0".
func (nid NodeID) String() string {
	if nid == 0 {
		return "?0"
	}

	typ := nid >> 24         // high-order byte (sign-extended for negative nid)
	num := nid & (1<<24 - 1) // node number: the three low-order bytes

	temp := typ&(1<<7) != 0 // bit 7 marks a temporary node ID
	typ &= 1<<7 - 1         // drop the temporary bit (and the sign extension)
	nodeType := NodeType(typ >> 4)

	// nodeType is always in 0..7 here, so indexing nodeTypeChar is safe.
	s := fmt.Sprintf("%c%d", nodeTypeChar[nodeType], num)

	// 's1', 'm2', ... for temporary node IDs
	if temp {
		s = strings.ToLower(s)
	}
	return s
}
...@@ -43,13 +43,34 @@ const ( ...@@ -43,13 +43,34 @@ const (
type ClusterState int32 type ClusterState int32
const ( const (
// NOTE cluster states descriptions is in protocol.py // Once the primary master is elected, the cluster has a state, which is
// initially RECOVERING, during which the master:
// - first recovers its own data by reading it from storage nodes;
// - waits for the partition table to be operational;
// - automatically switches to VERIFYING if the cluster can be safely started.
// Whenever the partition table becomes non-operational again, the cluster
// goes back to this state.
RECOVERING ClusterState = iota RECOVERING ClusterState = iota
// Transient state, used to:
// - replay the transaction log, in case of unclean shutdown;
// - and actually truncate the DB if the user asked to do so.
// Then, the cluster either goes to RUNNING or STARTING_BACKUP state.
VERIFYING VERIFYING
// Normal operation. The DB is read-writable by clients.
CLUSTER_RUNNING // XXX conflict with NodeState.RUNNING CLUSTER_RUNNING // XXX conflict with NodeState.RUNNING
// Transient state to shutdown the whole cluster.
STOPPING STOPPING
// Transient state, during which the master (re)connects to the upstream
// master.
STARTING_BACKUP STARTING_BACKUP
// Backup operation. The master is notified of new transactions thanks to
// invalidations and orders storage nodes to fetch them from upstream.
// Because cells are synchronized independently, the DB is often
// inconsistent.
BACKINGUP BACKINGUP
// Transient state, when the user decides to go back to RUNNING state.
// The master stays in this state until the DB is consistent again.
// In case of failure, the cluster will go back to backup mode.
STOPPING_BACKUP STOPPING_BACKUP
) )
...@@ -74,18 +95,40 @@ const ( ...@@ -74,18 +95,40 @@ const (
type CellState int32 type CellState int32
const ( const (
// NOTE cell states description is in protocol.py // Normal state: cell is writable/readable, and it isn't planned to drop it.
UP_TO_DATE CellState = iota //short: U // XXX tag prefix name ? UP_TO_DATE CellState = iota //short: U // XXX tag prefix name ?
// Write-only cell. Last transactions are missing because storage is/was down
// for a while, or because it is new for the partition. It usually becomes
// UP_TO_DATE when replication is done.
OUT_OF_DATE //short: O OUT_OF_DATE //short: O
// Same as UP_TO_DATE, except that it will be discarded as soon as another
// node finishes replicating it. It means a partition is moved from one node
// to another.
FEEDING //short: F FEEDING //short: F
// Not really a state: only used in network packets to tell storages to drop
// partitions.
DISCARDED //short: D DISCARDED //short: D
// A check revealed that data differs from other replicas. Cell is neither
// readable nor writable.
CORRUPTED //short: C CORRUPTED //short: C
) )
// An UUID (node identifier, 4-bytes signed integer) // NodeID is a node identifier, 4-bytes signed integer
type UUID int32 //
// High-order byte:
// 7 6 5 4 3 2 1 0
// | | | | +-+-+-+-- reserved (0)
// | +-+-+---------- node type
// +---------------- temporary if negative
// UUID namespaces are required to prevent conflicts when the master generates
// a new uuid before it knows the uuids of existing storage nodes. So only the
// high-order bit is really important and the 31 other bits could be random.
// Extra namespace information and non-randomness of the 3 low-order bytes help
// to read logs.
//
// XXX was UUID in py
type NodeID int32
// TODO UUID_NAMESPACES // TODO NodeType -> base NodeID
var ErrDecodeOverflow = errors.New("decode: bufer overflow") var ErrDecodeOverflow = errors.New("decode: bufer overflow")
...@@ -183,14 +226,14 @@ func float64_NEODecode(b []byte) float64 { ...@@ -183,14 +226,14 @@ func float64_NEODecode(b []byte) float64 {
type NodeInfo struct { type NodeInfo struct {
NodeType NodeType
Address Address
UUID NodeID
NodeState NodeState
IdTimestamp float64 IdTimestamp float64
} }
//type CellList []struct { //type CellList []struct {
type CellInfo struct { type CellInfo struct {
UUID NodeID
CellState CellState
} }
...@@ -246,7 +289,7 @@ type CloseClient struct { ...@@ -246,7 +289,7 @@ type CloseClient struct {
type RequestIdentification struct { type RequestIdentification struct {
ProtocolVersion uint32 // TODO py.PProtocol upon decoding checks for != PROTOCOL_VERSION ProtocolVersion uint32 // TODO py.PProtocol upon decoding checks for != PROTOCOL_VERSION
NodeType NodeType // XXX name NodeType NodeType // XXX name
UUID UUID NodeID NodeID
Address Address // where requesting node is also accepting connections Address Address // where requesting node is also accepting connections
Name string Name string
IdTimestamp float64 IdTimestamp float64
...@@ -255,14 +298,14 @@ type RequestIdentification struct { ...@@ -255,14 +298,14 @@ type RequestIdentification struct {
// XXX -> ReplyIdentification? RequestIdentification.Answer somehow ? // XXX -> ReplyIdentification? RequestIdentification.Answer somehow ?
type AcceptIdentification struct { type AcceptIdentification struct {
NodeType NodeType // XXX name NodeType NodeType // XXX name
MyUUID UUID MyNodeID NodeID
NumPartitions uint32 // PNumber NumPartitions uint32 // PNumber
NumReplicas uint32 // PNumber NumReplicas uint32 // PNumber
YourUUID UUID YourNodeID NodeID
Primary Address Primary Address
KnownMasterList []struct { KnownMasterList []struct {
Address Address
UUID UUID NodeID NodeID
} }
} }
...@@ -271,7 +314,7 @@ type PrimaryMaster struct { ...@@ -271,7 +314,7 @@ type PrimaryMaster struct {
} }
type AnswerPrimary struct { type AnswerPrimary struct {
PrimaryUUID UUID PrimaryNodeID NodeID
} }
// Announce a primary master node election. PM -> SM. // Announce a primary master node election. PM -> SM.
...@@ -326,7 +369,7 @@ type PartitionChanges struct { ...@@ -326,7 +369,7 @@ type PartitionChanges struct {
CellList []struct { CellList []struct {
// XXX does below correlate with Cell inside top-level CellList ? // XXX does below correlate with Cell inside top-level CellList ?
Offset uint32 // PNumber Offset uint32 // PNumber
UUID UUID NodeID NodeID
CellState CellState CellState CellState
} }
} }
...@@ -399,7 +442,7 @@ type AnswerBeginTransaction struct { ...@@ -399,7 +442,7 @@ type AnswerBeginTransaction struct {
// True is returned if it's still possible to finish the transaction. // True is returned if it's still possible to finish the transaction.
type FailedVote struct { type FailedVote struct {
Tid zodb.Tid Tid zodb.Tid
UUIDList []UUID NodeList []NodeID
// XXX _answer = Error // XXX _answer = Error
} }
...@@ -514,7 +557,7 @@ type AnswerStoreObject struct { ...@@ -514,7 +557,7 @@ type AnswerStoreObject struct {
// Abort a transaction. C -> S and C -> PM -> S. // Abort a transaction. C -> S and C -> PM -> S.
type AbortTransaction struct { type AbortTransaction struct {
Tid zodb.Tid Tid zodb.Tid
UUIDList []UUID // unused for * -> S NodeList []NodeID // unused for * -> S
} }
// Ask to store a transaction. C -> S. // Ask to store a transaction. C -> S.
...@@ -623,7 +666,7 @@ type AnswerObjectHistory struct { ...@@ -623,7 +666,7 @@ type AnswerObjectHistory struct {
type PartitionList struct { type PartitionList struct {
MinOffset uint32 // PNumber MinOffset uint32 // PNumber
MaxOffset uint32 // PNumber MaxOffset uint32 // PNumber
UUID UUID NodeID NodeID
} }
type AnswerPartitionList struct { type AnswerPartitionList struct {
...@@ -643,7 +686,7 @@ type AnswerNodeList struct { ...@@ -643,7 +686,7 @@ type AnswerNodeList struct {
// Set the node state // Set the node state
type SetNodeState struct { type SetNodeState struct {
UUID NodeID
NodeState NodeState
// XXX _answer = Error ? // XXX _answer = Error ?
...@@ -651,14 +694,14 @@ type SetNodeState struct { ...@@ -651,14 +694,14 @@ type SetNodeState struct {
// Ask the primary to include some pending node in the partition table // Ask the primary to include some pending node in the partition table
type AddPendingNodes struct { type AddPendingNodes struct {
UUIDList []UUID NodeList []NodeID
// XXX _answer = Error // XXX _answer = Error
} }
// Ask the primary to optimize the partition table. A -> PM. // Ask the primary to optimize the partition table. A -> PM.
type TweakPartitionTable struct { type TweakPartitionTable struct {
UUIDList []UUID NodeList []NodeID
// XXX _answer = Error // XXX _answer = Error
} }
...@@ -691,7 +734,7 @@ type repairFlags struct { ...@@ -691,7 +734,7 @@ type repairFlags struct {
// Ask storage nodes to repair their databases. ctl -> A -> M // Ask storage nodes to repair their databases. ctl -> A -> M
type Repair struct { type Repair struct {
UUIDList []UUID NodeList []NodeID
repairFlags repairFlags
} }
...@@ -779,7 +822,7 @@ type AnswerPack struct { ...@@ -779,7 +822,7 @@ type AnswerPack struct {
// ctl -> A // ctl -> A
// A -> M // A -> M
type CheckReplicas struct { type CheckReplicas struct {
PartitionDict map[uint32]UUID // partition -> source (PNumber) PartitionDict map[uint32]NodeID // partition -> source (PNumber)
MinTID zodb.Tid MinTID zodb.Tid
MaxTID zodb.Tid MaxTID zodb.Tid
...@@ -846,7 +889,7 @@ type AnswerCheckSerialRange struct { ...@@ -846,7 +889,7 @@ type AnswerCheckSerialRange struct {
// S -> M // S -> M
type PartitionCorrupted struct { type PartitionCorrupted struct {
Partition uint32 // PNumber Partition uint32 // PNumber
CellList []UUID CellList []NodeID
} }
......
...@@ -104,7 +104,6 @@ def ClusterStates(): ...@@ -104,7 +104,6 @@ def ClusterStates():
# invalidations and orders storage nodes to fetch them from upstream. # invalidations and orders storage nodes to fetch them from upstream.
# Because cells are synchronized independently, the DB is often # Because cells are synchronized independently, the DB is often
# inconsistent. # inconsistent.
# TODO: allow clients to connect for read-only operations
BACKINGUP BACKINGUP
# Transient state, when the user decides to go back to RUNNING state. # Transient state, when the user decides to go back to RUNNING state.
# The master stays in this state until the DB is consistent again. # The master stays in this state until the DB is consistent again.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment