neoppod (Levin Zimmermann) · commit 48e38f91

Commit 48e38f91, authored 7 years ago by Kirill Smelkov.

Commit message: .

Parent: 2c0345b9
Showing 3 changed files with 108 additions and 67 deletions (+108, -67):

    go/neo/connection.go       +1   -1
    go/neo/server/master.go    +94  -53
    go/neo/server/storage.go   +13  -13
go/neo/connection.go

@@ -949,7 +949,7 @@ func (c *Conn) Expect(msgv ...Msg) (which int, err error) {
 	return -1, &ConnError{c, "recv", fmt.Errorf("unexpected message: %v", msgType)}
 }
 
-// Ask sends request and receives response
+// Ask sends request and receives response.
 // It expects response to be exactly of resp type and errors otherwise
 // XXX clarify error semantic (when Error is decoded)
 // XXX do the same as Expect wrt respv ?
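
For context: Conn.Expect, named in the hunk header above, waits for one message and reports which of the allowed message types arrived, returning a ConnError such as the "unexpected message" one visible in the context line. Below is a minimal, self-contained sketch of that expect-one-of pattern; the Msg/Ping/Pong types and the expect helper are illustrative stand-ins, not the NEO API.

package main

import (
	"fmt"
	"reflect"
)

// Msg is a stand-in for a protocol message (assumption, not the NEO type).
type Msg interface{}

type Ping struct{}
type Pong struct{}

// expect returns the index of the entry in msgv whose concrete type matches
// got, or -1 and an error for an unexpected message.
func expect(got Msg, msgv ...Msg) (int, error) {
	for i, want := range msgv {
		if reflect.TypeOf(got) == reflect.TypeOf(want) {
			return i, nil
		}
	}
	return -1, fmt.Errorf("unexpected message: %T", got)
}

func main() {
	which, err := expect(Pong{}, Ping{}, Pong{})
	fmt.Println(which, err) // -> 1 <nil>

	which, err = expect("garbage", Ping{}, Pong{})
	fmt.Println(which, err) // -> -1 unexpected message: string
}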
go/neo/server/master.go

@@ -27,6 +27,8 @@ import (
 //	"math"
 	"sync"
 
+	"github.com/pkg/errors"
+
 	"lab.nexedi.com/kirr/neo/go/neo"
 	"lab.nexedi.com/kirr/neo/go/zodb"
 	"lab.nexedi.com/kirr/neo/go/xcommon/xnet"

@@ -150,6 +152,7 @@ func (m *Master) Run(ctx context.Context) error {
 	}
 
 	m.node.MasterAddr = l.Addr().String()
+	m.logf("serving on %s ...", m.node.MasterAddr)
 
 	// accept incoming connections and pass them to main driver
 	wg := sync.WaitGroup{}

@@ -204,7 +207,7 @@ func (m *Master) runMain(ctx context.Context) (err error) {
 		// a command came to us to start the cluster.
 		err := m.recovery(ctx)
 		if err != nil {
-			fmt.Println(err)
+			m.log(err)
 			return err // recovery cancelled
 		}

@@ -212,14 +215,14 @@ func (m *Master) runMain(ctx context.Context) (err error) {
 		// case previously it was unclean shutdown.
 		err = m.verify(ctx)
 		if err != nil {
-			fmt.Println(err)
+			m.log(err)
 			continue // -> recovery
 		}
 
 		// provide service as long as partition table stays operational
 		err = m.service(ctx)
 		if err != nil {
-			fmt.Println(err)
+			m.log(err)
 			continue // -> recovery
 		}

@@ -255,7 +258,7 @@ type storRecovery struct {
 // - nil:  recovery was ok and a command came for cluster to start
 // - !nil: recovery was cancelled
 func (m *Master) recovery(ctx context.Context) (err error) {
-	fmt.Println("master: recovery")
+	m.log("recovery")
 	defer xerr.Context(&err, "master: recovery")
 
 	m.setClusterState(neo.ClusterRecovering)

@@ -282,7 +285,7 @@ loop:
 		select {
 		// new connection comes in
 		case n := <-m.nodeCome:
-			node, resp := m.accept(n, /* XXX only accept storages -> PENDING */)
+			node, resp := m.identify(n, /* XXX only accept storages -> PENDING */)
 
 			// if new storage arrived - start recovery on it too
 			wg.Add(1)

@@ -294,10 +297,10 @@ loop:
 					return
 				}
 
-				err := m.welcome(n.conn, resp)
+				err := m.accept(n.conn, resp)
 				if err != nil {
 					// XXX better m.nodeLeave <- nodeLeave{node, err} ?
-					// XXX move this m.nodeLeave <- to welcome() ?
+					// XXX move this m.nodeLeave <- to accept() ?
 					recovery <- storRecovery{node: node, err: err}
 				}

@@ -305,36 +308,47 @@ loop:
 				storCtlRecovery(rctx, node, recovery)
 			}()
 
+		// XXX calrify who sends here
 		/*
 		case n := <-m.nodeLeave:
-			// XXX close n.node.Link ?
+			// TODO close n.node.Link
+			//
+			// XXX dup wrt recovery err != nil -> func
 			m.nodeTab.SetNodeState(n.node, DOWN)
 
 			// XXX update something indicating cluster currently can be operational or not ?
 		*/
 
 		// a storage node came through recovery - let's see whether
 		// ptid ↑ and if so we should take partition table from there
 		case r := <-recovery:
 			// inprogress--
 			if r.err != nil {
 				// XXX err ctx?
 				// XXX log here or in producer?
-				fmt.Printf("master: %v\n", r.err)
+				m.logf("%v", r.err)
 
-				// XXX close stor link / update .nodeTab
-				break
-			}
+				// XXX only if not cancelled
+				m.logf("master: %v: closing link", r.node.Link)
 
-			// we are interested in latest partTab
-			// NOTE during recovery no one must be subscribed to
-			// partTab so it is ok to simply change whole m.partTab
-			if r.partTab.PTid > m.partTab.PTid {
-				m.partTab = r.partTab
-			}
+				// close stor link / update .nodeTab
+				err := r.node.Link.Close()
+				if err != nil {
+					m.logf("master: %v\n", r.node.Link)
+				}
+
+				m.nodeTab.SetNodeState(r.node, DOWN)
+
+			} else {
+				// we are interested in latest partTab
+				// NOTE during recovery no one must be subscribed to
+				// partTab so it is ok to simply change whole m.partTab
+				if r.partTab.PTid > m.partTab.PTid {
+					m.partTab = r.partTab
+				}
+			}
 
-			// XXX update something indicating cluster currently can be operational or not
+			// XXX handle case of new cluster - when no storage reports valid parttab
+			// XXX -> create new parttab
+			// XXX update something indicating cluster currently can be operational or not ?
 
 		// request to start the cluster - if ok we exit replying ok

@@ -365,12 +379,7 @@ loop:
 		}
 	}
 
-//	// consume left recovery responses (which should come without delay since it was cancelled)
-//	for ; inprogress > 0; inprogress-- {
-//		<-recovery
-//	}
-
-	// wait all workers finish (which should come without delay since it was cancelled)
+	// wait all workers to finish (which should come without delay since it was cancelled)
 	done := make(chan struct{})
 	go func() {
 		wg.Wait()

@@ -379,7 +388,18 @@ loop:
 	for {
 		select {
+		// XXX <-m.nodeLeave ?
 		case <-recovery:
+			// we do not care errors here - they are either cancelled or IO errors
+			// we just log them and return - in case it is IO error
+			// on link it will be caught on next send/recv XXX
+			switch errors.Cause(r.err) {
+			case context.Canceled, context.DeadlineExceeded:
+				// ok
+			default:
+				// XXX not so ok
+			}
+
 		case <-done:
 			break
 		}
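
The two hunks above replace the commented-out "consume left recovery responses" loop with a more explicit shutdown sequence: wg.Wait is converted into a done channel so it can participate in a select, and pending results are drained while waiting, with errors classified through errors.Cause from the newly imported github.com/pkg/errors. A self-contained sketch of that pattern under hypothetical worker/result names (not the NEO code itself):

package main

import (
	"context"
	"fmt"
	"sync"

	"github.com/pkg/errors"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	results := make(chan error)

	// spawn a few workers that stop when ctx is cancelled
	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			<-ctx.Done()
			// wrap the cancellation, as a worker wrapping its I/O errors would
			results <- errors.Wrap(ctx.Err(), "worker stopped")
		}()
	}

	cancel() // the main loop decided to stop

	// convert wg.Wait into a channel so it can sit in a select
	done := make(chan struct{})
	go func() {
		wg.Wait()
		close(done)
	}()

	// drain worker results until every worker has finished
loop:
	for {
		select {
		case err := <-results:
			switch errors.Cause(err) {
			case context.Canceled, context.DeadlineExceeded:
				// ok - we cancelled the workers ourselves
			default:
				fmt.Println("unexpected error:", err)
			}
		case <-done:
			break loop
		}
	}
	fmt.Println("all workers finished")
}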

@@ -392,29 +412,25 @@ loop:
 // it retrieves various ids and partition table from as stored on the storage
 func storCtlRecovery(ctx context.Context, stor *neo.Node, res chan storRecovery) {
 	var err error
 	// XXX where this close link on error should be ?
 	defer func() {
 		if err == nil {
 			return
 		}
 
-		// XXX on err still provide feedback to storRecovery chan ?
-		/*
-		fmt.Printf("master: %v", err)
-
-		// this must interrupt everything connected to stor node and
-		// thus eventually result in nodeLeave event to main driver
-		link.Close()
-		*/
+		// on error provide feedback to storRecovery chan
+		res <- storRecovery{node: stor, err: err}
 	}()
 	defer xerr.Contextf(&err, "%s: stor recovery", stor.link)
 
-	conn, err := stor.link.NewConn()
+	// FIXME bad -> not bad
+	conn, err := stor.link.NewConn()
 	if err != nil {
 		return
 	}
 	defer func() {
 		err2 := conn.Close()
 		err = xerr.First(err, err2)
 	}()
 
 	// XXX cancel on ctx
 	recovery := neo.AnswerRecovery{}
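
One detail of storCtlRecovery worth noting: instead of the removed commented-out fmt.Printf/link.Close block, the worker now reports its own failure back to the coordinating goroutine by sending on the result channel from a deferred function, so every early return after err is set still yields exactly one message. A self-contained sketch of that pattern with hypothetical names:

package main

import (
	"errors"
	"fmt"
)

// result carries either a payload or the worker's error back to the coordinator.
type result struct {
	value int
	err   error
}

// work reports failures through res from a deferred func, so every early
// return after setting err still produces exactly one message.
func work(res chan result) {
	var err error
	defer func() {
		if err != nil {
			res <- result{err: err}
		}
	}()

	err = errors.New("storage unreachable") // simulate a failure
	if err != nil {
		return // the deferred func delivers the error
	}

	res <- result{value: 42}
}

func main() {
	res := make(chan result, 1)
	go work(res)
	r := <-res
	fmt.Println(r.value, r.err) // -> 0 storage unreachable
}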

@@ -475,7 +491,7 @@ var errClusterDegraded = errors.New("cluster became non-operatonal")
 //
 // prerequisite for start: .partTab is operational wrt .nodeTab
 func (m *Master) verify(ctx context.Context) (err error) {
-	fmt.Println("master: verify")
+	m.log("verify")
 	defer xerr.Context(&err, "master: verify")
 
 	m.setClusterState(neo.ClusterVerifying)

@@ -503,7 +519,7 @@ loop:
 	for inprogress > 0 {
 		select {
 		case n := <-m.nodeCome:
-			node, ok := m.accept(n, /* XXX only accept storages -> known ? RUNNING : PENDING */)
+			node, ok := m.identify(n, /* XXX only accept storages -> known ? RUNNING : PENDING */)
 			if !ok {
 				break
 			}

@@ -531,7 +547,7 @@ loop:
 			inprogress--
 
 			if v.err != nil {
-				fmt.Printf("master: verify: %v\n", v.err)
+				m.logf("verify: %v", v.err)
 
 				// mark storage as non-working in nodeTab
 				// FIXME better -> v.node.setState(DOWN) ?

@@ -644,7 +660,7 @@ func storCtlVerify(ctx context.Context, link *neo.NodeLink, res chan storVerify)
 //
 // prerequisite for start: .partTab is operational wrt .nodeTab and verification passed
 func (m *Master) service(ctx context.Context) (err error) {
-	fmt.Println("master: service")
+	m.log("service")
 	defer xerr.Context(&err, "master: service")
 
 	// XXX we also need to tell storages StartOperation first

@@ -657,7 +673,7 @@ loop:
 		select {
 		// a node connected and requests identification
 		case n := <-m.nodeCome:
-			node, resp, ok := m.accept(n, /* XXX accept everyone */)
+			node, resp, ok := m.identify(n, /* XXX accept everyone */)
 
 			state := m.ClusterState

@@ -671,7 +687,7 @@ loop:
 				return
 			}
 
-			err = welcome(n.conn, resp)
+			err = accept(n.conn, resp)
 			if err {
 				// XXX
 			}

@@ -702,7 +718,7 @@ loop:
 			if r.err != nil {
 				// XXX err ctx?
 				// XXX log here or in producer?
-				fmt.Printf("master: %v\n", r.err)
+				m.logf("%v", r.err)
 
 				// TODO close stor link / update .nodeTab
 				break

@@ -740,7 +756,7 @@ loop:
 		// XXX was "stop verification if so"
 		case v := <-verify:
 			if v.err != nil {
-				fmt.Printf("master: verify: %v\n", v.err)
+				m.logf("verify: %v", v.err)
 
 				// mark storage as non-working in nodeTab
 				// FIXME better -> v.node.setState(DOWN) ?

@@ -819,11 +835,11 @@ loop:
 	return err
 }
 
-// accept processes identification request of just connected node and either accepts or declines it.
-// If node identification is accepted nodeTab is updated and corresponding node entry is returned.
+// identify processes identification request of just connected node and either accepts or declines it.
+// If node identification is accepted .nodeTab is updated and corresponding node entry is returned.
 // Response message is constructed but not send back not to block the caller - it is
 // the caller responsibility to send the response to node which requested identification.
-func (m *Master) accept(n nodeCome) (node *neo.Node, resp neo.Msg) {
+func (m *Master) identify(n nodeCome) (node *neo.Node, resp neo.Msg) {
 	// TODO log node accept/rejected
 	// XXX also verify ? :
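
The renamed identify helper, per its doc comment, only validates the request and builds the response; actually sending the (possibly blocking) reply is left to the caller, which here does it from a per-node goroutine via the renamed accept. A self-contained sketch of that validate-then-let-the-caller-send split, with hypothetical request/response/entry types:

package main

import "fmt"

// Hypothetical stand-ins, not the NEO types.
type request struct{ uuid string }
type response struct{ text string }
type entry struct{ uuid string }

// identify validates the request and builds the response, but does not send it,
// so the possibly blocking send stays on the caller's side.
func identify(req request) (*entry, response) {
	if req.uuid == "" {
		return nil, response{text: "rejected: empty uuid"}
	}
	return &entry{uuid: req.uuid}, response{text: "accepted " + req.uuid}
}

func main() {
	node, resp := identify(request{uuid: "S1"})

	// the caller decides when and where to deliver the response,
	// e.g. from a separate goroutine so identify itself never blocks
	replies := make(chan response, 1)
	go func() { replies <- resp }()

	fmt.Println(node, (<-replies).text) // -> &{S1} accepted S1
}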

@@ -899,8 +915,10 @@ func (m *Master) reject(conn *neo.Conn, resp neo.Msg) {
 	log xerr.Merge(err1, err2, err3)
 }
 
-// welcome sends acceptive identification response and closes conn
-func (m *Master) welcome(conn *neo.Conn, resp neo.Msg) error {
+// accept sends acceptive identification response and closes conn
+// XXX if problem -> .nodeLeave
+// XXX spawn ping goroutine from here?
+func (m *Master) accept(conn *neo.Conn, resp neo.Msg) error {
 	// XXX cancel on ctx
 	err1 := conn.Send(resp)
 	err2 := conn.Close()

@@ -930,7 +948,7 @@ func (m *Master) allocUUID(nodeType neo.NodeType) neo.NodeUUID {
 // XXX +error return?
 func (m *Master) ServeLink(ctx context.Context, link *neo.NodeLink) {
 	logf := func(format string, argv ...interface{}) {
-		fmt.Printf("master: %s: " + format + "\n", append([]interface{}{link}, argv...)...)
+		m.logf("%s: " + format + "\n", append([]interface{}{link}, argv...)...)
 	}
 
 	logf("serving new node")

@@ -1236,3 +1254,26 @@ func (m *Master) ServeAdmin(ctx context.Context, conn *neo.Conn) {
 func (m *Master) ServeMaster(ctx context.Context, conn *neo.Conn) {
 	// TODO (for elections)
 }
+
+// ---- utils ----
+
+// -> Logger
+//func (m *Master) log(v interface{}) {
+//	XXX
+//}
+
+func (m *Master) logf(format string, argv ...interface{}) {
+	// TODO support custom log.Logger
+	log.Output(2, fmt.Sprintf("master @%v: " + format, append([]string{m.MasterAddr}, argv...)...))
+}
+
+func (m *Master) vlogf(format string, argv ...interface{}) {
+	// XXX -> verbose settings
+	if false {
+		return
+	}
+
+	m.log(format, argv)
+}
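
The new Master.logf/vlogf helpers funnel everything through the standard library logger using log.Output with calldepth 2, so that when the log flags include file/line information the reported location is logf's caller rather than logf itself. A minimal, self-contained sketch of that prefix-logger pattern (the server type and addr field are illustrative, not the NEO ones):

package main

import (
	"fmt"
	"log"
)

// server is a hypothetical component that wants all its log lines prefixed.
type server struct {
	addr string
}

// logf prefixes the message and uses calldepth 2 so that, with log.Lshortfile
// set, the logged location is logf's caller, not logf itself.
func (s *server) logf(format string, argv ...interface{}) {
	log.Output(2, fmt.Sprintf("master @%v: "+format, append([]interface{}{s.addr}, argv...)...))
}

func main() {
	log.SetFlags(log.Lshortfile)
	s := &server{addr: "127.0.0.1:5551"}
	s.logf("serving on %s ...", s.addr)
	// output (file/line will vary): main.go:23: master @127.0.0.1:5551: serving on 127.0.0.1:5551 ...
}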
go/neo/server/storage.go

@@ -143,9 +143,9 @@ func (stor *Storage) talkMaster(ctx context.Context) error {
 	// XXX errctx
 
 	for {
-		fmt.Printf("stor: master(%v): connecting\n", stor.node.MasterAddr)	// XXX info
+		stor.vlogf("master(%v): connecting ...", stor.node.MasterAddr)
 
 		err := stor.talkMaster1(ctx)
-		fmt.Printf("stor: master(%v): %v\n", stor.node.MasterAddr, err)
+		stor.logf("master(%v): %v", stor.node.MasterAddr, err)
 
 		// TODO if err = shutdown -> return
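
As the context lines show, talkMaster is a retry loop: run one session with the master (talkMaster1), log the error it ended with, and try again until shutdown. A self-contained sketch of that loop shape under assumed names, stopping on context cancellation:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// session simulates one connect-and-serve attempt against the master.
func session(ctx context.Context, attempt int) error {
	if attempt < 3 {
		return errors.New("connection refused") // pretend the master is not up yet
	}
	return ctx.Err() // from then on, only a shutdown request ends the session
}

// talk keeps retrying sessions until the context is cancelled.
func talk(ctx context.Context) error {
	for attempt := 1; ; attempt++ {
		err := session(ctx, attempt)
		fmt.Printf("master: %v\n", err)

		// shutdown requested -> return
		if ctx.Err() != nil {
			return ctx.Err()
		}
		time.Sleep(10 * time.Millisecond) // back off a little before retrying
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()
	fmt.Println("talk:", talk(ctx))
}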

@@ -186,7 +186,7 @@ func (stor *Storage) talkMaster1(ctx context.Context) (err error) {
 	// XXX -> node.Dial ?
 	if accept.YourNodeUUID != stor.node.MyInfo.NodeUUID {
-		fmt.Printf("stor: %v: master told us to have UUID=%v\n", Mlink, accept.YourNodeUUID)
+		stor.logf("%v: master told us to have UUID=%v", Mlink, accept.YourNodeUUID)
 		stor.node.MyInfo.NodeUUID = accept.YourNodeUUID
 	}

@@ -224,13 +224,13 @@ func (stor *Storage) talkMaster1(ctx context.Context) (err error) {
 	// let master initialize us. If successful this ends with StartOperation command.
 	err = stor.m1initialize(ctx, Mconn)
 	if err != nil {
-		fmt.Println("stor: %v: master: %v", err)
+		stor.logf("%v: master: %v", Mconn, err)
 		continue // retry initializing
 	}
 
 	// we got StartOperation command. Let master drive us during servicing phase.
 	err = stor.m1serve(ctx, Mconn)
-	fmt.Println("stor: %v: master: %v", err)
+	stor.logf("%v: master: %v", Mconn, err)
 	continue // retry from initializing
 }

@@ -358,7 +358,7 @@ func (stor *Storage) m1serve(ctx context.Context, Mconn *neo.Conn) (err error) {
 // ServeLink serves incoming node-node link connection
 // XXX +error return?
 func (stor *Storage) ServeLink(ctx context.Context, link *neo.NodeLink) {
-	fmt.Printf("stor: %s: serving new node\n", link)
+	stor.logf("%s: serving new node", link)
 
 	// close link when either cancelling or returning (e.g. due to an error)
 	// ( when cancelling - link.Close will signal to all current IO to

@@ -373,7 +373,7 @@ func (stor *Storage) ServeLink(ctx context.Context, link *neo.NodeLink) {
 		// XXX ret err = ctx.Err()
 		case <-retch:
 		}
-		fmt.Printf("stor: %v: closing link\n", link)
+		stor.logf("%v: closing link", link)
 		link.Close()	// XXX err
 	}()

@@ -381,7 +381,7 @@ func (stor *Storage) ServeLink(ctx context.Context, link *neo.NodeLink) {
 	// XXX only accept when operational (?)
 	nodeInfo, err := IdentifyPeer(link, neo.STORAGE)
 	if err != nil {
-		fmt.Printf("stor: %v\n", err)
+		stor.logf("%v", err)
 		return
 	}

@@ -392,7 +392,7 @@ func (stor *Storage) ServeLink(ctx context.Context, link *neo.NodeLink) {
 	default:
 		// XXX vvv should be reply to peer
-		fmt.Printf("stor: %v: unexpected peer type: %v\n", link, nodeInfo.NodeType)
+		stor.logf("%v: unexpected peer type: %v", link, nodeInfo.NodeType)
 		return
 	}

@@ -400,7 +400,7 @@ func (stor *Storage) ServeLink(ctx context.Context, link *neo.NodeLink) {
 	for {
 		conn, err := link.Accept()
 		if err != nil {
-			fmt.Printf("stor: %v\n", err)	// XXX err ctx
+			stor.logf("%v", err)	// XXX err ctx
 			break
 		}

@@ -426,7 +426,7 @@ func (stor *Storage) withWhileOperational(ctx context.Context) (context.Context,
 // the connection is closed when serveClient returns
 // XXX +error return?
 func (stor *Storage) serveClient(ctx context.Context, conn *neo.Conn) {
-	fmt.Printf("stor: %s: serving new client conn\n", conn)
+	stor.logf("%s: serving new client conn", conn)
 
 	// rederive ctx to be also cancelled if M tells us StopOperation
 	ctx, cancel := stor.withWhileOperational(ctx)
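
serveClient rederives its context through withWhileOperational so that client handling also stops when the master sends StopOperation. The usual way to build such a helper is context.WithCancel plus a small watcher goroutine; below is a self-contained sketch with a hypothetical stop channel standing in for the "no longer operational" condition (an illustration of the pattern, not the NEO implementation):

package main

import (
	"context"
	"fmt"
	"time"
)

// withWhileCond derives a context that is cancelled either when the parent is
// cancelled or when stop fires (e.g. "cluster stopped being operational").
func withWhileCond(ctx context.Context, stop <-chan struct{}) (context.Context, context.CancelFunc) {
	ctx, cancel := context.WithCancel(ctx)
	go func() {
		select {
		case <-stop:
			cancel()
		case <-ctx.Done():
			// parent cancelled or caller called cancel - nothing more to do
		}
	}()
	return ctx, cancel
}

func main() {
	stop := make(chan struct{})
	ctx, cancel := withWhileCond(context.Background(), stop)
	defer cancel()

	go func() {
		time.Sleep(10 * time.Millisecond)
		close(stop) // simulate StopOperation arriving from the master
	}()

	<-ctx.Done()
	fmt.Println("client serving stopped:", ctx.Err())
}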

@@ -457,9 +457,9 @@ func (stor *Storage) serveClient(ctx context.Context, conn *neo.Conn) {
 	case err = <-done:
 	}
 
-	fmt.Printf("stor: %v: %v\n", conn, err)
+	stor.logf("%v: %v", conn, err)
 
 	// XXX vvv -> defer ?
-	fmt.Printf("stor: %v: closing client conn\n", conn)
+	stor.logf("%v: closing client conn", conn)
 	conn.Close()	// XXX err
 }