X review feedback on newapi
- add nodefs package overview; in particular describe what inode is so that there is no confusion about its meaning. - Draftly implement Lookup / Forget. They work not under global rawBridge.mu and still there should be no race of Lookup / Forget due to careful locking of inode -> (inode, ichild) + retrying. Add description of Forget semantic and what happens when we receive forget for a directory for which children are not forgotten yet. ( it is too late here now and I did not checked the implementation with a fresh head. I thought that it is better to release current state for discussion as I likely won't be able to work on newapi for at least another week ) - use atomics in DefaultNode setInode/inode; see Lookup and corresponding description nearby DefaultNode.setInode for why it is needed. - inode.{lookupCount,nodeID} are now protected not by global rawBridge.mu, but instead by inode.mu . - change Node operation to return Nodes, not Inode. In particulare Node.Lookup should now return Node. Inodes are internal index of nodefs VFS (see package description) and we should not load filesystem implementations to think about them where we can. Also it makes a more closed interface when filesystem works in terms it nodes completely. Also this way we offload filesystems for caring about tricky details of how to create inode for a hardlinked entry (see Lookup for details which handles about it) - Remove Node.Inode -> nodefs.InodeOf(Node). this way there will be no possibility to override Node.Inode and we can specify InodeOf semantic exactly in API docs. - unlockNodes: sort is not needed - lock/unlock Nodes: avoid duplicates (e.g. there can be duplicates if dir/a and dir/b are hardlinks to the same file. If we don't avoid duplicates lockNodes will deadlock) - made some other edits, part of them not complete...
... | ... | @@ -29,9 +29,15 @@ type rawBridge struct { |
options Options | ||
root *Inode | ||
// we maintain index | ||
// | ||
// ID -> inode | ||
// | ||
// so that we can serve FUSE operational requests where nodes are specified by | ||
// their ID. | ||
mu sync.Mutex | ||
nodes []mapEntry | ||
free []uint64 | ||
nodes []mapEntry // ID -> Inode; some inodes may be here in "forgotten" state not yet removed | ||
free []uint64 // free ID | ||
files []fileEntry | ||
freeFiles []uint64 | ||
... | ... | @@ -70,65 +76,308 @@ func NewNodeFS(root Node, opts *Options) fuse.RawFileSystem { |
return bridge | ||
} | ||
// XXX kill? | ||
func (b *rawBridge) inode(id uint64, fh uint64) (*Inode, fileEntry) { | ||
b.mu.Lock() | ||
defer b.mu.Unlock() | ||
// b.mu.Lock() | ||
// defer b.mu.Unlock() // XXX not good to return with released lock | ||
|
||
return b.nodes[id].inode, b.files[fh] | ||
} | ||
func (b *rawBridge) Lookup(header *fuse.InHeader, name string, out *fuse.EntryOut) (status fuse.Status) { | ||
parent, _ := b.inode(header.NodeId, 0) | ||
// Lookup & Forget | ||
// | ||
// Lookup checks inode.children and potentially calls inode.node.Lookup; | ||
// increments ichild.lookupCount. Forget decrements inode.lookupCount and, if | ||
// that reaches 0, unregisters inode from its parents. | ||
// | ||
// To avoid races, whenever we need to update group of inodes (for example | ||
// inode and its parents) we take locks on all inodes in the group. | ||
// | ||
// Before getting to take locks on a group, we start from locking only one | ||
// inode - the inode in question itself. This is required becuase we need to | ||
// first inspect the inode to decide what to do next and what is the inode | ||
// group to change. | ||
// | ||
// Going from only one inode locked to locked inode group requires to unlock | ||
// the inode in the middle. This creates possibility that at the time when | ||
// inode group will be locked, the inode itself was already changed. If the | ||
// inode was indeed changed, we restart the whole procedure. | ||
// | ||
// Inode.changeCounter is used to detect if inode was changed or not - it is | ||
// incremented every time the inode is modified. | ||
// | ||
// rawBridge.mu (the lock for ID -> inode index) is locked either: | ||
// | ||
// - on its own, or | ||
// - always after locking some inodes. | ||
// | ||
// it is never locked the other way (e.g. rawBridge.mu -> inode). | ||
// | ||
// | ||
// Note on FUSE FORGET semantic: Forget(inodeID) means filesystem client | ||
// (usually the kernel) evicts inode from its cache. It can happen that the | ||
// inode is directory and it children inodes are still kept in client cache | ||
// with nlookup > 0. If we see such a forgotten directory, we don't unlink it | ||
// from the tree until all of its children are also forgotten. | ||
child, code := parent.node.Lookup(context.TODO(), name, out) | ||
if !code.Ok() { | ||
if b.options.NegativeTimeout != nil { | ||
out.SetEntryTimeout(*b.options.NegativeTimeout) | ||
func (b *rawBridge) Forget(nodeid, nlookup uint64) { | ||
b.mu.Lock() | ||
inode := b.nodes[nodeid].inode | ||
b.mu.Unlock() | ||
// XXX if inode == nil || already forgoten -> log (client bug) | ||
b.forgetInode(inode, nlookup) | ||
} | ||
func (b *rawBridge) forgetInode(inode *Inode, nlookup uint64) { | ||
// lock inode, extract parents, unlock inode, relock(inode, parents), | ||
// verify that inode had not changed (and retry if it changed), and | ||
// perform unlinking. | ||
retry: | ||
for { | ||
inode.mu.Lock() | ||
if nlookup != 0 { | ||
inode.lookupCount -= nlookup // XXX log if it goes < 0 | ||
inode.changeCounter++ | ||
} | ||
return code | ||
if inode.lookupCount != 0 { | ||
inode.mu.Unlock() | ||
return | ||
} | ||
// don't perform unlinking if inode was already forgotten. | ||
// | ||
// Examples when inode could be already forgotten: | ||
// | ||
// - if Forget, in between inode -> inode + parents relock, was | ||
// interrupted by Lookup + another Forget -> we could retry here | ||
// with the inode already forgotten. | ||
// | ||
// - if inode is directory and its forgetting was delayed due | ||
// to alive children, the forget could be triggered by last | ||
// forgotten children, which could be simultaneous with new | ||
// Lookup and again Forget. | ||
if inode.nodeID == 0 { | ||
inode.mu.Unlock() | ||
return | ||
} | ||
if len(inode.children) != 0 { | ||
// client forgot about directory, but not about its children. | ||
// delay directory inode forget to until children are alive. | ||
inode.mu.Unlock() | ||
return | ||
} | ||
// relock inode + parents. | ||
// If inode stays the same - perform the forget; retry otherwise. | ||
nlookup = 0 // make sure not to change lookupCount again, if we'll be retrying | ||
lockv := []*Inode{inode} | ||
for p := range inode.parents { | ||
lockv = append(lockv, p.parent) | ||
} | ||
inodeVer := inode.changeCounter | ||
inode.mu.Unlock() | ||
lockNodes(lockv...) | ||
if inodeVer != inode.changeCounter { | ||
unlockNodes(lockv...) | ||
continue retry | ||
} | ||
// we have locks on inode and .parents; let's unlink and forget | ||
// inode. Also check if for unlinked parents forget was delayed, | ||
// and if so and we are last child - do the forget on parent. | ||
delayedForget := map[*Inode]struct{}{} | ||
for p := range inode.parents { | ||
iparent := p.parent | ||
delete(iparent.children, p.name) | ||
iparent.changeCounter++ | ||
// if parent was a directory with delayed forget and we | ||
// were the last of its children - now is the time to | ||
// do forget on the parent. | ||
if iparent.lookupCount == 0 && len(iparent.children) == 0 { | ||
delayedForget[iparent] = struct{}{} | ||
} | ||
} | ||
nodeID := inode.nodeID | ||
inode.nodeID = 0 // forgotten | ||
inode.parents = map[parentData]struct{}{} // clear | ||
inode.changeCounter++ | ||
unlockNodes(lockv...) | ||
// release nodeID. In the meantime, while we were not holding | ||
// b.mu locked and inode was forgotten, the ID could be already | ||
// reused. So check if it is not the case. | ||
b.mu.Lock() | ||
|
||
if b.nodes[nodeID].inode == inode { | ||
b.free = append(b.free, nodeID) | ||
b.nodes[nodeID].inode = nil | ||
} | ||
b.mu.Unlock() | ||
// execute delayed forget on parents | ||
for iparent := range delayedForget { | ||
b.forgetInode(iparent, 0) | ||
} | ||
// done | ||
return | ||
} | ||
} | ||
func (b *rawBridge) Lookup(header *fuse.InHeader, name string, out *fuse.EntryOut) (status fuse.Status) { | ||
b.mu.Lock() | ||
defer b.mu.Unlock() | ||
pentry := b.nodes[header.NodeId] | ||
b.mu.Unlock() | ||
lockNodes(parent, child) | ||
parent.addLookup(name, child) | ||
unlockNodes(parent, child) | ||
iparent := pentry.inode | ||
if child.nodeID == 0 { | ||
b.registerInode(child) | ||
if iparent == nil { | ||
return fuse.ENOENT // FIXME errcode=ok? | ||
} | ||
out.NodeId = child.nodeID | ||
out.Generation = b.nodes[child.nodeID].generation | ||
retry: | ||
for { | ||
iparent.mu.Lock() | ||
if iparent.nodeID == 0 { | ||
iparent.mu.Unlock() | ||
return fuse.ENOENT // forgotten XXX errcode=ok? | ||
} | ||
if b.options.AttrTimeout != nil { | ||
out.SetAttrTimeout(*b.options.AttrTimeout) | ||
} | ||
if b.options.EntryTimeout != nil { | ||
out.SetEntryTimeout(*b.options.EntryTimeout) | ||
} | ||
if iparent.mode&fuse.S_IFDIR == 0 { | ||
iparent.mu.Unlock() | ||
return fuse.ENOTDIR | ||
} | ||
return fuse.OK | ||
// if parent already has inode for this name - we can use it | ||
|
||
ichild := iparent.children[name] | ||
if ichild != nil { | ||
// relock iparent + ichild; retry if iparent changed | ||
iparentVer := iparent.changeCounter | ||
iparent.mu.Unlock() | ||
lockNodes(iparent, ichild) | ||
if iparentVer != iparent.changeCounter { | ||
unlockNodes(iparent, ichild) | ||
continue retry | ||
} | ||
} | ||
// iparent & ichild are locked; ichild might be nil. | ||
lockv := []*Inode{iparent, ichild} | ||
ichildNew := false | ||
if ichild == nil { | ||
// this goroutine becomes reponsible to call node.Lookup | ||
// TODO place lookup-in-progress inode and do Lookup without iparent lock | ||
child, code := iparent.node.Lookup(context.TODO(), name, out) | ||
|
||
if !code.Ok() { | ||
iparent.mu.Unlock() | ||
if b.options.NegativeTimeout != nil { | ||
out.SetEntryTimeout(*b.options.NegativeTimeout) | ||
} | ||
return code | ||
} | ||
ichild := newInode(child, out.Attr.Mode) // XXX + bridge | ||
ichildNew = true | ||
ok := child.setInode(ichild) | ||
if !ok { | ||
// someone concurrently going to this node (via | ||
// different path), already set node -> inode | ||
// assocation. This can happen e.g. if for | ||
// | ||
// root | ||
// / \ | ||
// dir1 dir2 | ||
// \ / | ||
// file | ||
// | ||
// dir1.Lookup("file") and dir2.Lookup("file") are executed simultaneously. | ||
// | ||
// we have to use that inode. | ||
ichild = child.inode() | ||
// relock to iparent + ichild and retry if iparent changes. | ||
iparentVer := iparent.changeCounter | ||
iparent.mu.Unlock() | ||
lockNodes(iparent, ichild) | ||
if iparentVer != iparent.changeCounter { | ||
unlockNodes(iparent, ichild) | ||
continue retry | ||
} | ||
ichildNew = false | ||
lockv[1] = ichild | ||
} | ||
} | ||
// now either: | ||
// | ||
// - iparent & ichild are locked, or | ||
// - iparent is locked and ichild was just created anew. | ||
// | ||
// (what was locked is covered by lockv) | ||
// | ||
// -> perform `iparent <-> ichild` linking. | ||
iparent.setEntry(name, ichild) | ||
ichild.lookupCount++ // XXX ichild.changeCounter++ is in setEntry | ||
// if ichild was created anew - we have to register ID for it. | ||
// retrieve child generation while b is locked along the way. | ||
b.mu.Lock() | ||
if ichildNew { | ||
b.registerInode(ichild) | ||
} | ||
childID := ichild.nodeID | ||
childGen := b.nodes[childID].generation | ||
b.mu.Unlock() | ||
// release iparent + (ichild)? locks and we are done | ||
unlockNodes(lockv...) | ||
out.NodeId = childID | ||
out.Generation = childGen | ||
if b.options.AttrTimeout != nil { | ||
out.SetAttrTimeout(*b.options.AttrTimeout) | ||
} | ||
if b.options.EntryTimeout != nil { | ||
out.SetEntryTimeout(*b.options.EntryTimeout) | ||
} | ||
return fuse.OK | ||
} | ||
} | ||
func (b *rawBridge) registerInode(child *Inode) { | ||
// registerInode allocates new inode ID and registers inode with that ID to | ||
// inode ID index. | ||
// | ||
// must be called with inode and b locked. | ||
func (b *rawBridge) registerInode(inode *Inode) { | ||
if l := len(b.free); l > 0 { | ||
last := b.free[l-1] | ||
b.free = b.free[:l-1] | ||
child.nodeID = last | ||
b.nodes[last].inode = child | ||
inode.nodeID = last | ||
b.nodes[last].inode = inode | ||
b.nodes[last].generation++ | ||
} else { | ||
last := len(b.nodes) | ||
b.nodes = append(b.nodes, mapEntry{ | ||
inode: child, | ||
inode: inode, | ||
}) | ||
child.nodeID = uint64(last) | ||
inode.nodeID = uint64(last) | ||
} | ||
inode.changeCounter++ | ||
} | ||
func (b *rawBridge) Create(input *fuse.CreateIn, name string, out *fuse.CreateOut) (code fuse.Status) { | ||
panic("TODO - similarly to Lookup") | ||
/* | ||
ctx := context.TODO() | ||
parent, _ := b.inode(input.NodeId, 0) | ||
child, f, flags, code := parent.node.Create(ctx, name, input.Flags, input.Mode) | ||
... | ... | @@ -163,8 +412,11 @@ func (b *rawBridge) Create(input *fuse.CreateIn, name string, out *fuse.CreateOu |
f.GetAttr(ctx, &out.Attr) | ||
return fuse.OK | ||
*/ | ||
} | ||
// XXX move -> near Lookup, so that the interaction between Forget/Lookup is easier to oversee | ||
/* | ||
func (b *rawBridge) Forget(nodeid, nlookup uint64) { | ||
b.mu.Lock() | ||
defer b.mu.Unlock() | ||
... | ... | @@ -179,6 +431,7 @@ func (b *rawBridge) Forget(nodeid, nlookup uint64) { |
} | ||
} | ||
*/ | ||
func (b *rawBridge) SetDebug(debug bool) {} | ||
... | ... |
... | ... | @@ -6,22 +6,47 @@ package nodefs |
import ( | ||
"context" | ||
"sync/atomic" | ||
"time" | ||
"unsafe" | ||
"github.com/hanwen/go-fuse/fuse" | ||
) | ||
// DefaultNode must be embedded in a Node implementation. | ||
// DefaultNode provides common base Node functionality. | ||
// | ||
// It must be embedded in any Node implementation. | ||
type DefaultNode struct { | ||
inode *Inode | ||
} | ||
func (dn *DefaultNode) setInode(n *Inode) { | ||
dn.inode = n | ||
} | ||
func (dn *DefaultNode) Inode() *Inode { | ||
return dn.inode | ||
inode_ *Inode | ||
} | ||
// set/retrieve inode. | ||
// | ||
// node -> inode association, can be simultaneously tried to be set, if for e.g. | ||
|
||
// | ||
// root | ||
// / \ | ||
// dir1 dir2 | ||
// \ / | ||
// file | ||
// | ||
// dir1.Lookup("file") and dir2.Lookup("file") are executed simultaneously. | ||
// | ||
// We use atomics so that only one set can win and rawBridge.Lookup cares to | ||
// cancel inode that loosed. | ||
// | ||
// To read node.inode atomic.LoadPointer is used, however it is not expensive | ||
// since it translates to regular MOVQ on amd64. | ||
func (dn *DefaultNode) setInode(inode *Inode) bool { | ||
return atomic.CompareAndSwapPointer( | ||
(*unsafe.Pointer)(unsafe.Pointer(&dn.inode_)), | ||
nil, unsafe.Pointer(inode)) | ||
} | ||
func (dn *DefaultNode) inode() *Inode { | ||
return (*Inode)(atomic.LoadPointer( | ||
(*unsafe.Pointer)(unsafe.Pointer(&dn.inode_)))) | ||
} | ||
func (n *DefaultNode) Read(ctx context.Context, f File, dest []byte, off int64) (fuse.ReadResult, fuse.Status) { | ||
... | ... |
... | ... | @@ -34,38 +34,93 @@ type Inode struct { |
// Following data is mutable. | ||
// Protected by bridge.mu | ||
lookupCount uint64 | ||
nodeID uint64 | ||
// mu protects the following mutable fields. When locking | ||
// multiple Inodes, locks must be acquired using | ||
// lockNodes/unlockNodes | ||
mu sync.Mutex | ||
// incremented every time the 'children' or 'parents' field is changed. | ||
// changeCounter increments every time the below mutable state | ||
// (lookupCount, nodeID, children, parents) is modified. | ||
// | ||
// This is used in places where we have to relock inode into inode | ||
// group lock, and after locking the group we have to check if inode | ||
// did not changed, and if it changed - retry the operation. | ||
changeCounter uint32 | ||
lookupCount uint64 | ||
// ID of the inode; 0 if inode was forgotten. | ||
// forgotten inodes are unlinked from parent and children, but could be | ||
// still not yet removed from bridge.nodes . | ||
nodeID uint64 | ||
children map[string]*Inode | ||
parents map[parentData]struct{} | ||
} | ||
// newInode creates creates new inode pointing to node. | ||
// | ||
// node -> inode association is NOT set. | ||
// the inode is _not_ yet has | ||
func newInode(node Node, mode uint32) *Inode { | ||
inode := &Inode{ | ||
mode: mode ^ 07777, | ||
node: node, | ||
//bridge: n.bridge, | ||
parents: make(map[parentData]struct{}), | ||
} | ||
if mode&fuse.S_IFDIR != 0 { | ||
inode.children = make(map[string]*Inode) | ||
} | ||
//node.setInode(ch) | ||
return inode | ||
} | ||
// sortNodes rearranges inode group in consistent order. | ||
// | ||
// The nodes are ordered by their in-RAM address, which gives consistency | ||
// property: for any A and B inodes, sortNodes will either always order A < B, | ||
// or always order A > B. | ||
// | ||
// See lockNodes where this property is used to avoid deadlock when taking | ||
// locks on inode group. | ||
func sortNodes(ns []*Inode) { | ||
sort.Slice(ns, func(i, j int) bool { | ||
return uintptr(unsafe.Pointer(ns[i])) < uintptr(unsafe.Pointer(ns[j])) | ||
}) | ||
} | ||
// lockNodes locks group of inodes. | ||
// | ||
// It always lock the inodes in the same order - to avoid deadlocks. | ||
// It also avoids locking an inode more than once, if it was specified multiple times. | ||
// An example when an inode might be given multiple times is if dir/a and dir/b | ||
// are hardlinked to the same inode and the caller needs to take locks on dir children. | ||
// | ||
// It is valid to give nil nodes - those are simply ignored. | ||
|
||
func lockNodes(ns ...*Inode) { | ||
sortNodes(ns) | ||
var nprev *Inode | ||
for _, n := range ns { | ||
n.mu.Lock() | ||
if n != nprev { | ||
n.mu.Lock() | ||
nprev = n | ||
} | ||
} | ||
} | ||
// unlockNodes releases locks taken by lockNodes. | ||
func unlockNodes(ns ...*Inode) { | ||
// we don't need to unlock in the same order that was used in lockNodes. | ||
// however it still helps to have nodes sorted to avoid duplicates. | ||
sortNodes(ns) | ||
var nprev *Inode | ||
for _, n := range ns { | ||
n.mu.Unlock() | ||
if n != nprev { | ||
n.mu.Unlock() | ||
nprev = n | ||
} | ||
} | ||
} | ||
... | ... | @@ -74,9 +129,14 @@ func unlockNodes(ns ...*Inode) { |
// kernel has no way of reviving forgotten nodes by its own | ||
// initiative. | ||
func (n *Inode) Forgotten() bool { | ||
/* | ||
n.bridge.mu.Lock() | ||
defer n.bridge.mu.Unlock() | ||
return n.lookupCount == 0 | ||
*/ | ||
n.mu.Lock() | ||
defer n.mu.Unlock() | ||
return n.nodeID == 0 | ||
} | ||
// Node returns the Node object implementing the file system operations. | ||
... | ... | @@ -155,14 +215,24 @@ func (n *Inode) FindChildByOpaqueID(name string, opaqueID uint64) *Inode { |
return nil | ||
} | ||
func (n *Inode) addLookup(name string, child *Inode) { | ||
child.lookupCount++ | ||
child.parents[parentData{name, n}] = struct{}{} | ||
n.children[name] = child | ||
child.changeCounter++ | ||
n.changeCounter++ | ||
// setEntry does `iparent[name] = ichild` linking. | ||
// | ||
// setEntry must not be called simultaneously for any of iparent or ichild. | ||
// This, for example could be satisfied if both iparent and ichild are locked, | ||
// but it could be also valid if only iparent is locked and ichild was just | ||
// created and only one goroutine keeps referencing it. | ||
// | ||
// XXX also ichild.lookupCount++ ? | ||
func (iparent *Inode) setEntry(name string, ichild *Inode) { | ||
// ichild.lookupCount++ | ||
ichild.parents[parentData{name, iparent}] = struct{}{} | ||
iparent.children[name] = ichild | ||
ichild.changeCounter++ | ||
iparent.changeCounter++ | ||
} | ||
// XXX kill | ||
/* | ||
func (n *Inode) clearParents() { | ||
for { | ||
lockme := []*Inode{n} | ||
... | ... | @@ -191,7 +261,10 @@ func (n *Inode) clearParents() { |
} | ||
} | ||
} | ||
*/ | ||
// XXX kill | ||
/* | ||
func (n *Inode) clearChildren() { | ||
if n.mode != fuse.S_IFDIR { | ||
return | ||
... | ... | @@ -226,12 +299,18 @@ func (n *Inode) clearChildren() { |
} | ||
} | ||
// XXX not right - we cannot fully clear our children, because they can | ||
// be also children of another directory. | ||
|
||
// | ||
// XXX also not right - the kernel can send FORGET(idir) but keep | ||
|
||
// references to children inodes. | ||
for _, ch := range lockme { | ||
if ch != n { | ||
ch.clearChildren() | ||
} | ||
} | ||
} | ||
*/ | ||
// NewPersistentInode returns an Inode with a LookupCount == 1, ie. the | ||
// node will only get garbage collected if the kernel issues a forget | ||
... | ... |
-
Maintainer
.
-
mentioned in merge request nexedi/wendelin.core!15 (closed)