Commit 6f0cdaff authored by Kirill Smelkov's avatar Kirill Smelkov

wcfs: Provide isolation to clients

Via custom isolation protocol that both server and clients must cooperatively
follow. This is the core change that enables file cache to be practically
shared while each client can still be provided with isolated view of the database.

This patch brings only server changes, tests + the minimum client bits to support the tests.
The client library, that will implement isolation protocol on client side, will come next.

This patch is organized as follows:

- wcfs.go brings in description of the protocol, overview of how server
  implements that protocol and the implementation itself.
  See also notes.txt

- wcfs_test.py brings in tests for server implementation.
  tWCFS._abort_ontimeout had to be moved into nogil mode into wcfs_test.pyx
  to avoid deadlock on the GIL (see comments in wcfs_test.pyx for details).

- files added in wcfs/client/ are needed to provide client-side
  implementation of WatchLink - the message exchange protocol over
  opened head/watch file - for tests. Client-side watchlink implementation
  lives in wcfs/client/wcfs_watchlink.{h,cpp}. The other additions in
  wcfs/client/ are to support that and to expose the WatchLink to Python.

  Client-side bits are done right in C++ because upcoming WCFS client
  library will be implemented in C++ to work in nogil mode in order to
  avoid deadlock on the GIL because client-side pinner thread might be
  woken-up synchronously by WCFS server at any moment, including when
  another client thread already holds the GIL and is paused by WCFS.

Some preliminary history:

kirr/wendelin.core@9b4a42a3    X invalidation design draftly settled
kirr/wendelin.core@27d91d47    X δFtail settled
kirr/wendelin.core@c27c1940    X mmap over under pagefault to this mmapping works
kirr/wendelin.core@d36b171f    X ptrace when client is under pagefault or syscall won't work
kirr/wendelin.core@c1f5bb19    X notes on why lazy-invalidate approach was taken
kirr/wendelin.core@4fbdd270    X Proof that that it is possible to change mmapping while under pagefault to it
kirr/wendelin.core@33e0dfce    X ΔTail draftly done
kirr/wendelin.core@12628943    X make sure "bye" is always processed immediately - even if a handleWatch is currently blocked
kirr/wendelin.core@af0a64cb    X test for "bye" canceling blocked handlers
kirr/wendelin.core@996dc6a8    X Fix race in test
kirr/wendelin.core@43915fe9    X wcfs: Don't forbid simultaneous watch requests
kirr/wendelin.core@941dc54b    X wcfs: threading.Lock -> sync.Mutex
kirr/wendelin.core@d75b2304    X wcfs: Move _abort_ontimeout to pyx/nogil
kirr/wendelin.core@79234659    X Notes on why eagier invalidation was rejected
kirr/wendelin.core@f05271b1    X Test that sysread(/head/watch) can be interrupted
kirr/wendelin.core@5ba816da    X restore test_wcfs_watch_robust after f05271b1.
kirr/wendelin.core@4bd88564    X "Invalidation protocol" -> "Isolation protocol"
kirr/wendelin.core@f7b54ca4    X avoid fmt::vsprintf  (now compils again with latest pygolang@master)
kirr/wendelin.core@0a8fcd9d    X wcfs/client: Move EOF -> pygolang
kirr/wendelin.core@153e02e6    X test_wcfs_watch_setup and test_wcfs_watch_setup_ahead work again
kirr/wendelin.core@17f98edc    X wcfs: client: os: Factor syserr -> string into _sysErrString
kirr/wendelin.core@7b0c301c    X wcfs: tests: Fix tFile.assertBlk not to segfault on a test failure
kirr/wendelin.core@b74dda09    X Start switching Track from Track(key) to Track(keycov)
kirr/wendelin.core@8b5d8523    X Move tracking of which blocks were accessed from wcfs to ΔFtail
parent 4430de41
/* Wendelin.bigfile | virtual memory tests
* Copyright (C) 2014-2019 Nexedi SA and Contributors.
* Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
......@@ -339,6 +339,7 @@ void test_file_access_synthetic(void)
int err;
/* MUST_FAULT(code) - checks that code faults */
/* somewhat dup in wcfs/internal/wcfs_test.pyx */
sigjmp_buf fault_jmp;
volatile int fault_expected = 0;
void sigfault_handler(int sig) {
......
......@@ -282,6 +282,12 @@ libvirtmem_h = [
'include/wendelin/utils.h',
]
libwcfs_h = [
'wcfs/client/wcfs.h',
'wcfs/client/wcfs_misc.h',
'wcfs/client/wcfs_watchlink.h',
]
setup(
name = 'wendelin.core',
version = '0.13',
......@@ -306,7 +312,13 @@ setup(
'lib/utils.c'],
depends = libvirtmem_h,
define_macros = [('_GNU_SOURCE',None)],
language = 'c')],
language = 'c'),
DSO('wendelin.wcfs.client.libwcfs',
['wcfs/client/wcfs.cpp',
'wcfs/client/wcfs_watchlink.cpp',
'wcfs/client/wcfs_misc.cpp'],
depends = libwcfs_h)],
ext_modules = [
PyGoExt('wendelin.bigfile._bigfile',
......@@ -319,6 +331,11 @@ setup(
language = 'c',
dsos = ['wendelin.bigfile.libvirtmem']),
PyGoExt('wendelin.wcfs.client._wcfs',
['wcfs/client/_wcfs.pyx'],
depends = libwcfs_h,
dsos = ['wendelin.wcfs.client.libwcfs']),
PyGoExt('wendelin.wcfs.internal.wcfs_test',
['wcfs/internal/wcfs_test.pyx']),
......
......@@ -60,6 +60,10 @@ from persistent import Persistent
from zodbtools.util import ashex as h
from six.moves.urllib.parse import urlsplit, urlunsplit
from .client._wcfs import \
PyWCFS as _WCFS, \
PyWatchLink as WatchLink \
# Server represents running wcfs server.
#
......@@ -79,7 +83,7 @@ class Server:
# Raw files on wcfs can be accessed with ._path/._read/._stat/._open .
#
# WCFS logically mirrors ZODB.DB .
class WCFS:
class WCFS(_WCFS):
# .mountpoint path to wcfs mountpoint
# ._fwcfs /.wcfs/zurl opened to keep the server from going away (at least cleanly)
# ._njoin this connection was returned for so many joins
......
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2021 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
# cython: language_level=2
# distutils: language=c++
# Package _wcfs provides Python-wrappers for C++ wcfs client package.
#
# It wraps WCFS and WatchLink.
from golang cimport chan, structZ, string, error, refptr
from golang cimport context
from libc.stdint cimport int64_t, uint64_t
from libcpp.utility cimport pair
from libcpp.vector cimport vector
cdef extern from "wcfs/client/wcfs_misc.h" namespace "zodb" nogil:
ctypedef uint64_t Tid
ctypedef uint64_t Oid
cdef extern from "wcfs/client/wcfs_misc.h" namespace "wcfs" nogil:
const Tid TidHead
# pyx/nogil description for C++ classes
cdef extern from "wcfs/client/wcfs_watchlink.h" namespace "wcfs" nogil:
cppclass _WatchLink:
error close()
error closeWrite()
pair[string, error] sendReq(context.Context ctx, const string &req)
error recvReq(context.Context ctx, PinReq *prx)
error replyReq(context.Context ctx, const PinReq *req, const string& reply);
vector[string] fatalv
chan[structZ] rx_eof
cppclass WatchLink (refptr[_WatchLink]):
# WatchLink.X = WatchLink->X in C++
error close "_ptr()->close" ()
error closeWrite "_ptr()->closeWrite"()
pair[string, error] sendReq "_ptr()->sendReq" (context.Context ctx, const string &req)
error recvReq "_ptr()->recvReq" (context.Context ctx, PinReq *prx)
error replyReq "_ptr()->replyReq" (context.Context ctx, const PinReq *req, const string& reply);
vector[string] fatalv "_ptr()->fatalv"
chan[structZ] rx_eof "_ptr()->rx_eof"
cppclass PinReq:
Oid foid
int64_t blk
Tid at
string msg
error _twlinkwrite(WatchLink wlink, const string& pkt)
cdef extern from "wcfs/client/wcfs.h" namespace "wcfs" nogil:
cppclass WCFS:
string mountpoint
pair[WatchLink, error] _openwatch()
# ---- python bits ----
cdef class PyWCFS:
cdef WCFS wc
cdef class PyWatchLink:
cdef WatchLink wlink
cdef class PyPinReq:
cdef PinReq pinreq
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2021 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
# cython: language_level=2
# cython: auto_pickle=False
# distutils: language=c++
# Package _wcfs provides Python-wrappers for C++ wcfs client package.
# See _wcfs.pxd for package overview.
from golang cimport pychan, pyerror, nil
from golang cimport io
from ZODB.utils import p64
cdef class PyWCFS:
property mountpoint:
def __get__(PyWCFS pywc):
return pywc.wc.mountpoint
def __set__(PyWCFS pywc, string v):
pywc.wc.mountpoint = v
cdef class PyWatchLink:
def __init__(PyWatchLink pywlink, PyWCFS pywc):
with nogil:
_ = wcfs_openwatch_pyexc(&pywc.wc)
pywlink.wlink = _.first
err = _.second
if err != nil:
raise pyerr(err)
def __dealloc__(PyWatchLink pywlink):
pywlink.wlink = nil
def close(PyWatchLink pywlink):
with nogil:
err = wlink_close_pyexc(pywlink.wlink)
if err != nil:
raise pyerr(err)
def closeWrite(PyWatchLink pywlink):
with nogil:
err = wlink_closeWrite_pyexc(pywlink.wlink)
if err != nil:
raise pyerr(err)
def sendReq(PyWatchLink pywlink, context.PyContext pyctx, string req): # -> reply(string)
with nogil:
_ = wlink_sendReq_pyexc(pywlink.wlink, pyctx.ctx, req)
reply = _.first
err = _.second
if err != nil:
raise pyerr(err)
return reply
def recvReq(PyWatchLink pywlink, context.PyContext pyctx): # -> PinReq | None when EOF
cdef PyPinReq pyreq = PyPinReq.__new__(PyPinReq)
with nogil:
err = wlink_recvReq_pyexc(pywlink.wlink, pyctx.ctx, &pyreq.pinreq)
if err.eq(io.EOF):
return None
if err != nil:
raise pyerr(err)
return pyreq
def replyReq(PyWatchLink pywlink, context.PyContext pyctx, PyPinReq pyreq, string reply):
with nogil:
err = wlink_replyReq_pyexc(pywlink.wlink, pyctx.ctx, &pyreq.pinreq, reply)
if err != nil:
raise pyerr(err)
return
# XXX for tests
property fatalv:
def __get__(PyWatchLink pywlink):
return pywlink.wlink.fatalv
property rx_eof:
def __get__(PyWatchLink pywlink):
return pychan.from_chan_structZ(pywlink.wlink.rx_eof)
cdef class PyPinReq:
property foid:
def __get__(PyPinReq pypin):
return p64(pypin.pinreq.foid)
property blk:
def __get__(PyPinReq pypin):
return pypin.pinreq.blk
property at:
def __get__(PyPinReq pypin):
at = pypin.pinreq.at
if at == TidHead:
return None
return p64(at)
# wcfs_test.py uses req.msg in several places
property msg:
def __get__(PyPinReq pypin):
return pypin.pinreq.msg
def _tpywlinkwrite(PyWatchLink pywlink, bytes pypkt):
cdef string pkt = pypkt
with nogil:
err = _twlinkwrite_pyexc(pywlink.wlink, pkt)
if err != nil:
raise pyerr(err)
# ---- misc ----
# pyerr converts error into python error.
cdef object pyerr(error err):
return pyerror.from_error(err)
from golang cimport topyexc
cdef nogil:
pair[WatchLink, error] wcfs_openwatch_pyexc(WCFS *wcfs) except +topyexc:
return wcfs._openwatch()
error wlink_close_pyexc(WatchLink wlink) except +topyexc:
return wlink.close()
error wlink_closeWrite_pyexc(WatchLink wlink) except +topyexc:
return wlink.closeWrite()
pair[string, error] wlink_sendReq_pyexc(WatchLink wlink, context.Context ctx, const string &req) except +topyexc:
return wlink.sendReq(ctx, req)
error wlink_recvReq_pyexc(WatchLink wlink, context.Context ctx, PinReq *prx) except +topyexc:
return wlink.recvReq(ctx, prx)
error wlink_replyReq_pyexc(WatchLink wlink, context.Context ctx, const PinReq *req, const string& reply) except +topyexc:
return wlink.replyReq(ctx, req, reply)
error _twlinkwrite_pyexc(WatchLink wlink, const string& pkt) except +topyexc:
return _twlinkwrite(wlink, pkt)
// Copyright (C) 2018-2021 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// Package wcfs provides WCFS client.
#include "wcfs_misc.h"
#include "wcfs.h"
#include "wcfs_watchlink.h"
#include <golang/errors.h>
#include <golang/fmt.h>
// wcfs::
namespace wcfs {
// ---- WCFS raw file access ----
// _path returns path for object on wcfs.
// - str: wcfs root + obj;
string WCFS::_path(const string &obj) {
WCFS& wc = *this;
return wc.mountpoint + "/" + obj;
}
tuple<os::File, error> WCFS::_open(const string &path, int flags) {
WCFS& wc = *this;
string path_ = wc._path(path);
return os::open(path_, flags);
}
// ---- misc ----
string WCFS::String() const {
const WCFS& wc = *this;
return fmt::sprintf("wcfs %s", v(wc.mountpoint));
}
} // wcfs::
// Copyright (C) 2018-2021 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// Package wcfs provides WCFS client.
#ifndef _NXD_WCFS_H_
#define _NXD_WCFS_H_
#include <golang/libgolang.h>
#include <tuple>
#include "wcfs_misc.h"
// wcfs::
namespace wcfs {
using namespace golang;
using std::tuple;
using std::pair;
typedef refptr<struct _WatchLink> WatchLink;
struct PinReq;
// WCFS represents filesystem-level connection to wcfs server.
//
// Use wcfs.join in Python API to create it.
//
// WCFS logically mirrors ZODB.DB .
// It is safe to use WCFS from multiple threads simultaneously.
struct WCFS {
string mountpoint;
pair<WatchLink, error> _openwatch();
string String() const;
// at OS-level, on-WCFS raw files can be accessed via ._path and ._open.
string _path(const string &obj);
tuple<os::File, error> _open(const string &path, int flags=O_RDONLY);
};
} // wcfs::
#endif
// Copyright (C) 2019-2021 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
#include "wcfs_misc.h"
#include <golang/libgolang.h>
#include <golang/errors.h>
#include <golang/fmt.h>
#include <golang/io.h>
using namespace golang;
#include <inttypes.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <algorithm>
#include <memory>
// golang::
namespace golang {
// os::
namespace os {
// TODO -> os.PathError + err=syscall.Errno
static error _pathError(const char *op, const string &path, int syserr);
static string _sysErrString(int syserr);
int _File::fd() const { return _fd; }
string _File::name() const { return _path; }
_File::_File() {}
_File::~_File() {}
void _File::decref() {
if (__decref())
delete this;
}
tuple<File, error> open(const string &path, int flags, mode_t mode) {
int fd = ::open(path.c_str(), flags, mode);
if (fd == -1)
return make_tuple(nil, _pathError("open", path, errno));
File f = adoptref(new _File);
f->_path = path;
f->_fd = fd;
return make_tuple(f, nil);
}
error _File::close() {
_File& f = *this;
int err = ::close(f._fd);
if (err != 0)
return f._errno("close");
f._fd = -1;
return nil;
}
tuple<int, error> _File::read(void *buf, size_t count) {
_File& f = *this;
int n;
n = ::read(f._fd, buf, count);
if (n == 0)
return make_tuple(n, io::EOF_);
if (n < 0)
return make_tuple(0, f._errno("read"));
return make_tuple(n, nil);
}
tuple <int, error> _File::write(const void *buf, size_t count) {
_File& f = *this;
int n, wrote=0;
// NOTE contrary to write(2) we have to write all data as io.Writer requires.
while (count != 0) {
n = ::write(f._fd, buf, count);
if (n < 0)
return make_tuple(wrote, f._errno("write"));
wrote += n;
buf = ((const char *)buf) + n;
count -= n;
}
return make_tuple(wrote, nil);
}
error _File::stat(struct stat *st) {
_File& f = *this;
int err = fstat(f._fd, st);
if (err != 0)
return f._errno("stat");
return nil;
}
// _errno returns error corresponding to op(file) and errno.
error _File::_errno(const char *op) {
_File& f = *this;
return _pathError(op, f._path, errno);
}
// _pathError returns os.PathError-like for op/path and system error
// indicated by syserr.
static error _pathError(const char *op, const string &path, int syserr) {
// TODO v(_sysErrString(syserr)) -> v(syscall.Errno(syserr))
return fmt::errorf("%s %s: %s", op, v(path), v(_sysErrString(syserr)));
}
// _sysErrString returns string corresponding to system error syserr.
static string _sysErrString(int syserr) {
char ebuf[128];
char *estr = strerror_r(syserr, ebuf, sizeof(ebuf));
return string(estr);
}
} // os::
// xstrconv:: (strconv-like)
namespace xstrconv {
// parseHex64 decodes 16-character-wide hex-encoded string into uint64.
tuple<uint64_t, error> parseHex64(const string& s) {
if (s.size() != 16)
return make_tuple(0, fmt::errorf("hex64 %s invalid", v(s)));
uint64_t v;
int n = sscanf(s.c_str(), "%16" SCNx64, &v);
if (n != 1)
return make_tuple(0, fmt::errorf("hex64 %s invalid", v(s)));
return make_tuple(v, nil);
}
// parseInt decodes string s as signed decimal integer.
tuple<int64_t, error> parseInt(const string& s) {
int64_t v;
int n = sscanf(s.c_str(), "%" SCNi64, &v);
if (!(n == 1 && std::to_string(v) == s))
return make_tuple(0, fmt::errorf("int %s invalid", v(s)));
return make_tuple(v, nil);
}
// parseUint decodes string s as unsigned decimal integer.
tuple<uint64_t, error> parseUint(const string& s) {
uint64_t v;
int n = sscanf(s.c_str(), "%" SCNu64, &v);
if (!(n == 1 && std::to_string(v) == s))
return make_tuple(0, fmt::errorf("uint %s invalid", v(s)));
return make_tuple(v, nil);
}
} // xstrconv::
} // golang::
// xerr::
namespace xerr {
// XXX don't require fmt::vsprintf
#if 0
Contextf::Contextf(const char *format, ...) {
Contextf& c = *this;
va_list argp;
va_start(argp, format);
c.errctx = fmt::sprintfv(format, argp);
va_end(argp);
}
#endif
error Contextf::operator() (error err) const {
const Contextf& c = *this;
if (err == nil)
return nil;
return fmt::errorf("%s: %w", v(c.errctx), err);
}
} // xerr::
#include <golang/time.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/syscall.h>
// golang::log::
namespace golang {
namespace log {
void __Logf(const char *file, int line, char level, const char *format, ...) {
double t = time::now();
time_t t_int = time_t(t);
struct tm tm_loc;
localtime_r(&t_int, &tm_loc);
char t_buf[32];
strftime(t_buf, sizeof(t_buf), "%m%d %H:%M:%S", &tm_loc);
int t_us = int((t-t_int)*1E6);
pid_t tid = syscall(SYS_gettid);
string prefix = fmt::sprintf("%c%s.%06d % 7d %s:%d] ", level, t_buf, t_us, tid, file, line);
// TODO better to emit prefix and msg in one go.
flockfile(stderr);
fprintf(stderr, "%s", v(prefix));
va_list argp;
va_start(argp, format);
vfprintf(stderr, format, argp);
va_end(argp);
fprintf(stderr, "\n");
funlockfile(stderr);
}
}} // golang::log::
// wcfs::
namespace wcfs {
template<> string v_(error err) {
return (err != nil) ? err->Error() : "nil";
}
static string h016(uint64_t v) { return fmt::sprintf("%016lx", v); }
template<> string v_(const zodb::Tid& tid) { return h016(tid); }
//template<> string v_(zodb::Oid oid) { return h016(oid); }
// XXX Tid and Oid are typedefs for uint64_t and C++ reduces template
// specializations to the underlying type. This providing specialization for
// both Tid and Oid results in "multiple definition" error.
} // wcfs::
// Copyright (C) 2019-2021 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// wcfs_misc.{h,cpp} provide miscellaneous utilities for other wcfs_* files.
#ifndef _NXD_WCFS_MISC_H_
#define _NXD_WCFS_MISC_H_
// XXX hack: C++ does not have __builtin_types_compatible_p, but CCAN configure
// thinks it does because CCAN is configured via C, not C++.
#include <config.h>
#undef HAVE_BUILTIN_TYPES_COMPATIBLE_P
#define HAVE_BUILTIN_TYPES_COMPATIBLE_P 0
#include <ccan/array_size/array_size.h>
#include <stddef.h>
#include <stdint.h>
#include <golang/libgolang.h>
using namespace golang;
#include <string>
using std::string;
#include <utility>
using std::pair;
using std::make_pair;
#include <tuple>
using std::tuple;
using std::make_tuple;
using std::tie;
#include <vector>
using std::vector;
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
// golang::
namespace golang {
// os::
namespace os {
// os::File mimics os.File from Go.
// its operations return error with full file context.
typedef refptr<class _File> File;
class _File : public object {
int _fd;
string _path;
// don't new - create via open
private:
_File();
~_File();
friend tuple<File, error> open(const string &path, int flags, mode_t mode);
public:
void decref();
public:
int fd() const;
string name() const;
error close();
// read implements io.Reader from Go: it reads into buf up-to count bytes.
// XXX buf -> slice<byte> ?
tuple<int, error> read(void *buf, size_t count);
// write implements io.Writer from Go: it writes all data from buf.
//
// NOTE write behaves like io.Writer in Go - it tries to write as much
// bytes as requested, and if it could write only less - it returns error.
// XXX buf -> slice<byte> ?
tuple<int, error> write(const void *buf, size_t count);
error stat(struct stat *st);
private:
error _errno(const char *op);
};
// open opens file @path.
tuple<File, error> open(const string &path, int flags = O_RDONLY,
mode_t mode = S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IWOTH | S_IXOTH);
} // os::
// ---- misc ----
// xstrconv::
namespace xstrconv {
tuple<uint64_t, error> parseHex64(const string& s);
tuple<int64_t, error> parseInt(const string& s);
tuple<uint64_t, error> parseUint(const string& s);
} // xstrconv::
// log::
namespace log {
#define Debugf(format, ...) __Logf(__FILE__, __LINE__, 'D', format, ##__VA_ARGS__)
#define Infof(format, ...) __Logf(__FILE__, __LINE__, 'I', format, ##__VA_ARGS__)
#define Warnf(format, ...) __Logf(__FILE__, __LINE__, 'W', format, ##__VA_ARGS__)
#define Errorf(format, ...) __Logf(__FILE__, __LINE__, 'E', format, ##__VA_ARGS__)
#define Fatalf(format, ...) __Logf(__FILE__, __LINE__, 'F', format, ##__VA_ARGS__)
void __Logf(const char *file, int line, char level, const char *format, ...);
} // log::
} // golang::
// zodb::
namespace zodb {
typedef uint64_t Tid;
typedef uint64_t Oid;
} // zodb::
#include <golang/fmt.h>
// xerr::
namespace xerr {
// xerr::Contextf mimics xerr.Contextf from Go.
//
// Usage is a bit different(*) compared to Go:
//
// func doSomething(arg) {
// xerr.Contextf E("doing something %s", v(arg));
// ...
// return E(err);
// }
//
// (*) because C++ does not allow to modify returned value on the fly.
class Contextf {
string errctx;
public:
template<typename ...Argv>
inline Contextf(const char *format, Argv... argv) {
// XXX string() to avoid "error: format not a string literal" given by -Werror=format-security
errctx = fmt::sprintf(string(format), argv...);
}
error operator() (error) const;
};
} // xerr::
// wcfs::
namespace wcfs {
// TidHead is invalid Tid which is largest Tid value and means @head.
const zodb::Tid TidHead = -1ULL;
// v mimics %v for T to be used in printf & friends.
//
// NOTE returned char* pointer is guaranteed to stay valid only till end of
// current expression. For example
//
// printf("hello %s", v(obj))
//
// is valid, while
//
// x = v(obj);
// use(x);
//
// is not valid.
#define v(obj) (wcfs::v_(obj).c_str())
template<typename T> string v_(T* obj) { return obj->String(); }
template<typename T> string v_(const T* obj) { return obj->String(); }
template<typename T> string v_(const T& obj) { return obj.String(); }
template<typename T> string v_(refptr<T> obj) { return obj->String(); }
template<> inline string v_(const string& s) { return s; }
template<> string v_(error);
template<> string v_(const zodb::Tid&);
template<> string v_(const zodb::Oid&);
} // wcfs::
#endif
// Copyright (C) 2018-2021 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
#include "wcfs_watchlink.h"
#include <golang/errors.h>
#include <golang/fmt.h>
#include <golang/io.h>
#include <golang/strings.h>
#include <string.h>
#define TRACE 0
#if TRACE
# define trace(format, ...) log::Debugf(format, ##__VA_ARGS__)
#else
# define trace(format, ...) do {} while (0)
#endif
// wcfs::
namespace wcfs {
// ErrLinkDown is the error indicating that WCFS watch link is no-longer operational.
global<error> ErrLinkDown = errors::New("link is down");
// _openwatch opens new watch link on wcfs.
pair<WatchLink, error> WCFS::_openwatch() {
WCFS *wc = this;
xerr::Contextf E("%s: openwatch", v(wc));
// head/watch handle.
os::File f;
error err;
tie(f, err) = wc->_open("head/watch", O_RDWR);
if (err != nil)
return make_pair(nil, E(err));
WatchLink wlink = adoptref(new(_WatchLink));
wlink->_wc = wc;
wlink->_f = f;
wlink->_acceptq = makechan<rxPkt>();
wlink->_down = false;
wlink->_rxeof = false;
wlink->_req_next = 1;
wlink->rx_eof = makechan<structZ>();
context::Context serveCtx;
tie(serveCtx, wlink->_serveCancel) = context::with_cancel(context::background());
wlink->_serveWG = sync::NewWorkGroup(serveCtx);
wlink->_serveWG->go([wlink](context::Context ctx) -> error {
return wlink->_serveRX(ctx);
});
return make_pair(wlink, nil);
}
// close closes the link.
error _WatchLink::close() {
_WatchLink& wlink = *this;
xerr::Contextf E("%s: close", v(wlink));
error err = wlink.closeWrite();
wlink._serveCancel();
// NOTE we can get stuck here if wcfs does not behave correctly by closing
// its side in reply to our "bye" message.
//
// TODO -> better pthread_kill(SIGINT) instead of relying on wcfs proper behaviour?
error err2 = wlink._serveWG->wait();
if (errors::Is(err2, context::canceled) || // we canceled _serveWG
errors::Is(err2, io::EOF_) || // EOF received from WCFS
errors::Is(err2, ErrLinkDown)) // link shutdown due to logic error; details logged
err2 = nil;
error err3 = wlink._f->close();
if (err == nil)
err = err2;
if (err == nil)
err = err3;
return E(err);
}
// closeWrite closes send half of the link.
error _WatchLink::closeWrite() {
_WatchLink& wlink = *this;
xerr::Contextf E("%s: closeWrite", v(wlink));
wlink._txclose1.do_([&]() {
// ask wcfs to close its tx & rx sides; wcfs.close(tx) wakes up
// _serveRX on client (= on us). The connection can be already closed
// by wcfs - so ignore errors when sending bye.
(void)wlink._send(wlink._nextReqID(), "bye");
// NOTE vvv should be ~ shutdown(wlink._f, SHUT_WR), however shutdown does
// not work for non-socket file descriptors. And even if we dup link
// fd, and close only one used for TX, peer's RX will still be blocked
// as fds are referring to one file object which stays in opened
// state. So just use ^^^ "bye" as "TX closed" message.
// wlink._wtx.close();
});
return nil;
}
// _serveRX receives messages from ._f and dispatches them according to
// streamID either to .recvReq, or to .sendReq waiting for reply.
error _WatchLink::_serveRX(context::Context ctx) {
_WatchLink& wlink = *this;
xerr::Contextf E("%s: serve rx", v(wlink));
bool rxeof = false;
// when finishing - wakeup everyone waiting for rx
defer([&]() {
wlink._rxmu.lock();
wlink._rxeof = rxeof;
wlink._down = true; // don't allow new rxtab registers; mark the link as down
for (auto _ : wlink._rxtab) {
auto rxq = _.second;
rxq.close();
}
wlink._rxmu.unlock();
wlink._acceptq.close();
});
string l;
error err;
rxPkt pkt;
while (1) {
// NOTE: .close() makes sure ._f.read*() will wake up
tie(l, err) = wlink._readline();
if (err != nil) {
// peer closed its tx
if (err == io::EOF_) {
rxeof = true;
wlink.rx_eof.close();
}
return E(err);
}
trace("C: %s: rx: \"%s\"", v(wlink), v(l));
err = pkt.from_string(l);
if (err != nil)
return E(err);
if (pkt.stream == 0) { // control/fatal message from wcfs
log::Errorf("%s: rx fatal: %s\n", v(wlink), v(l));
wlink.fatalv.push_back(pkt.to_string()); // TODO -> wlink.errorq
continue; // wcfs should close link after error
}
bool reply = (pkt.stream % 2 != 0);
// wcfs replies to our request
if (reply) {
chan<rxPkt> rxq;
bool ok;
wlink._rxmu.lock();
tie(rxq, ok) = wlink._rxtab.pop_(pkt.stream);
wlink._rxmu.unlock();
if (!ok) {
// wcfs sent reply on unexpected stream -> shutdown wlink.
log::Errorf("%s: .%lu: wcfs sent reply on unexpected stream", v(wlink), pkt.stream);
return E(ErrLinkDown);
}
int _ = select({
ctx->done().recvs(), // 0
rxq.sends(&pkt), // 1
});
if (_ == 0)
return E(ctx->err());
}
// wcfs originated request
else {
wlink._rxmu.lock();
if (wlink._accepted.has(pkt.stream)) {
wlink._rxmu.unlock();
// wcfs request on already used stream
log::Errorf("%s: .%lu: wcfs sent request on already used stream", v(wlink), pkt.stream);
return E(ErrLinkDown);
}
wlink._accepted.insert(pkt.stream);
wlink._rxmu.unlock();
int _ = select({
ctx->done().recvs(), // 0
wlink._acceptq.sends(&pkt), // 1
});
if (_ == 0)
return E(ctx->err());
}
}
}
// recvReq receives client <- server request.
//
// it returns EOF when server closes the link.
static error _parsePinReq(PinReq *pin, const rxPkt *pkt);
error _WatchLink::recvReq(context::Context ctx, PinReq *prx) {
_WatchLink& wlink = *this;
xerr::Contextf E("%s: recvReq", v(wlink));
rxPkt pkt;
bool ok;
int _ = select({
ctx->done().recvs(), // 0
wlink._acceptq.recvs(&pkt, &ok), // 1
});
if (_ == 0)
return E(ctx->err());
if (!ok) {
wlink._rxmu.lock();
bool rxeof = wlink._rxeof;
wlink._rxmu.unlock();
if (rxeof)
return io::EOF_; // NOTE EOF goes without E
return E(ErrLinkDown);
}
return E(_parsePinReq(prx, &pkt));
}
// replyReq sends reply to client <- server request received via recvReq.
error _WatchLink::replyReq(context::Context ctx, const PinReq *req, const string& answer) {
_WatchLink& wlink = *this;
xerr::Contextf E("%s: replyReq .%d", v(wlink), req->stream);
wlink._rxmu.lock();
bool ok = wlink._accepted.has(req->stream);
bool down = wlink._down;
wlink._rxmu.unlock();
if (!ok)
panic("reply to not accepted stream");
if (down)
return E(ErrLinkDown);
error err = wlink._send(req->stream, answer);
wlink._rxmu.lock();
ok = wlink._accepted.has(req->stream);
if (ok)
wlink._accepted.erase(req->stream);
wlink._rxmu.unlock();
if (!ok)
panic("BUG: stream vanished from wlink._accepted while reply was in progress");
// TODO also track as answered for some time and don't accept new requests with the same ID?
return E(err);
}
// sendReq sends client -> server request and returns server reply.
pair</*reply*/string, error> _WatchLink::sendReq(context::Context ctx, const string &req) {
_WatchLink& wlink = *this;
StreamID stream = wlink._nextReqID();
xerr::Contextf E("%s: sendReq .%d", v(wlink), stream);
rxPkt rx; bool ok;
chan<rxPkt> rxq;
error err;
tie(rxq, err) = wlink._sendReq(ctx, stream, req);
if (err != nil)
return make_pair("", E(err));
// wait for reply
E = xerr::Contextf("%s: sendReq .%d: recvReply", v(wlink), stream);
int _ = select({
ctx->done().recvs(), // 0
rxq.recvs(&rx, &ok), // 1
});
if (_ == 0)
return make_pair("", E(ctx->err()));
if (!ok) {
wlink._rxmu.lock();
bool down = wlink._down;
wlink._rxmu.unlock();
return make_pair("", E(down ? ErrLinkDown : io::ErrUnexpectedEOF));
}
string reply = rx.to_string();
return make_pair(reply, nil);
}
tuple</*rxq*/chan<rxPkt>, error> _WatchLink::_sendReq(context::Context ctx, StreamID stream, const string &req) {
_WatchLink& wlink = *this;
auto rxq = makechan<rxPkt>(1);
wlink._rxmu.lock();
if (wlink._down) {
wlink._rxmu.unlock();
return make_tuple(nil, ErrLinkDown);
}
if (wlink._rxtab.has(stream)) {
wlink._rxmu.unlock();
panic("BUG: to-be-sent stream is present in rxtab");
}
wlink._rxtab[stream] = rxq;
wlink._rxmu.unlock();
error err = wlink._send(stream, req);
if (err != nil) {
// remove rxq from rxtab
wlink._rxmu.lock();
wlink._rxtab.erase(stream);
wlink._rxmu.unlock();
// no need to drain rxq - it was created with cap=1
rxq = nil;
}
return make_tuple(rxq, err);
}
// _send sends raw message via specified stream.
//
// multiple _send can be called in parallel - _send serializes writes.
// msg must not include \n.
error _WatchLink::_send(StreamID stream, const string &msg) {
_WatchLink& wlink = *this;
xerr::Contextf E("%s: send .%d", v(wlink), stream);
if (msg.find('\n') != string::npos)
panic("msg has \\n");
string pkt = fmt::sprintf("%lu %s\n", stream, v(msg));
return E(wlink._write(pkt));
}
error _twlinkwrite(WatchLink wlink, const string &pkt) {
return wlink->_write(pkt);
}
error _WatchLink::_write(const string &pkt) {
_WatchLink& wlink = *this;
// no errctx
wlink._txmu.lock();
defer([&]() {
wlink._txmu.unlock();
});
trace("C: %s: tx: \"%s\"", v(wlink), v(pkt));
int n;
error err;
tie(n, err) = wlink._f->write(pkt.c_str(), pkt.size());
return err;
}
// _parsePinReq parses message into PinReq according to wcfs isolation protocol.
static error _parsePinReq(PinReq *pin, const rxPkt *pkt) {
pin->stream = pkt->stream;
string msg = pkt->to_string();
pin->msg = msg;
xerr::Contextf E("bad pin: '%s'", v(msg));
// pin <foid>) #<blk> @<at>
if (!strings::has_prefix(msg, "pin ")) {
return E(fmt::errorf("not a pin request"));
}
auto argv = strings::split(msg.substr(4), ' ');
if (argv.size() != 3)
return E(fmt::errorf("expected 3 arguments, got %zd", argv.size()));
error err;
tie(pin->foid, err) = xstrconv::parseHex64(argv[0]);
if (err != nil)
return E(fmt::errorf("invalid foid"));
if (!strings::has_prefix(argv[1], '#'))
return E(fmt::errorf("invalid blk"));
tie(pin->blk, err) = xstrconv::parseInt(argv[1].substr(1));
if (err != nil)
return E(fmt::errorf("invalid blk"));
if (!strings::has_prefix(argv[2], '@'))
return E(fmt::errorf("invalid at"));
auto at = argv[2].substr(1);
if (at == "head") {
pin->at = TidHead;
} else {
tie(pin->at, err) = xstrconv::parseHex64(at);
if (err != nil)
return E(fmt::errorf("invalid at"));
}
return nil;
}
// _readline reads next raw line sent from wcfs.
tuple<string, error> _WatchLink::_readline() {
_WatchLink& wlink = *this;
char buf[128];
size_t nl_searchfrom = 0;
while (1) {
auto nl = wlink._rxbuf.find('\n', nl_searchfrom);
if (nl != string::npos) {
auto line = wlink._rxbuf.substr(0, nl+1);
wlink._rxbuf = wlink._rxbuf.substr(nl+1);
return make_tuple(line, nil);
}
nl_searchfrom = wlink._rxbuf.length();
// limit line length to avoid DoS
if (wlink._rxbuf.length() > 128)
return make_tuple("", fmt::errorf("input line is too long"));
int n;
error err;
tie(n, err) = wlink._f->read(buf, sizeof(buf));
if (n > 0) {
wlink._rxbuf += string(buf, n);
continue;
}
if (err == nil)
panic("read returned (0, nil)");
if (err == io::EOF_ && wlink._rxbuf.length() != 0)
err = io::ErrUnexpectedEOF;
return make_tuple("", err);
}
}
// from_string parses string into rxPkt.
error rxPkt::from_string(const string &rx) {
rxPkt& pkt = *this;
xerr::Contextf E("invalid pkt");
// <stream> ... \n
auto sp = rx.find(' ');
if (sp == string::npos)
return E(fmt::errorf("no SP"));
if (!strings::has_suffix(rx, '\n'))
return E(fmt::errorf("no LF"));
string sid = rx.substr(0, sp);
string smsg = strings::trim_suffix(rx.substr(sp+1), '\n');
error err;
tie(pkt.stream, err) = xstrconv::parseUint(sid);
if (err != nil)
return E(fmt::errorf("invalid stream ID"));
auto msglen = smsg.length();
if (msglen > ARRAY_SIZE(pkt.data))
return E(fmt::errorf("len(msg) > %zu", ARRAY_SIZE(pkt.data)));
memcpy(pkt.data, smsg.c_str(), msglen);
pkt.datalen = msglen;
return nil;
}
// to_string converts rxPkt data into string.
string rxPkt::to_string() const {
const rxPkt& pkt = *this;
return string(pkt.data, pkt.datalen);
}
_WatchLink::_WatchLink() {}
_WatchLink::~_WatchLink() {}
void _WatchLink::incref() {
object::incref();
}
void _WatchLink::decref() {
if (__decref())
delete this;
}
string _WatchLink::String() const {
const _WatchLink& wlink = *this;
// XXX don't include wcfs as prefix here? (see Conn.String for details)
return fmt::sprintf("%s: wlink%d", v(wlink._wc), wlink._f->fd());
}
int _WatchLink::fd() const {
const _WatchLink& wlink = *this;
return wlink._f->fd();
}
// _nextReqID returns stream ID for next client-originating request to be made.
StreamID _WatchLink::_nextReqID() {
_WatchLink& wlink = *this;
wlink._txmu.lock(); // TODO ._req_next -> atomic (currently uses arbitrary lock)
StreamID stream = wlink._req_next;
wlink._req_next = (wlink._req_next + 2); // wraparound at uint64 max
wlink._txmu.unlock();
return stream;
}
} // wcfs::
// Copyright (C) 2018-2021 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// wcfs_watchlink provides WatchLink class that implements message exchange
// over /head/watch on wcfs.
#ifndef _NXD_WCFS_WATCHLINK_H_
#define _NXD_WCFS_WATCHLINK_H_
#include <golang/libgolang.h>
#include <golang/context.h>
#include <golang/cxx.h>
#include <golang/sync.h>
using namespace golang;
using cxx::dict;
using cxx::set;
#include "wcfs.h"
#include "wcfs_misc.h"
// wcfs::
namespace wcfs {
struct PinReq;
// StreamID stands for ID of a stream multiplexed over WatchLink.
typedef uint64_t StreamID;
// rxPkt internally represents data of one message received over WatchLink.
struct rxPkt {
// stream over which the data was received
StreamID stream;
// raw data received/to-be-sent.
// XXX not e.g. string, as chan<T> currently does not support types with
// non-trivial copy. Note: we anyway need to limit rx line length to
// avoid DoS, but just for DoS the limit would be higher.
uint16_t datalen;
char data[256 - sizeof(StreamID) - sizeof(uint16_t)];
error from_string(const string& rx);
string to_string() const;
};
static_assert(sizeof(rxPkt) == 256, "rxPkt miscompiled"); // NOTE 128 is too low for long error message
// WatchLink represents /head/watch link opened on wcfs.
//
// It is created by WCFS._openwatch().
//
// .sendReq()/.recvReq() provides raw IO in terms of wcfs isolation protocol messages.
// .close() closes the link.
//
// It is safe to use WatchLink from multiple threads simultaneously.
typedef refptr<class _WatchLink> WatchLink;
class _WatchLink : public object {
WCFS *_wc;
os::File _f; // head/watch file handle
string _rxbuf; // buffer for data already read from _f
// iso.protocol message IO
chan<rxPkt> _acceptq; // server originated messages go here
sync::Mutex _rxmu;
bool _down; // y when the link is no-longer operational
bool _rxeof; // y if EOF was received from server
dict<StreamID, chan<rxPkt>>
_rxtab; // {} stream -> rxq server replies go via here
set<StreamID> _accepted; // streams we accepted but did not replied yet
StreamID _req_next; // stream ID for next client-originated request TODO -> atomic
sync::Mutex _txmu; // serializes writes
sync::Once _txclose1;
sync::WorkGroup _serveWG; // _serveRX is running under _serveWG
func<void()> _serveCancel;
// XXX for tests
public:
vector<string> fatalv; // ad-hoc, racy. TODO rework to send messages to control channel
chan<structZ> rx_eof; // becomes ready when wcfs closes its tx side
// don't new - create only via WCFS._openwatch()
private:
_WatchLink();
virtual ~_WatchLink();
friend pair<WatchLink, error> WCFS::_openwatch();
public:
void incref();
void decref();
public:
error close();
error closeWrite();
pair<string, error> sendReq(context::Context ctx, const string &req);
error recvReq(context::Context ctx, PinReq *rx_into);
error replyReq(context::Context ctx, const PinReq *req, const string& reply);
string String() const;
int fd() const;
private:
error _serveRX(context::Context ctx);
tuple<string, error> _readline();
error _send(StreamID stream, const string &msg);
error _write(const string &pkt);
StreamID _nextReqID();
tuple<chan<rxPkt>, error> _sendReq(context::Context ctx, StreamID stream, const string &req);
friend error _twlinkwrite(WatchLink wlink, const string &pkt);
};
// PinReq represents 1 server-initiated wcfs pin request received over /head/watch link.
struct PinReq {
StreamID stream; // request was received with this stream ID
zodb::Oid foid; // request is about this file
int64_t blk; // ----//---- about this block
zodb::Tid at; // pin to this at; TidHead means unpin to head
string msg; // XXX raw message for tests (TODO kill)
};
// for testing
error _twlinkwrite(WatchLink wlink, const string &pkt);
} // wcfs::
#endif
......@@ -144,6 +144,18 @@ def map_ro(int fd, off_t offset, size_t size):
return <unsigned char[:size:1]>addr
# map_into_ro is similar to map_ro, but mmaps fd[offset:...] into mem's memory.
def map_into_ro(unsigned char[::1] mem not None, int fd, off_t offset):
cdef void *addr = &mem[0]
cdef size_t size = mem.shape[0]
addr = mman.mmap(addr, size, mman.PROT_READ, mman.MAP_FIXED |
mman.MAP_SHARED, fd, offset)
if addr == mman.MAP_FAILED:
PyErr_SetFromErrno(OSError)
return
# unmap unmaps memory covered by mem.
def unmap(const unsigned char[::1] mem not None):
cdef const void *addr = &mem[0]
......
......@@ -23,15 +23,140 @@
"""Module wcfs_test.pyx complements wcfs_test.py with things that cannot be
implemented in Python."""
from posix.signal cimport sigaction, sigaction_t, siginfo_t, SA_SIGINFO
from libc.signal cimport SIGBUS
from posix.signal cimport sigaction, sigaction_t, siginfo_t, SA_SIGINFO, sigemptyset
from libc.signal cimport SIGBUS, SIGSEGV
from libc.setjmp cimport sigjmp_buf, sigsetjmp, siglongjmp
from libc.stdlib cimport abort
from libc.string cimport strlen
from posix.unistd cimport write, sleep
from posix.types cimport off_t
from cpython.exc cimport PyErr_SetFromErrno
from golang cimport panic
from golang cimport chan, pychan, select, panic, topyexc, cbool
from golang cimport sync, time
# _tWCFS is pyx part of tWCFS.
cdef class _tWCFS:
cdef readonly pychan _closed # chan[structZ]
cdef readonly pychan _wcfuseaborted # chan[structZ]
def __cinit__(_tWCFS t):
t._closed = pychan(dtype='C.structZ')
t._wcfuseaborted = pychan(dtype='C.structZ')
# _abort_ontimeout sends abort to fuse control file if timeout happens
# before tDB is closed.
#
# It runs without GIL to avoid deadlock: e.g. if a code that is
# holding GIL will access wcfs-mmapped memory, and wcfs will send pin,
# but pin handler is failing one way or another - select will wake-up
# but, if _abort_ontimeout uses GIL, won't continue to run trying to lock
# GIL -> deadlock.
def _abort_ontimeout(_tWCFS t, int fdabort, double dt, pychan nogilready not None):
cdef chan[double] timeoutch = time.after(dt)
emsg1 = "\nC: test timed out after %.1fs\n" % (dt / time.second)
cdef char *_emsg1 = emsg1
with nogil:
# tell main thread that we entered nogil world
nogilready.chan_structZ().close()
t.__abort_ontimeout(dt, timeoutch, fdabort, _emsg1)
cdef void __abort_ontimeout(_tWCFS t, double dt, chan[double] timeoutch,
int fdabort, const char *emsg1) nogil except +topyexc:
_ = select([
timeoutch.recvs(), # 0
t._closed.chan_structZ().recvs(), # 1
])
if _ == 1:
return # tDB closed = testcase completed
# timeout -> force-umount wcfs
writeerr(emsg1)
writeerr("-> aborting wcfs fuse connection to unblock ...\n\n")
xwrite(fdabort, b"1\n")
t._wcfuseaborted.chan_structZ().close()
# read_exfault_nogil reads mem with GIL released and returns its content.
#
# If reading hits segmentation fault, it is converted to SegmentationFault exception.
class SegmentationFault(Exception): pass
cdef sync.Mutex exfaultMu # one at a time as sigaction is per-process
cdef sigjmp_buf exfaultJmp
cdef cbool faulted
def read_exfault_nogil(const unsigned char[::1] mem not None) -> bytes:
assert len(mem) == 1, "read_exfault_nogil: only [1] mem is supported for now"
cdef unsigned char b
global faulted
cdef cbool faulted_
# somewhat dup of MUST_FAULT in test_virtmem.c
with nogil:
exfaultMu.lock()
faulted = False
try:
with nogil:
b = _read_exfault(&mem[0])
finally:
faulted_ = faulted
with nogil:
exfaultMu.unlock()
if faulted_:
raise SegmentationFault()
return bytes(bytearray([b]))
cdef void exfaultSighand(int sig) nogil:
# return from sighandler to proper place with faulted=True
global faulted
faulted = True
siglongjmp(exfaultJmp, 1)
cdef unsigned char _read_exfault(const unsigned char *p) nogil except +topyexc:
global faulted
cdef sigaction_t act, saveact
act.sa_handler = exfaultSighand
act.sa_flags = 0
err = sigemptyset(&act.sa_mask)
if err != 0:
panic("sigemptyset: failed")
err = sigaction(SIGSEGV, &act, &saveact)
if err != 0:
panic("sigaction SIGSEGV -> exfaultSighand: failed")
b = 0xff
if sigsetjmp(exfaultJmp, 1) == 0:
b = p[0] # should pagefault -> sighandler does longjmp
else:
# faulted
if not faulted:
panic("faulted, but !faulted")
err = sigaction(SIGSEGV, &saveact, NULL)
if err != 0:
panic("sigaction SIGSEGV <- restore: failed")
return b
# --------
cdef extern from "<fcntl.h>" nogil:
int posix_fadvise(int fd, off_t offset, off_t len, int advice);
enum: POSIX_FADV_DONTNEED
# fadvise_dontneed tells the kernel that file<fd>[offset +len) is not needed.
#
# see fadvise(2) for details.
def fadvise_dontneed(int fd, off_t offset, off_t len):
cdef int err = posix_fadvise(fd, offset, len, POSIX_FADV_DONTNEED)
if err:
PyErr_SetFromErrno(OSError)
# ---- signal handling ----
# TODO -> golang.signal ?
......
......@@ -140,7 +140,9 @@ package xbtree
// from that job, it first waits for corresponding job(s) to complete.
//
// Explained rebuild organization allows non-overlapping queries/track-requests
// to run simultaneously.
// to run simultaneously. This property is essential to WCFS because otherwise
// WCFS would not be able to serve several non-overlapping READ requests to one
// file in parallel.
//
// --------
//
......
......@@ -85,7 +85,9 @@ package zdata
// Track/queries requests for long.
//
// Combined this organization allows non-overlapping queries/track-requests
// to run simultaneously.
// to run simultaneously. This property is essential to WCFS because otherwise
// WCFS would not be able to serve several non-overlapping READ requests to one
// file in parallel.
//
// See also "Concurrency" in ΔBtail organization for more details.
......
......@@ -25,6 +25,8 @@ import (
"fmt"
"io"
"math"
"strconv"
"strings"
"sync/atomic"
"syscall"
......@@ -35,6 +37,8 @@ import (
"github.com/pkg/errors"
"lab.nexedi.com/kirr/go123/xio"
"lab.nexedi.com/kirr/neo/go/zodb"
)
// ---- FUSE ----
......@@ -105,7 +109,7 @@ func err2LogStatus(err error) fuse.Status {
// from any single node will make the kernel think that the filesystem does not
// support Open at all.
//
// In wcfs we have dynamic files (e.g. upcoming /head/watch) and this way we have to
// In wcfs we have dynamic files (e.g. /head/watch) and this way we have to
// avoid returning ENOSYS on nodes, that do not need file handles.
//
// fsNode is like nodefs.defaultNode, but by default Open returns to kernel
......@@ -424,6 +428,67 @@ func (f *skFile) Release() {
}
// ---- parsing ----
// parseWatchFrame parses line going through /head/watch into (stream, msg)
//
// <stream> <msg...>
func parseWatchFrame(line string) (stream uint64, msg string, err error) {
sp := strings.IndexByte(line, ' ')
if sp == -1 {
return 0, "", fmt.Errorf("invalid frame: %q", line)
}
stream, err = strconv.ParseUint(line[:sp], 10, 64)
if err != nil {
return 0, "", fmt.Errorf("invalid frame: %q (invalid stream)", line)
}
msg = strings.TrimSuffix(line[sp+1:], "\n")
return stream, msg, nil
}
// parseWatch parses watch request wcfs received over /head/watch.
//
// watch <file> (@<at>|-)
//
// at="-" is returned as zodb.InvalidTid .
func parseWatch(msg string) (oid zodb.Oid, at zodb.Tid, err error) {
defer func() {
if err != nil {
oid = zodb.InvalidOid
at = zodb.InvalidTid
err = fmt.Errorf("bad watch: %s", err)
}
}()
if !strings.HasPrefix(msg, "watch ") {
return 0, 0, fmt.Errorf("not a watch request: %q", msg)
}
argv := strings.Split(msg[len("watch "):], " ")
if len(argv) != 2 {
return 0, 0, fmt.Errorf("expected 2 arguments, got %d", len(argv))
}
oid, err = zodb.ParseOid(argv[0])
if err != nil {
return 0, 0, fmt.Errorf("invalid oid")
}
switch {
case argv[1] == "-":
at = zodb.InvalidTid
case strings.HasPrefix(argv[1], "@"):
at, err = zodb.ParseTid(argv[1][1:])
default:
err = fmt.Errorf("x") // just anything
}
if err != nil {
return 0, 0, fmt.Errorf("invalid at")
}
return oid, at, nil
}
// ---- make df happy (else it complains "function not supported") ----
func (root *Root) StatFs() *fuse.StatfsOut {
......
......@@ -39,6 +39,143 @@ part of Linux 5.2:
https://git.kernel.org/linus/ad2ba64dd489
Invalidations to wcfs clients are delayed until block access
============================================================
Initially it was planned that wcfs would send invalidation messages to its
clients right after receiving invalidation message from ZODB at transaction
boundary time. That simplifies logic but requires that for a particular file,
wcfs has to send to clients whole range of where the file was changed.
Emitting whole δR right at transaction-boundary time requires to keep whole
ZBigFile.blktab index in RAM. Even though from space point of view it is
somewhat acceptable (~ 0.01% of whole-file data size, i.e. ~ 128MB of index for
~ 1TB of data), it is not good from time overhead point of view - initial open
of a file this way would be potentially slow as full blktab scan - including
Trees _and_ Buckets nodes - would be required.
-> we took the approach where we send invalidation to client about a block
lazily only when the block is actually accessed.
Rejected alternative:
Building δFtail lazily along serving FUSE reads during scope of one
transaction is not trivial and would create concurrency bottlenecks if simple
locking scheme is used. With the main difficulty being to populate tracking set
of δBtree lazily. However as the first approach we could still build complete
tracking set for a BTree at the time of file open: we need to scan through all
trees but _not_ buckets: this way we'll know oid of all tree nodes: trees _and_
buckets, while avoiding loading buckets makes this approach practical: with
default LOBTree settings (1 bucket = 60·objects, 1 tree = 500·buckets) it will
require ~ 20 trees to cover 1TB of data. And we can scan those trees very
quickly even if doing so serially. For 1PB of data it will require to scan ~
10⁴ trees. If RTT to load 1 object is ~1ms this will become 10 seconds if done
serially. However if we load all those tree objects in parallel it will be
much less. Still the number of trees to scan is linear to the amount of data.
-> rejected: ΔFtail and ΔBtail were instead fixed to allow several Track and
queries requests to run in parallel. See "Concurrency" section in ΔFtail/ΔBtail
organization overview.
Changing mmapping while under pagefault is possible
===================================================
We can change a mapping while a page from it is under pagefault:
- the kernel, upon handling pagefault, queues read request to filesystem
server. As of Linux 4.20 this is done _with_ holding client->mm->mmap_sem:
kprobe:fuse_readpages (client->mm->mmap_sem.count: 1)
fuse_readpages+1
read_pages+109
__do_page_cache_readahead+401
filemap_fault+635
__do_fault+31
__handle_mm_fault+3403
handle_mm_fault+220
__do_page_fault+598
page_fault+30
- however the read request is queued to be performed asynchronously -
the kernel does not wait for it in fuse_readpages, because
* git.kernel.org/linus/c1aa96a5,
* git.kernel.org/linus/9cd68455,
* and go-fuse initially negotiating CAP_ASYNC_READ to the kernel.
- the kernel then _releases_ client->mm->mmap_sem and then waits
for to-read pages to become ready:
* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/filemap.c?id=v4.20-rc3-83-g06e68fed3282#n2411
* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/filemap.c?id=v4.20-rc3-83-g06e68fed3282#n2457
* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/filemap.c?id=v4.20-rc3-83-g06e68fed3282#n1301
- the filesystem server, upon receiving the read request, can manipulate
the client's address space. This requires write-locking client->mm->mmap_sem,
but we can be sure it won't deadlock because the kernel releases it
before waiting (see the previous point).
In practice the manipulation is done by another client thread, because
on Linux it is not possible to change the mm of another process. However,
the main point here is that the manipulation is possible because
there will be no deadlock on client->mm->mmap_sem.
For reference, here is how the filesystem server's reply looks under trace:
kprobe:fuse_readpages_end
fuse_readpages_end+1
request_end+188
fuse_dev_do_write+1921
fuse_dev_write+78
do_iter_readv_writev+325
do_iter_write+128
vfs_writev+152
do_writev+94
do_syscall_64+85
entry_SYSCALL_64_after_hwframe+68
and a test program that demonstrates that it is possible to change
mmapping while under pagefault to it:
https://lab.nexedi.com/kirr/go-fuse/commit/f822c9db
Starting from Linux 5.1 mmap_sem should be generally released while doing any IO:
https://git.kernel.org/linus/6b4c9f4469
but before that the analysis remains FUSE-specific.
The property that changing a mmapping while under pagefault to it is possible
is verified by the wcfs testsuite in the `test_wcfs_remmap_on_pin` test.
Client cannot be ptraced while under pagefault
==============================================
We cannot use ptrace to run code on a client thread that is under pagefault:
The kernel sends SIGSTOP to interrupt the tracee, but the signal will be
processed only when the process returns from kernel space, e.g. here
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/entry/common.c?id=v4.19-rc8-151-g23469de647c4#n160
This way the tracer won't receive the obligatory notification that the tracee
stopped (via wait...), and even though ptrace(ATTACH) succeeds, all other
ptrace commands will fail:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/ptrace.c?id=v4.19-rc8-151-g23469de647c4#n1140
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/ptrace.c?id=v4.19-rc8-151-g23469de647c4#n207
My original idea was to use ptrace to run code in the client process to change
its memory mappings while the triggering process is under pagefault/read
to wcfs, and the above shows it won't work - trying to ptrace the
client from under wcfs will just block forever (the kernel will be
waiting for the read operation to finish before handling ptrace, and the read
will in turn be waiting for the ptrace stop to complete = deadlock).
Kernel locks page on read/cache store/... - we have to be careful not to deadlock
=================================================================================
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2019-2021 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""Program wcfs_readcancel is helper for wcfs_test to verify that
sysread(/head/watch) is unblocked and canceled when kernel asks WCFS to cancel
that read request.
Without proper FUSE INTERRUPT handling on WCFS side, such reads are not
cancelled, which results in processes that were aborted or even `kill-9`ed being
stuck forever waiting for WCFS to release them.
"""
from __future__ import print_function, absolute_import
from golang import select, default
from golang import context, sync, time
import os, sys
def main():
    wcfs_root = sys.argv[1]
    f = open("%s/head/watch" % wcfs_root)

    wg = sync.WorkGroup(context.background())

    def _(ctx):
        data = f.read()     # should block forever
        raise AssertionError("read: woken up: data=%r" % data)
    wg.go(_)

    def _(ctx):
        time.sleep(100*time.millisecond)
        _, _rx = select(
            default,            # 0
            ctx.done().recv,    # 1
        )
        if _ == 1:
            raise ctx.err()
        os._exit(0)
    wg.go(_)

    wg.wait()
    raise AssertionError("should be unreachable")


if __name__ == '__main__':
    main()
......@@ -29,8 +29,11 @@
//
// For a client, the primary way to access a bigfile should be to mmap
// head/bigfile/<bigfileX> which represents always latest bigfile data.
// Clients that want to get an isolation guarantee should subscribe for
// invalidations and re-mmap invalidated regions to a file with a pinned bigfile
// revision for the duration of their transaction. See "Isolation protocol" for details.
//
// In the usual situation when bigfiles are big
// In the usual situation when bigfiles are big, and there are O(1)/δt updates,
// there should be no need for any cache besides shared kernel cache of latest
// bigfile data.
//
......@@ -54,6 +57,7 @@
//
// head/
// at ; data inside head/ is as of this ZODB transaction
// watch ; channel for bigfile invalidations
// bigfile/ ; bigfiles' data
// <oid(ZBigFile1)>
// <oid(ZBigFile2)>
......@@ -62,7 +66,9 @@
// where /bigfile/<bigfileX> represents latest bigfile data as stored in
// upstream ZODB. As there can be some lag receiving updates from the database,
// /at describes precisely ZODB state for which bigfile data is currently
// exposed.
// exposed. Whenever bigfile data is changed in upstream ZODB, information
// about the changes is first propagated to /watch, and only after that
// /bigfile/<bigfileX> is updated. See "Isolation protocol" for details.
//
// @<revX>/ has the following structure:
//
......@@ -78,6 +84,124 @@
// wcfs filesystem. Similarly @<revX>/ become visible only after access.
//
//
// Isolation protocol
//
// In order to support isolation, wcfs implements an isolation protocol that
// must be cooperatively followed by both wcfs and the client.
//
// First, client mmaps latest bigfile, but does not access it:
//
// mmap(head/bigfile/<bigfileX>)
//
// Then client opens head/watch and tells wcfs through it for which ZODB state
// it wants to get bigfile's view:
//
// C: 1 watch <bigfileX> @<at>
//
// The server then, after potentially sending initial pin and unpin messages
// (see below), reports either success or failure:
//
// S: 1 ok
// S: 1 error ... ; if <at> is too far away back from head/at
//
// The server sends "ok" reply only after head/at is ≥ requested <at>, and only
// after all initial pin/unpin messages are fully acknowledged by the client.
// The client can start to use mmapped data after it gets "ok".
// The server sends "error" reply e.g. if requested <at> is too far away back
// from head/at, or on any other error.
// TODO specify watch state after error.
//
// Upon a watch request, both during initial setup and later, after "ok" has been
// sent, the server notifies the client about file blocks that the client needs
// to pin in order to observe the file's data as of the <at> revision:
//
// The filesystem server itself receives information about changed data from
// the ZODB server through the regular ZODB invalidation channel (as it is itself
// a ZODB client). Then, separately for each changed file block, before actually
// updating head/bigfile/<bigfileX> content, it notifies - through the opened
// head/watch links, separately to each client that had requested it - about
// the changes:
//
// S: <2·k> pin <bigfileX> #<blk> @<rev_max> ; @head means unpin
//
// and waits until all clients confirm that changed file block can be updated
// in global OS cache.
//
// The client in turn should now re-mmap the block requested to be pinned to bigfile@<rev_max>
//
// # mmapped at address corresponding to #blk
// mmap(@<rev_max>/bigfile/<bigfileX>, #blk, MAP_FIXED)
//
// or, if given @head as @<rev_max>, to bigfile@head
//
// mmap(head/bigfile/<bigfileX>, #blk, MAP_FIXED)
//
// and must send ack back to the server when it is done:
//
// C: <2·k> ack
//
// The server sends pin notifications only for file blocks that are known to
// be potentially changed after the client's <at>, and <rev_max> describes the
// upper bound for the block revision as of the <at> database view:
//
// <rev_max> ≤ <at> ; block stays unchanged in (<rev_max>, <at>] range
//
// The server maintains a short history tail of file changes to be able to
// support openings with <at> being slightly in the past compared to the current
// head/at. The server might reject a watch request if <at> is too far in the
// past from head/at. The client is advised to restart its transaction with a
// more up-to-date database view if it gets a watch setup error.
//
// A later request from the client for the same <bigfileX> but with a different
// <at> overrides the previous watch request for that file. A client can use "-"
// instead of "@<at>" to stop watching a file.
//
// A single client can send several watch requests through a single head/watch
// open, and it can also use several head/watch opens simultaneously.
// The server sends pin notifications for all files requested to be watched via
// each opened head/watch link.
//
// Note: a client could use a single watch to manage its several views for the same
// file but with different <at>. This could be achieved via watching with
// @<at_min>, and then deciding internally which views need to be adjusted and
// which need not. Wcfs does not oblige clients to do so, though, and a
// client is free to use as many head/watch opens as it needs.
//
// When clients are done with @<revX>/bigfile/<bigfileX> (i.e. the client's
// transaction ends and the array is unmapped), the server sees the number of
// open file handles for @<revX>/bigfile/<bigfileX> drop to zero, and
// automatically destroys @<revX>/bigfile/<bigfileX> after a reasonable timeout.
//
// The client should send "bye" before closing head/watch file:
//
// C: <2·k+1> bye
//
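// For illustration, one possible exchange over a single head/watch open could
// look like the following (a sketch - the oid, block numbers and revisions
// below are made up):
//
//      C: 1 watch 0000000000000042 @03d7e24a00000000
//      S: 2 pin 0000000000000042 #5 @03d7e12100000000   ; #5 was changed after <at>
//      C: 2 ack                                         ; client re-mmapped #5 to that revision
//      S: 1 ok                                          ; watch is established
//      ...                                              ; later, head/file[#7] is changed and then read
//      S: 4 pin 0000000000000042 #7 @03d7e10a00000000
//      C: 4 ack
//      C: 3 bye                                         ; client is done with this watch link
//
// Streams initiated by the client are odd (1, 3, ...), while streams initiated
// by wcfs are even (2, 4, ...) - see the serve/sendReq code below.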
//
// Protection against slow or faulty clients
//
// If a client, on purpose, due to a bug, or because it was stopped, is slow to
// respond with ack to a file invalidation notification, it creates a problem:
// the server becomes blocked waiting for pin acknowledgments, and thus all
// other clients that try to work with the same file get stuck.
//
// The problem could be avoided if wcfs resided inside the OS kernel and could
// this way manipulate clients' address space directly (then the isolation
// protocol would not be needed). It is also possible to imagine a mechanism
// where wcfs would synchronously change clients' address space by injecting
// trusted code and running it on the client side via ptrace to adjust file
// mappings.
//
// However, ptrace does not work when a client thread is blocked under pagefault,
// and that is exactly the situation in which wcfs would need to use it in order
// to process invalidations lazily, because eager invalidation processing results
// in prohibitively slow file opens. See the internal wcfs overview for details
// about why ptrace cannot be used and why lazy invalidation processing is required.
//
// Lacking OS primitives to change the address space of another process, and not
// being able to work around that with ptrace in userspace, wcfs takes the
// approach of killing a slow client on a 30 second timeout by default.
//
//
// Writes
//
// As each bigfile is represented by 1 synthetic file, there can be several
......@@ -121,8 +245,8 @@ package main
// Wcfs organization
//
// Wcfs is a ZODB client that translates ZODB objects into OS files as would
// non-wcfs wendelin.core do for a ZBigFile.
// It is organized as follows:
// non-wcfs wendelin.core do for a ZBigFile. Contrary to non-wcfs wendelin.core,
// it keeps bigfile data in shared OS cache efficiently. It is organized as follows:
//
// 1) 1 ZODB connection for "latest data" for whole filesystem (zhead).
// 2) head/bigfile/* of all bigfiles represent state as of zhead.At .
......@@ -173,9 +297,14 @@ package main
// won't be served from OS file cache and instead will trigger a FUSE read
// request to wcfs.
//
// 4.3) no invalidation messages are sent to wcfs clients at this point(+).
//
// 4.4) processing ZODB invalidations and serving file reads (see 7) are
// organized to be mutually exclusive.
//
// 4.5) similarly, processing ZODB invalidations and setting up watches (see
// 7.2) are organized to be mutually exclusive.
//
// 5) after OS file cache was invalidated, we resync zhead to new database
// view corresponding to tid.
//
......@@ -195,7 +324,15 @@ package main
// .rev↑
// {} file -> {}blk
//
// Scalability of δFtail plays important role in scalability of WCFS.
// min(rev) in δFtail is min(@at) at which head/bigfile/file is currently watched (see below).
//
// To support initial openings with @at being slightly in the past, we also
// make sure that min(rev) is enough to cover last 1 minute of history
// from head/at.
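// (in the implementation below this corresponds to computing, after every
// handled transaction,
//
//      revCut = min(tid(time(head/at) - 1min), min over watches of watch.at)
//
// and calling δFtail.ForgetPast(revCut))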
//
// Scalability of δFtail plays important role in scalability of WCFS because
// δFtail, besides other places, is queried and potentially rebuilt at every
// FUSE read request (see 7 below).
//
// See documentation in internal/zdata/δftail.go for more details on ΔFtail
// and its scalability properties.
......@@ -203,8 +340,64 @@ package main
// 7) when we receive a FUSE read(#blk) request to a head/bigfile/file, we process it as follows:
//
// 7.1) load blkdata for head/bigfile/file[blk] @zhead.at .
//
// while loading this also gives upper bound estimate of when the block
// was last changed:
//
// rev(blk) ≤ max(_.serial for _ in (ZBlk(#blk), all BTree/Bucket that lead to ZBlk))
//
// it is not exact because BTree/Bucket can change (e.g. rebalance)
// but still point to the same k->ZBlk.
//
// we also use δFtail to find either exact blk revision or another upper
// bound if file[blk] has no change during δFtail coverage:
//
// rev(blk) = δFtail.BlkRevAt(file, #blk, zhead.at)
//
// below rev'(blk) is min(of the estimates found):
//
// rev(blk) ≤ rev'(blk) rev'(blk) = min(^^^)
//
// Note: we delay recomputing δFtail.BlkRevAt(file, #blk, head) because
// using just cheap revmax estimate can frequently result in all watches
// being skipped.
//
// 7.2) for all registered client@at watches of head/bigfile/file:
//
// - rev'(blk) ≤ at: -> do nothing
// - rev'(blk) > at:
// - if blk ∈ watch.pinned -> do nothing
// - rev = δFtail.BlkRevAt(file, #blk, at)
// - watch.pin(file, #blk, @rev)
// - watch.pinned += blk
//
// where
//
// watch.pin(file, #blk, @rev)
//
// sends pin message according to "Isolation protocol", and is assumed
// to cause
//
// remmap(file, #blk, @rev/bigfile/file)
//
// on client.
//
// ( one could imagine adjusting mappings synchronously via running
// wcfs-trusted code via ptrace that wcfs injects into clients, but ptrace
// won't work when client thread is blocked under pagefault or syscall(^) )
//
// in order to support watching for each head/bigfile/file
//
// [] of watch{client@at↑, pinned}
//
// is maintained.
//
// 7.3) blkdata is returned to kernel.
//
// Thus a client that wants latest data on pagefault will get latest data,
// and a client that wants @rev data will get @rev data, even if it was this
// "old" client that triggered the pagefault(~).
//
// 8) serving FUSE reads from @<rev>/bigfile/file is organized similarly to
// serving reads from head/bigfile/file, but with using dedicated per-<rev>
// ZODB connection and without notifying any watches.
......@@ -217,6 +410,9 @@ package main
//
//
// (*) see notes.txt -> "Notes on OS pagecache control"
// (+) see notes.txt -> "Invalidations to wcfs clients are delayed until block access"
// (~) see notes.txt -> "Changing mmapping while under pagefault is possible"
// (^) see notes.txt -> "Client cannot be ptraced while under pagefault"
// (%) no need to keep track of ZData - ZBlk1 is always marked as changed on blk data change.
......@@ -239,6 +435,24 @@ package main
// If a lock is not minor to zheadMu, it is still ok to lock it under zheadMu.R
// as zheadMu, being the most major lock in wcfs, always comes locked first, if
// it needs to be locked.
//
// For watches, similarly to zhead, watch.at is protected by the major-for-watch
// per-watch RW lock watch.atMu . When watch.at is updated during watch
// setup/upgrade time - watch.atMu.W is taken. Conversely, whenever a watch is
// notified with pin messages - watch.atMu.R is taken to make sure watch.at
// stays unchanged while pins are prepared and processed.
//
// For watches, similarly to zheadMu, there are several minor-to-atMu locks
// that protect internal data structures. Such locks are noted similarly to
// zheadMu enslavement.
//
// In addition to what is written above there are other ordering rules that are
// followed consistently to avoid hitting deadlock:
//
// BigFile.watchMu > Watch.atMu
// WatchLink.byfileMu > BigFile.watchMu
// WatchLink.byfileMu > BigFileDir.fileMu
// WatchLink.byfileMu > Watch.atMu
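//
// For example, a code path that needs several of these locks must take them in
// major-to-minor order; a sketch mirroring the acquisition order used by
// setupWatch below:
//
//      wlink.byfileMu.Lock()         // WatchLink.byfileMu
//      f.watchMu.Lock()              //   > BigFile.watchMu
//      w.atMu.Lock()                 //     > Watch.atMu
//      ...                           // update watch registration / compute pins
//      w.atMu.Unlock()
//      f.watchMu.Unlock()
//      wlink.byfileMu.Unlock()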
// Notation used
......@@ -251,8 +465,11 @@ package main
//
// f - BigFile
// bfdir - BigFileDir
// wlink - WatchLink
// w - Watch
import (
"bufio"
"context"
"flag"
"fmt"
......@@ -328,6 +545,7 @@ type Head struct {
rev zodb.Tid // 0 for head/, !0 for @<rev>/
bfdir *BigFileDir // bigfile/
// at - served by .readAt
// watch - implicitly linked to by fs
// ZODB connection for everything under this head
......@@ -352,6 +570,14 @@ type Head struct {
continueOSCacheUpload chan struct{}
// uploadBlk signals to zwatcher that there are so many inflight OS cache uploads currently.
inflightOSCacheUploads int32
// head/watch opens
wlinkMu sync.Mutex
wlinkTab map[*WatchLink]struct{}
// waiters for zhead.At to become ≥ their at.
hwaitMu sync.Mutex // zheadMu.W | zheadMu.R + hwaitMu
hwait map[hwaiter]struct{} // set{(at, ready)}
}
// /(head|<rev>)/bigfile/ - served by BigFileDir.
......@@ -395,6 +621,14 @@ type BigFile struct {
// consulted/invalidated whenever wcfs logic needs to consult/invalidate OS cache.
loadMu sync.Mutex // zheadMu.W | zheadMu.R + loadMu
loading map[int64]*blkLoadState // #blk -> {... blkdata}
// watches attached to this file.
//
// both watches in already "established" state (i.e. initial watch
// request was completed and answered with "ok"), and watches in
// progress of being established are kept here.
watchMu sync.RWMutex
watchTab map[*Watch]struct{}
}
// blkLoadState represents a ZBlk load state/result.
......@@ -408,6 +642,62 @@ type blkLoadState struct {
err error
}
// /head/watch - served by WatchNode.
type WatchNode struct {
fsNode
head *Head // parent head/
idNext int32 // ID for next opened WatchLink
}
// /head/watch open - served by WatchLink.
type WatchLink struct {
sk *FileSock // IO channel to client
id int32 // ID of this /head/watch handle (for debug log)
head *Head
// watches associated with this watch link.
//
// both already established, and watches being initialized in-progress are registered here.
// (see setupWatch)
byfileMu sync.Mutex
byfile map[zodb.Oid]*Watch // {} foid -> Watch
// IO
reqNext uint64 // stream ID for next wcfs-originated request; 0 is reserved for control messages
txMu sync.Mutex
rxMu sync.Mutex
rxTab map[/*stream*/uint64]chan string // client replies go via here
}
// Watch represents watching for changes to 1 BigFile over particular watch link.
type Watch struct {
link *WatchLink // link to client
file *BigFile // watching this file
// atMu, similarly to zheadMu, protects watch.at and pins associated with Watch.
// atMu.R guarantees that watch.at is not changing, but multiple
// simultaneous pins could be running (used e.g. by readPinWatchers).
// atMu.W guarantees that only one user has watch.at write access and
// that no pins are running (used by setupWatch).
atMu sync.RWMutex
at zodb.Tid // requested to be watched @at
pinnedMu sync.Mutex // atMu.W | atMu.R + pinnedMu
pinned map[int64]*blkPinState // {} blk -> {... rev} blocks that are already pinned to be ≤ at
}
// blkPinState represents state/result of pinning one block.
//
// when !ready the pinning is in progress.
// when ready the pinning has been completed.
type blkPinState struct {
rev zodb.Tid // revision to which the block is being or has been pinned
ready chan struct{}
err error
}
// -------- ZODB cache control --------
// zodbCacheControl implements zodb.LiveCacheControl to tune ZODB to never evict
......@@ -579,9 +869,14 @@ retry:
file := bfdir.fileTab[foid]
if δfile.Epoch {
wg.Go(func(ctx context.Context) error {
return file.invalidateAll() // NOTE does not accept ctx
})
// XXX while invalidating whole file at epoch is easy,
// it becomes not so easy to handle isolation if epochs
// could be present. For this reason we forbid changes
// to ZBigFile objects for now.
return fmt.Errorf("ZBigFile<%s> changed @%s", foid, δF.Rev)
// wg.Go(func(ctx context.Context) error {
// return file.invalidateAll() // NOTE does not accept ctx
// })
} else {
for blk := range δfile.Blocks {
blk := blk
......@@ -660,14 +955,73 @@ retry:
}
// shrink δFtail not to grow indefinitely.
// cover history for at least 1 minute.
// cover history for at least 1 minute, and also cover all watches.
// No need to lock anything because we are holding zheadMu and
// setupWatch too runs with zheadMu locked.
//
// TODO shrink δFtail only once in a while - there is no need to
// cut δFtail on every transaction.
// TODO shrink δFtail only once in a while - there is no need to compute
// revCut and cut δFtail on every transaction.
revCut := zodb.TidFromTime(zhead.At().Time().Add(-1*time.Minute))
for wlink := range head.wlinkTab {
for _, w := range wlink.byfile {
if w.at < revCut {
revCut = w.at
}
}
}
bfdir.δFtail.ForgetPast(revCut)
// notify zhead.At waiters
for hw := range head.hwait {
if hw.at <= δZ.Tid {
delete(head.hwait, hw)
close(hw.ready)
}
}
return nil
}
// hwaiter represents someone waiting for zhead to become ≥ at.
type hwaiter struct {
at zodb.Tid
ready chan struct{}
}
// zheadWait waits till head.zconn.At becomes ≥ at.
//
// It returns error either if wcfs is down or ctx is canceled.
func (head *Head) zheadWait(ctx context.Context, at zodb.Tid) (err error) {
defer xerr.Contextf(&err, "wait zhead ≥ %s", at)
if head.rev != 0 {
panic("must be called only for head/, not @revX/")
}
// TODO check wcfs.down
// check if zhead is already ≥ at
head.zheadMu.RLock()
if head.zconn.At() >= at {
head.zheadMu.RUnlock()
return nil
}
// no - we have to wait for it
ready := make(chan struct{})
head.hwaitMu.Lock()
head.hwait[hwaiter{at, ready}] = struct{}{}
head.hwaitMu.Unlock()
head.zheadMu.RUnlock()
select {
case <-ctx.Done():
return ctx.Err()
case <-ready:
return nil // ok - zhead.At went ≥ at
}
}
// invalidateBlk invalidates 1 file block in kernel cache.
......@@ -898,14 +1252,21 @@ func (f *BigFile) readBlk(ctx context.Context, blk int64, dest []byte) (err erro
}
// noone was loading - we became responsible to load this block
blkdata, treepath, blkcov, zblk, _, err := f.zfile.LoadBlk(ctx, blk)
blkdata, treepath, blkcov, zblk, blkrevMax, err := f.zfile.LoadBlk(ctx, blk)
// head/ - update δFtail
// head/ - update δFtail + pin watchers
if f.head.rev == 0 && err == nil {
// update δFtail index
// see "3) for */head/data the following invariant is maintained..."
δFtail := f.head.bfdir.δFtail
δFtail.Track(f.zfile, blk, treepath, blkcov, zblk)
// we have the data - it can be used after watchers are updated
// XXX should we use ctx here? (see readPinWatchers comments)
err = f.readPinWatchers(ctx, blk, blkrevMax)
if err != nil {
blkdata = nil
}
}
loading.blkdata = blkdata
......@@ -1017,6 +1378,659 @@ retry:
log.Errorf("BUG: bigfile %s: blk %d: -> pagecache: %s (ignoring, but reading from bigfile will be very slow)", oid, blk, st)
}
// -------- isolation protocol notification/serving --------
//
// (see "7.2) for all registered client@at watchers ...")
const _traceIso = false
func traceIso(format string, argv ...interface{}) {
if !_traceIso {
return
}
log.InfoDepth(1, fmt.Sprintf(format, argv...))
}
// pin makes sure that file[blk] on client side is the same as of @rev state.
//
// rev = zodb.TidMax means @head; otherwise rev must be ≤ w.at and there must
// be no rev_next changing file[blk]: rev < rev_next ≤ w.at.
//
// must be called with atMu rlocked.
//
// TODO close watch on any error
func (w *Watch) pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) {
defer xerr.Contextf(&err, "wlink%d: f<%s>", w.link.id, w.file.zfile.POid())
return w._pin(ctx, blk, rev)
}
func (w *Watch) _pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) {
foid := w.file.zfile.POid()
revstr := rev.String()
if rev == zodb.TidMax {
revstr = "head"
}
defer xerr.Contextf(&err, "pin #%d @%s", blk, revstr)
if !(rev == zodb.TidMax || rev <= w.at) {
panicf("f<%s>: wlink%d: pin #%d @%s: watch.at (%s) < rev",
foid, w.link.id, blk, rev, w.at)
}
w.pinnedMu.Lock()
// check/wait for previous/simultaneous pin.
// (pin could be called simultaneously e.g. by setupWatch and readPinWatchers)
for {
blkpin := w.pinned[blk]
if blkpin == nil {
break
}
w.pinnedMu.Unlock()
<-blkpin.ready // TODO +ctx cancel
if blkpin.rev == rev {
// already pinned
// (e.g. os cache for block was evicted and read called the second time)
return blkpin.err
}
// relock the watch and check that w.pinned[blk] is the same. Retry if it is not.
// ( w.pinned[blk] could have changed while w.pinnedMu was not held e.g. by
// simultaneous setupWatch if we were called by readPinWatchers )
w.pinnedMu.Lock()
if blkpin == w.pinned[blk] {
if blkpin.rev == zodb.TidMax {
w.pinnedMu.Unlock()
panicf("f<%s>: wlink%d: pinned[#%d] = @head", foid, w.link.id, blk)
}
break
}
}
// w.pinnedMu locked & previous pin is either nil or completed and its .rev != rev
// -> setup new pin state
blkpin := &blkPinState{rev: rev, ready: make(chan struct{})}
w.pinned[blk] = blkpin
// perform IO without w.pinnedMu
w.pinnedMu.Unlock()
ack, err := w.link.sendReq(ctx, fmt.Sprintf("pin %s #%d @%s", foid, blk, revstr))
w.pinnedMu.Lock()
// check IO reply & verify/signal blkpin is ready
defer func() {
if rev == zodb.TidMax {
delete(w.pinned, blk)
}
w.pinnedMu.Unlock()
close(blkpin.ready)
}()
if err != nil {
blkpin.err = err
return err
}
if ack != "ack" {
blkpin.err = fmt.Errorf("expect %q; got %q", "ack", ack)
return blkpin.err
}
if blkpin != w.pinned[blk] {
blkpin.err = fmt.Errorf("BUG: pinned[#%d] mutated while doing IO", blk)
panicf("f<%s>: wlink%d: %s", foid, w.link.id, blkpin.err)
}
return nil
}
// readPinWatchers complements readBlk: it sends `pin blk` for watchers of the file
// after a block was loaded from ZODB but before block data is returned to kernel.
//
// See "7.2) for all registered client@at watchers ..."
//
// Must be called only for f under head/
// Must be called with f.head.zheadMu rlocked.
//
// XXX do we really need to use/propagate caller context here? ideally update
// watchers should be synchronous, and in practice we just use 30s timeout (TODO).
// Should a READ interrupt cause watch update failure? -> probably no
func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, blkrevMax zodb.Tid) (err error) {
defer xerr.Context(&err, "pin watchers") // f.path and blk is already put into context by readBlk
// only head/ is being watched for
if f.head.rev != 0 {
panic("BUG: readPinWatchers: called for file under @revX/")
}
//fmt.Printf("S: read #%d -> pin watchers (#%d)\n", blk, len(f.watchTab))
// make sure that file[blk] on the clients' side stays as of @w.at state.
// try to use blkrevMax only as the first cheap criterion to skip updating watchers.
// This is likely to be the case, since most watchers should be usually close to head.
// If using blkrevMax only turns out to be not sufficient, we'll
// consult δFtail, which might involve recomputing it.
δFtail := f.head.bfdir.δFtail
blkrev := blkrevMax
blkrevRough := true
wg := xsync.NewWorkGroup(ctx)
f.watchMu.RLock()
for w := range f.watchTab {
w := w
// make sure w.at stays unchanged while we prepare and pin the block
w.atMu.RLock()
// the block is already covered by @w.at database view
if blkrev <= w.at {
w.atMu.RUnlock()
continue
}
// if blkrev is rough estimation and that upper bound is > w.at
// we have to recompute ~exact file[blk] revision @head.
if blkrevRough {
// unlock atMu while we are (re-)calculating blkrev
// we'll relock atMu again and recheck blkrev vs w.at after.
w.atMu.RUnlock()
var err error
blkrev, _, err = δFtail.BlkRevAt(ctx, f.zfile, blk, f.head.zconn.At())
if err != nil {
return err
}
blkrevRough = false
w.atMu.RLock()
if blkrev <= w.at {
w.atMu.RUnlock()
continue
}
}
// the block is newer - find out its revision as of @w.at and pin to that.
//
// We don't pin to w.at since, if we did so for several clients with mostly
// different w.at, the file cache would be lost. By pinning to a particular
// block revision instead, we make sure the revision to pin is the same on all
// clients, and so the file cache is shared.
wg.Go(func(ctx context.Context) error {
defer w.atMu.RUnlock()
pinrev, _, err := δFtail.BlkRevAt(ctx, f.zfile, blk, w.at)
if err != nil {
return err
}
//fmt.Printf("S: read #%d: watch @%s: pin -> @%s\n", blk, w.at, pinrev)
// TODO close watcher on any error
return w.pin(ctx, blk, pinrev)
})
}
f.watchMu.RUnlock()
return wg.Wait()
}
// setupWatch sets up or updates a Watch when client sends `watch <file> @<at>` request.
//
// It sends "pin" notifications; final "ok" or "error" must be sent by caller.
func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.Tid) (err error) {
defer xerr.Contextf(&err, "setup watch f<%s> @%s", foid, at)
head := wlink.head
bfdir := head.bfdir
// wait for zhead.At ≥ at
if at != zodb.InvalidTid {
err = head.zheadWait(ctx, at)
if err != nil {
return err
}
}
// make sure zhead.At stays unchanged while we are preparing the watch
// (see vvv e.g. about unpin to @head for why it is needed)
head.zheadMu.RLock()
defer head.zheadMu.RUnlock()
headAt := head.zconn.At()
if at != zodb.InvalidTid && at < bfdir.δFtail.Tail() {
return fmt.Errorf("too far away back from head/at (@%s); δt = %s",
headAt, headAt.Time().Sub(at.Time().Time))
}
wlink.byfileMu.Lock()
// if watch was already established - we need to update it
w := wlink.byfile[foid]
if w == nil {
// watch was not previously established - set it up anew
bfdir.fileMu.Lock()
f := bfdir.fileTab[foid]
bfdir.fileMu.Unlock()
if f == nil {
wlink.byfileMu.Unlock()
// by "isolation protocol" watch is setup after data file was opened
return fmt.Errorf("file not yet known to wcfs or is not a ZBigFile")
}
w = &Watch{
link: wlink,
file: f,
at: at,
pinned: make(map[int64]*blkPinState),
}
}
f := w.file
f.watchMu.Lock()
// at="-" (InvalidTid) means "remove the watch"
if at == zodb.InvalidTid {
delete(wlink.byfile, foid)
delete(f.watchTab, w)
f.watchMu.Unlock()
wlink.byfileMu.Unlock()
return nil
}
// request exclusive access to the watch to change .at and compute pins.
// The lock will be downgraded from W to R after pins computation is done.
// Pins will be executed with atMu.R only - with the idea not to block
// other clients that read-access the file simultaneously to setupWatch.
w.atMu.Lock()
// check at >= w.at
// TODO(?) we might want to allow going back in history if we need it.
if !(at >= w.at) {
w.atMu.Unlock()
f.watchMu.Unlock()
wlink.byfileMu.Unlock()
return fmt.Errorf("going back in history is forbidden")
}
// register w to f early, so that READs going in parallel to us
// preparing and processing initial pins, also send pins to w for read
// blocks. If we don't, we can miss sending a pin to w for a freshly read
// block which could have revision > w.at:
//
// 1 3 2 4
// ─────.────x───o────x───x──────]──────────
// ↑ ↑
// w.at head
//
// Here blocks #1, #2 and #4 were previously accessed, are thus tracked
// by δFtail and are changed after w.at - they will be returned by vvv
// δFtail query and pin-sent to w. Block #3 was not yet accessed but
// was also changed after w.at . head/file[#3] might be accessed
// simultaneously to watch setup, with f.readBlk checking
// f.watchTab; if w ∉ f.watchTab at that moment, w will miss receiving the
// pin for #3.
//
// NOTE for `unpin blk` to -> @head we can be sure there won't be
// simultaneous `pin blk` request, because:
//
// - unpin means blk was previously pinned,
// - blk was pinned means it is tracked by δFtail,
// - if blk is tracked and δFtail says there is no δblk ∈ (at, head],
// there is indeed no blk change in that region,
// - which means that δblk with rev > w.at might be only > head,
// - but such δblk are processed with zhead wlocked and we keep zhead
// rlocked during pin setup.
//
// δ δ
// ────x────.────────────]────x────
// ↑ ↑
// w.at head
//
// - also: there won't be simultaneous READs that would need to be
// unpinned, because we update w.at to requested at early.
w.at = at
f.watchTab[w] = struct{}{}
wlink.byfile[foid] = w
f.watchMu.Unlock()
wlink.byfileMu.Unlock()
// TODO defer -> unregister watch if error
// pin all tracked file blocks that were changed in (at, head] range.
toPin := map[int64]zodb.Tid{} // blk -> @rev
δFtail := bfdir.δFtail
vδf, err := δFtail.SliceByFileRevEx(f.zfile, at, headAt, zdata.QueryOptions{
// blk might be in δFtail because it is adjacent in
// ZBigFile.blktab to another blk that was explicitly tracked.
// We do not want to get those to avoid unnecessarily pinning
// potentially more blocks than needed.
//
// wcfs tests also verify that only blocks that were previously
// explicitly accessed are included into watch setup pins.
OnlyExplicitlyTracked: true,
})
if err != nil {
return err
}
for _, δfile := range vδf {
if δfile.Epoch {
// file epochs are currently forbidden (see watcher), so the only
// case when we could see an epoch here is creation of
// the file if w.at is before that time:
//
// create file
// ────.────────x────────]────
// ↑ ↑
// w.at head
//
// but then the file should not be normally accessed in that case.
//
// -> reject such watches with an error
return fmt.Errorf("file epoch detected @%s in between (at,head=@%s]", δfile.Rev, headAt)
}
for blk := range δfile.Blocks {
_, already := toPin[blk]
if already {
continue
}
toPin[blk], _, err = δFtail.BlkRevAt(ctx, f.zfile, blk, at)
if err != nil {
return err
}
}
}
// if a block was previously pinned, but ∉ δ(at, head] -> unpin it to head.
for blk, pinPrev := range w.pinned {
// only 1 setupWatch can be run simultaneously for one file
pinNew, pinning := toPin[blk]
if !pinning {
toPin[blk] = zodb.TidMax // @head
}
// TODO don't bother to spawn .pin goroutines if pin revision is the same ?
// if pinNew == pinPrev.rev && ready(pinPrev.ready) && pinPrev.err == nil {
// delete(toPin, blk)
// }
_ = pinPrev
_ = pinNew
}
// downgrade atMu.W -> atMu.R to let other clients access the file.
// NOTE there is no primitive to do Wlock->Rlock atomically, but we are
// ok with that since we prepared everything to handle simultaneous pins
// from other reads.
w.atMu.Unlock()
w.atMu.RLock()
defer w.atMu.RUnlock()
wg := xsync.NewWorkGroup(ctx)
for blk, rev := range toPin {
blk := blk
rev := rev
wg.Go(func(ctx context.Context) error {
return w._pin(ctx, blk, rev)
})
}
err = wg.Wait()
if err != nil {
return err
}
return nil
}
// Open serves /head/watch opens.
func (wnode *WatchNode) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fuse.Status) {
// TODO(?) check flags
head := wnode.head
wlink := &WatchLink{
sk: NewFileSock(),
id: atomic.AddInt32(&wnode.idNext, +1),
head: head,
byfile: make(map[zodb.Oid]*Watch),
rxTab: make(map[uint64]chan string),
}
head.wlinkMu.Lock()
// XXX del wlinkTab[w] on w.sk.File.Release
head.wlinkTab[wlink] = struct{}{}
head.wlinkMu.Unlock()
go wlink.serve()
return wlink.sk.File(), fuse.OK
}
// serve serves client initiated watch requests and routes client replies to
// wcfs initiated pin requests.
func (wlink *WatchLink) serve() {
err := wlink._serve()
if err != nil {
log.Error(err)
}
head := wlink.head
head.wlinkMu.Lock()
delete(head.wlinkTab, wlink)
head.wlinkMu.Unlock()
}
func (wlink *WatchLink) _serve() (err error) {
defer xerr.Contextf(&err, "wlink %d: serve rx", wlink.id)
ctx0 := context.TODO() // TODO ctx = merge(ctx of wcfs running, ctx of wlink timeout)
ctx, cancel := context.WithCancel(ctx0)
wg := xsync.NewWorkGroup(ctx)
r := bufio.NewReader(xio.BindCtxR(wlink.sk, ctx))
defer func() {
// cancel all handlers on both error and ok return.
// ( ok return is e.g. when we received "bye", so if client
// sends "bye" and some pin handlers are in progress - they
// anyway don't need to wait for client replies anymore )
cancel()
err2 := wg.Wait()
if err == nil {
err = err2
}
// unregister all watches created on this wlink
wlink.byfileMu.Lock()
for _, w := range wlink.byfile {
w.file.watchMu.Lock()
delete(w.file.watchTab, w)
w.file.watchMu.Unlock()
}
wlink.byfile = nil
wlink.byfileMu.Unlock()
// write to peer if it was logical error on client side
if err != nil {
_ = wlink.send(ctx0, 0, fmt.Sprintf("error: %s", err))
}
// close .sk.tx : this wakes up rx on client side.
err2 = wlink.sk.CloseWrite()
if err == nil {
err = err2
}
}()
// close .sk.rx on error/wcfs stopping or return: this wakes up read(sk).
retq := make(chan struct{})
defer close(retq)
wg.Go(func(ctx context.Context) error {
// monitor is always canceled - either at parent ctx cancel, or
// upon return from serve (see "cancel all handlers ..." ^^^).
// If it was return - report returned error to wg.Wait, not "canceled".
<-ctx.Done()
e := ctx.Err()
select {
default:
case <-retq:
e = err // returned error
}
e2 := wlink.sk.CloseRead()
if e == nil {
e = e2
}
return e
})
for {
l, err := r.ReadString('\n') // TODO limit accepted line len to prevent DOS
if err != nil {
// r.Read is woken up by sk.CloseRead when serve decides to exit
if err == io.ErrClosedPipe || err == io.EOF {
err = nil
}
return err
}
traceIso("S: wlink%d: rx: %q\n", wlink.id, l)
stream, msg, err := parseWatchFrame(l)
if err != nil {
return err
}
// reply from client to wcfs
reply := (stream % 2 == 0)
if reply {
wlink.rxMu.Lock()
rxq := wlink.rxTab[stream]
delete(wlink.rxTab, stream)
wlink.rxMu.Unlock()
if rxq == nil {
return fmt.Errorf("%d: reply on unexpected stream", stream)
}
rxq <- msg
continue
}
// client-initiated request
// bye
if msg == "bye" {
return nil // deferred sk.Close will wake-up rx on client side
}
// watch ...
wg.Go(func(ctx context.Context) error {
return wlink.handleWatch(ctx, stream, msg)
})
}
}
// handleWatch handles watch request from client.
//
// returned error comes without full error prefix.
func (wlink *WatchLink) handleWatch(ctx context.Context, stream uint64, msg string) (err error) {
defer xerr.Contextf(&err, "%d", stream)
err = wlink._handleWatch(ctx, msg)
reply := "ok"
if err != nil {
// logical error is reported back to client, but watch link remains live
reply = fmt.Sprintf("error %s", err)
err = nil
}
err = wlink.send(ctx, stream, reply)
return err
}
func (wlink *WatchLink) _handleWatch(ctx context.Context, msg string) error {
foid, at, err := parseWatch(msg)
if err != nil {
return err
}
err = wlink.setupWatch(ctx, foid, at)
return err
}
// sendReq sends wcfs-originated request to client and returns client response.
func (wlink *WatchLink) sendReq(ctx context.Context, req string) (reply string, err error) {
defer xerr.Context(&err, "sendReq") // wlink is already put into ctx by caller
var stream uint64
for stream == 0 {
stream = atomic.AddUint64(&wlink.reqNext, +2)
}
rxq := make(chan string, 1)
wlink.rxMu.Lock()
_, already := wlink.rxTab[stream]
if !already {
wlink.rxTab[stream] = rxq
}
wlink.rxMu.Unlock()
if already {
panic("BUG: to-be-sent stream is present in rxtab")
}
defer func() {
if err != nil {
// remove rxq from rxTab
// ( _serve could have already deleted it if unexpected
// reply came to the stream, but no other rxq should
// have registered on the [stream] slot )
wlink.rxMu.Lock()
delete(wlink.rxTab, stream)
wlink.rxMu.Unlock()
// no need to drain rxq - it was created with cap=1
}
}()
err = wlink.send(ctx, stream, req)
if err != nil {
return "", err
}
select {
case <-ctx.Done():
return "", ctx.Err()
case reply = <-rxq:
return reply, nil
}
}
// send sends a message to client over specified stream ID.
//
// Multiple sends can be in flight simultaneously; send serializes writes.
func (wlink *WatchLink) send(ctx context.Context, stream uint64, msg string) (err error) {
defer xerr.Contextf(&err, "send .%d", stream) // wlink is already put into ctx by caller
// assert '\n' not in msg
if strings.ContainsRune(msg, '\n') {
panicf("BUG: msg contains \\n ; msg: %q", msg)
}
wlink.txMu.Lock()
defer wlink.txMu.Unlock()
pkt := []byte(fmt.Sprintf("%d %s\n", stream, msg))
traceIso("S: wlink%d: tx: %q\n", wlink.id, pkt)
_, err = wlink.sk.Write(ctx, pkt)
if err != nil {
return err
}
return nil
}
// ---- Lookup ----
......@@ -1221,10 +2235,12 @@ func (head *Head) bigfopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err e
loading: make(map[int64]*blkLoadState),
}
// only head/ needs δFtail.
// only head/ needs δFtail and watches.
if head.rev == 0 {
// see "3) for */head/data the following invariant is maintained..."
head.bfdir.δFtail.Track(f.zfile, -1, sizePath, blkCov, nil)
f.watchTab = make(map[*Watch]struct{})
}
return f, nil
......@@ -1405,6 +2421,13 @@ func _main() (err error) {
fsNode: newFSNode(fSticky),
rev: 0,
zconn: zhead,
wlinkTab: make(map[*WatchLink]struct{}),
hwait: make(map[hwaiter]struct{}),
}
wnode := &WatchNode{
fsNode: newFSNode(fSticky),
head: head,
}
bfdir := &BigFileDir{
......@@ -1472,6 +2495,7 @@ func _main() (err error) {
mkdir(root, "head", head)
mkdir(head, "bigfile", bfdir)
mkfile(head, "at", NewSmallFile(head.readAt)) // TODO mtime(at) = tidtime(at)
mkfile(head, "watch", wnode)
// for debugging/testing
_wcfs := newFSNode(fSticky)
......
......@@ -32,20 +32,22 @@ from wendelin import wcfs
import transaction
from persistent import Persistent
from persistent.timestamp import TimeStamp
from ZODB.utils import z64, u64, p64
import sys, os, os.path
import sys, os, os.path, subprocess
from thread import get_ident as gettid
from time import gmtime
from errno import EINVAL, ENOTCONN
from resource import setrlimit, getrlimit, RLIMIT_MEMLOCK
from golang import go, chan, select, func, defer, b
from golang import context, time
from golang import go, chan, select, func, defer, error, b
from golang import context, errors, sync, time
from zodbtools.util import ashex as h, fromhex
import pytest; xfail = pytest.mark.xfail
from pytest import raises, fail
from wendelin.wcfs.internal import io, mm
from wendelin.wcfs.internal.wcfs_test import install_sigbus_trap
from wendelin.wcfs import _is_mountpoint as is_mountpoint
from wendelin.wcfs.internal.wcfs_test import _tWCFS, read_exfault_nogil, SegmentationFault, install_sigbus_trap, fadvise_dontneed
from wendelin.wcfs.client._wcfs import _tpywlinkwrite as _twlinkwrite
from wendelin.wcfs import _is_mountpoint as is_mountpoint, _procwait as procwait, _ready as ready
# setup:
......@@ -285,7 +287,7 @@ def start_and_crash_wcfs(zurl, mntpt): # -> WCFS
# ---- infrastructure for data access tests ----
#
# Testing infrastructure consists of tDB and tFile that
# Testing infrastructure consists of tDB, tFile, tWatch and tWatchLink that
# jointly organize wcfs behaviour testing. See individual classes for details.
# many tests need to be run with some reasonable timeout to detect lack of wcfs
......@@ -297,6 +299,11 @@ def timeout(parent=context.background()): # -> ctx
ctx, _ = with_timeout()
return ctx
# tdelay is used in places where we need to delay a bit in order to e.g.
# increase probability of a bug due to race condition.
def tdelay():
time.sleep(10*time.millisecond)
# DF represents a change in files space.
# it corresponds to ΔF in wcfs.go .
......@@ -321,18 +328,19 @@ class DFile:
# Database root and wcfs connection are represented by .root and .wc correspondingly.
# The database is initialized with one ZBigFile created and opened via ZODB connection as .zfile .
#
# The primary way to access wcfs is by opening BigFiles.
# The primary way to access wcfs is by opening BigFiles and WatchLinks.
# A BigFile opened under tDB is represented as tFile - see .open for details.
# A WatchLink opened under tDB is represented as tWatchLink - see .openwatch for details.
#
# The database can be mutated (via !wcfs codepath) with .change + .commit .
# Current database head is represented by .head .
# The history of the changes is kept in .dFtail .
# There are various helpers to query history (_blkDataAt, ...)
# There are various helpers to query history (_blkDataAt, _pinnedAt, .iter_revv, ...)
#
# tDB must be explicitly closed once no longer used.
#
# TODO(?) print -> t.trace/debug() + t.verbose depending on py.test -v -v ?
class tWCFS(object):
class tWCFS(_tWCFS):
@func
def __init__(t):
assert not os.path.exists(testmntpt)
......@@ -348,27 +356,12 @@ class tWCFS(object):
# cases, when wcfs, even after receiving `kill -9`, will be stuck in kernel.
# ( git.kernel.org/linus/a131de0a482a makes in-kernel FUSE client to
# still wait for request completion even after fatal signal )
t._closed = chan()
t._wcfuseaborted = chan()
t._wcfuseabort = os.fdopen(os.dup(wc._wcsrv._fuseabort.fileno()), 'w')
go(t._abort_ontimeout, 10*time.second) # NOTE must be: with_timeout << · << wcfs_pin_timeout
# _abort_ontimeout sends abort to fuse control file if timeout happens
# before tDB is closed.
def _abort_ontimeout(t, dt):
_, _rx = select(
time.after(dt).recv, # 0
t._closed.recv, # 1
)
if _ == 1:
return # tDB closed = testcase completed
nogilready = chan(dtype='C.structZ')
t._wcfuseabort = os.dup(wc._wcsrv._fuseabort.fileno())
go(t._abort_ontimeout, t._wcfuseabort, 10*time.second, nogilready) # NOTE must be: with_timeout << · << wcfs_pin_timeout
nogilready.recv() # wait till _abort_ontimeout enters nogil
# timeout -> force-umount wcfs
eprint("\nC: test timed out after %.1fs" % (dt / time.second))
eprint("-> aborting wcfs fuse connection to unblock ...\n")
t._wcfuseabort.write(b"1\n")
t._wcfuseabort.flush()
t._wcfuseaborted.close()
# _abort_ontimeout is in wcfs_test.pyx
# close closes connection to wcfs, unmounts the filesystem and makes sure
# that wcfs server exits.
......@@ -421,8 +414,13 @@ class tDB(tWCFS):
t._wc_zheadfh = open(t.wc.mountpoint + "/.wcfs/zhead")
t._wc_zheadv = []
# tracked opened tFiles
# whether head/ ZBigFile(s) blocks were ever accessed via wcfs.
# this is updated only explicitly via ._blkheadaccess() .
t._blkaccessedViaHead = {} # ZBigFile -> set(blk)
# tracked opened tFiles & tWatchLinks
t._files = set()
t._wlinks = set()
# ID of the thread which created tDB
# ( transaction plays dirty games with threading.local and we have to
......@@ -438,7 +436,7 @@ class tDB(tWCFS):
def head(t):
return t.dFtail[-1].rev
# close closes test database as well as all tracked files and wcfs.
# close closes test database as well as all tracked files, watch links and wcfs.
# it also prints change history to help developer overview current testcase.
@func
def close(t):
......@@ -448,7 +446,10 @@ class tDB(tWCFS):
defer(t.dump_history)
for tf in t._files.copy():
tf.close()
for tw in t._wlinks.copy():
tw.close()
assert len(t._files) == 0
assert len(t._wlinks) == 0
t._wc_zheadfh.close()
# open opens wcfs file corresponding to zf@at and starts to track it.
......@@ -456,6 +457,11 @@ class tDB(tWCFS):
def open(t, zf, at=None): # -> tFile
return tFile(t, zf, at=at)
# openwatch opens /head/watch on wcfs.
# see returned tWatchLink for details.
def openwatch(t): # -> tWatchLink
return tWatchLink(t)
# change schedules zf to be changed according to changeDelta at commit.
#
# changeDelta: {} blk -> data.
......@@ -541,6 +547,14 @@ class tDB(tWCFS):
# head/at = last txn of whole db
assert t.wc._read("head/at") == h(t.head)
# _blkheadaccess marks head/zf[blk] accessed.
def _blkheadaccess(t, zf, blk):
t._blkaccessed(zf).add(blk)
# _blkaccessed returns set describing which head/zf blocks were ever accessed.
def _blkaccessed(t, zf): # set(blk)
return t._blkaccessedViaHead.setdefault(zf, set())
# tFile provides testing environment for one bigfile opened on wcfs.
#
......@@ -670,8 +684,17 @@ class tFile:
#
# Expected data may be given with size < t.blksize. In such case the data
# is implicitly appended with trailing zeros. Data can be both bytes and unicode.
#
# It also checks that file watches are properly notified on data access -
# - see "7.2) for all registered client@at watchers ..."
#
# pinokByWLink: {} tWatchLink -> {} blk -> at.
# pinokByWLink can be omitted - in that case it is computed only automatically.
#
# The automatic computation of pinokByWLink is verified against explicitly
# provided pinokByWLink when it is present.
@func
def assertBlk(t, blk, dataok):
def assertBlk(t, blk, dataok, pinokByWLink=None):
# TODO -> assertCtx('blk #%d' % blk)
def _():
assertCtx = 'blk #%d' % blk
......@@ -684,10 +707,10 @@ class tFile:
dataok = b(dataok)
blkdata, _ = t.tdb._blkDataAt(t.zf, blk, t.at)
assert blkdata == dataok, "computed vs explicit data"
t._assertBlk(blk, dataok)
t._assertBlk(blk, dataok, pinokByWLink)
@func
def _assertBlk(t, blk, dataok):
def _assertBlk(t, blk, dataok, pinokByWLink=None, pinfunc=None):
assert len(dataok) <= t.blksize
dataok += b'\0'*(t.blksize - len(dataok)) # tailing zeros
assert blk < t._sizeinblk()
......@@ -703,10 +726,93 @@ class tFile:
cached = t.cached()[blk]
assert cached in (0, 1) # every check accesses a block in full
shouldPin = False # whether at least one wlink should receive a pin
# watches that must be notified if access goes to @head/file
wpin = {} # tWatchLink -> pinok
blkrev = t.tdb._blkRevAt(t.zf, blk, t.at)
if t.at is None: # @head/...
for wlink in t.tdb._wlinks:
pinok = {}
w = wlink._watching.get(t.zf._p_oid)
if w is not None and w.at < blkrev:
if cached == 1:
# @head[blk].rev is after w.at - w[blk] must be already pinned
assert blk in w.pinned
assert w.pinned[blk] <= w.at
else:
assert cached == 0
# even if @head[blk] is uncached, the block could be
# already pinned by setup watch
if blk not in w.pinned:
pinok = {blk: t.tdb._blkRevAt(t.zf, blk, w.at)}
shouldPin = True
wpin[wlink] = pinok
if pinokByWLink is not None:
assert wpin == pinokByWLink, "computed vs explicit pinokByWLink"
pinokByWLink = wpin
# doCheckingPin expects every wlink entry to also contain zf
for wlink, pinok in pinokByWLink.items():
pinokByWLink[wlink] = (t.zf, pinok)
# access 1 byte on the block and verify that wcfs sends us correct pins
blkview = t._blk(blk)
assert t.cached()[blk] == cached
def _(ctx, ev):
assert t.cached()[blk] == cached
ev.append('read pre')
# access data with released GIL so that the thread that reads data from
# head/watch can receive pin message. Be careful to handle cancellation,
# so that on error in another worker we don't get stuck and the
# error can be propagated to wait and reported.
#
# we handle cancellation by spawning read in another thread and
# waiting for either ctx cancel, or read thread to complete. This
# way on ctx cancel (e.g. assertion failure in another worker), the
# read thread can remain running even after _assertBlk returns, and
# in particular till the point where the whole test is marked as
# failed and shut down. But on test shutdown .fmmap is unmapped for
# all opened tFiles, and so read will hit SIGSEGV. Prepare to catch
# that SIGSEGV here.
have_read = chan(1)
def _():
try:
b = read_exfault_nogil(blkview[0:1])
except SegmentationFault:
b = 'FAULT'
t._blkaccess(blk)
have_read.send(b)
go(_)
_, _rx = select(
ctx.done().recv, # 0
have_read.recv, # 1
)
if _ == 0:
raise ctx.err()
b = _rx
ev.append('read ' + b)
ev = doCheckingPin(_, pinokByWLink, pinfunc)
# XXX hack - wlinks are notified and emit events simultaneously - we
# check only that events begin and end with read pre/post and that pins
# are inside (i.e. read is stuck until pins are acknowledged).
# Better do explicit check in tracetest style.
assert ev[0] == 'read pre', ev
assert ev[-1] == 'read ' + dataok[0], ev
ev = ev[1:-1]
if not shouldPin:
assert ev == []
else:
assert 'pin rx' in ev
assert 'pin ack pre' in ev
assert t.cached()[blk] > 0
# verify full data of the block
# TODO(?) assert individually for every block's page? (easier debugging?)
assert blkview.tobytes() == dataok
......@@ -740,6 +846,268 @@ class tFile:
t.assertCache(cachev)
# tWatch represents watch for one file setup on a tWatchLink.
class tWatch:
def __init__(w, foid):
w.foid = foid
w.at = z64 # not None - always concrete
w.pinned = {} # blk -> rev
# tWatchLink provides testing environment for /head/watch link opened on wcfs.
#
# .watch() setups/adjusts a watch for a file and verifies that wcfs correctly sends initial pins.
class tWatchLink(wcfs.WatchLink):
def __init__(t, tdb):
super(tWatchLink, t).__init__(tdb.wc)
t.tdb = tdb
tdb._wlinks.add(t)
# this tWatchLink currently watches the following files at particular state.
t._watching = {} # {} foid -> tWatch
def close(t):
t.tdb._wlinks.remove(t)
super(tWatchLink, t).close()
# disable all established watches
for w in t._watching.values():
w.at = z64
w.pinned = {}
t._watching = {}
# ---- infrastructure: watch setup/adjust ----
# watch sets up or adjusts a watch for file@at.
#
# During setup it verifies that wcfs sends correct initial/update pins.
#
# pinok: {} blk -> rev
# pinok can be omitted - in that case it is computed automatically.
#
# The automatic computation of pinok is verified against explicitly provided
# pinok when it is present.
@func(tWatchLink)
def watch(twlink, zf, at, pinok=None): # -> tWatch
foid = zf._p_oid
t = twlink.tdb
w = twlink._watching.get(foid)
if w is None:
w = twlink._watching[foid] = tWatch(foid)
at_prev = None
else:
at_prev = w.at # we were previously watching zf @at_prev
at_from = ''
if at_prev is not None:
at_from = '(%s ->) ' % at_prev
print('\nC: setup watch f<%s> %s%s' % (h(foid), at_from, at))
accessed = t._blkaccessed(zf)
lastRevOf = lambda blk: t._blkRevAt(zf, blk, t.head)
pin_prev = {}
if at_prev is not None:
assert at_prev <= at, 'TODO %s -> %s' % (at_prev, at)
pin_prev = t._pinnedAt(zf, at_prev)
assert w.pinned == pin_prev
pin = t._pinnedAt(zf, at)
if at_prev != at and at_prev is not None:
print('# pin@old: %s\n# pin@new: %s' % (t.hpin(pin_prev), t.hpin(pin)))
for blk in set(pin_prev.keys()).union(pin.keys()):
# blk ∉ pin_prev, blk ∉ pin -> cannot happen
assert (blk in pin_prev) or (blk in pin)
# blk ∉ pin_prev, blk ∈ pin -> cannot happen, except on first start
if blk not in pin_prev and blk in pin:
if at_prev is not None:
fail('#%d pinned %s; not pinned %s' % (blk, at_prev, at))
# blk ∈ pin -> blk is tracked; has rev > at
# (see criteria in _pinnedAt)
assert blk in accessed
assert at < lastRevOf(blk)
# blk ∈ pin_prev, blk ∉ pin -> unpin to head
elif blk in pin_prev and blk not in pin:
# blk ∈ pin_prev -> blk is tracked; has rev > at_prev
assert blk in accessed
assert at_prev < lastRevOf(blk)
# blk ∉ pin -> last blk revision is ≤ at
assert lastRevOf(blk) <= at
pin[blk] = None # @head
# blk ∈ pin_prev, blk ∈ pin -> if rev different: use pin
elif blk in pin_prev and blk in pin:
# blk ∈ pin_prev, pin -> blk is tracked; has rev > at_prev, at
assert blk in accessed
assert at_prev < lastRevOf(blk)
assert at < lastRevOf(blk)
assert pin_prev[blk] <= pin[blk]
if pin_prev[blk] == pin[blk]:
del pin[blk] # would need to pin to what it is already pinned
#print('-> %s' % t.hpin(pin))
# {} blk -> at that have to be pinned.
if pinok is not None:
assert pin == pinok, "computed vs explicit pinok"
pinok = pin
print('# pinok: %s' % t.hpin(pinok))
# send watch request and check that we receive pins for tracked (previously
# accessed at least once) blocks changed > at.
twlink._watch(zf, at, pinok, "ok")
w.at = at
# `watch ... -> at_i -> at_j` must be the same as `watch ø -> at_j`
assert w.pinned == t._pinnedAt(zf, at)
return w
# stop_watch instructs wlink to stop watching the file.
@func(tWatchLink)
def stop_watch(twlink, zf):
foid = zf._p_oid
assert foid in twlink._watching
w = twlink._watching.pop(foid)
twlink._watch(zf, b"-", {}, "ok")
w.at = z64
w.pinned = {}
# _watch sends watch request for zf@at, expects initial pins specified by pinok and final reply.
#
# at also can be b"-" which means "stop watching"
#
# pinok: {} blk -> at that have to be pinned.
# if replyok ends with '…' only reply prefix until the dots is checked.
@func(tWatchLink)
def _watch(twlink, zf, at, pinok, replyok):
if at == b"-":
xat = at
else:
xat = b"@%s" % h(at)
def _(ctx, ev):
reply = twlink.sendReq(ctx, b"watch %s %s" % (h(zf._p_oid), xat))
if replyok.endswith('…'):
rok = replyok[:-len('…')]
assert reply[:len(rok)] == rok
else:
assert reply == replyok
doCheckingPin(_, {twlink: (zf, pinok)})
# doCheckingPin calls f and verifies that wcfs sends expected pins during the
# time f executes.
#
# f(ctx, eventv)
# pinokByWLink: {} tWatchLink -> (zf, {} blk -> at).
# pinfunc(wlink, foid, blk, at) | None.
#
# pinfunc is called after pin request is received from wcfs, but before pin ack
# is replied back. Pinfunc must not block.
def doCheckingPin(f, pinokByWLink, pinfunc=None): # -> []event(str)
# call f and check that we receive pins as specified.
# Use timeout to detect wcfs replying fewer pins than expected.
#
# XXX detect unsent pins by ack'ing previous pins as they come in (not
# waiting for all of them) and then seeing that we did not receive the
# expected pin when f completes?
ctx, cancel = with_timeout()
wg = sync.WorkGroup(ctx)
ev = []
for wlink, (zf, pinok) in pinokByWLink.items():
def _(ctx, wlink, zf, pinok):
w = wlink._watching.get(zf._p_oid)
if len(pinok) > 0:
assert w is not None
pinv = wlink._expectPin(ctx, zf, pinok)
if len(pinv) > 0:
ev.append('pin rx')
# increase probability to receive erroneous extra pins
tdelay()
if len(pinv) > 0:
if pinfunc is not None:
for p in pinv:
pinfunc(wlink, p.foid, p.blk, p.at)
ev.append('pin ack pre')
for p in pinv:
assert w.foid == p.foid
if p.at is None: # unpin to @head
assert p.blk in w.pinned # must have been pinned before
del w.pinned[p.blk]
else:
w.pinned[p.blk] = p.at
#p.reply(b"ack")
wlink.replyReq(ctx, p, b"ack")
# check that we don't get extra pins before f completes
try:
req = wlink.recvReq(ctx)
except Exception as e:
if errors.Is(e, context.canceled):
return # cancel is expected after f completes
raise
fail("extra pin message received: %r" % req.msg)
wg.go(_, wlink, zf, pinok)
def _(ctx):
f(ctx, ev)
# cancel _expectPin waiting upon completing f
# -> error that missed pins were not received.
cancel()
wg.go(_)
wg.wait()
return ev
# _expectPin asserts that wcfs sends expected pin messages.
#
# expect is {} blk -> at
# returns [] of received pin requests.
@func(tWatchLink)
def _expectPin(twlink, ctx, zf, expect): # -> []SrvReq
expected = set() # of expected pin messages
for blk, at in expect.items():
hat = h(at) if at is not None else 'head'
msg = b"pin %s #%d @%s" % (h(zf._p_oid), blk, hat)
assert msg not in expected
expected.add(msg)
reqv = [] # of received requests
while len(expected) > 0:
try:
req = twlink.recvReq(ctx)
except Exception as e:
raise RuntimeError("%s\nnot all pin messages received - pending:\n%s" % (e, expected))
assert req is not None # channel not closed
assert req.msg in expected
expected.remove(req.msg)
reqv.append(req)
return reqv
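# For orientation, a minimal sketch (hypothetical helper, not used by the tests)
# of how the messages exchanged over head/watch are composed; h() hex-encodes
# oid/tid as elsewhere in this file, and pinat=None means "pin back to @head":
def _example_isolation_frames(foid, at, blk, pinat):
    watch_req = b"watch %s @%s" % (h(foid), h(at))        # client -> server
    hat = h(pinat) if pinat is not None else 'head'
    pin_req   = b"pin %s #%d @%s" % (h(foid), blk, hat)   # server -> client
    pin_ack   = b"ack"                                    # client -> server
    watch_ok  = b"ok"                                     # server -> client (final reply)
    return watch_req, pin_req, pin_ack, watch_ok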
# ---- infrastructure: helpers to query dFtail/accessed history ----
# _blkDataAt returns expected zf[blk] data and its revision as of @at database state.
@@ -768,11 +1136,58 @@ def _blkDataAt(t, zf, blk, at): # -> (data, rev)
assert rev <= at
return data, rev
# _blkRevAt returns expected zf[blk] revision as of @at database state.
@func(tDB)
def _blkRevAt(t, zf, blk, at): # -> rev
_, rev = t._blkDataAt(zf, blk, at)
return rev
# _pinnedAt returns which blocks need to be pinned for zf@at compared to zf@head
# according to wcfs isolation protocol.
#
# Criteria for when blk must be pinned as of @at view:
#
# blk ∈ pinned(at) <=> 1) ∃ r = rev(blk): at < r ; blk was changed after at
# 2) blk ∈ tracked ; blk was accessed at least once
# ; (and so is tracked by wcfs)
@func(tDB)
def _pinnedAt(t, zf, at): # -> pin = {} blk -> rev
# all changes to zf
vdf = [_.byfile[zf] for _ in t.dFtail if zf in _.byfile]
# {} blk -> at for changes ∈ (at, head]
pin = {}
for df in [_ for _ in vdf if _.rev > at]:
for blk in df.ddata:
if blk in pin:
continue
if blk in t._blkaccessed(zf):
pin[blk] = t._blkRevAt(zf, blk, at)
return pin
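# For example (hypothetical history): if blk 2 was accessed at least once and
# changed at r1 and then at r2 (r1 < r2), then for at with r1 <= at < r2
# _pinnedAt gives {2: r1} - blk 2 changed after at, so it has to be pinned to
# its revision as of at - while for at >= r2 it gives {} since nothing changed
# after at.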
# iter_revv iterates through all possible at_i -> at_j -> at_k ... sequences.
# at_i < at_j. NOTE: all sequences go till head.
@func(tDB)
def iter_revv(t, start=z64, level=0):
dFtail = [_ for _ in t.dFtail if _.rev > start]
#print(' '*level, 'iter_revv', start, [_.rev for _ in dFtail])
if len(dFtail) == 0:
yield []
return
for dF in dFtail:
#print(' '*level, 'QQQ', dF.rev)
for tail in t.iter_revv(start=dF.rev, level=level+1):
#print(' '*level, 'zzz', tail)
yield ([dF.rev] + tail)
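# For example with revisions r1 < r2 < r3 in t.dFtail, iter_revv() yields
# [r1, r2, r3], [r1, r3], [r2, r3] and [r3] - every increasing sequence of
# revisions that ends at head.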
# -------------------------------------
# ---- actual tests to access data ----
# exercise wcfs functionality
# exercise wcfs functionality without wcfs isolation protocol.
# plain data access + wcfs handling of ZODB invalidations.
@func
def test_wcfs_basic():
@@ -899,6 +1314,502 @@ def test_wcfs_basic_read_aftertail():
assert _(100*blksize) == b''
# ---- verify wcfs functionality that depends on isolation protocol ----
# verify that watch setup is robust to client errors/misbehaviour.
@func
def test_wcfs_watch_robust():
t = tDB(); zf = t.zfile
defer(t.close)
# sysread(/head/watch) can be interrupted
p = subprocess.Popen(["%s/testprog/wcfs_readcancel.py" %
os.path.dirname(__file__), t.wc.mountpoint])
procwait(timeout(), p)
at1 = t.commit(zf, {2:'c1'})
at2 = t.commit(zf, {2:'c2'})
# file not yet opened on wcfs
wl = t.openwatch()
assert wl.sendReq(timeout(), b"watch %s @%s" % (h(zf._p_oid), h(at1))) == \
"error setup watch f<%s> @%s: " % (h(zf._p_oid), h(at1)) + \
"file not yet known to wcfs or is not a ZBigFile"
wl.close()
# closeTX/bye cancels blocked pin handlers
f = t.open(zf)
f.assertBlk(2, 'c2')
f.assertCache([0,0,1])
wl = t.openwatch()
wg = sync.WorkGroup(timeout())
def _(ctx):
# TODO clarify what wcfs should do if pin handler closes wlink TX:
# - reply error + close, or
# - just close
# t = when reviewing WatchLink.serve in wcfs.go
#assert wl.sendReq(ctx, b"watch %s @%s" % (h(zf._p_oid), h(at1))) == \
# "error setup watch f<%s> @%s: " % (h(zf._p_oid), h(at1)) + \
# "pin #%d @%s: context canceled" % (2, h(at1))
#with raises(error, match="unexpected EOF"):
with raises(error, match="recvReply: link is down"):
wl.sendReq(ctx, b"watch %s @%s" % (h(zf._p_oid), h(at1)))
wg.go(_)
def _(ctx):
req = wl.recvReq(ctx)
assert req is not None
assert req.msg == b"pin %s #%d @%s" % (h(zf._p_oid), 2, h(at1))
# don't reply to req - close instead
wl.closeWrite()
wg.go(_)
wg.wait()
wl.close()
# NOTE if wcfs.go does not fully cleanup this canceled watch and leaves it
# in half-working state, it will break on further commit, as pin to the
# watch won't be handled.
at3 = t.commit(zf, {2:'c3'})
# invalid requests -> wcfs replies error
wl = t.openwatch()
assert wl.sendReq(timeout(), b'bla bla') == \
b'error bad watch: not a watch request: "bla bla"'
# invalid request not following frame structure -> fatal + wcfs must close watch link
assert wl.fatalv == []
_twlinkwrite(wl, b'zzz hello\n')
_, _rx = select(
timeout().done().recv,
wl.rx_eof.recv,
)
if _ == 0:
raise RuntimeError("%s: did not rx EOF after bad frame " % wl)
assert wl.fatalv == [b'error: invalid frame: "zzz hello\\n" (invalid stream)']
wl.close()
# watch with @at < δtail.tail -> rejected
wl = t.openwatch()
atpast = p64(u64(t.tail)-1)
wl._watch(zf, atpast, {}, "error setup watch f<%s> @%s: too far away back from"
" head/at (@%s); …" % (h(zf._p_oid), h(atpast), h(t.head)))
wl.close()
# verify that `watch file @at` -> error, for @at when the file did not exist yet.
@func
def test_wcfs_watch_before_create():
t = tDB(); zf = t.zfile
defer(t.close)
at1 = t.commit(zf, {2:'c1'})
zf2 = t.root['zfile2'] = ZBigFile(blksize) # zf2 created @at2
at2 = t.commit()
at3 = t.commit(zf2, {1:'β3'})
# force wcfs to access/know zf2
f2 = t.open(zf2)
f2.assertData(['','β3'])
wl = t.openwatch()
assert wl.sendReq(timeout(), b"watch %s @%s" % (h(zf2._p_oid), h(at1))) == \
"error setup watch f<%s> @%s: " % (h(zf2._p_oid), h(at1)) + \
"file epoch detected @%s in between (at,head=@%s]" % (h(at2), h(t.head))
wl.close()
# verify that watch @at_i -> @at_j ↓ is rejected
# TODO(?) we might want to allow going back in history later.
@func
def test_wcfs_watch_going_back():
t = tDB(); zf = t.zfile
defer(t.close)
at1 = t.commit(zf, {2:'c1'})
at2 = t.commit(zf, {2:'c2'})
f = t.open(zf)
f.assertData(['','','c2'])
wl = t.openwatch()
wl.watch(zf, at2, {})
assert wl.sendReq(timeout(), b"watch %s @%s" % (h(zf._p_oid), h(at1))) == \
"error setup watch f<%s> @%s: " % (h(zf._p_oid), h(at1)) + \
"going back in history is forbidden"
wl.close()
# verify that wcfs kills a slow/faulty client that does not reply to pin in time.
@xfail # protection against faulty/slow clients
@func
def test_wcfs_pintimeout_kill():
# adjusted wcfs timeout to kill client who is stuck not providing pin reply
tkill = 3*time.second
t = tDB(); zf = t.zfile # XXX wcfs args += tkill=<small>
defer(t.close)
at1 = t.commit(zf, {2:'c1'})
at2 = t.commit(zf, {2:'c2'})
f = t.open(zf)
f.assertData(['','','c2'])
# XXX move into subprocess so as not to kill the whole test run
ctx, _ = context.with_timeout(context.background(), 2*tkill)
wl = t.openwatch()
wg = sync.WorkGroup(ctx)
def _(ctx):
# send watch. The pin handler won't be replying -> we should never get reply here.
wl.sendReq(ctx, b"watch %s @%s" % (h(zf._p_oid), h(at1)))
fail("watch request completed (should not as pin handler is stuck)")
wg.go(_)
def _(ctx):
req = wl.recvReq(ctx)
assert req is not None
assert req.msg == b"pin %s #%d @%s" % (h(zf._p_oid), 2, h(at1))
# sleep > wcfs pin timeout - wcfs must kill us
_, _rx = select(
ctx.done().recv, # 0
time.after(tkill).recv, # 1
)
if _ == 0:
raise ctx.err()
fail("wcfs did not killed stuck client")
wg.go(_)
wg.wait()
# watch with @at > head - must wait for head to become >= at.
# TODO(?) too far ahead - reject?
@func
def test_wcfs_watch_setup_ahead():
t = tDB(); zf = t.zfile
defer(t.close)
f = t.open(zf)
at1 = t.commit(zf, {2:'c1'})
f.assertData(['','x','c1']) # NOTE #1 not accessed for watch @at1 to receive no pins
wg = sync.WorkGroup(timeout())
dt = 100*time.millisecond
committing = chan() # becomes ready when T2 starts to commit
# T1: watch @(at1+1·dt)
@func
def _(ctx):
wl = t.openwatch()
defer(wl.close)
wat = tidfromtime(tidtime(at1) + 1*dt) # > at1, but < at2
rx = wl.sendReq(ctx, b"watch %s @%s" % (h(zf._p_oid), h(wat)))
assert ready(committing)
assert rx == b"ok"
wg.go(_)
# T2: sleep(10·dt); commit
@func
def _(ctx):
# reopen connection to database as we are committing from another thread
conn = t.root._p_jar.db().open()
defer(conn.close)
root = conn.root()
zf = root['zfile']
time.sleep(10*dt)
committing.close()
at2 = t.commit(zf, {1:'b2'})
assert tidtime(at2) - tidtime(at1) >= 10*dt
wg.go(_)
wg.wait()
# verify that watch setup/update sends correct pins.
@func
def test_wcfs_watch_setup():
t = tDB(); zf = t.zfile; at0=t.at0
defer(t.close)
f = t.open(zf)
at1 = t.commit(zf, {2:'c1'})
at2 = t.commit(zf, {2:'c2', 3:'d2', 4:'e2', 5:'f2'})
at3 = t.commit(zf, {0:'a3', 2:'c3', 5:'f3'})
f.assertData(['a3','','c3','d2','x','f3']) # access everything except e as of @at3
f.assertCache([1,1,1,1,0,1])
# change again, but don't access e and f
at4 = t.commit(zf, {2:'c4', 4:'e4', 5:'f4'})
at5 = t.commit(zf, {3:'d5', 5:'f5'})
f.assertData(['a3','','c4','d5','x','x'])
f.assertCache([1,1,1,1,0,0])
# some watch setup/update requests with explicit pinok (also partly
# verifies how tWatchLink.watch computes automatic pinok)
# new watch setup ø -> at
def assertNewWatch(at, pinok):
wl = t.openwatch()
wl.watch(zf, at, pinok)
wl.close()
assertNewWatch(at1, {0:at0, 2:at1, 3:at0, 5:at0})
assertNewWatch(at2, {0:at0, 2:at2, 3:at2, 5:at2})
assertNewWatch(at3, { 2:at3, 3:at2, 5:at3}) # f(5) is pinned, even though it was not
assertNewWatch(at4, { 3:at2, 5:at4}) # accessed after at3
assertNewWatch(at5, { })
# new watch + update at_i -> at_j
wl = t.openwatch()
# XXX check @at0 ?
wl.watch(zf, at1, {0:at0, 2:at1, 3:at0, 5:at0}) # -> at1 (new watch) XXX at0 -> ø?
wl.watch(zf, at2, { 2:at2, 3:at2, 5:at2}) # at1 -> at2
wl.watch(zf, at3, {0:None, 2:at3, 5:at3}) # at2 -> at3
wl.watch(zf, at4, { 2:None, 5:at4}) # at3 -> at4; f(5) pinned even though it was not accessed >= at4
wl.watch(zf, at5, { 3:None, 5:None}) # at4 -> at5 (current head)
wl.close()
# all valid watch setup/update requests going at_i -> at_j -> ... with automatic pinok
for zf in t.zfiles():
for revv in t.iter_revv():
print('\n--------')
print(' -> '.join(['%s' % _ for _ in revv]))
wl = t.openwatch()
wl.watch(zf, revv[0])
wl.watch(zf, revv[0]) # verify at_i -> at_i
for at in revv[1:]:
wl.watch(zf, at)
wl.close()
# verify that already setup watch(es) receive correct pins on block access.
@func
def test_wcfs_watch_vs_access():
t = tDB(); zf = t.zfile; at0=t.at0
defer(t.close)
f = t.open(zf)
at1 = t.commit(zf, {2:'c1'})
at2 = t.commit(zf, {2:'c2', 3:'d2', 5:'f2'})
at3 = t.commit(zf, {0:'a3', 2:'c3', 5:'f3'})
f.assertData(['a3','','c3','d2','x','x'])
f.assertCache([1,1,1,1,0,0])
# watched + commit -> read -> receive pin messages.
# read vs pin ordering is checked by assertBlk.
#
# f(5) is kept unaccessed to check later how wcfs.go handles δFtail
# rebuild after it sees a not-yet-accessed ZBlk that has change history.
wl3 = t.openwatch(); w3 = wl3.watch(zf, at3); assert at3 == t.head
assert w3.at == at3
assert w3.pinned == {}
wl3_ = t.openwatch(); w3_ = wl3_.watch(zf, at3)
assert w3_.at == at3
assert w3_.pinned == {}
wl2 = t.openwatch(); w2 = wl2.watch(zf, at2)
assert w2.at == at2
assert w2.pinned == {0:at0, 2:at2}
# w_assertPin asserts on state of .pinned for {w3,w3_,w2}
def w_assertPin(pinw3, pinw3_, pinw2):
assert w3.pinned == pinw3
assert w3_.pinned == pinw3_
assert w2.pinned == pinw2
f.assertCache([1,1,1,1,0,0])
at4 = t.commit(zf, {1:'b4', 2:'c4', 5:'f4', 6:'g4'})
f.assertCache([1,0,0,1,0,0,0])
f.assertBlk(0, 'a3', {wl3: {}, wl3_: {}, wl2: {}})
w_assertPin( {}, {}, {0:at0, 2:at2})
f.assertBlk(1, 'b4', {wl3: {1:at0}, wl3_: {1:at0}, wl2: {1:at0}})
w_assertPin( {1:at0}, {1:at0}, {0:at0, 1:at0, 2:at2})
f.assertBlk(2, 'c4', {wl3: {2:at3}, wl3_: {2:at3}, wl2: {}})
w_assertPin( {1:at0, 2:at3}, {1:at0, 2:at3}, {0:at0, 1:at0, 2:at2})
f.assertBlk(3, 'd2', {wl3: {}, wl3_: {}, wl2: {}})
w_assertPin( {1:at0, 2:at3}, {1:at0, 2:at3}, {0:at0, 1:at0, 2:at2})
# blk4 is hole @head - the same as at earlier db view - not pinned
f.assertBlk(4, '', {wl3: {}, wl3_: {}, wl2: {}})
w_assertPin( {1:at0, 2:at3}, {1:at0, 2:at3}, {0:at0, 1:at0, 2:at2})
# f(5) is kept unaccessed (see ^^^)
assert f.cached()[5] == 0
f.assertBlk(6, 'g4', {wl3: {6:at0}, wl3_: {6:at0}, wl2: {6:at0}}) # XXX at0->ø?
w_assertPin( {1:at0, 2:at3, 6:at0}, {1:at0, 2:at3, 6:at0}, {0:at0, 1:at0, 2:at2, 6:at0})
# commit again:
# - c(2) is already pinned -> wl3 not notified
# - watch stopped (wl3_) -> watch no longer notified
# - wlink closed (wl2) -> watch no longer notified
# - f(5) is still kept unaccessed (see ^^^)
f.assertCache([1,1,1,1,1,0,1])
at5 = t.commit(zf, {2:'c5', 3:'d5', 5:'f5'})
f.assertCache([1,1,0,0,1,0,1])
wl3_.stop_watch(zf) # w3_ should not be notified
wl2.close() # wl2:* should not be notified
def w_assertPin(pinw3):
assert w3.pinned == pinw3
assert w3_.pinned == {}; assert w3_.at == z64 # wl3_ unsubscribed from zf
assert w2.pinned == {}; assert w2.at == z64 # wl2 closed
f.assertBlk(0, 'a3', {wl3: {}, wl3_: {}}) # no change
w_assertPin( {1:at0, 2:at3, 6:at0})
f.assertBlk(1, 'b4', {wl3: {}, wl3_: {}})
w_assertPin( {1:at0, 2:at3, 6:at0})
f.assertBlk(2, 'c5', {wl3: {}, wl3_: {}}) # c(2) already pinned on wl3
w_assertPin( {1:at0, 2:at3, 6:at0})
f.assertBlk(3, 'd5', {wl3: {3:at2}, wl3_: {}}) # d(3) was not pinned on wl3; wl3_ not notified
w_assertPin( {1:at0, 2:at3, 3:at2, 6:at0})
f.assertBlk(4, '', {wl3: {}, wl3_: {}})
w_assertPin( {1:at0, 2:at3, 3:at2, 6:at0})
# f(5) is kept still unaccessed (see ^^^)
assert f.cached()[5] == 0
f.assertBlk(6, 'g4', {wl3: {}, wl3_: {}})
w_assertPin( {1:at0, 2:at3, 3:at2, 6:at0})
# advance watch - receives new pins/unpins to @head.
# this is also tested ^^^ in `at_i -> at_j -> ...` watch setup/adjust.
# NOTE f(5) is not affected because it was not pinned previously.
wl3.watch(zf, at4, {1:None, 2:at4, 6:None}) # at3 -> at4
w_assertPin( {2:at4, 3:at2})
# access f(5) -> wl3 should be correctly pinned
assert f.cached() == [1,1,1,1,1,0,1] # f(5) was not yet accessed
f.assertBlk(5, 'f5', {wl3: {5:at4}, wl3_: {}})
w_assertPin( {2:at4, 3:at2, 5:at4})
# advance watch again
wl3.watch(zf, at5, {2:None, 3:None, 5:None}) # at4 -> at5
w_assertPin( {})
wl3.close()
# verify that on pin message, while under pagefault, we can mmap @at/f[blk]
# into where head/f[blk] was mmaped; the result of original pagefaulting read
# must be from newly inserted mapping.
#
# TODO same with two mappings to the same file, but only one changing blk mmap
# -> one read gets changed data, one read gets data from @head.
@func
def test_wcfs_remmap_on_pin():
t = tDB(); zf = t.zfile
defer(t.close)
at1 = t.commit(zf, {2:'hello'})
at2 = t.commit(zf, {2:'world'})
f = t.open(zf)
f1 = t.open(zf, at=at1)
wl = t.openwatch()
wl.watch(zf, at1, {})
f.assertCache([0,0,0])
def _(wlink, foid, blk, at):
assert wlink is wl
assert foid == zf._p_oid
assert blk == 2
assert at == at1
mm.map_into_ro(f._blk(blk), f1.f.fileno(), blk*f.blksize)
f._assertBlk(2, 'hello', {wl: {2:at1}}, pinfunc=_) # NOTE not world
# verify that pin message is not sent for the same blk@at twice.
@func
def test_wcfs_no_pin_twice():
t = tDB(); zf = t.zfile
defer(t.close)
f = t.open(zf)
at1 = t.commit(zf, {2:'c1'})
at2 = t.commit(zf, {2:'c2'})
wl = t.openwatch()
w = wl.watch(zf, at1, {})
f.assertCache([0,0,0])
f.assertBlk(2, 'c2', {wl: {2:at1}})
f.assertCache([0,0,1])
assert w.pinned == {2:at1}
# drop file[blk] from cache, access again -> no pin message sent the second time
#
# ( we need both madvise(DONTNEED) and fadvise(DONTNEED) - given only one of
# those the kernel won't release the page from pagecache; madvise does
# not work without munlock. )
mm.unlock(f._blk(2))
mm.advise(f._blk(2), mm.MADV_DONTNEED)
fadvise_dontneed(f.f.fileno(), 2*blksize, 1*blksize)
f.assertCache([0,0,0])
f.assertBlk(2, 'c2', {wl: {}})
f.assertCache([0,0,1])
# verify watching for 2 files over single watch link.
@func
def test_wcfs_watch_2files():
t = tDB(); zf1 = t.zfile
defer(t.close)
t.root['zfile2'] = zf2 = ZBigFile(blksize)
t.commit()
t.change(zf1, {0:'a2', 2:'c2'})
t.change(zf2, {1:'β2', 3:'δ2'})
at2 = t.commit()
t.change(zf1, {0:'a3', 2:'c3'})
t.change(zf2, {1:'β3', 3:'δ3'})
at3 = t.commit()
f1 = t.open(zf1)
f2 = t.open(zf2)
f1.assertData(['a3', '', 'x' ])
f2.assertData(['', 'β3', '', 'x'])
wl = t.openwatch()
w1 = wl.watch(zf1, at2, {0:at2})
w2 = wl.watch(zf2, at2, {1:at2})
def w_assertPin(pinw1, pinw2):
assert w1.pinned == pinw1
assert w2.pinned == pinw2
w_assertPin( {0:at2}, {1:at2})
f1.assertBlk(2, 'c3', {wl: {2:at2}})
w_assertPin( {0:at2, 2:at2}, {1:at2})
f2.assertBlk(3, 'δ3', {wl: {3:at2}})
w_assertPin( {0:at2, 2:at2}, {1:at2, 3:at2})
wl.watch(zf1, at3, {0:None, 2:None})
w_assertPin( {}, {1:at2, 3:at2})
wl.watch(zf2, at3, {1:None, 3:None})
w_assertPin( {}, {})
# TODO new watch request while previous watch request is in progress (over the same /head/watch handle)
# TODO @revX/ is automatically removed after some time
# ---- misc ----
# readfile reads file @ path.
@@ -975,6 +1886,18 @@ class tAt(bytes):
return "@" + h(at)
__str__ = __repr__
# hpin returns human-readable representation for {}blk->rev.
@func(tDB)
def hpin(t, pin):
pinv = []
for blk in sorted(pin.keys()):
if pin[blk] is None:
s = '@head'
else:
s = '%s' % pin[blk]
pinv.append('%d: %s' % (blk, s))
return '{%s}' % ', '.join(pinv)
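# For example hpin({2: at1, 3: None}) gives '{2: @<at1.hex>, 3: @head}' where
# <at1.hex> is at1 rendered via tAt.__repr__.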
# zfiles returns ZBigFiles that were ever changed under t.
@func(tDB)
......