// Copyright (C) 2018-2020  Nexedi SA and Contributors.
//                          Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.

// Package wcfs provides WCFS client integrated with user-space virtual memory manager.
//
// This client package takes care of WCFS isolation protocol details and
// provides clients with a simple interface to an isolated view of bigfile
// data on WCFS, similar to regular files: given a particular database
// revision @at, it provides synthetic read-only bigfile memory mappings with
// data corresponding to the @at state, while using /head/bigfile/* most of
// the time to build and maintain the mappings.
//
// For its data a mapping to bigfile X mostly reuses the kernel cache for
// /head/bigfile/X, with the amount of data not associated with that kernel
// cache being proportional to δ(bigfile/X, at..head). In the usual case,
// where many client workers simultaneously serve requests, their database
// views are a bit outdated but close to head, which means that in practice
// the kernel cache for /head/bigfile/* is used almost 100% of the time.
//
// A mapping for bigfile X@at is built from OS-level memory mappings of
// on-WCFS files as follows:
//
//                                           ___        /@revA/bigfile/X
//         __                                           /@revB/bigfile/X
//                _                                     /@revC/bigfile/X
//                            +                         ...
//      ───  ───── ──────────────────────────   ─────   /head/bigfile/X
//
// where @revR mmaps are dynamically added/removed by this client package to
// maintain the X@at data view according to the WCFS isolation protocol(*).
//
//
// Integration with wendelin.core virtmem layer
//
// This client package can be used standalone, but additionally provides
// integration with the wendelin.core userspace virtual memory manager: when a
// Mapping is created, it can be associated as the serving base layer for a
// particular virtmem VMA via FileH.mmap(vma=...). In that case, as virtmem
// itself adds another layer of dirty pages over the read-only base provided
// by the Mapping(+)
//
//                  ┌──┐                      ┌──┐
//                  │RW│                      │RW│    ← virtmem VMA dirty pages
//                  └──┘                      └──┘
//                            +
//                                                    VMA base = X@at view provided by Mapping:
//
//                                           ___        /@revA/bigfile/X
//         __                                           /@revB/bigfile/X
//                _                                     /@revC/bigfile/X
//                            +                         ...
//      ───  ───── ──────────────────────────   ─────   /head/bigfile/X
//
// the Mapping will interact with the virtmem layer to coordinate updates to
// the mapping's virtual memory.
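//
// For instance, a hedged sketch of attaching a Mapping as the base layer of a
// virtmem VMA (here `f` is a FileH, and `vma` is assumed to be already set up
// by the virtmem layer; see (+) below for the virtmem point of view):
//
//  Mapping m;  error err;
//  tie(m, err) = f->mmap(blk_start, blk_len, vma);  // m serves as base for vma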
//
//
// API overview
//
//  - `WCFS` represents filesystem-level connection to wcfs server.
//  - `Conn` represents logical connection that provides view of data on wcfs
//    filesystem as of particular database state.
//  - `FileH` represents isolated file view under Conn.
//  - `Mapping` represents one memory mapping of FileH.
//
// A path from WCFS to Mapping is as follows:
//
//  WCFS.connect(at)                    -> Conn
//  Conn.open(foid)                     -> FileH
//  FileH.mmap([blk_start +blk_len))    -> Mapping
//
// A connection can be resynced to another database view via Conn.resync(at').
//
// The documentation for each class provides a more thorough overview and API
// details.
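//
// For example, a minimal usage sketch (assuming `at` and `foid` are provided
// by the application, the wcfs mountpoint is already known, and std::tie is
// used to unpack the (value, error) pairs; all but the first error check are
// elided for brevity):
//
//  WCFS wc;  wc.mountpoint = "/path/to/wcfs/mountpoint";
//
//  Conn conn;  error err;
//  tie(conn, err) = wc.connect(at);
//  if (err != nil)
//      return err;                 // or handle the error in another way
//
//  FileH f;
//  tie(f, err) = conn->open(foid);
//  Mapping m;
//  tie(m, err) = f->mmap(/*blk_start=*/0, /*blk_len=*/4);
//
//  // read-only data of blocks [0,4) as of @at is now at [m->mem_start, m->mem_stop)
//
//  m->unmap();  f->close();  conn->close();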
//
// --------
//
// (*) see wcfs.go documentation for WCFS isolation protocol overview and details.
// (+) see bigfile_ops interface (wendelin/bigfile/file.h) that gives virtmem
//     point of view on layering.

#ifndef _NXD_WCFS_H_
#define _NXD_WCFS_H_

#include <golang/libgolang.h>
#include <golang/cxx.h>
#include <golang/sync.h>

#include <tuple>
#include <utility>

#include "wcfs_misc.h"
#include <wendelin/bug.h>

// from wendelin/bigfile/virtmem.h
extern "C" {
struct VMA;
}


// wcfs::
namespace wcfs {

using namespace golang;
using cxx::dict;
using cxx::set;
using std::tuple;
using std::pair;


typedef refptr<struct _Conn> Conn;
typedef refptr<struct _Mapping> Mapping;
typedef refptr<struct _FileH> FileH;
typedef refptr<struct _WatchLink> WatchLink;
struct PinReq;


// WCFS represents filesystem-level connection to wcfs server.
//
// Use wcfs.join in Python API to create it.
//
// The primary way to access wcfs is to open logical connection viewing on-wcfs
// data as of particular database state, and use that logical connection to
// create base-layer mappings. See .connect and Conn for details.
//
// WCFS logically mirrors ZODB.DB .
// It is safe to use WCFS from multiple threads simultaneously.
struct WCFS {
    string  mountpoint;

    pair<Conn, error>       connect(zodb::Tid at);
    pair<WatchLink, error>  _openwatch();

    string String() const;
    error _headWait(zodb::Tid at);

    // At the OS level, on-WCFS raw files can be accessed via ._path and ._open.
    string                  _path(const string &obj);
    tuple<os::File, error>  _open(const string &path, int flags=O_RDONLY);
};

// Conn represents logical connection that provides view of data on wcfs
// filesystem as of particular database state.
//
// It uses /head/bigfile/* and notifications received from /head/watch to
// maintain isolated database view while at the same time sharing most of the
// data cache in OS pagecache of /head/bigfile/*.
//
// Use WCFS.connect(at) to create Conn.
// Use .open to create new FileH.
// Use .resync to resync Conn onto different database view.
//
// Conn logically mirrors ZODB.Connection .
// It is safe to use Conn from multiple threads simultaneously.
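//
// For example, a hedged sketch of advancing an existing connection to a newer
// database state (`conn` was returned by WCFS.connect and `at_new` is assumed
// to come from the application, e.g. the tid of the last seen transaction):
//
//  error err = conn->resync(at_new);
//  if (err != nil)
//      return err;
//  // file views and mappings opened under conn now correspond to at_new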
typedef refptr<struct _Conn> Conn;
struct _Conn : os::_IAfterFork, object {
    WCFS        *_wc;
    WatchLink   _wlink; // watch/receive pins for mappings created under this conn

    // atMu protects .at.
    // While it is rlocked, .at is guaranteed to stay unchanged and the Conn
    // keeps viewing the database at that particular state. .resync write-locks
    // this and knows no one is using the connection for reading simultaneously.
    sync::RWMutex _atMu;
    zodb::Tid     _at;

    sync::RWMutex           _filehMu;  // _atMu.W  |  _atMu.R + _filehMu
    error                   _downErr;  // !nil if connection is closed or no longer operational
    dict<zodb::Oid, FileH>  _filehTab; // {} foid -> fileh

    sync::WorkGroup _pinWG;     // pin/unpin messages from wcfs are served by _pinner
    func<void()>    _pinCancel; // spawned under _pinWG.

    // don't new - create via WCFS.connect
private:
    _Conn();
    virtual ~_Conn();
    friend pair<Conn, error> WCFS::connect(zodb::Tid at);
public:
    void incref();
    void decref();

public:
    zodb::Tid at();
    pair<FileH, error> open(zodb::Oid foid);
    error close();
    error resync(zodb::Tid at);

    string String() const;

private:
    error _pinner(context::Context ctx);
    error __pinner(context::Context ctx);
    error _pin1(PinReq *req);
    error __pin1(PinReq *req);

    void afterFork();
};

// FileH represents isolated file view under Conn.
//
// The file view is maintained to be as of @Conn.at database state even in the
// presence of simultaneous database changes. The file view primarily uses
// /head/<file>/data, with /@revX/<file>/data used for pin overrides.
//
// Use .mmap to map file view into memory.
//
// It is safe to use FileH from multiple threads simultaneously.
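//
// For example, a hedged sketch of mapping the first 8 blocks of a file opened
// under `conn` (`foid` is assumed to be the ZBigFile root object ID known to
// the application):
//
//  FileH f;  error err;
//  tie(f, err) = conn->open(foid);
//  if (err != nil)
//      return err;
//  Mapping m;
//  tie(m, err) = f->mmap(/*blk_start=*/0, /*blk_len=*/8);  // covers 8*f->blksize bytes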
enum _FileHState {
    // NOTE order of states is semantically important
    _FileHOpening   = 0,    // FileH open is in progress
    _FileHOpened    = 1,    // FileH is opened and can be used
    _FileHClosing   = 2,    // FileH close is in progress
    _FileHClosed    = 3,    // FileH is closed
};
typedef refptr<struct _FileH> FileH;
struct _FileH : object {
    Conn        wconn;
    zodb::Oid   foid;       // ZBigFile root object ID (does not change after fileh open)

    // protected by wconn._filehMu
    _FileHState _state;  // opening/opened/closing/closed
    int         _nopen;  // number of times Conn.open returned this fileh

    chan<structZ> _openReady; // in-flight open completed
    error         _openErr;   // error result from open
    chan<structZ> _closedq;   // in-flight close completed

    os::File      _headf;     // file object of head/file
    size_t        blksize;    // block size of this file (does not change after fileh open)

    // head/file size is known to be at least headfsize (size ↑=)
    // protected by .wconn._atMu
    off_t       _headfsize;

    sync::Mutex               _mmapMu; // atMu.W  |  atMu.R + _mmapMu
    dict<int64_t, zodb::Tid>  _pinned; // {} blk -> rev   that wcfs already sent us for this file
    vector<Mapping>           _mmaps;  // []Mapping ↑blk_start    mappings of this file

    // don't new - create via Conn.open
private:
    _FileH();
    ~_FileH();
    friend pair<FileH, error> _Conn::open(zodb::Oid foid);
public:
    void decref();

public:
    error close();
    pair<Mapping, error> mmap(int64_t blk_start, int64_t blk_len, VMA *vma=nil);
    string String() const;

    error _open();
    error _closeLocked(bool force);
    void  _afterFork();
};

// Mapping represents one memory mapping of FileH.
//
// The mapped memory is [.mem_start, .mem_stop)
// Use .unmap to release virtual memory resources used by mapping.
//
// Except for unmap, it is safe to use Mapping from multiple threads simultaneously.
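//
// For example, a hedged sketch of reading the data of file block `blk` through
// a mapping `m` that covers it (i.e. m->blk_start <= blk < m->blk_stop()):
//
//  size_t  blksize = m->fileh->blksize;
//  uint8_t *blkmem = m->mem_start + (blk - m->blk_start)*blksize;
//  // [blkmem, blkmem + blksize) is read-only data of block blk as of @Conn.at
//  ...
//  m->unmap();   // release the mapping when it is no longer needed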
typedef refptr<struct _Mapping> Mapping;
struct _Mapping : object {
    FileH   fileh;
    int64_t blk_start;  // offset of this mapping in file

    // protected by fileh._mmapMu
    uint8_t  *mem_start;    // mmapped memory [mem_start, mem_stop)
    uint8_t  *mem_stop;
    VMA      *vma;          // mmapped under this virtmem VMA | nil if created standalone from virtmem
    bool     efaulted;      // y after mapping was switched to be invalid (gives SIGSEGV on access)

    int64_t blk_stop() const {
        ASSERT((mem_stop - mem_start) % fileh->blksize == 0);
        return blk_start + (mem_stop - mem_start) / fileh->blksize;
    }

    error remmap_blk(int64_t blk); // for virtmem-only
    error unmap();

    void  _assertVMAOk();
    error _remmapblk(int64_t blk, zodb::Tid at);
    error __remmapAsEfault();
    error __remmapBlkAsEfault(int64_t blk);

    // don't new - create via FileH.mmap
private:
    _Mapping();
    ~_Mapping();
    friend pair<Mapping, error> _FileH::mmap(int64_t blk_start, int64_t blk_len, VMA *vma);
public:
    void decref();

    string String() const;
};


// for testing
dict<int64_t, zodb::Tid> _tfileh_pinned(FileH fileh);


}   // wcfs::

#endif