Commit 932d166b authored by Kirill Smelkov

[4.19 backport] fuse: allow filesystems to have precise control over data cache

On networked filesystems file data can be changed externally.  FUSE
provides notification messages for filesystem to inform kernel that
metadata or data region of a file needs to be invalidated in local page
cache. That provides the basis for filesystem implementations to invalidate
kernel cache explicitly based on observed filesystem-specific events.

FUSE also has an "automatic" invalidation mode(*) in which the kernel
automatically invalidates the data cache of a file if it sees an mtime
change.  It also automatically invalidates the whole data cache of a file
if it sees the file size change.

The automatic mode has a corresponding capability - FUSE_AUTO_INVAL_DATA.
However, probably for historical reasons, that capability controls only
whether an mtime change should result in automatic invalidation or not.
A change in file size always results in invalidating the whole data cache
of a file, regardless of whether FUSE_AUTO_INVAL_DATA was negotiated(+).

The filesystem I write[1] represents data arrays stored in networked
database as local files suitable for mmap. It is read-only filesystem -
changes to data are committed externally via database interfaces and the
filesystem only glues data into contiguous file streams suitable for mmap
and traditional array processing. The files are big - hundreds of
gigabytes and more. The files change regularly, most often by data being
appended to their end. The size of the files thus changes
frequently.

If a file was accessed locally and some part of its data got into page
cache, we want that data to stay cached unless there is memory pressure, or
unless corresponding part of the file was actually changed. However current
FUSE behaviour - when it sees file size change - is to invalidate the whole
file. The data cache of the file is thus completely lost even on small size
change, and despite that the filesystem server is careful to accurately
translate database changes into FUSE invalidation messages to kernel.

Let's fix it: if a filesystem, through the new FUSE_EXPLICIT_INVAL_DATA
capability, indicates to the kernel that it is fully responsible for data
cache invalidation, then the kernel won't invalidate a file's data cache on
size change and will only truncate that cache to the new size if the size
decreased.

(*) see 72d0d248 "fuse: add FUSE_AUTO_INVAL_DATA init flag",
eed2179e "fuse: invalidate inode mapping if mtime changes"

(+) in writeback mode the kernel does not invalidate the data cache on file
size change, but it also does not allow the filesystem to set the size due
to an external event (see 8373200b "fuse: Trust kernel i_size only")

[1] https://lab.nexedi.com/kirr/wendelin.core/blob/a50f1d9f/wcfs/wcfs.go#L20

Signed-off-by: Kirill Smelkov <kirr@nexedi.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
parent 1327080d
...@@ -616,6 +616,9 @@ struct fuse_conn { ...@@ -616,6 +616,9 @@ struct fuse_conn {
/** Use enhanced/automatic page cache invalidation. */ /** Use enhanced/automatic page cache invalidation. */
unsigned auto_inval_data:1; unsigned auto_inval_data:1;
/** Filesystem is fully responsible for page cache invalidation. */
unsigned explicit_inval_data:1;
/** Does the filesystem support readdirplus? */ /** Does the filesystem support readdirplus? */
unsigned do_readdirplus:1; unsigned do_readdirplus:1;
......
...@@ -235,7 +235,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, ...@@ -235,7 +235,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
if (oldsize != attr->size) { if (oldsize != attr->size) {
truncate_pagecache(inode, attr->size); truncate_pagecache(inode, attr->size);
inval = true; if (!fc->explicit_inval_data)
inval = true;
} else if (fc->auto_inval_data) { } else if (fc->auto_inval_data) {
struct timespec64 new_mtime = { struct timespec64 new_mtime = {
.tv_sec = attr->mtime, .tv_sec = attr->mtime,
...@@ -867,6 +868,9 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) ...@@ -867,6 +868,9 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
} }
} }
// FIXME hack (in-tree include/uapi/linux/fuse.h is not included when building as make -C /lib/modules/`uname -r`/build M=`pwd`)
#define FUSE_EXPLICIT_INVAL_DATA (1 << 25)
static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{ {
struct fuse_init_out *arg = &req->misc.init_out; struct fuse_init_out *arg = &req->misc.init_out;
...@@ -904,6 +908,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) ...@@ -904,6 +908,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->dont_mask = 1; fc->dont_mask = 1;
if (arg->flags & FUSE_AUTO_INVAL_DATA) if (arg->flags & FUSE_AUTO_INVAL_DATA)
fc->auto_inval_data = 1; fc->auto_inval_data = 1;
else if (arg->flags & FUSE_EXPLICIT_INVAL_DATA)
fc->explicit_inval_data = 1;
if (arg->flags & FUSE_DO_READDIRPLUS) { if (arg->flags & FUSE_DO_READDIRPLUS) {
fc->do_readdirplus = 1; fc->do_readdirplus = 1;
if (arg->flags & FUSE_READDIRPLUS_AUTO) if (arg->flags & FUSE_READDIRPLUS_AUTO)
...@@ -957,7 +963,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) ...@@ -957,7 +963,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
FUSE_ABORT_ERROR; FUSE_ABORT_ERROR |
FUSE_EXPLICIT_INVAL_DATA;
req->in.h.opcode = FUSE_INIT; req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1; req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg); req->in.args[0].size = sizeof(*arg);
......
...@@ -251,6 +251,7 @@ struct fuse_file_lock { ...@@ -251,6 +251,7 @@ struct fuse_file_lock {
* FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc * FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc
* FUSE_POSIX_ACL: filesystem supports posix acls * FUSE_POSIX_ACL: filesystem supports posix acls
* FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED
* FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request
*/ */
#define FUSE_ASYNC_READ (1 << 0) #define FUSE_ASYNC_READ (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1) #define FUSE_POSIX_LOCKS (1 << 1)
...@@ -274,6 +275,7 @@ struct fuse_file_lock { ...@@ -274,6 +275,7 @@ struct fuse_file_lock {
#define FUSE_HANDLE_KILLPRIV (1 << 19) #define FUSE_HANDLE_KILLPRIV (1 << 19)
#define FUSE_POSIX_ACL (1 << 20) #define FUSE_POSIX_ACL (1 << 20)
#define FUSE_ABORT_ERROR (1 << 21) #define FUSE_ABORT_ERROR (1 << 21)
#define FUSE_EXPLICIT_INVAL_DATA (1 << 25)
/** /**
* CUSE INIT request/reply flags * CUSE INIT request/reply flags
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment