Commit c75b1d94 authored by Jens Axboe's avatar Jens Axboe

fs: add fcntl() interface for setting/getting write life time hints

Define a set of write life time hints:

RWH_WRITE_LIFE_NOT_SET	No hint information set
RWH_WRITE_LIFE_NONE	No hints about write life time
RWH_WRITE_LIFE_SHORT	Data written has a short life time
RWH_WRITE_LIFE_MEDIUM	Data written has a medium life time
RWH_WRITE_LIFE_LONG	Data written has a long life time
RWH_WRITE_LIFE_EXTREME	Data written has an extremely long life time

The intent is for these values to be relative to each other, no
absolute meaning should be attached to these flag names.

Add an fcntl interface for querying these flags, and also for
setting them as well:

F_GET_RW_HINT		Returns the read/write hint set on the
			underlying inode.

F_SET_RW_HINT		Set one of the above write hints on the
			underlying inode.

F_GET_FILE_RW_HINT	Returns the read/write hint set on the
			file descriptor.

F_SET_FILE_RW_HINT	Set one of the above write hints on the
			file descriptor.

The user passes in a pointer to a 64-bit value to get/set these hints,
and the interface returns 0/-1 on success/error.

Sample program testing/implementing basic setting/getting of write
hints is below.

Add support for storing the write life time hint in the inode and in
struct file as well, and pass it along in the kiocb. If both a file
and its corresponding inode have a write hint, then we use the one in
the file, if available. The file hint can be used for sync/direct IO;
for buffered writeback only the inode hint is available.

This is in preparation for utilizing these hints in the block layer,
to guide on-media data placement.

/*
 * writehint.c: get or set an inode write hint
 */
 #include <stdio.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <stdbool.h>
 #include <inttypes.h>

 /*
  * Fallback definitions for build environments whose <fcntl.h> does not
  * provide the write hint commands yet. The +11/+12 offsets are the same
  * ones the kernel header in this patch uses for F_GET_RW_HINT and
  * F_SET_RW_HINT.
  */
 #ifndef F_GET_RW_HINT
 #define F_LINUX_SPECIFIC_BASE	1024
 #define F_GET_RW_HINT		(F_LINUX_SPECIFIC_BASE + 11)
 #define F_SET_RW_HINT		(F_LINUX_SPECIFIC_BASE + 12)
 #endif

/*
 * Printable names of the write life time hints, indexed by hint value
 * (0 = not set ... 5 = extreme). The table is read-only, so both the
 * strings and the array slots are const.
 */
static const char *const str[] = {
	"RWF_WRITE_LIFE_NOT_SET",
	"RWH_WRITE_LIFE_NONE",
	"RWH_WRITE_LIFE_SHORT",
	"RWH_WRITE_LIFE_MEDIUM",
	"RWH_WRITE_LIFE_LONG",
	"RWH_WRITE_LIFE_EXTREME",
};

/*
 * Usage: writehint file [hint]
 *
 * When a hint argument is given, set it as the inode write hint of the
 * file with F_SET_RW_HINT. In every case, read the hint back with
 * F_GET_RW_HINT and print it.
 *
 * Exit status: 0 on success, 1 on bad usage, 2 if open() fails, 3 if
 * F_GET_RW_HINT fails, 4 if F_SET_RW_HINT fails, 5 if the hint argument
 * is not a number.
 */
int main(int argc, char *argv[])
{
	const size_t nr_hints = sizeof(str) / sizeof(str[0]);
	uint64_t hint;
	int fd, ret;

	if (argc < 2) {
		fprintf(stderr, "%s: file <hint>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 2;
	}

	if (argc > 2) {
		char *end;

		/*
		 * Only reject input that is not a number at all; range
		 * checking is left to the kernel so that its validation
		 * can still be exercised with out-of-range values.
		 */
		hint = strtoull(argv[2], &end, 10);
		if (end == argv[2] || *end != '\0') {
			fprintf(stderr, "%s: invalid hint '%s'\n",
				argv[0], argv[2]);
			close(fd);
			return 5;
		}

		ret = fcntl(fd, F_SET_RW_HINT, &hint);
		if (ret < 0) {
			perror("fcntl: F_SET_RW_HINT");
			close(fd);
			return 4;
		}
	}

	ret = fcntl(fd, F_GET_RW_HINT, &hint);
	if (ret < 0) {
		perror("fcntl: F_GET_RW_HINT");
		close(fd);
		return 3;
	}

	/* Never index the name table with a value it has no entry for. */
	if (hint < nr_hints)
		printf("%s: hint %s\n", argv[1], str[hint]);
	else
		printf("%s: hint %" PRIu64 " (unknown)\n", argv[1], hint);

	close(fd);
	return 0;
}
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 12e9a6d6
...@@ -243,6 +243,62 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) ...@@ -243,6 +243,62 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
} }
#endif #endif
/*
 * Report whether @hint is one of the write life time hint values that
 * user space is allowed to set.
 */
static bool rw_hint_valid(enum rw_hint hint)
{
	bool known = false;

	switch (hint) {
	case RWF_WRITE_LIFE_NOT_SET:
	case RWH_WRITE_LIFE_NONE:
	case RWH_WRITE_LIFE_SHORT:
	case RWH_WRITE_LIFE_MEDIUM:
	case RWH_WRITE_LIFE_LONG:
	case RWH_WRITE_LIFE_EXTREME:
		known = true;
		break;
	default:
		break;
	}

	return known;
}
static long fcntl_rw_hint(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 *argp = (u64 __user *)arg;
enum rw_hint hint;
switch (cmd) {
case F_GET_FILE_RW_HINT:
if (put_user(file_write_hint(file), argp))
return -EFAULT;
return 0;
case F_SET_FILE_RW_HINT:
if (get_user(hint, argp))
return -EFAULT;
if (!rw_hint_valid(hint))
return -EINVAL;
spin_lock(&file->f_lock);
file->f_write_hint = hint;
spin_unlock(&file->f_lock);
return 0;
case F_GET_RW_HINT:
if (put_user(inode->i_write_hint, argp))
return -EFAULT;
return 0;
case F_SET_RW_HINT:
if (get_user(hint, argp))
return -EFAULT;
if (!rw_hint_valid(hint))
return -EINVAL;
inode_lock(inode);
inode->i_write_hint = hint;
inode_unlock(inode);
return 0;
default:
return -EINVAL;
}
}
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp) struct file *filp)
{ {
...@@ -337,6 +393,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, ...@@ -337,6 +393,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_GET_SEALS: case F_GET_SEALS:
err = shmem_fcntl(filp, cmd, arg); err = shmem_fcntl(filp, cmd, arg);
break; break;
case F_GET_RW_HINT:
case F_SET_RW_HINT:
case F_GET_FILE_RW_HINT:
case F_SET_FILE_RW_HINT:
err = fcntl_rw_hint(filp, cmd, arg);
break;
default: default:
break; break;
} }
......
...@@ -146,6 +146,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) ...@@ -146,6 +146,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
i_gid_write(inode, 0); i_gid_write(inode, 0);
atomic_set(&inode->i_writecount, 0); atomic_set(&inode->i_writecount, 0);
inode->i_size = 0; inode->i_size = 0;
inode->i_write_hint = WRITE_LIFE_NOT_SET;
inode->i_blocks = 0; inode->i_blocks = 0;
inode->i_bytes = 0; inode->i_bytes = 0;
inode->i_generation = 0; inode->i_generation = 0;
......
...@@ -759,6 +759,7 @@ static int do_dentry_open(struct file *f, ...@@ -759,6 +759,7 @@ static int do_dentry_open(struct file *f,
likely(f->f_op->write || f->f_op->write_iter)) likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE; f->f_mode |= FMODE_CAN_WRITE;
f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <linux/rwsem.h> #include <linux/rwsem.h>
#include <linux/capability.h> #include <linux/capability.h>
#include <linux/semaphore.h> #include <linux/semaphore.h>
#include <linux/fcntl.h>
#include <linux/fiemap.h> #include <linux/fiemap.h>
#include <linux/rculist_bl.h> #include <linux/rculist_bl.h>
#include <linux/atomic.h> #include <linux/atomic.h>
...@@ -265,6 +266,18 @@ struct page; ...@@ -265,6 +266,18 @@ struct page;
struct address_space; struct address_space;
struct writeback_control; struct writeback_control;
/*
 * Write life time hint values.
 *
 * Kernel-internal names for the hints. WRITE_LIFE_NOT_SET is 0; every
 * other member takes the value of the RWH_WRITE_LIFE_* constant of the
 * same name, so a valid hint number passed in through fcntl() can be
 * stored in this type unchanged.
 */
enum rw_hint {
WRITE_LIFE_NOT_SET = 0,
WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
};
#define IOCB_EVENTFD (1 << 0) #define IOCB_EVENTFD (1 << 0)
#define IOCB_APPEND (1 << 1) #define IOCB_APPEND (1 << 1)
#define IOCB_DIRECT (1 << 2) #define IOCB_DIRECT (1 << 2)
...@@ -280,6 +293,7 @@ struct kiocb { ...@@ -280,6 +293,7 @@ struct kiocb {
void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
void *private; void *private;
int ki_flags; int ki_flags;
enum rw_hint ki_hint;
}; };
static inline bool is_sync_kiocb(struct kiocb *kiocb) static inline bool is_sync_kiocb(struct kiocb *kiocb)
...@@ -287,16 +301,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) ...@@ -287,16 +301,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb)
return kiocb->ki_complete == NULL; return kiocb->ki_complete == NULL;
} }
static inline int iocb_flags(struct file *file);
/*
 * Initialise @kiocb for I/O on @filp. The compound-literal assignment
 * zeroes every field that is not named, so ki_complete ends up NULL,
 * which is the condition is_sync_kiocb() tests for.
 */
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
*kiocb = (struct kiocb) {
.ki_filp = filp,
.ki_flags = iocb_flags(filp),
};
}
/* /*
* "descriptor" for what we're up to with a read. * "descriptor" for what we're up to with a read.
* This allows us to use the same read code yet * This allows us to use the same read code yet
...@@ -597,6 +601,7 @@ struct inode { ...@@ -597,6 +601,7 @@ struct inode {
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes; unsigned short i_bytes;
unsigned int i_blkbits; unsigned int i_blkbits;
enum rw_hint i_write_hint;
blkcnt_t i_blocks; blkcnt_t i_blocks;
#ifdef __NEED_I_SIZE_ORDERED #ifdef __NEED_I_SIZE_ORDERED
...@@ -851,6 +856,7 @@ struct file { ...@@ -851,6 +856,7 @@ struct file {
* Must not be taken from IRQ context. * Must not be taken from IRQ context.
*/ */
spinlock_t f_lock; spinlock_t f_lock;
enum rw_hint f_write_hint;
atomic_long_t f_count; atomic_long_t f_count;
unsigned int f_flags; unsigned int f_flags;
fmode_t f_mode; fmode_t f_mode;
...@@ -1026,8 +1032,6 @@ struct file_lock_context { ...@@ -1026,8 +1032,6 @@ struct file_lock_context {
#define OFFT_OFFSET_MAX INT_LIMIT(off_t) #define OFFT_OFFSET_MAX INT_LIMIT(off_t)
#endif #endif
#include <linux/fcntl.h>
extern void send_sigio(struct fown_struct *fown, int fd, int band); extern void send_sigio(struct fown_struct *fown, int fd, int band);
/* /*
...@@ -1878,6 +1882,25 @@ static inline bool HAS_UNMAPPED_ID(struct inode *inode) ...@@ -1878,6 +1882,25 @@ static inline bool HAS_UNMAPPED_ID(struct inode *inode)
return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid); return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
} }
static inline enum rw_hint file_write_hint(struct file *file)
{
if (file->f_write_hint != WRITE_LIFE_NOT_SET)
return file->f_write_hint;
return file_inode(file)->i_write_hint;
}
static inline int iocb_flags(struct file *file);
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
*kiocb = (struct kiocb) {
.ki_filp = filp,
.ki_flags = iocb_flags(filp),
.ki_hint = file_write_hint(filp),
};
}
/* /*
* Inode state bits. Protected by inode->i_lock * Inode state bits. Protected by inode->i_lock
* *
......
...@@ -42,6 +42,27 @@ ...@@ -42,6 +42,27 @@
#define F_SEAL_WRITE 0x0008 /* prevent writes */ #define F_SEAL_WRITE 0x0008 /* prevent writes */
/* (1U << 31) is reserved for signed error codes */ /* (1U << 31) is reserved for signed error codes */
/*
 * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the
 * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on
 * the specific file.
 *
 * For all four commands the fcntl() argument is a pointer to a 64-bit
 * value holding one of the RWH_WRITE_LIFE_* hints.
 */
#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11)
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
/*
 * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
 * used to clear any hints previously set.
 */
#define RWH_WRITE_LIFE_NOT_SET	0
#define RWH_WRITE_LIFE_NONE	1
#define RWH_WRITE_LIFE_SHORT	2
#define RWH_WRITE_LIFE_MEDIUM	3
#define RWH_WRITE_LIFE_LONG	4
#define RWH_WRITE_LIFE_EXTREME	5

/*
 * "RWF_" is a misspelling of the "RWH_" prefix shared by all the other
 * hint values. The name is kept as an alias so that code already using
 * it keeps building.
 */
#define RWF_WRITE_LIFE_NOT_SET	RWH_WRITE_LIFE_NOT_SET
/* /*
* Types of directory notifications that may be requested. * Types of directory notifications that may be requested.
*/ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment