Commit 9c225f26 authored by Linus Torvalds's avatar Linus Torvalds Committed by Al Viro

vfs: atomic f_pos accesses as per POSIX

Our write() system call has always been atomic in the sense that you get
the expected thread-safe contiguous write, but we haven't actually
guaranteed that concurrent writes are serialized wrt f_pos accesses, so
threads (or processes) that share a file descriptor and use "write()"
concurrently would quite likely overwrite each others data.

This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says:

 "2.9.7 Thread Interactions with Regular File Operations

  All of the following functions shall be atomic with respect to each
  other in the effects specified in POSIX.1-2008 when they operate on
  regular files or symbolic links: [...]"

and one of the effects is the file position update.

This unprotected file position behavior is not new behavior, and nobody
has ever cared.  Until now.  Yongzhi Pan reported unexpected behavior to
Michael Kerrisk that was due to this.

This resolves the issue with a f_pos-specific lock that is taken by
read/write/lseek on file descriptors that may be shared across threads
or processes.
Reported-by: default avatarYongzhi Pan <panyongzhi@gmail.com>
Reported-by: default avatarMichael Kerrisk <mtk.manpages@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
parent 1b56e989
......@@ -135,6 +135,7 @@ struct file *get_empty_filp(void)
atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
mutex_init(&f->f_pos_lock);
eventpoll_init_file(f);
/* f->f_version: 0 */
return f;
......
......@@ -1884,7 +1884,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->path = f.file->f_path;
if (flags & LOOKUP_RCU) {
if (f.need_put)
if (f.flags & FDPUT_FPUT)
*fp = f.file;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
rcu_read_lock();
......
......@@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f,
return 0;
}
/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
if (S_ISREG(inode->i_mode))
f->f_mode |= FMODE_ATOMIC_POS;
f->f_op = fops_get(inode->i_fop);
if (unlikely(WARN_ON(!f->f_op))) {
error = -ENODEV;
......
......@@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
}
EXPORT_SYMBOL(vfs_llseek);
/*
* We only lock f_pos if we have threads or if the file might be
* shared with another process. In both cases we'll have an elevated
* file count (done either by fdget() or by fork()).
*/
static inline struct fd fdget_pos(int fd)
{
struct fd f = fdget(fd);
struct file *file = f.file;
if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
if (file_count(file) > 1) {
f.flags |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
}
}
return f;
}
static inline void fdput_pos(struct fd f)
{
if (f.flags & FDPUT_POS_UNLOCK)
mutex_unlock(&f.file->f_pos_lock);
fdput(f);
}
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
off_t retval;
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
if (!f.file)
return -EBADF;
......@@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
}
fdput(f);
fdput_pos(f);
return retval;
}
......@@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
......@@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
ret = vfs_read(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput(f);
fdput_pos(f);
}
return ret;
}
......@@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
......@@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
ret = vfs_write(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput(f);
fdput_pos(f);
}
return ret;
......@@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev);
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
......@@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput(f);
fdput_pos(f);
}
if (ret > 0)
......@@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
......@@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput(f);
fdput_pos(f);
}
if (ret > 0)
......@@ -968,7 +994,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
compat_ulong_t, vlen)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;
......@@ -978,7 +1004,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
ret = compat_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
f.file->f_pos = pos;
fdput(f);
fdput_pos(f);
return ret;
}
......@@ -1035,7 +1061,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
const struct compat_iovec __user *, vec,
compat_ulong_t, vlen)
{
struct fd f = fdget(fd);
struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;
......@@ -1045,7 +1071,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
ret = compat_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
f.file->f_pos = pos;
fdput(f);
fdput_pos(f);
return ret;
}
......
......@@ -28,12 +28,14 @@ static inline void fput_light(struct file *file, int fput_needed)
struct fd {
struct file *file;
int need_put;
unsigned int flags;
};
#define FDPUT_FPUT 1
#define FDPUT_POS_UNLOCK 2
static inline void fdput(struct fd fd)
{
if (fd.need_put)
if (fd.flags & FDPUT_FPUT)
fput(fd.file);
}
......
......@@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH ((__force fmode_t)0x4000)
/* File needs atomic accesses to f_pos */
#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000)
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
......@@ -780,13 +783,14 @@ struct file {
const struct file_operations *f_op;
/*
* Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
* Protects f_ep_links, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode;
struct mutex f_pos_lock;
loff_t f_pos;
struct fown_struct f_owner;
const struct cred *f_cred;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment