Commit b0d7c223 authored by Yan, Zheng's avatar Yan, Zheng Committed by Sage Weil

ceph: introduce i_truncate_mutex

I encountered below deadlock when running fsstress

wmtruncate work      truncate                 MDS
---------------  ------------------  --------------------------
                   lock i_mutex
                                      <- truncate file
lock i_mutex (blocked)
                                      <- revoking Fcb (filelock to MIX)
                   send request ->
                                         handle request (xlock filelock)

At the initial time, there are some dirty pages in the page cache.
When the kclient receives the truncate message, it reduces inode size
and creates some 'out of i_size' dirty pages. wmtruncate work can't
truncate these dirty pages because it's blocked by the i_mutex. Later
when the kclient receives the cap message that revokes Fcb caps, It
can't flush all dirty pages because writepages() only flushes dirty
pages within the inode size.

When the MDS handles the 'truncate' request from kclient, it waits
for the filelock to become stable. But the filelock is stuck in
unstable state because it can't finish revoking kclient's Fcb caps.

The truncate pagecache locking has already caused lots of trouble
for use. I think it's time simplify it by introducing a new mutex.
We use the new mutex to prevent concurrent truncate_inode_pages().
There is no need to worry about race between buffered write and
truncate_inode_pages(), because our "get caps" mechanism prevents
them from concurrent execution.
Reviewed-by: default avatarSage Weil <sage@inktank.com>
Signed-off-by: default avatarYan, Zheng <zheng.z.yan@intel.com>
parent b150f5c1
...@@ -2072,11 +2072,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, ...@@ -2072,11 +2072,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
/* finish pending truncate */ /* finish pending truncate */
while (ci->i_truncate_pending) { while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (!(need & CEPH_CAP_FILE_WR))
mutex_lock(&inode->i_mutex);
__ceph_do_pending_vmtruncate(inode); __ceph_do_pending_vmtruncate(inode);
if (!(need & CEPH_CAP_FILE_WR))
mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
} }
......
...@@ -773,6 +773,13 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, ...@@ -773,6 +773,13 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
goto retry_snap; goto retry_snap;
} }
} else { } else {
/*
* No need to acquire the i_truncate_mutex. Because
* the MDS revokes Fwb caps before sending truncate
* message to us. We can't get Fwb cap while there
* are pending vmtruncate. So write and vmtruncate
* can not run at the same time
*/
written = generic_file_buffered_write(iocb, iov, nr_segs, written = generic_file_buffered_write(iocb, iov, nr_segs,
pos, &iocb->ki_pos, pos, &iocb->ki_pos,
count, 0); count, 0);
...@@ -819,7 +826,6 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) ...@@ -819,7 +826,6 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
int ret; int ret;
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
__ceph_do_pending_vmtruncate(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
......
...@@ -352,6 +352,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ...@@ -352,6 +352,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
for (i = 0; i < CEPH_FILE_MODE_NUM; i++) for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
ci->i_nr_by_mode[i] = 0; ci->i_nr_by_mode[i] = 0;
mutex_init(&ci->i_truncate_mutex);
ci->i_truncate_seq = 0; ci->i_truncate_seq = 0;
ci->i_truncate_size = 0; ci->i_truncate_size = 0;
ci->i_truncate_pending = 0; ci->i_truncate_pending = 0;
...@@ -463,16 +464,20 @@ int ceph_fill_file_size(struct inode *inode, int issued, ...@@ -463,16 +464,20 @@ int ceph_fill_file_size(struct inode *inode, int issued,
dout("truncate_seq %u -> %u\n", dout("truncate_seq %u -> %u\n",
ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq, truncate_seq);
ci->i_truncate_seq = truncate_seq; ci->i_truncate_seq = truncate_seq;
/* the MDS should have revoked these caps */
WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
CEPH_CAP_FILE_RD |
CEPH_CAP_FILE_WR |
CEPH_CAP_FILE_LAZYIO));
/* /*
* If we hold relevant caps, or in the case where we're * If we hold relevant caps, or in the case where we're
* not the only client referencing this file and we * not the only client referencing this file and we
* don't hold those caps, then we need to check whether * don't hold those caps, then we need to check whether
* the file is either opened or mmaped * the file is either opened or mmaped
*/ */
if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| if ((issued & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| CEPH_CAP_FILE_BUFFER)) ||
CEPH_CAP_FILE_EXCL|
CEPH_CAP_FILE_LAZYIO)) ||
mapping_mapped(inode->i_mapping) || mapping_mapped(inode->i_mapping) ||
__ceph_caps_file_wanted(ci)) { __ceph_caps_file_wanted(ci)) {
ci->i_truncate_pending++; ci->i_truncate_pending++;
...@@ -1427,18 +1432,20 @@ static void ceph_invalidate_work(struct work_struct *work) ...@@ -1427,18 +1432,20 @@ static void ceph_invalidate_work(struct work_struct *work)
u32 orig_gen; u32 orig_gen;
int check = 0; int check = 0;
mutex_lock(&ci->i_truncate_mutex);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
dout("invalidate_pages %p gen %d revoking %d\n", inode, dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking); ci->i_rdcache_gen, ci->i_rdcache_revoking);
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
/* nevermind! */ /* nevermind! */
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
goto out; goto out;
} }
orig_gen = ci->i_rdcache_gen; orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
truncate_inode_pages(&inode->i_data, 0); truncate_inode_pages(inode->i_mapping, 0);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen && if (orig_gen == ci->i_rdcache_gen &&
...@@ -1453,6 +1460,7 @@ static void ceph_invalidate_work(struct work_struct *work) ...@@ -1453,6 +1460,7 @@ static void ceph_invalidate_work(struct work_struct *work)
ci->i_rdcache_revoking); ci->i_rdcache_revoking);
} }
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
if (check) if (check)
ceph_check_caps(ci, 0, NULL); ceph_check_caps(ci, 0, NULL);
...@@ -1473,16 +1481,7 @@ static void ceph_vmtruncate_work(struct work_struct *work) ...@@ -1473,16 +1481,7 @@ static void ceph_vmtruncate_work(struct work_struct *work)
struct inode *inode = &ci->vfs_inode; struct inode *inode = &ci->vfs_inode;
dout("vmtruncate_work %p\n", inode); dout("vmtruncate_work %p\n", inode);
if (!mutex_trylock(&inode->i_mutex)) {
/*
* the i_mutex can be hold by a writer who is waiting for
* caps. wake up waiters, they will do pending vmtruncate.
*/
wake_up_all(&ci->i_cap_wq);
mutex_lock(&inode->i_mutex);
}
__ceph_do_pending_vmtruncate(inode); __ceph_do_pending_vmtruncate(inode);
mutex_unlock(&inode->i_mutex);
iput(inode); iput(inode);
} }
...@@ -1515,11 +1514,13 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) ...@@ -1515,11 +1514,13 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
u64 to; u64 to;
int wrbuffer_refs, finish = 0; int wrbuffer_refs, finish = 0;
mutex_lock(&ci->i_truncate_mutex);
retry: retry:
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (ci->i_truncate_pending == 0) { if (ci->i_truncate_pending == 0) {
dout("__do_pending_vmtruncate %p none pending\n", inode); dout("__do_pending_vmtruncate %p none pending\n", inode);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
return; return;
} }
...@@ -1536,6 +1537,9 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) ...@@ -1536,6 +1537,9 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
goto retry; goto retry;
} }
/* there should be no reader or writer */
WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
to = ci->i_truncate_size; to = ci->i_truncate_size;
wrbuffer_refs = ci->i_wrbuffer_ref; wrbuffer_refs = ci->i_wrbuffer_ref;
dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
...@@ -1553,6 +1557,8 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) ...@@ -1553,6 +1557,8 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
if (!finish) if (!finish)
goto retry; goto retry;
mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0) if (wrbuffer_refs == 0)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
...@@ -1601,8 +1607,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1601,8 +1607,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (ceph_snap(inode) != CEPH_NOSNAP) if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS; return -EROFS;
__ceph_do_pending_vmtruncate(inode);
err = inode_change_ok(inode, attr); err = inode_change_ok(inode, attr);
if (err != 0) if (err != 0)
return err; return err;
...@@ -1783,6 +1787,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1783,6 +1787,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
ceph_cap_string(dirtied), mask); ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
if (mask & CEPH_SETATTR_SIZE)
__ceph_do_pending_vmtruncate(inode); __ceph_do_pending_vmtruncate(inode);
return err; return err;
out: out:
......
...@@ -288,6 +288,7 @@ struct ceph_inode_info { ...@@ -288,6 +288,7 @@ struct ceph_inode_info {
int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
struct mutex i_truncate_mutex;
u32 i_truncate_seq; /* last truncate to smaller size */ u32 i_truncate_seq; /* last truncate to smaller size */
u64 i_truncate_size; /* and the size we last truncated down to */ u64 i_truncate_size; /* and the size we last truncated down to */
int i_truncate_pending; /* still need to call vmtruncate */ int i_truncate_pending; /* still need to call vmtruncate */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment