Commit e63890f3 authored by Ryan Ding, committed by Linus Torvalds

ocfs2: fix ip_unaligned_aio deadlock with dio work queue

In the current implementation of unaligned aio+dio, the lock order behaves
as follows:

in user process context:
  -> call io_submit()
    -> get i_mutex
		<== window1
      -> get ip_unaligned_aio
        -> submit direct io to block device
    -> release i_mutex
  -> io_submit() return

in dio work queue context(the work queue is created in __blockdev_direct_IO):
  -> release ip_unaligned_aio
		<== window2
    -> get i_mutex
      -> clear unwritten flag & change i_size
    -> release i_mutex

The number of dio work queue threads is limited (256 by default).  If all
256 threads are in the 'window2' stage above while a user process is in
the 'window1' stage, the system will deadlock: the user process holds
i_mutex and waits for the ip_unaligned_aio lock, while the
ip_unaligned_aio mutex is held by a direct bio that is waiting for a dio
work queue thread to be scheduled.  But all of the dio work queue threads
are waiting for i_mutex in 'window2'.

This case has only been seen in a test that sends a large number (more
than 256) of aios in a single io_submit() call.

My design is to remove the ip_unaligned_aio lock and turn an unaligned aio
into a sync io instead.  Just as the ip_unaligned_aio lock did, this
serializes unaligned aio dio.

[akpm@linux-foundation.org: remove OCFS2_IOCB_UNALIGNED_IO, per Junxiao Bi]
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent f1f973ff
...@@ -2391,12 +2391,6 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, ...@@ -2391,12 +2391,6 @@ static int ocfs2_dio_end_io(struct kiocb *iocb,
/* this io's submitter should not have unlocked this before we could */ /* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
if (ocfs2_iocb_is_unaligned_aio(iocb)) {
ocfs2_iocb_clear_unaligned_aio(iocb);
mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
if (private) if (private)
ocfs2_dio_end_io_write(inode, private, offset, bytes); ocfs2_dio_end_io_write(inode, private, offset, bytes);
......
...@@ -84,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) ...@@ -84,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
enum ocfs2_iocb_lock_bits { enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0, OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL, OCFS2_IOCB_RW_LOCK_LEVEL,
OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS OCFS2_IOCB_NUM_LOCKS
}; };
...@@ -93,11 +92,4 @@ enum ocfs2_iocb_lock_bits { ...@@ -93,11 +92,4 @@ enum ocfs2_iocb_lock_bits {
#define ocfs2_iocb_rw_locked_level(iocb) \ #define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
#define ocfs2_iocb_set_unaligned_aio(iocb) \
set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
#define ocfs2_iocb_clear_unaligned_aio(iocb) \
clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
#define ocfs2_iocb_is_unaligned_aio(iocb) \
test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
#endif /* OCFS2_FILE_H */ #endif /* OCFS2_FILE_H */
...@@ -2178,7 +2178,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, ...@@ -2178,7 +2178,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt & int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED); OCFS2_MOUNT_COHERENCY_BUFFERED);
int unaligned_dio = 0; void *saved_ki_complete = NULL;
int append_write = ((iocb->ki_pos + count) >= int append_write = ((iocb->ki_pos + count) >=
i_size_read(inode) ? 1 : 0); i_size_read(inode) ? 1 : 0);
...@@ -2241,17 +2241,12 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, ...@@ -2241,17 +2241,12 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
goto out; goto out;
} }
if (direct_io && !is_sync_kiocb(iocb)) if (direct_io && !is_sync_kiocb(iocb) &&
unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
if (unaligned_dio) {
/* /*
* Wait on previous unaligned aio to complete before * Make it a sync io if it's an unaligned aio.
* proceeding.
*/ */
mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); saved_ki_complete = xchg(&iocb->ki_complete, NULL);
/* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
ocfs2_iocb_set_unaligned_aio(iocb);
} }
/* communicate with ocfs2_dio_end_io */ /* communicate with ocfs2_dio_end_io */
...@@ -2272,11 +2267,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, ...@@ -2272,11 +2267,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
*/ */
if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1; rw_level = -1;
unaligned_dio = 0;
} }
if (unlikely(written <= 0)) if (unlikely(written <= 0))
goto no_sync; goto out;
if (((file->f_flags & O_DSYNC) && !direct_io) || if (((file->f_flags & O_DSYNC) && !direct_io) ||
IS_SYNC(inode)) { IS_SYNC(inode)) {
...@@ -2298,13 +2292,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, ...@@ -2298,13 +2292,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
iocb->ki_pos - 1); iocb->ki_pos - 1);
} }
no_sync:
if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
ocfs2_iocb_clear_unaligned_aio(iocb);
mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
out: out:
if (saved_ki_complete)
xchg(&iocb->ki_complete, saved_ki_complete);
if (rw_level != -1) if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level); ocfs2_rw_unlock(inode, rw_level);
......
...@@ -43,9 +43,6 @@ struct ocfs2_inode_info ...@@ -43,9 +43,6 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */ /* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem; struct rw_semaphore ip_xattr_sem;
/* Number of outstanding AIO's which are not page aligned */
struct mutex ip_unaligned_aio;
/* These fields are protected by ip_lock */ /* These fields are protected by ip_lock */
spinlock_t ip_lock; spinlock_t ip_lock;
u32 ip_open_count; u32 ip_open_count;
......
...@@ -1747,7 +1747,6 @@ static void ocfs2_inode_init_once(void *data) ...@@ -1747,7 +1747,6 @@ static void ocfs2_inode_init_once(void *data)
INIT_LIST_HEAD(&oi->ip_io_markers); INIT_LIST_HEAD(&oi->ip_io_markers);
INIT_LIST_HEAD(&oi->ip_unwritten_list); INIT_LIST_HEAD(&oi->ip_unwritten_list);
oi->ip_dir_start_lookup = 0; oi->ip_dir_start_lookup = 0;
mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem); init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex); mutex_init(&oi->ip_io_mutex);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment