Commit 6d482bc5 authored by Liu Bo's avatar Liu Bo Committed by Greg Kroah-Hartman

blk-iolatency: fix IO hang due to negative inflight counter

[ Upstream commit 8c772a9b ]

Our test reported the following stack, and vmcore showed that
->inflight counter is -1.

[ffffc9003fcc38d0] __schedule at ffffffff8173d95d
[ffffc9003fcc3958] schedule at ffffffff8173de26
[ffffc9003fcc3970] io_schedule at ffffffff810bb6b6
[ffffc9003fcc3988] blkcg_iolatency_throttle at ffffffff813911cb
[ffffc9003fcc3a20] rq_qos_throttle at ffffffff813847f3
[ffffc9003fcc3a48] blk_mq_make_request at ffffffff8137468a
[ffffc9003fcc3b08] generic_make_request at ffffffff81368b49
[ffffc9003fcc3b68] submit_bio at ffffffff81368d7d
[ffffc9003fcc3bb8] ext4_io_submit at ffffffffa031be00 [ext4]
[ffffc9003fcc3c00] ext4_writepages at ffffffffa03163de [ext4]
[ffffc9003fcc3d68] do_writepages at ffffffff811c49ae
[ffffc9003fcc3d78] __filemap_fdatawrite_range at ffffffff811b6188
[ffffc9003fcc3e30] filemap_write_and_wait_range at ffffffff811b6301
[ffffc9003fcc3e60] ext4_sync_file at ffffffffa030cee8 [ext4]
[ffffc9003fcc3ea8] vfs_fsync_range at ffffffff8128594b
[ffffc9003fcc3ee8] do_fsync at ffffffff81285abd
[ffffc9003fcc3f18] sys_fsync at ffffffff81285d50
[ffffc9003fcc3f28] do_syscall_64 at ffffffff81003c04
[ffffc9003fcc3f50] entry_SYSCALL_64_after_swapgs at ffffffff81742b8e

The ->inflight counter may be negative (-1) if

1) blk-iolatency was disabled when the IO was issued,

2) blk-iolatency was enabled before this IO reached its endio,

3) the ->inflight counter is decreased from 0 to -1 in endio()

In fact the hang can be easily reproduced by the below script,

H=/sys/fs/cgroup/unified/
P=/sys/fs/cgroup/unified/test

echo "+io" > $H/cgroup.subtree_control
mkdir -p $P

echo $$ > $P/cgroup.procs

xfs_io -f -d -c "pwrite 0 4k" /dev/sdg

echo "`cat /sys/block/sdg/dev` target=1000000" > $P/io.latency

xfs_io -f -d -c "pwrite 0 4k" /dev/sdg

This fixes the problem by freezing the queue so that while
enabling/disabling iolatency, there is no inflight rq running.

Note that quiesce_queue is not needed as this only updating iolatency
configuration about which dispatching request_queue doesn't care.
Signed-off-by: default avatarLiu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
Signed-off-by: default avatarSasha Levin <sashal@kernel.org>
parent 1781ae6f
...@@ -72,6 +72,7 @@ ...@@ -72,6 +72,7 @@
#include <linux/sched/loadavg.h> #include <linux/sched/loadavg.h>
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#include <trace/events/block.h> #include <trace/events/block.h>
#include <linux/blk-mq.h>
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
#include "blk-stat.h" #include "blk-stat.h"
...@@ -568,6 +569,9 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) ...@@ -568,6 +569,9 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
return; return;
enabled = blk_iolatency_enabled(iolat->blkiolat); enabled = blk_iolatency_enabled(iolat->blkiolat);
if (!enabled)
return;
while (blkg && blkg->parent) { while (blkg && blkg->parent) {
iolat = blkg_to_lat(blkg); iolat = blkg_to_lat(blkg);
if (!iolat) { if (!iolat) {
...@@ -577,7 +581,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) ...@@ -577,7 +581,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
rqw = &iolat->rq_wait; rqw = &iolat->rq_wait;
atomic_dec(&rqw->inflight); atomic_dec(&rqw->inflight);
if (!enabled || iolat->min_lat_nsec == 0) if (iolat->min_lat_nsec == 0)
goto next; goto next;
iolatency_record_time(iolat, &bio->bi_issue, now, iolatency_record_time(iolat, &bio->bi_issue, now,
issue_as_root); issue_as_root);
...@@ -721,10 +725,13 @@ int blk_iolatency_init(struct request_queue *q) ...@@ -721,10 +725,13 @@ int blk_iolatency_init(struct request_queue *q)
return 0; return 0;
} }
static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) /*
* return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise
* return 0.
*/
static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{ {
struct iolatency_grp *iolat = blkg_to_lat(blkg); struct iolatency_grp *iolat = blkg_to_lat(blkg);
struct blk_iolatency *blkiolat = iolat->blkiolat;
u64 oldval = iolat->min_lat_nsec; u64 oldval = iolat->min_lat_nsec;
iolat->min_lat_nsec = val; iolat->min_lat_nsec = val;
...@@ -733,9 +740,10 @@ static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) ...@@ -733,9 +740,10 @@ static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
BLKIOLATENCY_MAX_WIN_SIZE); BLKIOLATENCY_MAX_WIN_SIZE);
if (!oldval && val) if (!oldval && val)
atomic_inc(&blkiolat->enabled); return 1;
if (oldval && !val) if (oldval && !val)
atomic_dec(&blkiolat->enabled); return -1;
return 0;
} }
static void iolatency_clear_scaling(struct blkcg_gq *blkg) static void iolatency_clear_scaling(struct blkcg_gq *blkg)
...@@ -768,6 +776,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, ...@@ -768,6 +776,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
u64 lat_val = 0; u64 lat_val = 0;
u64 oldval; u64 oldval;
int ret; int ret;
int enable = 0;
ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
if (ret) if (ret)
...@@ -803,7 +812,12 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, ...@@ -803,7 +812,12 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
blkg = ctx.blkg; blkg = ctx.blkg;
oldval = iolat->min_lat_nsec; oldval = iolat->min_lat_nsec;
iolatency_set_min_lat_nsec(blkg, lat_val); enable = iolatency_set_min_lat_nsec(blkg, lat_val);
if (enable) {
WARN_ON_ONCE(!blk_get_queue(blkg->q));
blkg_get(blkg);
}
if (oldval != iolat->min_lat_nsec) { if (oldval != iolat->min_lat_nsec) {
iolatency_clear_scaling(blkg); iolatency_clear_scaling(blkg);
} }
...@@ -811,6 +825,24 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, ...@@ -811,6 +825,24 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
ret = 0; ret = 0;
out: out:
blkg_conf_finish(&ctx); blkg_conf_finish(&ctx);
if (ret == 0 && enable) {
struct iolatency_grp *tmp = blkg_to_lat(blkg);
struct blk_iolatency *blkiolat = tmp->blkiolat;
blk_mq_freeze_queue(blkg->q);
if (enable == 1)
atomic_inc(&blkiolat->enabled);
else if (enable == -1)
atomic_dec(&blkiolat->enabled);
else
WARN_ON_ONCE(1);
blk_mq_unfreeze_queue(blkg->q);
blkg_put(blkg);
blk_put_queue(blkg->q);
}
return ret ?: nbytes; return ret ?: nbytes;
} }
...@@ -910,8 +942,14 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd) ...@@ -910,8 +942,14 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd)
{ {
struct iolatency_grp *iolat = pd_to_lat(pd); struct iolatency_grp *iolat = pd_to_lat(pd);
struct blkcg_gq *blkg = lat_to_blkg(iolat); struct blkcg_gq *blkg = lat_to_blkg(iolat);
struct blk_iolatency *blkiolat = iolat->blkiolat;
int ret;
iolatency_set_min_lat_nsec(blkg, 0); ret = iolatency_set_min_lat_nsec(blkg, 0);
if (ret == 1)
atomic_inc(&blkiolat->enabled);
if (ret == -1)
atomic_dec(&blkiolat->enabled);
iolatency_clear_scaling(blkg); iolatency_clear_scaling(blkg);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment