Commit 36a52481 authored by Tejun Heo's avatar Tejun Heo Committed by Jens Axboe

blk-iocost: Account force-charged overage in absolute vtime

Currently, when a bio needs to be force-charged and there isn't enough
budget, vtime is simply pushed into the future.  This means that the
cost of the whole bio is scaled using the current hweight and then
charged immediately.  Until the global vtime advances beyond this
future vtime, the cgroup won't be allowed to issue normal IOs.

This is incorrect and can lead to, for example, exploding vrate or
extended stalls if vrate range is constrained.  Consider the following
scenario.

1. A cgroup with a very low hweight runs out of budget.

2. A storm of swap-out happens on it.  All of them are scaled
   according to the current low hweight and charged to vtime pushing
   it to a far future.

3. All other cgroups go idle and now the above cgroup has access to
   the whole device.  However, because vtime is already wound using
   the past low hweight, what its current hweight is doesn't matter
   until global vtime catches up to the local vtime.

4. As a result, either vrate gets ramped up extremely or the IOs stall
   while the underlying device is idle.

This is because the hweight the overage is calculated at is different
from the hweight that it's being paid at.

Fix it by remembering the overage in absoulte vtime and continuously
paying with the actual budget according to the current hweight at each
period.

Note that non-forced bios which wait already remembers the cost in
absolute vtime.  This brings forced-bio accounting in line.
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent e036c4ca
...@@ -469,6 +469,7 @@ struct ioc_gq { ...@@ -469,6 +469,7 @@ struct ioc_gq {
*/ */
atomic64_t vtime; atomic64_t vtime;
atomic64_t done_vtime; atomic64_t done_vtime;
atomic64_t abs_vdebt;
u64 last_vtime; u64 last_vtime;
/* /*
...@@ -653,13 +654,21 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) ...@@ -653,13 +654,21 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
/* /*
* Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
* weight, the more expensive each IO. * weight, the more expensive each IO. Must round up.
*/ */
static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
{ {
return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
} }
/*
* The inverse of abs_cost_to_cost(). Must round up.
*/
static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
{
return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
}
static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
{ {
bio->bi_iocost_cost = cost; bio->bi_iocost_cost = cost;
...@@ -1132,16 +1141,36 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) ...@@ -1132,16 +1141,36 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
struct iocg_wake_ctx ctx = { .iocg = iocg }; struct iocg_wake_ctx ctx = { .iocg = iocg };
u64 margin_ns = (u64)(ioc->period_us * u64 margin_ns = (u64)(ioc->period_us *
WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
u64 vshortage, expires, oexpires; u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
s64 vbudget;
u32 hw_inuse;
lockdep_assert_held(&iocg->waitq.lock); lockdep_assert_held(&iocg->waitq.lock);
current_hweight(iocg, NULL, &hw_inuse);
vbudget = now->vnow - atomic64_read(&iocg->vtime);
/* pay off debt */
abs_vdebt = atomic64_read(&iocg->abs_vdebt);
vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
if (vdebt && vbudget > 0) {
u64 delta = min_t(u64, vbudget, vdebt);
u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
abs_vdebt);
atomic64_add(delta, &iocg->vtime);
atomic64_add(delta, &iocg->done_vtime);
atomic64_sub(abs_delta, &iocg->abs_vdebt);
if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
atomic64_set(&iocg->abs_vdebt, 0);
}
/* /*
* Wake up the ones which are due and see how much vtime we'll need * Wake up the ones which are due and see how much vtime we'll need
* for the next one. * for the next one.
*/ */
current_hweight(iocg, NULL, &ctx.hw_inuse); ctx.hw_inuse = hw_inuse;
ctx.vbudget = now->vnow - atomic64_read(&iocg->vtime); ctx.vbudget = vbudget - vdebt;
__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
if (!waitqueue_active(&iocg->waitq)) if (!waitqueue_active(&iocg->waitq))
return; return;
...@@ -1187,6 +1216,11 @@ static void iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) ...@@ -1187,6 +1216,11 @@ static void iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
u64 vmargin = ioc->margin_us * now->vrate; u64 vmargin = ioc->margin_us * now->vrate;
u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
u64 expires, oexpires; u64 expires, oexpires;
u32 hw_inuse;
/* debt-adjust vtime */
current_hweight(iocg, NULL, &hw_inuse);
vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
/* clear or maintain depending on the overage */ /* clear or maintain depending on the overage */
if (time_before_eq64(vtime, now->vnow)) { if (time_before_eq64(vtime, now->vnow)) {
...@@ -1332,12 +1366,14 @@ static void ioc_timer_fn(struct timer_list *timer) ...@@ -1332,12 +1366,14 @@ static void ioc_timer_fn(struct timer_list *timer)
* should have woken up in the last period and expire idle iocgs. * should have woken up in the last period and expire idle iocgs.
*/ */
list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
if (!waitqueue_active(&iocg->waitq) && !iocg_is_idle(iocg)) if (!waitqueue_active(&iocg->waitq) &&
!atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
continue; continue;
spin_lock(&iocg->waitq.lock); spin_lock(&iocg->waitq.lock);
if (waitqueue_active(&iocg->waitq)) { if (waitqueue_active(&iocg->waitq) ||
atomic64_read(&iocg->abs_vdebt)) {
/* might be oversleeping vtime / hweight changes, kick */ /* might be oversleeping vtime / hweight changes, kick */
iocg_kick_waitq(iocg, &now); iocg_kick_waitq(iocg, &now);
iocg_kick_delay(iocg, &now, 0); iocg_kick_delay(iocg, &now, 0);
...@@ -1673,13 +1709,24 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) ...@@ -1673,13 +1709,24 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
* in a while which is fine. * in a while which is fine.
*/ */
if (!waitqueue_active(&iocg->waitq) && if (!waitqueue_active(&iocg->waitq) &&
!atomic64_read(&iocg->abs_vdebt) &&
time_before_eq64(vtime + cost, now.vnow)) { time_before_eq64(vtime + cost, now.vnow)) {
iocg_commit_bio(iocg, bio, cost); iocg_commit_bio(iocg, bio, cost);
return; return;
} }
/*
* We're over budget. If @bio has to be issued regardless,
* remember the abs_cost instead of advancing vtime.
* iocg_kick_waitq() will pay off the debt before waking more IOs.
* This way, the debt is continuously paid off each period with the
* actual budget available to the cgroup. If we just wound vtime,
* we would incorrectly use the current hw_inuse for the entire
* amount which, for example, can lead to the cgroup staying
* blocked for a long time even with substantially raised hw_inuse.
*/
if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
iocg_commit_bio(iocg, bio, cost); atomic64_add(abs_cost, &iocg->abs_vdebt);
iocg_kick_delay(iocg, &now, cost); iocg_kick_delay(iocg, &now, cost);
return; return;
} }
...@@ -1928,6 +1975,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) ...@@ -1928,6 +1975,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
iocg->ioc = ioc; iocg->ioc = ioc;
atomic64_set(&iocg->vtime, now.vnow); atomic64_set(&iocg->vtime, now.vnow);
atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->done_vtime, now.vnow);
atomic64_set(&iocg->abs_vdebt, 0);
atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
INIT_LIST_HEAD(&iocg->active_list); INIT_LIST_HEAD(&iocg->active_list);
iocg->hweight_active = HWEIGHT_WHOLE; iocg->hweight_active = HWEIGHT_WHOLE;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment