Commit 297e3d85 authored by Shaohua Li's avatar Shaohua Li Committed by Jens Axboe

blk-throttle: make throtl_slice tunable

throtl_slice is important for blk-throttling. It's called slice
internally but it really is a time window blk-throttling samples data.
blk-throttling will make decisions based on the samplings. An example is
bandwidth measurement. A cgroup's bandwidth is measured in the time
interval of throtl_slice.

A small throtl_slice means cgroups have smoother throughput but burn
more CPUs. It has 100ms default value, which is not appropriate for all
disks. A fast SSD can dispatch a lot of IOs in 100ms. This patch makes
it tunable.

Since throtl_slice isn't a time slice, the sysfs name
'throttle_sample_time' reflects its character better.
Signed-off-by: default avatarShaohua Li <shli@fb.com>
Signed-off-by: default avatarJens Axboe <axboe@fb.com>
parent 06cceedc
...@@ -192,5 +192,11 @@ scaling back writes. Writing a value of '0' to this file disables the ...@@ -192,5 +192,11 @@ scaling back writes. Writing a value of '0' to this file disables the
feature. Writing a value of '-1' to this file resets the value to the feature. Writing a value of '-1' to this file resets the value to the
default setting. default setting.
throttle_sample_time (RW)
-------------------------
This is the time window that blk-throttle samples data, in milliseconds.
blk-throttle makes decisions based on the samplings. A lower time means cgroups
have smoother throughput, but higher CPU overhead. This exists only when
CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
Jens Axboe <jens.axboe@oracle.com>, February 2009 Jens Axboe <jens.axboe@oracle.com>, February 2009
...@@ -677,6 +677,14 @@ static struct queue_sysfs_entry queue_wb_lat_entry = { ...@@ -677,6 +677,14 @@ static struct queue_sysfs_entry queue_wb_lat_entry = {
.store = queue_wb_lat_store, .store = queue_wb_lat_store,
}; };
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
/*
 * sysfs attribute exposing the blk-throttle sampling window as
 * /sys/block/<dev>/queue/throttle_sample_time (read-write; world-readable,
 * owner-writable).  Only built when CONFIG_BLK_DEV_THROTTLING_LOW is set,
 * matching the #ifdef around the entry in default_attrs[] below.
 */
static struct queue_sysfs_entry throtl_sample_time_entry = {
	.attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR },
	.show = blk_throtl_sample_time_show,
	.store = blk_throtl_sample_time_store,
};
#endif
static struct attribute *default_attrs[] = { static struct attribute *default_attrs[] = {
&queue_requests_entry.attr, &queue_requests_entry.attr,
&queue_ra_entry.attr, &queue_ra_entry.attr,
...@@ -710,6 +718,9 @@ static struct attribute *default_attrs[] = { ...@@ -710,6 +718,9 @@ static struct attribute *default_attrs[] = {
&queue_dax_entry.attr, &queue_dax_entry.attr,
&queue_wb_lat_entry.attr, &queue_wb_lat_entry.attr,
&queue_poll_delay_entry.attr, &queue_poll_delay_entry.attr,
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
&throtl_sample_time_entry.attr,
#endif
NULL, NULL,
}; };
......
...@@ -19,7 +19,8 @@ static int throtl_grp_quantum = 8; ...@@ -19,7 +19,8 @@ static int throtl_grp_quantum = 8;
static int throtl_quantum = 32; static int throtl_quantum = 32;
/* Throttling is performed over 100ms slice and after that slice is renewed */ /* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10; /* 100 ms */ #define DFL_THROTL_SLICE (HZ / 10)
#define MAX_THROTL_SLICE (HZ)
static struct blkcg_policy blkcg_policy_throtl; static struct blkcg_policy blkcg_policy_throtl;
...@@ -162,6 +163,8 @@ struct throtl_data ...@@ -162,6 +163,8 @@ struct throtl_data
/* Total Number of queued bios on READ and WRITE lists */ /* Total Number of queued bios on READ and WRITE lists */
unsigned int nr_queued[2]; unsigned int nr_queued[2];
unsigned int throtl_slice;
/* Work for dispatching throttled bios */ /* Work for dispatching throttled bios */
struct work_struct dispatch_work; struct work_struct dispatch_work;
unsigned int limit_index; unsigned int limit_index;
...@@ -590,7 +593,7 @@ static void throtl_dequeue_tg(struct throtl_grp *tg) ...@@ -590,7 +593,7 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
unsigned long expires) unsigned long expires)
{ {
unsigned long max_expire = jiffies + 8 * throtl_slice; unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;
/* /*
* Since we are adjusting the throttle limit dynamically, the sleep * Since we are adjusting the throttle limit dynamically, the sleep
...@@ -658,7 +661,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, ...@@ -658,7 +661,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
if (time_after_eq(start, tg->slice_start[rw])) if (time_after_eq(start, tg->slice_start[rw]))
tg->slice_start[rw] = start; tg->slice_start[rw] = start;
tg->slice_end[rw] = jiffies + throtl_slice; tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
throtl_log(&tg->service_queue, throtl_log(&tg->service_queue,
"[%c] new slice with credit start=%lu end=%lu jiffies=%lu", "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw], rw == READ ? 'R' : 'W', tg->slice_start[rw],
...@@ -670,7 +673,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) ...@@ -670,7 +673,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
tg->bytes_disp[rw] = 0; tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0; tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies; tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + throtl_slice; tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
throtl_log(&tg->service_queue, throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu", "[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw], rw == READ ? 'R' : 'W', tg->slice_start[rw],
...@@ -680,13 +683,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) ...@@ -680,13 +683,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end) unsigned long jiffy_end)
{ {
tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
} }
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end) unsigned long jiffy_end)
{ {
tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
throtl_log(&tg->service_queue, throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu", "[%c] extend slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw], rw == READ ? 'R' : 'W', tg->slice_start[rw],
...@@ -726,19 +729,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) ...@@ -726,19 +729,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
* is bad because it does not allow new slice to start. * is bad because it does not allow new slice to start.
*/ */
throtl_set_slice_end(tg, rw, jiffies + throtl_slice); throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
time_elapsed = jiffies - tg->slice_start[rw]; time_elapsed = jiffies - tg->slice_start[rw];
nr_slices = time_elapsed / throtl_slice; nr_slices = time_elapsed / tg->td->throtl_slice;
if (!nr_slices) if (!nr_slices)
return; return;
tmp = tg_bps_limit(tg, rw) * throtl_slice * nr_slices; tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
do_div(tmp, HZ); do_div(tmp, HZ);
bytes_trim = tmp; bytes_trim = tmp;
io_trim = (tg_iops_limit(tg, rw) * throtl_slice * nr_slices) / HZ; io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
HZ;
if (!bytes_trim && !io_trim) if (!bytes_trim && !io_trim)
return; return;
...@@ -753,7 +757,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) ...@@ -753,7 +757,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
else else
tg->io_disp[rw] = 0; tg->io_disp[rw] = 0;
tg->slice_start[rw] += nr_slices * throtl_slice; tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
throtl_log(&tg->service_queue, throtl_log(&tg->service_queue,
"[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
...@@ -773,9 +777,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, ...@@ -773,9 +777,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
/* Slice has just started. Consider one slice interval */ /* Slice has just started. Consider one slice interval */
if (!jiffy_elapsed) if (!jiffy_elapsed)
jiffy_elapsed_rnd = throtl_slice; jiffy_elapsed_rnd = tg->td->throtl_slice;
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
/* /*
* jiffy_elapsed_rnd should not be a big value as minimum iops can be * jiffy_elapsed_rnd should not be a big value as minimum iops can be
...@@ -822,9 +826,9 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, ...@@ -822,9 +826,9 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* Slice has just started. Consider one slice interval */ /* Slice has just started. Consider one slice interval */
if (!jiffy_elapsed) if (!jiffy_elapsed)
jiffy_elapsed_rnd = throtl_slice; jiffy_elapsed_rnd = tg->td->throtl_slice;
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd; tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
do_div(tmp, HZ); do_div(tmp, HZ);
...@@ -890,8 +894,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, ...@@ -890,8 +894,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
throtl_start_new_slice(tg, rw); throtl_start_new_slice(tg, rw);
else { else {
if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) if (time_before(tg->slice_end[rw],
throtl_extend_slice(tg, rw, jiffies + throtl_slice); jiffies + tg->td->throtl_slice))
throtl_extend_slice(tg, rw,
jiffies + tg->td->throtl_slice);
} }
if (tg_with_in_bps_limit(tg, bio, &bps_wait) && if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
...@@ -1632,7 +1638,7 @@ static bool throtl_can_upgrade(struct throtl_data *td, ...@@ -1632,7 +1638,7 @@ static bool throtl_can_upgrade(struct throtl_data *td,
if (td->limit_index != LIMIT_LOW) if (td->limit_index != LIMIT_LOW)
return false; return false;
if (time_before(jiffies, td->low_downgrade_time + throtl_slice)) if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
return false; return false;
rcu_read_lock(); rcu_read_lock();
...@@ -1689,8 +1695,9 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg) ...@@ -1689,8 +1695,9 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
* If cgroup is below low limit, consider downgrade and throttle other * If cgroup is below low limit, consider downgrade and throttle other
* cgroups * cgroups
*/ */
if (time_after_eq(now, td->low_upgrade_time + throtl_slice) && if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
time_after_eq(now, tg_last_low_overflow_time(tg) + throtl_slice)) time_after_eq(now, tg_last_low_overflow_time(tg) +
td->throtl_slice))
return true; return true;
return false; return false;
} }
...@@ -1719,13 +1726,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg) ...@@ -1719,13 +1726,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
return; return;
if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
return; return;
if (time_after(tg->last_check_time + throtl_slice, now)) if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
return; return;
elapsed_time = now - tg->last_check_time; elapsed_time = now - tg->last_check_time;
tg->last_check_time = now; tg->last_check_time = now;
if (time_before(now, tg_last_low_overflow_time(tg) + throtl_slice)) if (time_before(now, tg_last_low_overflow_time(tg) +
tg->td->throtl_slice))
return; return;
if (tg->bps[READ][LIMIT_LOW]) { if (tg->bps[READ][LIMIT_LOW]) {
...@@ -1953,6 +1961,7 @@ int blk_throtl_init(struct request_queue *q) ...@@ -1953,6 +1961,7 @@ int blk_throtl_init(struct request_queue *q)
q->td = td; q->td = td;
td->queue = q; td->queue = q;
td->throtl_slice = DFL_THROTL_SLICE;
td->limit_valid[LIMIT_MAX] = true; td->limit_valid[LIMIT_MAX] = true;
td->limit_index = LIMIT_MAX; td->limit_index = LIMIT_MAX;
...@@ -1973,6 +1982,32 @@ void blk_throtl_exit(struct request_queue *q) ...@@ -1973,6 +1982,32 @@ void blk_throtl_exit(struct request_queue *q)
kfree(q->td); kfree(q->td);
} }
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
{
if (!q->td)
return -EINVAL;
return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
}
/*
 * Update the throttle sampling window from sysfs input.  The value is parsed
 * as a decimal millisecond count, converted to jiffies, and must land in
 * (0, MAX_THROTL_SLICE]; anything else — including a queue without throttle
 * data — yields -EINVAL.  On success the full input length is consumed.
 */
ssize_t blk_throtl_sample_time_store(struct request_queue *q,
	const char *page, size_t count)
{
	unsigned long msecs;
	unsigned long slice;

	if (q->td == NULL)
		return -EINVAL;
	if (kstrtoul(page, 10, &msecs) != 0)
		return -EINVAL;

	/* Reject windows that round down to 0 jiffies or exceed the cap. */
	slice = msecs_to_jiffies(msecs);
	if (slice == 0 || slice > MAX_THROTL_SLICE)
		return -EINVAL;

	q->td->throtl_slice = slice;
	return count;
}
#endif
static int __init throtl_init(void) static int __init throtl_init(void)
{ {
kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
......
...@@ -324,5 +324,10 @@ static inline void blk_throtl_drain(struct request_queue *q) { } ...@@ -324,5 +324,10 @@ static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_exit(struct request_queue *q) { }
#endif /* CONFIG_BLK_DEV_THROTTLING */ #endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
const char *page, size_t count);
#endif
#endif /* BLK_INTERNAL_H */ #endif /* BLK_INTERNAL_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment