Commit c2a4f318 authored by Kent Overstreet, committed by Linus Torvalds

bcache: Fix a writeback performance regression

Background writeback works by scanning the btree for dirty data and
adding those keys to a fixed-size buffer (the keybuf), then writing
each dirty key in the keybuf to the backing device.

When read_dirty() finishes and it's time to scan for more dirty data, we
need to wait for the outstanding writeback IO to finish - those writes
still take up slots in the keybuf (so that foreground writes can check
for them to avoid races). Without that wait, we continually rescan when
we'll be able to add at most a key or two to the keybuf, and that
rescanning takes locks that starve foreground IO.  Doh.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Cc: linux-stable <stable@vger.kernel.org> # >= v3.10
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 61cbd250
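
The fix replaces the atomic in-flight counter and closure waitlist with a
counting semaphore initialized to 64: read_dirty() does down() before issuing
each writeback bio, and the completion path does up(), so the scanning thread
blocks once 64 IOs are outstanding and only refills the keybuf after they
drain. A minimal sketch of that throttling pattern, with hypothetical helpers
(throttle_init(), submit_one_io(), io_complete()) standing in for the patch's
closure machinery:

#include <linux/semaphore.h>

static struct semaphore in_flight;

static void throttle_init(void)
{
        /* Allow at most 64 writeback IOs outstanding at once */
        sema_init(&in_flight, 64);
}

/* Scanner side: blocks here once 64 IOs are in flight */
static void submit_one_io(void)
{
        down(&in_flight);
        /* ... build and submit the bio; its completion calls io_complete() ... */
}

/* Completion side: frees a slot, unblocking the scanner */
static void io_complete(void)
{
        up(&in_flight);
}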
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -498,7 +498,7 @@ struct cached_dev {
 	 */
 	atomic_t		has_dirty;
 
-	struct ratelimit	writeback_rate;
+	struct bch_ratelimit	writeback_rate;
 	struct delayed_work	writeback_rate_update;
 
 	/*
@@ -507,10 +507,9 @@ struct cached_dev {
 	 */
 	sector_t		last_read;
 
-	/* Number of writeback bios in flight */
-	atomic_t		in_flight;
+	/* Limit number of writeback bios in flight */
+	struct semaphore	in_flight;
 	struct closure_with_timer writeback;
-	struct closure_waitlist	writeback_wait;
 
 	struct keybuf		writeback_keys;
...
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
 	stats->last = now ?: 1;
 }
 
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 	uint64_t now = local_clock();
...
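
The page collapses the rest of the function body. Going by the kernel-doc
above and the field comments added to struct bch_ratelimit in util.h below, a
token-bucket style body consistent with this interface might look like the
following sketch (an illustration, not the patch's actual code; it assumes
d->rate is nonzero):

uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
        uint64_t now = local_clock();

        /* Push out the time at which the work just done was "due" */
        d->next += div_u64(done, d->rate);

        /* Ahead of schedule: no delay; behind: sleep until d->next */
        return time_after64(now, d->next)
                ? 0
                : nsecs_to_jiffies(d->next - now);
}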
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -450,17 +450,23 @@ read_attribute(name ## _last_ ## frequency_units)
 	(ewma) >> factor;						\
 })
 
-struct ratelimit {
+struct bch_ratelimit {
+	/* Next time we want to do some work, in nanoseconds */
 	uint64_t		next;
+
+	/*
+	 * Rate at which we want to do work, in units per nanosecond
+	 * The units here correspond to the units passed to bch_next_delay()
+	 */
 	unsigned		rate;
 };
 
-static inline void ratelimit_reset(struct ratelimit *d)
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
 {
 	d->next = local_clock();
 }
 
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
 
 #define __DIV_SAFE(n, d, zero)						\
 ({									\
...
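
For illustration, a caller paces itself by feeding completed work into
bch_next_delay() and sleeping for the returned jiffies, resetting the
ratelimit when a new scan starts - essentially what writeback_delay() and
read_dirty() do below. The helpers have_work() and do_unit_of_work() here are
hypothetical:

static void paced_loop(struct bch_ratelimit *d)
{
        bch_ratelimit_reset(d);

        while (have_work()) {                      /* hypothetical */
                unsigned done = do_unit_of_work(); /* hypothetical */
                uint64_t delay = bch_next_delay(d, done);

                if (delay)
                        schedule_timeout_interruptible(delay);
        }
}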
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -94,11 +94,15 @@ static void update_writeback_rate(struct work_struct *work)
 
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
+	uint64_t ret;
+
 	if (atomic_read(&dc->disk.detaching) ||
 	    !dc->writeback_percent)
 		return 0;
 
-	return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+	ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+
+	return min_t(uint64_t, ret, HZ);
 }
 
 /* Background writeback */
@@ -208,7 +212,7 @@ static void refill_dirty(struct closure *cl)
 
 	up_write(&dc->writeback_lock);
 
-	ratelimit_reset(&dc->writeback_rate);
+	bch_ratelimit_reset(&dc->writeback_rate);
 
 	/* Punt to workqueue only so we don't recurse and blow the stack */
 	continue_at(cl, read_dirty, dirty_wq);
@@ -318,9 +322,7 @@ static void write_dirty_finish(struct closure *cl)
 	}
 
 	bch_keybuf_del(&dc->writeback_keys, w);
-
-	atomic_dec_bug(&dc->in_flight);
-	closure_wake_up(&dc->writeback_wait);
+	up(&dc->in_flight);
 
 	closure_return_with_destructor(cl, dirty_io_destructor);
 }
@@ -349,7 +351,7 @@ static void write_dirty(struct closure *cl)
 
 	closure_bio_submit(&io->bio, cl, &io->dc->disk);
 
-	continue_at(cl, write_dirty_finish, dirty_wq);
+	continue_at(cl, write_dirty_finish, system_wq);
 }
 
 static void read_dirty_endio(struct bio *bio, int error)
@@ -369,7 +371,7 @@ static void read_dirty_submit(struct closure *cl)
 
 	closure_bio_submit(&io->bio, cl, &io->dc->disk);
 
-	continue_at(cl, write_dirty, dirty_wq);
+	continue_at(cl, write_dirty, system_wq);
 }
 
 static void read_dirty(struct closure *cl)
@@ -394,12 +396,9 @@ static void read_dirty(struct closure *cl)
 
 		if (delay > 0 &&
 		    (KEY_START(&w->key) != dc->last_read ||
-		     jiffies_to_msecs(delay) > 50)) {
-			w->private = NULL;
-
-			closure_delay(&dc->writeback, delay);
-			continue_at(cl, read_dirty, dirty_wq);
-		}
+		     jiffies_to_msecs(delay) > 50))
+			while (delay)
+				delay = schedule_timeout(delay);
 
 		dc->last_read = KEY_OFFSET(&w->key);
@@ -424,15 +423,10 @@ static void read_dirty(struct closure *cl)
 
 		trace_bcache_writeback(&w->key);
 
-		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+		down(&dc->in_flight);
+		closure_call(&io->cl, read_dirty_submit, NULL, cl);
 
 		delay = writeback_delay(dc, KEY_SIZE(&w->key));
-
-		atomic_inc(&dc->in_flight);
-
-		if (!closure_wait_event(&dc->writeback_wait, cl,
-					atomic_read(&dc->in_flight) < 64))
-			continue_at(cl, read_dirty, dirty_wq);
 	}
 
 	if (0) {
@@ -442,7 +436,11 @@ static void read_dirty(struct closure *cl)
 		bch_keybuf_del(&dc->writeback_keys, w);
 	}
 
-	refill_dirty(cl);
+	/*
+	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+	 * freed) before refilling again
+	 */
+	continue_at(cl, refill_dirty, dirty_wq);
 }
 
 /* Init */
@@ -484,6 +482,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
+	sema_init(&dc->in_flight, 64);
 	closure_init_unlocked(&dc->writeback);
 	init_rwsem(&dc->writeback_lock);
@@ -513,7 +512,7 @@ void bch_writeback_exit(void)
 
 int __init bch_writeback_init(void)
 {
-	dirty_wq = create_singlethread_workqueue("bcache_writeback");
+	dirty_wq = create_workqueue("bcache_writeback");
 	if (!dirty_wq)
 		return -ENOMEM;
...