Commit 7e027ca4 authored by Coly Li, committed by Jens Axboe

bcache: add stop_when_cache_set_failed option to backing device

When there are too many I/O errors on the cache device, the current
bcache code retires the whole cache set and detaches all bcache
devices. But the detached bcache devices are not stopped, which is
problematic when bcache is in writeback mode.

If the retired cache set holds dirty data for its backing devices,
writes that keep arriving at the bcache device go directly to the
backing device. If the LBA of such a write has a dirty version cached
on the cache device, then the next time the cache device is
re-registered and the backing device is re-attached to it, the stale
dirty data on the cache device is written back to the backing device
and overwrites the newer, directly written data. This causes serious
data corruption.
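
A toy sketch (plain C, not bcache code and not part of this patch)
that makes the ordering concrete: a stale dirty copy survives the
cache failure, a newer write goes straight to the backing device, and
a later writeback of the stale copy clobbers it.

  #include <stdio.h>
  #include <string.h>

  struct lba {
          char data[16];
          int dirty;
  };

  int main(void)
  {
          struct lba cache = { "old-dirty", 1 };  /* left on cache dev */
          struct lba backing = { "origin", 0 };

          /* cache set retired, bcache device kept: write goes direct */
          strcpy(backing.data, "new-direct");

          /* cache re-registered, backing re-attached: writeback replays
           * the stale dirty entry and overwrites the newer data */
          if (cache.dirty)
                  strcpy(backing.data, cache.data);

          printf("backing holds \"%s\" (expected \"new-direct\")\n",
                 backing.data);
          return 0;
  }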

But we cannot simply stop all attached bcache devices when the cache
set is broken or disconnected. Consider, for example, using bcache to
accelerate an email service. In such a workload, if the cache device
breaks but no dirty data is lost, keeping the bcache device alive and
letting the email service continue to access user data may be a
better response to the cache device failure.

Nix <nix@esperi.org.uk> pointed out the issue and provided the above
example to explain why it can be necessary not to stop the bcache
device when the cache device is broken. Pavel Goran
<via-bcache@pvgoran.name> made the excellent suggestion to provide
"always" and "auto" options via the per-cached-device sysfs file
stop_when_cache_set_failed. If the cache set is retiring and the
backing device has no dirty data on the cache, it should be safe to
keep the bcache device alive. In this case, if
stop_when_cache_set_failed is set to "auto", the failure handling code
will not stop this bcache device, and applications can keep accessing
the backing device through the now unattached bcache device.
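
As a usage illustration (not part of this patch), the option can be
set from user space by writing one of the strings to the sysfs file.
The sketch below assumes the attribute appears under
/sys/block/bcache0/bcache/, the usual location of per-cached-device
attributes; the device name bcache0 is only an example.

  #include <stdio.h>

  int main(void)
  {
          /* path and device name are assumptions for illustration */
          const char *path =
                  "/sys/block/bcache0/bcache/stop_when_cache_set_failed";
          FILE *f = fopen(path, "w");

          if (!f) {
                  perror("fopen");
                  return 1;
          }
          fputs("always", f);     /* or "auto", the default */
          return fclose(f) ? 1 : 0;
  }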

Changelog:
[mlyle: edited to not break string constants across lines]
v3: fix typos pointed out by Nix.
v2: change option values of stop_when_cache_set_failed from 1/0 to
    "auto"/"always".
v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
    (always stop).
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Cc: Nix <nix@esperi.org.uk>
Cc: Pavel Goran <via-bcache@pvgoran.name>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 771f393e
drivers/md/bcache/bcache.h

@@ -288,6 +288,12 @@ struct io {
 	sector_t		last;
 };
 
+enum stop_on_failure {
+	BCH_CACHED_DEV_STOP_AUTO = 0,
+	BCH_CACHED_DEV_STOP_ALWAYS,
+	BCH_CACHED_DEV_STOP_MODE_MAX,
+};
+
 struct cached_dev {
 	struct list_head	list;
 	struct bcache_device	disk;

@@ -380,6 +386,8 @@ struct cached_dev {
 	unsigned		writeback_rate_i_term_inverse;
 	unsigned		writeback_rate_p_term_inverse;
 	unsigned		writeback_rate_minimum;
+
+	enum stop_on_failure	stop_when_cache_set_failed;
 };
 
 enum alloc_reserve {

@@ -939,6 +947,7 @@ void bch_write_bdev_super(struct cached_dev *, struct closure *);
 extern struct workqueue_struct *bcache_wq;
 extern const char * const bch_cache_modes[];
+extern const char * const bch_stop_on_failure_modes[];
 extern struct mutex bch_register_lock;
 extern struct list_head bch_cache_sets;

drivers/md/bcache/super.c

@@ -47,6 +47,14 @@ const char * const bch_cache_modes[] = {
 	NULL
 };
 
+/* Default is -1; we skip past it for stop_when_cache_set_failed */
+const char * const bch_stop_on_failure_modes[] = {
+	"default",
+	"auto",
+	"always",
+	NULL
+};
+
 static struct kobject *bcache_kobj;
 struct mutex bch_register_lock;
 LIST_HEAD(bch_cache_sets);

@@ -1188,6 +1196,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
 			max(dc->disk.disk->queue->backing_dev_info->ra_pages,
 			    q->backing_dev_info->ra_pages);
 
+	/* default to auto */
+	dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
+
 	bch_cached_dev_request_init(dc);
 	bch_cached_dev_writeback_init(dc);
 	return 0;

@@ -1464,23 +1475,70 @@ static void cache_set_flush(struct closure *cl)
 	closure_return(cl);
 }
 
+/*
+ * This function is only called when CACHE_SET_IO_DISABLE is set, which means
+ * cache set is unregistering due to too many I/O errors. In this condition,
+ * the bcache device might be stopped, it depends on stop_when_cache_set_failed
+ * value and whether the broken cache has dirty data:
+ *
+ * dc->stop_when_cache_set_failed	dc->has_dirty	stop bcache device
+ *  BCH_CACHED_STOP_AUTO		 0		NO
+ *  BCH_CACHED_STOP_AUTO		 1		YES
+ *  BCH_CACHED_DEV_STOP_ALWAYS		 0		YES
+ *  BCH_CACHED_DEV_STOP_ALWAYS		 1		YES
+ *
+ * The expected behavior is, if stop_when_cache_set_failed is configured to
+ * "auto" via sysfs interface, the bcache device will not be stopped if the
+ * backing device is clean on the broken cache device.
+ */
+static void conditional_stop_bcache_device(struct cache_set *c,
+					   struct bcache_device *d,
+					   struct cached_dev *dc)
+{
+	if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
+		pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
+			d->disk->disk_name, c->sb.set_uuid);
+		bcache_device_stop(d);
+	} else if (atomic_read(&dc->has_dirty)) {
+		/*
+		 * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
+		 * and dc->has_dirty == 1
+		 */
+		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
+			d->disk->disk_name);
+		bcache_device_stop(d);
+	} else {
+		/*
+		 * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
+		 * and dc->has_dirty == 0
+		 */
+		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
+			d->disk->disk_name);
+	}
+}
+
 static void __cache_set_unregister(struct closure *cl)
 {
 	struct cache_set *c = container_of(cl, struct cache_set, caching);
 	struct cached_dev *dc;
+	struct bcache_device *d;
 	size_t i;
 
 	mutex_lock(&bch_register_lock);
 
-	for (i = 0; i < c->devices_max_used; i++)
-		if (c->devices[i]) {
-			if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
-			    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
-				dc = container_of(c->devices[i],
-						  struct cached_dev, disk);
-				bch_cached_dev_detach(dc);
-			} else {
-				bcache_device_stop(c->devices[i]);
-			}
-		}
+	for (i = 0; i < c->devices_max_used; i++) {
+		d = c->devices[i];
+		if (!d)
+			continue;
+
+		if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
+		    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+			dc = container_of(d, struct cached_dev, disk);
+			bch_cached_dev_detach(dc);
+			if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
+				conditional_stop_bcache_device(c, d, dc);
+		} else {
+			bcache_device_stop(d);
+		}
+	}

drivers/md/bcache/sysfs.c

@@ -78,6 +78,7 @@ rw_attribute(congested_write_threshold_us);
 rw_attribute(sequential_cutoff);
 rw_attribute(data_csum);
 rw_attribute(cache_mode);
+rw_attribute(stop_when_cache_set_failed);
 rw_attribute(writeback_metadata);
 rw_attribute(writeback_running);
 rw_attribute(writeback_percent);

@@ -126,6 +127,12 @@ SHOW(__bch_cached_dev)
 					       bch_cache_modes + 1,
 					       BDEV_CACHE_MODE(&dc->sb));
 
+	if (attr == &sysfs_stop_when_cache_set_failed)
+		return bch_snprint_string_list(buf, PAGE_SIZE,
+					       bch_stop_on_failure_modes + 1,
+					       dc->stop_when_cache_set_failed);
+
 	sysfs_printf(data_csum,	"%i", dc->disk.data_csum);
 	var_printf(verify,	"%i");
 	var_printf(bypass_torture_test,	"%i");

@@ -247,6 +254,15 @@ STORE(__cached_dev)
 		}
 	}
 
+	if (attr == &sysfs_stop_when_cache_set_failed) {
+		v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1);
+		if (v < 0)
+			return v;
+
+		dc->stop_when_cache_set_failed = v;
+	}
+
 	if (attr == &sysfs_label) {
 		if (size > SB_LABEL_SIZE)
 			return -EINVAL;

@@ -326,6 +342,7 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_data_csum,
 #endif
 	&sysfs_cache_mode,
+	&sysfs_stop_when_cache_set_failed,
 	&sysfs_writeback_metadata,
 	&sysfs_writeback_running,
 	&sysfs_writeback_delay,
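
One detail worth spelling out: bch_stop_on_failure_modes[] keeps
"default" at index 0, and the SHOW/STORE hunks above pass the array
with "+ 1" so that "auto" maps to 0 (BCH_CACHED_DEV_STOP_AUTO) and
"always" maps to 1 (BCH_CACHED_DEV_STOP_ALWAYS), matching the enum.
The stand-alone sketch below mimics that index arithmetic; lookup() is
a local stand-in for the kernel's bch_read_string_list() helper, not
the real implementation.

  #include <stdio.h>
  #include <string.h>

  static const char * const bch_stop_on_failure_modes[] = {
          "default",
          "auto",
          "always",
          NULL
  };

  /* stand-in for bch_read_string_list(): index of s in list, or -1 */
  static int lookup(const char * const list[], const char *s)
  {
          int i;

          for (i = 0; list[i]; i++)
                  if (!strcmp(list[i], s))
                          return i;
          return -1;
  }

  int main(void)
  {
          /* same "+ 1" offset as the sysfs SHOW/STORE code above */
          printf("auto   -> %d\n",
                 lookup(bch_stop_on_failure_modes + 1, "auto"));
          printf("always -> %d\n",
                 lookup(bch_stop_on_failure_modes + 1, "always"));
          return 0;
  }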