Commit e84987a1 authored by Jens Axboe

Merge branch 'bcache-for-3.15' of git://evilpiepirate.org/~kent/linux-bcache into for-3.15/drivers

Kent writes:

Jens, here's the bcache changes for 3.15. Lots of bugfixes, and some
refactoring and cleanups.
parents 5eb9291c cb851149
...@@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG ...@@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG
Keeps all active closures in a linked list and provides a debugfs Keeps all active closures in a linked list and provides a debugfs
interface to list them, which makes it possible to see asynchronous interface to list them, which makes it possible to see asynchronous
operations that get stuck. operations that get stuck.
# cgroup code needs to be updated:
#
#config CGROUP_BCACHE
# bool "Cgroup controls for bcache"
# depends on BCACHE && BLK_CGROUP
# ---help---
# TODO
...@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) ...@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
if (CACHE_SYNC(&ca->set->sb)) {
ca->need_save_prio = max(ca->need_save_prio,
bucket_disk_gen(b));
WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
}
return ret; return ret;
} }
...@@ -120,51 +114,45 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) ...@@ -120,51 +114,45 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
mutex_unlock(&c->bucket_lock); mutex_unlock(&c->bucket_lock);
} }
/* Allocation */ /*
* Background allocation thread: scans for buckets to be invalidated,
* invalidates them, rewrites prios/gens (marking them as invalidated on disk),
* then optionally issues discard commands to the newly free buckets, then puts
* them on the various freelists.
*/
static inline bool can_inc_bucket_gen(struct bucket *b) static inline bool can_inc_bucket_gen(struct bucket *b)
{ {
return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
} }
bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
{ {
BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); BUG_ON(!ca->set->gc_mark_valid);
if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
unsigned i;
for (i = 0; i < RESERVE_NONE; i++)
if (!fifo_full(&ca->free[i]))
goto add;
return false;
}
add:
b->prio = 0;
if (can_inc_bucket_gen(b) &&
fifo_push(&ca->unused, b - ca->buckets)) {
atomic_inc(&b->pin);
return true;
}
return false; return (!GC_MARK(b) ||
} GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
!atomic_read(&b->pin) && !atomic_read(&b->pin) &&
can_inc_bucket_gen(b); can_inc_bucket_gen(b);
} }
static void invalidate_one_bucket(struct cache *ca, struct bucket *b) void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
{ {
lockdep_assert_held(&ca->set->bucket_lock);
BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
if (GC_SECTORS_USED(b))
trace_bcache_invalidate(ca, b - ca->buckets);
bch_inc_gen(ca, b); bch_inc_gen(ca, b);
b->prio = INITIAL_PRIO; b->prio = INITIAL_PRIO;
atomic_inc(&b->pin); atomic_inc(&b->pin);
}
static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
__bch_invalidate_one_bucket(ca, b);
fifo_push(&ca->free_inc, b - ca->buckets); fifo_push(&ca->free_inc, b - ca->buckets);
} }
...@@ -195,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca) ...@@ -195,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca)
ca->heap.used = 0; ca->heap.used = 0;
for_each_bucket(b, ca) { for_each_bucket(b, ca) {
/* if (!bch_can_invalidate_bucket(ca, b))
* If we fill up the unused list, if we then return before
* adding anything to the free_inc list we'll skip writing
* prios/gens and just go back to allocating from the unused
* list:
*/
if (fifo_full(&ca->unused))
return;
if (!can_invalidate_bucket(ca, b))
continue;
if (!GC_SECTORS_USED(b) &&
bch_bucket_add_unused(ca, b))
continue; continue;
if (!heap_full(&ca->heap)) if (!heap_full(&ca->heap))
...@@ -233,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca) ...@@ -233,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca)
return; return;
} }
invalidate_one_bucket(ca, b); bch_invalidate_one_bucket(ca, b);
} }
} }
...@@ -249,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca) ...@@ -249,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca)
b = ca->buckets + ca->fifo_last_bucket++; b = ca->buckets + ca->fifo_last_bucket++;
if (can_invalidate_bucket(ca, b)) if (bch_can_invalidate_bucket(ca, b))
invalidate_one_bucket(ca, b); bch_invalidate_one_bucket(ca, b);
if (++checked >= ca->sb.nbuckets) { if (++checked >= ca->sb.nbuckets) {
ca->invalidate_needs_gc = 1; ca->invalidate_needs_gc = 1;
...@@ -274,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca) ...@@ -274,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca)
b = ca->buckets + n; b = ca->buckets + n;
if (can_invalidate_bucket(ca, b)) if (bch_can_invalidate_bucket(ca, b))
invalidate_one_bucket(ca, b); bch_invalidate_one_bucket(ca, b);
if (++checked >= ca->sb.nbuckets / 2) { if (++checked >= ca->sb.nbuckets / 2) {
ca->invalidate_needs_gc = 1; ca->invalidate_needs_gc = 1;
...@@ -287,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca) ...@@ -287,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca)
static void invalidate_buckets(struct cache *ca) static void invalidate_buckets(struct cache *ca)
{ {
if (ca->invalidate_needs_gc) BUG_ON(ca->invalidate_needs_gc);
return;
switch (CACHE_REPLACEMENT(&ca->sb)) { switch (CACHE_REPLACEMENT(&ca->sb)) {
case CACHE_REPLACEMENT_LRU: case CACHE_REPLACEMENT_LRU:
...@@ -301,8 +275,6 @@ static void invalidate_buckets(struct cache *ca) ...@@ -301,8 +275,6 @@ static void invalidate_buckets(struct cache *ca)
invalidate_buckets_random(ca); invalidate_buckets_random(ca);
break; break;
} }
trace_bcache_alloc_invalidate(ca);
} }
#define allocator_wait(ca, cond) \ #define allocator_wait(ca, cond) \
...@@ -350,17 +322,10 @@ static int bch_allocator_thread(void *arg) ...@@ -350,17 +322,10 @@ static int bch_allocator_thread(void *arg)
* possibly issue discards to them, then we add the bucket to * possibly issue discards to them, then we add the bucket to
* the free list: * the free list:
*/ */
while (1) { while (!fifo_empty(&ca->free_inc)) {
long bucket; long bucket;
if ((!atomic_read(&ca->set->prio_blocked) ||
!CACHE_SYNC(&ca->set->sb)) &&
!fifo_empty(&ca->unused))
fifo_pop(&ca->unused, bucket);
else if (!fifo_empty(&ca->free_inc))
fifo_pop(&ca->free_inc, bucket); fifo_pop(&ca->free_inc, bucket);
else
break;
if (ca->discard) { if (ca->discard) {
mutex_unlock(&ca->set->bucket_lock); mutex_unlock(&ca->set->bucket_lock);
...@@ -371,6 +336,7 @@ static int bch_allocator_thread(void *arg) ...@@ -371,6 +336,7 @@ static int bch_allocator_thread(void *arg)
} }
allocator_wait(ca, bch_allocator_push(ca, bucket)); allocator_wait(ca, bch_allocator_push(ca, bucket));
wake_up(&ca->set->btree_cache_wait);
wake_up(&ca->set->bucket_wait); wake_up(&ca->set->bucket_wait);
} }
...@@ -380,9 +346,9 @@ static int bch_allocator_thread(void *arg) ...@@ -380,9 +346,9 @@ static int bch_allocator_thread(void *arg)
* them to the free_inc list: * them to the free_inc list:
*/ */
retry_invalidate:
allocator_wait(ca, ca->set->gc_mark_valid && allocator_wait(ca, ca->set->gc_mark_valid &&
(ca->need_save_prio > 64 || !ca->invalidate_needs_gc);
!ca->invalidate_needs_gc));
invalidate_buckets(ca); invalidate_buckets(ca);
/* /*
...@@ -390,13 +356,28 @@ static int bch_allocator_thread(void *arg) ...@@ -390,13 +356,28 @@ static int bch_allocator_thread(void *arg)
* new stuff to them: * new stuff to them:
*/ */
allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
if (CACHE_SYNC(&ca->set->sb) && if (CACHE_SYNC(&ca->set->sb)) {
(!fifo_empty(&ca->free_inc) || /*
ca->need_save_prio > 64)) * This could deadlock if an allocation with a btree
* node locked ever blocked - having the btree node
* locked would block garbage collection, but here we're
* waiting on garbage collection before we invalidate
* and free anything.
*
* But this should be safe since the btree code always
* uses btree_check_reserve() before allocating now, and
* if it fails it blocks without btree nodes locked.
*/
if (!fifo_full(&ca->free_inc))
goto retry_invalidate;
bch_prio_write(ca); bch_prio_write(ca);
} }
}
} }
/* Allocation */
long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
{ {
DEFINE_WAIT(w); DEFINE_WAIT(w);
...@@ -408,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) ...@@ -408,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
fifo_pop(&ca->free[reserve], r)) fifo_pop(&ca->free[reserve], r))
goto out; goto out;
if (!wait) if (!wait) {
trace_bcache_alloc_fail(ca, reserve);
return -1; return -1;
}
do { do {
prepare_to_wait(&ca->set->bucket_wait, &w, prepare_to_wait(&ca->set->bucket_wait, &w,
...@@ -425,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) ...@@ -425,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
out: out:
wake_up_process(ca->alloc_thread); wake_up_process(ca->alloc_thread);
trace_bcache_alloc(ca, reserve);
if (expensive_debug_checks(ca->set)) { if (expensive_debug_checks(ca->set)) {
size_t iter; size_t iter;
long i; long i;
...@@ -438,8 +423,6 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) ...@@ -438,8 +423,6 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
BUG_ON(i == r); BUG_ON(i == r);
fifo_for_each(i, &ca->free_inc, iter) fifo_for_each(i, &ca->free_inc, iter)
BUG_ON(i == r); BUG_ON(i == r);
fifo_for_each(i, &ca->unused, iter)
BUG_ON(i == r);
} }
b = ca->buckets + r; b = ca->buckets + r;
...@@ -461,17 +444,19 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) ...@@ -461,17 +444,19 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
return r; return r;
} }
void __bch_bucket_free(struct cache *ca, struct bucket *b)
{
SET_GC_MARK(b, 0);
SET_GC_SECTORS_USED(b, 0);
}
void bch_bucket_free(struct cache_set *c, struct bkey *k) void bch_bucket_free(struct cache_set *c, struct bkey *k)
{ {
unsigned i; unsigned i;
for (i = 0; i < KEY_PTRS(k); i++) { for (i = 0; i < KEY_PTRS(k); i++)
struct bucket *b = PTR_BUCKET(c, k, i); __bch_bucket_free(PTR_CACHE(c, k, i),
PTR_BUCKET(c, k, i));
SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
SET_GC_SECTORS_USED(b, 0);
bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
}
} }
int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
...@@ -709,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca) ...@@ -709,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca)
ca->alloc_thread = k; ca->alloc_thread = k;
return 0; return 0;
} }
int bch_cache_allocator_init(struct cache *ca)
{
/*
* Reserve:
* Prio/gen writes first
* Then 8 for btree allocations
* Then half for the moving garbage collector
*/
#if 0
ca->watermark[WATERMARK_PRIO] = 0;
ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
ca->watermark[WATERMARK_MOVINGGC] = 8 +
ca->watermark[WATERMARK_METADATA];
ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
ca->watermark[WATERMARK_MOVINGGC];
#endif
return 0;
}
...@@ -195,9 +195,7 @@ struct bucket { ...@@ -195,9 +195,7 @@ struct bucket {
atomic_t pin; atomic_t pin;
uint16_t prio; uint16_t prio;
uint8_t gen; uint8_t gen;
uint8_t disk_gen;
uint8_t last_gc; /* Most out of date gen in the btree */ uint8_t last_gc; /* Most out of date gen in the btree */
uint8_t gc_gen;
uint16_t gc_mark; /* Bitfield used by GC. See below for field */ uint16_t gc_mark; /* Bitfield used by GC. See below for field */
}; };
...@@ -207,9 +205,9 @@ struct bucket { ...@@ -207,9 +205,9 @@ struct bucket {
*/ */
BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
#define GC_MARK_RECLAIMABLE 0 #define GC_MARK_RECLAIMABLE 1
#define GC_MARK_DIRTY 1 #define GC_MARK_DIRTY 2
#define GC_MARK_METADATA 2 #define GC_MARK_METADATA 3
#define GC_SECTORS_USED_SIZE 13 #define GC_SECTORS_USED_SIZE 13
#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) #define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE))
BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
...@@ -426,14 +424,9 @@ struct cache { ...@@ -426,14 +424,9 @@ struct cache {
* their new gen to disk. After prio_write() finishes writing the new * their new gen to disk. After prio_write() finishes writing the new
* gens/prios, they'll be moved to the free list (and possibly discarded * gens/prios, they'll be moved to the free list (and possibly discarded
* in the process) * in the process)
*
* unused: GC found nothing pointing into these buckets (possibly
* because all the data they contained was overwritten), so we only
* need to discard them before they can be moved to the free list.
*/ */
DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc); DECLARE_FIFO(long, free_inc);
DECLARE_FIFO(long, unused);
size_t fifo_last_bucket; size_t fifo_last_bucket;
...@@ -442,12 +435,6 @@ struct cache { ...@@ -442,12 +435,6 @@ struct cache {
DECLARE_HEAP(struct bucket *, heap); DECLARE_HEAP(struct bucket *, heap);
/*
* max(gen - disk_gen) for all buckets. When it gets too big we have to
* call prio_write() to keep gens from wrapping.
*/
uint8_t need_save_prio;
/* /*
* If nonzero, we know we aren't going to find any buckets to invalidate * If nonzero, we know we aren't going to find any buckets to invalidate
* until a gc finishes - otherwise we could pointlessly burn a ton of * until a gc finishes - otherwise we could pointlessly burn a ton of
...@@ -562,19 +549,16 @@ struct cache_set { ...@@ -562,19 +549,16 @@ struct cache_set {
struct list_head btree_cache_freed; struct list_head btree_cache_freed;
/* Number of elements in btree_cache + btree_cache_freeable lists */ /* Number of elements in btree_cache + btree_cache_freeable lists */
unsigned bucket_cache_used; unsigned btree_cache_used;
/* /*
* If we need to allocate memory for a new btree node and that * If we need to allocate memory for a new btree node and that
* allocation fails, we can cannibalize another node in the btree cache * allocation fails, we can cannibalize another node in the btree cache
* to satisfy the allocation. However, only one thread can be doing this * to satisfy the allocation - lock to guarantee only one thread does
* at a time, for obvious reasons - try_harder and try_wait are * this at a time:
* basically a lock for this that we can wait on asynchronously. The
* btree_root() macro releases the lock when it returns.
*/ */
struct task_struct *try_harder; wait_queue_head_t btree_cache_wait;
wait_queue_head_t try_wait; struct task_struct *btree_cache_alloc_lock;
uint64_t try_harder_start;
/* /*
* When we free a btree node, we increment the gen of the bucket the * When we free a btree node, we increment the gen of the bucket the
...@@ -603,7 +587,7 @@ struct cache_set { ...@@ -603,7 +587,7 @@ struct cache_set {
uint16_t min_prio; uint16_t min_prio;
/* /*
* max(gen - gc_gen) for all buckets. When it gets too big we have to gc * max(gen - last_gc) for all buckets. When it gets too big we have to gc
* to keep gens from wrapping around. * to keep gens from wrapping around.
*/ */
uint8_t need_gc; uint8_t need_gc;
...@@ -628,6 +612,8 @@ struct cache_set { ...@@ -628,6 +612,8 @@ struct cache_set {
/* Number of moving GC bios in flight */ /* Number of moving GC bios in flight */
struct semaphore moving_in_flight; struct semaphore moving_in_flight;
struct workqueue_struct *moving_gc_wq;
struct btree *root; struct btree *root;
#ifdef CONFIG_BCACHE_DEBUG #ifdef CONFIG_BCACHE_DEBUG
...@@ -667,7 +653,6 @@ struct cache_set { ...@@ -667,7 +653,6 @@ struct cache_set {
struct time_stats btree_gc_time; struct time_stats btree_gc_time;
struct time_stats btree_split_time; struct time_stats btree_split_time;
struct time_stats btree_read_time; struct time_stats btree_read_time;
struct time_stats try_harder_time;
atomic_long_t cache_read_races; atomic_long_t cache_read_races;
atomic_long_t writeback_keys_done; atomic_long_t writeback_keys_done;
...@@ -850,9 +835,6 @@ static inline bool cached_dev_get(struct cached_dev *dc) ...@@ -850,9 +835,6 @@ static inline bool cached_dev_get(struct cached_dev *dc)
/* /*
* bucket_gc_gen() returns the difference between the bucket's current gen and * bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree (last_gc). * the oldest gen of any pointer into that bucket in the btree (last_gc).
*
* bucket_disk_gen() returns the difference between the current gen and the gen
* on disk; they're both used to make sure gens don't wrap around.
*/ */
static inline uint8_t bucket_gc_gen(struct bucket *b) static inline uint8_t bucket_gc_gen(struct bucket *b)
...@@ -860,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) ...@@ -860,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
return b->gen - b->last_gc; return b->gen - b->last_gc;
} }
static inline uint8_t bucket_disk_gen(struct bucket *b)
{
return b->gen - b->disk_gen;
}
#define BUCKET_GC_GEN_MAX 96U #define BUCKET_GC_GEN_MAX 96U
#define BUCKET_DISK_GEN_MAX 64U
#define kobj_attribute_write(n, fn) \ #define kobj_attribute_write(n, fn) \
static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
...@@ -899,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); ...@@ -899,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
uint8_t bch_inc_gen(struct cache *, struct bucket *); uint8_t bch_inc_gen(struct cache *, struct bucket *);
void bch_rescale_priorities(struct cache_set *, int); void bch_rescale_priorities(struct cache_set *, int);
bool bch_bucket_add_unused(struct cache *, struct bucket *);
long bch_bucket_alloc(struct cache *, unsigned, bool); bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
void __bch_bucket_free(struct cache *, struct bucket *);
void bch_bucket_free(struct cache_set *, struct bkey *); void bch_bucket_free(struct cache_set *, struct bkey *);
long bch_bucket_alloc(struct cache *, unsigned, bool);
int __bch_bucket_alloc_set(struct cache_set *, unsigned, int __bch_bucket_alloc_set(struct cache_set *, unsigned,
struct bkey *, int, bool); struct bkey *, int, bool);
int bch_bucket_alloc_set(struct cache_set *, unsigned, int bch_bucket_alloc_set(struct cache_set *, unsigned,
...@@ -954,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *); ...@@ -954,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *);
void bch_open_buckets_free(struct cache_set *); void bch_open_buckets_free(struct cache_set *);
int bch_cache_allocator_start(struct cache *ca); int bch_cache_allocator_start(struct cache *ca);
int bch_cache_allocator_init(struct cache *ca);
void bch_debug_exit(void); void bch_debug_exit(void);
int bch_debug_init(struct kobject *); int bch_debug_init(struct kobject *);
void bch_request_exit(void); void bch_request_exit(void);
int bch_request_init(void); int bch_request_init(void);
void bch_btree_exit(void);
int bch_btree_init(void);
#endif /* _BCACHE_H */ #endif /* _BCACHE_H */
...@@ -23,8 +23,8 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) ...@@ -23,8 +23,8 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
for (k = i->start; k < bset_bkey_last(i); k = next) { for (k = i->start; k < bset_bkey_last(i); k = next) {
next = bkey_next(k); next = bkey_next(k);
printk(KERN_ERR "block %u key %li/%u: ", set, printk(KERN_ERR "block %u key %u/%u: ", set,
(uint64_t *) k - i->d, i->keys); (unsigned) ((u64 *) k - i->d), i->keys);
if (b->ops->key_dump) if (b->ops->key_dump)
b->ops->key_dump(b, k); b->ops->key_dump(b, k);
......
...@@ -478,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l) ...@@ -478,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l)
l->top_p = l->keys_p = l->inline_keys; l->top_p = l->keys_p = l->inline_keys;
} }
static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
{
l->keys = k;
l->top = bkey_next(k);
}
static inline void bch_keylist_push(struct keylist *l) static inline void bch_keylist_push(struct keylist *l)
{ {
l->top = bkey_next(l->top); l->top = bkey_next(l->top);
......
...@@ -68,15 +68,11 @@ ...@@ -68,15 +68,11 @@
* alloc_bucket() cannot fail. This should be true but is not completely * alloc_bucket() cannot fail. This should be true but is not completely
* obvious. * obvious.
* *
* Make sure all allocations get charged to the root cgroup
*
* Plugging? * Plugging?
* *
* If data write is less than hard sector size of ssd, round up offset in open * If data write is less than hard sector size of ssd, round up offset in open
* bucket to the next whole sector * bucket to the next whole sector
* *
* Also lookup by cgroup in get_open_bucket()
*
* Superblock needs to be fleshed out for multiple cache devices * Superblock needs to be fleshed out for multiple cache devices
* *
* Add a sysfs tunable for the number of writeback IOs in flight * Add a sysfs tunable for the number of writeback IOs in flight
...@@ -97,8 +93,6 @@ ...@@ -97,8 +93,6 @@
#define PTR_HASH(c, k) \ #define PTR_HASH(c, k) \
(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
static struct workqueue_struct *btree_io_wq;
#define insert_lock(s, b) ((b)->level <= (s)->lock) #define insert_lock(s, b) ((b)->level <= (s)->lock)
/* /*
...@@ -123,7 +117,7 @@ static struct workqueue_struct *btree_io_wq; ...@@ -123,7 +117,7 @@ static struct workqueue_struct *btree_io_wq;
({ \ ({ \
int _r, l = (b)->level - 1; \ int _r, l = (b)->level - 1; \
bool _w = l <= (op)->lock; \ bool _w = l <= (op)->lock; \
struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
if (!IS_ERR(_child)) { \ if (!IS_ERR(_child)) { \
_child->parent = (b); \ _child->parent = (b); \
_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
...@@ -152,17 +146,12 @@ static struct workqueue_struct *btree_io_wq; ...@@ -152,17 +146,12 @@ static struct workqueue_struct *btree_io_wq;
_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
} \ } \
rw_unlock(_w, _b); \ rw_unlock(_w, _b); \
bch_cannibalize_unlock(c); \
if (_r == -EINTR) \ if (_r == -EINTR) \
schedule(); \ schedule(); \
bch_cannibalize_unlock(c); \
if (_r == -ENOSPC) { \
wait_event((c)->try_wait, \
!(c)->try_harder); \
_r = -EINTR; \
} \
} while (_r == -EINTR); \ } while (_r == -EINTR); \
\ \
finish_wait(&(c)->bucket_wait, &(op)->wait); \ finish_wait(&(c)->btree_cache_wait, &(op)->wait); \
_r; \ _r; \
}) })
...@@ -171,6 +160,20 @@ static inline struct bset *write_block(struct btree *b) ...@@ -171,6 +160,20 @@ static inline struct bset *write_block(struct btree *b)
return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
} }
static void bch_btree_init_next(struct btree *b)
{
/* If not a leaf node, always sort */
if (b->level && b->keys.nsets)
bch_btree_sort(&b->keys, &b->c->sort);
else
bch_btree_sort_lazy(&b->keys, &b->c->sort);
if (b->written < btree_blocks(b))
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->sb));
}
/* Btree key manipulation */ /* Btree key manipulation */
void bkey_put(struct cache_set *c, struct bkey *k) void bkey_put(struct cache_set *c, struct bkey *k)
...@@ -352,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl) ...@@ -352,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl)
btree_complete_write(b, w); btree_complete_write(b, w);
if (btree_node_dirty(b)) if (btree_node_dirty(b))
queue_delayed_work(btree_io_wq, &b->work, schedule_delayed_work(&b->work, 30 * HZ);
msecs_to_jiffies(30000));
closure_return_with_destructor(cl, btree_node_write_unlock); closure_return_with_destructor(cl, btree_node_write_unlock);
} }
...@@ -442,10 +444,12 @@ static void do_btree_node_write(struct btree *b) ...@@ -442,10 +444,12 @@ static void do_btree_node_write(struct btree *b)
} }
} }
void bch_btree_node_write(struct btree *b, struct closure *parent) void __bch_btree_node_write(struct btree *b, struct closure *parent)
{ {
struct bset *i = btree_bset_last(b); struct bset *i = btree_bset_last(b);
lockdep_assert_held(&b->write_lock);
trace_bcache_btree_write(b); trace_bcache_btree_write(b);
BUG_ON(current->bio_list); BUG_ON(current->bio_list);
...@@ -469,23 +473,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) ...@@ -469,23 +473,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
b->written += set_blocks(i, block_bytes(b->c)); b->written += set_blocks(i, block_bytes(b->c));
}
/* If not a leaf node, always sort */ void bch_btree_node_write(struct btree *b, struct closure *parent)
if (b->level && b->keys.nsets) {
bch_btree_sort(&b->keys, &b->c->sort); unsigned nsets = b->keys.nsets;
else
bch_btree_sort_lazy(&b->keys, &b->c->sort); lockdep_assert_held(&b->lock);
__bch_btree_node_write(b, parent);
/* /*
* do verify if there was more than one set initially (i.e. we did a * do verify if there was more than one set initially (i.e. we did a
* sort) and we sorted down to a single set: * sort) and we sorted down to a single set:
*/ */
if (i != b->keys.set->data && !b->keys.nsets) if (nsets && !b->keys.nsets)
bch_btree_verify(b); bch_btree_verify(b);
if (b->written < btree_blocks(b)) bch_btree_init_next(b);
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->sb));
} }
static void bch_btree_node_write_sync(struct btree *b) static void bch_btree_node_write_sync(struct btree *b)
...@@ -493,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b) ...@@ -493,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b)
struct closure cl; struct closure cl;
closure_init_stack(&cl); closure_init_stack(&cl);
mutex_lock(&b->write_lock);
bch_btree_node_write(b, &cl); bch_btree_node_write(b, &cl);
mutex_unlock(&b->write_lock);
closure_sync(&cl); closure_sync(&cl);
} }
...@@ -501,11 +510,10 @@ static void btree_node_write_work(struct work_struct *w) ...@@ -501,11 +510,10 @@ static void btree_node_write_work(struct work_struct *w)
{ {
struct btree *b = container_of(to_delayed_work(w), struct btree, work); struct btree *b = container_of(to_delayed_work(w), struct btree, work);
rw_lock(true, b, b->level); mutex_lock(&b->write_lock);
if (btree_node_dirty(b)) if (btree_node_dirty(b))
bch_btree_node_write(b, NULL); __bch_btree_node_write(b, NULL);
rw_unlock(true, b); mutex_unlock(&b->write_lock);
} }
static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
...@@ -513,11 +521,13 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) ...@@ -513,11 +521,13 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
struct bset *i = btree_bset_last(b); struct bset *i = btree_bset_last(b);
struct btree_write *w = btree_current_write(b); struct btree_write *w = btree_current_write(b);
lockdep_assert_held(&b->write_lock);
BUG_ON(!b->written); BUG_ON(!b->written);
BUG_ON(!i->keys); BUG_ON(!i->keys);
if (!btree_node_dirty(b)) if (!btree_node_dirty(b))
queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); schedule_delayed_work(&b->work, 30 * HZ);
set_btree_node_dirty(b); set_btree_node_dirty(b);
...@@ -548,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) ...@@ -548,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
#define mca_reserve(c) (((c->root && c->root->level) \ #define mca_reserve(c) (((c->root && c->root->level) \
? c->root->level : 1) * 8 + 16) ? c->root->level : 1) * 8 + 16)
#define mca_can_free(c) \ #define mca_can_free(c) \
max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) max_t(int, 0, c->btree_cache_used - mca_reserve(c))
static void mca_data_free(struct btree *b) static void mca_data_free(struct btree *b)
{ {
...@@ -556,7 +566,7 @@ static void mca_data_free(struct btree *b) ...@@ -556,7 +566,7 @@ static void mca_data_free(struct btree *b)
bch_btree_keys_free(&b->keys); bch_btree_keys_free(&b->keys);
b->c->bucket_cache_used--; b->c->btree_cache_used--;
list_move(&b->list, &b->c->btree_cache_freed); list_move(&b->list, &b->c->btree_cache_freed);
} }
...@@ -581,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) ...@@ -581,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
ilog2(b->c->btree_pages), ilog2(b->c->btree_pages),
btree_order(k)), btree_order(k)),
gfp)) { gfp)) {
b->c->bucket_cache_used++; b->c->btree_cache_used++;
list_move(&b->list, &b->c->btree_cache); list_move(&b->list, &b->c->btree_cache);
} else { } else {
list_move(&b->list, &b->c->btree_cache_freed); list_move(&b->list, &b->c->btree_cache_freed);
...@@ -597,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, ...@@ -597,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
init_rwsem(&b->lock); init_rwsem(&b->lock);
lockdep_set_novalidate_class(&b->lock); lockdep_set_novalidate_class(&b->lock);
mutex_init(&b->write_lock);
lockdep_set_novalidate_class(&b->write_lock);
INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->list);
INIT_DELAYED_WORK(&b->work, btree_node_write_work); INIT_DELAYED_WORK(&b->work, btree_node_write_work);
b->c = c; b->c = c;
...@@ -630,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) ...@@ -630,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
up(&b->io_mutex); up(&b->io_mutex);
} }
mutex_lock(&b->write_lock);
if (btree_node_dirty(b)) if (btree_node_dirty(b))
bch_btree_node_write_sync(b); __bch_btree_node_write(b, &cl);
mutex_unlock(&b->write_lock);
closure_sync(&cl);
/* wait for any in flight btree write */ /* wait for any in flight btree write */
down(&b->io_mutex); down(&b->io_mutex);
...@@ -654,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, ...@@ -654,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
if (c->shrinker_disabled) if (c->shrinker_disabled)
return SHRINK_STOP; return SHRINK_STOP;
if (c->try_harder) if (c->btree_cache_alloc_lock)
return SHRINK_STOP; return SHRINK_STOP;
/* Return -1 if we can't do anything right now */ /* Return -1 if we can't do anything right now */
...@@ -686,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, ...@@ -686,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
} }
} }
for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
if (list_empty(&c->btree_cache)) if (list_empty(&c->btree_cache))
goto out; goto out;
...@@ -715,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, ...@@ -715,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
if (c->shrinker_disabled) if (c->shrinker_disabled)
return 0; return 0;
if (c->try_harder) if (c->btree_cache_alloc_lock)
return 0; return 0;
return mca_can_free(c) * c->btree_pages; return mca_can_free(c) * c->btree_pages;
...@@ -819,17 +835,30 @@ static struct btree *mca_find(struct cache_set *c, struct bkey *k) ...@@ -819,17 +835,30 @@ static struct btree *mca_find(struct cache_set *c, struct bkey *k)
return b; return b;
} }
static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{
struct task_struct *old;
old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
if (old && old != current) {
if (op)
prepare_to_wait(&c->btree_cache_wait, &op->wait,
TASK_UNINTERRUPTIBLE);
return -EINTR;
}
return 0;
}
static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
struct bkey *k)
{ {
struct btree *b; struct btree *b;
trace_bcache_btree_cache_cannibalize(c); trace_bcache_btree_cache_cannibalize(c);
if (!c->try_harder) { if (mca_cannibalize_lock(c, op))
c->try_harder = current; return ERR_PTR(-EINTR);
c->try_harder_start = local_clock();
} else if (c->try_harder != current)
return ERR_PTR(-ENOSPC);
list_for_each_entry_reverse(b, &c->btree_cache, list) list_for_each_entry_reverse(b, &c->btree_cache, list)
if (!mca_reap(b, btree_order(k), false)) if (!mca_reap(b, btree_order(k), false))
...@@ -839,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) ...@@ -839,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
if (!mca_reap(b, btree_order(k), true)) if (!mca_reap(b, btree_order(k), true))
return b; return b;
WARN(1, "btree cache cannibalize failed\n");
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
...@@ -850,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) ...@@ -850,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
*/ */
static void bch_cannibalize_unlock(struct cache_set *c) static void bch_cannibalize_unlock(struct cache_set *c)
{ {
if (c->try_harder == current) { if (c->btree_cache_alloc_lock == current) {
bch_time_stats_update(&c->try_harder_time, c->try_harder_start); c->btree_cache_alloc_lock = NULL;
c->try_harder = NULL; wake_up(&c->btree_cache_wait);
wake_up(&c->try_wait);
} }
} }
static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
struct bkey *k, int level)
{ {
struct btree *b; struct btree *b;
...@@ -920,7 +950,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) ...@@ -920,7 +950,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
if (b) if (b)
rw_unlock(true, b); rw_unlock(true, b);
b = mca_cannibalize(c, k); b = mca_cannibalize(c, op, k);
if (!IS_ERR(b)) if (!IS_ERR(b))
goto out; goto out;
...@@ -936,8 +966,8 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) ...@@ -936,8 +966,8 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
* The btree node will have either a read or a write lock held, depending on * The btree node will have either a read or a write lock held, depending on
* level and op->lock. * level and op->lock.
*/ */
struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
int level, bool write) struct bkey *k, int level, bool write)
{ {
int i = 0; int i = 0;
struct btree *b; struct btree *b;
...@@ -951,7 +981,7 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, ...@@ -951,7 +981,7 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
return ERR_PTR(-EAGAIN); return ERR_PTR(-EAGAIN);
mutex_lock(&c->bucket_lock); mutex_lock(&c->bucket_lock);
b = mca_alloc(c, k, level); b = mca_alloc(c, op, k, level);
mutex_unlock(&c->bucket_lock); mutex_unlock(&c->bucket_lock);
if (!b) if (!b)
...@@ -997,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) ...@@ -997,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
struct btree *b; struct btree *b;
mutex_lock(&c->bucket_lock); mutex_lock(&c->bucket_lock);
b = mca_alloc(c, k, level); b = mca_alloc(c, NULL, k, level);
mutex_unlock(&c->bucket_lock); mutex_unlock(&c->bucket_lock);
if (!IS_ERR_OR_NULL(b)) { if (!IS_ERR_OR_NULL(b)) {
...@@ -1010,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) ...@@ -1010,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
static void btree_node_free(struct btree *b) static void btree_node_free(struct btree *b)
{ {
unsigned i;
trace_bcache_btree_node_free(b); trace_bcache_btree_node_free(b);
BUG_ON(b == b->c->root); BUG_ON(b == b->c->root);
mutex_lock(&b->write_lock);
if (btree_node_dirty(b)) if (btree_node_dirty(b))
btree_complete_write(b, btree_current_write(b)); btree_complete_write(b, btree_current_write(b));
clear_bit(BTREE_NODE_dirty, &b->flags); clear_bit(BTREE_NODE_dirty, &b->flags);
mutex_unlock(&b->write_lock);
cancel_delayed_work(&b->work); cancel_delayed_work(&b->work);
mutex_lock(&b->c->bucket_lock); mutex_lock(&b->c->bucket_lock);
for (i = 0; i < KEY_PTRS(&b->key); i++) {
BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
PTR_BUCKET(b->c, &b->key, i));
}
bch_bucket_free(b->c, &b->key); bch_bucket_free(b->c, &b->key);
mca_bucket_free(b); mca_bucket_free(b);
mutex_unlock(&b->c->bucket_lock); mutex_unlock(&b->c->bucket_lock);
} }
struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
int level)
{ {
BKEY_PADDED(key) k; BKEY_PADDED(key) k;
struct btree *b = ERR_PTR(-EAGAIN); struct btree *b = ERR_PTR(-EAGAIN);
mutex_lock(&c->bucket_lock); mutex_lock(&c->bucket_lock);
retry: retry:
if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
goto err; goto err;
bkey_put(c, &k.key); bkey_put(c, &k.key);
SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
b = mca_alloc(c, &k.key, level); b = mca_alloc(c, op, &k.key, level);
if (IS_ERR(b)) if (IS_ERR(b))
goto err_free; goto err_free;
...@@ -1075,12 +1100,15 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) ...@@ -1075,12 +1100,15 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
return b; return b;
} }
static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) static struct btree *btree_node_alloc_replacement(struct btree *b,
struct btree_op *op)
{ {
struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
if (!IS_ERR_OR_NULL(n)) { if (!IS_ERR_OR_NULL(n)) {
mutex_lock(&n->write_lock);
bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
bkey_copy_key(&n->key, &b->key); bkey_copy_key(&n->key, &b->key);
mutex_unlock(&n->write_lock);
} }
return n; return n;
...@@ -1090,43 +1118,47 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) ...@@ -1090,43 +1118,47 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
{ {
unsigned i; unsigned i;
mutex_lock(&b->c->bucket_lock);
atomic_inc(&b->c->prio_blocked);
bkey_copy(k, &b->key); bkey_copy(k, &b->key);
bkey_copy_key(k, &ZERO_KEY); bkey_copy_key(k, &ZERO_KEY);
for (i = 0; i < KEY_PTRS(k); i++) { for (i = 0; i < KEY_PTRS(k); i++)
uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; SET_PTR_GEN(k, i,
bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
SET_PTR_GEN(k, i, g); PTR_BUCKET(b->c, &b->key, i)));
}
atomic_inc(&b->c->prio_blocked); mutex_unlock(&b->c->bucket_lock);
} }
static int btree_check_reserve(struct btree *b, struct btree_op *op) static int btree_check_reserve(struct btree *b, struct btree_op *op)
{ {
struct cache_set *c = b->c; struct cache_set *c = b->c;
struct cache *ca; struct cache *ca;
unsigned i, reserve = c->root->level * 2 + 1; unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
int ret = 0;
mutex_lock(&c->bucket_lock); mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i) for_each_cache(ca, c, i)
if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
if (op) if (op)
prepare_to_wait(&c->bucket_wait, &op->wait, prepare_to_wait(&c->btree_cache_wait, &op->wait,
TASK_UNINTERRUPTIBLE); TASK_UNINTERRUPTIBLE);
ret = -EINTR; mutex_unlock(&c->bucket_lock);
break; return -EINTR;
} }
mutex_unlock(&c->bucket_lock); mutex_unlock(&c->bucket_lock);
return ret;
return mca_cannibalize_lock(b->c, op);
} }
/* Garbage collection */ /* Garbage collection */
uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
struct bkey *k)
{ {
uint8_t stale = 0; uint8_t stale = 0;
unsigned i; unsigned i;
...@@ -1146,8 +1178,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) ...@@ -1146,8 +1178,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
g = PTR_BUCKET(c, k, i); g = PTR_BUCKET(c, k, i);
if (gen_after(g->gc_gen, PTR_GEN(k, i))) if (gen_after(g->last_gc, PTR_GEN(k, i)))
g->gc_gen = PTR_GEN(k, i); g->last_gc = PTR_GEN(k, i);
if (ptr_stale(c, k, i)) { if (ptr_stale(c, k, i)) {
stale = max(stale, ptr_stale(c, k, i)); stale = max(stale, ptr_stale(c, k, i));
...@@ -1163,6 +1195,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) ...@@ -1163,6 +1195,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
SET_GC_MARK(g, GC_MARK_METADATA); SET_GC_MARK(g, GC_MARK_METADATA);
else if (KEY_DIRTY(k)) else if (KEY_DIRTY(k))
SET_GC_MARK(g, GC_MARK_DIRTY); SET_GC_MARK(g, GC_MARK_DIRTY);
else if (!GC_MARK(g))
SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
/* guard against overflow */ /* guard against overflow */
SET_GC_SECTORS_USED(g, min_t(unsigned, SET_GC_SECTORS_USED(g, min_t(unsigned,
...@@ -1177,6 +1211,26 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) ...@@ -1177,6 +1211,26 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
/*
 * Seed bucket state from a key and then mark the key.
 *
 * Every pointer of @k that is available and not stale has its generation
 * copied into the bucket it points at.  The bucket priority is then adjusted:
 * for a non-zero @level and a key that differs from ZERO_KEY it becomes
 * BTREE_PRIO; for @level == 0 a bucket still carrying BTREE_PRIO is reset to
 * INITIAL_PRIO.  Finally the key is handed to __bch_btree_mark_key().
 */
void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
{
	unsigned ptr;

	for (ptr = 0; ptr < KEY_PTRS(k); ptr++) {
		struct bucket *g;

		/* Skip pointers we cannot use or that are already stale */
		if (!ptr_available(c, k, ptr) || ptr_stale(c, k, ptr))
			continue;

		g = PTR_BUCKET(c, k, ptr);
		g->gen = PTR_GEN(k, ptr);

		if (level) {
			if (bkey_cmp(k, &ZERO_KEY))
				g->prio = BTREE_PRIO;
		} else if (g->prio == BTREE_PRIO) {
			g->prio = INITIAL_PRIO;
		}
	}

	__bch_btree_mark_key(c, level, k);
}
static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
{ {
uint8_t stale = 0; uint8_t stale = 0;
...@@ -1230,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *, ...@@ -1230,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *,
struct keylist *, atomic_t *, struct bkey *); struct keylist *, atomic_t *, struct bkey *);
static int btree_gc_coalesce(struct btree *b, struct btree_op *op, static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
struct keylist *keylist, struct gc_stat *gc, struct gc_stat *gc, struct gc_merge_info *r)
struct gc_merge_info *r)
{ {
unsigned i, nodes = 0, keys = 0, blocks; unsigned i, nodes = 0, keys = 0, blocks;
struct btree *new_nodes[GC_MERGE_NODES]; struct btree *new_nodes[GC_MERGE_NODES];
struct keylist keylist;
struct closure cl; struct closure cl;
struct bkey *k; struct bkey *k;
bch_keylist_init(&keylist);
if (btree_check_reserve(b, NULL))
return 0;
memset(new_nodes, 0, sizeof(new_nodes)); memset(new_nodes, 0, sizeof(new_nodes));
closure_init_stack(&cl); closure_init_stack(&cl);
...@@ -1252,11 +1311,23 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, ...@@ -1252,11 +1311,23 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
return 0; return 0;
for (i = 0; i < nodes; i++) { for (i = 0; i < nodes; i++) {
new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
if (IS_ERR_OR_NULL(new_nodes[i])) if (IS_ERR_OR_NULL(new_nodes[i]))
goto out_nocoalesce; goto out_nocoalesce;
} }
/*
* We have to check the reserve here, after we've allocated our new
* nodes, to make sure the insert below will succeed - we also check
* before as an optimization to potentially avoid a bunch of expensive
* allocs/sorts
*/
if (btree_check_reserve(b, NULL))
goto out_nocoalesce;
for (i = 0; i < nodes; i++)
mutex_lock(&new_nodes[i]->write_lock);
for (i = nodes - 1; i > 0; --i) { for (i = nodes - 1; i > 0; --i) {
struct bset *n1 = btree_bset_first(new_nodes[i]); struct bset *n1 = btree_bset_first(new_nodes[i]);
struct bset *n2 = btree_bset_first(new_nodes[i - 1]); struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
...@@ -1315,28 +1386,34 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, ...@@ -1315,28 +1386,34 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
n2->keys -= keys; n2->keys -= keys;
if (__bch_keylist_realloc(keylist, if (__bch_keylist_realloc(&keylist,
bkey_u64s(&new_nodes[i]->key))) bkey_u64s(&new_nodes[i]->key)))
goto out_nocoalesce; goto out_nocoalesce;
bch_btree_node_write(new_nodes[i], &cl); bch_btree_node_write(new_nodes[i], &cl);
bch_keylist_add(keylist, &new_nodes[i]->key); bch_keylist_add(&keylist, &new_nodes[i]->key);
} }
for (i = 0; i < nodes; i++) { for (i = 0; i < nodes; i++)
if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) mutex_unlock(&new_nodes[i]->write_lock);
goto out_nocoalesce;
make_btree_freeing_key(r[i].b, keylist->top); closure_sync(&cl);
bch_keylist_push(keylist);
}
/* We emptied out this node */ /* We emptied out this node */
BUG_ON(btree_bset_first(new_nodes[0])->keys); BUG_ON(btree_bset_first(new_nodes[0])->keys);
btree_node_free(new_nodes[0]); btree_node_free(new_nodes[0]);
rw_unlock(true, new_nodes[0]); rw_unlock(true, new_nodes[0]);
closure_sync(&cl); for (i = 0; i < nodes; i++) {
if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
goto out_nocoalesce;
make_btree_freeing_key(r[i].b, keylist.top);
bch_keylist_push(&keylist);
}
bch_btree_insert_node(b, op, &keylist, NULL, NULL);
BUG_ON(!bch_keylist_empty(&keylist));
for (i = 0; i < nodes; i++) { for (i = 0; i < nodes; i++) {
btree_node_free(r[i].b); btree_node_free(r[i].b);
...@@ -1345,22 +1422,22 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, ...@@ -1345,22 +1422,22 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
r[i].b = new_nodes[i]; r[i].b = new_nodes[i];
} }
bch_btree_insert_node(b, op, keylist, NULL, NULL);
BUG_ON(!bch_keylist_empty(keylist));
memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
r[nodes - 1].b = ERR_PTR(-EINTR); r[nodes - 1].b = ERR_PTR(-EINTR);
trace_bcache_btree_gc_coalesce(nodes); trace_bcache_btree_gc_coalesce(nodes);
gc->nodes--; gc->nodes--;
bch_keylist_free(&keylist);
/* Invalidated our iterator */ /* Invalidated our iterator */
return -EINTR; return -EINTR;
out_nocoalesce: out_nocoalesce:
closure_sync(&cl); closure_sync(&cl);
bch_keylist_free(&keylist);
while ((k = bch_keylist_pop(keylist))) while ((k = bch_keylist_pop(&keylist)))
if (!bkey_cmp(k, &ZERO_KEY)) if (!bkey_cmp(k, &ZERO_KEY))
atomic_dec(&b->c->prio_blocked); atomic_dec(&b->c->prio_blocked);
...@@ -1372,6 +1449,42 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, ...@@ -1372,6 +1449,42 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
return 0; return 0;
} }
/*
 * Rewrite one child node on behalf of garbage collection.
 *
 * @b:       parent node that receives the updated keys
 * @op:      btree operation handed through to bch_btree_insert_node()
 * @replace: child node that is to be replaced by a freshly allocated copy
 *
 * Allocates a replacement for @replace, writes it out synchronously, inserts
 * the replacement's key plus a freeing key for @replace into @b, and then
 * frees @replace.
 *
 * Returns 0 when nothing was rewritten (a reserve check failed or no
 * replacement node could be allocated) and -EINTR after a rewrite, because
 * the caller's iterator has been invalidated.
 */
static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
				 struct btree *replace)
{
	struct keylist keys;
	struct btree *n;

	if (btree_check_reserve(b, NULL))
		return 0;

	n = btree_node_alloc_replacement(replace, NULL);
	/*
	 * The allocation can fail; the other callers in this file test the
	 * result with IS_ERR_OR_NULL() before touching it.  Do the same here
	 * rather than passing an error pointer to btree_node_free() or
	 * bch_btree_node_write_sync() below.
	 */
	if (IS_ERR_OR_NULL(n))
		return 0;

	/* recheck reserve after allocating replacement node */
	if (btree_check_reserve(b, NULL)) {
		btree_node_free(n);
		rw_unlock(true, n);
		return 0;
	}

	bch_btree_node_write_sync(n);

	bch_keylist_init(&keys);
	bch_keylist_add(&keys, &n->key);

	/* Second key on the list marks the old node's buckets as being freed */
	make_btree_freeing_key(replace, keys.top);
	bch_keylist_push(&keys);

	bch_btree_insert_node(b, op, &keys, NULL, NULL);
	BUG_ON(!bch_keylist_empty(&keys));

	btree_node_free(replace);
	rw_unlock(true, n);

	/* Invalidated our iterator */
	return -EINTR;
}
static unsigned btree_gc_count_keys(struct btree *b) static unsigned btree_gc_count_keys(struct btree *b)
{ {
struct bkey *k; struct bkey *k;
...@@ -1387,26 +1500,23 @@ static unsigned btree_gc_count_keys(struct btree *b) ...@@ -1387,26 +1500,23 @@ static unsigned btree_gc_count_keys(struct btree *b)
static int btree_gc_recurse(struct btree *b, struct btree_op *op, static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct closure *writes, struct gc_stat *gc) struct closure *writes, struct gc_stat *gc)
{ {
unsigned i;
int ret = 0; int ret = 0;
bool should_rewrite; bool should_rewrite;
struct btree *n;
struct bkey *k; struct bkey *k;
struct keylist keys;
struct btree_iter iter; struct btree_iter iter;
struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info r[GC_MERGE_NODES];
struct gc_merge_info *last = r + GC_MERGE_NODES - 1; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
bch_keylist_init(&keys);
bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
for (i = 0; i < GC_MERGE_NODES; i++) for (i = r; i < r + ARRAY_SIZE(r); i++)
r[i].b = ERR_PTR(-EINTR); i->b = ERR_PTR(-EINTR);
while (1) { while (1) {
k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
if (k) { if (k) {
r->b = bch_btree_node_get(b->c, k, b->level - 1, true); r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
true);
if (IS_ERR(r->b)) { if (IS_ERR(r->b)) {
ret = PTR_ERR(r->b); ret = PTR_ERR(r->b);
break; break;
...@@ -1414,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, ...@@ -1414,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
r->keys = btree_gc_count_keys(r->b); r->keys = btree_gc_count_keys(r->b);
ret = btree_gc_coalesce(b, op, &keys, gc, r); ret = btree_gc_coalesce(b, op, gc, r);
if (ret) if (ret)
break; break;
} }
...@@ -1424,33 +1534,11 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, ...@@ -1424,33 +1534,11 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
if (!IS_ERR(last->b)) { if (!IS_ERR(last->b)) {
should_rewrite = btree_gc_mark_node(last->b, gc); should_rewrite = btree_gc_mark_node(last->b, gc);
if (should_rewrite && if (should_rewrite) {
!btree_check_reserve(b, NULL)) { ret = btree_gc_rewrite_node(b, op, last->b);
n = btree_node_alloc_replacement(last->b, if (ret)
false);
if (!IS_ERR_OR_NULL(n)) {
bch_btree_node_write_sync(n);
bch_keylist_add(&keys, &n->key);
make_btree_freeing_key(last->b,
keys.top);
bch_keylist_push(&keys);
btree_node_free(last->b);
bch_btree_insert_node(b, op, &keys,
NULL, NULL);
BUG_ON(!bch_keylist_empty(&keys));
rw_unlock(true, last->b);
last->b = n;
/* Invalidated our iterator */
ret = -EINTR;
break; break;
} }
}
if (last->b->level) { if (last->b->level) {
ret = btree_gc_recurse(last->b, op, writes, gc); ret = btree_gc_recurse(last->b, op, writes, gc);
...@@ -1464,8 +1552,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, ...@@ -1464,8 +1552,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
* Must flush leaf nodes before gc ends, since replace * Must flush leaf nodes before gc ends, since replace
* operations aren't journalled * operations aren't journalled
*/ */
mutex_lock(&last->b->write_lock);
if (btree_node_dirty(last->b)) if (btree_node_dirty(last->b))
bch_btree_node_write(last->b, writes); bch_btree_node_write(last->b, writes);
mutex_unlock(&last->b->write_lock);
rw_unlock(true, last->b); rw_unlock(true, last->b);
} }
...@@ -1478,15 +1568,15 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, ...@@ -1478,15 +1568,15 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
} }
} }
for (i = 0; i < GC_MERGE_NODES; i++) for (i = r; i < r + ARRAY_SIZE(r); i++)
if (!IS_ERR_OR_NULL(r[i].b)) { if (!IS_ERR_OR_NULL(i->b)) {
if (btree_node_dirty(r[i].b)) mutex_lock(&i->b->write_lock);
bch_btree_node_write(r[i].b, writes); if (btree_node_dirty(i->b))
rw_unlock(true, r[i].b); bch_btree_node_write(i->b, writes);
mutex_unlock(&i->b->write_lock);
rw_unlock(true, i->b);
} }
bch_keylist_free(&keys);
return ret; return ret;
} }
...@@ -1499,10 +1589,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, ...@@ -1499,10 +1589,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
should_rewrite = btree_gc_mark_node(b, gc); should_rewrite = btree_gc_mark_node(b, gc);
if (should_rewrite) { if (should_rewrite) {
n = btree_node_alloc_replacement(b, false); n = btree_node_alloc_replacement(b, NULL);
if (!IS_ERR_OR_NULL(n)) { if (!IS_ERR_OR_NULL(n)) {
bch_btree_node_write_sync(n); bch_btree_node_write_sync(n);
bch_btree_set_root(n); bch_btree_set_root(n);
btree_node_free(b); btree_node_free(b);
rw_unlock(true, n); rw_unlock(true, n);
...@@ -1511,6 +1602,8 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, ...@@ -1511,6 +1602,8 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
} }
} }
__bch_btree_mark_key(b->c, b->level + 1, &b->key);
if (b->level) { if (b->level) {
ret = btree_gc_recurse(b, op, writes, gc); ret = btree_gc_recurse(b, op, writes, gc);
if (ret) if (ret)
...@@ -1538,9 +1631,9 @@ static void btree_gc_start(struct cache_set *c) ...@@ -1538,9 +1631,9 @@ static void btree_gc_start(struct cache_set *c)
for_each_cache(ca, c, i) for_each_cache(ca, c, i)
for_each_bucket(b, ca) { for_each_bucket(b, ca) {
b->gc_gen = b->gen; b->last_gc = b->gen;
if (!atomic_read(&b->pin)) { if (!atomic_read(&b->pin)) {
SET_GC_MARK(b, GC_MARK_RECLAIMABLE); SET_GC_MARK(b, 0);
SET_GC_SECTORS_USED(b, 0); SET_GC_SECTORS_USED(b, 0);
} }
} }
...@@ -1548,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c) ...@@ -1548,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c)
mutex_unlock(&c->bucket_lock); mutex_unlock(&c->bucket_lock);
} }
size_t bch_btree_gc_finish(struct cache_set *c) static size_t bch_btree_gc_finish(struct cache_set *c)
{ {
size_t available = 0; size_t available = 0;
struct bucket *b; struct bucket *b;
...@@ -1561,11 +1654,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) ...@@ -1561,11 +1654,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
c->gc_mark_valid = 1; c->gc_mark_valid = 1;
c->need_gc = 0; c->need_gc = 0;
if (c->root)
for (i = 0; i < KEY_PTRS(&c->root->key); i++)
SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
GC_MARK_METADATA);
for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
GC_MARK_METADATA); GC_MARK_METADATA);
...@@ -1605,15 +1693,15 @@ size_t bch_btree_gc_finish(struct cache_set *c) ...@@ -1605,15 +1693,15 @@ size_t bch_btree_gc_finish(struct cache_set *c)
SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
for_each_bucket(b, ca) { for_each_bucket(b, ca) {
b->last_gc = b->gc_gen;
c->need_gc = max(c->need_gc, bucket_gc_gen(b)); c->need_gc = max(c->need_gc, bucket_gc_gen(b));
if (!atomic_read(&b->pin) && if (atomic_read(&b->pin))
GC_MARK(b) == GC_MARK_RECLAIMABLE) { continue;
BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
available++; available++;
if (!GC_SECTORS_USED(b))
bch_bucket_add_unused(ca, b);
}
} }
} }
...@@ -1705,36 +1793,16 @@ int bch_gc_thread_start(struct cache_set *c) ...@@ -1705,36 +1793,16 @@ int bch_gc_thread_start(struct cache_set *c)
/* Initial partial gc */ /* Initial partial gc */
static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
unsigned long **seen)
{ {
int ret = 0; int ret = 0;
unsigned i;
struct bkey *k, *p = NULL; struct bkey *k, *p = NULL;
struct bucket *g;
struct btree_iter iter; struct btree_iter iter;
for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
for (i = 0; i < KEY_PTRS(k); i++) { bch_initial_mark_key(b->c, b->level, k);
if (!ptr_available(b->c, k, i))
continue;
g = PTR_BUCKET(b->c, k, i);
if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
seen[PTR_DEV(k, i)]) ||
!ptr_stale(b->c, k, i)) {
g->gen = PTR_GEN(k, i);
if (b->level)
g->prio = BTREE_PRIO;
else if (g->prio == BTREE_PRIO)
g->prio = INITIAL_PRIO;
}
}
btree_mark_key(b, k); bch_initial_mark_key(b->c, b->level + 1, &b->key);
}
if (b->level) { if (b->level) {
bch_btree_iter_init(&b->keys, &iter, NULL); bch_btree_iter_init(&b->keys, &iter, NULL);
...@@ -1746,40 +1814,58 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, ...@@ -1746,40 +1814,58 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
btree_node_prefetch(b->c, k, b->level - 1); btree_node_prefetch(b->c, k, b->level - 1);
if (p) if (p)
ret = btree(check_recurse, p, b, op, seen); ret = btree(check_recurse, p, b, op);
p = k; p = k;
} while (p && !ret); } while (p && !ret);
} }
return 0; return ret;
} }
int bch_btree_check(struct cache_set *c) int bch_btree_check(struct cache_set *c)
{ {
int ret = -ENOMEM;
unsigned i;
unsigned long *seen[MAX_CACHES_PER_SET];
struct btree_op op; struct btree_op op;
memset(seen, 0, sizeof(seen));
bch_btree_op_init(&op, SHRT_MAX); bch_btree_op_init(&op, SHRT_MAX);
for (i = 0; c->cache[i]; i++) { return btree_root(check_recurse, c, &op);
size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); }
seen[i] = kmalloc(n, GFP_KERNEL);
if (!seen[i]) void bch_initial_gc_finish(struct cache_set *c)
goto err; {
struct cache *ca;
struct bucket *b;
unsigned i;
/* Disables the seen array until prio_read() uses it too */ bch_btree_gc_finish(c);
memset(seen[i], 0xFF, n);
mutex_lock(&c->bucket_lock);
/*
* We need to put some unused buckets directly on the prio freelist in
* order to get the allocator thread started - it needs freed buckets in
* order to rewrite the prios and gens, and it needs to rewrite prios
* and gens in order to free buckets.
*
* This is only safe for buckets that have no live data in them, which
* there should always be some of.
*/
for_each_cache(ca, c, i) {
for_each_bucket(b, ca) {
if (fifo_full(&ca->free[RESERVE_PRIO]))
break;
if (bch_can_invalidate_bucket(ca, b) &&
!GC_MARK(b)) {
__bch_invalidate_one_bucket(ca, b);
fifo_push(&ca->free[RESERVE_PRIO],
b - ca->buckets);
}
}
} }
ret = btree_root(check_recurse, c, &op, seen); mutex_unlock(&c->bucket_lock);
err:
for (i = 0; i < MAX_CACHES_PER_SET; i++)
kfree(seen[i]);
return ret;
} }
/* Btree insertion */ /* Btree insertion */
...@@ -1871,11 +1957,14 @@ static int btree_split(struct btree *b, struct btree_op *op, ...@@ -1871,11 +1957,14 @@ static int btree_split(struct btree *b, struct btree_op *op,
closure_init_stack(&cl); closure_init_stack(&cl);
bch_keylist_init(&parent_keys); bch_keylist_init(&parent_keys);
if (!b->level && if (btree_check_reserve(b, op)) {
btree_check_reserve(b, op)) if (!b->level)
return -EINTR; return -EINTR;
else
WARN(1, "insufficient reserve for split\n");
}
n1 = btree_node_alloc_replacement(b, true); n1 = btree_node_alloc_replacement(b, op);
if (IS_ERR(n1)) if (IS_ERR(n1))
goto err; goto err;
...@@ -1887,16 +1976,19 @@ static int btree_split(struct btree *b, struct btree_op *op, ...@@ -1887,16 +1976,19 @@ static int btree_split(struct btree *b, struct btree_op *op,
trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
n2 = bch_btree_node_alloc(b->c, b->level, true); n2 = bch_btree_node_alloc(b->c, op, b->level);
if (IS_ERR(n2)) if (IS_ERR(n2))
goto err_free1; goto err_free1;
if (!b->parent) { if (!b->parent) {
n3 = bch_btree_node_alloc(b->c, b->level + 1, true); n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
if (IS_ERR(n3)) if (IS_ERR(n3))
goto err_free2; goto err_free2;
} }
mutex_lock(&n1->write_lock);
mutex_lock(&n2->write_lock);
bch_btree_insert_keys(n1, op, insert_keys, replace_key); bch_btree_insert_keys(n1, op, insert_keys, replace_key);
/* /*
...@@ -1923,45 +2015,45 @@ static int btree_split(struct btree *b, struct btree_op *op, ...@@ -1923,45 +2015,45 @@ static int btree_split(struct btree *b, struct btree_op *op,
bch_keylist_add(&parent_keys, &n2->key); bch_keylist_add(&parent_keys, &n2->key);
bch_btree_node_write(n2, &cl); bch_btree_node_write(n2, &cl);
mutex_unlock(&n2->write_lock);
rw_unlock(true, n2); rw_unlock(true, n2);
} else { } else {
trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
mutex_lock(&n1->write_lock);
bch_btree_insert_keys(n1, op, insert_keys, replace_key); bch_btree_insert_keys(n1, op, insert_keys, replace_key);
} }
bch_keylist_add(&parent_keys, &n1->key); bch_keylist_add(&parent_keys, &n1->key);
bch_btree_node_write(n1, &cl); bch_btree_node_write(n1, &cl);
mutex_unlock(&n1->write_lock);
if (n3) { if (n3) {
/* Depth increases, make a new root */ /* Depth increases, make a new root */
mutex_lock(&n3->write_lock);
bkey_copy_key(&n3->key, &MAX_KEY); bkey_copy_key(&n3->key, &MAX_KEY);
bch_btree_insert_keys(n3, op, &parent_keys, NULL); bch_btree_insert_keys(n3, op, &parent_keys, NULL);
bch_btree_node_write(n3, &cl); bch_btree_node_write(n3, &cl);
mutex_unlock(&n3->write_lock);
closure_sync(&cl); closure_sync(&cl);
bch_btree_set_root(n3); bch_btree_set_root(n3);
rw_unlock(true, n3); rw_unlock(true, n3);
btree_node_free(b);
} else if (!b->parent) { } else if (!b->parent) {
/* Root filled up but didn't need to be split */ /* Root filled up but didn't need to be split */
closure_sync(&cl); closure_sync(&cl);
bch_btree_set_root(n1); bch_btree_set_root(n1);
btree_node_free(b);
} else { } else {
/* Split a non root node */ /* Split a non root node */
closure_sync(&cl); closure_sync(&cl);
make_btree_freeing_key(b, parent_keys.top); make_btree_freeing_key(b, parent_keys.top);
bch_keylist_push(&parent_keys); bch_keylist_push(&parent_keys);
btree_node_free(b);
bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
BUG_ON(!bch_keylist_empty(&parent_keys)); BUG_ON(!bch_keylist_empty(&parent_keys));
} }
btree_node_free(b);
rw_unlock(true, n1); rw_unlock(true, n1);
bch_time_stats_update(&b->c->btree_split_time, start_time); bch_time_stats_update(&b->c->btree_split_time, start_time);
...@@ -1976,7 +2068,7 @@ static int btree_split(struct btree *b, struct btree_op *op, ...@@ -1976,7 +2068,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
btree_node_free(n1); btree_node_free(n1);
rw_unlock(true, n1); rw_unlock(true, n1);
err: err:
WARN(1, "bcache: btree split failed"); WARN(1, "bcache: btree split failed (level %u)", b->level);
if (n3 == ERR_PTR(-EAGAIN) || if (n3 == ERR_PTR(-EAGAIN) ||
n2 == ERR_PTR(-EAGAIN) || n2 == ERR_PTR(-EAGAIN) ||
...@@ -1991,9 +2083,39 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, ...@@ -1991,9 +2083,39 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
atomic_t *journal_ref, atomic_t *journal_ref,
struct bkey *replace_key) struct bkey *replace_key)
{ {
struct closure cl;
BUG_ON(b->level && replace_key); BUG_ON(b->level && replace_key);
closure_init_stack(&cl);
mutex_lock(&b->write_lock);
if (write_block(b) != btree_bset_last(b) &&
b->keys.last_set_unwritten)
bch_btree_init_next(b); /* just wrote a set */
if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
mutex_unlock(&b->write_lock);
goto split;
}
BUG_ON(write_block(b) != btree_bset_last(b));
if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
if (!b->level)
bch_btree_leaf_dirty(b, journal_ref);
else
bch_btree_node_write(b, &cl);
}
mutex_unlock(&b->write_lock);
/* wait for btree node write if necessary, after unlock */
closure_sync(&cl);
return 0;
split:
if (current->bio_list) { if (current->bio_list) {
op->lock = b->c->root->level + 1; op->lock = b->c->root->level + 1;
return -EAGAIN; return -EAGAIN;
...@@ -2004,20 +2126,11 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, ...@@ -2004,20 +2126,11 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
/* Invalidated all iterators */ /* Invalidated all iterators */
int ret = btree_split(b, op, insert_keys, replace_key); int ret = btree_split(b, op, insert_keys, replace_key);
return bch_keylist_empty(insert_keys) ? if (bch_keylist_empty(insert_keys))
0 : ret ?: -EINTR;
}
} else {
BUG_ON(write_block(b) != btree_bset_last(b));
if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
if (!b->level)
bch_btree_leaf_dirty(b, journal_ref);
else
bch_btree_node_write_sync(b);
}
return 0; return 0;
else if (!ret)
return -EINTR;
return ret;
} }
} }
...@@ -2403,18 +2516,3 @@ void bch_keybuf_init(struct keybuf *buf) ...@@ -2403,18 +2516,3 @@ void bch_keybuf_init(struct keybuf *buf)
spin_lock_init(&buf->lock); spin_lock_init(&buf->lock);
array_allocator_init(&buf->freelist); array_allocator_init(&buf->freelist);
} }
/*
 * Module teardown for the btree code: destroys the btree I/O workqueue if
 * bch_btree_init() managed to create it.
 */
void bch_btree_exit(void)
{
	/* Nothing to tear down when the workqueue was never created. */
	if (!btree_io_wq)
		return;

	destroy_workqueue(btree_io_wq);
}
/*
 * Module init for the btree code: creates the single-threaded
 * "bch_btree_io" workqueue.
 *
 * Returns 0 on success, -ENOMEM if the workqueue could not be created.
 */
int __init bch_btree_init(void)
{
	btree_io_wq = create_singlethread_workqueue("bch_btree_io");

	return btree_io_wq ? 0 : -ENOMEM;
}
...@@ -127,6 +127,8 @@ struct btree { ...@@ -127,6 +127,8 @@ struct btree {
struct cache_set *c; struct cache_set *c;
struct btree *parent; struct btree *parent;
struct mutex write_lock;
unsigned long flags; unsigned long flags;
uint16_t written; /* would be nice to kill */ uint16_t written; /* would be nice to kill */
uint8_t level; uint8_t level;
...@@ -236,11 +238,13 @@ static inline void rw_unlock(bool w, struct btree *b) ...@@ -236,11 +238,13 @@ static inline void rw_unlock(bool w, struct btree *b)
} }
void bch_btree_node_read_done(struct btree *); void bch_btree_node_read_done(struct btree *);
void __bch_btree_node_write(struct btree *, struct closure *);
void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_node_write(struct btree *, struct closure *);
void bch_btree_set_root(struct btree *); void bch_btree_set_root(struct btree *);
struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
struct bkey *, int, bool);
int bch_btree_insert_check_key(struct btree *, struct btree_op *, int bch_btree_insert_check_key(struct btree *, struct btree_op *,
struct bkey *); struct bkey *);
...@@ -248,10 +252,10 @@ int bch_btree_insert(struct cache_set *, struct keylist *, ...@@ -248,10 +252,10 @@ int bch_btree_insert(struct cache_set *, struct keylist *,
atomic_t *, struct bkey *); atomic_t *, struct bkey *);
int bch_gc_thread_start(struct cache_set *); int bch_gc_thread_start(struct cache_set *);
size_t bch_btree_gc_finish(struct cache_set *); void bch_initial_gc_finish(struct cache_set *);
void bch_moving_gc(struct cache_set *); void bch_moving_gc(struct cache_set *);
int bch_btree_check(struct cache_set *); int bch_btree_check(struct cache_set *);
uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
static inline void wake_up_gc(struct cache_set *c) static inline void wake_up_gc(struct cache_set *c)
{ {
......
...@@ -194,9 +194,9 @@ static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) ...@@ -194,9 +194,9 @@ static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
mutex_unlock(&b->c->bucket_lock); mutex_unlock(&b->c->bucket_lock);
bch_extent_to_text(buf, sizeof(buf), k); bch_extent_to_text(buf, sizeof(buf), k);
btree_bug(b, btree_bug(b,
"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", "inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu",
buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); g->prio, g->gen, g->last_gc, GC_MARK(g));
return true; return true;
} }
...@@ -308,6 +308,16 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, ...@@ -308,6 +308,16 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
return NULL; return NULL;
} }
/*
 * If @k is a dirty key, subtract @sectors (starting at @offset) from the
 * dirty-sector accounting of the device identified by KEY_INODE(@k) in
 * cache set @c. Clean keys are left alone.
 */
static void bch_subtract_dirty(struct bkey *k,
			       struct cache_set *c,
			       uint64_t offset,
			       int sectors)
{
	if (!KEY_DIRTY(k))
		return;

	/* The accounting helper adds, so pass the negated sector count. */
	bcache_dev_sectors_dirty_add(c, KEY_INODE(k), offset, -sectors);
}
static bool bch_extent_insert_fixup(struct btree_keys *b, static bool bch_extent_insert_fixup(struct btree_keys *b,
struct bkey *insert, struct bkey *insert,
struct btree_iter *iter, struct btree_iter *iter,
...@@ -315,13 +325,6 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, ...@@ -315,13 +325,6 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
{ {
struct cache_set *c = container_of(b, struct btree, keys)->c; struct cache_set *c = container_of(b, struct btree, keys)->c;
void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
{
if (KEY_DIRTY(k))
bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
offset, -sectors);
}
uint64_t old_offset; uint64_t old_offset;
unsigned old_size, sectors_found = 0; unsigned old_size, sectors_found = 0;
...@@ -398,7 +401,8 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, ...@@ -398,7 +401,8 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
struct bkey *top; struct bkey *top;
subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); bch_subtract_dirty(k, c, KEY_START(insert),
KEY_SIZE(insert));
if (bkey_written(b, k)) { if (bkey_written(b, k)) {
/* /*
...@@ -448,7 +452,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, ...@@ -448,7 +452,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
} }
} }
subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k));
} }
check_failed: check_failed:
...@@ -499,9 +503,9 @@ static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, ...@@ -499,9 +503,9 @@ static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
if (mutex_trylock(&b->c->bucket_lock)) { if (mutex_trylock(&b->c->bucket_lock)) {
if (b->c->gc_mark_valid && if (b->c->gc_mark_valid &&
((GC_MARK(g) != GC_MARK_DIRTY && (!GC_MARK(g) ||
KEY_DIRTY(k)) || GC_MARK(g) == GC_MARK_METADATA ||
GC_MARK(g) == GC_MARK_METADATA)) (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k))))
goto err; goto err;
if (g->prio == BTREE_PRIO) if (g->prio == BTREE_PRIO)
...@@ -515,9 +519,9 @@ static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, ...@@ -515,9 +519,9 @@ static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
mutex_unlock(&b->c->bucket_lock); mutex_unlock(&b->c->bucket_lock);
bch_extent_to_text(buf, sizeof(buf), k); bch_extent_to_text(buf, sizeof(buf), k);
btree_bug(b, btree_bug(b,
"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", "inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu",
buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); g->prio, g->gen, g->last_gc, GC_MARK(g));
return true; return true;
} }
......
...@@ -237,8 +237,14 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) ...@@ -237,8 +237,14 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
for (i = 0; i < ca->sb.njournal_buckets; i++) for (i = 0; i < ca->sb.njournal_buckets; i++)
if (ja->seq[i] > seq) { if (ja->seq[i] > seq) {
seq = ja->seq[i]; seq = ja->seq[i];
ja->cur_idx = ja->discard_idx = /*
ja->last_idx = i; * When journal_reclaim() goes to allocate for
* the first time, it'll use the bucket after
* ja->cur_idx
*/
ja->cur_idx = i;
ja->last_idx = ja->discard_idx = (i + 1) %
ca->sb.njournal_buckets;
} }
} }
...@@ -288,16 +294,11 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) ...@@ -288,16 +294,11 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
k = bkey_next(k)) { k = bkey_next(k)) {
unsigned j; unsigned j;
for (j = 0; j < KEY_PTRS(k); j++) { for (j = 0; j < KEY_PTRS(k); j++)
struct bucket *g = PTR_BUCKET(c, k, j); if (ptr_available(c, k, j))
atomic_inc(&g->pin); atomic_inc(&PTR_BUCKET(c, k, j)->pin);
if (g->prio == BTREE_PRIO &&
!ptr_stale(c, k, j))
g->prio = INITIAL_PRIO;
}
__bch_btree_mark_key(c, 0, k); bch_initial_mark_key(c, 0, k);
} }
} }
} }
...@@ -312,8 +313,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) ...@@ -312,8 +313,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
uint64_t start = i->j.last_seq, end = i->j.seq, n = start; uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
struct keylist keylist; struct keylist keylist;
bch_keylist_init(&keylist);
list_for_each_entry(i, list, list) { list_for_each_entry(i, list, list) {
BUG_ON(i->pin && atomic_read(i->pin) != 1); BUG_ON(i->pin && atomic_read(i->pin) != 1);
...@@ -326,8 +325,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) ...@@ -326,8 +325,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
k = bkey_next(k)) { k = bkey_next(k)) {
trace_bcache_journal_replay_key(k); trace_bcache_journal_replay_key(k);
bkey_copy(keylist.top, k); bch_keylist_init_single(&keylist, k);
bch_keylist_push(&keylist);
ret = bch_btree_insert(s, &keylist, i->pin, NULL); ret = bch_btree_insert(s, &keylist, i->pin, NULL);
if (ret) if (ret)
...@@ -383,16 +381,15 @@ static void btree_flush_write(struct cache_set *c) ...@@ -383,16 +381,15 @@ static void btree_flush_write(struct cache_set *c)
b = best; b = best;
if (b) { if (b) {
rw_lock(true, b, b->level); mutex_lock(&b->write_lock);
if (!btree_current_write(b)->journal) { if (!btree_current_write(b)->journal) {
rw_unlock(true, b); mutex_unlock(&b->write_lock);
/* We raced */ /* We raced */
goto retry; goto retry;
} }
bch_btree_node_write(b, NULL); __bch_btree_node_write(b, NULL);
rw_unlock(true, b); mutex_unlock(&b->write_lock);
} }
} }
...@@ -536,6 +533,7 @@ void bch_journal_next(struct journal *j) ...@@ -536,6 +533,7 @@ void bch_journal_next(struct journal *j)
atomic_set(&fifo_back(&j->pin), 1); atomic_set(&fifo_back(&j->pin), 1);
j->cur->data->seq = ++j->seq; j->cur->data->seq = ++j->seq;
j->cur->dirty = false;
j->cur->need_write = false; j->cur->need_write = false;
j->cur->data->keys = 0; j->cur->data->keys = 0;
...@@ -731,7 +729,10 @@ static void journal_write_work(struct work_struct *work) ...@@ -731,7 +729,10 @@ static void journal_write_work(struct work_struct *work)
struct cache_set, struct cache_set,
journal.work); journal.work);
spin_lock(&c->journal.lock); spin_lock(&c->journal.lock);
if (c->journal.cur->dirty)
journal_try_write(c); journal_try_write(c);
else
spin_unlock(&c->journal.lock);
} }
/* /*
...@@ -761,7 +762,8 @@ atomic_t *bch_journal(struct cache_set *c, ...@@ -761,7 +762,8 @@ atomic_t *bch_journal(struct cache_set *c,
if (parent) { if (parent) {
closure_wait(&w->wait, parent); closure_wait(&w->wait, parent);
journal_try_write(c); journal_try_write(c);
} else if (!w->need_write) { } else if (!w->dirty) {
w->dirty = true;
schedule_delayed_work(&c->journal.work, schedule_delayed_work(&c->journal.work,
msecs_to_jiffies(c->journal_delay_ms)); msecs_to_jiffies(c->journal_delay_ms));
spin_unlock(&c->journal.lock); spin_unlock(&c->journal.lock);
......
...@@ -95,6 +95,7 @@ struct journal_write { ...@@ -95,6 +95,7 @@ struct journal_write {
struct cache_set *c; struct cache_set *c;
struct closure_waitlist wait; struct closure_waitlist wait;
bool dirty;
bool need_write; bool need_write;
}; };
......
...@@ -24,12 +24,10 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) ...@@ -24,12 +24,10 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
moving_gc_keys); moving_gc_keys);
unsigned i; unsigned i;
for (i = 0; i < KEY_PTRS(k); i++) { for (i = 0; i < KEY_PTRS(k); i++)
struct bucket *g = PTR_BUCKET(c, k, i); if (ptr_available(c, k, i) &&
GC_MOVE(PTR_BUCKET(c, k, i)))
if (GC_MOVE(g))
return true; return true;
}
return false; return false;
} }
...@@ -115,7 +113,7 @@ static void write_moving(struct closure *cl) ...@@ -115,7 +113,7 @@ static void write_moving(struct closure *cl)
closure_call(&op->cl, bch_data_insert, NULL, cl); closure_call(&op->cl, bch_data_insert, NULL, cl);
} }
continue_at(cl, write_moving_finish, system_wq); continue_at(cl, write_moving_finish, op->wq);
} }
static void read_moving_submit(struct closure *cl) static void read_moving_submit(struct closure *cl)
...@@ -125,7 +123,7 @@ static void read_moving_submit(struct closure *cl) ...@@ -125,7 +123,7 @@ static void read_moving_submit(struct closure *cl)
bch_submit_bbio(bio, io->op.c, &io->w->key, 0); bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
continue_at(cl, write_moving, system_wq); continue_at(cl, write_moving, io->op.wq);
} }
static void read_moving(struct cache_set *c) static void read_moving(struct cache_set *c)
...@@ -160,6 +158,7 @@ static void read_moving(struct cache_set *c) ...@@ -160,6 +158,7 @@ static void read_moving(struct cache_set *c)
io->w = w; io->w = w;
io->op.inode = KEY_INODE(&w->key); io->op.inode = KEY_INODE(&w->key);
io->op.c = c; io->op.c = c;
io->op.wq = c->moving_gc_wq;
moving_init(io); moving_init(io);
bio = &io->bio.bio; bio = &io->bio.bio;
...@@ -216,7 +215,10 @@ void bch_moving_gc(struct cache_set *c) ...@@ -216,7 +215,10 @@ void bch_moving_gc(struct cache_set *c)
ca->heap.used = 0; ca->heap.used = 0;
for_each_bucket(b, ca) { for_each_bucket(b, ca) {
if (!GC_SECTORS_USED(b)) if (GC_MARK(b) == GC_MARK_METADATA ||
!GC_SECTORS_USED(b) ||
GC_SECTORS_USED(b) == ca->sb.bucket_size ||
atomic_read(&b->pin))
continue; continue;
if (!heap_full(&ca->heap)) { if (!heap_full(&ca->heap)) {
......
...@@ -12,11 +12,9 @@ ...@@ -12,11 +12,9 @@
#include "request.h" #include "request.h"
#include "writeback.h" #include "writeback.h"
#include <linux/cgroup.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/hash.h> #include <linux/hash.h>
#include <linux/random.h> #include <linux/random.h>
#include "blk-cgroup.h"
#include <trace/events/bcache.h> #include <trace/events/bcache.h>
...@@ -27,171 +25,13 @@ struct kmem_cache *bch_search_cache; ...@@ -27,171 +25,13 @@ struct kmem_cache *bch_search_cache;
static void bch_data_insert_start(struct closure *); static void bch_data_insert_start(struct closure *);
/* Cgroup interface */
#ifdef CONFIG_CGROUP_BCACHE
/*
 * Fallback cgroup state handed out by cgroup_to_bcache() and
 * bch_bio_to_cgroup() when no per-cgroup state is found. A cache_mode of -1
 * is negative, so cache_mode() falls through to the device's own setting.
 */
static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
/*
 * Map a cgroup to its bcache state.
 *
 * Returns the bch_cgroup embedding the cgroup's bcache subsystem state, or
 * &bcache_default_cgroup when @cgroup is NULL or has no such state.
 */
static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
{
	struct cgroup_subsys_state *state;

	if (!cgroup)
		return &bcache_default_cgroup;

	state = cgroup_subsys_state(cgroup, bcache_subsys_id);
	if (!state)
		return &bcache_default_cgroup;

	return container_of(state, struct bch_cgroup, css);
}
/*
 * Find the bcache cgroup state that applies to @bio.
 *
 * When the bio carries its own css (bio->bi_css) the lookup goes through that
 * css's cgroup; otherwise the current task's bcache subsystem state is used.
 * Falls back to &bcache_default_cgroup if neither lookup yields a state.
 */
struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
{
	struct cgroup_subsys_state *state;

	if (bio->bi_css)
		state = cgroup_subsys_state(bio->bi_css->cgroup,
					    bcache_subsys_id);
	else
		state = task_subsys_state(current, bcache_subsys_id);

	if (!state)
		return &bcache_default_cgroup;

	return container_of(state, struct bch_cgroup, css);
}
/*
 * Read handler for the "cache_mode" cgroup file.
 *
 * Formats bch_cache_modes into a stack buffer, passing this cgroup's
 * cache_mode + 1 as the selected entry (the stored value is offset by one so
 * that -1 can mean "default"), then copies the requested window to userspace.
 *
 * Returns the result of simple_read_from_buffer(), or the negative value
 * reported by bch_snprint_string_list().
 */
static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
			       struct file *file,
			       char __user *buf, size_t nbytes, loff_t *ppos)
{
	char tmp[1024];
	int len;

	/*
	 * Bound the formatter by the real size of tmp. The previous bound of
	 * PAGE_SIZE overstated the buffer wherever PAGE_SIZE > 1024, which
	 * would let the formatter write past the end of the stack array.
	 * NOTE(review): assumes the second argument is the destination size,
	 * as with snprintf() -- confirm against bch_snprint_string_list().
	 */
	len = bch_snprint_string_list(tmp, sizeof(tmp), bch_cache_modes,
				      cgroup_to_bcache(cgrp)->cache_mode + 1);
	if (len < 0)
		return len;

	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
/*
 * Write handler for the "cache_mode" cgroup file.
 *
 * Looks @buf up in bch_cache_modes and stores the resulting index minus one
 * as the cgroup's cache_mode (so index 0 maps to -1, the "default" value).
 *
 * Returns 0 on success, or the negative value from bch_read_string_list().
 */
static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
			    const char *buf)
{
	int idx = bch_read_string_list(buf, bch_cache_modes);

	if (idx >= 0) {
		cgroup_to_bcache(cgrp)->cache_mode = idx - 1;
		return 0;
	}

	return idx;
}
static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
{
return cgroup_to_bcache(cgrp)->verify;
}
/*
 * Write handler for the "verify" cgroup file: stores @val into the cgroup's
 * bool verify flag. Always returns 0.
 */
static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);

	bcachecg->verify = val;

	return 0;
}
static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
{
struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
return atomic_read(&bcachecg->stats.cache_hits);
}
static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
{
struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
return atomic_read(&bcachecg->stats.cache_misses);
}
static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
struct cftype *cft)
{
struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
return atomic_read(&bcachecg->stats.cache_bypass_hits);
}
static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
struct cftype *cft)
{
struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
return atomic_read(&bcachecg->stats.cache_bypass_misses);
}
/*
 * Control files of the bcache cgroup subsystem. The table is handed to
 * cgroup_add_cftypes() in bch_request_init() and ends with an empty entry.
 */
static struct cftype bch_files[] = {
/* Readable and writable: cache mode, as a string. */
{
.name = "cache_mode",
.read = cache_mode_read,
.write_string = cache_mode_write,
},
/* Readable and writable: verify flag, as a u64. */
{
.name = "verify",
.read_u64 = bch_verify_read,
.write_u64 = bch_verify_write,
},
/* The remaining entries are counters with a read handler only. */
{
.name = "cache_hits",
.read_u64 = bch_cache_hits_read,
},
{
.name = "cache_misses",
.read_u64 = bch_cache_misses_read,
},
{
.name = "cache_bypass_hits",
.read_u64 = bch_cache_bypass_hits_read,
},
{
.name = "cache_bypass_misses",
.read_u64 = bch_cache_bypass_misses_read,
},
{ } /* terminate */
};
/*
 * Put @cg into its initial state: a cache_mode of -1, the value cache_mode()
 * treats as "no per-cgroup override".
 */
static void init_bch_cgroup(struct bch_cgroup *cg)
{
	cg->cache_mode = -1;
}
static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
{
struct bch_cgroup *cg;
cg = kzalloc(sizeof(*cg), GFP_KERNEL);
if (!cg)
return ERR_PTR(-ENOMEM);
init_bch_cgroup(cg);
return &cg->css;
}
/*
 * .destroy callback of the bcache cgroup subsystem: frees the bch_cgroup
 * that belongs to @cgroup.
 *
 * NOTE(review): cgroup_to_bcache() can return the static
 * bcache_default_cgroup, which must never reach kfree(); presumably the
 * cgroup core only calls this for cgroups that went through
 * bcachecg_create() -- confirm.
 */
static void bcachecg_destroy(struct cgroup *cgroup)
{
	kfree(cgroup_to_bcache(cgroup));
}
/*
 * Descriptor of the "bcache" cgroup subsystem: hooks up the create/destroy
 * callbacks and is exported to GPL modules. bch_request_init() and
 * bch_request_exit() load and unload it.
 */
struct cgroup_subsys bcache_subsys = {
.create = bcachecg_create,
.destroy = bcachecg_destroy,
.subsys_id = bcache_subsys_id,
.name = "bcache",
.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(bcache_subsys);
#endif
static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
{ {
#ifdef CONFIG_CGROUP_BCACHE
int r = bch_bio_to_cgroup(bio)->cache_mode;
if (r >= 0)
return r;
#endif
return BDEV_CACHE_MODE(&dc->sb); return BDEV_CACHE_MODE(&dc->sb);
} }
static bool verify(struct cached_dev *dc, struct bio *bio) static bool verify(struct cached_dev *dc, struct bio *bio)
{ {
#ifdef CONFIG_CGROUP_BCACHE
if (bch_bio_to_cgroup(bio)->verify)
return true;
#endif
return dc->verify; return dc->verify;
} }
...@@ -248,7 +88,7 @@ static void bch_data_insert_keys(struct closure *cl) ...@@ -248,7 +88,7 @@ static void bch_data_insert_keys(struct closure *cl)
atomic_dec_bug(journal_ref); atomic_dec_bug(journal_ref);
if (!op->insert_data_done) if (!op->insert_data_done)
continue_at(cl, bch_data_insert_start, bcache_wq); continue_at(cl, bch_data_insert_start, op->wq);
bch_keylist_free(&op->insert_keys); bch_keylist_free(&op->insert_keys);
closure_return(cl); closure_return(cl);
...@@ -297,7 +137,7 @@ static void bch_data_invalidate(struct closure *cl) ...@@ -297,7 +137,7 @@ static void bch_data_invalidate(struct closure *cl)
op->insert_data_done = true; op->insert_data_done = true;
bio_put(bio); bio_put(bio);
out: out:
continue_at(cl, bch_data_insert_keys, bcache_wq); continue_at(cl, bch_data_insert_keys, op->wq);
} }
static void bch_data_insert_error(struct closure *cl) static void bch_data_insert_error(struct closure *cl)
...@@ -340,7 +180,7 @@ static void bch_data_insert_endio(struct bio *bio, int error) ...@@ -340,7 +180,7 @@ static void bch_data_insert_endio(struct bio *bio, int error)
if (op->writeback) if (op->writeback)
op->error = error; op->error = error;
else if (!op->replace) else if (!op->replace)
set_closure_fn(cl, bch_data_insert_error, bcache_wq); set_closure_fn(cl, bch_data_insert_error, op->wq);
else else
set_closure_fn(cl, NULL, NULL); set_closure_fn(cl, NULL, NULL);
} }
...@@ -376,7 +216,7 @@ static void bch_data_insert_start(struct closure *cl) ...@@ -376,7 +216,7 @@ static void bch_data_insert_start(struct closure *cl)
if (bch_keylist_realloc(&op->insert_keys, if (bch_keylist_realloc(&op->insert_keys,
3 + (op->csum ? 1 : 0), 3 + (op->csum ? 1 : 0),
op->c)) op->c))
continue_at(cl, bch_data_insert_keys, bcache_wq); continue_at(cl, bch_data_insert_keys, op->wq);
k = op->insert_keys.top; k = op->insert_keys.top;
bkey_init(k); bkey_init(k);
...@@ -413,7 +253,7 @@ static void bch_data_insert_start(struct closure *cl) ...@@ -413,7 +253,7 @@ static void bch_data_insert_start(struct closure *cl)
} while (n != bio); } while (n != bio);
op->insert_data_done = true; op->insert_data_done = true;
continue_at(cl, bch_data_insert_keys, bcache_wq); continue_at(cl, bch_data_insert_keys, op->wq);
err: err:
/* bch_alloc_sectors() blocks if s->writeback = true */ /* bch_alloc_sectors() blocks if s->writeback = true */
BUG_ON(op->writeback); BUG_ON(op->writeback);
...@@ -442,7 +282,7 @@ static void bch_data_insert_start(struct closure *cl) ...@@ -442,7 +282,7 @@ static void bch_data_insert_start(struct closure *cl)
bio_put(bio); bio_put(bio);
if (!bch_keylist_empty(&op->insert_keys)) if (!bch_keylist_empty(&op->insert_keys))
continue_at(cl, bch_data_insert_keys, bcache_wq); continue_at(cl, bch_data_insert_keys, op->wq);
else else
closure_return(cl); closure_return(cl);
} }
...@@ -824,6 +664,7 @@ static inline struct search *search_alloc(struct bio *bio, ...@@ -824,6 +664,7 @@ static inline struct search *search_alloc(struct bio *bio,
s->iop.error = 0; s->iop.error = 0;
s->iop.flags = 0; s->iop.flags = 0;
s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
s->iop.wq = bcache_wq;
return s; return s;
} }
...@@ -1203,22 +1044,13 @@ void bch_cached_dev_request_init(struct cached_dev *dc) ...@@ -1203,22 +1044,13 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
static int flash_dev_cache_miss(struct btree *b, struct search *s, static int flash_dev_cache_miss(struct btree *b, struct search *s,
struct bio *bio, unsigned sectors) struct bio *bio, unsigned sectors)
{ {
struct bio_vec bv; unsigned bytes = min(sectors, bio_sectors(bio)) << 9;
struct bvec_iter iter;
/* Zero fill bio */
bio_for_each_segment(bv, bio, iter) { swap(bio->bi_iter.bi_size, bytes);
unsigned j = min(bv.bv_len >> 9, sectors); zero_fill_bio(bio);
swap(bio->bi_iter.bi_size, bytes);
void *p = kmap(bv.bv_page);
memset(p + bv.bv_offset, 0, j << 9);
kunmap(bv.bv_page);
sectors -= j;
}
bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size)); bio_advance(bio, bytes);
if (!bio->bi_iter.bi_size) if (!bio->bi_iter.bi_size)
return MAP_DONE; return MAP_DONE;
...@@ -1313,9 +1145,6 @@ void bch_flash_dev_request_init(struct bcache_device *d) ...@@ -1313,9 +1145,6 @@ void bch_flash_dev_request_init(struct bcache_device *d)
void bch_request_exit(void) void bch_request_exit(void)
{ {
#ifdef CONFIG_CGROUP_BCACHE
cgroup_unload_subsys(&bcache_subsys);
#endif
if (bch_search_cache) if (bch_search_cache)
kmem_cache_destroy(bch_search_cache); kmem_cache_destroy(bch_search_cache);
} }
...@@ -1326,11 +1155,5 @@ int __init bch_request_init(void) ...@@ -1326,11 +1155,5 @@ int __init bch_request_init(void)
if (!bch_search_cache) if (!bch_search_cache)
return -ENOMEM; return -ENOMEM;
#ifdef CONFIG_CGROUP_BCACHE
cgroup_load_subsys(&bcache_subsys);
init_bch_cgroup(&bcache_default_cgroup);
cgroup_add_cftypes(&bcache_subsys, bch_files);
#endif
return 0; return 0;
} }
#ifndef _BCACHE_REQUEST_H_ #ifndef _BCACHE_REQUEST_H_
#define _BCACHE_REQUEST_H_ #define _BCACHE_REQUEST_H_
#include <linux/cgroup.h>
struct data_insert_op { struct data_insert_op {
struct closure cl; struct closure cl;
struct cache_set *c; struct cache_set *c;
struct bio *bio; struct bio *bio;
struct workqueue_struct *wq;
unsigned inode; unsigned inode;
uint16_t write_point; uint16_t write_point;
...@@ -41,20 +40,4 @@ void bch_flash_dev_request_init(struct bcache_device *d); ...@@ -41,20 +40,4 @@ void bch_flash_dev_request_init(struct bcache_device *d);
extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
/* Per-cgroup bcache state: cache mode override, verify flag and counters. */
struct bch_cgroup {
#ifdef CONFIG_CGROUP_BCACHE
/* Embedded so container_of() can get from a css back to this struct. */
struct cgroup_subsys_state css;
#endif
/*
* We subtract one from the index into bch_cache_modes[], so that
* default == -1; this makes it so the rest match up with d->cache_mode,
* and we use d->cache_mode if cgrp->cache_mode < 0
*/
short cache_mode;
/* When set, verify() returns true for bios that map to this cgroup. */
bool verify;
/* Hit/miss counters, updated through mark_cache_stats(). */
struct cache_stat_collector stats;
};
/*
 * Returns the bch_cgroup that applies to @bio, or the default cgroup state
 * when none is found.
 */
struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio);
#endif /* _BCACHE_REQUEST_H_ */ #endif /* _BCACHE_REQUEST_H_ */
...@@ -201,9 +201,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, ...@@ -201,9 +201,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
struct cached_dev *dc = container_of(d, struct cached_dev, disk); struct cached_dev *dc = container_of(d, struct cached_dev, disk);
mark_cache_stats(&dc->accounting.collector, hit, bypass); mark_cache_stats(&dc->accounting.collector, hit, bypass);
mark_cache_stats(&c->accounting.collector, hit, bypass); mark_cache_stats(&c->accounting.collector, hit, bypass);
#ifdef CONFIG_CGROUP_BCACHE
mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
#endif
} }
void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
......
...@@ -541,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) ...@@ -541,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
closure_sync(cl); closure_sync(cl);
} }
#define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \
fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
void bch_prio_write(struct cache *ca) void bch_prio_write(struct cache *ca)
{ {
int i; int i;
...@@ -554,10 +551,6 @@ void bch_prio_write(struct cache *ca) ...@@ -554,10 +551,6 @@ void bch_prio_write(struct cache *ca)
lockdep_assert_held(&ca->set->bucket_lock); lockdep_assert_held(&ca->set->bucket_lock);
for (b = ca->buckets;
b < ca->buckets + ca->sb.nbuckets; b++)
b->disk_gen = b->gen;
ca->disk_buckets->seq++; ca->disk_buckets->seq++;
atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
...@@ -601,14 +594,17 @@ void bch_prio_write(struct cache *ca) ...@@ -601,14 +594,17 @@ void bch_prio_write(struct cache *ca)
mutex_lock(&ca->set->bucket_lock); mutex_lock(&ca->set->bucket_lock);
ca->need_save_prio = 0;
/* /*
* Don't want the old priorities to get garbage collected until after we * Don't want the old priorities to get garbage collected until after we
* finish writing the new ones, and they're journalled * finish writing the new ones, and they're journalled
*/ */
for (i = 0; i < prio_buckets(ca); i++) for (i = 0; i < prio_buckets(ca); i++) {
if (ca->prio_last_buckets[i])
__bch_bucket_free(ca,
&ca->buckets[ca->prio_last_buckets[i]]);
ca->prio_last_buckets[i] = ca->prio_buckets[i]; ca->prio_last_buckets[i] = ca->prio_buckets[i];
}
} }
static void prio_read(struct cache *ca, uint64_t bucket) static void prio_read(struct cache *ca, uint64_t bucket)
...@@ -639,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) ...@@ -639,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
} }
b->prio = le16_to_cpu(d->prio); b->prio = le16_to_cpu(d->prio);
b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen; b->gen = b->last_gc = d->gen;
} }
} }
...@@ -843,6 +839,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, ...@@ -843,6 +839,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
q->limits.max_segment_size = UINT_MAX; q->limits.max_segment_size = UINT_MAX;
q->limits.max_segments = BIO_MAX_PAGES; q->limits.max_segments = BIO_MAX_PAGES;
q->limits.max_discard_sectors = UINT_MAX; q->limits.max_discard_sectors = UINT_MAX;
q->limits.discard_granularity = 512;
q->limits.io_min = block_size; q->limits.io_min = block_size;
q->limits.logical_block_size = block_size; q->limits.logical_block_size = block_size;
q->limits.physical_block_size = block_size; q->limits.physical_block_size = block_size;
...@@ -1355,6 +1352,8 @@ static void cache_set_free(struct closure *cl) ...@@ -1355,6 +1352,8 @@ static void cache_set_free(struct closure *cl)
bch_bset_sort_state_free(&c->sort); bch_bset_sort_state_free(&c->sort);
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
if (c->moving_gc_wq)
destroy_workqueue(c->moving_gc_wq);
if (c->bio_split) if (c->bio_split)
bioset_free(c->bio_split); bioset_free(c->bio_split);
if (c->fill_iter) if (c->fill_iter)
...@@ -1395,14 +1394,21 @@ static void cache_set_flush(struct closure *cl) ...@@ -1395,14 +1394,21 @@ static void cache_set_flush(struct closure *cl)
list_add(&c->root->list, &c->btree_cache); list_add(&c->root->list, &c->btree_cache);
/* Should skip this if we're unregistering because of an error */ /* Should skip this if we're unregistering because of an error */
list_for_each_entry(b, &c->btree_cache, list) list_for_each_entry(b, &c->btree_cache, list) {
mutex_lock(&b->write_lock);
if (btree_node_dirty(b)) if (btree_node_dirty(b))
bch_btree_node_write(b, NULL); __bch_btree_node_write(b, NULL);
mutex_unlock(&b->write_lock);
}
for_each_cache(ca, c, i) for_each_cache(ca, c, i)
if (ca->alloc_thread) if (ca->alloc_thread)
kthread_stop(ca->alloc_thread); kthread_stop(ca->alloc_thread);
cancel_delayed_work_sync(&c->journal.work);
/* flush last journal entry if needed */
c->journal.work.work.func(&c->journal.work.work);
closure_return(cl); closure_return(cl);
} }
...@@ -1485,14 +1491,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) ...@@ -1485,14 +1491,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
sema_init(&c->sb_write_mutex, 1); sema_init(&c->sb_write_mutex, 1);
mutex_init(&c->bucket_lock); mutex_init(&c->bucket_lock);
init_waitqueue_head(&c->try_wait); init_waitqueue_head(&c->btree_cache_wait);
init_waitqueue_head(&c->bucket_wait); init_waitqueue_head(&c->bucket_wait);
sema_init(&c->uuid_write_mutex, 1); sema_init(&c->uuid_write_mutex, 1);
spin_lock_init(&c->btree_gc_time.lock); spin_lock_init(&c->btree_gc_time.lock);
spin_lock_init(&c->btree_split_time.lock); spin_lock_init(&c->btree_split_time.lock);
spin_lock_init(&c->btree_read_time.lock); spin_lock_init(&c->btree_read_time.lock);
spin_lock_init(&c->try_harder_time.lock);
bch_moving_init_cache_set(c); bch_moving_init_cache_set(c);
...@@ -1517,6 +1522,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) ...@@ -1517,6 +1522,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
!(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
!(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->moving_gc_wq = create_workqueue("bcache_gc")) ||
bch_journal_alloc(c) || bch_journal_alloc(c) ||
bch_btree_cache_alloc(c) || bch_btree_cache_alloc(c) ||
bch_open_buckets_alloc(c) || bch_open_buckets_alloc(c) ||
...@@ -1580,7 +1586,7 @@ static void run_cache_set(struct cache_set *c) ...@@ -1580,7 +1586,7 @@ static void run_cache_set(struct cache_set *c)
goto err; goto err;
err = "error reading btree root"; err = "error reading btree root";
c->root = bch_btree_node_get(c, k, j->btree_level, true); c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
if (IS_ERR_OR_NULL(c->root)) if (IS_ERR_OR_NULL(c->root))
goto err; goto err;
...@@ -1596,7 +1602,7 @@ static void run_cache_set(struct cache_set *c) ...@@ -1596,7 +1602,7 @@ static void run_cache_set(struct cache_set *c)
goto err; goto err;
bch_journal_mark(c, &journal); bch_journal_mark(c, &journal);
bch_btree_gc_finish(c); bch_initial_gc_finish(c);
pr_debug("btree_check() done"); pr_debug("btree_check() done");
/* /*
...@@ -1638,7 +1644,7 @@ static void run_cache_set(struct cache_set *c) ...@@ -1638,7 +1644,7 @@ static void run_cache_set(struct cache_set *c)
ca->sb.d[j] = ca->sb.first_bucket + j; ca->sb.d[j] = ca->sb.first_bucket + j;
} }
bch_btree_gc_finish(c); bch_initial_gc_finish(c);
err = "error starting allocator thread"; err = "error starting allocator thread";
for_each_cache(ca, c, i) for_each_cache(ca, c, i)
...@@ -1655,12 +1661,14 @@ static void run_cache_set(struct cache_set *c) ...@@ -1655,12 +1661,14 @@ static void run_cache_set(struct cache_set *c)
goto err; goto err;
err = "cannot allocate new btree root"; err = "cannot allocate new btree root";
c->root = bch_btree_node_alloc(c, 0, true); c->root = bch_btree_node_alloc(c, NULL, 0);
if (IS_ERR_OR_NULL(c->root)) if (IS_ERR_OR_NULL(c->root))
goto err; goto err;
mutex_lock(&c->root->write_lock);
bkey_copy_key(&c->root->key, &MAX_KEY); bkey_copy_key(&c->root->key, &MAX_KEY);
bch_btree_node_write(c->root, &cl); bch_btree_node_write(c->root, &cl);
mutex_unlock(&c->root->write_lock);
bch_btree_set_root(c->root); bch_btree_set_root(c->root);
rw_unlock(true, c->root); rw_unlock(true, c->root);
...@@ -1782,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj) ...@@ -1782,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj)
vfree(ca->buckets); vfree(ca->buckets);
free_heap(&ca->heap); free_heap(&ca->heap);
free_fifo(&ca->unused);
free_fifo(&ca->free_inc); free_fifo(&ca->free_inc);
for (i = 0; i < RESERVE_NR; i++) for (i = 0; i < RESERVE_NR; i++)
...@@ -1819,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) ...@@ -1819,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
!init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
!init_heap(&ca->heap, free << 3, GFP_KERNEL) || !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
!(ca->buckets = vzalloc(sizeof(struct bucket) * !(ca->buckets = vzalloc(sizeof(struct bucket) *
ca->sb.nbuckets)) || ca->sb.nbuckets)) ||
...@@ -1834,13 +1840,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) ...@@ -1834,13 +1840,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
for_each_bucket(b, ca) for_each_bucket(b, ca)
atomic_set(&b->pin, 0); atomic_set(&b->pin, 0);
if (bch_cache_allocator_init(ca))
goto err;
return 0; return 0;
err:
kobject_put(&ca->kobj);
return -ENOMEM;
} }
static void register_cache(struct cache_sb *sb, struct page *sb_page, static void register_cache(struct cache_sb *sb, struct page *sb_page,
...@@ -1869,7 +1869,10 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page, ...@@ -1869,7 +1869,10 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page,
if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
goto err; goto err;
mutex_lock(&bch_register_lock);
err = register_cache_set(ca); err = register_cache_set(ca);
mutex_unlock(&bch_register_lock);
if (err) if (err)
goto err; goto err;
...@@ -1931,8 +1934,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ...@@ -1931,8 +1934,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!try_module_get(THIS_MODULE)) if (!try_module_get(THIS_MODULE))
return -EBUSY; return -EBUSY;
mutex_lock(&bch_register_lock);
if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
!(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
goto err; goto err;
...@@ -1965,7 +1966,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ...@@ -1965,7 +1966,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!dc) if (!dc)
goto err_close; goto err_close;
mutex_lock(&bch_register_lock);
register_bdev(sb, sb_page, bdev, dc); register_bdev(sb, sb_page, bdev, dc);
mutex_unlock(&bch_register_lock);
} else { } else {
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca) if (!ca)
...@@ -1978,7 +1981,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ...@@ -1978,7 +1981,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
put_page(sb_page); put_page(sb_page);
kfree(sb); kfree(sb);
kfree(path); kfree(path);
mutex_unlock(&bch_register_lock);
module_put(THIS_MODULE); module_put(THIS_MODULE);
return ret; return ret;
...@@ -2057,7 +2059,6 @@ static void bcache_exit(void) ...@@ -2057,7 +2059,6 @@ static void bcache_exit(void)
{ {
bch_debug_exit(); bch_debug_exit();
bch_request_exit(); bch_request_exit();
bch_btree_exit();
if (bcache_kobj) if (bcache_kobj)
kobject_put(bcache_kobj); kobject_put(bcache_kobj);
if (bcache_wq) if (bcache_wq)
...@@ -2087,7 +2088,6 @@ static int __init bcache_init(void) ...@@ -2087,7 +2088,6 @@ static int __init bcache_init(void)
if (!(bcache_wq = create_workqueue("bcache")) || if (!(bcache_wq = create_workqueue("bcache")) ||
!(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
sysfs_create_files(bcache_kobj, files) || sysfs_create_files(bcache_kobj, files) ||
bch_btree_init() ||
bch_request_init() || bch_request_init() ||
bch_debug_init(bcache_kobj)) bch_debug_init(bcache_kobj))
goto err; goto err;
......
...@@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms); ...@@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms);
sysfs_time_stats_attribute(btree_split, sec, us); sysfs_time_stats_attribute(btree_split, sec, us);
sysfs_time_stats_attribute(btree_sort, ms, us); sysfs_time_stats_attribute(btree_sort, ms, us);
sysfs_time_stats_attribute(btree_read, ms, us); sysfs_time_stats_attribute(btree_read, ms, us);
sysfs_time_stats_attribute(try_harder, ms, us);
read_attribute(btree_nodes); read_attribute(btree_nodes);
read_attribute(btree_used_percent); read_attribute(btree_used_percent);
...@@ -406,7 +405,7 @@ struct bset_stats_op { ...@@ -406,7 +405,7 @@ struct bset_stats_op {
struct bset_stats stats; struct bset_stats stats;
}; };
static int btree_bset_stats(struct btree_op *b_op, struct btree *b) static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b)
{ {
struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
...@@ -424,7 +423,7 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf) ...@@ -424,7 +423,7 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf)
memset(&op, 0, sizeof(op)); memset(&op, 0, sizeof(op));
bch_btree_op_init(&op.op, -1); bch_btree_op_init(&op.op, -1);
ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats); ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats);
if (ret < 0) if (ret < 0)
return ret; return ret;
...@@ -442,10 +441,8 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf) ...@@ -442,10 +441,8 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf)
op.stats.floats, op.stats.failed); op.stats.floats, op.stats.failed);
} }
SHOW(__bch_cache_set) static unsigned bch_root_usage(struct cache_set *c)
{ {
unsigned root_usage(struct cache_set *c)
{
unsigned bytes = 0; unsigned bytes = 0;
struct bkey *k; struct bkey *k;
struct btree *b; struct btree *b;
...@@ -466,10 +463,10 @@ SHOW(__bch_cache_set) ...@@ -466,10 +463,10 @@ SHOW(__bch_cache_set)
rw_unlock(false, b); rw_unlock(false, b);
return (bytes * 100) / btree_bytes(c); return (bytes * 100) / btree_bytes(c);
} }
size_t cache_size(struct cache_set *c) static size_t bch_cache_size(struct cache_set *c)
{ {
size_t ret = 0; size_t ret = 0;
struct btree *b; struct btree *b;
...@@ -479,10 +476,10 @@ SHOW(__bch_cache_set) ...@@ -479,10 +476,10 @@ SHOW(__bch_cache_set)
mutex_unlock(&c->bucket_lock); mutex_unlock(&c->bucket_lock);
return ret; return ret;
} }
unsigned cache_max_chain(struct cache_set *c) static unsigned bch_cache_max_chain(struct cache_set *c)
{ {
unsigned ret = 0; unsigned ret = 0;
struct hlist_head *h; struct hlist_head *h;
...@@ -502,21 +499,23 @@ SHOW(__bch_cache_set) ...@@ -502,21 +499,23 @@ SHOW(__bch_cache_set)
mutex_unlock(&c->bucket_lock); mutex_unlock(&c->bucket_lock);
return ret; return ret;
} }
unsigned btree_used(struct cache_set *c) static unsigned bch_btree_used(struct cache_set *c)
{ {
return div64_u64(c->gc_stats.key_bytes * 100, return div64_u64(c->gc_stats.key_bytes * 100,
(c->gc_stats.nodes ?: 1) * btree_bytes(c)); (c->gc_stats.nodes ?: 1) * btree_bytes(c));
} }
unsigned average_key_size(struct cache_set *c) static unsigned bch_average_key_size(struct cache_set *c)
{ {
return c->gc_stats.nkeys return c->gc_stats.nkeys
? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
: 0; : 0;
} }
SHOW(__bch_cache_set)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj); struct cache_set *c = container_of(kobj, struct cache_set, kobj);
sysfs_print(synchronous, CACHE_SYNC(&c->sb)); sysfs_print(synchronous, CACHE_SYNC(&c->sb));
...@@ -524,21 +523,20 @@ SHOW(__bch_cache_set) ...@@ -524,21 +523,20 @@ SHOW(__bch_cache_set)
sysfs_hprint(bucket_size, bucket_bytes(c)); sysfs_hprint(bucket_size, bucket_bytes(c));
sysfs_hprint(block_size, block_bytes(c)); sysfs_hprint(block_size, block_bytes(c));
sysfs_print(tree_depth, c->root->level); sysfs_print(tree_depth, c->root->level);
sysfs_print(root_usage_percent, root_usage(c)); sysfs_print(root_usage_percent, bch_root_usage(c));
sysfs_hprint(btree_cache_size, cache_size(c)); sysfs_hprint(btree_cache_size, bch_cache_size(c));
sysfs_print(btree_cache_max_chain, cache_max_chain(c)); sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c));
sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);
sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
sysfs_print(btree_used_percent, btree_used(c)); sysfs_print(btree_used_percent, bch_btree_used(c));
sysfs_print(btree_nodes, c->gc_stats.nodes); sysfs_print(btree_nodes, c->gc_stats.nodes);
sysfs_hprint(average_key_size, average_key_size(c)); sysfs_hprint(average_key_size, bch_average_key_size(c));
sysfs_print(cache_read_races, sysfs_print(cache_read_races,
atomic_long_read(&c->cache_read_races)); atomic_long_read(&c->cache_read_races));
...@@ -709,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = { ...@@ -709,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = {
sysfs_time_stats_attribute_list(btree_split, sec, us) sysfs_time_stats_attribute_list(btree_split, sec, us)
sysfs_time_stats_attribute_list(btree_sort, ms, us) sysfs_time_stats_attribute_list(btree_sort, ms, us)
sysfs_time_stats_attribute_list(btree_read, ms, us) sysfs_time_stats_attribute_list(btree_read, ms, us)
sysfs_time_stats_attribute_list(try_harder, ms, us)
&sysfs_btree_nodes, &sysfs_btree_nodes,
&sysfs_btree_used_percent, &sysfs_btree_used_percent,
...@@ -761,7 +758,9 @@ SHOW(__bch_cache) ...@@ -761,7 +758,9 @@ SHOW(__bch_cache)
int cmp(const void *l, const void *r) int cmp(const void *l, const void *r)
{ return *((uint16_t *) r) - *((uint16_t *) l); } { return *((uint16_t *) r) - *((uint16_t *) l); }
size_t n = ca->sb.nbuckets, i, unused, btree; struct bucket *b;
size_t n = ca->sb.nbuckets, i;
size_t unused = 0, available = 0, dirty = 0, meta = 0;
uint64_t sum = 0; uint64_t sum = 0;
/* Compute 31 quantiles */ /* Compute 31 quantiles */
uint16_t q[31], *p, *cached; uint16_t q[31], *p, *cached;
...@@ -772,6 +771,17 @@ SHOW(__bch_cache) ...@@ -772,6 +771,17 @@ SHOW(__bch_cache)
return -ENOMEM; return -ENOMEM;
mutex_lock(&ca->set->bucket_lock); mutex_lock(&ca->set->bucket_lock);
for_each_bucket(b, ca) {
if (!GC_SECTORS_USED(b))
unused++;
if (GC_MARK(b) == GC_MARK_RECLAIMABLE)
available++;
if (GC_MARK(b) == GC_MARK_DIRTY)
dirty++;
if (GC_MARK(b) == GC_MARK_METADATA)
meta++;
}
for (i = ca->sb.first_bucket; i < n; i++) for (i = ca->sb.first_bucket; i < n; i++)
p[i] = ca->buckets[i].prio; p[i] = ca->buckets[i].prio;
mutex_unlock(&ca->set->bucket_lock); mutex_unlock(&ca->set->bucket_lock);
...@@ -786,10 +796,7 @@ SHOW(__bch_cache) ...@@ -786,10 +796,7 @@ SHOW(__bch_cache)
while (cached < p + n && while (cached < p + n &&
*cached == BTREE_PRIO) *cached == BTREE_PRIO)
cached++; cached++, n--;
btree = cached - p;
n -= btree;
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
sum += INITIAL_PRIO - cached[i]; sum += INITIAL_PRIO - cached[i];
...@@ -805,12 +812,16 @@ SHOW(__bch_cache) ...@@ -805,12 +812,16 @@ SHOW(__bch_cache)
ret = scnprintf(buf, PAGE_SIZE, ret = scnprintf(buf, PAGE_SIZE,
"Unused: %zu%%\n" "Unused: %zu%%\n"
"Clean: %zu%%\n"
"Dirty: %zu%%\n"
"Metadata: %zu%%\n" "Metadata: %zu%%\n"
"Average: %llu\n" "Average: %llu\n"
"Sectors per Q: %zu\n" "Sectors per Q: %zu\n"
"Quantiles: [", "Quantiles: [",
unused * 100 / (size_t) ca->sb.nbuckets, unused * 100 / (size_t) ca->sb.nbuckets,
btree * 100 / (size_t) ca->sb.nbuckets, sum, available * 100 / (size_t) ca->sb.nbuckets,
dirty * 100 / (size_t) ca->sb.nbuckets,
meta * 100 / (size_t) ca->sb.nbuckets, sum,
n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
for (i = 0; i < ARRAY_SIZE(q); i++) for (i = 0; i < ARRAY_SIZE(q); i++)
......
...@@ -45,7 +45,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); ...@@ -45,7 +45,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
......
...@@ -399,26 +399,43 @@ TRACE_EVENT(bcache_keyscan, ...@@ -399,26 +399,43 @@ TRACE_EVENT(bcache_keyscan,
/* Allocator */ /* Allocator */
TRACE_EVENT(bcache_alloc_invalidate, TRACE_EVENT(bcache_invalidate,
TP_PROTO(struct cache *ca), TP_PROTO(struct cache *ca, size_t bucket),
TP_ARGS(ca), TP_ARGS(ca, bucket),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(unsigned, free ) __field(unsigned, sectors )
__field(unsigned, free_inc ) __field(dev_t, dev )
__field(unsigned, free_inc_size ) __field(__u64, offset )
__field(unsigned, unused )
), ),
TP_fast_assign( TP_fast_assign(
__entry->free = fifo_used(&ca->free[RESERVE_NONE]); __entry->dev = ca->bdev->bd_dev;
__entry->free_inc = fifo_used(&ca->free_inc); __entry->offset = bucket << ca->set->bucket_bits;
__entry->free_inc_size = ca->free_inc.size; __entry->sectors = GC_SECTORS_USED(&ca->buckets[bucket]);
__entry->unused = fifo_used(&ca->unused);
), ),
TP_printk("free %u free_inc %u/%u unused %u", __entry->free, TP_printk("invalidated %u sectors at %d,%d sector=%llu",
__entry->free_inc, __entry->free_inc_size, __entry->unused) __entry->sectors, MAJOR(__entry->dev),
MINOR(__entry->dev), __entry->offset)
);
TRACE_EVENT(bcache_alloc,
TP_PROTO(struct cache *ca, size_t bucket),
TP_ARGS(ca, bucket),
TP_STRUCT__entry(
__field(dev_t, dev )
__field(__u64, offset )
),
TP_fast_assign(
__entry->dev = ca->bdev->bd_dev;
__entry->offset = bucket << ca->set->bucket_bits;
),
TP_printk("allocated %d,%d sector=%llu", MAJOR(__entry->dev),
MINOR(__entry->dev), __entry->offset)
); );
TRACE_EVENT(bcache_alloc_fail, TRACE_EVENT(bcache_alloc_fail,
...@@ -426,21 +443,22 @@ TRACE_EVENT(bcache_alloc_fail, ...@@ -426,21 +443,22 @@ TRACE_EVENT(bcache_alloc_fail,
TP_ARGS(ca, reserve), TP_ARGS(ca, reserve),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(dev_t, dev )
__field(unsigned, free ) __field(unsigned, free )
__field(unsigned, free_inc ) __field(unsigned, free_inc )
__field(unsigned, unused )
__field(unsigned, blocked ) __field(unsigned, blocked )
), ),
TP_fast_assign( TP_fast_assign(
__entry->dev = ca->bdev->bd_dev;
__entry->free = fifo_used(&ca->free[reserve]); __entry->free = fifo_used(&ca->free[reserve]);
__entry->free_inc = fifo_used(&ca->free_inc); __entry->free_inc = fifo_used(&ca->free_inc);
__entry->unused = fifo_used(&ca->unused);
__entry->blocked = atomic_read(&ca->set->prio_blocked); __entry->blocked = atomic_read(&ca->set->prio_blocked);
), ),
TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free, TP_printk("alloc fail %d,%d free %u free_inc %u blocked %u",
__entry->free_inc, __entry->unused, __entry->blocked) MAJOR(__entry->dev), MINOR(__entry->dev), __entry->free,
__entry->free_inc, __entry->blocked)
); );
/* Background writeback */ /* Background writeback */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment