Commit 31403dca authored by Kent Overstreet

bcachefs: optimize __bch2_trans_get(), kill DEBUG_TRANSACTIONS

 - Some tweaks to greatly reduce locking overhead for the list of btree
   transactions, so that it can always be enabled: leave btree_trans
   objects on the list when they're on the percpu single item freelist,
   and only check for duplicates in the same process when
   CONFIG_BCACHEFS_DEBUG is enabled

 - don't zero out the full btree_trans() unless we allocated it from
   the mempool
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent fea153a8
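
Before the diff itself: a minimal userspace sketch of the two ideas the commit message describes, a single-item per-CPU freelist in front of the mempool, and partial zeroing via offsetof() so a cached transaction object only re-zeroes its volatile prefix. All names here (struct trans, trans_cache, trans_get, trans_put) are illustrative stand-ins, not the bcachefs API; the kernel code uses this_cpu_xchg() on c->btree_trans_bufs and the btree_trans_pool mempool, and a per-thread pointer stands in for the percpu buffer so the sketch compiles standalone.

/* sketch.c: illustrative only, not the bcachefs implementation */
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

struct trans {
	/* volatile prefix: re-zeroed on every get, cached or not */
	void		*mem;
	unsigned	mem_top;
	unsigned	mem_bytes;
	/* fields from here on stay valid while the object sits on the freelist */
	int		updates[8];
	struct trans	*list_next;	/* e.g. still linked on a global debug list */
};

static __thread struct trans *trans_cache;	/* one-item freelist, per thread */

static struct trans *trans_get(void)
{
	struct trans *t = trans_cache;

	if (t) {
		/* fast path: reuse the cached object, zero only the prefix */
		trans_cache = NULL;
		memset(t, 0, offsetof(struct trans, updates));
		return t;
	}

	/* slow path: a fresh allocation is zeroed in full */
	return calloc(1, sizeof(*t));
}

static void trans_put(struct trans *t)
{
	if (!trans_cache) {
		trans_cache = t;	/* park for the next trans_get() on this thread */
		return;
	}
	free(t);
}

In the actual patch the cached object additionally stays linked on c->btree_trans_list so debugfs can keep enumerating it; bch2_trans_put() clears locking_wait.task rather than unlinking, and the duplicate-transaction-per-process check only runs under CONFIG_BCACHEFS_DEBUG.
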
@@ -50,14 +50,6 @@ config BCACHEFS_POSIX_ACL
 	depends on BCACHEFS_FS
 	select FS_POSIX_ACL
 
-config BCACHEFS_DEBUG_TRANSACTIONS
-	bool "bcachefs runtime info"
-	depends on BCACHEFS_FS
-	help
-	  This makes the list of running btree transactions available in debugfs.
-	  This is a highly useful debugging feature but does add a small amount of overhead.
-
 config BCACHEFS_DEBUG
 	bool "bcachefs debugging"
 	depends on BCACHEFS_FS
...
@@ -2714,6 +2714,7 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
 
 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 {
+	struct bch_fs *c = trans->c;
 	unsigned new_top = trans->mem_top + size;
 	unsigned old_bytes = trans->mem_bytes;
 	unsigned new_bytes = roundup_pow_of_two(new_top);
@@ -2721,17 +2722,19 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 	void *new_mem;
 	void *p;
 
-	trans->mem_max = max(trans->mem_max, new_top);
-
 	WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
 
+	struct btree_transaction_stats *s = btree_trans_stats(trans);
+	if (s)
+		s->max_mem = max(s->max_mem, new_bytes);
+
 	new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
 	if (unlikely(!new_mem)) {
 		bch2_trans_unlock(trans);
 
 		new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
 		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
-			new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+			new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
 			new_bytes = BTREE_TRANS_MEM_MAX;
 			kfree(trans->mem);
 		}
@@ -2751,7 +2754,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 	trans->mem_bytes = new_bytes;
 
 	if (old_bytes) {
-		trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+		trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
 	}
@@ -2860,25 +2863,6 @@ u32 bch2_trans_begin(struct btree_trans *trans)
 	return trans->restart_count;
 }
 
-static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
-{
-	struct btree_trans *trans;
-
-	if (IS_ENABLED(__KERNEL__)) {
-		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
-		if (trans)
-			return trans;
-	}
-
-	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
-	/*
-	 * paths need to be zeroed, bch2_check_for_deadlock looks at
-	 * paths in other threads
-	 */
-	memset(&trans->paths, 0, sizeof(trans->paths));
-	return trans;
-}
-
 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
 
 unsigned bch2_trans_get_fn_idx(const char *fn)
@@ -2900,22 +2884,55 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
 	__acquires(&c->btree_trans_barrier)
 {
 	struct btree_trans *trans;
-	struct btree_transaction_stats *s;
 
-	trans = bch2_trans_alloc(c);
+	if (IS_ENABLED(__KERNEL__)) {
+		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+		if (trans) {
+			memset(trans, 0, offsetof(struct btree_trans, updates));
+			goto got_trans;
+		}
+	}
 
+	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
 	memset(trans, 0, sizeof(*trans));
+
+	closure_init_stack(&trans->ref);
+
+	seqmutex_lock(&c->btree_trans_lock);
+	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+		struct btree_trans *pos;
+		pid_t pid = current->pid;
+
+		trans->locking_wait.task = current;
+
+		list_for_each_entry(pos, &c->btree_trans_list, list) {
+			struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
+			/*
+			 * We'd much prefer to be stricter here and completely
+			 * disallow multiple btree_trans in the same thread -
+			 * but the data move path calls bch2_write when we
+			 * already have a btree_trans initialized.
+			 */
+			BUG_ON(pos_task &&
+			       pid == pos_task->pid &&
+			       bch2_trans_locked(pos));
+
+			if (pos_task && pid < pos_task->pid) {
+				list_add_tail(&trans->list, &pos->list);
+				goto list_add_done;
+			}
+		}
+	}
+	list_add_tail(&trans->list, &c->btree_trans_list);
+list_add_done:
+	seqmutex_unlock(&c->btree_trans_lock);
+got_trans:
 	trans->c = c;
-	trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
-		? bch2_btree_transaction_fns[fn_idx] : NULL;
 	trans->last_begin_time = local_clock();
 	trans->fn_idx = fn_idx;
 	trans->locking_wait.task = current;
 	trans->journal_replay_not_finished =
 		unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
 		atomic_inc_not_zero(&c->journal_keys.ref);
-	closure_init_stack(&trans->ref);
 	trans->paths_allocated = trans->_paths_allocated;
 	trans->sorted = trans->_sorted;
 	trans->paths = trans->_paths;
@@ -2924,21 +2941,19 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
 
 	trans->paths_allocated[0] = 1;
 
-	s = btree_trans_stats(trans);
-	if (s && s->max_mem) {
-		unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+	if (fn_idx < BCH_TRANSACTIONS_NR) {
+		trans->fn = bch2_btree_transaction_fns[fn_idx];
 
-		trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+		struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
 
-		if (!unlikely(trans->mem)) {
-			trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
-			trans->mem_bytes = BTREE_TRANS_MEM_MAX;
-		} else {
-			trans->mem_bytes = expected_mem_bytes;
+		if (s->max_mem) {
+			unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+			trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+			if (likely(trans->mem))
+				trans->mem_bytes = expected_mem_bytes;
 		}
-	}
 
-	if (s) {
 		trans->nr_paths_max = s->nr_max_paths;
 		trans->journal_entries_size = s->journal_entries_size;
 	}
@@ -2946,31 +2961,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
 	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 	trans->srcu_lock_time = jiffies;
 	trans->srcu_held = true;
 
-	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
-		struct btree_trans *pos;
-
-		seqmutex_lock(&c->btree_trans_lock);
-		list_for_each_entry(pos, &c->btree_trans_list, list) {
-			/*
-			 * We'd much prefer to be stricter here and completely
-			 * disallow multiple btree_trans in the same thread -
-			 * but the data move path calls bch2_write when we
-			 * already have a btree_trans initialized.
-			 */
-			BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
-			       bch2_trans_locked(pos));
-
-			if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
-				list_add_tail(&trans->list, &pos->list);
-				goto list_add_done;
-			}
-		}
-		list_add_tail(&trans->list, &c->btree_trans_list);
-list_add_done:
-		seqmutex_unlock(&c->btree_trans_lock);
-	}
-
 	return trans;
 }
@@ -3001,24 +2991,13 @@ void bch2_trans_put(struct btree_trans *trans)
 	__releases(&c->btree_trans_barrier)
 {
 	struct bch_fs *c = trans->c;
-	struct btree_transaction_stats *s = btree_trans_stats(trans);
 
 	bch2_trans_unlock(trans);
 
-	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
-		seqmutex_lock(&c->btree_trans_lock);
-		list_del(&trans->list);
-		seqmutex_unlock(&c->btree_trans_lock);
-	}
-
-	closure_sync(&trans->ref);
-
-	if (s)
-		s->max_mem = max(s->max_mem, trans->mem_max);
-
 	trans_for_each_update(trans, i)
 		__btree_path_put(trans->paths + i->path, true);
 	trans->nr_updates = 0;
+	trans->locking_wait.task = NULL;
 
 	check_btree_paths_leaked(trans);
@@ -3047,8 +3026,16 @@ void bch2_trans_put(struct btree_trans *trans)
 	/* Userspace doesn't have a real percpu implementation: */
 	if (IS_ENABLED(__KERNEL__))
 		trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
 
-	if (trans)
+	if (trans) {
+		closure_sync(&trans->ref);
+
+		seqmutex_lock(&c->btree_trans_lock);
+		list_del(&trans->list);
+		seqmutex_unlock(&c->btree_trans_lock);
+
 		mempool_free(trans, &c->btree_trans_pool);
+	}
 }
 
 static void __maybe_unused
@@ -3146,15 +3133,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
 	struct btree_trans *trans;
 	int cpu;
 
+	if (c->btree_trans_bufs)
+		for_each_possible_cpu(cpu) {
+			struct btree_trans *trans =
+				per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
+
+			if (trans) {
+				closure_sync(&trans->ref);
+
+				seqmutex_lock(&c->btree_trans_lock);
+				list_del(&trans->list);
+				seqmutex_unlock(&c->btree_trans_lock);
+			}
+			kfree(trans);
+		}
+	free_percpu(c->btree_trans_bufs);
+
 	trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
 	if (trans)
 		panic("%s leaked btree_trans\n", trans->fn);
 
-	if (c->btree_trans_bufs)
-		for_each_possible_cpu(cpu)
-			kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
-	free_percpu(c->btree_trans_bufs);
-
 	for (s = c->btree_transaction_stats;
 	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
 	     s++) {
...
@@ -95,9 +95,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
 	struct trans_waiting_for_lock *i;
 
 	for (i = g->g; i != g->g + g->nr; i++) {
+		struct task_struct *task = i->trans->locking_wait.task;
 		if (i != g->g)
 			prt_str(out, "<- ");
-		prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
+		prt_printf(out, "%u ", task ? task->pid : 0);
 	}
 	prt_newline(out);
 }
...
@@ -386,7 +386,6 @@ struct btree_trans {
 	void			*mem;
 	unsigned		mem_top;
-	unsigned		mem_max;
 	unsigned		mem_bytes;
 
 	btree_path_idx_t	nr_sorted;
@@ -413,8 +412,6 @@ struct btree_trans {
 	unsigned long		srcu_lock_time;
 
 	const char		*fn;
-	struct closure		ref;
-	struct list_head	list;
 	struct btree_bkey_cached_common *locking;
 	struct six_lock_waiter	locking_wait;
 	int			srcu_idx;
@@ -424,7 +421,6 @@ struct btree_trans {
 	u16			journal_entries_size;
 	struct jset_entry	*journal_entries;
 
-	struct btree_insert_entry updates[BTREE_ITER_MAX];
 	struct btree_trans_commit_hook *hooks;
 	struct journal_entry_pin *journal_pin;
@@ -435,6 +431,13 @@ struct btree_trans {
 	unsigned		extra_disk_res; /* XXX kill */
 	struct replicas_delta_list *fs_usage_deltas;
 
+	/* Entries before this are zeroed out on every bch2_trans_get() call */
+
+	struct btree_insert_entry updates[BTREE_ITER_MAX];
+
+	struct list_head	list;
+	struct closure		ref;
+
 	unsigned long		_paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)];
 	struct btree_trans_paths trans_paths;
 	struct btree_path	_paths[BTREE_ITER_MAX];
...
@@ -592,7 +592,6 @@ static const struct file_operations cached_btree_nodes_ops = {
 	.read		= bch2_cached_btree_nodes_read,
 };
 
-#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
 static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
 					    size_t size, loff_t *ppos)
 {
@@ -608,7 +607,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
 restart:
 	seqmutex_lock(&c->btree_trans_lock);
 	list_for_each_entry(trans, &c->btree_trans_list, list) {
-		if (trans->locking_wait.task->pid <= i->iter)
+		struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+
+		if (!task || task->pid <= i->iter)
 			continue;
 
 		closure_get(&trans->ref);
@@ -626,11 +627,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
 		prt_printf(&i->buf, "backtrace:");
 		prt_newline(&i->buf);
 		printbuf_indent_add(&i->buf, 2);
-		bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
+		bch2_prt_task_backtrace(&i->buf, task);
 		printbuf_indent_sub(&i->buf, 2);
 		prt_newline(&i->buf);
 
-		i->iter = trans->locking_wait.task->pid;
+		i->iter = task->pid;
 
 		closure_put(&trans->ref);
@@ -654,7 +655,6 @@ static const struct file_operations btree_transactions_ops = {
 	.release	= bch2_dump_release,
 	.read		= bch2_btree_transactions_read,
 };
-#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
 
 static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
 				      size_t size, loff_t *ppos)
@@ -811,7 +811,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
 restart:
 	seqmutex_lock(&c->btree_trans_lock);
 	list_for_each_entry(trans, &c->btree_trans_list, list) {
-		if (trans->locking_wait.task->pid <= i->iter)
+		struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+
+		if (!task || task->pid <= i->iter)
 			continue;
 
 		closure_get(&trans->ref);
@@ -826,7 +828,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
 
 		bch2_check_for_deadlock(trans, &i->buf);
 
-		i->iter = trans->locking_wait.task->pid;
+		i->iter = task->pid;
 
 		closure_put(&trans->ref);
@@ -873,10 +875,8 @@ void bch2_fs_debug_init(struct bch_fs *c)
 	debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
 			    c->btree_debug, &cached_btree_nodes_ops);
 
-#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
 	debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
 			    c->btree_debug, &btree_transactions_ops);
-#endif
 
 	debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
 			    c->btree_debug, &journal_pins_ops);
...