Commit 4cb5300b authored by Chris Mason's avatar Chris Mason

Btrfs: add mount -o auto_defrag

This will detect small random writes into files and
queue them up for an auto defrag process.  It isn't well suited to
database workloads yet, but works for smaller files such as rpm, sqlite
or bdb databases.
Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
parent d6c0cb37
...@@ -153,6 +153,7 @@ struct btrfs_inode { ...@@ -153,6 +153,7 @@ struct btrfs_inode {
unsigned ordered_data_close:1; unsigned ordered_data_close:1;
unsigned orphan_meta_reserved:1; unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1; unsigned dummy_inode:1;
unsigned in_defrag:1;
/* /*
* always compress this one file * always compress this one file
......
...@@ -1074,6 +1074,11 @@ struct btrfs_fs_info { ...@@ -1074,6 +1074,11 @@ struct btrfs_fs_info {
/* all metadata allocations go through this cluster */ /* all metadata allocations go through this cluster */
struct btrfs_free_cluster meta_alloc_cluster; struct btrfs_free_cluster meta_alloc_cluster;
/* auto defrag inodes go here */
spinlock_t defrag_inodes_lock;
struct rb_root defrag_inodes;
atomic_t defrag_running;
spinlock_t ref_cache_lock; spinlock_t ref_cache_lock;
u64 total_ref_cache_size; u64 total_ref_cache_size;
...@@ -1205,6 +1210,38 @@ struct btrfs_root { ...@@ -1205,6 +1210,38 @@ struct btrfs_root {
struct super_block anon_super; struct super_block anon_super;
}; };
/*
 * Describes one defrag request over a byte range of a file.  A pointer to
 * this struct is the "range" argument of btrfs_defrag_file().
 *
 * NOTE(review): the name suggests this is also the userspace ioctl
 * argument layout; if so, field order and sizes must not change.
 */
struct btrfs_ioctl_defrag_range_args {
	/* byte offset at which the defrag operation starts */
	__u64 start;

	/* number of bytes to defrag; use (u64)-1 to mean "everything" */
	__u64 len;

	/*
	 * flags for the operation, which can include turning
	 * on compression for this one defrag
	 */
	__u64 flags;

	/*
	 * Any extent bigger than this is considered already
	 * defragged.  Use 0 to take the kernel default.
	 * Use 1 to say every single extent must be rewritten.
	 */
	__u32 extent_thresh;

	/*
	 * Which compression method to use if turning on compression
	 * for this defrag operation.  If unspecified, zlib will
	 * be used.
	 */
	__u32 compress_type;

	/* spare space reserved for later use */
	__u32 unused[4];
};
/* /*
* inode items have the data typically returned from stat and store other * inode items have the data typically returned from stat and store other
* info about object characteristics. There is one for every file and dir in * info about object characteristics. There is one for every file and dir in
...@@ -1302,6 +1339,7 @@ struct btrfs_root { ...@@ -1302,6 +1339,7 @@ struct btrfs_root {
#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
...@@ -2528,8 +2566,13 @@ extern const struct dentry_operations btrfs_dentry_operations; ...@@ -2528,8 +2566,13 @@ extern const struct dentry_operations btrfs_dentry_operations;
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
void btrfs_update_iflags(struct inode *inode); void btrfs_update_iflags(struct inode *inode);
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
/* file.c */ /* file.c */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct inode *inode);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, int datasync); int btrfs_sync_file(struct file *file, int datasync);
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned); int skip_pinned);
......
...@@ -1475,6 +1475,7 @@ static int cleaner_kthread(void *arg) ...@@ -1475,6 +1475,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_delayed_iputs(root); btrfs_run_delayed_iputs(root);
btrfs_clean_old_snapshots(root); btrfs_clean_old_snapshots(root);
mutex_unlock(&root->fs_info->cleaner_mutex); mutex_unlock(&root->fs_info->cleaner_mutex);
btrfs_run_defrag_inodes(root->fs_info);
} }
if (freezing(current)) { if (freezing(current)) {
...@@ -1616,6 +1617,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, ...@@ -1616,6 +1617,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->ref_cache_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
init_completion(&fs_info->kobj_unregister); init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root; fs_info->tree_root = tree_root;
...@@ -1638,9 +1640,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, ...@@ -1638,9 +1640,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
fs_info->sb = sb; fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024; fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0; fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->thread_pool_size = min_t(unsigned long, fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8); num_online_cpus() + 2, 8);
...@@ -2501,6 +2505,14 @@ int close_ctree(struct btrfs_root *root) ...@@ -2501,6 +2505,14 @@ int close_ctree(struct btrfs_root *root)
smp_mb(); smp_mb();
btrfs_scrub_cancel(root); btrfs_scrub_cancel(root);
/* wait for any defraggers to finish */
wait_event(fs_info->transaction_wait,
(atomic_read(&fs_info->defrag_running) == 0));
/* clear out the rbtree of defraggable inodes */
btrfs_run_defrag_inodes(root->fs_info);
btrfs_put_block_group_cache(fs_info); btrfs_put_block_group_cache(fs_info);
/* /*
......
...@@ -40,6 +40,263 @@ ...@@ -40,6 +40,263 @@
#include "locking.h" #include "locking.h"
#include "compat.h" #include "compat.h"
/*
 * When auto defrag is enabled we queue up these defrag structs to
 * remember which inodes need defragging passes.
 *
 * Records live in the fs_info->defrag_inodes rbtree, ordered by ->ino.
 */
struct inode_defrag {
	/* linkage into fs_info->defrag_inodes */
	struct rb_node rb_node;

	/* objectid of the inode (inode->i_ino); this is the rbtree key */
	u64 ino;

	/*
	 * transid where the defrag was added, we search for
	 * extents newer than this (handed to btrfs_defrag_file()
	 * as its newer_than argument)
	 */
	u64 transid;

	/* objectid of the root that the inode belongs to */
	u64 root;

	/*
	 * last offset we were able to defrag; the next pass starts
	 * its range here
	 */
	u64 last_offset;

	/* set once we've wrapped around back to offset zero */
	int cycled;
};
/* pop a record for an inode into the defrag tree. The lock
* must be held already
*
* If you're inserting a record for an older transid than an
* existing record, the transid already in the tree is lowered
*
* If an existing record is found the defrag item you
* pass in is freed
*/
static int __btrfs_add_inode_defrag(struct inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct inode_defrag *entry;
struct rb_node **p;
struct rb_node *parent = NULL;
p = &root->fs_info->defrag_inodes.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
if (defrag->ino < entry->ino)
p = &parent->rb_left;
else if (defrag->ino > entry->ino)
p = &parent->rb_right;
else {
/* if we're reinserting an entry for
* an old defrag run, make sure to
* lower the transid of our existing record
*/
if (defrag->transid < entry->transid)
entry->transid = defrag->transid;
if (defrag->last_offset > entry->last_offset)
entry->last_offset = defrag->last_offset;
goto exists;
}
}
BTRFS_I(inode)->in_defrag = 1;
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
return 0;
exists:
kfree(defrag);
return 0;
}
/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 *
 * @trans: running transaction, or NULL.  When NULL, the record's transid
 *         is taken from the inode root's last_trans instead.
 * @inode: inode to queue for a defrag pass.
 *
 * Nothing is queued (and 0 is returned) when the AUTO_DEFRAG mount option
 * is off, when the filesystem is closing, or when the inode is already
 * marked in_defrag.
 *
 * Returns 0 on success or when nothing needed to be queued, -ENOMEM if the
 * record could not be allocated.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct inode_defrag *defrag;
	int ret = 0;
	u64 transid;

	if (!btrfs_test_opt(root, AUTO_DEFRAG))
		return 0;

	if (root->fs_info->closing)
		return 0;

	/* unlocked fast path; re-checked under the lock below */
	if (BTRFS_I(inode)->in_defrag)
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = BTRFS_I(inode)->root->last_trans;

	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = inode->i_ino;
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;

	spin_lock(&root->fs_info->defrag_inodes_lock);
	if (!BTRFS_I(inode)->in_defrag) {
		/* ownership of defrag passes to the rbtree helper */
		ret = __btrfs_add_inode_defrag(inode, defrag);
	} else {
		/*
		 * Someone else queued this inode between the unlocked
		 * check and taking the lock.  Our record is not going
		 * into the tree, so free it here or it is leaked.
		 */
		kfree(defrag);
	}
	spin_unlock(&root->fs_info->defrag_inodes_lock);
	return ret;
}
/*
 * Look up the defrag record for inode number @ino.
 * Must be called with the defrag_inodes lock held.
 *
 * Returns the matching record, or NULL if there is none.  On a miss,
 * if @next is non-NULL, *@next is set to the rbtree node of the first
 * record whose inode number is greater than @ino, or to NULL when no
 * such record exists.  *@next is left untouched on an exact match.
 */
struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
					     struct rb_node **next)
{
	struct inode_defrag *cur = NULL;
	struct rb_node *node = info->defrag_inodes.rb_node;
	struct rb_node *last = NULL;

	/* ordinary binary search, remembering the last node visited */
	while (node) {
		last = node;
		cur = rb_entry(last, struct inode_defrag, rb_node);

		if (ino == cur->ino)
			return cur;

		if (ino < cur->ino)
			node = last->rb_left;
		else
			node = last->rb_right;
	}

	if (!next)
		return NULL;

	/*
	 * The last node visited is a neighbour of @ino in sort order.
	 * If it sorts before @ino, step forward until we reach a record
	 * past @ino or run off the end of the tree.
	 */
	while (last && ino > cur->ino) {
		last = rb_next(last);
		cur = rb_entry(last, struct inode_defrag, rb_node);
	}
	*next = last;

	return NULL;
}
/*
 * Run through the list of inodes in the FS that need defragging.
 *
 * Records are taken out of fs_info->defrag_inodes in ascending inode
 * number order, wrapping back to the start once the end of the tree is
 * reached, until the tree is empty.  Each record gets one batch of up to
 * defrag_batch pages of defrag work; a record whose batch was filled, or
 * which started part-way into the file and has not wrapped yet, is
 * queued again.
 *
 * When fs_info->closing is set the records are simply freed without
 * doing any defrag work, which is how unmount empties the tree.
 *
 * fs_info->defrag_running is raised for the duration of the call and
 * transaction_wait is woken on exit so that unmount can wait for us.
 *
 * Always returns 0.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct rb_node *n;
	struct btrfs_key key;
	struct btrfs_ioctl_defrag_range_args range;
	u64 first_ino = 0;
	int num_defrag;
	int defrag_batch = 1024;

	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;

	atomic_inc(&fs_info->defrag_running);
	spin_lock(&fs_info->defrag_inodes_lock);
	while (1) {
		n = NULL;

		/* find an inode to defrag */
		defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
		if (!defrag) {
			if (n) {
				/* no exact match, take the next higher inode */
				defrag = rb_entry(n, struct inode_defrag, rb_node);
			} else if (first_ino) {
				/* ran off the end, wrap around to the start */
				first_ino = 0;
				continue;
			} else {
				/* searched from zero and found nothing: done */
				break;
			}
		}

		/* remove it from the rbtree */
		first_ino = defrag->ino + 1;
		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);

		/* on unmount just drop the record, lock stays held */
		if (fs_info->closing)
			goto next_free;

		/* the lookups and the defrag itself run without the spinlock */
		spin_unlock(&fs_info->defrag_inodes_lock);

		/* get the inode */
		key.objectid = defrag->root;
		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
		key.offset = (u64)-1;

		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
		if (IS_ERR(inode_root))
			goto next;

		key.objectid = defrag->ino;
		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
		key.offset = 0;

		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
		if (IS_ERR(inode))
			goto next;

		/* do a chunk of defrag */
		BTRFS_I(inode)->in_defrag = 0;
		range.start = defrag->last_offset;
		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
					       defrag_batch);
		/*
		 * if we filled the whole defrag batch, there
		 * must be more work to do. Queue this defrag
		 * again
		 */
		if (num_defrag == defrag_batch) {
			defrag->last_offset = range.start;
			/*
			 * __btrfs_add_inode_defrag() modifies the rbtree and
			 * requires defrag_inodes_lock, which we dropped above.
			 */
			spin_lock(&fs_info->defrag_inodes_lock);
			__btrfs_add_inode_defrag(inode, defrag);
			spin_unlock(&fs_info->defrag_inodes_lock);
			/*
			 * we don't want to kfree defrag, it was either added
			 * back to the rbtree or already freed by the helper
			 */
			defrag = NULL;
		} else if (defrag->last_offset && !defrag->cycled) {
			/*
			 * we didn't fill our defrag batch, but
			 * we didn't start at zero. Make sure we loop
			 * around to the start of the file.
			 */
			defrag->last_offset = 0;
			defrag->cycled = 1;
			/* same locking requirement as above */
			spin_lock(&fs_info->defrag_inodes_lock);
			__btrfs_add_inode_defrag(inode, defrag);
			spin_unlock(&fs_info->defrag_inodes_lock);
			defrag = NULL;
		}

		iput(inode);
next:
		/* retake the lock for the next tree lookup */
		spin_lock(&fs_info->defrag_inodes_lock);
next_free:
		/* defrag is NULL here if the record was requeued */
		kfree(defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);

	atomic_dec(&fs_info->defrag_running);

	/*
	 * during unmount, we use the transaction_wait queue to
	 * wait for the defragger to stop
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}
/* simple helper to fault in pages and copy. This should go away /* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code. * and be replaced with calls into generic code.
......
...@@ -342,6 +342,10 @@ static noinline int compress_file_range(struct inode *inode, ...@@ -342,6 +342,10 @@ static noinline int compress_file_range(struct inode *inode,
int will_compress; int will_compress;
int compress_type = root->fs_info->compress_type; int compress_type = root->fs_info->compress_type;
/* if this is a small write inside eof, kick off a defragbot */
if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
btrfs_add_inode_defrag(NULL, inode);
actual_end = min_t(u64, isize, end + 1); actual_end = min_t(u64, isize, end + 1);
again: again:
will_compress = 0; will_compress = 0;
...@@ -799,6 +803,10 @@ static noinline int cow_file_range(struct inode *inode, ...@@ -799,6 +803,10 @@ static noinline int cow_file_range(struct inode *inode,
disk_num_bytes = num_bytes; disk_num_bytes = num_bytes;
ret = 0; ret = 0;
/* if this is a small write inside eof, kick off defrag */
if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
btrfs_add_inode_defrag(trans, inode);
if (start == 0) { if (start == 0) {
/* lets try to make an inline extent */ /* lets try to make an inline extent */
ret = cow_file_range_inline(trans, root, inode, ret = cow_file_range_inline(trans, root, inode,
...@@ -5371,6 +5379,9 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, ...@@ -5371,6 +5379,9 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
if (IS_ERR(trans)) if (IS_ERR(trans))
return ERR_CAST(trans); return ERR_CAST(trans);
if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
btrfs_add_inode_defrag(trans, inode);
trans->block_rsv = &root->fs_info->delalloc_block_rsv; trans->block_rsv = &root->fs_info->delalloc_block_rsv;
alloc_hint = get_extent_allocation_hint(inode, start, len); alloc_hint = get_extent_allocation_hint(inode, start, len);
...@@ -6682,6 +6693,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ...@@ -6682,6 +6693,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->ordered_data_close = 0; ei->ordered_data_close = 0;
ei->orphan_meta_reserved = 0; ei->orphan_meta_reserved = 0;
ei->dummy_inode = 0; ei->dummy_inode = 0;
ei->in_defrag = 0;
ei->force_compress = BTRFS_COMPRESS_NONE; ei->force_compress = BTRFS_COMPRESS_NONE;
ei->delayed_node = NULL; ei->delayed_node = NULL;
......
This diff is collapsed.
...@@ -181,37 +181,6 @@ struct btrfs_ioctl_clone_range_args { ...@@ -181,37 +181,6 @@ struct btrfs_ioctl_clone_range_args {
#define BTRFS_DEFRAG_RANGE_COMPRESS 1 #define BTRFS_DEFRAG_RANGE_COMPRESS 1
#define BTRFS_DEFRAG_RANGE_START_IO 2 #define BTRFS_DEFRAG_RANGE_START_IO 2
/*
 * Describes one defrag request over a byte range of a file.
 *
 * NOTE(review): the name suggests this is the userspace ioctl argument
 * layout; if so, field order and sizes must not change.
 */
struct btrfs_ioctl_defrag_range_args {
	/* byte offset at which the defrag operation starts */
	__u64 start;

	/* number of bytes to defrag; use (u64)-1 to mean "everything" */
	__u64 len;

	/*
	 * flags for the operation, which can include turning
	 * on compression for this one defrag
	 */
	__u64 flags;

	/*
	 * Any extent bigger than this is considered already
	 * defragged.  Use 0 to take the kernel default.
	 * Use 1 to say every single extent must be rewritten.
	 */
	__u32 extent_thresh;

	/*
	 * Which compression method to use if turning on compression
	 * for this defrag operation.  If unspecified, zlib will
	 * be used.
	 */
	__u32 compress_type;

	/* spare space reserved for later use */
	__u32 unused[4];
};
struct btrfs_ioctl_space_info { struct btrfs_ioctl_space_info {
__u64 flags; __u64 flags;
__u64 total_bytes; __u64 total_bytes;
......
...@@ -160,7 +160,7 @@ enum { ...@@ -160,7 +160,7 @@ enum {
Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
Opt_enospc_debug, Opt_subvolrootid, Opt_err, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err,
}; };
static match_table_t tokens = { static match_table_t tokens = {
...@@ -191,6 +191,7 @@ static match_table_t tokens = { ...@@ -191,6 +191,7 @@ static match_table_t tokens = {
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
{Opt_enospc_debug, "enospc_debug"}, {Opt_enospc_debug, "enospc_debug"},
{Opt_subvolrootid, "subvolrootid=%d"}, {Opt_subvolrootid, "subvolrootid=%d"},
{Opt_defrag, "autodefrag"},
{Opt_err, NULL}, {Opt_err, NULL},
}; };
...@@ -369,6 +370,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) ...@@ -369,6 +370,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_enospc_debug: case Opt_enospc_debug:
btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
break; break;
case Opt_defrag:
printk(KERN_INFO "btrfs: enabling auto defrag");
btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
break;
case Opt_err: case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option " printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p); "'%s'\n", p);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment