Commit a53fe257 authored by Chris Mason

Merge branch 'for-chris-4.5' of...

Merge branch 'for-chris-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux into for-linus-4.5
parents bb9d6876 e44081ef
...@@ -3684,11 +3684,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, ...@@ -3684,11 +3684,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
return -ENOMEM; return -ENOMEM;
/* /*
* We don't need the lock here since we are protected by the transaction * Even though we are in the critical section of the transaction commit,
* commit. We want to do the cache_save_setup first and then run the * we can still have concurrent tasks adding elements to this
* transaction's list of dirty block groups. These tasks correspond to
* endio free space workers started when writeback finishes for a
* space cache, which run inode.c:btrfs_finish_ordered_io(), and can
* allocate new block groups as a result of COWing nodes of the root
* tree when updating the free space inode. The writeback for the space
* caches is triggered by an earlier call to
* btrfs_start_dirty_block_groups() and iterations of the following
* loop.
* Also we want to do the cache_save_setup first and then run the
* delayed refs to make sure we have the best chance at doing this all * delayed refs to make sure we have the best chance at doing this all
* in one shot. * in one shot.
*/ */
spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) { while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs, cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache, struct btrfs_block_group_cache,
...@@ -3700,11 +3710,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, ...@@ -3700,11 +3710,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* finish and then do it all again * finish and then do it all again
*/ */
if (!list_empty(&cache->io_list)) { if (!list_empty(&cache->io_list)) {
spin_unlock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->io_list); list_del_init(&cache->io_list);
btrfs_wait_cache_io(root, trans, cache, btrfs_wait_cache_io(root, trans, cache,
&cache->io_ctl, path, &cache->io_ctl, path,
cache->key.objectid); cache->key.objectid);
btrfs_put_block_group(cache); btrfs_put_block_group(cache);
spin_lock(&cur_trans->dirty_bgs_lock);
} }
/* /*
...@@ -3712,6 +3724,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, ...@@ -3712,6 +3724,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* on any pending IO * on any pending IO
*/ */
list_del_init(&cache->dirty_list); list_del_init(&cache->dirty_list);
spin_unlock(&cur_trans->dirty_bgs_lock);
should_put = 1; should_put = 1;
cache_save_setup(cache, trans, path); cache_save_setup(cache, trans, path);
...@@ -3743,7 +3756,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, ...@@ -3743,7 +3756,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
/* if its not on the io list, we need to put the block group */ /* if its not on the io list, we need to put the block group */
if (should_put) if (should_put)
btrfs_put_block_group(cache); btrfs_put_block_group(cache);
spin_lock(&cur_trans->dirty_bgs_lock);
} }
spin_unlock(&cur_trans->dirty_bgs_lock);
while (!list_empty(io)) { while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache, cache = list_first_entry(io, struct btrfs_block_group_cache,
......
...@@ -66,6 +66,13 @@ struct btrfs_iget_args { ...@@ -66,6 +66,13 @@ struct btrfs_iget_args {
struct btrfs_root *root; struct btrfs_root *root;
}; };
/*
 * State tracked for a single direct IO write. btrfs_direct_IO() initializes
 * an instance and publishes it through current->journal_info;
 * btrfs_get_blocks_direct() and btrfs_submit_direct() update it as extents
 * are mapped and bios are submitted.
 */
struct btrfs_dio_data {
/*
 * NOTE(review): passed to adjust_dio_outstanding_extents(); the exact
 * accounting is not visible in this view - confirm against that helper.
 */
u64 outstanding_extents;
/*
 * Bytes of reserved data space not yet consumed by the write. Starts at
 * round_up(count, sectorsize), is decremented by the mapped length in
 * btrfs_get_blocks_direct(), and whatever is left on error is handed to
 * btrfs_delalloc_release_space() by btrfs_direct_IO().
 */
u64 reserve;
/*
 * Half-open file range [start, end) for which ordered extents may exist
 * without a bio having been submitted. Both ends start at the write offset;
 * btrfs_get_blocks_direct() advances the end to start + len, and
 * btrfs_submit_direct() collapses the range to zero length at
 * dip->logical_offset + dip->bytes. If the range is still non-empty when
 * btrfs_direct_IO() sees an error, it is passed to
 * btrfs_endio_direct_write_update_ordered() with uptodate == 0.
 */
u64 unsubmitted_oe_range_start;
u64 unsubmitted_oe_range_end;
};
static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations; static const struct inode_operations btrfs_dir_ro_inode_operations;
...@@ -7408,24 +7415,20 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, ...@@ -7408,24 +7415,20 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
btrfs_start_ordered_extent(inode, ordered, 1); btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered);
} else { } else {
/* Screw you mmap */
ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
if (ret)
break;
ret = filemap_fdatawait_range(inode->i_mapping,
lockstart,
lockend);
if (ret)
break;
/* /*
* If we found a page that couldn't be invalidated just * We could trigger writeback for this range (and wait
* fall back to buffered. * for it to complete) and then invalidate the pages for
* this range (through invalidate_inode_pages2_range()),
* but that can lead us to a deadlock with a concurrent
* call to readpages() (a buffered read or a defrag call
* triggered a readahead) on a page lock due to an
* ordered dio extent we created before but for which no
* corresponding bio was submitted yet (hence it can not
* complete), which makes readpages() wait for that
* ordered extent to complete while holding a lock on
* that page.
*/ */
ret = invalidate_inode_pages2_range(inode->i_mapping, ret = -ENOTBLK;
lockstart >> PAGE_CACHE_SHIFT,
lockend >> PAGE_CACHE_SHIFT);
if (ret)
break; break;
} }
...@@ -7482,11 +7485,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, ...@@ -7482,11 +7485,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
return em; return em;
} }
struct btrfs_dio_data {
u64 outstanding_extents;
u64 reserve;
};
static void adjust_dio_outstanding_extents(struct inode *inode, static void adjust_dio_outstanding_extents(struct inode *inode,
struct btrfs_dio_data *dio_data, struct btrfs_dio_data *dio_data,
const u64 len) const u64 len)
...@@ -7670,6 +7668,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ...@@ -7670,6 +7668,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
btrfs_free_reserved_data_space(inode, start, len); btrfs_free_reserved_data_space(inode, start, len);
WARN_ON(dio_data->reserve < len); WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len; dio_data->reserve -= len;
dio_data->unsubmitted_oe_range_end = start + len;
current->journal_info = dio_data; current->journal_info = dio_data;
} }
...@@ -7992,22 +7991,22 @@ static void btrfs_endio_direct_read(struct bio *bio) ...@@ -7992,22 +7991,22 @@ static void btrfs_endio_direct_read(struct bio *bio)
bio_put(bio); bio_put(bio);
} }
static void btrfs_endio_direct_write(struct bio *bio) static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
const u64 offset,
const u64 bytes,
const int uptodate)
{ {
struct btrfs_dio_private *dip = bio->bi_private;
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered = NULL; struct btrfs_ordered_extent *ordered = NULL;
u64 ordered_offset = dip->logical_offset; u64 ordered_offset = offset;
u64 ordered_bytes = dip->bytes; u64 ordered_bytes = bytes;
struct bio *dio_bio;
int ret; int ret;
again: again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset, &ordered_offset,
ordered_bytes, ordered_bytes,
!bio->bi_error); uptodate);
if (!ret) if (!ret)
goto out_test; goto out_test;
...@@ -8020,13 +8019,22 @@ static void btrfs_endio_direct_write(struct bio *bio) ...@@ -8020,13 +8019,22 @@ static void btrfs_endio_direct_write(struct bio *bio)
* our bio might span multiple ordered extents. If we haven't * our bio might span multiple ordered extents. If we haven't
* completed the accounting for the whole dio, go back and try again * completed the accounting for the whole dio, go back and try again
*/ */
if (ordered_offset < dip->logical_offset + dip->bytes) { if (ordered_offset < offset + bytes) {
ordered_bytes = dip->logical_offset + dip->bytes - ordered_bytes = offset + bytes - ordered_offset;
ordered_offset;
ordered = NULL; ordered = NULL;
goto again; goto again;
} }
dio_bio = dip->dio_bio; }
static void btrfs_endio_direct_write(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct bio *dio_bio = dip->dio_bio;
btrfs_endio_direct_write_update_ordered(dip->inode,
dip->logical_offset,
dip->bytes,
!bio->bi_error);
kfree(dip); kfree(dip);
...@@ -8334,6 +8342,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, ...@@ -8334,6 +8342,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip->subio_endio = btrfs_subio_endio_read; dip->subio_endio = btrfs_subio_endio_read;
} }
/*
* Reset the range for unsubmitted ordered extents (to a 0 length range)
* even if we fail to submit a bio, because in such case we do the
* corresponding error handling below and it must not be done a second
* time by btrfs_direct_IO().
*/
if (write) {
struct btrfs_dio_data *dio_data = current->journal_info;
dio_data->unsubmitted_oe_range_end = dip->logical_offset +
dip->bytes;
dio_data->unsubmitted_oe_range_start =
dio_data->unsubmitted_oe_range_end;
}
ret = btrfs_submit_direct_hook(rw, dip, skip_sum); ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret) if (!ret)
return; return;
...@@ -8362,24 +8385,15 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, ...@@ -8362,24 +8385,15 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip = NULL; dip = NULL;
io_bio = NULL; io_bio = NULL;
} else { } else {
if (write) { if (write)
struct btrfs_ordered_extent *ordered; btrfs_endio_direct_write_update_ordered(inode,
file_offset,
ordered = btrfs_lookup_ordered_extent(inode, dio_bio->bi_iter.bi_size,
file_offset); 0);
set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); else
/*
* Decrements our ref on the ordered extent and removes
* the ordered extent from the inode's ordered tree,
* doing all the proper resource cleanup such as for the
* reserved space and waking up any waiters for this
* ordered extent (through btrfs_remove_ordered_extent).
*/
btrfs_finish_ordered_io(ordered);
} else {
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1); file_offset + dio_bio->bi_iter.bi_size - 1);
}
dio_bio->bi_error = -EIO; dio_bio->bi_error = -EIO;
/* /*
* Releases and cleans up our dio_bio, no need to bio_put() * Releases and cleans up our dio_bio, no need to bio_put()
...@@ -8479,6 +8493,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ...@@ -8479,6 +8493,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* originally calculated. Abuse current->journal_info for this. * originally calculated. Abuse current->journal_info for this.
*/ */
dio_data.reserve = round_up(count, root->sectorsize); dio_data.reserve = round_up(count, root->sectorsize);
dio_data.unsubmitted_oe_range_start = (u64)offset;
dio_data.unsubmitted_oe_range_end = (u64)offset;
current->journal_info = &dio_data; current->journal_info = &dio_data;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) { &BTRFS_I(inode)->runtime_flags)) {
...@@ -8497,6 +8513,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ...@@ -8497,6 +8513,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (dio_data.reserve) if (dio_data.reserve)
btrfs_delalloc_release_space(inode, offset, btrfs_delalloc_release_space(inode, offset,
dio_data.reserve); dio_data.reserve);
/*
* On error we might have left some ordered extents
* without submitting corresponding bios for them, so
* clean them up to avoid other tasks getting them
* and waiting for them to complete forever.
*/
if (dio_data.unsubmitted_oe_range_start <
dio_data.unsubmitted_oe_range_end)
btrfs_endio_direct_write_update_ordered(inode,
dio_data.unsubmitted_oe_range_start,
dio_data.unsubmitted_oe_range_end -
dio_data.unsubmitted_oe_range_start,
0);
} else if (ret >= 0 && (size_t)ret < count) } else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, offset, btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret); count - (size_t)ret);
......
...@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) ...@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
list_del_init(&em->list); list_del_init(&em->list);
free_extent_map(em); free_extent_map(em);
} }
/*
* If any block groups are found in ->deleted_bgs then it's
* because the transaction was aborted and a commit did not
* happen (things failed before writing the new superblock
* and calling btrfs_finish_extent_commit()), so we can not
* discard the physical locations of the block groups.
*/
while (!list_empty(&transaction->deleted_bgs)) {
struct btrfs_block_group_cache *cache;
cache = list_first_entry(&transaction->deleted_bgs,
struct btrfs_block_group_cache,
bg_list);
list_del_init(&cache->bg_list);
btrfs_put_block_group_trimming(cache);
btrfs_put_block_group(cache);
}
kmem_cache_free(btrfs_transaction_cachep, transaction); kmem_cache_free(btrfs_transaction_cachep, transaction);
} }
} }
......
...@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, ...@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out; goto out;
} }
btrfs_release_path(path); btrfs_release_path(path);
/*
* We don't need a lock on a leaf. btrfs_realloc_node() will lock all
* leaves from path->nodes[1], so set lowest_level to 1 to avoid a later
* deadlock (attempting to write lock an already write locked leaf).
*/
path->lowest_level = 1;
wret = btrfs_search_slot(trans, root, &key, path, 0, 1); wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (wret < 0) { if (wret < 0) {
...@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, ...@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
ret = 0; ret = 0;
goto out; goto out;
} }
path->slots[1] = btrfs_header_nritems(path->nodes[1]); /*
next_key_ret = btrfs_find_next_key(root, path, &key, 1, * The node at level 1 must always be locked when our path has
min_trans); * keep_locks set and lowest_level is 1, regardless of the value of
* path->slots[1].
*/
BUG_ON(path->locks[1] == 0);
ret = btrfs_realloc_node(trans, root, ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0, path->nodes[1], 0,
&last_ret, &last_ret,
...@@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, ...@@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
WARN_ON(ret == -EAGAIN); WARN_ON(ret == -EAGAIN);
goto out; goto out;
} }
/*
* Now that we reallocated the node we can find the next key. Note that
* btrfs_find_next_key() can release our path and do another search
* without COWing, this is because even with path->keep_locks = 1,
* btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
* node when path->slots[node_level - 1] does not point to the last
* item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
* we search for the next key after reallocating our node.
*/
path->slots[1] = btrfs_header_nritems(path->nodes[1]);
next_key_ret = btrfs_find_next_key(root, path, &key, 1,
min_trans);
if (next_key_ret == 0) { if (next_key_ret == 0) {
memcpy(&root->defrag_progress, &key, sizeof(key)); memcpy(&root->defrag_progress, &key, sizeof(key));
ret = -EAGAIN; ret = -EAGAIN;
......
...@@ -4825,19 +4825,31 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, ...@@ -4825,19 +4825,31 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
goto out; goto out;
} }
/*
* Take the device list mutex to prevent races with the final phase of
* a device replace operation that replaces the device object associated
* with the map's stripes, because the device object's id can change
* at any time during that final phase of the device replace operation
* (dev-replace.c:btrfs_dev_replace_finishing()).
*/
mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) { for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev; device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical; dev_offset = map->stripes[i].physical;
ret = btrfs_update_device(trans, device); ret = btrfs_update_device(trans, device);
if (ret) if (ret)
goto out; break;
ret = btrfs_alloc_dev_extent(trans, device, ret = btrfs_alloc_dev_extent(trans, device,
chunk_root->root_key.objectid, chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
chunk_offset, dev_offset, chunk_offset, dev_offset,
stripe_size); stripe_size);
if (ret) if (ret)
break;
}
if (ret) {
mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
goto out; goto out;
} }
...@@ -4851,6 +4863,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, ...@@ -4851,6 +4863,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++; stripe++;
} }
mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
btrfs_set_stack_chunk_length(chunk, chunk_size); btrfs_set_stack_chunk_length(chunk, chunk_size);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment