Commit d20f7043 authored by Chris Mason

Btrfs: move data checksumming into a dedicated tree

Btrfs stores checksums for each data block.  Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block.  This means that when we read the inode,
we've probably read in at least some checksums as well.

But, this has a few problems:

* The checksums are indexed by logical offset in the file.  When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data.  It would be faster if we could checksum
the compressed data instead.

* If we implement encryption, we'll be checksumming the plain text and
storing that on disk.  This is significantly less secure.

* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct.  This makes the raid
layer balancing and extent moving much more expensive.

* It makes the front end caching code more complex, as we have to touch
the subvolume and inodes as we cache extents.

* There is potentially one copy of the checksum in each subvolume
referencing an extent.

The solution used here is to store the extent checksums in a dedicated
tree.  This allows us to index the checksums by physical extent
start and length.  It means:

* The checksum is against the data stored on disk, after any compression
or encryption is done.

* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.

This makes compression significantly faster by reducing the amount of
data that needs to be checksummed.  It will also allow much faster
raid management code in general.

The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent.  This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent c99e905c
......@@ -69,11 +69,27 @@ struct compressed_bio {
/* IO errors */
int errors;
int mirror_num;
/* for reads, this is the bio we are copying the data into */
struct bio *orig_bio;
/*
* the start of a variable length array of checksums only
* used by reads
*/
u32 sums;
};
/*
 * Number of bytes to allocate for a struct compressed_bio that describes
 * disk_size bytes of compressed data.
 *
 * The result is sizeof(struct compressed_bio) plus one checksum of the
 * super block's csum size for every sector touched by disk_size (disk_size
 * is rounded up to a whole number of root->sectorsize sectors).
 */
static inline int compressed_bio_size(struct btrfs_root *root,
				      unsigned long disk_size)
{
	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);

	/* sectors covered (rounded up) * bytes per csum, plus the header */
	return ((disk_size + root->sectorsize - 1) / root->sectorsize) *
		csum_size +
	       sizeof(struct compressed_bio);
}
static struct bio *compressed_bio_alloc(struct block_device *bdev,
u64 first_byte, gfp_t gfp_flags)
{
......@@ -96,6 +112,47 @@ static struct bio *compressed_bio_alloc(struct block_device *bdev,
return bio;
}
/*
 * Verify the checksums of the compressed pages attached to cb.
 *
 * Each of the cb->nr_pages entries in cb->compressed_pages is checksummed
 * over PAGE_CACHE_SIZE bytes and compared against consecutive u32 values
 * starting at cb->sums.  disk_start is only used in the failure message.
 *
 * Returns 0 when checksumming is disabled (NODATASUM on the mount or on the
 * inode) or when every page matches, and -EIO on the first mismatch.
 */
static int check_compressed_csum(struct inode *inode,
				 struct compressed_bio *cb,
				 u64 disk_start)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u32 *expected = &cb->sums;
	unsigned long pg;

	if (btrfs_test_opt(root, NODATASUM) ||
	    btrfs_test_flag(inode, NODATASUM))
		return 0;

	for (pg = 0; pg < cb->nr_pages; pg++, expected++) {
		char *kaddr;
		u32 csum = ~(u32)0;

		/* checksum the page contents through a temporary mapping */
		kaddr = kmap_atomic(cb->compressed_pages[pg], KM_USER0);
		csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
		btrfs_csum_final(csum, (char *)&csum);
		kunmap_atomic(kaddr, KM_USER0);

		if (csum == *expected)
			continue;

		printk("btrfs csum failed ino %lu extent %llu csum %u "
		       "wanted %u mirror %d\n", inode->i_ino,
		       (unsigned long long)disk_start,
		       csum, *expected, cb->mirror_num);
		return -EIO;
	}

	return 0;
}
/* when we finish reading compressed pages from the disk, we
* decompress them and then run the bio end_io routines on the
* decompressed pages (in the inode address space).
......@@ -124,16 +181,21 @@ static void end_compressed_bio_read(struct bio *bio, int err)
if (!atomic_dec_and_test(&cb->pending_bios))
goto out;
inode = cb->inode;
ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
if (ret)
goto csum_failed;
/* ok, we're the last bio for this extent, lets start
* the decompression.
*/
inode = cb->inode;
tree = &BTRFS_I(inode)->io_tree;
ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
cb->start,
cb->orig_bio->bi_io_vec,
cb->orig_bio->bi_vcnt,
cb->compressed_len);
csum_failed:
if (ret)
cb->errors = 1;
......@@ -148,8 +210,21 @@ static void end_compressed_bio_read(struct bio *bio, int err)
/* do io completion on the original bio */
if (cb->errors) {
bio_io_error(cb->orig_bio);
} else
} else {
int bio_index = 0;
struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
/*
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
while(bio_index < cb->orig_bio->bi_vcnt) {
SetPageChecked(bvec->bv_page);
bvec++;
bio_index++;
}
bio_endio(cb->orig_bio, 0);
}
/* finally free the cb struct */
kfree(cb->compressed_pages);
......@@ -277,12 +352,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
int ret;
WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
cb = kmalloc(sizeof(*cb), GFP_NOFS);
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
atomic_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
cb->start = start;
cb->len = len;
cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
cb->orig_bio = NULL;
......@@ -290,9 +366,6 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
ret = btrfs_csum_file_bytes(root, inode, start, len);
BUG_ON(ret);
bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
......@@ -325,6 +398,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
BUG_ON(ret);
ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
BUG_ON(ret);
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
BUG_ON(ret);
......@@ -348,6 +424,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
BUG_ON(ret);
ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
BUG_ON(ret);
ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
BUG_ON(ret);
......@@ -510,6 +589,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_start;
struct extent_map *em;
int ret;
u32 *sums;
tree = &BTRFS_I(inode)->io_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
......@@ -521,15 +601,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
PAGE_CACHE_SIZE);
spin_unlock(&em_tree->lock);
cb = kmalloc(sizeof(*cb), GFP_NOFS);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
atomic_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
sums = &cb->sums;
cb->start = em->orig_start;
compressed_len = em->block_len;
em_len = em->len;
em_start = em->start;
free_extent_map(em);
em = NULL;
......@@ -551,11 +634,6 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
add_ra_bio_pages(inode, em_start + em_len, cb);
if (!btrfs_test_opt(root, NODATASUM) &&
!btrfs_test_flag(inode, NODATASUM)) {
btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
}
/* include any pages we added in add_ra-bio_pages */
uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
cb->len = uncompressed_len;
......@@ -568,6 +646,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
for (page_index = 0; page_index < nr_pages; page_index++) {
page = cb->compressed_pages[page_index];
page->mapping = inode->i_mapping;
page->index = em_start >> PAGE_CACHE_SHIFT;
if (comp_bio->bi_size)
ret = tree->ops->merge_bio_hook(page, 0,
PAGE_CACHE_SIZE,
......@@ -591,7 +671,16 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
*/
atomic_inc(&cb->pending_bios);
ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
if (!btrfs_test_opt(root, NODATASUM) &&
!btrfs_test_flag(inode, NODATASUM)) {
btrfs_lookup_bio_sums(root, inode, comp_bio,
sums);
}
sums += (comp_bio->bi_size + root->sectorsize - 1) /
root->sectorsize;
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
BUG_ON(ret);
bio_put(comp_bio);
......@@ -610,7 +699,12 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
BUG_ON(ret);
ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
if (!btrfs_test_opt(root, NODATASUM) &&
!btrfs_test_flag(inode, NODATASUM)) {
btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
}
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
BUG_ON(ret);
bio_put(comp_bio);
......
......@@ -73,6 +73,9 @@ struct btrfs_ordered_sum;
/* directory objectid inside the root tree */
#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
/* orphan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
......@@ -84,6 +87,13 @@ struct btrfs_ordered_sum;
#define BTRFS_TREE_RELOC_OBJECTID -8ULL
#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
/*
* extent checksums all have this objectid
* this allows them to share the logging tree
* for fsyncs
*/
#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
/* dummy objectid represents multiple objectids */
#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
......@@ -634,6 +644,7 @@ struct btrfs_fs_info {
struct btrfs_root *chunk_root;
struct btrfs_root *dev_root;
struct btrfs_root *fs_root;
struct btrfs_root *csum_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
......@@ -716,6 +727,7 @@ struct btrfs_fs_info {
struct btrfs_workers workers;
struct btrfs_workers delalloc_workers;
struct btrfs_workers endio_workers;
struct btrfs_workers endio_meta_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
/*
......@@ -858,13 +870,12 @@ struct btrfs_root {
* extent data is for file data
*/
#define BTRFS_EXTENT_DATA_KEY 108
/*
* csum items have the checksums for data in the extents
* extent csums are stored in a separate tree and hold csums for
* an entire extent on disk.
*/
#define BTRFS_CSUM_ITEM_KEY 120
/* reserve 21-31 for other file/dir stuff */
#define BTRFS_EXTENT_CSUM_KEY 128
/*
* root items point to tree roots. There are typically in the root
......@@ -1917,7 +1928,7 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
/* file-item.c */
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio);
struct bio *bio, u32 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
......@@ -1929,17 +1940,16 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 objectid,
u64 bytenr, int mod);
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio);
struct bio *bio, u64 file_start, int contig);
int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
u64 start, unsigned long len);
struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
u64 objectid, u64 offset,
int cow);
u64 bytenr, int cow);
int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
u64 isize);
......
......@@ -445,11 +445,18 @@ static void end_workqueue_bio(struct bio *bio, int err)
end_io_wq->error = err;
end_io_wq->work.func = end_workqueue_fn;
end_io_wq->work.flags = 0;
if (bio->bi_rw & (1 << BIO_RW))
if (bio->bi_rw & (1 << BIO_RW)) {
btrfs_queue_worker(&fs_info->endio_write_workers,
&end_io_wq->work);
else
btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
} else {
if (end_io_wq->metadata)
btrfs_queue_worker(&fs_info->endio_meta_workers,
&end_io_wq->work);
else
btrfs_queue_worker(&fs_info->endio_workers,
&end_io_wq->work);
}
}
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
......@@ -1208,6 +1215,9 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
info = (struct btrfs_fs_info *)bdi->unplug_io_data;
list_for_each(cur, &info->fs_devices->devices) {
device = list_entry(cur, struct btrfs_device, dev_list);
if (!device->bdev)
continue;
bdi = blk_get_backing_dev_info(device->bdev);
if (bdi->unplug_io_fn) {
bdi->unplug_io_fn(bdi, page);
......@@ -1344,7 +1354,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
* blocksize <= pagesize, it is basically a noop
*/
if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
btrfs_queue_worker(&fs_info->endio_workers,
btrfs_queue_worker(&fs_info->endio_meta_workers,
&end_io_wq->work);
return;
}
......@@ -1454,6 +1464,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
struct buffer_head *bh;
struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
......@@ -1470,7 +1482,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_super_block *disk_super;
if (!extent_root || !tree_root || !fs_info ||
!chunk_root || !dev_root) {
!chunk_root || !dev_root || !csum_root) {
err = -ENOMEM;
goto fail;
}
......@@ -1487,6 +1499,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
fs_info->extent_root = extent_root;
fs_info->csum_root = csum_root;
fs_info->chunk_root = chunk_root;
fs_info->dev_root = dev_root;
fs_info->fs_devices = fs_devices;
......@@ -1652,6 +1665,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
btrfs_init_workers(&fs_info->endio_workers, "endio",
fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
fs_info->thread_pool_size);
......@@ -1667,6 +1682,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_start_workers(&fs_info->delalloc_workers, 1);
btrfs_start_workers(&fs_info->fixup_workers, 1);
btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->endio_meta_workers,
fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->endio_write_workers,
fs_info->thread_pool_size);
......@@ -1751,6 +1768,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (ret)
goto fail_extent_root;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_CSUM_TREE_OBJECTID, csum_root);
if (ret)
goto fail_extent_root;
csum_root->track_dirty = 1;
btrfs_read_block_groups(extent_root);
fs_info->generation = generation + 1;
......@@ -1761,7 +1785,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
if (!fs_info->cleaner_kthread)
goto fail_extent_root;
goto fail_csum_root;
fs_info->transaction_kthread = kthread_run(transaction_kthread,
tree_root,
......@@ -1825,6 +1849,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_csum_root:
free_extent_buffer(csum_root->node);
fail_extent_root:
free_extent_buffer(extent_root->node);
fail_tree_root:
......@@ -1838,6 +1864,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_meta_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
fail_iput:
......@@ -1853,6 +1880,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
kfree(fs_info);
kfree(chunk_root);
kfree(dev_root);
kfree(csum_root);
return ERR_PTR(err);
}
......@@ -2131,6 +2159,9 @@ int close_ctree(struct btrfs_root *root)
if (root->fs_info->dev_root->node);
free_extent_buffer(root->fs_info->dev_root->node);
if (root->fs_info->csum_root->node);
free_extent_buffer(root->fs_info->csum_root->node);
btrfs_free_block_groups(root->fs_info);
del_fs_roots(fs_info);
......@@ -2141,6 +2172,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
btrfs_stop_workers(&fs_info->endio_meta_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
......@@ -2163,6 +2195,7 @@ int close_ctree(struct btrfs_root *root)
kfree(fs_info->tree_root);
kfree(fs_info->chunk_root);
kfree(fs_info->dev_root);
kfree(fs_info->csum_root);
return 0;
}
......
......@@ -1732,6 +1732,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
int whole_page;
int ret;
if (err)
uptodate = 0;
do {
struct page *page = bvec->bv_page;
tree = &BTRFS_I(page->mapping->host)->io_tree;
......@@ -1761,6 +1764,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (ret == 0) {
uptodate =
test_bit(BIO_UPTODATE, &bio->bi_flags);
if (err)
uptodate = 0;
continue;
}
}
......
This diff is collapsed.
......@@ -1221,7 +1221,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
ret = btrfs_csum_one_bio(root, inode, bio);
ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
BUG_ON(ret);
return 0;
}
......@@ -1259,12 +1259,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
btrfs_test_flag(inode, NODATASUM);
if (!(rw & (1 << BIO_RW))) {
if (bio_flags & EXTENT_BIO_COMPRESSED)
if (bio_flags & EXTENT_BIO_COMPRESSED) {
return btrfs_submit_compressed_read(inode, bio,
mirror_num, bio_flags);
else if (!skip_sum)
btrfs_lookup_bio_sums(root, inode, bio);
} else if (!skip_sum)
btrfs_lookup_bio_sums(root, inode, bio, NULL);
goto mapit;
} else if (!skip_sum) {
/* we're doing a write, do the async checksumming */
......@@ -1292,8 +1291,8 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
btrfs_set_trans_block_group(trans, inode);
list_for_each(cur, list) {
sum = list_entry(cur, struct btrfs_ordered_sum, list);
btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
inode, sum);
btrfs_csum_file_blocks(trans,
BTRFS_I(inode)->root->fs_info->csum_root, sum);
}
return 0;
}
......@@ -1545,6 +1544,7 @@ struct io_failure_record {
u64 start;
u64 len;
u64 logical;
unsigned long bio_flags;
int last_mirror;
};
......@@ -1563,7 +1563,6 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
int ret;
int rw;
u64 logical;
unsigned long bio_flags = 0;
ret = get_state_private(failure_tree, start, &private);
if (ret) {
......@@ -1573,6 +1572,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
failrec->start = start;
failrec->len = end - start + 1;
failrec->last_mirror = 0;
failrec->bio_flags = 0;
spin_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
......@@ -1588,8 +1588,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
}
logical = start - em->start;
logical = em->block_start + logical;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
bio_flags = EXTENT_BIO_COMPRESSED;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
failrec->bio_flags = EXTENT_BIO_COMPRESSED;
}
failrec->logical = logical;
free_extent_map(em);
set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
......@@ -1626,6 +1628,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
bio->bi_sector = failrec->logical >> 9;
bio->bi_bdev = failed_bio->bi_bdev;
bio->bi_size = 0;
bio_add_page(bio, page, failrec->len, start - page_offset(page));
if (failed_bio->bi_rw & (1 << BIO_RW))
rw = WRITE;
......@@ -1634,7 +1637,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
failrec->last_mirror,
bio_flags);
failrec->bio_flags);
return 0;
}
......@@ -1688,9 +1691,14 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
u32 csum = ~(u32)0;
unsigned long flags;
if (PageChecked(page)) {
ClearPageChecked(page);
goto good;
}
if (btrfs_test_opt(root, NODATASUM) ||
btrfs_test_flag(inode, NODATASUM))
return 0;
if (state && state->start == start) {
private = state->private;
ret = 0;
......@@ -1709,7 +1717,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
}
kunmap_atomic(kaddr, KM_IRQ0);
local_irq_restore(flags);
good:
/* if the io failure tree for this inode is non-empty,
* check to see if we've recovered from a failed IO
*/
......@@ -2243,6 +2251,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
return err;
}
#if 0
/*
* when truncating bytes in a file, it is possible to avoid reading
* the leaves that contain only checksum items. This can be the
......@@ -2410,6 +2419,8 @@ static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
return ret;
}
#endif
/*
* this can truncate away extent items, csum items and directory items.
* It starts at a high offset and removes keys until it can't find
......@@ -2459,9 +2470,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
btrfs_init_path(path);
ret = drop_csum_leaves(trans, root, path, inode, new_size);
BUG_ON(ret);
search_again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0) {
......@@ -2509,16 +2517,11 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
}
item_end--;
}
if (found_type == BTRFS_CSUM_ITEM_KEY) {
ret = btrfs_csum_truncate(trans, root, path,
new_size);
BUG_ON(ret);
}
if (item_end < new_size) {
if (found_type == BTRFS_DIR_ITEM_KEY) {
found_type = BTRFS_INODE_ITEM_KEY;
} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
found_type = BTRFS_CSUM_ITEM_KEY;
found_type = BTRFS_EXTENT_DATA_KEY;
} else if (found_type == BTRFS_EXTENT_DATA_KEY) {
found_type = BTRFS_XATTR_ITEM_KEY;
} else if (found_type == BTRFS_XATTR_ITEM_KEY) {
......
......@@ -714,8 +714,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
u64 hint_byte;
u16 csum_size =
btrfs_super_csum_size(&root->fs_info->super_copy);
/*
* TODO:
* - split compressed inline extents. annoying: we need to
......@@ -833,7 +832,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
key.objectid != src->i_ino)
break;
......@@ -958,56 +957,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_mark_buffer_dirty(leaf);
}
if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
u32 size;
struct btrfs_key new_key;
u64 coverslen;
int coff, clen;
size = btrfs_item_size_nr(leaf, slot);
coverslen = (size / csum_size) <<
root->fs_info->sb->s_blocksize_bits;
printk("csums for %llu~%llu\n",
key.offset, coverslen);
if (key.offset + coverslen < off ||
key.offset >= off+len)
goto next;
read_extent_buffer(leaf, buf,
btrfs_item_ptr_offset(leaf, slot),
size);
btrfs_release_path(root, path);
coff = 0;
if (off > key.offset)
coff = ((off - key.offset) >>
root->fs_info->sb->s_blocksize_bits) *
csum_size;
clen = size - coff;
if (key.offset + coverslen > off+len)
clen -= ((key.offset+coverslen-off-len) >>
root->fs_info->sb->s_blocksize_bits) *
csum_size;
printk(" will dup %d~%d of %d\n",
coff, clen, size);
memcpy(&new_key, &key, sizeof(new_key));
new_key.objectid = inode->i_ino;
new_key.offset = key.offset + destoff - off;
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, clen);
if (ret)
goto out;
leaf = path->nodes[0];
slot = path->slots[0];
write_extent_buffer(leaf, buf + coff,
btrfs_item_ptr_offset(leaf, slot),
clen);
btrfs_mark_buffer_dirty(leaf);
}
next:
btrfs_release_path(root, path);
key.offset++;
......
......@@ -610,7 +610,8 @@ int btrfs_ordered_update_i_size(struct inode *inode,
* try to find a checksum. This is used because we allow pages to
* be reclaimed before their checksum is actually put into the btree
*/
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum)
{
struct btrfs_ordered_sum *ordered_sum;
struct btrfs_sector_sum *sector_sums;
......@@ -629,11 +630,11 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
mutex_lock(&tree->mutex);
list_for_each_prev(cur, &ordered->list) {
ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
if (offset >= ordered_sum->file_offset) {
if (disk_bytenr >= ordered_sum->bytenr) {
num_sectors = ordered_sum->len / sectorsize;
sector_sums = ordered_sum->sums;
for (i = 0; i < num_sectors; i++) {
if (sector_sums[i].offset == offset) {
if (sector_sums[i].bytenr == disk_bytenr) {
*sum = sector_sums[i].sum;
ret = 0;
goto out;
......
......@@ -33,15 +33,17 @@ struct btrfs_ordered_inode_tree {
* the ordered extent are on disk
*/
struct btrfs_sector_sum {
u64 offset;
/* bytenr on disk */
u64 bytenr;
u32 sum;
};
struct btrfs_ordered_sum {
u64 file_offset;
/* bytenr is the start of this extent on disk */
u64 bytenr;
/*
* this is the length in bytes covered by the sums array below.
* But, the sums array may not be contiguous in the file.
*/
unsigned long len;
struct list_head list;
......@@ -147,7 +149,7 @@ struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
int btrfs_ordered_update_i_size(struct inode *inode,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
......
......@@ -934,24 +934,17 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
unsigned long file_bytes;
struct btrfs_ordered_sum *sums;
struct btrfs_sector_sum *sector_sum;
struct inode *inode;
unsigned long ptr;
file_bytes = (item_size / csum_size) * root->sectorsize;
inode = read_one_inode(root, key->objectid);
if (!inode) {
return -EIO;
}
sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
if (!sums) {
iput(inode);
return -ENOMEM;
}
INIT_LIST_HEAD(&sums->list);
sums->len = file_bytes;
sums->file_offset = key->offset;
sums->bytenr = key->offset;
/*
* copy all the sums into the ordered sum struct
......@@ -960,7 +953,7 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
cur_offset = key->offset;
ptr = btrfs_item_ptr_offset(eb, slot);
while(item_size > 0) {
sector_sum->offset = cur_offset;
sector_sum->bytenr = cur_offset;
read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
sector_sum++;
item_size -= csum_size;
......@@ -969,11 +962,9 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
}
/* let btrfs_csum_file_blocks add them into the file */
ret = btrfs_csum_file_blocks(trans, root, inode, sums);
ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums);
BUG_ON(ret);
kfree(sums);
iput(inode);
return 0;
}
/*
......@@ -1670,7 +1661,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
ret = replay_one_extent(wc->trans, root, path,
eb, i, &key);
BUG_ON(ret);
} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
} else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
ret = replay_one_csum(wc->trans, root, path,
eb, i, &key);
BUG_ON(ret);
......@@ -2466,6 +2457,85 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
return 0;
}
/*
 * Gather the checksums for the on-disk byte range
 * [disk_bytenr, disk_bytenr + len) into a newly allocated
 * btrfs_ordered_sum and append it to list.
 *
 * The checksum items are looked up in root, one sector (root->sectorsize)
 * at a time.  A sector whose checksum item cannot be found is recorded with
 * a sum of 0 and a message is printed; the walk then continues with the
 * next sector.
 *
 * trans is not used by this function.  The allocated btrfs_ordered_sum is
 * left on list and is not freed here.
 *
 * Returns 0 on success or -ENOMEM if the path or the sums buffer cannot be
 * allocated (in which case nothing is added to list).
 */
static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
				      struct list_head *list,
				      struct btrfs_root *root,
				      u64 disk_bytenr, u64 len)
{
	struct btrfs_ordered_sum *sums;
	struct btrfs_sector_sum *sector_sum;
	struct btrfs_path *path;
	struct btrfs_csum_item *item = NULL;
	u64 end = disk_bytenr + len;
	u64 item_start_offset = 0;
	u64 item_last_offset = 0;
	u32 diff;
	u32 sum;
	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);

	/*
	 * Do both allocations up front and check them, so a failure cannot
	 * lead to a NULL dereference or leave a half set up entry on list.
	 */
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
	if (!sums) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	sector_sum = sums->sums;
	sums->bytenr = disk_bytenr;
	sums->len = len;
	list_add_tail(&sums->list, list);

	while (disk_bytenr < end) {
		/*
		 * Look up a new csum item only when we have none cached or
		 * the current sector falls outside the cached item's range.
		 */
		if (!item || disk_bytenr < item_start_offset ||
		    disk_bytenr >= item_last_offset) {
			struct btrfs_key found_key;
			u32 item_size;

			if (item)
				btrfs_release_path(root, path);
			item = btrfs_lookup_csum(NULL, root, path,
						 disk_bytenr, 0);
			if (IS_ERR(item)) {
				/*
				 * Lookup failures are not fatal: log the
				 * sector with a zero checksum and move on.
				 * NOTE(review): errors other than -ENOENT and
				 * -EFBIG are swallowed here as well, exactly
				 * as before -- confirm whether they should be
				 * propagated to the caller instead.
				 */
				sum = 0;
				printk("log no csum found for byte %llu\n",
				       (unsigned long long)disk_bytenr);
				item = NULL;
				btrfs_release_path(root, path);
				goto found;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);

			/* byte range covered by the item we just found */
			item_start_offset = found_key.offset;
			item_size = btrfs_item_size_nr(path->nodes[0],
						       path->slots[0]);
			item_last_offset = item_start_offset +
				(item_size / csum_size) *
				root->sectorsize;
			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_csum_item);
		}
		/*
		 * this byte range must be able to fit inside
		 * a single leaf so it will also fit inside a u32
		 */
		diff = disk_bytenr - item_start_offset;
		diff = diff / root->sectorsize;
		diff = diff * csum_size;

		read_extent_buffer(path->nodes[0], &sum,
				   ((unsigned long)item) + diff,
				   csum_size);
found:
		sector_sum->bytenr = disk_bytenr;
		sector_sum->sum = sum;
		disk_bytenr += root->sectorsize;
		sector_sum++;
	}

	btrfs_free_path(path);
	return 0;
}
static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *dst_path,
......@@ -2481,6 +2551,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
u32 *ins_sizes;
char *ins_data;
int i;
struct list_head ordered_sums;
INIT_LIST_HEAD(&ordered_sums);
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
......@@ -2535,6 +2608,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
extent);
u64 dl = btrfs_file_extent_disk_num_bytes(src,
extent);
u64 cs = btrfs_file_extent_offset(src, extent);
u64 cl = btrfs_file_extent_num_bytes(src,
extent);;
/* ds == 0 is a hole */
if (ds != 0) {
ret = btrfs_inc_extent_ref(trans, log,
......@@ -2544,6 +2620,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
trans->transid,
ins_keys[i].objectid);
BUG_ON(ret);
ret = copy_extent_csums(trans,
&ordered_sums,
log->fs_info->csum_root,
ds + cs, cl);
BUG_ON(ret);
}
}
}
......@@ -2553,6 +2634,20 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(dst_path->nodes[0]);
btrfs_release_path(log, dst_path);
kfree(ins_data);
/*
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
while(!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
ret = btrfs_csum_file_blocks(trans, log, sums);
BUG_ON(ret);
list_del(&sums->list);
kfree(sums);
}
return 0;
}
......
......@@ -2771,6 +2771,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
device->work.func = pending_bios_fn;
fs_devices->num_devices++;
spin_lock_init(&device->io_lock);
INIT_LIST_HEAD(&device->dev_alloc_list);
memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
return device;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment