disk-io.c 125 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
Chris Mason's avatar
Chris Mason committed
2 3 4 5
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

Chris Mason's avatar
Chris Mason committed
6
#include <linux/fs.h>
7
#include <linux/blkdev.h>
8
#include <linux/radix-tree.h>
9
#include <linux/writeback.h>
10
#include <linux/buffer_head.h>
11
#include <linux/workqueue.h>
12
#include <linux/kthread.h>
13
#include <linux/slab.h>
14
#include <linux/migrate.h>
15
#include <linux/ratelimit.h>
16
#include <linux/uuid.h>
17
#include <linux/semaphore.h>
18
#include <linux/error-injection.h>
19
#include <linux/crc32c.h>
20
#include <linux/sched/mm.h>
21
#include <asm/unaligned.h>
22
#include <crypto/hash.h>
23 24
#include "ctree.h"
#include "disk-io.h"
25
#include "transaction.h"
26
#include "btrfs_inode.h"
27
#include "volumes.h"
28
#include "print-tree.h"
29
#include "locking.h"
30
#include "tree-log.h"
31
#include "free-space-cache.h"
32
#include "free-space-tree.h"
33
#include "inode-map.h"
34
#include "check-integrity.h"
35
#include "rcu-string.h"
36
#include "dev-replace.h"
David Woodhouse's avatar
David Woodhouse committed
37
#include "raid56.h"
38
#include "sysfs.h"
39
#include "qgroup.h"
40
#include "compression.h"
41
#include "tree-checker.h"
42
#include "ref-verify.h"
43
#include "block-group.h"
44
#include "discard.h"
45

46 47 48 49
#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
				 BTRFS_HEADER_FLAG_RELOC |\
				 BTRFS_SUPER_FLAG_ERROR |\
				 BTRFS_SUPER_FLAG_SEEDING |\
50 51
				 BTRFS_SUPER_FLAG_METADUMP |\
				 BTRFS_SUPER_FLAG_METADUMP_V2)
52

53
static const struct extent_io_ops btree_extent_io_ops;
54
static void end_workqueue_fn(struct btrfs_work *work);
55
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
56
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
57
				      struct btrfs_fs_info *fs_info);
58
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
59
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
60 61
					struct extent_io_tree *dirty_pages,
					int mark);
62
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
63
				       struct extent_io_tree *pinned_extents);
64 65
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
66

67
/*
68 69
 * btrfs_end_io_wq structs are used to do processing in task context when an IO
 * is complete.  This is used during reads to verify checksums, and it is used
70 71
 * by writes to insert metadata for new file extents after IO is complete.
 */
72
struct btrfs_end_io_wq {
73 74 75 76
	struct bio *bio;
	bio_end_io_t *end_io;
	void *private;
	struct btrfs_fs_info *info;
77
	blk_status_t status;
78
	enum btrfs_wq_endio_type metadata;
79
	struct btrfs_work work;
80
};
81

82 83 84 85 86 87 88
static struct kmem_cache *btrfs_end_io_wq_cache;

int __init btrfs_end_io_wq_init(void)
{
	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
					sizeof(struct btrfs_end_io_wq),
					0,
89
					SLAB_MEM_SPREAD,
90 91 92 93 94 95
					NULL);
	if (!btrfs_end_io_wq_cache)
		return -ENOMEM;
	return 0;
}

96
void __cold btrfs_end_io_wq_exit(void)
97
{
98
	kmem_cache_destroy(btrfs_end_io_wq_cache);
99 100
}

101 102 103 104 105 106
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
	if (fs_info->csum_shash)
		crypto_free_shash(fs_info->csum_shash);
}

107 108 109 110 111
/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
112
struct async_submit_bio {
113
	void *private_data;
114
	struct bio *bio;
115
	extent_submit_bio_start_t *submit_bio_start;
116
	int mirror_num;
117 118 119 120 121
	/*
	 * bio_offset is optional, can be used if the pages in the bio
	 * can't tell us where in the file the bio should go
	 */
	u64 bio_offset;
122
	struct btrfs_work work;
123
	blk_status_t status;
124 125
};

126 127 128 129 130 131 132 133
/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
134 135
 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
 * roots have separate keysets.
136
 *
137 138 139
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warning in such cases.
140
 *
141 142 143
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
144
 *
145 146 147
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs update as well.
148 149 150 151 152
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
#  error
# endif
153 154 155 156 157 158 159 160 161 162 163 164 165

static struct btrfs_lockdep_keyset {
	u64			id;		/* root objectid */
	const char		*name_stem;	/* lock name stem */
	char			names[BTRFS_MAX_LEVEL + 1][20];
	struct lock_class_key	keys[BTRFS_MAX_LEVEL + 1];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	.name_stem = "root"	},
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	.name_stem = "extent"	},
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	.name_stem = "chunk"	},
	{ .id = BTRFS_DEV_TREE_OBJECTID,	.name_stem = "dev"	},
	{ .id = BTRFS_FS_TREE_OBJECTID,		.name_stem = "fs"	},
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	.name_stem = "csum"	},
166
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	.name_stem = "quota"	},
167 168 169
	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
170
	{ .id = BTRFS_UUID_TREE_OBJECTID,	.name_stem = "uuid"	},
171
	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	.name_stem = "free-space" },
172
	{ .id = 0,				.name_stem = "tree"	},
173
};
174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204

void __init btrfs_init_lockdep(void)
{
	int i, j;

	/* initialize lockdep class names */
	for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
		struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];

		for (j = 0; j < ARRAY_SIZE(ks->names); j++)
			snprintf(ks->names[j], sizeof(ks->names[j]),
				 "btrfs-%s-%02d", ks->name_stem, j);
	}
}

void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
				    int level)
{
	struct btrfs_lockdep_keyset *ks;

	BUG_ON(level >= ARRAY_SIZE(ks->keys));

	/* find the matching keyset, id 0 is the default entry */
	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
		if (ks->id == objectid)
			break;

	lockdep_set_class_and_name(&eb->lock,
				   &ks->keys[level], ks->names[level]);
}

205 206
#endif

207 208 209 210
/*
 * extents on the btree inode are pretty simple, there's one extent
 * that covers the entire device
 */
211
struct extent_map *btree_get_extent(struct btrfs_inode *inode,
212 213
				    struct page *page, size_t pg_offset,
				    u64 start, u64 len)
214
{
215
	struct extent_map_tree *em_tree = &inode->extent_tree;
216 217 218
	struct extent_map *em;
	int ret;

219
	read_lock(&em_tree->lock);
220
	em = lookup_extent_mapping(em_tree, start, len);
221
	if (em) {
222
		read_unlock(&em_tree->lock);
223
		goto out;
224
	}
225
	read_unlock(&em_tree->lock);
226

227
	em = alloc_extent_map();
228 229 230 231 232
	if (!em) {
		em = ERR_PTR(-ENOMEM);
		goto out;
	}
	em->start = 0;
233
	em->len = (u64)-1;
234
	em->block_len = (u64)-1;
235
	em->block_start = 0;
236

237
	write_lock(&em_tree->lock);
Josef Bacik's avatar
Josef Bacik committed
238
	ret = add_extent_mapping(em_tree, em, 0);
239 240
	if (ret == -EEXIST) {
		free_extent_map(em);
241
		em = lookup_extent_mapping(em_tree, start, len);
242
		if (!em)
243
			em = ERR_PTR(-EIO);
244
	} else if (ret) {
245
		free_extent_map(em);
246
		em = ERR_PTR(ret);
247
	}
248
	write_unlock(&em_tree->lock);
249

250 251
out:
	return em;
252 253
}

254
/*
255 256 257
 * Compute the csum of a btree block and store the result to provided buffer.
 *
 * Returns error if the extent buffer cannot be mapped.
258
 */
259
static int csum_tree_block(struct extent_buffer *buf, u8 *result)
260
{
261 262
	struct btrfs_fs_info *fs_info = buf->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
263 264 265 266 267 268 269
	unsigned long len;
	unsigned long cur_len;
	unsigned long offset = BTRFS_CSUM_SIZE;
	char *kaddr;
	unsigned long map_start;
	unsigned long map_len;
	int err;
270 271 272

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
273 274

	len = buf->len - offset;
275

276
	while (len > 0) {
277 278 279 280 281 282
		/*
		 * Note: we don't need to check for the err == 1 case here, as
		 * with the given combination of 'start = BTRFS_CSUM_SIZE (32)'
		 * and 'min_len = 32' and the currently implemented mapping
		 * algorithm we cannot cross a page boundary.
		 */
283
		err = map_private_extent_buffer(buf, offset, 32,
284
					&kaddr, &map_start, &map_len);
285
		if (WARN_ON(err))
286
			return err;
287
		cur_len = min(len, map_len - (offset - map_start));
288
		crypto_shash_update(shash, kaddr + offset - map_start, cur_len);
289 290 291
		len -= cur_len;
		offset += cur_len;
	}
292
	memset(result, 0, BTRFS_CSUM_SIZE);
293

294
	crypto_shash_final(shash, result);
295 296 297 298

	return 0;
}

299 300 301 302 303 304
/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
305
static int verify_parent_transid(struct extent_io_tree *io_tree,
306 307
				 struct extent_buffer *eb, u64 parent_transid,
				 int atomic)
308
{
309
	struct extent_state *cached_state = NULL;
310
	int ret;
311
	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
312 313 314 315

	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
		return 0;

316 317 318
	if (atomic)
		return -EAGAIN;

319 320
	if (need_lock) {
		btrfs_tree_read_lock(eb);
321
		btrfs_set_lock_blocking_read(eb);
322 323
	}

324
	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
325
			 &cached_state);
326
	if (extent_buffer_uptodate(eb) &&
327 328 329 330
	    btrfs_header_generation(eb) == parent_transid) {
		ret = 0;
		goto out;
	}
331 332 333
	btrfs_err_rl(eb->fs_info,
		"parent transid verify failed on %llu wanted %llu found %llu",
			eb->start,
334
			parent_transid, btrfs_header_generation(eb));
335
	ret = 1;
336 337 338 339

	/*
	 * Things reading via commit roots that don't have normal protection,
	 * like send, can have a really old block in cache that may point at a
340
	 * block that has been freed and re-allocated.  So don't clear uptodate
341 342 343 344 345 346
	 * if we find an eb that is under IO (dirty/writeback) because we could
	 * end up reading in the stale data and then writing it back out and
	 * making everybody very sad.
	 */
	if (!extent_buffer_under_io(eb))
		clear_extent_buffer_uptodate(eb);
347
out:
348
	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
349
			     &cached_state);
350 351
	if (need_lock)
		btrfs_tree_read_unlock_blocking(eb);
352 353 354
	return ret;
}

355 356 357 358
static bool btrfs_supported_super_csum(u16 csum_type)
{
	switch (csum_type) {
	case BTRFS_CSUM_TYPE_CRC32:
359
	case BTRFS_CSUM_TYPE_XXHASH:
360
	case BTRFS_CSUM_TYPE_SHA256:
361
	case BTRFS_CSUM_TYPE_BLAKE2:
362 363 364 365 366 367
		return true;
	default:
		return false;
	}
}

368 369 370 371
/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
372 373
static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
				  char *raw_disk_sb)
374 375 376
{
	struct btrfs_super_block *disk_sb =
		(struct btrfs_super_block *)raw_disk_sb;
377
	char result[BTRFS_CSUM_SIZE];
378 379 380 381
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
382

383 384 385 386 387
	/*
	 * The super_block structure does not span the whole
	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
	 * filled with zeros and is included in the checksum.
	 */
388 389 390
	crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
	crypto_shash_final(shash, result);
391

392 393
	if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
		return 1;
394

395
	return 0;
396 397
}

398
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
399
			   struct btrfs_key *first_key, u64 parent_transid)
400
{
401
	struct btrfs_fs_info *fs_info = eb->fs_info;
402 403 404 405 406 407
	int found_level;
	struct btrfs_key found_key;
	int ret;

	found_level = btrfs_header_level(eb);
	if (found_level != level) {
408 409
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
		     KERN_ERR "BTRFS: tree level check failed\n");
410 411 412 413 414 415 416 417 418
		btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
			  eb->start, level, found_level);
		return -EIO;
	}

	if (!first_key)
		return 0;

419 420 421 422 423 424 425 426
	/*
	 * For live tree block (new tree blocks in current transaction),
	 * we need proper lock context to avoid race, which is impossible here.
	 * So we only checks tree blocks which is read from disk, whose
	 * generation <= fs_info->last_trans_committed.
	 */
	if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
		return 0;
427 428 429 430 431 432 433 434 435 436

	/* We have @first_key, so this @eb must have at least one item */
	if (btrfs_header_nritems(eb) == 0) {
		btrfs_err(fs_info,
		"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
			  eb->start);
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		return -EUCLEAN;
	}

437 438 439 440 441 442 443
	if (found_level)
		btrfs_node_key_to_cpu(eb, &found_key, 0);
	else
		btrfs_item_key_to_cpu(eb, &found_key, 0);
	ret = btrfs_comp_cpu_keys(first_key, &found_key);

	if (ret) {
444 445
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
		     KERN_ERR "BTRFS: tree first key check failed\n");
446
		btrfs_err(fs_info,
447 448 449 450 451
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
			  eb->start, parent_transid, first_key->objectid,
			  first_key->type, first_key->offset,
			  found_key.objectid, found_key.type,
			  found_key.offset);
452 453 454 455
	}
	return ret;
}

456 457 458
/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
459 460 461 462
 *
 * @parent_transid:	expected transid, skip check if 0
 * @level:		expected level, mandatory check
 * @first_key:		expected key of first slot, skip check if NULL
463
 */
464
static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
465 466
					  u64 parent_transid, int level,
					  struct btrfs_key *first_key)
467
{
468
	struct btrfs_fs_info *fs_info = eb->fs_info;
469
	struct extent_io_tree *io_tree;
470
	int failed = 0;
471 472 473
	int ret;
	int num_copies = 0;
	int mirror_num = 0;
474
	int failed_mirror = 0;
475

476
	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
477
	while (1) {
478
		clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
479
		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
480
		if (!ret) {
481
			if (verify_parent_transid(io_tree, eb,
482
						   parent_transid, 0))
483
				ret = -EIO;
484
			else if (btrfs_verify_level_key(eb, level,
485
						first_key, parent_transid))
486 487 488
				ret = -EUCLEAN;
			else
				break;
489
		}
490

491
		num_copies = btrfs_num_copies(fs_info,
492
					      eb->start, eb->len);
493
		if (num_copies == 1)
494
			break;
495

496 497 498 499 500
		if (!failed_mirror) {
			failed = 1;
			failed_mirror = eb->read_mirror;
		}

501
		mirror_num++;
502 503 504
		if (mirror_num == failed_mirror)
			mirror_num++;

505
		if (mirror_num > num_copies)
506
			break;
507
	}
508

509
	if (failed && !ret && failed_mirror)
510
		btrfs_repair_eb_io_failure(eb, failed_mirror);
511 512

	return ret;
513
}
514

515
/*
516 517
 * checksum a dirty tree block before IO.  This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block
518
 */
519

520
static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
521
{
Miao Xie's avatar
Miao Xie committed
522
	u64 start = page_offset(page);
523
	u64 found_start;
524 525
	u8 result[BTRFS_CSUM_SIZE];
	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
526
	struct extent_buffer *eb;
527
	int ret;
528

529 530 531
	eb = (struct extent_buffer *)page->private;
	if (page != eb->pages[0])
		return 0;
532

533
	found_start = btrfs_header_bytenr(eb);
534 535 536 537 538 539 540 541 542
	/*
	 * Please do not consolidate these warnings into a single if.
	 * It is useful to know what went wrong.
	 */
	if (WARN_ON(found_start != start))
		return -EUCLEAN;
	if (WARN_ON(!PageUptodate(page)))
		return -EUCLEAN;

543
	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
544 545
			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);

546 547 548
	if (csum_tree_block(eb, result))
		return -EINVAL;

549 550 551 552 553 554
	if (btrfs_header_level(eb))
		ret = btrfs_check_node(eb);
	else
		ret = btrfs_check_leaf_full(eb);

	if (ret < 0) {
555
		btrfs_print_tree(eb, 0);
556 557 558
		btrfs_err(fs_info,
		"block=%llu write time tree block corruption detected",
			  eb->start);
559
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
560 561
		return ret;
	}
562
	write_extent_buffer(eb, result, 0, csum_size);
563

564
	return 0;
565 566
}

567
static int check_tree_block_fsid(struct extent_buffer *eb)
Yan Zheng's avatar
Yan Zheng committed
568
{
569
	struct btrfs_fs_info *fs_info = eb->fs_info;
570
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
571
	u8 fsid[BTRFS_FSID_SIZE];
Yan Zheng's avatar
Yan Zheng committed
572 573
	int ret = 1;

574
	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
Yan Zheng's avatar
Yan Zheng committed
575
	while (fs_devices) {
576 577 578 579 580 581 582 583 584 585 586 587 588 589
		u8 *metadata_uuid;

		/*
		 * Checking the incompat flag is only valid for the current
		 * fs. For seed devices it's forbidden to have their uuid
		 * changed so reading ->fsid in this case is fine
		 */
		if (fs_devices == fs_info->fs_devices &&
		    btrfs_fs_incompat(fs_info, METADATA_UUID))
			metadata_uuid = fs_devices->metadata_uuid;
		else
			metadata_uuid = fs_devices->fsid;

		if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
Yan Zheng's avatar
Yan Zheng committed
590 591 592 593 594 595 596 597
			ret = 0;
			break;
		}
		fs_devices = fs_devices->seed;
	}
	return ret;
}

598 599 600
static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
				      u64 phy_offset, struct page *page,
				      u64 start, u64 end, int mirror)
601 602 603 604 605
{
	u64 found_start;
	int found_level;
	struct extent_buffer *eb;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
606
	struct btrfs_fs_info *fs_info = root->fs_info;
607
	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
608
	int ret = 0;
609
	u8 result[BTRFS_CSUM_SIZE];
610
	int reads_done;
611 612 613

	if (!page->private)
		goto out;
614

615
	eb = (struct extent_buffer *)page->private;
616

617 618 619
	/* the pending IO might have been the only thing that kept this buffer
	 * in memory.  Make sure we have a ref for all this other checks
	 */
620
	atomic_inc(&eb->refs);
621 622

	reads_done = atomic_dec_and_test(&eb->io_pages);
623 624
	if (!reads_done)
		goto err;
625

626
	eb->read_mirror = mirror;
627
	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
628 629 630 631
		ret = -EIO;
		goto err;
	}

632
	found_start = btrfs_header_bytenr(eb);
633
	if (found_start != eb->start) {
634 635
		btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
			     eb->start, found_start);
636
		ret = -EIO;
637 638
		goto err;
	}
639
	if (check_tree_block_fsid(eb)) {
640 641
		btrfs_err_rl(fs_info, "bad fsid on block %llu",
			     eb->start);
642 643 644
		ret = -EIO;
		goto err;
	}
645
	found_level = btrfs_header_level(eb);
646
	if (found_level >= BTRFS_MAX_LEVEL) {
647 648
		btrfs_err(fs_info, "bad tree block level %d on %llu",
			  (int)btrfs_header_level(eb), eb->start);
649 650 651
		ret = -EIO;
		goto err;
	}
652

653 654
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
				       eb, found_level);
655

656
	ret = csum_tree_block(eb, result);
657
	if (ret)
658 659
		goto err;

660 661 662 663 664 665 666 667 668 669 670 671 672 673 674
	if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
		u32 val;
		u32 found = 0;

		memcpy(&found, result, csum_size);

		read_extent_buffer(eb, &val, 0, csum_size);
		btrfs_warn_rl(fs_info,
		"%s checksum verify failed on %llu wanted %x found %x level %d",
			      fs_info->sb->s_id, eb->start,
			      val, found, btrfs_header_level(eb));
		ret = -EUCLEAN;
		goto err;
	}

675 676 677 678 679
	/*
	 * If this is a leaf block and it is corrupt, set the corrupt bit so
	 * that we don't try and read the other copies of this block, just
	 * return -EIO.
	 */
680
	if (found_level == 0 && btrfs_check_leaf_full(eb)) {
681 682 683
		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = -EIO;
	}
684

685
	if (found_level > 0 && btrfs_check_node(eb))
Liu Bo's avatar
Liu Bo committed
686 687
		ret = -EIO;

688 689
	if (!ret)
		set_extent_buffer_uptodate(eb);
690 691 692 693
	else
		btrfs_err(fs_info,
			  "block=%llu read time tree block corruption detected",
			  eb->start);
694
err:
695 696
	if (reads_done &&
	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
697
		btree_readahead_hook(eb, ret);
Arne Jansen's avatar
Arne Jansen committed
698

David Woodhouse's avatar
David Woodhouse committed
699 700 701 702 703 704 705
	if (ret) {
		/*
		 * our io error hook is going to dec the io pages
		 * again, we have to make sure it has something
		 * to decrement
		 */
		atomic_inc(&eb->io_pages);
706
		clear_extent_buffer_uptodate(eb);
David Woodhouse's avatar
David Woodhouse committed
707
	}
708
	free_extent_buffer(eb);
709
out:
710
	return ret;
711 712
}

713
static void end_workqueue_bio(struct bio *bio)
714
{
715
	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
716
	struct btrfs_fs_info *fs_info;
717
	struct btrfs_workqueue *wq;
718 719

	fs_info = end_io_wq->info;
720
	end_io_wq->status = bio->bi_status;
721

722
	if (bio_op(bio) == REQ_OP_WRITE) {
723
		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
724
			wq = fs_info->endio_meta_write_workers;
725
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
726
			wq = fs_info->endio_freespace_worker;
727
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
728
			wq = fs_info->endio_raid56_workers;
729
		else
730
			wq = fs_info->endio_write_workers;
731
	} else {
732
		if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR))
733
			wq = fs_info->endio_repair_workers;
734
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
735
			wq = fs_info->endio_raid56_workers;
736
		else if (end_io_wq->metadata)
737
			wq = fs_info->endio_meta_workers;
738
		else
739
			wq = fs_info->endio_workers;
740
	}
741

742
	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
743
	btrfs_queue_work(wq, &end_io_wq->work);
744 745
}

746
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
747
			enum btrfs_wq_endio_type metadata)
748
{
749
	struct btrfs_end_io_wq *end_io_wq;
750

751
	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
752
	if (!end_io_wq)
753
		return BLK_STS_RESOURCE;
754 755 756

	end_io_wq->private = bio->bi_private;
	end_io_wq->end_io = bio->bi_end_io;
757
	end_io_wq->info = info;
758
	end_io_wq->status = 0;
759
	end_io_wq->bio = bio;
760
	end_io_wq->metadata = metadata;
761 762 763

	bio->bi_private = end_io_wq;
	bio->bi_end_io = end_workqueue_bio;
764 765 766
	return 0;
}

767 768 769
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async;
770
	blk_status_t ret;
771 772

	async = container_of(work, struct  async_submit_bio, work);
773
	ret = async->submit_bio_start(async->private_data, async->bio,
774 775
				      async->bio_offset);
	if (ret)
776
		async->status = ret;
777 778
}

779 780 781 782 783 784 785 786
/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.   All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 */
787
static void run_one_async_done(struct btrfs_work *work)
788 789
{
	struct async_submit_bio *async;
790 791
	struct inode *inode;
	blk_status_t ret;
792 793

	async = container_of(work, struct  async_submit_bio, work);
794
	inode = async->private_data;
795

796
	/* If an error occurred we just want to clean up the bio and move on */
797 798
	if (async->status) {
		async->bio->bi_status = async->status;
799
		bio_endio(async->bio);
800 801 802
		return;
	}

803 804 805 806 807 808
	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
	 * This changes nothing when cgroups aren't in use.
	 */
	async->bio->bi_opf |= REQ_CGROUP_PUNT;
809
	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
810 811 812 813
	if (ret) {
		async->bio->bi_status = ret;
		bio_endio(async->bio);
	}
814 815 816 817 818 819 820
}

static void run_one_async_free(struct btrfs_work *work)
{
	struct async_submit_bio *async;

	async = container_of(work, struct  async_submit_bio, work);
821 822 823
	kfree(async);
}

824 825 826
blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset, void *private_data,
827
				 extent_submit_bio_start_t *submit_bio_start)
828 829 830 831 832
{
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
833
		return BLK_STS_RESOURCE;
834

835
	async->private_data = private_data;
836 837
	async->bio = bio;
	async->mirror_num = mirror_num;
838 839
	async->submit_bio_start = submit_bio_start;

840 841
	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
			run_one_async_free);
842

843
	async->bio_offset = bio_offset;
844

845
	async->status = 0;
846

847
	if (op_is_sync(bio->bi_opf))
848
		btrfs_set_work_high_priority(&async->work);
849

850
	btrfs_queue_work(fs_info->workers, &async->work);
851 852 853
	return 0;
}

854
static blk_status_t btree_csum_one_bio(struct bio *bio)
855
{
856
	struct bio_vec *bvec;
857
	struct btrfs_root *root;
858
	int ret = 0;
859
	struct bvec_iter_all iter_all;
860

861
	ASSERT(!bio_flagged(bio, BIO_CLONED));
862
	bio_for_each_segment_all(bvec, bio, iter_all) {
863
		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
864
		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
865 866
		if (ret)
			break;
867
	}
868

869
	return errno_to_blk_status(ret);
870 871
}

872
static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
873
					     u64 bio_offset)
874
{
875 876
	/*
	 * when we're called for a write, we're already in the async
877
	 * submission context.  Just jump into btrfs_map_bio
878
	 */
879
	return btree_csum_one_bio(bio);
880
}
881

882 883
static int check_async_write(struct btrfs_fs_info *fs_info,
			     struct btrfs_inode *bi)
884
{
885 886
	if (atomic_read(&bi->sync_writers))
		return 0;
887
	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
888 889 890 891
		return 0;
	return 1;
}

892
static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
893 894
					  int mirror_num,
					  unsigned long bio_flags)
895
{
896
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
897
	int async = check_async_write(fs_info, BTRFS_I(inode));
898
	blk_status_t ret;
899

900
	if (bio_op(bio) != REQ_OP_WRITE) {
901 902 903 904
		/*
		 * called for a read, do the setup so that checksum validation
		 * can happen in the async kernel threads
		 */
905 906
		ret = btrfs_bio_wq_end_io(fs_info, bio,
					  BTRFS_WQ_ENDIO_METADATA);
907
		if (ret)
908
			goto out_w_error;
909
		ret = btrfs_map_bio(fs_info, bio, mirror_num);
910 911 912
	} else if (!async) {
		ret = btree_csum_one_bio(bio);
		if (ret)
913
			goto out_w_error;
914
		ret = btrfs_map_bio(fs_info, bio, mirror_num);
915 916 917 918 919
	} else {
		/*
		 * kthread helpers are used to submit writes so that
		 * checksumming can happen in parallel across all CPUs
		 */
920
		ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
921
					  0, inode, btree_submit_bio_start);
922
	}
923

924 925 926 927
	if (ret)
		goto out_w_error;
	return 0;

928
out_w_error:
929
	bio->bi_status = ret;
930
	bio_endio(bio);
931
	return ret;
932 933
}

Jan Beulich's avatar
Jan Beulich committed
934
#ifdef CONFIG_MIGRATION
935
static int btree_migratepage(struct address_space *mapping,
936 937
			struct page *newpage, struct page *page,
			enum migrate_mode mode)
938 939 940 941 942 943 944 945 946 947 948 949 950 951
{
	/*
	 * we can't safely write a btree page from here,
	 * we haven't done the locking hook
	 */
	if (PageDirty(page))
		return -EAGAIN;
	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;
952
	return migrate_page(mapping, newpage, page, mode);
953
}
Jan Beulich's avatar
Jan Beulich committed
954
#endif
955

956 957 958 959

static int btree_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
960 961 962
	struct btrfs_fs_info *fs_info;
	int ret;

963
	if (wbc->sync_mode == WB_SYNC_NONE) {
964 965 966 967

		if (wbc->for_kupdate)
			return 0;

968
		fs_info = BTRFS_I(mapping->host)->root->fs_info;
969
		/* this is a bit racy, but that's ok */
970 971 972
		ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					     BTRFS_DIRTY_METADATA_THRESH,
					     fs_info->dirty_metadata_batch);
973
		if (ret < 0)
974 975
			return 0;
	}
976
	return btree_write_cache_pages(mapping, wbc);
977 978
}

979
static int btree_readpage(struct file *file, struct page *page)
980
{
981 982
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
983
	return extent_read_full_page(tree, page, btree_get_extent, 0);
984
}
Chris Mason's avatar
Chris Mason committed
985

986
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
987
{
988
	if (PageWriteback(page) || PageDirty(page))
989
		return 0;
990

991
	return try_release_extent_buffer(page);
992 993
}

994 995
static void btree_invalidatepage(struct page *page, unsigned int offset,
				 unsigned int length)
996
{
997 998
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
999 1000
	extent_invalidatepage(tree, page, offset);
	btree_releasepage(page, GFP_NOFS);
1001
	if (PagePrivate(page)) {
1002 1003 1004
		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
			   "page private not zero on page %llu",
			   (unsigned long long)page_offset(page));
1005 1006
		ClearPagePrivate(page);
		set_page_private(page, 0);
1007
		put_page(page);
1008
	}
1009 1010
}

1011 1012
static int btree_set_page_dirty(struct page *page)
{
1013
#ifdef DEBUG
1014 1015 1016 1017 1018 1019 1020 1021
	struct extent_buffer *eb;

	BUG_ON(!PagePrivate(page));
	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);
	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
	BUG_ON(!atomic_read(&eb->refs));
	btrfs_assert_tree_locked(eb);
1022
#endif
1023 1024 1025
	return __set_page_dirty_nobuffers(page);
}

1026
static const struct address_space_operations btree_aops = {
1027
	.readpage	= btree_readpage,
1028
	.writepages	= btree_writepages,
1029 1030
	.releasepage	= btree_releasepage,
	.invalidatepage = btree_invalidatepage,
1031
#ifdef CONFIG_MIGRATION
1032
	.migratepage	= btree_migratepage,
1033
#endif
1034
	.set_page_dirty = btree_set_page_dirty,
1035 1036
};

1037
void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
Chris Mason's avatar
Chris Mason committed
1038
{
1039
	struct extent_buffer *buf = NULL;
1040
	int ret;
Chris Mason's avatar
Chris Mason committed
1041

1042
	buf = btrfs_find_create_tree_block(fs_info, bytenr);
1043
	if (IS_ERR(buf))
1044
		return;
1045

1046
	ret = read_extent_buffer_pages(buf, WAIT_NONE, 0);
1047 1048 1049 1050
	if (ret < 0)
		free_extent_buffer_stale(buf);
	else
		free_extent_buffer(buf);
Chris Mason's avatar
Chris Mason committed
1051 1052
}

1053 1054 1055
struct extent_buffer *btrfs_find_create_tree_block(
						struct btrfs_fs_info *fs_info,
						u64 bytenr)
1056
{
1057 1058 1059
	if (btrfs_is_testing(fs_info))
		return alloc_test_extent_buffer(fs_info, bytenr);
	return alloc_extent_buffer(fs_info, bytenr);
1060 1061
}

1062 1063 1064 1065 1066 1067 1068 1069
/*
 * Read tree block at logical address @bytenr and do variant basic but critical
 * verification.
 *
 * @parent_transid:	expected transid of this tree block, skip check if 0
 * @level:		expected level, mandatory check
 * @first_key:		expected key in slot 0, skip check if NULL
 */
1070
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
1071 1072
				      u64 parent_transid, int level,
				      struct btrfs_key *first_key)
1073 1074 1075 1076
{
	struct extent_buffer *buf = NULL;
	int ret;

1077
	buf = btrfs_find_create_tree_block(fs_info, bytenr);
1078 1079
	if (IS_ERR(buf))
		return buf;
1080

1081
	ret = btree_read_extent_buffer_pages(buf, parent_transid,
1082
					     level, first_key);
1083
	if (ret) {
1084
		free_extent_buffer_stale(buf);
1085
		return ERR_PTR(ret);
1086
	}
1087
	return buf;
1088

1089 1090
}

1091
void btrfs_clean_tree_block(struct extent_buffer *buf)
1092
{
1093
	struct btrfs_fs_info *fs_info = buf->fs_info;
1094
	if (btrfs_header_generation(buf) ==
1095
	    fs_info->running_transaction->transid) {
1096
		btrfs_assert_tree_locked(buf);
1097

1098
		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1099 1100 1101
			percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
						 -buf->len,
						 fs_info->dirty_metadata_batch);
1102
			/* ugh, clear_extent_buffer_dirty needs to lock the page */
1103
			btrfs_set_lock_blocking_write(buf);
1104 1105
			clear_extent_buffer_dirty(buf);
		}
1106
	}
1107 1108
}

1109 1110 1111 1112 1113 1114 1115 1116 1117
static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
{
	struct btrfs_subvolume_writers *writers;
	int ret;

	writers = kmalloc(sizeof(*writers), GFP_NOFS);
	if (!writers)
		return ERR_PTR(-ENOMEM);

1118
	ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134
	if (ret < 0) {
		kfree(writers);
		return ERR_PTR(ret);
	}

	init_waitqueue_head(&writers->wait);
	return writers;
}

static void
btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
{
	percpu_counter_destroy(&writers->counter);
	kfree(writers);
}

1135
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1136
			 u64 objectid)
1137
{
1138
	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
1139
	root->fs_info = fs_info;
Chris Mason's avatar
Chris Mason committed
1140
	root->node = NULL;
1141
	root->commit_root = NULL;
1142
	root->state = 0;
1143
	root->orphan_cleanup_state = 0;
1144

1145
	root->last_trans = 0;
1146
	root->highest_objectid = 0;
1147
	root->nr_delalloc_inodes = 0;
1148
	root->nr_ordered_extents = 0;
1149
	root->inode_tree = RB_ROOT;
1150
	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
1151
	root->block_rsv = NULL;
1152 1153

	INIT_LIST_HEAD(&root->dirty_list);
1154
	INIT_LIST_HEAD(&root->root_list);
1155 1156
	INIT_LIST_HEAD(&root->delalloc_inodes);
	INIT_LIST_HEAD(&root->delalloc_root);
1157 1158
	INIT_LIST_HEAD(&root->ordered_extents);
	INIT_LIST_HEAD(&root->ordered_root);
1159
	INIT_LIST_HEAD(&root->reloc_dirty_list);
1160 1161
	INIT_LIST_HEAD(&root->logged_list[0]);
	INIT_LIST_HEAD(&root->logged_list[1]);
1162
	spin_lock_init(&root->inode_lock);
1163
	spin_lock_init(&root->delalloc_lock);
1164
	spin_lock_init(&root->ordered_extent_lock);
1165
	spin_lock_init(&root->accounting_lock);
1166 1167
	spin_lock_init(&root->log_extents_lock[0]);
	spin_lock_init(&root->log_extents_lock[1]);
1168
	spin_lock_init(&root->qgroup_meta_rsv_lock);
1169
	mutex_init(&root->objectid_mutex);
1170
	mutex_init(&root->log_mutex);
1171
	mutex_init(&root->ordered_extent_mutex);
1172
	mutex_init(&root->delalloc_mutex);
1173 1174 1175
	init_waitqueue_head(&root->log_writer_wait);
	init_waitqueue_head(&root->log_commit_wait[0]);
	init_waitqueue_head(&root->log_commit_wait[1]);
1176 1177
	INIT_LIST_HEAD(&root->log_ctxs[0]);
	INIT_LIST_HEAD(&root->log_ctxs[1]);
1178 1179 1180
	atomic_set(&root->log_commit[0], 0);
	atomic_set(&root->log_commit[1], 0);
	atomic_set(&root->log_writers, 0);
1181
	atomic_set(&root->log_batch, 0);
1182
	refcount_set(&root->refs, 1);
1183
	atomic_set(&root->will_be_snapshotted, 0);
1184
	atomic_set(&root->snapshot_force_cow, 0);
1185
	atomic_set(&root->nr_swapfiles, 0);
1186
	root->log_transid = 0;
1187
	root->log_transid_committed = -1;
1188
	root->last_log_commit = 0;
1189
	if (!dummy)
1190 1191
		extent_io_tree_init(fs_info, &root->dirty_log_pages,
				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
1192

1193 1194
	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
1195
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
1196
	if (!dummy)
1197 1198 1199
		root->defrag_trans_start = fs_info->generation;
	else
		root->defrag_trans_start = 0;
1200
	root->root_key.objectid = objectid;
1201
	root->anon_dev = 0;
1202

1203
	spin_lock_init(&root->root_item_lock);
1204
	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
1205 1206
}

1207
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
1208
					   u64 objectid, gfp_t flags)
1209
{
1210
	struct btrfs_root *root = kzalloc(sizeof(*root), flags);
1211
	if (root)
1212
		__setup_root(root, fs_info, objectid);
1213 1214 1215
	return root;
}

1216 1217
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
1218
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
1219 1220 1221
{
	struct btrfs_root *root;

1222 1223 1224
	if (!fs_info)
		return ERR_PTR(-EINVAL);

1225
	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
1226 1227
	if (!root)
		return ERR_PTR(-ENOMEM);
1228

1229
	/* We don't use the stripesize in selftest, set it as sectorsize */
1230
	root->alloc_bytenr = 0;
1231 1232 1233 1234 1235

	return root;
}
#endif

1236 1237 1238
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
				     u64 objectid)
{
1239
	struct btrfs_fs_info *fs_info = trans->fs_info;
1240 1241 1242 1243
	struct extent_buffer *leaf;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key key;
1244
	unsigned int nofs_flag;
1245
	int ret = 0;
1246
	uuid_le uuid = NULL_UUID_LE;
1247

1248 1249 1250 1251 1252
	/*
	 * We're holding a transaction handle, so use a NOFS memory allocation
	 * context to avoid deadlock if reclaim happens.
	 */
	nofs_flag = memalloc_nofs_save();
1253
	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
1254
	memalloc_nofs_restore(nofs_flag);
1255 1256 1257 1258 1259 1260 1261
	if (!root)
		return ERR_PTR(-ENOMEM);

	root->root_key.objectid = objectid;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = 0;

1262
	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
1263 1264
	if (IS_ERR(leaf)) {
		ret = PTR_ERR(leaf);
1265
		leaf = NULL;
1266 1267 1268 1269 1270 1271 1272
		goto fail;
	}

	root->node = leaf;
	btrfs_mark_buffer_dirty(leaf);

	root->commit_root = btrfs_root_node(root);
1273
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
1274 1275 1276 1277 1278 1279 1280 1281 1282 1283

	root->root_item.flags = 0;
	root->root_item.byte_limit = 0;
	btrfs_set_root_bytenr(&root->root_item, leaf->start);
	btrfs_set_root_generation(&root->root_item, trans->transid);
	btrfs_set_root_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 1);
	btrfs_set_root_used(&root->root_item, leaf->len);
	btrfs_set_root_last_snapshot(&root->root_item, 0);
	btrfs_set_root_dirid(&root->root_item, 0);
1284 1285
	if (is_fstree(objectid))
		uuid_le_gen(&uuid);
1286
	memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297
	root->root_item.drop_level = 0;

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
	if (ret)
		goto fail;

	btrfs_tree_unlock(leaf);

1298 1299
	return root;

1300
fail:
1301 1302
	if (leaf) {
		btrfs_tree_unlock(leaf);
1303
		free_extent_buffer(root->commit_root);
1304 1305
		free_extent_buffer(leaf);
	}
1306
	btrfs_put_fs_root(root);
1307

1308
	return ERR_PTR(ret);
1309 1310
}

1311 1312
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
1313 1314
{
	struct btrfs_root *root;
1315
	struct extent_buffer *leaf;
1316

1317
	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
1318
	if (!root)
1319
		return ERR_PTR(-ENOMEM);
1320 1321 1322 1323

	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1324

1325
	/*
1326 1327
	 * DON'T set REF_COWS for log trees
	 *
1328 1329 1330 1331 1332
	 * log trees do not get reference counted because they go away
	 * before a real commit is actually done.  They do store pointers
	 * to file data extents, and those reference counts still get
	 * updated (along with back refs to the log tree).
	 */
1333

1334 1335
	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
			NULL, 0, 0, 0);
1336
	if (IS_ERR(leaf)) {
1337
		btrfs_put_fs_root(root);
1338 1339
		return ERR_CAST(leaf);
	}
1340

1341
	root->node = leaf;
1342 1343 1344

	btrfs_mark_buffer_dirty(root->node);
	btrfs_tree_unlock(root->node);
1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363
	return root;
}

int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *log_root;

	log_root = alloc_log_tree(trans, fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);
	WARN_ON(fs_info->log_root_tree);
	fs_info->log_root_tree = log_root;
	return 0;
}

int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root)
{
1364
	struct btrfs_fs_info *fs_info = root->fs_info;
1365 1366 1367
	struct btrfs_root *log_root;
	struct btrfs_inode_item *inode_item;

1368
	log_root = alloc_log_tree(trans, fs_info);
1369 1370 1371 1372 1373 1374 1375
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);

	log_root->last_trans = trans->transid;
	log_root->root_key.offset = root->root_key.objectid;

	inode_item = &log_root->root_item.inode;
1376 1377 1378
	btrfs_set_stack_inode_generation(inode_item, 1);
	btrfs_set_stack_inode_size(inode_item, 3);
	btrfs_set_stack_inode_nlink(inode_item, 1);
1379
	btrfs_set_stack_inode_nbytes(inode_item,
1380
				     fs_info->nodesize);
1381
	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1382

1383
	btrfs_set_root_node(&log_root->root_item, log_root->node);
1384 1385 1386 1387

	WARN_ON(root->log_root);
	root->log_root = log_root;
	root->log_transid = 0;
1388
	root->log_transid_committed = -1;
1389
	root->last_log_commit = 0;
1390 1391 1392
	return 0;
}

1393 1394
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
					struct btrfs_key *key)
1395 1396 1397
{
	struct btrfs_root *root;
	struct btrfs_fs_info *fs_info = tree_root->fs_info;
1398
	struct btrfs_path *path;
1399
	u64 generation;
1400
	int ret;
1401
	int level;
1402

1403 1404
	path = btrfs_alloc_path();
	if (!path)
1405
		return ERR_PTR(-ENOMEM);
1406

1407
	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
1408 1409 1410
	if (!root) {
		ret = -ENOMEM;
		goto alloc_fail;
1411 1412
	}

1413 1414
	ret = btrfs_find_root(tree_root, key, path,
			      &root->root_item, &root->root_key);
1415
	if (ret) {
1416 1417
		if (ret > 0)
			ret = -ENOENT;
1418
		goto find_fail;
1419
	}
1420

1421
	generation = btrfs_root_generation(&root->root_item);
1422
	level = btrfs_root_level(&root->root_item);
1423 1424
	root->node = read_tree_block(fs_info,
				     btrfs_root_bytenr(&root->root_item),
1425
				     generation, level, NULL);
1426 1427
	if (IS_ERR(root->node)) {
		ret = PTR_ERR(root->node);
1428 1429 1430
		goto find_fail;
	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
		ret = -EIO;
1431 1432
		free_extent_buffer(root->node);
		goto find_fail;
1433
	}
1434
	root->commit_root = btrfs_root_node(root);
1435
out:
1436 1437 1438 1439
	btrfs_free_path(path);
	return root;

find_fail:
1440
	btrfs_put_fs_root(root);
1441 1442 1443 1444 1445
alloc_fail:
	root = ERR_PTR(ret);
	goto out;
}

1446
static int btrfs_init_fs_root(struct btrfs_root *root)
1447 1448
{
	int ret;
1449
	struct btrfs_subvolume_writers *writers;
1450 1451 1452 1453 1454 1455 1456 1457 1458

	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
					GFP_NOFS);
	if (!root->free_ino_pinned || !root->free_ino_ctl) {
		ret = -ENOMEM;
		goto fail;
	}

1459 1460 1461 1462 1463 1464 1465
	writers = btrfs_alloc_subvolume_writers();
	if (IS_ERR(writers)) {
		ret = PTR_ERR(writers);
		goto fail;
	}
	root->subv_writers = writers;

1466 1467 1468 1469 1470
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
		set_bit(BTRFS_ROOT_REF_COWS, &root->state);
		btrfs_check_and_init_root_item(&root->root_item);
	}

1471
	btrfs_init_free_ino_ctl(root);
1472 1473
	spin_lock_init(&root->ino_cache_lock);
	init_waitqueue_head(&root->ino_cache_wait);
1474 1475 1476

	ret = get_anon_bdev(&root->anon_dev);
	if (ret)
Liu Bo's avatar
Liu Bo committed
1477
		goto fail;
1478 1479 1480 1481 1482 1483

	mutex_lock(&root->objectid_mutex);
	ret = btrfs_find_highest_objectid(root,
					&root->highest_objectid);
	if (ret) {
		mutex_unlock(&root->objectid_mutex);
Liu Bo's avatar
Liu Bo committed
1484
		goto fail;
1485 1486 1487 1488 1489 1490
	}

	ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);

	mutex_unlock(&root->objectid_mutex);

1491 1492
	return 0;
fail:
1493
	/* The caller is responsible to call btrfs_free_fs_root */
1494 1495 1496
	return ret;
}

1497 1498
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
					       u64 root_id)
1499 1500 1501 1502 1503 1504
{
	struct btrfs_root *root;

	spin_lock(&fs_info->fs_roots_radix_lock);
	root = radix_tree_lookup(&fs_info->fs_roots_radix,
				 (unsigned long)root_id);
1505 1506
	if (root)
		root = btrfs_grab_fs_root(root);
1507 1508 1509 1510 1511 1512 1513 1514 1515
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return root;
}

int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
			 struct btrfs_root *root)
{
	int ret;

1516
	ret = radix_tree_preload(GFP_NOFS);
1517 1518 1519 1520 1521 1522 1523
	if (ret)
		return ret;

	spin_lock(&fs_info->fs_roots_radix_lock);
	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				(unsigned long)root->root_key.objectid,
				root);
1524 1525
	if (ret == 0) {
		btrfs_grab_fs_root(root);
1526
		set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1527
	}
1528 1529 1530 1531 1532 1533
	spin_unlock(&fs_info->fs_roots_radix_lock);
	radix_tree_preload_end();

	return ret;
}

1534 1535
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
1536 1537 1538 1539 1540 1541 1542
	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
	percpu_counter_destroy(&fs_info->delalloc_bytes);
	percpu_counter_destroy(&fs_info->dio_bytes);
	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
	btrfs_free_csum_hash(fs_info);
	btrfs_free_stripe_hash_table(fs_info);
	btrfs_free_ref_cache(fs_info);
1543 1544
	kfree(fs_info->balance_ctl);
	kfree(fs_info->delayed_root);
1545 1546 1547 1548 1549 1550 1551 1552 1553
	btrfs_put_fs_root(fs_info->extent_root);
	btrfs_put_fs_root(fs_info->tree_root);
	btrfs_put_fs_root(fs_info->chunk_root);
	btrfs_put_fs_root(fs_info->dev_root);
	btrfs_put_fs_root(fs_info->csum_root);
	btrfs_put_fs_root(fs_info->quota_root);
	btrfs_put_fs_root(fs_info->uuid_root);
	btrfs_put_fs_root(fs_info->free_space_root);
	btrfs_put_fs_root(fs_info->fs_root);
1554 1555 1556 1557 1558 1559
	kfree(fs_info->super_copy);
	kfree(fs_info->super_for_commit);
	kvfree(fs_info);
}


1560 1561 1562
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
				     struct btrfs_key *location,
				     bool check_ref)
1563 1564
{
	struct btrfs_root *root;
1565
	struct btrfs_path *path;
1566
	struct btrfs_key key;
1567 1568
	int ret;

1569
	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1570
		return btrfs_grab_fs_root(fs_info->tree_root);
1571
	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1572
		return btrfs_grab_fs_root(fs_info->extent_root);
1573
	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1574
		return btrfs_grab_fs_root(fs_info->chunk_root);
1575
	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1576
		return btrfs_grab_fs_root(fs_info->dev_root);
1577
	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1578
		return btrfs_grab_fs_root(fs_info->csum_root);
1579
	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
1580 1581
		return btrfs_grab_fs_root(fs_info->quota_root) ?
			fs_info->quota_root : ERR_PTR(-ENOENT);
1582
	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
1583 1584
		return btrfs_grab_fs_root(fs_info->uuid_root) ?
			fs_info->uuid_root : ERR_PTR(-ENOENT);
1585
	if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
1586 1587
		return btrfs_grab_fs_root(fs_info->free_space_root) ?
			fs_info->free_space_root : ERR_PTR(-ENOENT);
1588
again:
1589
	root = btrfs_lookup_fs_root(fs_info, location->objectid);
1590
	if (root) {
1591 1592
		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
			btrfs_put_fs_root(root);
1593
			return ERR_PTR(-ENOENT);
1594
		}
1595
		return root;
1596
	}
1597

1598
	root = btrfs_read_tree_root(fs_info->tree_root, location);
1599 1600
	if (IS_ERR(root))
		return root;
1601

1602
	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1603
		ret = -ENOENT;
1604
		goto fail;
1605
	}
1606

1607
	ret = btrfs_init_fs_root(root);
1608 1609
	if (ret)
		goto fail;
1610

1611 1612 1613 1614 1615
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto fail;
	}
1616 1617 1618 1619 1620
	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = location->objectid;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1621
	btrfs_free_path(path);
1622 1623 1624
	if (ret < 0)
		goto fail;
	if (ret == 0)
1625
		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1626

1627 1628 1629 1630 1631 1632 1633 1634 1635
	/*
	 * All roots have two refs on them at all times, one for the mounted fs,
	 * and one for being in the radix tree.  This way we only free the root
	 * when we are unmounting or deleting the subvolume.  We get one ref
	 * from __setup_root, one for inserting it into the radix tree, and then
	 * we have the third for returning it, and the caller will put it when
	 * it's done with the root.
	 */
	btrfs_grab_fs_root(root);
1636
	ret = btrfs_insert_fs_root(fs_info, root);
1637
	if (ret) {
1638
		btrfs_put_fs_root(root);
1639
		if (ret == -EEXIST) {
1640
			btrfs_free_fs_root(root);
1641 1642 1643
			goto again;
		}
		goto fail;
1644
	}
1645
	return root;
1646
fail:
1647
	btrfs_free_fs_root(root);
1648
	return ERR_PTR(ret);
1649 1650
}

1651 1652 1653 1654 1655 1656
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
{
	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
	int ret = 0;
	struct btrfs_device *device;
	struct backing_dev_info *bdi;
Chris Mason's avatar
Chris Mason committed
1657

1658 1659
	rcu_read_lock();
	list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
1660 1661
		if (!device->bdev)
			continue;
1662
		bdi = device->bdev->bd_bdi;
1663
		if (bdi_congested(bdi, bdi_bits)) {
1664 1665 1666 1667
			ret = 1;
			break;
		}
	}
1668
	rcu_read_unlock();
1669 1670 1671
	return ret;
}

1672 1673 1674 1675 1676
/*
 * called by the kthread helper functions to finally call the bio end_io
 * functions.  This is where read checksum verification actually happens
 */
static void end_workqueue_fn(struct btrfs_work *work)
1677 1678
{
	struct bio *bio;
1679
	struct btrfs_end_io_wq *end_io_wq;
1680

1681
	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
1682
	bio = end_io_wq->bio;
1683

1684
	bio->bi_status = end_io_wq->status;
1685 1686
	bio->bi_private = end_io_wq->private;
	bio->bi_end_io = end_io_wq->end_io;
1687
	bio_endio(bio);
1688
	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
1689 1690
}

1691 1692 1693
static int cleaner_kthread(void *arg)
{
	struct btrfs_root *root = arg;
1694
	struct btrfs_fs_info *fs_info = root->fs_info;
1695
	int again;
1696

1697
	while (1) {
1698
		again = 0;
1699

1700 1701
		set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);

1702
		/* Make the cleaner go to sleep early. */
1703
		if (btrfs_need_cleaner_sleep(fs_info))
1704 1705
			goto sleep;

1706 1707 1708 1709
		/*
		 * Do not do anything if we might cause open_ctree() to block
		 * before we have finished mounting the filesystem.
		 */
1710
		if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1711 1712
			goto sleep;

1713
		if (!mutex_trylock(&fs_info->cleaner_mutex))
1714 1715
			goto sleep;

1716 1717 1718 1719
		/*
		 * Avoid the problem that we change the status of the fs
		 * during the above check and trylock.
		 */
1720
		if (btrfs_need_cleaner_sleep(fs_info)) {
1721
			mutex_unlock(&fs_info->cleaner_mutex);
1722
			goto sleep;
1723
		}
1724

1725
		btrfs_run_delayed_iputs(fs_info);
1726

1727
		again = btrfs_clean_one_deleted_snapshot(root);
1728
		mutex_unlock(&fs_info->cleaner_mutex);
1729 1730

		/*
1731 1732
		 * The defragger has dealt with the R/O remount and umount,
		 * needn't do anything special here.
1733
		 */
1734
		btrfs_run_defrag_inodes(fs_info);
1735 1736 1737 1738 1739 1740 1741 1742 1743

		/*
		 * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
		 * with relocation (btrfs_relocate_chunk) and relocation
		 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
		 * after acquiring fs_info->delete_unused_bgs_mutex. So we
		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
		 * unused block groups.
		 */
1744
		btrfs_delete_unused_bgs(fs_info);
1745
sleep:
1746
		clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1747 1748 1749 1750
		if (kthread_should_park())
			kthread_parkme();
		if (kthread_should_stop())
			return 0;
1751
		if (!again) {
1752
			set_current_state(TASK_INTERRUPTIBLE);
1753
			schedule();
1754 1755
			__set_current_state(TASK_RUNNING);
		}
1756
	}
1757 1758 1759 1760 1761
}

static int transaction_kthread(void *arg)
{
	struct btrfs_root *root = arg;
1762
	struct btrfs_fs_info *fs_info = root->fs_info;
1763 1764
	struct btrfs_trans_handle *trans;
	struct btrfs_transaction *cur;
1765
	u64 transid;
1766
	time64_t now;
1767
	unsigned long delay;
1768
	bool cannot_commit;
1769 1770

	do {
1771
		cannot_commit = false;
1772 1773
		delay = HZ * fs_info->commit_interval;
		mutex_lock(&fs_info->transaction_kthread_mutex);
1774

1775 1776
		spin_lock(&fs_info->trans_lock);
		cur = fs_info->running_transaction;
1777
		if (!cur) {
1778
			spin_unlock(&fs_info->trans_lock);
			goto sleep;
		}

		now = ktime_get_seconds();
		if (cur->state < TRANS_STATE_COMMIT_START &&
		    !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
		    (now < cur->start_time ||
		     now - cur->start_time < fs_info->commit_interval)) {
			spin_unlock(&fs_info->trans_lock);
			delay = HZ * 5;
			goto sleep;
		}
		transid = cur->transid;
		spin_unlock(&fs_info->trans_lock);

		/* If the file system is aborted, this will always fail. */
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) != -ENOENT)
				cannot_commit = true;
			goto sleep;
		}
		if (transid == trans->transid) {
			btrfs_commit_transaction(trans);
		} else {
			btrfs_end_transaction(trans);
		}
sleep:
		wake_up_process(fs_info->cleaner_kthread);
		mutex_unlock(&fs_info->transaction_kthread_mutex);

		if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
				      &fs_info->fs_state)))
			btrfs_cleanup_transaction(fs_info);
		if (!kthread_should_stop() &&
				(!btrfs_transaction_blocked(fs_info) ||
				 cannot_commit))
			schedule_timeout_interruptible(delay);
	} while (!kthread_should_stop());
	return 0;
}

/*
 * This will find the highest generation in the array of root backups.  The
 * index of the newest backup slot is returned, or -EINVAL if we can't find
 * anything.
 *
 * We check to make sure the array is valid by comparing the generation of
 * the latest root in the array with the generation in the super block.  If
 * they don't match we pitch it.
 */
static int find_newest_super_backup(struct btrfs_fs_info *info)
{
	const u64 newest_gen = btrfs_super_generation(info->super_copy);
	u64 cur;
	struct btrfs_root_backup *root_backup;
	int i;

	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
		root_backup = info->super_copy->super_roots + i;
		cur = btrfs_backup_tree_root_gen(root_backup);
		if (cur == newest_gen)
			return i;
	}

	return -EINVAL;
}

/*
 * copy all the root pointers into the super backup array.
 * this will bump the backup pointer by one when it is
 * done
 */
static void backup_super_roots(struct btrfs_fs_info *info)
{
	const int next_backup = info->backup_root_index;
	struct btrfs_root_backup *root_backup;

	root_backup = info->super_for_commit->super_roots + next_backup;

	/*
	 * make sure all of our padding and empty slots get zero filled
	 * regardless of which ones we use today
	 */
	memset(root_backup, 0, sizeof(*root_backup));

	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;

	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
	btrfs_set_backup_tree_root_gen(root_backup,
			       btrfs_header_generation(info->tree_root->node));

	btrfs_set_backup_tree_root_level(root_backup,
			       btrfs_header_level(info->tree_root->node));

	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
	btrfs_set_backup_chunk_root_gen(root_backup,
			       btrfs_header_generation(info->chunk_root->node));
	btrfs_set_backup_chunk_root_level(root_backup,
			       btrfs_header_level(info->chunk_root->node));

	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
	btrfs_set_backup_extent_root_gen(root_backup,
			       btrfs_header_generation(info->extent_root->node));
	btrfs_set_backup_extent_root_level(root_backup,
			       btrfs_header_level(info->extent_root->node));

	/*
	 * we might commit during log recovery, which happens before we set
	 * the fs_root.  Make sure it is valid before we fill it in.
	 */
	if (info->fs_root && info->fs_root->node) {
		btrfs_set_backup_fs_root(root_backup,
					 info->fs_root->node->start);
		btrfs_set_backup_fs_root_gen(root_backup,
			       btrfs_header_generation(info->fs_root->node));
		btrfs_set_backup_fs_root_level(root_backup,
			       btrfs_header_level(info->fs_root->node));
	}

	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
	btrfs_set_backup_dev_root_gen(root_backup,
			       btrfs_header_generation(info->dev_root->node));
	btrfs_set_backup_dev_root_level(root_backup,
				       btrfs_header_level(info->dev_root->node));

	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
	btrfs_set_backup_csum_root_gen(root_backup,
			       btrfs_header_generation(info->csum_root->node));
	btrfs_set_backup_csum_root_level(root_backup,
			       btrfs_header_level(info->csum_root->node));

	btrfs_set_backup_total_bytes(root_backup,
			     btrfs_super_total_bytes(info->super_copy));
	btrfs_set_backup_bytes_used(root_backup,
			     btrfs_super_bytes_used(info->super_copy));
	btrfs_set_backup_num_devices(root_backup,
			     btrfs_super_num_devices(info->super_copy));

	/*
	 * if we don't copy this out to the super_copy, it won't get remembered
	 * for the next commit
	 */
	memcpy(&info->super_copy->super_roots,
	       &info->super_for_commit->super_roots,
	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
}

/*
 * read_backup_root - Reads a backup root based on the passed priority. Prio 0
 * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
 *
 * fs_info - filesystem whose backup roots need to be read
 * priority - priority of backup root required
 *
 * Returns backup root index on success and -EINVAL otherwise.
 */
static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
{
	int backup_index = find_newest_super_backup(fs_info);
	struct btrfs_super_block *super = fs_info->super_copy;
	struct btrfs_root_backup *root_backup;

	if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
		if (priority == 0)
			return backup_index;

		backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
		backup_index %= BTRFS_NUM_BACKUP_ROOTS;
	} else {
		return -EINVAL;
	}

	root_backup = super->super_roots + backup_index;

	btrfs_set_super_generation(super,
				   btrfs_backup_tree_root_gen(root_backup));
	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
	btrfs_set_super_root_level(super,
				   btrfs_backup_tree_root_level(root_backup));
	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));

	/*
	 * Fixme: the total bytes and num_devices need to match, otherwise we
	 * should require a fsck.
	 */
	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));

	return backup_index;
}

/* helper to cleanup workers */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
	btrfs_destroy_workqueue(fs_info->fixup_workers);
	btrfs_destroy_workqueue(fs_info->delalloc_workers);
	btrfs_destroy_workqueue(fs_info->workers);
	btrfs_destroy_workqueue(fs_info->endio_workers);
	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
	btrfs_destroy_workqueue(fs_info->rmw_workers);
	btrfs_destroy_workqueue(fs_info->endio_write_workers);
	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
	btrfs_destroy_workqueue(fs_info->delayed_workers);
	btrfs_destroy_workqueue(fs_info->caching_workers);
	btrfs_destroy_workqueue(fs_info->readahead_workers);
	btrfs_destroy_workqueue(fs_info->flush_workers);
	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
	if (fs_info->discard_ctl.discard_workers)
		destroy_workqueue(fs_info->discard_ctl.discard_workers);
	/*
	 * Now that all other work queues are destroyed, we can safely destroy
	 * the queues used for metadata I/O, since tasks from those other work
	 * queues can do metadata I/O operations.
	 */
	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
}

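/* Release the extent buffers held by a root and clear the pointers. */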
static void free_root_extent_buffers(struct btrfs_root *root)
{
	if (root) {
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
		root->node = NULL;
		root->commit_root = NULL;
	}
}

/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
	free_root_extent_buffers(info->tree_root);

	free_root_extent_buffers(info->dev_root);
	free_root_extent_buffers(info->extent_root);
	free_root_extent_buffers(info->csum_root);
	free_root_extent_buffers(info->quota_root);
	free_root_extent_buffers(info->uuid_root);
	if (free_chunk_root)
		free_root_extent_buffers(info->chunk_root);
	free_root_extent_buffers(info->free_space_root);
}

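/*
 * Free every fs root we still track: first anything left on the dead_roots
 * list, then all roots remaining in the fs_roots_radix tree.  In the error
 * case the log root tree and pinned extents are torn down here as well.
 */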
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
	int ret;
	struct btrfs_root *gang[8];
	int i;

	while (!list_empty(&fs_info->dead_roots)) {
		gang[0] = list_entry(fs_info->dead_roots.next,
				     struct btrfs_root, root_list);
		list_del(&gang[0]->root_list);

		if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
			btrfs_drop_and_free_fs_root(fs_info, gang[0]);
		} else {
			free_extent_buffer(gang[0]->node);
			free_extent_buffer(gang[0]->commit_root);
			btrfs_put_fs_root(gang[0]);
		}
	}

	while (1) {
		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					     (void **)gang, 0,
					     ARRAY_SIZE(gang));
		if (!ret)
			break;
		for (i = 0; i < ret; i++)
			btrfs_drop_and_free_fs_root(fs_info, gang[i]);
	}

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
		btrfs_free_log_root_tree(NULL, fs_info);
		btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
	}
}

static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
{
	mutex_init(&fs_info->scrub_lock);
	atomic_set(&fs_info->scrubs_running, 0);
	atomic_set(&fs_info->scrub_pause_req, 0);
	atomic_set(&fs_info->scrubs_paused, 0);
	atomic_set(&fs_info->scrub_cancel_req, 0);
	init_waitqueue_head(&fs_info->scrub_pause_wait);
	refcount_set(&fs_info->scrub_workers_refcnt, 0);
}

static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
{
	spin_lock_init(&fs_info->balance_lock);
	mutex_init(&fs_info->balance_mutex);
	atomic_set(&fs_info->balance_pause_req, 0);
	atomic_set(&fs_info->balance_cancel_req, 0);
	fs_info->balance_ctl = NULL;
	init_waitqueue_head(&fs_info->balance_wait_q);
}

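/*
 * Set up the dummy btree inode.  It is never visible to user space; its
 * mapping only provides the page cache backing for this filesystem's
 * metadata extent buffers.
 */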
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
	struct inode *inode = fs_info->btree_inode;

	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
	set_nlink(inode, 1);
	/*
	 * we set the i_size on the btree inode to the max possible offset.
	 * the real end of the address space is determined by all of
	 * the devices in the system
	 */
	inode->i_size = OFFSET_MAX;
	inode->i_mapping->a_ops = &btree_aops;

	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
			    IO_TREE_INODE_IO, inode);
	BTRFS_I(inode)->io_tree.track_uptodate = false;
	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);

	BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;

	BTRFS_I(inode)->root = fs_info->tree_root;
	memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
	btrfs_insert_inode_hash(inode);
}

static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
	init_rwsem(&fs_info->dev_replace.rwsem);
	init_waitqueue_head(&fs_info->dev_replace.replace_wait);
}

static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
{
	spin_lock_init(&fs_info->qgroup_lock);
	mutex_init(&fs_info->qgroup_ioctl_lock);
	fs_info->qgroup_tree = RB_ROOT;
	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
	fs_info->qgroup_seq = 1;
	fs_info->qgroup_ulist = NULL;
	fs_info->qgroup_rescan_running = false;
	mutex_init(&fs_info->qgroup_rescan_lock);
}

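/*
 * Allocate all the workqueues used for background work.  Most of them are
 * sized from fs_info->thread_pool_size; if any allocation fails we return
 * -ENOMEM and the caller tears the rest down again.
 */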
static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
		struct btrfs_fs_devices *fs_devices)
{
	u32 max_active = fs_info->thread_pool_size;
	unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;

	fs_info->workers =
		btrfs_alloc_workqueue(fs_info, "worker",
				      flags | WQ_HIGHPRI, max_active, 16);

	fs_info->delalloc_workers =
		btrfs_alloc_workqueue(fs_info, "delalloc",
				      flags, max_active, 2);

	fs_info->flush_workers =
		btrfs_alloc_workqueue(fs_info, "flush_delalloc",
				      flags, max_active, 0);

	fs_info->caching_workers =
		btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);

	fs_info->fixup_workers =
		btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);

	/*
	 * endios are largely parallel and should have a very
	 * low idle thresh
	 */
	fs_info->endio_workers =
		btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
	fs_info->endio_meta_workers =
		btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
				      max_active, 4);
	fs_info->endio_meta_write_workers =
		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
				      max_active, 2);
	fs_info->endio_raid56_workers =
		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
				      max_active, 4);
	fs_info->endio_repair_workers =
		btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
	fs_info->rmw_workers =
		btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
	fs_info->endio_write_workers =
		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
				      max_active, 2);
	fs_info->endio_freespace_worker =
		btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
				      max_active, 0);
	fs_info->delayed_workers =
		btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
				      max_active, 0);
	fs_info->readahead_workers =
		btrfs_alloc_workqueue(fs_info, "readahead", flags,
				      max_active, 2);
	fs_info->qgroup_rescan_workers =
		btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
	fs_info->discard_ctl.discard_workers =
		alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);

	if (!(fs_info->workers && fs_info->delalloc_workers &&
	      fs_info->flush_workers &&
	      fs_info->endio_workers && fs_info->endio_meta_workers &&
	      fs_info->endio_meta_write_workers &&
	      fs_info->endio_repair_workers &&
	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
	      fs_info->caching_workers && fs_info->readahead_workers &&
	      fs_info->fixup_workers && fs_info->delayed_workers &&
	      fs_info->qgroup_rescan_workers &&
	      fs_info->discard_ctl.discard_workers)) {
		return -ENOMEM;
	}

	return 0;
}

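/*
 * Allocate the shash transform matching the checksum type recorded in the
 * super block and stash it in fs_info->csum_shash for later use.
 */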
static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
{
	struct crypto_shash *csum_shash;
	const char *csum_driver = btrfs_super_csum_driver(csum_type);

	csum_shash = crypto_alloc_shash(csum_driver, 0, 0);

	if (IS_ERR(csum_shash)) {
		btrfs_err(fs_info, "error allocating %s hash for checksum",
			  csum_driver);
		return PTR_ERR(csum_shash);
	}

	fs_info->csum_shash = csum_shash;

	return 0;
}

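/*
 * Read the log tree root referenced by the super block and replay the tree
 * log.  This needs a writable device; on a read-only mount the result is
 * committed right away.
 */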
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
			    struct btrfs_fs_devices *fs_devices)
{
	int ret;
	struct btrfs_root *log_tree_root;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	u64 bytenr = btrfs_super_log_root(disk_super);
	int level = btrfs_super_log_root_level(disk_super);

	if (fs_devices->rw_devices == 0) {
		btrfs_warn(fs_info, "log replay required on RO media");
		return -EIO;
	}

	log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
					 GFP_KERNEL);
	if (!log_tree_root)
		return -ENOMEM;

	log_tree_root->node = read_tree_block(fs_info, bytenr,
					      fs_info->generation + 1,
					      level, NULL);
	if (IS_ERR(log_tree_root->node)) {
		btrfs_warn(fs_info, "failed to read log tree");
		ret = PTR_ERR(log_tree_root->node);
		btrfs_put_fs_root(log_tree_root);
		return ret;
	} else if (!extent_buffer_uptodate(log_tree_root->node)) {
		btrfs_err(fs_info, "failed to read log tree");
		free_extent_buffer(log_tree_root->node);
		btrfs_put_fs_root(log_tree_root);
		return -EIO;
	}
	/* returns with log_tree_root freed on success */
	ret = btrfs_recover_log_trees(log_tree_root);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to recover log tree");
		free_extent_buffer(log_tree_root->node);
		btrfs_put_fs_root(log_tree_root);
		return ret;
	}

	if (sb_rdonly(fs_info->sb)) {
		ret = btrfs_commit_super(fs_info);
		if (ret)
			return ret;
	}

	return 0;
}

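/*
 * Read the trees hanging off the tree root: extent, device and csum trees
 * are mandatory, while the quota, UUID and free space trees are picked up
 * only when present.
 */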
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key location;
	int ret;

	BUG_ON(!fs_info->tree_root);

	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
	location.type = BTRFS_ROOT_ITEM_KEY;
	location.offset = 0;

	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		goto out;
	}
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
	fs_info->extent_root = root;

	location.objectid = BTRFS_DEV_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		goto out;
	}
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
	fs_info->dev_root = root;
	btrfs_init_devices_late(fs_info);

	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		goto out;
	}
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
	fs_info->csum_root = root;
2314 2315

	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (!IS_ERR(root)) {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		fs_info->quota_root = root;
	}

	location.objectid = BTRFS_UUID_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		if (ret != -ENOENT)
			goto out;
	} else {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->uuid_root = root;
	}

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
		root = btrfs_read_tree_root(tree_root, &location);
		if (IS_ERR(root)) {
			ret = PTR_ERR(root);
			goto out;
		}
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->free_space_root = root;
	}

	return 0;
out:
	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
		   location.objectid, ret);
	return ret;
}

/*
 * Real super block validation
 * NOTE: super csum type and incompat features will not be checked here.
 *
 * @sb:		super block to check
 * @mirror_num:	the super block number to check its bytenr:
 * 		0	the primary (1st) sb
 * 		1, 2	2nd and 3rd backup copy
 * 	       -1	skip bytenr check
 */
static int validate_super(struct btrfs_fs_info *fs_info,
			    struct btrfs_super_block *sb, int mirror_num)
{
	u64 nodesize = btrfs_super_nodesize(sb);
	u64 sectorsize = btrfs_super_sectorsize(sb);
	int ret = 0;

	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
		btrfs_err(fs_info, "no valid FS found");
		ret = -EINVAL;
	}
	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
		btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
		ret = -EINVAL;
	}
	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "tree_root level too big: %d >= %d",
				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}
	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}
	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "log_root level too big: %d >= %d",
				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}

	/*
	 * Check sectorsize and nodesize first; other checks will need them.
	 * Check all possible sector sizes (4K, 8K, 16K, 32K, 64K) here.
	 */
	if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
	    sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
		btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
		ret = -EINVAL;
	}
	/* Only PAGE SIZE is supported yet */
	if (sectorsize != PAGE_SIZE) {
		btrfs_err(fs_info,
			"sectorsize %llu not supported yet, only support %lu",
			sectorsize, PAGE_SIZE);
		ret = -EINVAL;
	}
	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
		btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
		ret = -EINVAL;
	}
	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
		btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
			  le32_to_cpu(sb->__unused_leafsize), nodesize);
		ret = -EINVAL;
	}

	/* Root alignment check */
	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "tree_root block unaligned: %llu",
			   btrfs_super_root(sb));
		ret = -EINVAL;
	}
	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
			   btrfs_super_chunk_root(sb));
		ret = -EINVAL;
	}
	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "log_root block unaligned: %llu",
			   btrfs_super_log_root(sb));
		ret = -EINVAL;
	}

	if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
		   BTRFS_FSID_SIZE) != 0) {
		btrfs_err(fs_info,
			"dev_item UUID does not match metadata fsid: %pU != %pU",
			fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
		ret = -EINVAL;
	}

	/*
	 * Hint to catch really bogus numbers, bitflips and the like; more exact
	 * checks are done later.
	 */
	if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
		btrfs_err(fs_info, "bytes_used is too small %llu",
			  btrfs_super_bytes_used(sb));
		ret = -EINVAL;
	}
	if (!is_power_of_2(btrfs_super_stripesize(sb))) {
		btrfs_err(fs_info, "invalid stripesize %u",
			  btrfs_super_stripesize(sb));
		ret = -EINVAL;
	}
	if (btrfs_super_num_devices(sb) > (1UL << 31))
		btrfs_warn(fs_info, "suspicious number of devices: %llu",
			   btrfs_super_num_devices(sb));
	if (btrfs_super_num_devices(sb) == 0) {
		btrfs_err(fs_info, "number of devices is 0");
		ret = -EINVAL;
	}

	if (mirror_num >= 0 &&
	    btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
		btrfs_err(fs_info, "super offset mismatch %llu != %u",
			  btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
		ret = -EINVAL;
	}

	/*
	 * Obvious sys_chunk_array corruptions, it must hold at least one key
	 * and one chunk
	 */
	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
		btrfs_err(fs_info, "system chunk array too big %u > %u",
			  btrfs_super_sys_array_size(sb),
			  BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
		ret = -EINVAL;
	}
	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
			+ sizeof(struct btrfs_chunk)) {
		btrfs_err(fs_info, "system chunk array too small %u < %zu",
			  btrfs_super_sys_array_size(sb),
			  sizeof(struct btrfs_disk_key)
			  + sizeof(struct btrfs_chunk));
		ret = -EINVAL;
	}

	/*
	 * The generation is a global counter, we'll trust it more than the others
	 * but it's still possible that it's the one that's wrong.
	 */
	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
		btrfs_warn(fs_info,
			"suspicious: generation < chunk_root_generation: %llu < %llu",
			btrfs_super_generation(sb),
			btrfs_super_chunk_root_generation(sb));
	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
	    && btrfs_super_cache_generation(sb) != (u64)-1)
		btrfs_warn(fs_info,
			"suspicious: generation < cache_generation: %llu < %llu",
			btrfs_super_generation(sb),
			btrfs_super_cache_generation(sb));

	return ret;
}

/*
 * Validation of super block at mount time.
 * Some checks already done early at mount time, like csum type and incompat
 * flags will be skipped.
 */
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
	return validate_super(fs_info, fs_info->super_copy, 0);
}

/*
 * Validation of super block at write time.
 * Some checks like bytenr check will be skipped as their values will be
 * overwritten soon.
 * Extra checks like csum type and incompat flags will be done here.
 */
static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
				      struct btrfs_super_block *sb)
{
	int ret;

	ret = validate_super(fs_info, sb, -1);
	if (ret < 0)
		goto out;
	if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
		ret = -EUCLEAN;
		btrfs_err(fs_info, "invalid csum type, has %u want %u",
			  btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
		goto out;
	}
	if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
		"invalid incompat flags, has 0x%llx valid mask 0x%llx",
			  btrfs_super_incompat_flags(sb),
			  (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
		goto out;
	}
out:
	if (ret < 0)
		btrfs_err(fs_info,
		"super block corruption detected before writing it to disk");
	return ret;
}

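/*
 * Read the tree root and everything hanging off it.  If the primary copy
 * is unreadable and the usebackuproot mount option is set, retry with the
 * backup roots stored in the super block, newest first.
 */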
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
	int backup_index = find_newest_super_backup(fs_info);
	struct btrfs_super_block *sb = fs_info->super_copy;
	struct btrfs_root *tree_root = fs_info->tree_root;
	bool handle_error = false;
	int ret = 0;
	int i;

	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
		u64 generation;
		int level;

		if (handle_error) {
			if (!IS_ERR(tree_root->node))
				free_extent_buffer(tree_root->node);
			tree_root->node = NULL;

			if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
				break;

			free_root_pointers(fs_info, 0);

			/*
			 * Don't use the log in recovery mode, it won't be
			 * valid
			 */
			btrfs_set_super_log_root(sb, 0);

			/* We can't trust the free space cache either */
			btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);

			ret = read_backup_root(fs_info, i);
			backup_index = ret;
			if (ret < 0)
				return ret;
		}
		generation = btrfs_super_generation(sb);
		level = btrfs_super_root_level(sb);
		tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
						  generation, level, NULL);
		if (IS_ERR(tree_root->node) ||
		    !extent_buffer_uptodate(tree_root->node)) {
			handle_error = true;

			if (IS_ERR(tree_root->node))
				ret = PTR_ERR(tree_root->node);
			else if (!extent_buffer_uptodate(tree_root->node))
				ret = -EUCLEAN;

			btrfs_warn(fs_info, "failed to read tree root");
			continue;
		}

		btrfs_set_root_node(&tree_root->root_item, tree_root->node);
		tree_root->commit_root = btrfs_root_node(tree_root);
		btrfs_set_root_refs(&tree_root->root_item, 1);

		/*
		 * No need to hold btrfs_root::objectid_mutex since the fs
		 * hasn't been fully initialised and we are the only user
		 */
		ret = btrfs_find_highest_objectid(tree_root,
						&tree_root->highest_objectid);
		if (ret < 0) {
			handle_error = true;
			continue;
		}

		ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);

		ret = btrfs_read_roots(fs_info);
		if (ret < 0) {
			handle_error = true;
			continue;
		}

		/* All successful */
		fs_info->generation = generation;
		fs_info->last_trans_committed = generation;

		/* Always begin writing backup roots after the one being used */
		if (backup_index < 0) {
			fs_info->backup_root_index = 0;
		} else {
			fs_info->backup_root_index = backup_index + 1;
			fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
		}
		break;
	}

	return ret;
}

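/*
 * Mount-time entry point: read and validate the super block, start the
 * workqueues, read the chunk and other tree roots, bring up the cleaner and
 * transaction kthreads, replay the tree log if there is one and finally
 * load the fs root and resume any interrupted balance/dev-replace/UUID work.
 */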
int __cold open_ctree(struct super_block *sb,
2653 2654
	       struct btrfs_fs_devices *fs_devices,
	       char *options)
2655
{
2656 2657
	u32 sectorsize;
	u32 nodesize;
2658
	u32 stripesize;
2659
	u64 generation;
2660
	u64 features;
2661
	u16 csum_type;
2662
	struct btrfs_key location;
2663
	struct buffer_head *bh;
2664
	struct btrfs_super_block *disk_super;
2665
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2666
	struct btrfs_root *tree_root;
2667
	struct btrfs_root *chunk_root;
2668
	int ret;
2669
	int err = -EINVAL;
2670
	int clear_free_space_tree = 0;
2671
	int level;
2672

2673 2674 2675 2676 2677 2678
	tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
				     GFP_KERNEL);
	fs_info->tree_root = tree_root;
	chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
				      GFP_KERNEL);
	fs_info->chunk_root = chunk_root;
2679
	if (!tree_root || !chunk_root) {
		err = -ENOMEM;
		goto fail;
	}
2683 2684 2685 2686 2687 2688 2689

	ret = init_srcu_struct(&fs_info->subvol_srcu);
	if (ret) {
		err = ret;
		goto fail;
	}

2690
	ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
2691 2692
	if (ret) {
		err = ret;
2693
		goto fail_srcu;
2694
	}
2695 2696 2697 2698

	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
	if (ret) {
		err = ret;
2699
		goto fail_srcu;
2700
	}
2701
	fs_info->dirty_metadata_batch = PAGE_SIZE *
2702 2703
					(1 + ilog2(nr_cpu_ids));

2704
	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2705 2706
	if (ret) {
		err = ret;
2707
		goto fail_srcu;
2708 2709
	}

2710 2711
	ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
			GFP_KERNEL);
2712 2713
	if (ret) {
		err = ret;
2714
		goto fail_srcu;
2715 2716
	}

2717
	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2718
	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
	INIT_LIST_HEAD(&fs_info->trans_list);
2720
	INIT_LIST_HEAD(&fs_info->dead_roots);
	INIT_LIST_HEAD(&fs_info->delayed_iputs);
2722
	INIT_LIST_HEAD(&fs_info->delalloc_roots);
2723
	INIT_LIST_HEAD(&fs_info->caching_block_groups);
2724
	spin_lock_init(&fs_info->delalloc_root_lock);
	spin_lock_init(&fs_info->trans_lock);
2726
	spin_lock_init(&fs_info->fs_roots_radix_lock);
	spin_lock_init(&fs_info->delayed_iput_lock);
2728
	spin_lock_init(&fs_info->defrag_inodes_lock);
2729
	spin_lock_init(&fs_info->super_lock);
2730
	spin_lock_init(&fs_info->buffer_lock);
2731
	spin_lock_init(&fs_info->unused_bgs_lock);
2732
	rwlock_init(&fs_info->tree_mod_log_lock);
2733
	mutex_init(&fs_info->unused_bg_unpin_mutex);
2734
	mutex_init(&fs_info->delete_unused_bgs_mutex);
	mutex_init(&fs_info->reloc_mutex);
2736
	mutex_init(&fs_info->delalloc_root_mutex);
2737
	seqlock_init(&fs_info->profiles_lock);
2738

2739
	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
2740
	INIT_LIST_HEAD(&fs_info->space_info);
2741
	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2742
	INIT_LIST_HEAD(&fs_info->unused_bgs);
2743
	extent_map_tree_init(&fs_info->mapping_tree);
2744 2745 2746 2747 2748 2749 2750
	btrfs_init_block_rsv(&fs_info->global_block_rsv,
			     BTRFS_BLOCK_RSV_GLOBAL);
	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
			     BTRFS_BLOCK_RSV_DELOPS);
2751 2752 2753
	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
			     BTRFS_BLOCK_RSV_DELREFS);

2754
	atomic_set(&fs_info->async_delalloc_pages, 0);
2755
	atomic_set(&fs_info->defrag_running, 0);
2756
	atomic_set(&fs_info->reada_works_cnt, 0);
2757
	atomic_set(&fs_info->nr_delayed_iputs, 0);
2758
	atomic64_set(&fs_info->tree_mod_seq, 0);
	fs_info->sb = sb;
2760
	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
	fs_info->metadata_ratio = 0;
2762
	fs_info->defrag_inodes = RB_ROOT;
2763
	atomic64_set(&fs_info->free_chunk_space, 0);
2764
	fs_info->tree_mod_log = RB_ROOT;
2765
	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2766
	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
2767
	/* readahead state */
2768
	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
2769
	spin_lock_init(&fs_info->reada_lock);
2770
	btrfs_init_ref_verify(fs_info);
2771

2772 2773
	fs_info->thread_pool_size = min_t(unsigned long,
					  num_online_cpus() + 2, 8);
2774

2775 2776
	INIT_LIST_HEAD(&fs_info->ordered_roots);
	spin_lock_init(&fs_info->ordered_root_lock);
2777 2778 2779 2780

	fs_info->btree_inode = new_inode(sb);
	if (!fs_info->btree_inode) {
		err = -ENOMEM;
2781
		goto fail_srcu;
2782 2783 2784
	}
	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);

2785
	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2786
					GFP_KERNEL);
2787 2788 2789 2790 2791
	if (!fs_info->delayed_root) {
		err = -ENOMEM;
		goto fail_iput;
	}
	btrfs_init_delayed_root(fs_info->delayed_root);
2792

2793
	btrfs_init_scrub(fs_info);
2794 2795 2796
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	fs_info->check_integrity_print_mask = 0;
#endif
2797
	btrfs_init_balance(fs_info);
2798
	btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);

2800 2801
	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
2802

2803
	btrfs_init_btree_inode(fs_info);
2804

2805
	spin_lock_init(&fs_info->block_group_cache_lock);
2806
	fs_info->block_group_cache_tree = RB_ROOT;
2807
	fs_info->first_logical_byte = (u64)-1;
2808

2809 2810 2811 2812
	extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
			    IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
	extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
			    IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
2813
	fs_info->pinned_extents = &fs_info->freed_extents[0];
2814
	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);

2816
	mutex_init(&fs_info->ordered_operations_mutex);
2817
	mutex_init(&fs_info->tree_log_mutex);
2818
	mutex_init(&fs_info->chunk_mutex);
2819 2820
	mutex_init(&fs_info->transaction_kthread_mutex);
	mutex_init(&fs_info->cleaner_mutex);
2821
	mutex_init(&fs_info->ro_block_group_mutex);
2822
	init_rwsem(&fs_info->commit_root_sem);
2823
	init_rwsem(&fs_info->cleanup_work_sem);
2824
	init_rwsem(&fs_info->subvol_sem);
2825
	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2826

2827
	btrfs_init_dev_replace_locks(fs_info);
2828
	btrfs_init_qgroup(fs_info);
2829
	btrfs_discard_init(fs_info);
2830

2831 2832 2833
	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);

2834
	init_waitqueue_head(&fs_info->transaction_throttle);
2835
	init_waitqueue_head(&fs_info->transaction_wait);
	init_waitqueue_head(&fs_info->transaction_blocked_wait);
2837
	init_waitqueue_head(&fs_info->async_submit_wait);
2838
	init_waitqueue_head(&fs_info->delayed_iputs_wait);
2839

2840 2841 2842 2843 2844
	/* Usable values until the real ones are cached from the superblock */
	fs_info->nodesize = 4096;
	fs_info->sectorsize = 4096;
	fs_info->stripesize = 4096;

2845 2846 2847
	spin_lock_init(&fs_info->swapfile_pins_lock);
	fs_info->swapfile_pins = RB_ROOT;

2848 2849
	fs_info->send_in_progress = 0;

	ret = btrfs_alloc_stripe_hash_table(fs_info);
	if (ret) {
2852
		err = ret;
		goto fail_alloc;
	}

2856
	invalidate_bdev(fs_devices->latest_bdev);
2857 2858 2859 2860

	/*
	 * Read super block and check the signature bytes only
	 */
	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
2862 2863
	if (IS_ERR(bh)) {
		err = PTR_ERR(bh);
2864
		goto fail_alloc;
2865
	}

	/*
	 * Verify the type first; if that or the checksum value is
	 * corrupted, we'll find out
	 */
	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
	if (!btrfs_supported_super_csum(csum_type)) {
2873
		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
2874
			  csum_type);
2875 2876 2877 2878 2879
		err = -EINVAL;
		brelse(bh);
		goto fail_alloc;
	}

2880 2881 2882 2883 2884 2885
	ret = btrfs_init_csum_hash(fs_info, csum_type);
	if (ret) {
		err = ret;
		goto fail_alloc;
	}

2886 2887 2888 2889
	/*
	 * We want to check superblock checksum, the type is stored inside.
	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
	 */
2890
	if (btrfs_check_super_csum(fs_info, bh->b_data)) {
2891
		btrfs_err(fs_info, "superblock checksum mismatch");
2892
		err = -EINVAL;
2893
		brelse(bh);
2894
		goto fail_alloc;
2895 2896 2897 2898 2899 2900 2901
	}

	/*
	 * super_copy is zeroed at allocation time and we never touch the
	 * following bytes up to INFO_SIZE, the checksum is calculated from
	 * the whole block of INFO_SIZE
	 */
2902
	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
2903
	brelse(bh);
2904

2905 2906
	disk_super = fs_info->super_copy;

2907 2908 2909
	ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
		       BTRFS_FSID_SIZE));

2910
	if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
2911 2912 2913
		ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
				fs_info->super_copy->metadata_uuid,
				BTRFS_FSID_SIZE));
2914
	}
2915

2916 2917 2918 2919 2920 2921 2922 2923 2924 2925
	features = btrfs_super_flags(disk_super);
	if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
		features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
		btrfs_set_super_flags(disk_super, features);
		btrfs_info(fs_info,
			"found metadata UUID change in progress flag, clearing");
	}

	memcpy(fs_info->super_for_commit, fs_info->super_copy,
	       sizeof(*fs_info->super_for_commit));
2926

2927
	ret = btrfs_validate_mount_super(fs_info);
2928
	if (ret) {
2929
		btrfs_err(fs_info, "superblock contains fatal errors");
2930
		err = -EINVAL;
2931
		goto fail_alloc;
2932 2933
	}

2934
	if (!btrfs_super_root(disk_super))
2935
		goto fail_alloc;
2936

2937
	/* check FS state, whether FS is broken. */
2938 2939
	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
		set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
2940

2941 2942 2943 2944 2945 2946
	/*
	 * In the long term, we'll store the compression type in the super
	 * block, and it'll be used for per file compression control.
	 */
	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;

2947
	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
	if (ret) {
		err = ret;
2950
		goto fail_alloc;
	}
2952

2953 2954 2955
	features = btrfs_super_incompat_flags(disk_super) &
		~BTRFS_FEATURE_INCOMPAT_SUPP;
	if (features) {
2956 2957 2958
		btrfs_err(fs_info,
		    "cannot mount because of unsupported optional features (%llx)",
		    features);
2959
		err = -EINVAL;
2960
		goto fail_alloc;
2961 2962
	}

2963
	features = btrfs_super_incompat_flags(disk_super);
2964
	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
2965
	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
2966
		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
	else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
2969

2970
	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
2971
		btrfs_info(fs_info, "has skinny extents");
2972

2973 2974 2975 2976
	/*
	 * flag our filesystem as having big metadata blocks if
	 * they are bigger than the page size
	 */
2977
	if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
2978
		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
2979 2980
			btrfs_info(fs_info,
				"flagging fs with big metadata feature");
2981 2982 2983
		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
	}

2984 2985
	nodesize = btrfs_super_nodesize(disk_super);
	sectorsize = btrfs_super_sectorsize(disk_super);
2986
	stripesize = sectorsize;
2987
	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
2988
	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
2989

2990 2991 2992 2993 2994
	/* Cache block sizes */
	fs_info->nodesize = nodesize;
	fs_info->sectorsize = sectorsize;
	fs_info->stripesize = stripesize;

2995 2996 2997 2998 2999
	/*
	 * mixed block groups end up with duplicate but slightly offset
	 * extent buffers for the same range.  It leads to corruptions
	 */
	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3000
	    (sectorsize != nodesize)) {
3001 3002 3003
		btrfs_err(fs_info,
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
			nodesize, sectorsize);
3004
		goto fail_alloc;
3005 3006
	}

3007 3008 3009 3010
	/*
	 * Needn't use the lock because there is no other task which will
	 * update the flag.
	 */
3011
	btrfs_set_super_incompat_flags(disk_super, features);
3012

3013 3014
	features = btrfs_super_compat_ro_flags(disk_super) &
		~BTRFS_FEATURE_COMPAT_RO_SUPP;
3015
	if (!sb_rdonly(sb) && features) {
3016 3017
		btrfs_err(fs_info,
	"cannot mount read-write because of unsupported optional features (%llx)",
3018
		       features);
3019
		err = -EINVAL;
3020
		goto fail_alloc;
3021
	}
3022

3023 3024 3025
	ret = btrfs_init_workqueues(fs_info, fs_devices);
	if (ret) {
		err = ret;
3026 3027
		goto fail_sb_buffer;
	}
3028

3029 3030 3031
	sb->s_bdi->congested_fn = btrfs_congested_fn;
	sb->s_bdi->congested_data = fs_info;
	sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
3032
	sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
3033 3034
	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3035

3036 3037
	sb->s_blocksize = sectorsize;
	sb->s_blocksize_bits = blksize_bits(sectorsize);
3038
	memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3039

3040
	mutex_lock(&fs_info->chunk_mutex);
3041
	ret = btrfs_read_sys_array(fs_info);
3042
	mutex_unlock(&fs_info->chunk_mutex);
3043
	if (ret) {
3044
		btrfs_err(fs_info, "failed to read the system array: %d", ret);
3045
		goto fail_sb_buffer;
3046
	}
3047

3048
	generation = btrfs_super_chunk_root_generation(disk_super);
3049
	level = btrfs_super_chunk_root_level(disk_super);
3050

3051
	chunk_root->node = read_tree_block(fs_info,
3052
					   btrfs_super_chunk_root(disk_super),
3053
					   generation, level, NULL);
3054 3055
	if (IS_ERR(chunk_root->node) ||
	    !extent_buffer_uptodate(chunk_root->node)) {
3056
		btrfs_err(fs_info, "failed to read chunk root");
3057 3058
		if (!IS_ERR(chunk_root->node))
			free_extent_buffer(chunk_root->node);
3059
		chunk_root->node = NULL;
3060
		goto fail_tree_roots;
3061
	}
3062 3063
	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
	chunk_root->commit_root = btrfs_root_node(chunk_root);
3064

3065
	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
3066
	   btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
3067

3068
	ret = btrfs_read_chunk_tree(fs_info);
	if (ret) {
3070
		btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3071
		goto fail_tree_roots;
	}
3073

3074
	/*
3075 3076
	 * Keep the devid that is marked to be the target device for the
	 * device replace procedure
3077
	 */
3078
	btrfs_free_extra_devids(fs_devices, 0);
3079

3080
	if (!fs_devices->latest_bdev) {
3081
		btrfs_err(fs_info, "failed to read devices");
3082 3083 3084
		goto fail_tree_roots;
	}

3085
	ret = init_tree_roots(fs_info);
3086
	if (ret)
3087
		goto fail_tree_roots;
3088

3089 3090 3091 3092 3093 3094 3095
	ret = btrfs_verify_dev_extents(fs_info);
	if (ret) {
		btrfs_err(fs_info,
			  "failed to verify dev extents against chunks: %d",
			  ret);
		goto fail_block_groups;
	}
3096 3097
	ret = btrfs_recover_balance(fs_info);
	if (ret) {
3098
		btrfs_err(fs_info, "failed to recover balance: %d", ret);
3099 3100 3101
		goto fail_block_groups;
	}

3102 3103
	ret = btrfs_init_dev_stats(fs_info);
	if (ret) {
3104
		btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3105 3106 3107
		goto fail_block_groups;
	}

3108 3109
	ret = btrfs_init_dev_replace(fs_info);
	if (ret) {
3110
		btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3111 3112 3113
		goto fail_block_groups;
	}

3114
	btrfs_free_extra_devids(fs_devices, 1);
3115

3116
	ret = btrfs_sysfs_add_fsid(fs_devices);
3117
	if (ret) {
3118 3119
		btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
				ret);
3120 3121 3122
		goto fail_block_groups;
	}

3123
	ret = btrfs_sysfs_add_mounted(fs_info);
3124
	if (ret) {
3125
		btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3126
		goto fail_fsdev_sysfs;
3127 3128 3129 3130
	}

	ret = btrfs_init_space_info(fs_info);
	if (ret) {
3131
		btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3132
		goto fail_sysfs;
3133 3134
	}

3135
	ret = btrfs_read_block_groups(fs_info);
3136
	if (ret) {
3137
		btrfs_err(fs_info, "failed to read block groups: %d", ret);
3138
		goto fail_sysfs;
3139
	}
3140

3141
	if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
3142
		btrfs_warn(fs_info,
3143
		"writable mount is not allowed due to too many missing devices");
3144
		goto fail_sysfs;
3145
	}
3146

3147 3148
	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
					       "btrfs-cleaner");
3149
	if (IS_ERR(fs_info->cleaner_kthread))
3150
		goto fail_sysfs;
3151 3152 3153 3154

	fs_info->transaction_kthread = kthread_run(transaction_kthread,
						   tree_root,
						   "btrfs-transaction");
3155
	if (IS_ERR(fs_info->transaction_kthread))
3156
		goto fail_cleaner;
3157

3158
	if (!btrfs_test_opt(fs_info, NOSSD) &&
	    !fs_info->fs_devices->rotating) {
3160
		btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
	}

3163
	/*
3164
	 * Mount does not set all options immediately, we can do it now and do
3165 3166 3167
	 * not have to wait for transaction commit
	 */
	btrfs_apply_pending_changes(fs_info);
3168

3169
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3170
	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
3171
		ret = btrfsic_mount(fs_info, fs_devices,
3172
				    btrfs_test_opt(fs_info,
3173 3174 3175 3176
					CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
				    1 : 0,
				    fs_info->check_integrity_print_mask);
		if (ret)
3177 3178 3179
			btrfs_warn(fs_info,
				"failed to initialize integrity check module: %d",
				ret);
3180 3181
	}
#endif
3182 3183 3184
	ret = btrfs_read_qgroup_config(fs_info);
	if (ret)
		goto fail_trans_kthread;
3185

3186 3187 3188
	if (btrfs_build_ref_tree(fs_info))
		btrfs_err(fs_info, "couldn't build ref tree");

3189 3190
	/* do not make disk changes in broken FS or nologreplay is given */
	if (btrfs_super_log_root(disk_super) != 0 &&
3191
	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3192
		btrfs_info(fs_info, "start tree-log replay");
3193
		ret = btrfs_replay_log(fs_info, fs_devices);
3194
		if (ret) {
3195
			err = ret;
3196
			goto fail_qgroup;
3197
		}
3198
	}
3199

3200
	ret = btrfs_find_orphan_roots(fs_info);
3201
	if (ret)
3202
		goto fail_qgroup;
3203

3204
	if (!sb_rdonly(sb)) {
3205
		ret = btrfs_cleanup_fs_roots(fs_info);
3206
		if (ret)
3207
			goto fail_qgroup;
3208 3209

		mutex_lock(&fs_info->cleaner_mutex);
3210
		ret = btrfs_recover_relocation(tree_root);
3211
		mutex_unlock(&fs_info->cleaner_mutex);
3212
		if (ret < 0) {
3213 3214
			btrfs_warn(fs_info, "failed to recover relocation: %d",
					ret);
3215
			err = -EINVAL;
3216
			goto fail_qgroup;
3217
		}
3218
	}
3219

3220 3221
	location.objectid = BTRFS_FS_TREE_OBJECTID;
	location.type = BTRFS_ROOT_ITEM_KEY;
3222
	location.offset = 0;
3223

3224
	fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true);
3225 3226
	if (IS_ERR(fs_info->fs_root)) {
		err = PTR_ERR(fs_info->fs_root);
3227
		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
3228
		fs_info->fs_root = NULL;
3229
		goto fail_qgroup;
3230
	}

3232
	if (sb_rdonly(sb))
3233
		return 0;
3234

3235 3236
	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3237 3238 3239 3240 3241 3242 3243 3244
		clear_free_space_tree = 1;
	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
		   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
		btrfs_warn(fs_info, "free space tree is invalid");
		clear_free_space_tree = 1;
	}

	if (clear_free_space_tree) {
3245 3246 3247 3248 3249
		btrfs_info(fs_info, "clearing free space tree");
		ret = btrfs_clear_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to clear free space tree: %d", ret);
3250
			close_ctree(fs_info);
3251 3252 3253 3254
			return ret;
		}
	}

3255
	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
3256
	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3257
		btrfs_info(fs_info, "creating free space tree");
3258 3259
		ret = btrfs_create_free_space_tree(fs_info);
		if (ret) {
3260 3261
			btrfs_warn(fs_info,
				"failed to create free space tree: %d", ret);
3262
			close_ctree(fs_info);
3263 3264 3265 3266
			return ret;
		}
	}

3267 3268 3269
	down_read(&fs_info->cleanup_work_sem);
	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3270
		up_read(&fs_info->cleanup_work_sem);
3271
		close_ctree(fs_info);
3272 3273 3274
		return ret;
	}
	up_read(&fs_info->cleanup_work_sem);
3275

3276 3277
	ret = btrfs_resume_balance_async(fs_info);
	if (ret) {
3278
		btrfs_warn(fs_info, "failed to resume balance: %d", ret);
3279
		close_ctree(fs_info);
3280
		return ret;
3281 3282
	}

3283 3284
	ret = btrfs_resume_dev_replace_async(fs_info);
	if (ret) {
3285
		btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
3286
		close_ctree(fs_info);
3287 3288 3289
		return ret;
	}

3290
	btrfs_qgroup_rescan_resume(fs_info);
3291
	btrfs_discard_resume(fs_info);
3292

3293
	if (!fs_info->uuid_root) {
3294
		btrfs_info(fs_info, "creating UUID tree");
3295 3296
		ret = btrfs_create_uuid_tree(fs_info);
		if (ret) {
3297 3298
			btrfs_warn(fs_info,
				"failed to create the UUID tree: %d", ret);
3299
			close_ctree(fs_info);
3300 3301
			return ret;
		}
3302
	} else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3303 3304
		   fs_info->generation !=
				btrfs_super_uuid_tree_generation(disk_super)) {
3305
		btrfs_info(fs_info, "checking UUID tree");
3306 3307
		ret = btrfs_check_uuid_tree(fs_info);
		if (ret) {
3308 3309
			btrfs_warn(fs_info,
				"failed to check the UUID tree: %d", ret);
3310
			close_ctree(fs_info);
3311 3312 3313
			return ret;
		}
	} else {
3314
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
3315
	}
3316
	set_bit(BTRFS_FS_OPEN, &fs_info->flags);
3317

3318 3319 3320 3321 3322 3323
	/*
	 * backuproot only affect mount behavior, and if open_ctree succeeded,
	 * no need to keep the flag
	 */
	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);

3324
	return 0;

3326 3327
fail_qgroup:
	btrfs_free_qgroup_config(fs_info);
3328 3329
fail_trans_kthread:
	kthread_stop(fs_info->transaction_kthread);
3330
	btrfs_cleanup_transaction(fs_info);
3331
	btrfs_free_fs_roots(fs_info);
3332
fail_cleaner:
3333
	kthread_stop(fs_info->cleaner_kthread);
3334 3335 3336 3337 3338 3339 3340

	/*
	 * make sure we're done with the btree inode before we stop our
	 * kthreads
	 */
	filemap_write_and_wait(fs_info->btree_inode->i_mapping);

3341
fail_sysfs:
3342
	btrfs_sysfs_remove_mounted(fs_info);
3343

3344 3345 3346
fail_fsdev_sysfs:
	btrfs_sysfs_remove_fsid(fs_info->fs_devices);

3347
fail_block_groups:
	btrfs_put_block_group_cache(fs_info);
3349 3350

fail_tree_roots:
3351
	free_root_pointers(fs_info, true);
3352
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3353

fail_sb_buffer:
	btrfs_stop_all_workers(fs_info);
3356
	btrfs_free_block_groups(fs_info);
3357
fail_alloc:
3358
fail_iput:
3359 3360
	btrfs_mapping_tree_free(&fs_info->mapping_tree);

3361
	iput(fs_info->btree_inode);
3362 3363
fail_srcu:
	cleanup_srcu_struct(&fs_info->subvol_srcu);
3364
fail:
3365
	btrfs_close_devices(fs_info->fs_devices);
3366
	return err;
3367
}
3368
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3369

static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		struct btrfs_device *device = (struct btrfs_device *)
			bh->b_private;

		btrfs_warn_rl_in_rcu(device->fs_info,
				"lost page write due to IO error on %s",
					  rcu_str_deref(device->name));
		/* note, we don't set_buffer_write_io_error because we have
		 * our own ways of dealing with the IO errors
		 */
		clear_buffer_uptodate(bh);
		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

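/*
 * Read one superblock copy of a device, at the offset selected by @copy_num,
 * and do basic sanity checks (bytenr and magic). On success the buffer head
 * is returned via @bh_ret and the caller owns the reference.
 */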
int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
			struct buffer_head **bh_ret)
{
	struct buffer_head *bh;
	struct btrfs_super_block *super;
	u64 bytenr;

	bytenr = btrfs_sb_offset(copy_num);
	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
		return -EINVAL;

	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
	/*
	 * If we fail to read from the underlying devices, as of now
	 * the best option we have is to mark it EIO.
	 */
	if (!bh)
		return -EIO;

	super = (struct btrfs_super_block *)bh->b_data;
	if (btrfs_super_bytenr(super) != bytenr ||
		    btrfs_super_magic(super) != BTRFS_MAGIC) {
		brelse(bh);
		return -EINVAL;
	}

	*bh_ret = bh;
	return 0;
}


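/*
 * Read the newest usable superblock of a device. Only the primary copy is
 * considered (see the comment in the loop below); the buffer head of the
 * copy with the highest generation is returned, or an ERR_PTR on failure.
 */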
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
{
	struct buffer_head *bh;
	struct buffer_head *latest = NULL;
	struct btrfs_super_block *super;
	int i;
	u64 transid = 0;
	int ret = -EINVAL;

	/* we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	for (i = 0; i < 1; i++) {
		ret = btrfs_read_dev_one_super(bdev, i, &bh);
		if (ret)
			continue;

		super = (struct btrfs_super_block *)bh->b_data;

		if (!latest || btrfs_super_generation(super) > transid) {
			brelse(latest);
			latest = bh;
			transid = btrfs_super_generation(super);
		} else {
			brelse(bh);
		}
	}

	if (!latest)
		return ERR_PTR(ret);

	return latest;
}

/*
 * Write superblock @sb to the @device. Do not wait for completion, all the
 * buffer heads we write are pinned.
 *
 * Write @max_mirrors copies of the superblock, where 0 means the default,
 * i.e. all copies that fit the expected device size at commit time. Note
 * that max_mirrors must be the same for the write and wait phases.
 *
 * Return number of errors when a buffer head is not found or submission
 * fails.
 */
static int write_dev_supers(struct btrfs_device *device,
			    struct btrfs_super_block *sb, int max_mirrors)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	struct buffer_head *bh;
	int i;
	int ret;
	int errors = 0;
	u64 bytenr;
	int op_flags;

	if (max_mirrors == 0)
		max_mirrors = BTRFS_SUPER_MIRROR_MAX;

	shash->tfm = fs_info->csum_shash;

	for (i = 0; i < max_mirrors; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
		    device->commit_total_bytes)
			break;

		btrfs_set_super_bytenr(sb, bytenr);

		crypto_shash_init(shash);
		crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE,
				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
		crypto_shash_final(shash, sb->csum);

		/* One reference for us, and we leave it for the caller */
		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
			      BTRFS_SUPER_INFO_SIZE);
		if (!bh) {
			btrfs_err(device->fs_info,
			    "couldn't get super buffer head for bytenr %llu",
			    bytenr);
			errors++;
			continue;
		}

		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);

		/* one reference for submit_bh */
		get_bh(bh);

		set_buffer_uptodate(bh);
		lock_buffer(bh);
		bh->b_end_io = btrfs_end_buffer_write_sync;
		bh->b_private = device;

		/*
		 * We FUA the first super.  The others we allow to go
		 * down lazily.
		 */
		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
			op_flags |= REQ_FUA;
		ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
		if (ret)
			errors++;
	}
	return errors < i ? 0 : -1;
}

/*
 * Wait for write completion of superblocks done by write_dev_supers,
 * @max_mirrors same for write and wait phases.
 *
 * Return number of errors when buffer head is not found or not marked up to
 * date.
 */
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
	struct buffer_head *bh;
	int i;
	int errors = 0;
	bool primary_failed = false;
	u64 bytenr;

	if (max_mirrors == 0)
		max_mirrors = BTRFS_SUPER_MIRROR_MAX;

	for (i = 0; i < max_mirrors; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
		    device->commit_total_bytes)
			break;

		bh = __find_get_block(device->bdev,
				      bytenr / BTRFS_BDEV_BLOCKSIZE,
				      BTRFS_SUPER_INFO_SIZE);
		if (!bh) {
			errors++;
			if (i == 0)
				primary_failed = true;
			continue;
		}
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			errors++;
			if (i == 0)
				primary_failed = true;
		}

		/* drop our reference */
		brelse(bh);

		/* drop the reference from the writing run */
		brelse(bh);
	}

	/* log error, force error return */
	if (primary_failed) {
		btrfs_err(device->fs_info, "error writing primary super block to device %llu",
			  device->devid);
		return -1;
	}

	return errors < i ? 0 : -1;
}

/*
 * endio for the write_dev_flush, this will wake anyone waiting
 * for the barrier when it is done
 */
static void btrfs_end_empty_barrier(struct bio *bio)
{
	complete(bio->bi_private);
}

/*
 * Submit a flush request to the device if it supports it. Error handling is
 * done in the waiting counterpart.
 */
static void write_dev_flush(struct btrfs_device *device)
{
	struct request_queue *q = bdev_get_queue(device->bdev);
	struct bio *bio = device->flush_bio;

	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
		return;

	bio_reset(bio);
	bio->bi_end_io = btrfs_end_empty_barrier;
	bio_set_dev(bio, device->bdev);
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
	init_completion(&device->flush_wait);
	bio->bi_private = &device->flush_wait;

	btrfsic_submit_bio(bio);
	set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}

/*
 * If the flush bio has been submitted by write_dev_flush, wait for it.
 */
static blk_status_t wait_dev_flush(struct btrfs_device *device)
{
	struct bio *bio = device->flush_bio;

	if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
		return BLK_STS_OK;

	clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
	wait_for_completion_io(&device->flush_wait);

	return bio->bi_status;
}

static int check_barrier_error(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_check_rw_degradable(fs_info, NULL))
		return -EIO;
	return 0;
}

/*
 * send an empty flush down to each device in parallel,
 * then wait for them
 */
static int barrier_all_devices(struct btrfs_fs_info *info)
{
	struct list_head *head;
	struct btrfs_device *dev;
	int errors_wait = 0;
	blk_status_t ret;

	lockdep_assert_held(&info->fs_devices->device_list_mutex);
	/* send down all the barriers */
	head = &info->fs_devices->devices;
	list_for_each_entry(dev, head, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
			continue;
		if (!dev->bdev)
			continue;
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		write_dev_flush(dev);
		dev->last_flush_error = BLK_STS_OK;
	}

	/* wait for all the barriers */
	list_for_each_entry(dev, head, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
			continue;
		if (!dev->bdev) {
			errors_wait++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		ret = wait_dev_flush(dev);
		if (ret) {
			dev->last_flush_error = ret;
			btrfs_dev_stat_inc_and_print(dev,
					BTRFS_DEV_STAT_FLUSH_ERRS);
			errors_wait++;
		}
	}

	if (errors_wait) {
		/*
		 * At some point we need the status of all disks
		 * to arrive at the volume status. So error checking
		 * is being pushed to a separate loop.
		 */
		return check_barrier_error(info);
	}
	return 0;
}

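/*
 * Return the lowest number of device failures that the block group profiles
 * in @flags can tolerate, based on btrfs_raid_array. Unknown flags are
 * warned about and treated as tolerating zero failures.
 */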
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
{
	int raid_type;
	int min_tolerated = INT_MAX;

	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
	    (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
		min_tolerated = min_t(int, min_tolerated,
				    btrfs_raid_array[BTRFS_RAID_SINGLE].
				    tolerated_failures);

	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (raid_type == BTRFS_RAID_SINGLE)
			continue;
		if (!(flags & btrfs_raid_array[raid_type].bg_flag))
			continue;
		min_tolerated = min_t(int, min_tolerated,
				    btrfs_raid_array[raid_type].
				    tolerated_failures);
	}

	if (min_tolerated == INT_MAX) {
		pr_warn("BTRFS: unknown raid flag: %llu", flags);
		min_tolerated = 0;
	}

	return min_tolerated;
}

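/*
 * Write the superblock for the current transaction to all writeable devices.
 * Device flush barriers are sent first unless nobarrier is set, then the
 * checksummed and validated super block is written to each device's mirrors
 * and the writes are waited for. Returns -EIO if too many devices failed.
 */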
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
{
	struct list_head *head;
	struct btrfs_device *dev;
	struct btrfs_super_block *sb;
	struct btrfs_dev_item *dev_item;
	int ret;
	int do_barriers;
	int max_errors;
	int total_errors = 0;
	u64 flags;

	do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);

	/*
	 * max_mirrors == 0 indicates we're from commit_transaction,
	 * not from fsync where the tree roots in fs_info have not
	 * been consistent on disk.
	 */
	if (max_mirrors == 0)
		backup_super_roots(fs_info);

	sb = fs_info->super_for_commit;
	dev_item = &sb->dev_item;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	head = &fs_info->fs_devices->devices;
	max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;

	if (do_barriers) {
		ret = barrier_all_devices(fs_info);
		if (ret) {
			mutex_unlock(
				&fs_info->fs_devices->device_list_mutex);
			btrfs_handle_fs_error(fs_info, ret,
					      "errors while submitting device barriers.");
			return ret;
		}
	}

	list_for_each_entry(dev, head, dev_list) {
		if (!dev->bdev) {
			total_errors++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		btrfs_set_stack_device_generation(dev_item, 0);
		btrfs_set_stack_device_type(dev_item, dev->type);
		btrfs_set_stack_device_id(dev_item, dev->devid);
		btrfs_set_stack_device_total_bytes(dev_item,
						   dev->commit_total_bytes);
		btrfs_set_stack_device_bytes_used(dev_item,
						  dev->commit_bytes_used);
		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
		memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
		       BTRFS_FSID_SIZE);

		flags = btrfs_super_flags(sb);
		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);

		ret = btrfs_validate_write_super(fs_info, sb);
		if (ret < 0) {
			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			btrfs_handle_fs_error(fs_info, -EUCLEAN,
				"unexpected superblock corruption detected");
			return -EUCLEAN;
		}

		ret = write_dev_supers(dev, sb, max_mirrors);
		if (ret)
			total_errors++;
	}
	if (total_errors > max_errors) {
		btrfs_err(fs_info, "%d errors while writing supers",
			  total_errors);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		/* FUA is masked off if unsupported and can't be the reason */
		btrfs_handle_fs_error(fs_info, -EIO,
				      "%d errors while writing supers",
				      total_errors);
		return -EIO;
	}

	total_errors = 0;
	list_for_each_entry(dev, head, dev_list) {
		if (!dev->bdev)
			continue;
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		ret = wait_dev_supers(dev, max_mirrors);
		if (ret)
			total_errors++;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	if (total_errors > max_errors) {
		btrfs_handle_fs_error(fs_info, -EIO,
				      "%d errors while writing supers",
				      total_errors);
		return -EIO;
	}
	return 0;
}

/* Drop a fs root from the radix tree and free it. */
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
				  struct btrfs_root *root)
{
	spin_lock(&fs_info->fs_roots_radix_lock);
	radix_tree_delete(&fs_info->fs_roots_radix,
			  (unsigned long)root->root_key.objectid);
	if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		btrfs_put_fs_root(root);
	spin_unlock(&fs_info->fs_roots_radix_lock);

	if (btrfs_root_refs(&root->root_item) == 0)
		synchronize_srcu(&fs_info->subvol_srcu);

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
		btrfs_free_log(NULL, root);
		if (root->reloc_root) {
			free_extent_buffer(root->reloc_root->node);
			free_extent_buffer(root->reloc_root->commit_root);
			btrfs_put_fs_root(root->reloc_root);
			root->reloc_root = NULL;
		}
	}

	if (root->free_ino_pinned)
		__btrfs_remove_free_space_cache(root->free_ino_pinned);
	if (root->free_ino_ctl)
		__btrfs_remove_free_space_cache(root->free_ino_ctl);
	btrfs_free_fs_root(root);
}

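/*
 * Release the memory held by a fs root: the inode cache inode, anonymous
 * bdev, subvolume writers, cached tree nodes and free-ino caches, then drop
 * the final reference on the root itself.
 */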
void btrfs_free_fs_root(struct btrfs_root *root)
{
	iput(root->ino_cache_inode);
	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
	if (root->anon_dev)
		free_anon_bdev(root->anon_dev);
	if (root->subv_writers)
		btrfs_free_subvolume_writers(root->subv_writers);
	free_extent_buffer(root->node);
	free_extent_buffer(root->commit_root);
	kfree(root->free_ino_ctl);
	kfree(root->free_ino_pinned);
	btrfs_put_fs_root(root);
}

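/*
 * Walk all fs roots in the radix tree and run orphan cleanup on each of
 * them. Roots already in dead_roots (zero refs) are skipped. Returns the
 * first error encountered, after dropping any roots still held.
 */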
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
	u64 root_objectid = 0;
	struct btrfs_root *gang[8];
	int i = 0;
	int err = 0;
	unsigned int ret = 0;
	int index;

	while (1) {
		index = srcu_read_lock(&fs_info->subvol_srcu);
		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					     (void **)gang, root_objectid,
					     ARRAY_SIZE(gang));
		if (!ret) {
			srcu_read_unlock(&fs_info->subvol_srcu, index);
			break;
		}
		root_objectid = gang[ret - 1]->root_key.objectid + 1;

		for (i = 0; i < ret; i++) {
			/* Avoid grabbing roots in dead_roots */
			if (btrfs_root_refs(&gang[i]->root_item) == 0) {
				gang[i] = NULL;
				continue;
			}
			/* grab all the search results for later use */
			gang[i] = btrfs_grab_fs_root(gang[i]);
		}
		srcu_read_unlock(&fs_info->subvol_srcu, index);

		for (i = 0; i < ret; i++) {
			if (!gang[i])
				continue;
			root_objectid = gang[i]->root_key.objectid;
			err = btrfs_orphan_cleanup(gang[i]);
			if (err)
				break;
			btrfs_put_fs_root(gang[i]);
		}
		root_objectid++;
	}

	/* release the uncleaned roots due to error */
	for (; i < ret; i++) {
		if (gang[i])
			btrfs_put_fs_root(gang[i]);
	}
	return err;
}

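/*
 * Run delayed iputs, wake the cleaner and wait for any in-flight cleanup
 * work, then join and commit the current transaction so that everything is
 * persisted on disk.
 */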
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;

	mutex_lock(&fs_info->cleaner_mutex);
	btrfs_run_delayed_iputs(fs_info);
	mutex_unlock(&fs_info->cleaner_mutex);
	wake_up_process(fs_info->cleaner_kthread);

	/* wait until ongoing cleanup work is done */
	down_write(&fs_info->cleanup_work_sem);
	up_write(&fs_info->cleanup_work_sem);

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	return btrfs_commit_transaction(trans);
}

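/*
 * Tear down the filesystem at unmount time: stop the background threads and
 * workers, commit outstanding work if we are still read-write, and release
 * the remaining state (roots, block groups, devices, the btree inode).
 */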
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
	int ret;

	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
	/*
	 * We don't want the cleaner to start new transactions, add more delayed
	 * iputs, etc. while we're closing. We can't use kthread_stop() yet
	 * because that frees the task_struct, and the transaction kthread might
	 * still try to wake up the cleaner.
	 */
	kthread_park(fs_info->cleaner_kthread);

	/* wait for the qgroup rescan worker to stop */
	btrfs_qgroup_wait_for_completion(fs_info, false);

	/* wait for the uuid_scan task to finish */
	down(&fs_info->uuid_tree_rescan_sem);
	/* avoid complaints from lockdep et al., set sem back to initial state */
	up(&fs_info->uuid_tree_rescan_sem);

	/* pause restriper - we want to resume on mount */
	btrfs_pause_balance(fs_info);

	btrfs_dev_replace_suspend_for_unmount(fs_info);

	btrfs_scrub_cancel(fs_info);

	/* wait for any defraggers to finish */
	wait_event(fs_info->transaction_wait,
		   (atomic_read(&fs_info->defrag_running) == 0));

	/* clear out the rbtree of defraggable inodes */
	btrfs_cleanup_defrag_inodes(fs_info);

	cancel_work_sync(&fs_info->async_reclaim_work);

	/* Cancel or finish ongoing discard work */
	btrfs_discard_cleanup(fs_info);

	if (!sb_rdonly(fs_info->sb)) {
		/*
		 * The cleaner kthread is stopped, so do one final pass over
		 * unused block groups.
		 */
		btrfs_delete_unused_bgs(fs_info);

		ret = btrfs_commit_super(fs_info);
		if (ret)
			btrfs_err(fs_info, "commit super ret %d", ret);
	}

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
	    test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
		btrfs_error_commit_super(fs_info);

	kthread_stop(fs_info->transaction_kthread);
	kthread_stop(fs_info->cleaner_kthread);

	ASSERT(list_empty(&fs_info->delayed_iputs));
	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);

	btrfs_free_qgroup_config(fs_info);
	ASSERT(list_empty(&fs_info->delalloc_roots));

	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
		btrfs_info(fs_info, "at unmount delalloc count %lld",
		       percpu_counter_sum(&fs_info->delalloc_bytes));
	}

	if (percpu_counter_sum(&fs_info->dio_bytes))
		btrfs_info(fs_info, "at unmount dio bytes count %lld",
			   percpu_counter_sum(&fs_info->dio_bytes));

	btrfs_sysfs_remove_mounted(fs_info);
	btrfs_sysfs_remove_fsid(fs_info->fs_devices);

	btrfs_free_fs_roots(fs_info);

	btrfs_put_block_group_cache(fs_info);

	/*
	 * we must make sure there is not any read request to
	 * submit after we stop all workers.
	 */
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
	btrfs_stop_all_workers(fs_info);

	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
	free_root_pointers(fs_info, true);

	/*
	 * We must free the block groups after dropping the fs_roots as we could
	 * have had an IO error and have left over tree log blocks that aren't
	 * cleaned up until the fs roots are freed.  This makes the block group
	 * accounting appear to be wrong because there's pending reserved bytes,
	 * so make sure we do the block group cleanup afterwards.
	 */
	btrfs_free_block_groups(fs_info);

	iput(fs_info->btree_inode);

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
		btrfsic_unmount(fs_info->fs_devices);
#endif

	btrfs_mapping_tree_free(&fs_info->mapping_tree);
	btrfs_close_devices(fs_info->fs_devices);
	cleanup_srcu_struct(&fs_info->subvol_srcu);
}

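/*
 * Check that an extent buffer is up to date and that its generation matches
 * @parent_transid. Returns 0 when the buffer is stale, 1 when it is usable;
 * a -EAGAIN result from the transid verification is passed through unchanged.
 */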
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
			  int atomic)
{
	int ret;
	struct inode *btree_inode = buf->pages[0]->mapping->host;

	ret = extent_buffer_uptodate(buf);
	if (!ret)
		return ret;

	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
				    parent_transid, atomic);
	if (ret == -EAGAIN)
		return ret;
	return !ret;
}

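/*
 * Mark a metadata extent buffer dirty and account the newly dirtied bytes.
 * The buffer must be tree locked and belong to the currently running
 * transaction generation.
 */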
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_root *root;
	u64 transid = btrfs_header_generation(buf);
	int was_dirty;

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	/*
	 * This is a fast path so only do this check if we have sanity tests
	 * enabled.  Normal people shouldn't be using unmapped buffers as dirty
	 * outside of the sanity tests.
	 */
	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
		return;
#endif
	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
	fs_info = root->fs_info;
	btrfs_assert_tree_locked(buf);
	if (transid != fs_info->generation)
		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
			buf->start, transid, fs_info->generation);
	was_dirty = set_extent_buffer_dirty(buf);
	if (!was_dirty)
		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
					 buf->len,
					 fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	/*
	 * btrfs_mark_buffer_dirty() can be called with the item pointer set
	 * but the item data not yet updated, so only check item pointers
	 * here, not item data.
	 */
	if (btrfs_header_level(buf) == 0 &&
	    btrfs_check_leaf_relaxed(buf)) {
		btrfs_print_leaf(buf);
		ASSERT(0);
	}
#endif
}

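/*
 * Throttle the caller when too much dirty btree metadata has accumulated,
 * optionally flushing delayed items first. A no-op for PF_MEMALLOC contexts.
 */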
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
					int flush_delayed)
{
	/*
	 * looks as though older kernels can get into trouble with
	 * this code, they end up stuck in balance_dirty_pages forever
	 */
	int ret;

	if (current->flags & PF_MEMALLOC)
		return;

	if (flush_delayed)
		btrfs_balance_delayed_items(fs_info);

	ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				     BTRFS_DIRTY_METADATA_THRESH,
				     fs_info->dirty_metadata_batch);
	if (ret > 0) {
		balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
	}
}

void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
	__btrfs_btree_balance_dirty(fs_info, 1);
}

void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
	__btrfs_btree_balance_dirty(fs_info, 0);
}

int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
		      struct btrfs_key *first_key)
{
	return btree_read_extent_buffer_pages(buf, parent_transid,
					      level, first_key);
}

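/*
 * Best-effort cleanup when the filesystem is in an error state at unmount:
 * abort the open transactions, run delayed iputs, and wait for any cleanup
 * work that is still in flight.
 */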
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
	/* cleanup FS via transaction */
	btrfs_cleanup_transaction(fs_info);

	mutex_lock(&fs_info->cleaner_mutex);
	btrfs_run_delayed_iputs(fs_info);
	mutex_unlock(&fs_info->cleaner_mutex);

	down_write(&fs_info->cleanup_work_sem);
	up_write(&fs_info->cleanup_work_sem);
}

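/*
 * Mark every ordered extent of a root as having hit an IO error; as the
 * comment in the body notes, this short circuits ordered completion so the
 * extents get cleaned up properly.
 */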
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
	struct btrfs_ordered_extent *ordered;

	spin_lock(&root->ordered_extent_lock);
	/*
	 * This will just short circuit the ordered completion stuff which will
	 * make sure the ordered extent gets properly cleaned up.
	 */
	list_for_each_entry(ordered, &root->ordered_extents,
			    root_extent_list)
		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
	spin_unlock(&root->ordered_extent_lock);
}

static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice)) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);

		spin_unlock(&fs_info->ordered_root_lock);
		btrfs_destroy_ordered_extents(root);

		cond_resched();
		spin_lock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&fs_info->ordered_root_lock);

	/*
	 * We need this here because if we've been flipped read-only we won't
	 * get sync() from the umount, so we need to make sure any ordered
	 * extents that haven't had their dirty pages IO start writeout yet
	 * actually get run and error out properly.
	 */
	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}

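/*
 * Drop all delayed refs of an aborted transaction. Heads that still had to
 * insert their reserved extent get that range pinned again, and the
 * transaction's qgroup extent records are destroyed as well.
 */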
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_node *ref;
	int ret = 0;

	delayed_refs = &trans->delayed_refs;

	spin_lock(&delayed_refs->lock);
	if (atomic_read(&delayed_refs->num_entries) == 0) {
		spin_unlock(&delayed_refs->lock);
		btrfs_info(fs_info, "delayed_refs has NO entry");
		return ret;
	}

	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
		struct btrfs_delayed_ref_head *head;
		struct rb_node *n;
		bool pin_bytes = false;

		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		if (btrfs_delayed_ref_lock(delayed_refs, head))
			continue;

		spin_lock(&head->lock);
		while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
			ref = rb_entry(n, struct btrfs_delayed_ref_node,
				       ref_node);
			ref->in_tree = 0;
			rb_erase_cached(&ref->ref_node, &head->ref_tree);
			RB_CLEAR_NODE(&ref->ref_node);
			if (!list_empty(&ref->add_list))
				list_del(&ref->add_list);
			atomic_dec(&delayed_refs->num_entries);
			btrfs_put_delayed_ref(ref);
		}
		if (head->must_insert_reserved)
			pin_bytes = true;
		btrfs_free_delayed_extent_op(head->extent_op);
		btrfs_delete_ref_head(delayed_refs, head);
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		mutex_unlock(&head->mutex);

		if (pin_bytes)
			btrfs_pin_extent(fs_info, head->bytenr,
					 head->num_bytes, 1);
		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
		btrfs_put_delayed_ref_head(head);
		cond_resched();
		spin_lock(&delayed_refs->lock);
	}
	btrfs_qgroup_destroy_extent_records(trans);

	spin_unlock(&delayed_refs->lock);

	return ret;
}

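/*
 * Drop all delalloc inodes of a root and invalidate their page cache so
 * that dirty pages are not written back after the transaction is aborted.
 */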
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
	struct btrfs_inode *btrfs_inode;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);

	while (!list_empty(&splice)) {
		struct inode *inode = NULL;
		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
					       delalloc_inodes);
		__btrfs_del_delalloc_inode(root, btrfs_inode);
		spin_unlock(&root->delalloc_lock);

		/*
		 * Make sure we get a live inode and that it'll not disappear
		 * meanwhile.
		 */
		inode = igrab(&btrfs_inode->vfs_inode);
		if (inode) {
			invalidate_inode_pages2(inode->i_mapping);
			iput(inode);
		}
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);
}

static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice)) {
		root = list_first_entry(&splice, struct btrfs_root,
					 delalloc_root);
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		spin_unlock(&fs_info->delalloc_root_lock);

		btrfs_destroy_delalloc_inodes(root);
		btrfs_put_fs_root(root);

		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);
}

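/*
 * Clear @mark from @dirty_pages and drop the dirty bit of any extent buffer
 * found in the cleared ranges, so aborted metadata never gets written back.
 */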
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
					struct extent_io_tree *dirty_pages,
					int mark)
{
	int ret;
	struct extent_buffer *eb;
	u64 start = 0;
	u64 end;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark, NULL);
		if (ret)
			break;

		clear_extent_bits(dirty_pages, start, end, mark);
		while (start <= end) {
			eb = find_extent_buffer(fs_info, start);
			start += fs_info->nodesize;
			if (!eb)
				continue;
			wait_on_extent_buffer_writeback(eb);

			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
					       &eb->bflags))
				clear_extent_buffer_dirty(eb);
			free_extent_buffer_stale(eb);
		}
	}

	return ret;
}

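/*
 * Unpin all extents recorded in both freed_extents trees of an aborted
 * transaction, taking unused_bg_unpin_mutex to avoid racing with
 * btrfs_finish_extent_commit() on the same ranges.
 */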
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *pinned_extents)
{
	struct extent_io_tree *unpin;
	u64 start;
	u64 end;
	int ret;
	bool loop = true;

	unpin = pinned_extents;
again:
	while (1) {
		struct extent_state *cached_state = NULL;

		/*
		 * The btrfs_finish_extent_commit() may get the same range as
		 * ours between find_first_extent_bit and clear_extent_dirty.
		 * Hence, hold the unused_bg_unpin_mutex to avoid double
		 * unpinning the same extent range.
		 */
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = find_first_extent_bit(unpin, 0, &start, &end,
					    EXTENT_DIRTY, &cached_state);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			break;
		}

		clear_extent_dirty(unpin, start, end, &cached_state);
		free_extent_state(cached_state);
		btrfs_error_unpin_extent_range(fs_info, start, end);
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
		cond_resched();
	}

	if (loop) {
		if (unpin == &fs_info->freed_extents[0])
			unpin = &fs_info->freed_extents[1];
		else
			unpin = &fs_info->freed_extents[0];
		loop = false;
		goto again;
	}

	return 0;
}

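/*
 * Release the free space cache inode of a block group that still had IO in
 * flight when the transaction was aborted, then drop the group reference.
 */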
static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
{
	struct inode *inode;

	inode = cache->io_ctl.inode;
	if (inode) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
		cache->io_ctl.inode = NULL;
		iput(inode);
	}
	btrfs_put_block_group(cache);
}

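/*
 * Clean up block groups left on the transaction's dirty_bgs and io_bgs
 * lists after an abort, marking their space cache state as error and
 * dropping the references the lists held.
 */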
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *cache;

	spin_lock(&cur_trans->dirty_bgs_lock);
	while (!list_empty(&cur_trans->dirty_bgs)) {
		cache = list_first_entry(&cur_trans->dirty_bgs,
					 struct btrfs_block_group,
					 dirty_list);

		if (!list_empty(&cache->io_list)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_cleanup_bg_io(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		list_del_init(&cache->dirty_list);
		spin_lock(&cache->lock);
		cache->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&cache->lock);

		spin_unlock(&cur_trans->dirty_bgs_lock);
		btrfs_put_block_group(cache);
		btrfs_delayed_refs_rsv_release(fs_info, 1);
		spin_lock(&cur_trans->dirty_bgs_lock);
	}
	spin_unlock(&cur_trans->dirty_bgs_lock);

	/*
	 * Refer to the definition of io_bgs member for details why it's safe
	 * to use it without any locking
	 */
	while (!list_empty(&cur_trans->io_bgs)) {
		cache = list_first_entry(&cur_trans->io_bgs,
					 struct btrfs_block_group,
					 io_list);

		list_del_init(&cache->io_list);
		spin_lock(&cache->lock);
		cache->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&cache->lock);
		btrfs_cleanup_bg_io(cache);
	}
}

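/*
 * Release everything a single aborted transaction still holds: dirty block
 * groups, delayed refs, delayed inodes, dirty and pinned extents, and wake
 * up any waiters so the transaction can be considered complete.
 */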
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
				   struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *dev, *tmp;

	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
	ASSERT(list_empty(&cur_trans->dirty_bgs));
	ASSERT(list_empty(&cur_trans->io_bgs));

	list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&dev->post_commit_list);
	}

	btrfs_destroy_delayed_refs(cur_trans, fs_info);

	cur_trans->state = TRANS_STATE_COMMIT_START;
	wake_up(&fs_info->transaction_blocked_wait);

	cur_trans->state = TRANS_STATE_UNBLOCKED;
	wake_up(&fs_info->transaction_wait);

	btrfs_destroy_delayed_inodes(fs_info);

	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
				     EXTENT_DIRTY);
	btrfs_destroy_pinned_extent(fs_info,
				    fs_info->pinned_extents);

	cur_trans->state = TRANS_STATE_COMPLETED;
	wake_up(&cur_trans->commit_wait);
}

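/*
 * Walk the list of transactions on an aborted filesystem and clean each of
 * them up, waiting for committing transactions to finish first, then tear
 * down any remaining ordered extents, delayed inodes, pinned extents and
 * delalloc inodes.
 */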
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
{
	struct btrfs_transaction *t;

	mutex_lock(&fs_info->transaction_kthread_mutex);

	spin_lock(&fs_info->trans_lock);
	while (!list_empty(&fs_info->trans_list)) {
		t = list_first_entry(&fs_info->trans_list,
				     struct btrfs_transaction, list);
		if (t->state >= TRANS_STATE_COMMIT_START) {
			refcount_inc(&t->use_count);
			spin_unlock(&fs_info->trans_lock);
			btrfs_wait_for_commit(fs_info, t->transid);
			btrfs_put_transaction(t);
			spin_lock(&fs_info->trans_lock);
			continue;
		}
		if (t == fs_info->running_transaction) {
			t->state = TRANS_STATE_COMMIT_DOING;
			spin_unlock(&fs_info->trans_lock);
			/*
			 * We wait for 0 num_writers since we don't hold a trans
			 * handle open currently for this transaction.
			 */
			wait_event(t->writer_wait,
				   atomic_read(&t->num_writers) == 0);
		} else {
			spin_unlock(&fs_info->trans_lock);
		}
		btrfs_cleanup_one_transaction(t, fs_info);

		spin_lock(&fs_info->trans_lock);
		if (t == fs_info->running_transaction)
			fs_info->running_transaction = NULL;
		list_del_init(&t->list);
		spin_unlock(&fs_info->trans_lock);

		btrfs_put_transaction(t);
		trace_btrfs_transaction_commit(fs_info->tree_root);
		spin_lock(&fs_info->trans_lock);
	}
	spin_unlock(&fs_info->trans_lock);
	btrfs_destroy_all_ordered_extents(fs_info);
	btrfs_destroy_delayed_inodes(fs_info);
	btrfs_assert_delayed_root_empty(fs_info);
	btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
	btrfs_destroy_all_delalloc_inodes(fs_info);
	mutex_unlock(&fs_info->transaction_kthread_mutex);

	return 0;
}

static const struct extent_io_ops btree_extent_io_ops = {
	/* mandatory callbacks */
	.submit_bio_hook = btree_submit_bio_hook,
	.readpage_end_io_hook = btree_readpage_end_io_hook,
};