1. 10 Jun, 2014 40 commits
    • Filipe Manana's avatar
      Btrfs: ensure readers see new data after a clone operation · c125b8bf
      Filipe Manana authored
      We were cleaning the clone target file range from the page cache before
      we did replace the file extent items in the fs tree. This was racy,
      as right after cleaning the relevant range from the page cache and before
      replacing the file extent items, a read against that range could be
      performed by another task and populate again the page cache with stale
      data (stale after the cloning finishes). As a result, reads issued after
      the clone operation successfully finished could return old data (potentially
      for a very long time). Therefore evict the pages after replacing the file
      extent items, so that subsequent reads will always get the new data.
      
      Similarly, we were prone to races while cloning the file extent items
      because we weren't locking the target range and waiting for any existing
      ordered extents against that range to complete. It was possible that
      after cloning the extent items, a write operation that was performed
      before the clone operation and overlaps the same range, would end up
      undoing all or part of the work the clone operation did (a worker task
      running inode.c:btrfs_finish_ordered_io). Therefore lock the target
      range in the io tree, wait for all pending ordered extents against that
      range to finish and then safely perform the cloning.
      
      The issue of reading stale data after the clone operation is easy to
      reproduce by running the following C program in a loop until it exits
      with return value 1.
      
       #include <unistd.h>
       #include <stdio.h>
       #include <stdlib.h>
       #include <string.h>
       #include <errno.h>
       #include <pthread.h>
       #include <fcntl.h>
       #include <assert.h>
       #include <asm/types.h>
       #include <linux/ioctl.h>
       #include <sys/stat.h>
       #include <sys/types.h>
       #include <sys/ioctl.h>
      
       /* Path of the file that main() opens read-only and uses as the clone source. */
       #define SRC_FILE "/mnt/sdd/foo"
       /* Path of the file the clone ioctl is issued on and that the reader thread reads. */
       #define DST_FILE "/mnt/sdd/bar"
       /* Number of bytes main() writes into each of the two files. */
       #define FILE_SIZE (16 * 1024)
       /* Byte value used to fill SRC_FILE; the reader expects it after the clone. */
       #define PATTERN_SRC 'X'
       /* Byte value used to fill DST_FILE before the clone. */
       #define PATTERN_DST 'Y'
      
      /*
       * Argument block handed to the BTRFS_IOC_CLONE_RANGE ioctl.
       *
       * The ioctl is issued on the destination file descriptor; this structure
       * names the source file and the byte ranges involved.
       */
      struct btrfs_ioctl_clone_range_args {
      	/* File descriptor of the file to clone from. */
      	__s64 src_fd;
      	/* Byte offset in the source file where the cloned range starts. */
      	__u64 src_offset;
      	/* Length in bytes of the range to clone. */
      	__u64 src_length;
      	/* Byte offset in the destination file where the range is placed. */
      	__u64 dest_offset;
      };
      
       #define BTRFS_IOCTL_MAGIC 0x94
       #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
      				   struct btrfs_ioctl_clone_range_args)
      
      /* Guards the three flags below, which are shared between main() and the reader thread. */
      static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
      /* Set to 1 by main() once the clone ioctl has returned and both fds are closed. */
      static int clone_done = 0;
      /* Set to 1 by the reader thread when it starts; main() waits for it before cloning. */
      static int reader_ready = 0;
      /* Set to 1 by the reader thread when a post-clone read does not match PATTERN_SRC. */
      static int stale_data = 0;
      
      /*
       * Reader thread body: repeatedly opens DST_FILE and reads its first 4096
       * bytes until it performs a read that started after main() flagged the
       * clone as done. That final read is compared against PATTERN_SRC; on a
       * mismatch stale_data is set to 1.
       *
       * arg is unused. Always returns NULL. Failures of open() or short reads
       * trip an assert().
       */
      static void *reader_loop(void *arg)
      {
      	char buf[4096], want_buf[4096];
      
      	/* Expected content of the first 4096 bytes once the clone has completed. */
      	memset(want_buf, PATTERN_SRC, 4096);
      	/* Tell main() the reader is running so it can start the clone. */
      	pthread_mutex_lock(&mutex);
      	reader_ready = 1;
      	pthread_mutex_unlock(&mutex);
      
      	while (1) {
      		int done, fd, ret;
      
      		fd = open(DST_FILE, O_RDONLY);
      		assert(fd != -1);
      
      		/*
      		 * Sample clone_done BEFORE reading: if it is already set, the
      		 * read below was issued after the clone was reported finished,
      		 * so it is the one whose content gets checked.
      		 */
      		pthread_mutex_lock(&mutex);
      		done = clone_done;
      		pthread_mutex_unlock(&mutex);
      
      		ret = read(fd, buf, 4096);
      		assert(ret == 4096);
      		close(fd);
      
      		if (done) {
      			ret = memcmp(buf, want_buf, 4096);
      			if (ret == 0) {
      				printf("Found new content\n");
      			} else {
      				/* Data differs from the source pattern: record it for main(). */
      				printf("Found old content\n");
      				pthread_mutex_lock(&mutex);
      				stale_data = 1;
      				pthread_mutex_unlock(&mutex);
      			}
      			/* Only one post-clone read is checked; stop afterwards. */
      			break;
      		}
      	}
      	return NULL;
      }
      
      /*
       * Reproducer driver: recreates SRC_FILE and DST_FILE filled with different
       * patterns, starts the reader thread, clones the first 4096 bytes of
       * SRC_FILE over the start of DST_FILE and then reports whether the reader
       * observed old data after the clone was flagged as done.
       *
       * argc and argv are unused.
       *
       * Returns 1 if the reader set stale_data, or if removing/opening one of the
       * files failed (an error is printed to stderr in that case); returns 0
       * otherwise. Other failures trip an assert().
       */
      int main(int argc, char *argv[])
      {
      	pthread_t reader;
      	int ret, i, fd;
      	struct btrfs_ioctl_clone_range_args clone_args;
      	int fd1, fd2;
      
      	/* Start from a clean state; a file that does not exist yet is fine. */
      	ret = remove(SRC_FILE);
      	if (ret == -1 && errno != ENOENT) {
      		fprintf(stderr, "Error deleting src file: %s\n", strerror(errno));
      		return 1;
      	}
      	ret = remove(DST_FILE);
      	if (ret == -1 && errno != ENOENT) {
      		fprintf(stderr, "Error deleting dst file: %s\n", strerror(errno));
      		return 1;
      	}
      
      	/* Fill the source file with FILE_SIZE bytes of PATTERN_SRC, one byte per write. */
      	fd = open(SRC_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
      	assert(fd != -1);
      	for (i = 0; i < FILE_SIZE; i++) {
      		char c = PATTERN_SRC;
      		ret = write(fd, &c, 1);
      		assert(ret == 1);
      	}
      	close(fd);
      	/* Fill the destination file with FILE_SIZE bytes of PATTERN_DST. */
      	fd = open(DST_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
      	assert(fd != -1);
      	for (i = 0; i < FILE_SIZE; i++) {
      		char c = PATTERN_DST;
      		ret = write(fd, &c, 1);
      		assert(ret == 1);
      	}
      	close(fd);
              /* NOTE(review): presumably flushes both files to disk before the clone - confirm. */
              sync();
      
      	ret = pthread_create(&reader, NULL, reader_loop, NULL);
      	assert(ret == 0);
      	/* Spin until the reader thread reports that it has started. */
      	while (1) {
      		int r;
      		pthread_mutex_lock(&mutex);
      		r = reader_ready;
      		pthread_mutex_unlock(&mutex);
      		if (r) break;
      	}
      
      	fd1 = open(SRC_FILE, O_RDONLY);
      	if (fd1 < 0) {
      		fprintf(stderr, "Error open src file: %s\n", strerror(errno));
      		return 1;
      	}
      	fd2 = open(DST_FILE, O_RDWR);
      	if (fd2 < 0) {
      		fprintf(stderr, "Error open dst file: %s\n", strerror(errno));
      		return 1;
      	}
      	/* Clone bytes [0, 4096) of the source to offset 0 of the destination. */
      	clone_args.src_fd = fd1;
      	clone_args.src_offset = 0;
      	clone_args.src_length = 4096;
      	clone_args.dest_offset = 0;
      	ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, &clone_args);
      	assert(ret == 0);
      	close(fd1);
      	close(fd2);
      
      	/*
      	 * Only now tell the reader the clone is finished; any read it starts
      	 * after seeing this flag is expected to return the source pattern.
      	 */
      	pthread_mutex_lock(&mutex);
      	clone_done = 1;
      	pthread_mutex_unlock(&mutex);
      	ret = pthread_join(reader, NULL);
      	assert(ret == 0);
      
      	/* Exit status 1 means the reader saw old data after the clone. */
      	pthread_mutex_lock(&mutex);
      	ret = stale_data ? 1 : 0;
      	pthread_mutex_unlock(&mutex);
      	return ret;
      }
      Signed-off-by: default avatarFilipe David Borba Manana <fdmanana@gmail.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      c125b8bf
    • Rickard Strandqvist's avatar
      fs: btrfs: volumes.c: Fix for possible null pointer dereference · 8321cf25
      Rickard Strandqvist authored
      There is otherwise a risk of a possible null pointer dereference.
      
      Was largely found by using a static code analysis program called cppcheck.
      Signed-off-by: default avatarRickard Strandqvist <rickard_strandqvist@spectrumdigital.se>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      8321cf25
    • Jeff Mahoney's avatar
      btrfs: allocate raid type kobjects dynamically · c1895442
      Jeff Mahoney authored
      We are currently allocating space_info objects in an array when we
      allocate space_info. When a user does something like:
      
      # btrfs balance start -mconvert=raid1 -dconvert=raid1 /mnt
      # btrfs balance start -mconvert=single -dconvert=single /mnt -f
      # btrfs balance start -mconvert=raid1 -dconvert=raid1 /
      
      We can end up with memory corruption since the kobject hasn't
      been reinitialized properly and the name pointer was left set.
      
      The rationale behind allocating them statically was to avoid
      creating a separate kobject container that just contained the
      raid type. It used the index in the array to determine the index.
      
      Ultimately, though, this wastes more memory than it saves in all
      but the most complex scenarios and introduces kobject lifetime
      questions.
      
      This patch allocates the kobjects dynamically instead. Note that
      we also remove the kobject_get/put of the parent kobject since
      kobject_add and kobject_del do that internally.
      Signed-off-by: default avatarJeff Mahoney <jeffm@suse.com>
      Reported-by: default avatarDavid Sterba <dsterba@suse.cz>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      c1895442
    • Filipe Manana's avatar
      Btrfs: send, use the right limits for xattr names and values · 7e3ae33e
      Filipe Manana authored
      We were limiting the sum of the xattr name and value lengths to PATH_MAX,
      which is not correct, especially on filesystems created with btrfs-progs
      v3.12 or higher, where the default leaf size is max(16384, PAGE_SIZE), or
      systems with page sizes larger than 4096 bytes.
      
      Xattrs have their own specific maximum name and value lengths, which depend
      on the leaf size, therefore use these limits to be able to send xattrs with
      sizes larger than PATH_MAX.
      
      A test case for xfstests follows.
      Signed-off-by: default avatarFilipe David Borba Manana <fdmanana@gmail.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      7e3ae33e
    • Filipe Manana's avatar
      Btrfs: send, don't error in the presence of subvols/snapshots · 1af56070
      Filipe Manana authored
      If we are doing an incremental send and the base snapshot has a
      directory with name X that doesn't exist anymore in the second
      snapshot and a new subvolume/snapshot exists in the second snapshot
      that has the same name as the directory (name X), the incremental
      send would fail with -ENOENT error. This is because it attempts
      to lookup for an inode with a number matching the objectid of a
      root, which doesn't exist.
      
      Steps to reproduce:
      
          mkfs.btrfs -f /dev/sdd
          mount /dev/sdd /mnt
      
          mkdir /mnt/testdir
          btrfs subvolume snapshot -r /mnt /mnt/mysnap1
      
          rmdir /mnt/testdir
          btrfs subvolume create /mnt/testdir
          btrfs subvolume snapshot -r /mnt /mnt/mysnap2
      
          btrfs send -p /mnt/mysnap1 /mnt/mysnap2 -f /tmp/send.data
      
      A test case for xfstests follows.
      Reported-by: default avatarRobert White <rwhite@pobox.com>
      Signed-off-by: default avatarFilipe David Borba Manana <fdmanana@gmail.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      1af56070
    • Chris Mason's avatar
      Btrfs: async delayed refs · a79b7d4b
      Chris Mason authored
      Delayed extent operations are triggered during transaction commits.
      The goal is to queue up a healthy batch of changes to the extent
      allocation tree and run through them in bulk.
      
      This farms them off to async helper threads.  The goal is to have the
      bulk of the delayed operations being done in the background, but this is
      also important to limit our stack footprint.
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      a79b7d4b
    • Chris Mason's avatar
      Btrfs: split up __extent_writepage to lower stack usage · 40f76580
      Chris Mason authored
      __extent_writepage has two unrelated parts.  First it does the delayed
      allocation dance and second it does the mapping and IO for the page
      we're actually writing.
      
      This splits it up into those two parts so the stack from one doesn't
      impact the stack from the other.
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      40f76580
    • Alex Gartrell's avatar
      btrfs: Drop EXTENT_UPTODATE check in hole punching and direct locking · fc4adbff
      Alex Gartrell authored
      In these instances, we are trying to determine if a page has been accessed
      since we began the operation for the sake of retry.  This is easily
      accomplished by doing a gang lookup in the page mapping radix tree, and it
      saves us the dependency on the flag (so that we might eventually delete
      it).
      
      btrfs_page_exists_in_range borrows heavily from find_get_page, replacing
      the radix tree look up with a gang lookup of 1, so that we can find the
      next highest page >= index and see if it falls into our lock range.
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      Signed-off-by: default avatarAlex Gartrell <agartrell@fb.com>
      fc4adbff
    • Chris Mason's avatar
      Btrfs: cut down stack usage in btree_write_cache_pages · 0e378df1
      Chris Mason authored
      This adds noinline_for_stack to two helpers used by
      btree_write_cache_pages.  It shaves us down from 424 bytes on the
      stack to 280.
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      0e378df1
    • Chris Mason's avatar
      Btrfs: break up __btrfs_write_out_cache to cut down stack usage · d4452bc5
      Chris Mason authored
      __btrfs_write_out_cache was one of our stack pigs.  This breaks it
      up into helper functions and slims it down to 194 bytes.
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      d4452bc5
    • Josef Bacik's avatar
      Btrfs: free tmp ulist for qgroup rescan · 2a108409
      Josef Bacik authored
      Memory leaks are bad mmkay?
      Signed-off-by: default avatarJosef Bacik <jbacik@fb.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      2a108409
    • Anand Jain's avatar
      btrfs: usage error should not be logged into system log · 402a0f47
      Anand Jain authored
      In my opinion, the system logs in /var/log/messages are
      valuable for investigating real system issues at
      the data center. People handling data center issues
      spend a lot of time and effort analyzing the messages
      files. Logging usage errors into /var/log/messages
      is something we should avoid.
      Signed-off-by: default avatarAnand Jain <Anand.Jain@oracle.com>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.cz>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      402a0f47
    • David Sterba's avatar
      btrfs: remove newline from inode cache kthread name · 67a77eb1
      David Sterba authored
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.cz>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      67a77eb1
    • David Sterba's avatar
      btrfs: remove stale newlines from log messages · 351fd353
      David Sterba authored
      I've noticed an extra line after "use no compression", but search
      revealed much more in messages of more critical levels and rare errors.
      Signed-off-by: default avatarDavid Sterba <dsterba@suse.cz>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      351fd353
    • Chris Mason's avatar
      Btrfs: fix double free in find_lock_delalloc_range · 7d788742
      Chris Mason authored
      We need to NULL the cached_state after freeing it, otherwise
      we might free it again if find_delalloc_range doesn't find anything.
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      cc: stable@vger.kernel.org
      7d788742
    • ZhangZhen's avatar
      btrfs: replace simple_strtoull() with kstrtoull() · 58dfae63
      ZhangZhen authored
      use the newer and more pleasant kstrtoull() to replace simple_strtoull(),
      because simple_strtoull() is marked for obsoletion.
      Signed-off-by: default avatarZhang Zhen <zhenzhang.zhang@huawei.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      58dfae63
    • Wang Shilong's avatar
      Btrfs: set right total device count for seeding support · 29865841
      Wang Shilong authored
      Seeding device support allows us to create a new filesystem
      based on an existing filesystem.
      
      However, the newly created filesystem's @total_devices should include seed
      devices. This patch fixes the following problem:
      
       # mkfs.btrfs -f /dev/sdb
       # btrfstune -S 1 /dev/sdb
       # mount /dev/sdb /mnt
       # btrfs device add -f /dev/sdc /mnt --->fs_devices->total_devices = 1
       # umount /mnt
       # mount /dev/sdc /mnt               --->fs_devices->total_devices = 2
      
      This is because we record right @total_devices in superblock, but
      @fs_devices->total_devices is reset to be 0 in btrfs_prepare_sprout().
      
      Fix this problem by not resetting @fs_devices->total_devices.
      Signed-off-by: default avatarWang Shilong <wangsl.fnst@cn.fujitsu.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      29865841
    • Guangliang Zhao's avatar
      Btrfs: remove OPT_acl parse when acl disabled · 45ff35d6
      Guangliang Zhao authored
      Even if CONFIG_BTRFS_FS_POSIX_ACL is not defined, acl could still
      be enabled using a mount option. Since fs/btrfs/acl.o is not
      built in that case, the mount option appears to be supported but is
      silently ignored.
      Signed-off-by: default avatarGuangliang Zhao <lucienchao@gmail.com>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.cz>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      45ff35d6
    • Josef Bacik's avatar
      Btrfs: add sanity tests for new qgroup accounting code · faa2dbf0
      Josef Bacik authored
      This exercises the various parts of the new qgroup accounting code.  We do some
      basic stuff and do some things with the shared refs to make sure all that code
      works.  I had to add a bunch of infrastructure because I needed to be able to
      insert items into a fake tree without having to do all the hard work myself,
      hopefully this will be useful in the future.  Thanks,
      Signed-off-by: default avatarJosef Bacik <jbacik@fb.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      faa2dbf0
    • Josef Bacik's avatar
      Btrfs: rework qgroup accounting · fcebe456
      Josef Bacik authored
      Currently qgroups account for space by intercepting delayed ref updates to fs
      trees.  It does this by adding sequence numbers to delayed ref updates so that
      it can figure out how the tree looked before the update so we can adjust the
      counters properly.  The problem with this is that it does not allow delayed refs
      to be merged, so if you say are defragging an extent with 5k snapshots pointing
      to it we will thrash the delayed ref lock because we need to go back and
      manually merge these things together.  Instead we want to process quota changes
      when we know they are going to happen, like when we first allocate an extent, we
      free a reference for an extent, we add new references etc.  This patch
      accomplishes this by only adding qgroup operations for real ref changes.  We
      only modify the sequence number when we need to lookup roots for bytenrs, this
      reduces the amount of churn on the sequence number and allows us to merge
      delayed refs as we add them most of the time.  This patch encompasses a bunch of
      architectural changes
      
      1) qgroup ref operations: instead of tracking qgroup operations through the
      delayed refs we simply add new ref operations whenever we notice that we need to
      when we've modified the refs themselves.
      
      2) tree mod seq:  we no longer have this separation of major/minor counters.
      this makes the sequence number stuff much more sane and we can remove some
      locking that was needed to protect the counter.
      
      3) delayed ref seq: we now read the tree mod seq number and use that as our
      sequence.  This means each new delayed ref doesn't have its own unique sequence
      number, rather whenever we go to lookup backrefs we inc the sequence number so
      we can make sure to keep any new operations from screwing up our world view at
      that given point.  This allows us to merge delayed refs during runtime.
      
      With all of these changes the delayed ref stuff is a little saner and the qgroup
      accounting stuff no longer goes negative in some cases like it was before.
      Thanks,
      Signed-off-by: default avatarJosef Bacik <jbacik@fb.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      fcebe456
    • Liu Bo's avatar
      Btrfs: mark mapping with error flag to report errors to userspace · 5dca6eea
      Liu Bo authored
      According to commit 865ffef3
      (fs: fix fsync() error reporting),
      it's not stable to just check error pages because pages can be
      truncated or invalidated, we should also mark mapping with error
      flag so that a later fsync can catch the error.
      Signed-off-by: default avatarLiu Bo <bo.li.liu@oracle.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      5dca6eea
    • Liu Bo's avatar
      Btrfs: fix NULL pointer crash of deleting a seed device · 29cc83f6
      Liu Bo authored
      Same as normal devices, seed devices should be initialized with
      fs_info->dev_root as well, otherwise we'll get a NULL pointer crash.
      
      Cc: Chris Murphy <lists@colorremedies.com>
      Reported-by: default avatarChris Murphy <lists@colorremedies.com>
      Signed-off-by: default avatarLiu Bo <bo.li.liu@oracle.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      29cc83f6
    • Wang Shilong's avatar
      Btrfs: fix joining same transaction handle more than twice · f017f15f
      Wang Shilong authored
      We hit something like the following function call flows:
      
      |->run_delalloc_range()
       |->btrfs_join_transaction()
         |->cow_file_range()
           |->btrfs_join_transaction()
             |->find_free_extent()
               |->btrfs_join_transaction()
      
      Trace information can be seen as:
      
      [ 7411.127040] ------------[ cut here ]------------
      [ 7411.127060] WARNING: CPU: 0 PID: 11557 at fs/btrfs/transaction.c:383 start_transaction+0x561/0x580 [btrfs]()
      [ 7411.127079] CPU: 0 PID: 11557 Comm: kworker/u8:9 Tainted: G           O 3.13.0+ #4
      [ 7411.127080] Hardware name: LENOVO QiTianM4350/ , BIOS F1KT52AUS 05/24/2013
      [ 7411.127085] Workqueue: writeback bdi_writeback_workfn (flush-btrfs-5)
      [ 7411.127092] Call Trace:
      [ 7411.127097]  [<ffffffff815b87b0>] dump_stack+0x45/0x56
      [ 7411.127101]  [<ffffffff81051ffd>] warn_slowpath_common+0x7d/0xa0
      [ 7411.127102]  [<ffffffff810520da>] warn_slowpath_null+0x1a/0x20
      [ 7411.127109]  [<ffffffffa0444fb1>] start_transaction+0x561/0x580 [btrfs]
      [ 7411.127115]  [<ffffffffa0445027>] btrfs_join_transaction+0x17/0x20 [btrfs]
      [ 7411.127120]  [<ffffffffa0431c91>] find_free_extent+0xa21/0xb50 [btrfs]
      [ 7411.127126]  [<ffffffffa0431f68>] btrfs_reserve_extent+0xa8/0x1a0 [btrfs]
      [ 7411.127131]  [<ffffffffa04322ce>] btrfs_alloc_free_block+0xee/0x440 [btrfs]
      [ 7411.127137]  [<ffffffffa043bd6e>] ? btree_set_page_dirty+0xe/0x10 [btrfs]
      [ 7411.127142]  [<ffffffffa041da51>] __btrfs_cow_block+0x121/0x530 [btrfs]
      [ 7411.127146]  [<ffffffffa041dfff>] btrfs_cow_block+0x11f/0x1c0 [btrfs]
      [ 7411.127151]  [<ffffffffa0421b74>] btrfs_search_slot+0x1d4/0x9c0 [btrfs]
      [ 7411.127157]  [<ffffffffa0438567>] btrfs_lookup_file_extent+0x37/0x40 [btrfs]
      [ 7411.127163]  [<ffffffffa0456bfc>] __btrfs_drop_extents+0x16c/0xd90 [btrfs]
      [ 7411.127169]  [<ffffffffa0444ae3>] ? start_transaction+0x93/0x580 [btrfs]
      [ 7411.127171]  [<ffffffff811663e2>] ? kmem_cache_alloc+0x132/0x140
      [ 7411.127176]  [<ffffffffa041cd9a>] ? btrfs_alloc_path+0x1a/0x20 [btrfs]
      [ 7411.127182]  [<ffffffffa044aa61>] cow_file_range_inline+0x181/0x2e0 [btrfs]
      [ 7411.127187]  [<ffffffffa044aead>] cow_file_range+0x2ed/0x440 [btrfs]
      [ 7411.127194]  [<ffffffffa0464d7f>] ? free_extent_buffer+0x4f/0xb0 [btrfs]
      [ 7411.127200]  [<ffffffffa044b38f>] run_delalloc_nocow+0x38f/0xa60 [btrfs]
      [ 7411.127207]  [<ffffffffa0461600>] ? test_range_bit+0x30/0x180 [btrfs]
      [ 7411.127212]  [<ffffffffa044bd48>] run_delalloc_range+0x2e8/0x350 [btrfs]
      [ 7411.127219]  [<ffffffffa04618f9>] ? find_lock_delalloc_range+0x1a9/0x1e0 [btrfs]
      [ 7411.127222]  [<ffffffff812a1e71>] ? blk_queue_bio+0x2c1/0x330
      [ 7411.127228]  [<ffffffffa0462ad4>] __extent_writepage+0x2f4/0x760 [btrfs]
      
      Here we fix it by avoiding joining transaction again if we have held
      a transaction handle when allocating chunk in find_free_extent().
      Signed-off-by: default avatarWang Shilong <wangsl.fnst@cn.fujitsu.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      f017f15f
    • Miao Xie's avatar
    • Filipe Manana's avatar
      Btrfs: check if items are ordered when a leaf is marked dirty · 1f21ef0a
      Filipe Manana authored
      To ease finding bugs during development related to modifying btree leaves
      in such a way that it makes its items not sorted by key anymore. Since this
      is an expensive check, it's only enabled if CONFIG_BTRFS_FS_CHECK_INTEGRITY
      is set, which isn't meant to be enabled for regular users.
      Signed-off-by: default avatarFilipe David Borba Manana <fdmanana@gmail.com>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.cz>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      1f21ef0a
    • Filipe Manana's avatar
      Btrfs: don't access non-existent key when csum tree is empty · 35045bf2
      Filipe Manana authored
      When the csum tree is empty, our leaf (path->nodes[0]) has a number
      of items equal to 0 and since btrfs_header_nritems() returns an
      unsigned integer (and so is our local nritems variable) the following
      comparison always evaluates to false:
      
           if (path->slots[0] >= nritems - 1) {
      
      As the casting rules lead to:
      
           if ((u32)0 >= (u32)4294967295) {
      
      This makes us access key at slot paths->slots[0] + 1 (1) of the empty leaf
      some lines below:
      
          btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
          if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
              found_key.type != BTRFS_EXTENT_CSUM_KEY) {
      		found_next = 1;
      		goto insert;
          }
      
      So just don't access such non-existent slot and don't set found_next to 1
      when the tree is empty. It's very unlikely we'll get a random key with the
      objectid and type values above, which is where we could go into trouble.
      
      If nritems is 0, just set found_next to 1 anyway as it will make us insert
      a csum item covering our whole extent (or the whole leaf) when the tree is
      empty.
      Signed-off-by: default avatarFilipe David Borba Manana <fdmanana@gmail.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      35045bf2
    • Wang Shilong's avatar
      Btrfs: make sure there are not any read requests before stopping workers · de348ee0
      Wang Shilong authored
      In close_ctree(), after we have stopped all workers, there may still be
      some read requests (for example readahead) to submit, and this *may* trigger
      an oops that a user reported before:
      
      kernel BUG at fs/btrfs/async-thread.c:619!
      
      By hacking the code, I can reproduce this problem with one CPU available.
      We fix this potential problem by invalidating all btree inode pages before
      stopping all workers.
      
      Thanks to Miao for pointing out this problem.
      Signed-off-by: default avatarWang Shilong <wangsl.fnst@cn.fujitsu.com>
      Reviewed-by: default avatarDavid Sterba <dsterba@suse.cz>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      de348ee0
    • Tsutomu Itoh's avatar
      Btrfs: fix possible memory leak in btrfs_create_tree() · 59885b39
      Tsutomu Itoh authored
      In btrfs_create_tree(), if btrfs_insert_root() fails, we should
      free root->commit_root.
      Reported-by: default avatarAlex Lyakas <alex@zadarastorage.com>
      Signed-off-by: default avatarTsutomu Itoh <t-itoh@jp.fujitsu.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      59885b39
    • ZhangZhen's avatar
      btrfs: remove useless ACL check · 776e4aae
      ZhangZhen authored
      posix_acl_xattr_set() already does the check, and it's the only
      way to feed in an ACL from userspace.
      So the check here is useless, remove it.
      Signed-off-by: default avatarzhang zhen <zhenzhang.zhang@huawei.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      776e4aae
    • Anand Jain's avatar
      btrfs: btrfs_rm_device() should zero mirror SB as well · 4d90d28b
      Anand Jain authored
      This fix will ensure all SB copies on the disk are zeroed
      when the disk is intentionally removed. This helps to
      better manage disks in the user land.
      
      This version of patch also merges the Zach patch as below.
      
       btrfs: don't double brelse on device rm
      Signed-off-by: default avatarAnand Jain <anand.jain@oracle.com>
      Signed-off-by: default avatarZach Brown <zab@redhat.com>
      Signed-off-by: default avatarChris Mason <clm@fb.com>
      4d90d28b
    • Miao Xie's avatar
    • Filipe Manana's avatar
      Btrfs: send, fix more issues related to directory renames · f959492f
      Filipe Manana authored
      This is a continuation of the previous changes titled:
      
         Btrfs: fix incremental send's decision to delay a dir move/rename
         Btrfs: part 2, fix incremental send's decision to delay a dir move/rename
      
      There are a few more cases where a directory rename/move must be delayed which were
      previously overlooked. If our immediate ancestor has a lower inode number than
      ours and it doesn't have a delayed rename/move operation associated to it, it
      doesn't mean there isn't any non-direct ancestor of our current inode that needs
      to be renamed/moved before our current inode (i.e. with a higher inode number
      than ours).
      
      So we can't stop the search if our immediate ancestor has a lower inode number than
      ours, we need to navigate the directory hierarchy upwards until we hit the root or:
      
      1) find an ancestor with a higher inode number that was renamed/moved in the send
         root too (or already has a pending rename/move registered);
      2) find an ancestor that is a new directory (higher inode number than ours and
         exists only in the send root).
      
      Reproducer for case 1)
      
          $ mkfs.btrfs -f /dev/sdd
          $ mount /dev/sdd /mnt
      
          $ mkdir -p /mnt/a/b
          $ mkdir -p /mnt/a/c/d
          $ mkdir /mnt/a/b/e
          $ mkdir /mnt/a/c/d/f
          $ mv /mnt/a/b /mnt/a/c/d/2b
          $ mkdir /mnt/a/x
          $ mkdir /mnt/a/y
      
          $ btrfs subvolume snapshot -r /mnt /mnt/snap1
          $ btrfs send /mnt/snap1 -f /tmp/base.send
      
          $ mv /mnt/a/x /mnt/a/y
          $ mv /mnt/a/c/d/2b/e /mnt/a/c/d/2b/2e
          $ mv /mnt/a/c/d /mnt/a/h/2d
          $ mv /mnt/a/c /mnt/a/h/2d/2b/2c
      
          $ btrfs subvolume snapshot -r /mnt /mnt/snap2
          $ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/incremental.send
      
      Simple reproducer for case 2)
      
          $ mkfs.btrfs -f /dev/sdd
          $ mount /dev/sdd /mnt
      
          $ mkdir -p /mnt/a/b
          $ mkdir /mnt/a/c
          $ mv /mnt/a/b /mnt/a/c/b2
          $ mkdir /mnt/a/e
      
          $ btrfs subvolume snapshot -r /mnt /mnt/snap1
          $ btrfs send /mnt/snap1 -f /tmp/base.send
      
          $ mv /mnt/a/c/b2 /mnt/a/e/b3
          $ mkdir /mnt/a/e/b3/f
          $ mkdir /mnt/a/h
          $ mv /mnt/a/c /mnt/a/e/b3/f/c2
          $ mv /mnt/a/e /mnt/a/h/e2
      
          $ btrfs subvolume snapshot -r /mnt /mnt/snap2
          $ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/incremental.send
      
      Another simple reproducer for case 2)
      
          $ mkfs.btrfs -f /dev/sdd
          $ mount /dev/sdd /mnt
      
          $ mkdir -p /mnt/a/b
          $ mkdir /mnt/a/c
          $ mkdir /mnt/a/b/d
          $ mkdir /mnt/a/c/e
      
          $ btrfs subvolume snapshot -r /mnt /mnt/snap1
          $ btrfs send /mnt/snap1 -f /tmp/base.send
      
          $ mkdir /mnt/a/b/d/f
          $ mkdir /mnt/a/b/g
          $ mv /mnt/a/c/e /mnt/a/b/g/e2
          $ mv /mnt/a/c /mnt/a/b/d/f/c2
          $ mv /mnt/a/b/d/f /mnt/a/b/g/e2/f2
      
          $ btrfs subvolume snapshot -r /mnt /mnt/snap2
          $ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/incremental.send
      
      More complex reproducer for case 2)
      
          $ mkfs.btrfs -f /dev/sdd
          $ mount /dev/sdd /mnt
      
          $ mkdir -p /mnt/a/b
          $ mkdir -p /mnt/a/c/d
          $ mkdir /mnt/a/b/e
          $ mkdir /mnt/a/c/d/f
          $ mv /mnt/a/b /mnt/a/c/d/2b
          $ mkdir /mnt/a/x
          $ mkdir /mnt/a/y
      
          $ btrfs subvolume snapshot -r /mnt /mnt/snap1
          $ btrfs send /mnt/snap1 -f /tmp/base.send
      
          $ mv /mnt/a/x /mnt/a/y
          $ mv /mnt/a/c/d/2b/e /mnt/a/c/d/2b/2e
          $ mv /mnt/a/c/d /mnt/a/h/2d
          $ mv /mnt/a/c /mnt/a/h/2d/2b/2c
      
          $ btrfs subvolume snapshot -r /mnt /mnt/snap2
          $ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/incremental.send
      
      For both cases the incremental send would enter an infinite loop when building
      path strings.
      
      While solving these cases, this change also re-implements the code to detect
      when directory moves/renames should be delayed. Instead of dealing with several
      specific cases separately, it now handles all cases generically with a simple
      detection algorithm. If a path loop is detected when applying a delayed
      move/rename, the move/rename is delayed further by registering a new ancestor
      inode as the dependency inode (so our rename happens after that ancestor is
      renamed).
      
      Tests for these cases are being added to xfstests too.
      Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
      Signed-off-by: Chris Mason <clm@fb.com>
      f959492f
    • Filipe Manana's avatar
      Btrfs: send, account for orphan directories when building path strings · c992ec94
      Filipe Manana authored
      If we have directories with a pending move/rename operation, we must take into
      account any orphan directories that got created before executing the pending
      move/rename. Those orphan directories are directories with an inode number higher
      than the current send progress and that don't exist in the parent snapshot, they
      are created before current progress reaches their inode number, with a generated
      name of the form oN-M-I and at the root of the filesystem tree, and later when
      progress matches their inode number, moved/renamed to their final location.
      
      Reproducer:
      
                $ mkfs.btrfs -f /dev/sdd
                $ mount /dev/sdd /mnt
      
                $ mkdir -p /mnt/a/b/c/d
                $ mkdir /mnt/a/b/e
                $ mv /mnt/a/b/c /mnt/a/b/e/CC
                $ mkdir /mnt/a/b/e/CC/d/f
                $ mkdir /mnt/a/g
      
                $ btrfs subvolume snapshot -r /mnt /mnt/snap1
                $ btrfs send /mnt/snap1 -f /tmp/base.send
      
                $ mkdir /mnt/a/g/h
                $ mv /mnt/a/b/e /mnt/a/g/h/EE
                $ mv /mnt/a/g/h/EE/CC/d /mnt/a/g/h/EE/DD
      
                $ btrfs subvolume snapshot -r /mnt /mnt/snap2
                $ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/incremental.send
      
      The second receive command failed with the following error:
      
          ERROR: rename a/b/e/CC/d -> o264-7-0/EE/DD failed. No such file or directory
      
      A test case for xfstests follows soon.
      Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
      Signed-off-by: Chris Mason <clm@fb.com>
      c992ec94
    • Filipe Manana's avatar
      Btrfs: send, avoid unnecessary inode item lookup in the btree · b46ab97b
      Filipe Manana authored
      Regardless of whether the caller is interested or not in knowing the inode's
      generation (dir_gen != NULL), get_first_ref always does a btree lookup to get
      the inode item. Avoid this useless lookup if the dir_gen parameter is NULL (which
      it is in some cases).
      Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
      Signed-off-by: Chris Mason <clm@fb.com>
      b46ab97b
    • Gui Hecheng's avatar
      btrfs: add dev maxs limit for __btrfs_alloc_chunk in kernel space · 23f8f9b7
      Gui Hecheng authored
      For RAID0, 5, 6 and 10:
      For a system chunk, there shouldn't be so many stripes that the resulting
      btrfs_chunk exceeds BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
      For a data/metadata chunk, there shouldn't be so many stripes that the
      resulting btrfs_chunk exceeds a leaf.
      Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
      Signed-off-by: Chris Mason <clm@fb.com>
      23f8f9b7
    • Gui Hecheng's avatar
      btrfs: fix wrong max system array size check in kernel space · 5f43f86e
      Gui Hecheng authored
      For the system chunk array,
      we copy a "disk_key" and a chunk item each time,
      so there should be enough space to hold both of them,
      not only the chunk item.
      Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
      Signed-off-by: Chris Mason <clm@fb.com>
      5f43f86e
    • Qu Wenruo's avatar
      btrfs: Add check to avoid cleanup roots already in fs_info->dead_roots. · 65d33fd7
      Qu Wenruo authored
      Currently btrfs_orphan_cleanup will also clean up roots which are already in
      fs_info->dead_roots without protection.
      This creates a race condition with fs_info->cleaner_kthread.
      
      This patch will use refs in root->root_item to detect roots in
      dead_roots and avoid conflicts.
      Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
      Signed-off-by: Chris Mason <clm@fb.com>
      65d33fd7
    • Miao Xie's avatar
      Btrfs: reclaim the reserved metadata space at background · 21c7e756
      Miao Xie authored
      Before applying this patch, a task had to reclaim the metadata space
      by itself if the metadata space was not enough. And when the task started
      the space reclamation, all the other tasks which wanted to reserve
      metadata space were blocked. In some cases, they would be blocked for
      a long time, which made the performance fluctuate wildly.
      
      So we introduce background metadata space reclamation: when the space
      is about to be exhausted, we insert a reclaim work into the workqueue, and the
      worker of the workqueue helps us to reclaim the reserved space in the
      background. This way, the tasks needn't reclaim the space by themselves in
      most cases, and even if the tasks have to reclaim the space or are blocked
      for the space reclamation, they will get enough space more quickly.
      
      Here is my test result (tested with compilebench):
       Memory:	2GB
       CPU:		2Cores * 1CPU
       Partition:	40GB(SSD)
      
      Test command:
       # compilebench -D <mnt> -m
      
      Without this patch:
       intial create total runs 30 avg 54.36 MB/s (user 0.52s sys 2.44s)
       compile total runs 30 avg 123.72 MB/s (user 0.13s sys 1.17s)
       read compiled tree total runs 3 avg 81.15 MB/s (user 0.74s sys 4.89s)
       delete compiled tree total runs 30 avg 5.32 seconds (user 0.35s sys 4.37s)
      
      With this patch:
       intial create total runs 30 avg 59.80 MB/s (user 0.52s sys 2.53s)
       compile total runs 30 avg 151.44 MB/s (user 0.13s sys 1.11s)
       read compiled tree total runs 3 avg 83.25 MB/s (user 0.76s sys 4.91s)
       delete compiled tree total runs 30 avg 5.29 seconds (user 0.34s sys 4.34s)
      Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
      Signed-off-by: Chris Mason <clm@fb.com>
      21c7e756
    • Miao Xie's avatar
      Btrfs: output warning instead of error when loading free space cache failed · 32d6b47f
      Miao Xie authored
      If we fail to load a free space cache, we can rebuild it from the extent tree,
      so it is not a serious error, and we should not output an error message that
      would make the users uncomfortable. This patch uses a warning message
      instead.
      Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
      Signed-off-by: Chris Mason <clm@fb.com>
      32d6b47f