• Filipe Manana's avatar
    btrfs: improve btree readahead for full send operations · ace75066
    Filipe Manana authored
    Currently a full send operation uses the standard btree readahead when
    iterating over the subvolume/snapshot btree, which, despite bringing good
    performance benefits, could be improved in a few aspects for use cases
    such as full send operations, which are guaranteed to visit every node
    and leaf of a btree, in ascending and sequential order. The limitations
    of that standard btree readahead implementation are the following:
    
    1) It only triggers readahead for leaves that are physically close
       to the leaf being read, within a 64K range;
    
    2) It only triggers readahead for the next or previous leaves if the
       leaf being read is not currently in memory;
    
    3) It never triggers readahead for nodes.
    
    So add a new readahead mode that addresses all these points and use it
    for full send operations.
    
    The following test script was used to measure the improvement on a box
    using an average, consumer grade, spinning disk and with 16GiB of RAM:
    
      $ cat test.sh
      #!/bin/bash
    
      DEV=/dev/sdj
      MNT=/mnt/sdj
      MKFS_OPTIONS="--nodesize 16384"     # default, just to be explicit
      MOUNT_OPTIONS="-o max_inline=2048"  # default, just to be explicit
    
      mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null
      mount $MOUNT_OPTIONS $DEV $MNT
    
      # Create files with inline data to make it easier and faster to create
      # large btrees.
      add_files()
      {
          local total=$1
          local start_offset=$2
          local number_jobs=$3
          local total_per_job=$(($total / $number_jobs))
    
          echo "Creating $total new files using $number_jobs jobs"
          for ((n = 0; n < $number_jobs; n++)); do
              (
                  local start_num=$(($start_offset + $n * $total_per_job))
                  for ((i = 1; i <= $total_per_job; i++)); do
                      local file_num=$((start_num + $i))
                      local file_path="$MNT/file_${file_num}"
                      xfs_io -f -c "pwrite -S 0xab 0 2000" $file_path > /dev/null
                      if [ $? -ne 0 ]; then
                          echo "Failed creating file $file_path"
                          break
                      fi
                  done
              ) &
              worker_pids[$n]=$!
          done
    
          wait ${worker_pids[@]}
    
          sync
          echo
          echo "btree node/leaf count: $(btrfs inspect-internal dump-tree -t 5 $DEV | egrep '^(node|leaf) ' | wc -l)"
      }
    
      initial_file_count=500000
      add_files $initial_file_count 0 4
    
      echo
      echo "Creating first snapshot..."
      btrfs subvolume snapshot -r $MNT $MNT/snap1
    
      echo
      echo "Adding more files..."
      add_files $((initial_file_count / 4)) $initial_file_count 4
    
      echo
      echo "Updating 1/50th of the initial files..."
      for ((i = 1; i < $initial_file_count; i += 50)); do
          xfs_io -c "pwrite -S 0xcd 0 20" $MNT/file_$i > /dev/null
      done
    
      echo
      echo "Creating second snapshot..."
      btrfs subvolume snapshot -r $MNT $MNT/snap2
    
      umount $MNT
    
      echo 3 > /proc/sys/vm/drop_caches
      blockdev --flushbufs $DEV &> /dev/null
      hdparm -F $DEV &> /dev/null
    
      mount $MOUNT_OPTIONS $DEV $MNT
    
      echo
      echo "Testing full send..."
      start=$(date +%s)
      btrfs send $MNT/snap1 > /dev/null
      end=$(date +%s)
      echo
      echo "Full send took $((end - start)) seconds"
    
      umount $MNT
    
    The durations of the full send operation in seconds were the following:
    
    Before this change:  217 seconds
    After this change:   205 seconds (-5.5%)
    Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
    Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
    ace75066
send.c 182 KB