Commit d352ac68 authored by Chris Mason's avatar Chris Mason

Btrfs: add and improve comments

This improves the comments at the top of many functions.  It didn't
dive into the guts of functions because I was trying to
avoid merging problems with the new allocator and back reference work.

extent-tree.c and volumes.c were both skipped, and there is definitely
more work todo in cleaning and commenting the code.
Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
parent 9a5e1ea1
......@@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),)
obj-m := btrfs.o
btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
file-item.o inode-item.o inode-map.o disk-io.o \
transaction.o bit-radix.o inode.o file.o tree-defrag.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
ref-cache.o export.o tree-log.o acl.o free-space-cache.o
......
* cleanup, add more error checking, get rid of BUG_ONs
* Fix ENOSPC handling
* Make allocator smarter
* add a block group to struct inode
* Do actual block accounting
* Check compat and incompat flags on the inode
* Get rid of struct ctree_path, limiting tree levels held at one time
* Add generation number to key pointer in nodes
* Add generation number to inode
* forbid cross subvolume renames and hardlinks
* Release
* Do real tree locking
* Add extent mirroring (backup copies of blocks)
* Add fancy interface to get access to incremental backups
* Add fancy striped extents to make big reads faster
* Use relocation to try and fix write errors
* Make allocator much smarter
* xattrs (directory streams for regular files)
* Scrub & defrag
......@@ -231,17 +231,25 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
/*
* if we pick a busy task, move the task to the end of the list.
* hopefully this will keep things somewhat evenly balanced
* hopefully this will keep things somewhat evenly balanced.
* Do the move in batches based on the sequence number. This groups
* requests submitted at roughly the same time onto the same worker.
*/
next = workers->worker_list.next;
worker = list_entry(next, struct btrfs_worker_thread, worker_list);
atomic_inc(&worker->num_pending);
worker->sequence++;
if (worker->sequence % workers->idle_thresh == 0)
list_move_tail(next, &workers->worker_list);
return worker;
}
/*
* selects a worker thread to take the next job. This will either find
* an idle worker, start a new worker up to the max count, or just return
* one of the existing busy workers.
*/
static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
......
......@@ -63,14 +63,17 @@ struct btrfs_workers {
/* once a worker has this many requests or fewer, it is idle */
int idle_thresh;
/* list with all the work threads */
/* list with all the work threads. The workers on the idle thread
* may be actively servicing jobs, but they haven't yet hit the
* idle thresh limit above.
*/
struct list_head worker_list;
struct list_head idle_list;
/* lock for finding the next worker thread to queue on */
spinlock_t lock;
/* extra name for this worker */
/* extra name for this worker, used for current->name */
char *name;
};
......
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include "bit-radix.h"
#define BIT_ARRAY_BYTES 256
#define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8)
extern struct kmem_cache *btrfs_bit_radix_cachep;
int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
{
unsigned long *bits;
unsigned long slot;
int bit_slot;
int ret;
slot = bit / BIT_RADIX_BITS_PER_ARRAY;
bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
bits = radix_tree_lookup(radix, slot);
if (!bits) {
bits = kmem_cache_alloc(btrfs_bit_radix_cachep, GFP_NOFS);
if (!bits)
return -ENOMEM;
memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long));
bits[0] = slot;
ret = radix_tree_insert(radix, slot, bits);
if (ret)
return ret;
}
ret = test_and_set_bit(bit_slot, bits + 1);
if (ret < 0)
ret = 1;
return ret;
}
int test_radix_bit(struct radix_tree_root *radix, unsigned long bit)
{
unsigned long *bits;
unsigned long slot;
int bit_slot;
slot = bit / BIT_RADIX_BITS_PER_ARRAY;
bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
bits = radix_tree_lookup(radix, slot);
if (!bits)
return 0;
return test_bit(bit_slot, bits + 1);
}
int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
{
unsigned long *bits;
unsigned long slot;
int bit_slot;
int i;
int empty = 1;
slot = bit / BIT_RADIX_BITS_PER_ARRAY;
bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
bits = radix_tree_lookup(radix, slot);
if (!bits)
return 0;
clear_bit(bit_slot, bits + 1);
for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) {
if (bits[i]) {
empty = 0;
break;
}
}
if (empty) {
bits = radix_tree_delete(radix, slot);
BUG_ON(!bits);
kmem_cache_free(btrfs_bit_radix_cachep, bits);
}
return 0;
}
int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
unsigned long start, int nr)
{
unsigned long *bits;
unsigned long *gang[4];
int found;
int ret;
int i;
int total_found = 0;
unsigned long slot;
slot = start / BIT_RADIX_BITS_PER_ARRAY;
ret = radix_tree_gang_lookup(radix, (void **)gang, slot,
ARRAY_SIZE(gang));
found = start % BIT_RADIX_BITS_PER_ARRAY;
for (i = 0; i < ret && nr > 0; i++) {
bits = gang[i];
while(nr > 0) {
found = find_next_bit(bits + 1,
BIT_RADIX_BITS_PER_ARRAY,
found);
if (found < BIT_RADIX_BITS_PER_ARRAY) {
*retbits = bits[0] *
BIT_RADIX_BITS_PER_ARRAY + found;
retbits++;
nr--;
total_found++;
found++;
} else
break;
}
found = 0;
}
return total_found;
}
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef __BIT_RADIX__
#define __BIT_RADIX__
#include <linux/radix-tree.h>
int set_radix_bit(struct radix_tree_root *radix, unsigned long bit);
int test_radix_bit(struct radix_tree_root *radix, unsigned long bit);
int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit);
int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
unsigned long start, int nr);
static inline void init_bit_radix(struct radix_tree_root *radix)
{
INIT_RADIX_TREE(radix, GFP_NOFS);
}
#endif
......@@ -25,27 +25,58 @@
/* in memory btrfs inode */
struct btrfs_inode {
/* which subvolume this inode belongs to */
struct btrfs_root *root;
/* the block group preferred for allocations. This pointer is buggy
* and needs to be replaced with a bytenr instead
*/
struct btrfs_block_group_cache *block_group;
/* key used to find this inode on disk. This is used by the code
* to read in roots of subvolumes
*/
struct btrfs_key location;
/* the extent_tree has caches of all the extent mappings to disk */
struct extent_map_tree extent_tree;
/* the io_tree does range state (DIRTY, LOCKED etc) */
struct extent_io_tree io_tree;
/* special utility tree used to record which mirrors have already been
* tried when checksums fail for a given block
*/
struct extent_io_tree io_failure_tree;
/* held while inserting checksums to avoid races */
struct mutex csum_mutex;
/* held while inesrting or deleting extents from files */
struct mutex extent_mutex;
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
struct inode vfs_inode;
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
/* standard acl pointers */
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
/* for keeping track of orphaned inodes */
struct list_head i_orphan;
/* list of all the delalloc inodes in the FS. There are times we need
* to write all the delalloc pages to disk, and this list is used
* to walk them all.
*/
struct list_head delalloc_inodes;
/* full 64 bit generation number */
/* full 64 bit generation number, struct vfs_inode doesn't have a big
* enough field for this.
*/
u64 generation;
/*
......@@ -57,10 +88,25 @@ struct btrfs_inode {
*/
u64 logged_trans;
/* trans that last made a change that should be fully fsync'd */
/*
* trans that last made a change that should be fully fsync'd. This
* gets reset to zero each time the inode is logged
*/
u64 log_dirty_trans;
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
*/
u64 delalloc_bytes;
/*
* the size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
* because not all the blocks are written yet.
*/
u64 disk_i_size;
/* flags field from the on disk inode */
u32 flags;
/*
......@@ -68,6 +114,8 @@ struct btrfs_inode {
* number for new files that are created
*/
u64 index_cnt;
struct inode vfs_inode;
};
static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
......
/*
* Copyright (C) 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef __BTRFS_CRC32C__
#define __BTRFS_CRC32C__
#include <asm/byteorder.h>
......
This diff is collapsed.
......@@ -27,7 +27,6 @@
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <asm/kmap_types.h>
#include "bit-radix.h"
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
......
......@@ -21,6 +21,14 @@
#include "hash.h"
#include "transaction.h"
/*
* insert a name into a directory, doing overflow properly if there is a hash
* collision. data_size indicates how big the item inserted should be. On
* success a struct btrfs_dir_item pointer is returned, otherwise it is
* an ERR_PTR.
*
* The name is not copied into the dir item, you have to do that yourself.
*/
static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
*trans,
struct btrfs_root *root,
......@@ -55,6 +63,10 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
return (struct btrfs_dir_item *)ptr;
}
/*
* xattrs work a lot like directories, this inserts an xattr item
* into the tree
*/
int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *name,
u16 name_len, const void *data, u16 data_len,
......@@ -109,6 +121,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
return ret;
}
/*
* insert a directory item in the tree, doing all the magic for
* both indexes. 'dir' indicates which objectid to insert it into,
* 'location' is the key to stuff into the directory item, 'type' is the
* type of the inode we're pointing to, and 'index' is the sequence number
* to use for the second index (if one is created).
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
*root, const char *name, int name_len, u64 dir,
struct btrfs_key *location, u8 type, u64 index)
......@@ -184,6 +203,11 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
return 0;
}
/*
* lookup a directory item based on name. 'dir' is the objectid
* we're searching in, and 'mod' tells us if you plan on deleting the
* item (use mod < 0) or changing the options (use mod > 0)
*/
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
......@@ -222,6 +246,14 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
return btrfs_match_dir_item_name(root, path, name, name_len);
}
/*
* lookup a directory item based on index. 'dir' is the objectid
* we're searching in, and 'mod' tells us if you plan on deleting the
* item (use mod < 0) or changing the options (use mod > 0)
*
* The name is used to make sure the index really points to the name you were
* looking for.
*/
struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
......@@ -282,6 +314,11 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
return btrfs_match_dir_item_name(root, path, name, name_len);
}
/*
* helper function to look at the directory item pointed to by 'path'
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
struct btrfs_path *path,
const char *name, int name_len)
......@@ -313,6 +350,10 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
return NULL;
}
/*
* given a pointer into a directory item, delete it. This
* handles items that have more than one entry in them.
*/
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
......
......@@ -55,6 +55,11 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
/*
* end_io_wq structs are used to do processing in task context when an IO is
* complete. This is used during reads to verify checksums, and it is used
* by writes to insert metadata for new file extents after IO is complete.
*/
struct end_io_wq {
struct bio *bio;
bio_end_io_t *end_io;
......@@ -66,6 +71,11 @@ struct end_io_wq {
struct btrfs_work work;
};
/*
* async submit bios are used to offload expensive checksumming
* onto the worker threads. They checksum file and metadata bios
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
struct inode *inode;
struct bio *bio;
......@@ -76,6 +86,10 @@ struct async_submit_bio {
struct btrfs_work work;
};
/*
* extents on the btree inode are pretty simple, there's one extent
* that covers the entire device
*/
struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
size_t page_offset, u64 start, u64 len,
int create)
......@@ -151,6 +165,10 @@ void btrfs_csum_final(u32 crc, char *result)
*(__le32 *)result = ~cpu_to_le32(crc);
}
/*
* compute the csum for a btree block, and either verify it or write it
* into the csum field of the block.
*/
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
int verify)
{
......@@ -204,6 +222,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
return 0;
}
/*
* we can't consider a given block up to date unless the transid of the
* block matches the transid in the parent node's pointer. This is how we
* detect blocks that either didn't get written at all or got written
* in the wrong place.
*/
static int verify_parent_transid(struct extent_io_tree *io_tree,
struct extent_buffer *eb, u64 parent_transid)
{
......@@ -228,9 +252,12 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
GFP_NOFS);
return ret;
}
/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
*/
static int btree_read_extent_buffer_pages(struct btrfs_root *root,
struct extent_buffer *eb,
u64 start, u64 parent_transid)
......@@ -260,6 +287,10 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror
return -EIO;
}
/*
* checksum a dirty tree block before IO. This has extra checks to make
* sure we only fill in the checksum field in the first page of a multi-page block
*/
int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
{
struct extent_io_tree *tree;
......
......@@ -914,6 +914,10 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
}
EXPORT_SYMBOL(wait_on_extent_writeback);
/*
* either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
int err;
......@@ -982,6 +986,13 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
}
EXPORT_SYMBOL(set_range_writeback);
/*
* find the first offset in the io tree with 'bits' set. zero is
* returned if we find something, and *start_ret and *end_ret are
* set to reflect the state struct that was found.
*
* If nothing was found, 1 is returned, < 0 on error
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, int bits)
{
......@@ -1017,6 +1028,10 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
}
EXPORT_SYMBOL(find_first_extent_bit);
/* find the first state struct with 'bits' set after 'start', and
* return it. tree->lock must be held. NULL will returned if
* nothing was found after 'start'
*/
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
u64 start, int bits)
{
......@@ -1046,8 +1061,14 @@ struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
}
EXPORT_SYMBOL(find_first_extent_bit_state);
u64 find_lock_delalloc_range(struct extent_io_tree *tree,
u64 *start, u64 *end, u64 max_bytes)
/*
* find a contiguous range of bytes in the file marked as delalloc, not
* more than 'max_bytes'. start and end are used to return the range,
*
* 1 is returned if we find something, 0 if nothing was in the tree
*/
static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
u64 *start, u64 *end, u64 max_bytes)
{
struct rb_node *node;
struct extent_state *state;
......@@ -1130,6 +1151,11 @@ u64 find_lock_delalloc_range(struct extent_io_tree *tree,
return found;
}
/*
* count the number of bytes in the tree that have a given bit(s)
* set. This can be fairly slow, except for EXTENT_DIRTY which is
* cached. The total number found is returned.
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
unsigned long bits)
......@@ -1245,6 +1271,10 @@ int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
}
EXPORT_SYMBOL(unlock_range);
/*
* set the private field for a given byte offset in the tree. If there isn't
* an extent_state there already, this does nothing.
*/
int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
struct rb_node *node;
......
......@@ -114,6 +114,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
return NULL;
}
/*
* search through the tree for an extent_map with a given offset. If
* it can't be found, try to find some neighboring extents
*/
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
struct rb_node **prev_ret,
struct rb_node **next_ret)
......@@ -160,6 +164,10 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return NULL;
}
/*
* look for an offset in the tree, and if it can't be found, return
* the first offset we can find smaller than 'offset'.
*/
static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
{
struct rb_node *prev;
......@@ -170,6 +178,7 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
return ret;
}
/* check to see if two extent_map structs are adjacent and safe to merge */
static int mergable_maps(struct extent_map *prev, struct extent_map *next)
{
if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
......@@ -250,6 +259,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
}
EXPORT_SYMBOL(add_extent_mapping);
/* simple helper to do math around the end of an extent, handling wrap */
static u64 range_end(u64 start, u64 len)
{
if (start + len < start)
......
......@@ -41,6 +41,9 @@
#include "compat.h"
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
*/
static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
int write_bytes,
struct page **prepared_pages,
......@@ -72,12 +75,19 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
return page_fault ? -EFAULT : 0;
}
/*
* unlocks pages after btrfs_file_write is done with them
*/
static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
{
size_t i;
for (i = 0; i < num_pages; i++) {
if (!pages[i])
break;
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
* clear it here
*/
ClearPageChecked(pages[i]);
unlock_page(pages[i]);
mark_page_accessed(pages[i]);
......@@ -85,6 +95,10 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
}
}
/* this does all the hard work for inserting an inline extent into
* the btree. Any existing inline extent is extended as required to make room,
* otherwise things are inserted as required into the btree
*/
static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 offset, size_t size,
......@@ -228,6 +242,14 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
return err;
}
/*
* after copy_from_user, pages need to be dirtied and we need to make
* sure holes are created between the current EOF and the start of
* any next extents (if required).
*
* this also makes the decision about creating an inline extent vs
* doing real data extents, marking pages dirty and delalloc as required.
*/
static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct file *file,
......@@ -362,6 +384,10 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
return err;
}
/*
* this drops all the extents in the cache that intersect the range
* [start, end]. Existing extents are split as required.
*/
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned)
{
......@@ -536,6 +562,9 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
* If an extent intersects the range but is not entirely inside the range
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
*
* inline_limit is used to tell this code which offsets in the file to keep
* if they contain inline extents.
*/
int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
......@@ -796,7 +825,9 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
}
/*
* this gets pages into the page cache and locks them down
* this gets pages into the page cache and locks them down, it also properly
* waits for data=ordered extents to finish before allowing the pages to be
* modified.
*/
static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
struct page **pages, size_t num_pages,
......@@ -1034,6 +1065,17 @@ int btrfs_release_file(struct inode * inode, struct file * filp)
return 0;
}
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
*
* It needs to call filemap_fdatawait so that all ordered extent updates are
* in the metadata btree are up to date for copying to the log.
*
* It drops the inode mutex before doing the tree log commit. This is an
* important optimization for directories because holding the mutex prevents
* new operations on the dir while we write to disk.
*/
int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
......
This diff is collapsed.
......@@ -25,6 +25,15 @@
#include "extent_io.h"
#include "locking.h"
/*
* locks the per buffer mutex in an extent buffer. This uses adaptive locks
* and the spin is not tuned very extensively. The spinning does make a big
* difference in almost every workload, but spinning for the right amount of
* time needs some help.
*
* In general, we want to spin as long as the lock holder is doing btree searches,
* and we should give up if they are in more expensive code.
*/
int btrfs_tree_lock(struct extent_buffer *eb)
{
int i;
......@@ -57,6 +66,10 @@ int btrfs_tree_locked(struct extent_buffer *eb)
return mutex_is_locked(&eb->mutex);
}
/*
* btrfs_search_slot uses this to decide if it should drop its locks
* before doing something expensive like allocating free blocks for cow.
*/
int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
{
int i;
......
......@@ -26,7 +26,6 @@
#include "btrfs_inode.h"
#include "extent_io.h"
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
if (entry->file_offset + entry->len < entry->file_offset)
......@@ -34,6 +33,9 @@ static u64 entry_end(struct btrfs_ordered_extent *entry)
return entry->file_offset + entry->len;
}
/* returns NULL if the insertion worked, or it returns the node it did find
* in the tree
*/
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
struct rb_node *node)
{
......@@ -58,6 +60,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
return NULL;
}
/*
* look for a given offset in the tree, and if it can't be found return the
* first lesser offset
*/
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
struct rb_node **prev_ret)
{
......@@ -108,6 +114,9 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
return NULL;
}
/*
* helper to check if a given offset is inside a given entry
*/
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
if (file_offset < entry->file_offset ||
......@@ -116,6 +125,10 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
return 1;
}
/*
* look find the first ordered struct that has this offset, otherwise
* the first one less than this offset
*/
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
u64 file_offset)
{
......@@ -305,6 +318,10 @@ int btrfs_remove_ordered_extent(struct inode *inode,
return 0;
}
/*
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
{
struct list_head splice;
......
......@@ -21,6 +21,16 @@
#include "ref-cache.h"
#include "transaction.h"
/*
* leaf refs are used to cache the information about which extents
* a given leaf has references on. This allows us to process that leaf
* in btrfs_drop_snapshot without needing to read it back from disk.
*/
/*
* kmalloc a leaf reference struct and update the counters for the
* total ref cache size
*/
struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
int nr_extents)
{
......@@ -40,6 +50,10 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
return ref;
}
/*
* free a leaf reference struct and update the counters for the
* total ref cache size
*/
void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
{
if (!ref)
......@@ -135,6 +149,10 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
return 0;
}
/*
* find the leaf ref for a given extent. This returns the ref struct with
* a usage reference incremented
*/
struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
u64 bytenr)
{
......@@ -160,6 +178,10 @@ struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
return NULL;
}
/*
* add a fully filled in leaf ref struct
* remove all the refs older than a given root generation
*/
int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
int shared)
{
......@@ -184,6 +206,10 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
return ret;
}
/*
* remove a single leaf ref from the tree. This drops the ref held by the tree
* only
*/
int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
{
struct btrfs_leaf_ref_tree *tree;
......
......@@ -19,8 +19,11 @@
#define __REFCACHE__
struct btrfs_extent_info {
/* bytenr and num_bytes find the extent in the extent allocation tree */
u64 bytenr;
u64 num_bytes;
/* objectid and offset find the back reference for the file */
u64 objectid;
u64 offset;
};
......
......@@ -22,8 +22,10 @@
#include "print-tree.h"
/*
* returns 0 on finding something, 1 if no more roots are there
* and < 0 on error
* search forward for a root, starting with objectid 'search_start'
* if a root key is found, the objectid we find is filled into 'found_objectid'
* and 0 is returned. < 0 is returned on error, 1 if there is nothing
* left in the tree.
*/
int btrfs_search_root(struct btrfs_root *root, u64 search_start,
u64 *found_objectid)
......@@ -66,6 +68,11 @@ int btrfs_search_root(struct btrfs_root *root, u64 search_start,
return ret;
}
/*
* lookup the root with the highest offset for a given objectid. The key we do
* find is copied into 'key'. If we find something return 0, otherwise 1, < 0
* on error.
*/
int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
struct btrfs_root_item *item, struct btrfs_key *key)
{
......@@ -104,6 +111,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
return ret;
}
/*
* copy the data in 'item' into the btree
*/
int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_root_item
*item)
......@@ -147,6 +157,12 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
return ret;
}
/*
* at mount time we want to find all the old transaction snapshots that were in
* the process of being deleted if we crashed. This is any root item with an offset
* lower than the latest root. They need to be queued for deletion to finish
* what was happening when we crashed.
*/
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
struct btrfs_root *latest)
{
......@@ -227,6 +243,7 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
return ret;
}
/* drop the root item for 'key' from 'root' */
int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_key *key)
{
......
......@@ -17,6 +17,27 @@
*/
#include <linux/highmem.h>
/* this is some deeply nasty code. ctree.h has a different
* definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
*
* The end result is that anyone who #includes ctree.h gets a
* declaration for the btrfs_set_foo functions and btrfs_foo functions
*
* This file declares the macros and then #includes ctree.h, which results
* in cpp creating the function here based on the template below.
*
* These setget functions do all the extent_buffer related mapping
* required to efficiently read and write specific fields in the extent
* buffers. Every pointer to metadata items in btrfs is really just
* an unsigned long offset into the extent buffer which has been
* cast to a specific type. This gives us all the gcc type checking.
*
* The extent buffer api is used to do all the kmapping and page
* spanning work required to get extent buffers in highmem and have
* a metadata blocksize different from the page size.
*/
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
u##bits btrfs_##name(struct extent_buffer *eb, \
type *s) \
......
......@@ -519,6 +519,9 @@ static struct file_system_type btrfs_fs_type = {
.fs_flags = FS_REQUIRES_DEV,
};
/*
* used by btrfsctl to scan devices when no FS is mounted
*/
static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
......
......@@ -46,6 +46,9 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
}
}
/*
* either allocate a new transaction or hop into the existing one
*/
static noinline int join_transaction(struct btrfs_root *root)
{
struct btrfs_transaction *cur_trans;
......@@ -85,6 +88,12 @@ static noinline int join_transaction(struct btrfs_root *root)
return 0;
}
/*
* this does all the record keeping required to make sure that a
* reference counted root is properly recorded in a given transaction.
* This is required to make sure the old root from before we joined the transaction
* is deleted when the transaction commits
*/
noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
{
struct btrfs_dirty_root *dirty;
......@@ -127,6 +136,10 @@ noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
return 0;
}
/* wait for commit against the current transaction to become unblocked
* when this is done, it is safe to start a new transaction, but the current
* transaction might not be fully on disk.
*/
static void wait_current_trans(struct btrfs_root *root)
{
struct btrfs_transaction *cur_trans;
......@@ -198,7 +211,7 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
return start_transaction(r, num_blocks, 2);
}
/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
struct btrfs_transaction *commit)
{
......@@ -218,6 +231,10 @@ static noinline int wait_for_commit(struct btrfs_root *root,
return 0;
}
/*
* rate limit against the drop_snapshot code. This helps to slow down new operations
* if the drop_snapshot code isn't able to keep up.
*/
static void throttle_on_drops(struct btrfs_root *root)
{
struct btrfs_fs_info *info = root->fs_info;
......@@ -302,7 +319,11 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
return __btrfs_end_transaction(trans, root, 1);
}
/*
* when btree blocks are allocated, they have some corresponding bits set for
* them in one of two extent_io trees. This is used to make sure all of
* those extents are on disk for transaction or log commit
*/
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages)
{
......@@ -393,6 +414,16 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
&trans->transaction->dirty_pages);
}
/*
* this is used to update the root pointer in the tree of tree roots.
*
* But, in the case of the extent allocation tree, updating the root
* pointer may allocate blocks which may change the root of the extent
* allocation tree.
*
* So, this loops and repeats and makes sure the cowonly root didn't
* change while the root pointer was being updated in the metadata.
*/
static int update_cowonly_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
......@@ -418,6 +449,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
return 0;
}
/*
* update all the cowonly tree roots on disk
*/
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
......@@ -433,6 +467,11 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
return 0;
}
/*
* dead roots are old snapshots that need to be deleted. This allocates
* a dirty root struct and adds it into the list of dead roots that need to
* be deleted
*/
int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
{
struct btrfs_dirty_root *dirty;
......@@ -449,6 +488,12 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
return 0;
}
/*
* at transaction commit time we need to schedule the old roots for
* deletion via btrfs_drop_snapshot. This runs through all the
* reference counted roots that were modified in the current
* transaction and puts them into the drop list
*/
static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
struct radix_tree_root *radix,
struct list_head *list)
......@@ -541,6 +586,10 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
return err;
}
/*
* defrag a given btree. If cacheonly == 1, this won't read from the disk,
* otherwise every leaf in the btree is read and defragged.
*/
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
struct btrfs_fs_info *info = root->fs_info;
......@@ -570,6 +619,10 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
return 0;
}
/*
* Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
* all of them
*/
static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
struct list_head *list)
{
......@@ -664,6 +717,10 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
return ret;
}
/*
* new snapshots need to be created at a very specific time in the
* transaction commit. This does the actual creation
*/
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
struct btrfs_pending_snapshot *pending)
......@@ -734,6 +791,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
return ret;
}
/*
* create all the snapshots we've scheduled for creation
*/
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
......@@ -944,6 +1004,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
/*
* interface function to delete all the snapshots we have scheduled for deletion
*/
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
struct list_head dirty_roots;
......
......@@ -23,6 +23,10 @@
#include "transaction.h"
#include "locking.h"
/* defrag all the leaves in a given btree. If cache_only == 1, don't read things
* from disk, otherwise read all the leaves and try to get key order to
* better reflect disk order
*/
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int cache_only)
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment