Commit 382c958e authored by Mingming Cao's avatar Mingming Cao Committed by Linus Torvalds

[PATCH] ext3 block reservations

rbtree implementation and other changes From: Stephen Tweedie <sct@redhat.com>
contributions From: Badari Pulavarty <pbadari@us.ibm.com> and probably me.

This is the ext3 block reservation patch.  It improves the layout of ext3
files by establishing, for each inode, reserved areas of the disk in which
only that file can allocate blocks.  Those reserved areas are managed in an
rbtree, via the in-core inode.

It's a bit like ext2 preallocation only stronger in that it can span
already-allocated blocks, including the per-blockgroup inode tables and
bitmaps.

The patch fixes ext3's worst performance problem: disastrous layout when
multiple files are being concurrently grown.

It increases the size of the inode by rather a lot.  A todo item is to
dynamically allocate the `struct reserve_window_node', so we don't need to
carry this storage for inodes which aren't opened for writing.

The feature is enabled by mounting with the "reservation" mount option. 
Reservations default to "off".
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent 9a995d44
This diff is collapsed.
......@@ -33,6 +33,10 @@
*/
static int ext3_release_file (struct inode * inode, struct file * filp)
{
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1))
ext3_discard_reservation(inode);
if (is_dx(inode) && filp->private_data)
ext3_htree_free_dir_info(filp->private_data);
......
......@@ -582,6 +582,11 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
ei->i_file_acl = 0;
ei->i_dir_acl = 0;
ei->i_dtime = 0;
ei->i_rsv_window.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
atomic_set(&ei->i_rsv_window.rsv_alloc_hit, 0);
seqlock_init(&ei->i_rsv_window.rsv_seqlock);
ei->i_block_group = group;
ext3_set_inode_flags(inode);
......
......@@ -178,17 +178,6 @@ static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
return ext3_journal_restart(handle, blocks_for_truncate(inode));
}
/*
* Called at each iput()
*
* The inode may be "bad" if ext3_read_inode() saw an error from
* ext3_get_inode(), so we need to check that to avoid freeing random disk
* blocks.
*/
void ext3_put_inode(struct inode *inode)
{
}
/*
* Called at the last iput() if i_nlink is zero.
*/
......@@ -2086,6 +2075,8 @@ void ext3_truncate(struct inode * inode)
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;
ext3_discard_reservation(inode);
/*
* We have to lock the EOF page here, because lock_page() nests
* outside journal_start().
......@@ -2418,6 +2409,8 @@ void ext3_read_inode(struct inode * inode)
ei->i_acl = EXT3_ACL_NOT_CACHED;
ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
if (ext3_get_inode_loc(inode, &iloc, 0))
goto bad_inode;
bh = iloc.bh;
......@@ -2478,7 +2471,10 @@ void ext3_read_inode(struct inode * inode)
ei->i_disksize = inode->i_size;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
ei->i_rsv_window.rsv_start = 0;
ei->i_rsv_window.rsv_end= 0;
atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
seqlock_init(&ei->i_rsv_window.rsv_seqlock);
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
......
......@@ -20,6 +20,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
{
struct ext3_inode_info *ei = EXT3_I(inode);
unsigned int flags;
unsigned short rsv_window_size;
ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
......@@ -151,6 +152,29 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
return ret;
}
#endif
case EXT3_IOC_GETRSVSZ:
if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode)) {
rsv_window_size = atomic_read(&ei->i_rsv_window.rsv_goal_size);
return put_user(rsv_window_size, (int *)arg);
}
return -ENOTTY;
case EXT3_IOC_SETRSVSZ:
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
if (IS_RDONLY(inode))
return -EROFS;
if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
return -EACCES;
if (get_user(rsv_window_size, (int *)arg))
return -EFAULT;
if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
atomic_set(&ei->i_rsv_window.rsv_goal_size, rsv_window_size);
return 0;
default:
return -ENOTTY;
}
......
......@@ -449,6 +449,7 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
ei->i_acl = EXT3_ACL_NOT_CACHED;
ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
ei->vfs_inode.i_version = 1;
return &ei->vfs_inode;
}
......@@ -490,10 +491,9 @@ static void destroy_inodecache(void)
printk(KERN_INFO "ext3_inode_cache: not all structures were freed\n");
}
#ifdef CONFIG_EXT3_FS_POSIX_ACL
static void ext3_clear_inode(struct inode *inode)
{
#ifdef CONFIG_EXT3_FS_POSIX_ACL
if (EXT3_I(inode)->i_acl &&
EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
posix_acl_release(EXT3_I(inode)->i_acl);
......@@ -504,11 +504,9 @@ static void ext3_clear_inode(struct inode *inode)
posix_acl_release(EXT3_I(inode)->i_default_acl);
EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
}
}
#else
# define ext3_clear_inode NULL
#endif
ext3_discard_reservation(inode);
}
#ifdef CONFIG_QUOTA
......@@ -558,7 +556,6 @@ static struct super_operations ext3_sops = {
.read_inode = ext3_read_inode,
.write_inode = ext3_write_inode,
.dirty_inode = ext3_dirty_inode,
.put_inode = ext3_put_inode,
.delete_inode = ext3_delete_inode,
.put_super = ext3_put_super,
.write_super = ext3_write_super,
......@@ -579,7 +576,8 @@ enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload,
Opt_commit, Opt_journal_update, Opt_journal_inum,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
......@@ -611,6 +609,8 @@ static match_table_t tokens = {
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_reservation, "reservation"},
{Opt_noreservation, "noreservation"},
{Opt_noload, "noload"},
{Opt_commit, "commit=%u"},
{Opt_journal_update, "journal=update"},
......@@ -765,6 +765,12 @@ static int parse_options (char * options, struct super_block *sb,
printk("EXT3 (no)acl options not supported\n");
break;
#endif
case Opt_reservation:
set_opt(sbi->s_mount_opt, RESERVATION);
break;
case Opt_noreservation:
clear_opt(sbi->s_mount_opt, RESERVATION);
break;
case Opt_journal_update:
/* @@@ FIXME */
/* Eventually we will want to be able to create
......@@ -1469,6 +1475,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
/* per fileystem reservation list head & lock */
spin_lock_init(&sbi->s_rsv_window_lock);
sbi->s_rsv_window_root = RB_ROOT;
/* Add a single, static dummy reservation to the start of the
* reservation window list --- it gives us a placeholder for
* append-at-start-of-list which makes the allocation logic
* _much_ simpler. */
sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
atomic_set(&sbi->s_rsv_window_head.rsv_alloc_hit, 0);
atomic_set(&sbi->s_rsv_window_head.rsv_goal_size, 0);
rsv_window_add(sb, &sbi->s_rsv_window_head);
/*
* set up enough so that it can read an inode
*/
......
......@@ -32,6 +32,12 @@ struct statfs;
*/
#undef EXT3FS_DEBUG
/*
* Define EXT3_RESERVATION to reserve data blocks for expanding files
*/
#define EXT3_DEFAULT_RESERVE_BLOCKS 8
#define EXT3_MAX_RESERVE_BLOCKS 1024
#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
/*
* Always enable hashed directories
*/
......@@ -202,6 +208,8 @@ struct ext3_group_desc
#ifdef CONFIG_JBD_DEBUG
#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
#endif
#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
/*
* Structure of an inode on the disk
......@@ -300,25 +308,26 @@ struct ext3_inode {
/*
* Mount flags
*/
#define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */
#define EXT3_MOUNT_OLDALLOC 0x0002 /* Don't use the new Orlov allocator */
#define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */
#define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */
#define EXT3_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */
#define EXT3_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */
#define EXT3_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */
#define EXT3_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */
#define EXT3_MOUNT_NOLOAD 0x0100 /* Don't use existing journal*/
#define EXT3_MOUNT_ABORT 0x0200 /* Fatal error detected */
#define EXT3_MOUNT_DATA_FLAGS 0x0C00 /* Mode for data writes: */
#define EXT3_MOUNT_JOURNAL_DATA 0x0400 /* Write data to journal */
#define EXT3_MOUNT_ORDERED_DATA 0x0800 /* Flush data before commit */
#define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */
#define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */
#define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
#define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */
#define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */
#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */
#define EXT3_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */
#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */
#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
......@@ -684,6 +693,7 @@ extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
unsigned int block_group,
struct buffer_head ** bh);
extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
extern void rsv_window_add(struct super_block *sb, struct reserve_window_node *rsv);
/* dir.c */
extern int ext3_check_dir_entry(const char *, struct inode *,
......@@ -722,7 +732,7 @@ extern int ext3_setattr (struct dentry *, struct iattr *);
extern void ext3_put_inode (struct inode *);
extern void ext3_delete_inode (struct inode *);
extern int ext3_sync_inode (handle_t *, struct inode *);
extern void ext3_discard_prealloc (struct inode *);
extern void ext3_discard_reservation (struct inode *);
extern void ext3_dirty_inode(struct inode *);
extern int ext3_change_inode_journal_flag(struct inode *, int);
extern void ext3_truncate (struct inode *);
......
......@@ -17,9 +17,27 @@
#define _LINUX_EXT3_FS_I
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
struct reserve_window {
__u32 _rsv_start; /* First byte reserved */
__u32 _rsv_end; /* Last byte reserved or 0 */
};
struct reserve_window_node {
struct rb_node rsv_node;
atomic_t rsv_goal_size;
atomic_t rsv_alloc_hit;
seqlock_t rsv_seqlock;
struct reserve_window rsv_window;
};
#define rsv_start rsv_window._rsv_start
#define rsv_end rsv_window._rsv_end
/*
* second extended file system inode data in memory
* third extended file system inode data in memory
*/
struct ext3_inode_info {
__le32 i_data[15]; /* unconverted */
......@@ -57,6 +75,9 @@ struct ext3_inode_info {
* allocation when we detect linearly ascending requests.
*/
__u32 i_next_alloc_goal;
/* block reservation window */
struct reserve_window_node i_rsv_window;
__u32 i_dir_start_lookup;
#ifdef CONFIG_EXT3_FS_XATTR
/*
......
......@@ -22,6 +22,7 @@
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#endif
#include <linux/rbtree.h>
/*
* third extended-fs super-block data in memory
......@@ -59,6 +60,11 @@ struct ext3_sb_info {
struct percpu_counter s_dirs_counter;
struct blockgroup_lock s_blockgroup_lock;
/* root of the per fs reservation window tree */
spinlock_t s_rsv_window_lock;
struct rb_root s_rsv_window_root;
struct reserve_window_node s_rsv_window_head;
/* Journaling */
struct inode * s_journal_inode;
struct journal_s * s_journal;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment