Commit c12b9866 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] ext3: concurrent block/inode allocation

From: Alex Tomas <bzzz@tmi.comex.ru>


This patch weans ext3 off lock_super()-based protection for the inode and
block allocators.

It's basically the same as the ext2 changes.


1) each group has its own spinlock, which is used for group counter
   modifications

2) sb->s_free_blocks_count isn't used any more.  ext2_statfs() and
   find_group_orlov() loop over groups to count free blocks

3) sb->s_free_blocks_count is recalculated at mount/umount/sync_super time
   in order to check consistency and to avoid fsck warnings

4) reserved blocks are distributed over the last groups

5) ext3_new_block() tries to use non-reserved blocks and if it fails then
   tries to use reserved blocks

6) ext3_new_block() and ext3_free_blocks() do not modify sb->s_free_blocks,
   therefore they do not call mark_buffer_dirty() for the superblock's
   buffer_head.  This should reduce I/O a bit


Also fix orlov allocator boundary case:

In the interests of SMP scalability the ext2 free blocks and free inodes
counters are "approximate".  But there is a piece of code in the Orlov
allocator which fails due to boundary conditions on really small
filesystems.

Fix that up via a final allocation pass which simply uses first-fit for
allocation of a directory inode.
parent 78f2f471
This diff is collapsed.
......@@ -131,7 +131,6 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
/* Do this BEFORE marking the inode not in use or returning an error */
clear_inode (inode);
lock_super (sb);
es = EXT3_SB(sb)->s_es;
if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
ext3_error (sb, "ext3_free_inode",
......@@ -150,7 +149,8 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
goto error_return;
/* Ok, now we can actually update the inode bitmaps.. */
if (!ext3_clear_bit(bit, bitmap_bh->b_data))
if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
bit, bitmap_bh->b_data))
ext3_error (sb, "ext3_free_inode",
"bit already cleared for inode %lu", ino);
else {
......@@ -160,28 +160,18 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
fatal = ext3_journal_get_write_access(handle, bh2);
if (fatal) goto error_return;
BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get write access");
fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
if (fatal) goto error_return;
if (gdp) {
spin_lock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock);
gdp->bg_free_inodes_count = cpu_to_le16(
le16_to_cpu(gdp->bg_free_inodes_count) + 1);
if (is_directory) {
if (is_directory)
gdp->bg_used_dirs_count = cpu_to_le16(
le16_to_cpu(gdp->bg_used_dirs_count) - 1);
EXT3_SB(sb)->s_dir_count--;
}
spin_unlock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock);
}
BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bh2);
if (!fatal) fatal = err;
es->s_free_inodes_count =
cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
BUFFER_TRACE(EXT3_SB(sb)->s_sbh,
"call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
if (!fatal) fatal = err;
}
BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bitmap_bh);
......@@ -191,7 +181,6 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
error_return:
brelse(bitmap_bh);
ext3_std_error(sb, fatal);
unlock_super(sb);
}
/*
......@@ -206,9 +195,8 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
*/
static int find_group_dir(struct super_block *sb, struct inode *parent)
{
struct ext3_super_block * es = EXT3_SB(sb)->s_es;
int ngroups = EXT3_SB(sb)->s_groups_count;
int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
int avefreei = ext3_count_free_inodes(sb) / ngroups;
struct ext3_group_desc *desc, *best_desc = NULL;
struct buffer_head *bh;
int group, best_group = -1;
......@@ -264,10 +252,12 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
struct ext3_super_block *es = sbi->s_es;
int ngroups = sbi->s_groups_count;
int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
int avefreeb = le32_to_cpu(es->s_free_blocks_count) / ngroups;
int freei = ext3_count_free_inodes(sb);
int avefreei = freei / ngroups;
int freeb = ext3_count_free_blocks(sb);
int avefreeb = freeb / ngroups;
int blocks_per_dir;
int ndirs = sbi->s_dir_count;
int ndirs = ext3_count_dirs(sb);
int max_debt, max_dirs, min_blocks, min_inodes;
int group = -1, i;
struct ext3_group_desc *desc;
......@@ -319,7 +309,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
desc = ext3_get_group_desc (sb, group, &bh);
if (!desc || !desc->bg_free_inodes_count)
continue;
if (sbi->s_debts[group] >= max_debt)
if (sbi->s_bgi[group].bg_debts >= max_debt)
continue;
if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
continue;
......@@ -340,6 +330,15 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
return group;
}
if (avefreei) {
/*
* The free-inodes counter is approximate, and for really small
* filesystems the above test can fail to find any blockgroups
*/
avefreei = 0;
goto fallback;
}
return -1;
}
......@@ -435,7 +434,6 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
return ERR_PTR(-ENOMEM);
ei = EXT3_I(inode);
lock_super (sb);
es = EXT3_SB(sb)->s_es;
repeat:
if (S_ISDIR(mode)) {
......@@ -464,11 +462,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
err = ext3_journal_get_write_access(handle, bitmap_bh);
if (err) goto fail;
if (ext3_set_bit(ino, bitmap_bh->b_data)) {
ext3_error (sb, "ext3_new_inode",
"bit already set for inode %lu", ino);
if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
ino, bitmap_bh->b_data))
goto repeat;
}
BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bitmap_bh);
if (err) goto fail;
......@@ -504,26 +500,19 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
BUFFER_TRACE(bh2, "get_write_access");
err = ext3_journal_get_write_access(handle, bh2);
if (err) goto fail;
spin_lock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock);
gdp->bg_free_inodes_count =
cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
if (S_ISDIR(mode)) {
gdp->bg_used_dirs_count =
cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
EXT3_SB(sb)->s_dir_count++;
}
spin_unlock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock);
BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bh2);
if (err) goto fail;
BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
if (err) goto fail;
es->s_free_inodes_count =
cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
sb->s_dirt = 1;
if (err) goto fail;
inode->i_uid = current->fsuid;
if (test_opt (sb, GRPID))
......@@ -576,7 +565,6 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
ei->i_state = EXT3_STATE_NEW;
unlock_super(sb);
ret = inode;
if(DQUOT_ALLOC_INODE(inode)) {
DQUOT_DROP(inode);
......@@ -600,7 +588,6 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
fail:
ext3_std_error(sb, err);
out:
unlock_super(sb);
iput(inode);
ret = ERR_PTR(err);
really_out:
......@@ -673,12 +660,13 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
unsigned long ext3_count_free_inodes (struct super_block * sb)
{
unsigned long desc_count;
struct ext3_group_desc *gdp;
int i;
#ifdef EXT3FS_DEBUG
struct ext3_super_block *es;
unsigned long desc_count, bitmap_count, x;
struct ext3_group_desc *gdp;
unsigned long bitmap_count, x;
struct buffer_head *bitmap_bh = NULL;
int i;
lock_super (sb);
es = EXT3_SB(sb)->s_es;
......@@ -706,7 +694,14 @@ unsigned long ext3_count_free_inodes (struct super_block * sb)
unlock_super(sb);
return desc_count;
#else
return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count);
desc_count = 0;
for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
gdp = ext3_get_group_desc (sb, i, NULL);
if (!gdp)
continue;
desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
}
return desc_count;
#endif
}
......
......@@ -460,7 +460,7 @@ void ext3_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
kfree(sbi->s_debts);
kfree(sbi->s_bgi);
brelse(sbi->s_sbh);
/* Debugging code just in case the in-memory inode orphan list
......@@ -901,6 +901,8 @@ static int ext3_check_descriptors (struct super_block * sb)
struct ext3_sb_info *sbi = EXT3_SB(sb);
unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
struct ext3_group_desc * gdp = NULL;
unsigned long total_free;
unsigned int reserved = le32_to_cpu(sbi->s_es->s_r_blocks_count);
int desc_block = 0;
int i;
......@@ -947,6 +949,43 @@ static int ext3_check_descriptors (struct super_block * sb)
block += EXT3_BLOCKS_PER_GROUP(sb);
gdp++;
}
total_free = ext3_count_free_blocks(sb);
if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count)) {
printk("EXT3-fs: invalid s_free_blocks_count %u (real %lu)\n",
le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count),
total_free);
EXT3_SB(sb)->s_es->s_free_blocks_count = cpu_to_le32(total_free);
}
/* distribute reserved blocks over groups -bzzz */
for(i = sbi->s_groups_count - 1; reserved && total_free && i >= 0; i--) {
int free;
gdp = ext3_get_group_desc (sb, i, NULL);
if (!gdp) {
ext3_error (sb, "ext3_check_descriptors",
"cant get descriptor for group %d", i);
return 0;
}
free = le16_to_cpu(gdp->bg_free_blocks_count);
if (free > reserved)
free = reserved;
sbi->s_bgi[i].bg_reserved = free;
reserved -= free;
total_free -= free;
}
total_free = ext3_count_free_inodes(sb);
if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count)) {
printk("EXT3-fs: invalid s_free_inodes_count %u (real %lu)\n",
le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count),
total_free);
EXT3_SB(sb)->s_es->s_free_inodes_count = cpu_to_le32(total_free);
}
return 1;
}
......@@ -1307,13 +1346,17 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
printk (KERN_ERR "EXT3-fs: not enough memory\n");
goto failed_mount;
}
sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
sbi->s_bgi = kmalloc(sbi->s_groups_count * sizeof(struct ext3_bg_info),
GFP_KERNEL);
if (!sbi->s_debts) {
printk ("EXT3-fs: not enough memory\n");
if (!sbi->s_bgi) {
printk("EXT3-fs: not enough memory to allocate s_bgi\n");
goto failed_mount2;
}
memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts));
memset(sbi->s_bgi, 0, sbi->s_groups_count * sizeof(struct ext3_bg_info));
for (i = 0; i < sbi->s_groups_count; i++) {
spin_lock_init(&sbi->s_bgi[i].bg_balloc_lock);
spin_lock_init(&sbi->s_bgi[i].bg_ialloc_lock);
}
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
......@@ -1329,7 +1372,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount2;
}
sbi->s_gdb_count = db_count;
sbi->s_dir_count = ext3_count_dirs(sb);
/*
* set up enough so that it can read an inode
*/
......@@ -1432,8 +1474,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
failed_mount3:
journal_destroy(sbi->s_journal);
failed_mount2:
if (sbi->s_debts)
kfree(sbi->s_debts);
kfree(sbi->s_bgi);
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
......@@ -1702,6 +1743,8 @@ static void ext3_commit_super (struct super_block * sb,
if (!sbh)
return;
es->s_wtime = cpu_to_le32(get_seconds());
es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
BUFFER_TRACE(sbh, "marking dirty");
mark_buffer_dirty(sbh);
if (sync)
......
......@@ -48,9 +48,7 @@ EXPORT_SYMBOL(journal_get_create_access);
EXPORT_SYMBOL(journal_get_undo_access);
EXPORT_SYMBOL(journal_dirty_data);
EXPORT_SYMBOL(journal_dirty_metadata);
#if 0
EXPORT_SYMBOL(journal_release_buffer);
#endif
EXPORT_SYMBOL(journal_forget);
#if 0
EXPORT_SYMBOL(journal_sync_buffer);
......
......@@ -1106,7 +1106,6 @@ int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
return 0;
}
#if 0
/*
* journal_release_buffer: undo a get_write_access without any buffer
* updates, if the update decided in the end that it didn't need access.
......@@ -1140,7 +1139,6 @@ void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "exit");
unlock_journal(journal);
}
#endif
/**
* void journal_forget() - bforget() for potentially-journaled buffers.
......
......@@ -344,7 +344,9 @@ struct ext3_inode {
#endif
#define ext3_set_bit ext2_set_bit
#define ext3_set_bit_atomic ext2_set_bit_atomic
#define ext3_clear_bit ext2_clear_bit
#define ext3_clear_bit_atomic ext2_clear_bit_atomic
#define ext3_test_bit ext2_test_bit
#define ext3_find_first_zero_bit ext2_find_first_zero_bit
#define ext3_find_next_zero_bit ext2_find_next_zero_bit
......
......@@ -21,6 +21,13 @@
#include <linux/wait.h>
#endif
/*
 * Per-block-group in-memory bookkeeping.  ext3_sb_info.s_bgi points to an
 * array of these with one entry per group; ext3_fill_super() allocates the
 * array, zeroes it and initialises both spinlocks.  Each entry is
 * cacheline-aligned on SMP builds.
 */
struct ext3_bg_info {
/* debt value that find_group_orlov() checks against max_debt when picking a group */
u8 bg_debts;
/*
 * NOTE(review): presumably guards this group's block-allocation counters;
 * only its initialisation is visible here -- confirm against balloc.c.
 */
spinlock_t bg_balloc_lock;
/*
 * Held while this group's bg_free_inodes_count / bg_used_dirs_count
 * descriptor fields are updated (ext3_free_inode(), ext3_new_inode()).
 */
spinlock_t bg_ialloc_lock;
/*
 * Number of this group's free blocks counted as reserved; filled in by
 * ext3_check_descriptors(), walking from the last group backwards.
 */
unsigned long bg_reserved;
} ____cacheline_aligned_in_smp;
/*
* third extended-fs super-block data in memory
*/
......@@ -50,8 +57,7 @@ struct ext3_sb_info {
u32 s_next_generation;
u32 s_hash_seed[4];
int s_def_hash_version;
unsigned long s_dir_count;
u8 *s_debts;
struct ext3_bg_info *s_bgi;
/* Journaling */
struct inode * s_journal_inode;
......
......@@ -116,6 +116,12 @@ __ext3_journal_get_write_access(const char *where,
return err;
}
/*
 * ext3-level wrapper: passes handle and bh straight through to
 * journal_release_buffer() and returns nothing.
 */
static inline void
ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
{
journal_release_buffer(handle, bh);
}
static inline void
ext3_journal_forget(handle_t *handle, struct buffer_head *bh)
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment