Commit f97cd2d9 authored by Hugh Dickins, committed by Linus Torvalds

[PATCH] shmem: no sbinfo for shm mount

SGI investigations have shown a dramatic contrast in scalability between
anonymous memory and shmem objects.  Processes building distinct shmem objects
in parallel hit heavy contention on the shmem superblock's stat_lock.  Across
256 cpus an intensive test runs 300 times slower than the anonymous case.

Jack Steiner has observed that all the shmem superblock free_blocks and
free_inodes accounting is redundant in the case of the internal mount used for
SysV shared memory and for shared writable /dev/zero objects (the cases which
most concern SGI): that mount specifically declines to impose any limits.

Based upon Brent Casavant's SHMEM_NOSBINFO patch, this instead simply removes
the shmem_sb_info structure from the internal kernel mount, testing for a null
sbinfo pointer wherever necessary.  shmem_set_size is moved within
CONFIG_TMPFS, and its argument is renamed "sbinfo" to match usage elsewhere.
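
For orientation, this is the shape of the change at each accounting site
(lifted from the first hunk of the diff below, shown as plain C; the comment
is mine):

	static void shmem_free_block(struct inode *inode)
	{
		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
		/* NULL for the internal shm mount: skip the superblock
		 * accounting, and with it the stat_lock contention */
		if (sbinfo) {
			spin_lock(&sbinfo->stat_lock);
			sbinfo->free_blocks++;
			inode->i_blocks -= BLOCKS_PER_PAGE;
			spin_unlock(&sbinfo->stat_lock);
		}
	}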

This brings shmem object scalability up to that of anonymous memory, in the
case where distinct processes are building (faulting to allocate) distinct
objects.  It significantly improves parallel building of a shared shmem object
(that test runs 14 times faster across 256 cpus), but other issues remain in
that case: to be addressed in later patches.
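
For readers following the diff: the internal instance is distinguished at
mount time by MS_NOUSER, and shmem_fill_super now allocates an sbinfo only
for user mounts.  A condensed view of the two hunks involved (elisions and
comments mine):

	/* init_tmpfs(): the internal instance is mounted MS_NOUSER */
	shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
				tmpfs_fs_type.name, NULL);

	/* shmem_fill_super(): sbinfo, and hence all limit checking,
	 * exists only for user mounts */
	if (!(sb->s_flags & MS_NOUSER)) {
		sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
		...
		sb->s_fs_info = sbinfo;
	}
	/* otherwise sb->s_fs_info remains NULL */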
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 77cdadab
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -185,10 +185,12 @@ static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED;
 static void shmem_free_block(struct inode *inode)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	if (sbinfo) {
 	spin_lock(&sbinfo->stat_lock);
 	sbinfo->free_blocks++;
 	inode->i_blocks -= BLOCKS_PER_PAGE;
 	spin_unlock(&sbinfo->stat_lock);
+	}
 }

 /*
@@ -213,11 +215,13 @@ static void shmem_recalc_inode(struct inode *inode)
 	if (freed > 0) {
 		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 		info->alloced -= freed;
+		shmem_unacct_blocks(info->flags, freed);
+		if (sbinfo) {
 		spin_lock(&sbinfo->stat_lock);
 		sbinfo->free_blocks += freed;
 		inode->i_blocks -= freed*BLOCKS_PER_PAGE;
 		spin_unlock(&sbinfo->stat_lock);
-		shmem_unacct_blocks(info->flags, freed);
+		}
 	}
 }
@@ -350,6 +354,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 	 * page (and perhaps indirect index pages) yet to allocate:
 	 * a waste to allocate index if we cannot allocate data.
 	 */
+	if (sbinfo) {
 	spin_lock(&sbinfo->stat_lock);
 	if (sbinfo->free_blocks <= 1) {
 		spin_unlock(&sbinfo->stat_lock);
@@ -358,6 +363,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 	sbinfo->free_blocks--;
 	inode->i_blocks += BLOCKS_PER_PAGE;
 	spin_unlock(&sbinfo->stat_lock);
+	}
 	spin_unlock(&info->lock);

 	page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
@@ -605,10 +611,12 @@ static void shmem_delete_inode(struct inode *inode)
 		inode->i_size = 0;
 		shmem_truncate(inode);
 	}
+	if (sbinfo) {
 	BUG_ON(inode->i_blocks);
 	spin_lock(&sbinfo->stat_lock);
 	sbinfo->free_inodes++;
 	spin_unlock(&sbinfo->stat_lock);
+	}
 	clear_inode(inode);
 }
@@ -1001,8 +1009,10 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
 		} else {
 			shmem_swp_unmap(entry);
 			sbinfo = SHMEM_SB(inode->i_sb);
+			if (sbinfo) {
 			spin_lock(&sbinfo->stat_lock);
-			if (sbinfo->free_blocks == 0 || shmem_acct_block(info->flags)) {
+			if (sbinfo->free_blocks == 0 ||
+			    shmem_acct_block(info->flags)) {
 				spin_unlock(&sbinfo->stat_lock);
 				spin_unlock(&info->lock);
 				error = -ENOSPC;
@@ -1011,6 +1021,11 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
 			sbinfo->free_blocks--;
 			inode->i_blocks += BLOCKS_PER_PAGE;
 			spin_unlock(&sbinfo->stat_lock);
+			} else if (shmem_acct_block(info->flags)) {
+				spin_unlock(&info->lock);
+				error = -ENOSPC;
+				goto failed;
+			}

 			if (!filepage) {
 				spin_unlock(&info->lock);
@@ -1187,6 +1202,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

+	if (sbinfo) {
 	spin_lock(&sbinfo->stat_lock);
 	if (!sbinfo->free_inodes) {
 		spin_unlock(&sbinfo->stat_lock);
@@ -1194,6 +1210,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 	}
 	sbinfo->free_inodes--;
 	spin_unlock(&sbinfo->stat_lock);
+	}

 	inode = new_inode(sb);
 	if (inode) {
@@ -1234,32 +1251,32 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 	return inode;
 }

-static int shmem_set_size(struct shmem_sb_info *info,
+#ifdef CONFIG_TMPFS
+static int shmem_set_size(struct shmem_sb_info *sbinfo,
 			  unsigned long max_blocks, unsigned long max_inodes)
 {
 	int error;
 	unsigned long blocks, inodes;

-	spin_lock(&info->stat_lock);
-	blocks = info->max_blocks - info->free_blocks;
-	inodes = info->max_inodes - info->free_inodes;
+	spin_lock(&sbinfo->stat_lock);
+	blocks = sbinfo->max_blocks - sbinfo->free_blocks;
+	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
 	error = -EINVAL;
 	if (max_blocks < blocks)
 		goto out;
 	if (max_inodes < inodes)
 		goto out;
 	error = 0;
-	info->max_blocks = max_blocks;
-	info->free_blocks = max_blocks - blocks;
-	info->max_inodes = max_inodes;
-	info->free_inodes = max_inodes - inodes;
+	sbinfo->max_blocks = max_blocks;
+	sbinfo->free_blocks = max_blocks - blocks;
+	sbinfo->max_inodes = max_inodes;
+	sbinfo->free_inodes = max_inodes - inodes;
 out:
-	spin_unlock(&info->stat_lock);
+	spin_unlock(&sbinfo->stat_lock);
 	return error;
 }

-#ifdef CONFIG_TMPFS
 static struct inode_operations shmem_symlink_inode_operations;
 static struct inode_operations shmem_symlink_inline_operations;
@@ -1819,47 +1836,51 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 }
 #endif

+static void shmem_put_super(struct super_block *sb)
+{
+	kfree(sb->s_fs_info);
+	sb->s_fs_info = NULL;
+}
+
 static int shmem_fill_super(struct super_block *sb,
 			    void *data, int silent)
 {
 	struct inode *inode;
 	struct dentry *root;
-	unsigned long blocks, inodes;
 	int mode = S_IRWXUGO | S_ISVTX;
 	uid_t uid = current->fsuid;
 	gid_t gid = current->fsgid;
 	int err = -ENOMEM;

-	sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
-	if (!sbinfo)
-		return -ENOMEM;
-	sb->s_fs_info = sbinfo;
-	memset(sbinfo, 0, sizeof(struct shmem_sb_info));
-
+#ifdef CONFIG_TMPFS
 	/*
 	 * Per default we only allow half of the physical ram per
-	 * tmpfs instance, limiting inodes to one per page of lowmem.
+	 * tmpfs instance, limiting inodes to one per page of lowmem;
+	 * but the internal instance is left unlimited.
 	 */
-	blocks = totalram_pages / 2;
-	inodes = totalram_pages - totalhigh_pages;
+	if (!(sb->s_flags & MS_NOUSER)) {
+		struct shmem_sb_info *sbinfo;
+		unsigned long blocks = totalram_pages / 2;
+		unsigned long inodes = totalram_pages - totalhigh_pages;
 	if (inodes > blocks)
 		inodes = blocks;
-#ifdef CONFIG_TMPFS
-	if (shmem_parse_options(data, &mode, &uid, &gid, &blocks, &inodes)) {
-		err = -EINVAL;
-		goto failed;
-	}
-#else
-	sb->s_flags |= MS_NOUSER;
-#endif
+		if (shmem_parse_options(data, &mode,
+					&uid, &gid, &blocks, &inodes))
+			return -EINVAL;

-	struct shmem_sb_info *sbinfo;
+		sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
+		if (!sbinfo)
+			return -ENOMEM;
+		sb->s_fs_info = sbinfo;
 	spin_lock_init(&sbinfo->stat_lock);
 	sbinfo->max_blocks = blocks;
 	sbinfo->free_blocks = blocks;
 	sbinfo->max_inodes = inodes;
 	sbinfo->free_inodes = inodes;
+	}
+#endif
+
 	sb->s_maxbytes = SHMEM_MAX_BYTES;
 	sb->s_blocksize = PAGE_CACHE_SIZE;
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1879,17 +1900,10 @@ static int shmem_fill_super(struct super_block *sb,
 failed_iput:
 	iput(inode);
 failed:
-	kfree(sbinfo);
-	sb->s_fs_info = NULL;
+	shmem_put_super(sb);
 	return err;
 }

-static void shmem_put_super(struct super_block *sb)
-{
-	kfree(sb->s_fs_info);
-	sb->s_fs_info = NULL;
-}
-
 static kmem_cache_t *shmem_inode_cachep;

 static struct inode *shmem_alloc_inode(struct super_block *sb)
@@ -2023,15 +2037,13 @@ static int __init init_tmpfs(void)
 #ifdef CONFIG_TMPFS
 	devfs_mk_dir("shm");
 #endif
-	shm_mnt = kern_mount(&tmpfs_fs_type);
+	shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
+				tmpfs_fs_type.name, NULL);
 	if (IS_ERR(shm_mnt)) {
 		error = PTR_ERR(shm_mnt);
 		printk(KERN_ERR "Could not kern_mount tmpfs\n");
 		goto out1;
 	}
-
-	/* The internal instance should not do size checking */
-	shmem_set_size(SHMEM_SB(shm_mnt->mnt_sb), ULONG_MAX, ULONG_MAX);
 	return 0;

 out1:
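
For reference, SHMEM_SB is simply the s_fs_info accessor (its definition is
outside this diff; shown here as it stood in trees of this era), which is why
leaving s_fs_info NULL on the internal mount makes every guarded site above
fall through:

	static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
	{
		return sb->s_fs_info;
	}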