Commit da353b0d authored by David Chinner's avatar David Chinner Committed by Tim Shimmin

[XFS] Radix tree based inode caching

One of the perpetual scaling problems XFS has is indexing its incore
inodes. We currently use hashes, and the default hash sizes chosen can
only ever be a tradeoff between memory consumption and the maximum
realistic size of the cache.

As a result, anyone who has millions of inodes cached on a filesystem
needs to tune the size of the cache via the ihashsize mount option to
allow decent scalability with inode cache operations.

A further problem is the separate inode cluster hash, whose size is based
on the ihashsize but is smaller, and so under certain conditions (sparse
cluster cache population) this can become a limitation long before the
inode hash is causing issues.

The following patchset removes the inode hash and cluster hash and
replaces them with radix trees to avoid the scalability limitations of the
hashes. It also reduces the size of the inodes by 3 pointers....

SGI-PV: 969561
SGI-Modid: xfs-linux-melb:xfs-kern:29481a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
parent 39cd9f87
......@@ -17,10 +17,12 @@
*/
#include "xfs.h"
#include "xfs_types.h"
#include "xfs_dmapi.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_export.h"
......
......@@ -197,6 +197,10 @@ typedef struct xfs_perag
#endif
xfs_perag_busy_t *pagb_list; /* unstable blocks */
atomic_t pagf_fstrms; /* # of filestreams active in this AG */
int pag_ici_init; /* incore inode cache initialised */
rwlock_t pag_ici_lock; /* incore inode lock */
struct radix_tree_root pag_ici_root; /* incore inode cache root */
} xfs_perag_t;
#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
......
......@@ -23,6 +23,7 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_buf_item.h"
......
......@@ -89,7 +89,6 @@ struct xfs_mount_args {
#define XFSMNT_IDELETE 0x08000000 /* inode cluster delete */
#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width
* allocation */
#define XFSMNT_IHASHSIZE 0x20000000 /* inode hash table size */
#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename
* symlink,mkdir,rmdir,mknod */
#define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */
......
......@@ -22,6 +22,7 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
......
......@@ -22,6 +22,7 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
......
......@@ -22,6 +22,7 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
......
......@@ -22,6 +22,7 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
......
......@@ -22,6 +22,7 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
......
......@@ -23,6 +23,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_trans_priv.h"
......
This diff is collapsed.
......@@ -52,7 +52,7 @@
kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;
kmem_zone_t *xfs_chashlist_zone;
kmem_zone_t *xfs_icluster_zone;
/*
* Used in xfs_itruncate(). This is the maximum number of extents
......@@ -2182,10 +2182,10 @@ xfs_ifree_cluster(
int i, j, found, pre_flushed;
xfs_daddr_t blkno;
xfs_buf_t *bp;
xfs_ihash_t *ih;
xfs_inode_t *ip, **ip_found;
xfs_inode_log_item_t *iip;
xfs_log_item_t *lip;
xfs_perag_t *pag = xfs_get_perag(mp, inum);
SPLDECL(s);
if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
......@@ -2220,23 +2220,20 @@ xfs_ifree_cluster(
*/
found = 0;
for (i = 0; i < ninodes; i++) {
ih = XFS_IHASH(mp, inum + i);
read_lock(&ih->ih_lock);
for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
if (ip->i_ino == inum + i)
break;
}
read_lock(&pag->pag_ici_lock);
ip = radix_tree_lookup(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, (inum + i)));
/* Inode not in memory or we found it already,
* nothing to do
*/
if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
read_unlock(&ih->ih_lock);
read_unlock(&pag->pag_ici_lock);
continue;
}
if (xfs_inode_clean(ip)) {
read_unlock(&ih->ih_lock);
read_unlock(&pag->pag_ici_lock);
continue;
}
......@@ -2259,7 +2256,7 @@ xfs_ifree_cluster(
ip_found[found++] = ip;
}
}
read_unlock(&ih->ih_lock);
read_unlock(&pag->pag_ici_lock);
continue;
}
......@@ -2277,8 +2274,7 @@ xfs_ifree_cluster(
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
}
read_unlock(&ih->ih_lock);
read_unlock(&pag->pag_ici_lock);
}
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
......@@ -2333,6 +2329,7 @@ xfs_ifree_cluster(
}
kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
xfs_put_perag(mp, pag);
}
/*
......@@ -3050,12 +3047,11 @@ xfs_iflush(
xfs_mount_t *mp;
int error;
/* REFERENCED */
xfs_chash_t *ch;
xfs_inode_t *iq;
int clcount; /* count of inodes clustered */
int bufwasdelwri;
struct hlist_node *entry;
enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
SPLDECL(s);
XFS_STATS_INC(xs_iflush_count);
......@@ -3169,14 +3165,14 @@ xfs_iflush(
* inode clustering:
* see if other inodes can be gathered into this write
*/
ip->i_chash->chl_buf = bp;
ch = XFS_CHASH(mp, ip->i_blkno);
s = mutex_spinlock(&ch->ch_lock);
spin_lock(&ip->i_cluster->icl_lock);
ip->i_cluster->icl_buf = bp;
clcount = 0;
for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) {
hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
if (iq == ip)
continue;
/*
* Do an un-protected check to see if the inode is dirty and
* is a candidate for flushing. These checks will be repeated
......@@ -3227,7 +3223,7 @@ xfs_iflush(
xfs_iunlock(iq, XFS_ILOCK_SHARED);
}
}
mutex_spinunlock(&ch->ch_lock, s);
spin_unlock(&ip->i_cluster->icl_lock);
if (clcount) {
XFS_STATS_INC(xs_icluster_flushcnt);
......@@ -3264,7 +3260,7 @@ xfs_iflush(
/* Corruption detected in the clustering loop. Invalidate the
* inode buffer and shut down the filesystem.
*/
mutex_spinunlock(&ch->ch_lock, s);
spin_unlock(&ip->i_cluster->icl_lock);
/*
* Clean up the buffer. If it was B_DELWRI, just release it --
......
......@@ -172,41 +172,18 @@ typedef struct xfs_iocore {
extern void xfs_iocore_inode_init(struct xfs_inode *);
extern void xfs_iocore_inode_reinit(struct xfs_inode *);
/*
* This is the type used in the xfs inode hash table.
* An array of these is allocated for each mounted
* file system to hash the inodes for that file system.
*/
typedef struct xfs_ihash {
struct xfs_inode *ih_next;
rwlock_t ih_lock;
uint ih_version;
} xfs_ihash_t;
#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)(ino)) % (mp)->m_ihsize))
/*
* This is the xfs inode cluster hash. This hash is used by xfs_iflush to
* find inodes that share a cluster and can be flushed to disk at the same
* time.
* This is the xfs inode cluster structure. This structure is used by
* xfs_iflush to find inodes that share a cluster and can be flushed to disk at
* the same time.
*/
typedef struct xfs_chashlist {
struct xfs_chashlist *chl_next;
struct xfs_chashlist *chl_prev;
struct xfs_inode *chl_ip;
xfs_daddr_t chl_blkno; /* starting block number of
typedef struct xfs_icluster {
struct hlist_head icl_inodes; /* list of inodes on cluster */
xfs_daddr_t icl_blkno; /* starting block number of
* the cluster */
struct xfs_buf *chl_buf; /* the inode buffer */
} xfs_chashlist_t;
typedef struct xfs_chash {
xfs_chashlist_t *ch_list;
lock_t ch_lock;
} xfs_chash_t;
#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize))
struct xfs_buf *icl_buf; /* the inode buffer */
lock_t icl_lock; /* inode list lock */
} xfs_icluster_t;
/*
* This is the xfs in-core inode structure.
......@@ -269,21 +246,15 @@ typedef struct xfs_icdinode {
} xfs_icdinode_t;
typedef struct {
struct xfs_ihash *ip_hash; /* pointer to hash header */
struct xfs_inode *ip_next; /* inode hash link forw */
struct xfs_inode *ip_mnext; /* next inode in mount list */
struct xfs_inode *ip_mprev; /* ptr to prev inode */
struct xfs_inode **ip_prevp; /* ptr to prev i_next */
struct xfs_mount *ip_mount; /* fs mount struct ptr */
} xfs_iptr_t;
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_ihash *i_hash; /* pointer to hash header */
struct xfs_inode *i_next; /* inode hash link forw */
struct xfs_inode *i_mnext; /* next inode in mount list */
struct xfs_inode *i_mprev; /* ptr to prev inode */
struct xfs_inode **i_prevp; /* ptr to prev i_next */
struct xfs_mount *i_mount; /* fs mount struct ptr */
struct list_head i_reclaim; /* reclaim list */
struct bhv_desc i_bhv_desc; /* inode behavior descriptor*/
......@@ -324,9 +295,8 @@ typedef struct xfs_inode {
unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */
xfs_chashlist_t *i_chash; /* cluster hash list header */
struct xfs_inode *i_cnext; /* cluster hash link forward */
struct xfs_inode *i_cprev; /* cluster hash link backward */
xfs_icluster_t *i_cluster; /* cluster list header */
struct hlist_node i_cnode; /* cluster link node */
xfs_fsize_t i_size; /* in-memory size */
/* Trace buffers per inode. */
......@@ -521,8 +491,6 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
*/
void xfs_ihash_init(struct xfs_mount *);
void xfs_ihash_free(struct xfs_mount *);
void xfs_chash_init(struct xfs_mount *);
void xfs_chash_free(struct xfs_mount *);
xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
struct xfs_trans *);
void xfs_inode_lock_init(xfs_inode_t *, struct bhv_vnode *);
......@@ -633,7 +601,7 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#define xfs_inobp_check(mp, bp)
#endif /* DEBUG */
extern struct kmem_zone *xfs_chashlist_zone;
extern struct kmem_zone *xfs_icluster_zone;
extern struct kmem_zone *xfs_ifork_zone;
extern struct kmem_zone *xfs_inode_zone;
extern struct kmem_zone *xfs_ili_zone;
......
......@@ -160,11 +160,6 @@ xfs_mount_free(
xfs_mount_t *mp,
int remove_bhv)
{
if (mp->m_ihash)
xfs_ihash_free(mp);
if (mp->m_chash)
xfs_chash_free(mp);
if (mp->m_perag) {
int agno;
......@@ -342,6 +337,17 @@ xfs_mount_validate_sb(
return 0;
}
/*
 * Set up the per-AG incore inode cache: its rwlock and its radix tree root.
 * The pag_ici_init flag makes this a one-shot operation; calling it again on
 * a perag that has already been initialised leaves it untouched.
 */
STATIC void
xfs_initialize_perag_icache(
	xfs_perag_t	*pag)
{
	/* Already set up by an earlier call - nothing to do. */
	if (pag->pag_ici_init)
		return;

	rwlock_init(&pag->pag_ici_lock);
	INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
	pag->pag_ici_init = 1;
}
xfs_agnumber_t
xfs_initialize_perag(
bhv_vfs_t *vfs,
......@@ -396,12 +402,14 @@ xfs_initialize_perag(
pag->pagi_inodeok = 1;
if (index < max_metadata)
pag->pagf_metadata = 1;
xfs_initialize_perag_icache(pag);
}
} else {
/* Setup default behavior for smaller filesystems */
for (index = 0; index < agcount; index++) {
pag = &mp->m_perag[index];
pag->pagi_inodeok = 1;
xfs_initialize_perag_icache(pag);
}
}
return index;
......@@ -1032,13 +1040,6 @@ xfs_mountfs(
*/
xfs_trans_init(mp);
/*
* Allocate and initialize the inode hash table for this
* file system.
*/
xfs_ihash_init(mp);
xfs_chash_init(mp);
/*
* Allocate and initialize the per-ag data.
*/
......@@ -1190,8 +1191,6 @@ xfs_mountfs(
error3:
xfs_log_unmount_dealloc(mp);
error2:
xfs_ihash_free(mp);
xfs_chash_free(mp);
for (agno = 0; agno < sbp->sb_agcount; agno++)
if (mp->m_perag[agno].pagb_list)
kmem_free(mp->m_perag[agno].pagb_list,
......
......@@ -57,10 +57,7 @@ struct log;
struct bhv_vfs;
struct bhv_vnode;
struct xfs_mount_args;
struct xfs_ihash;
struct xfs_chash;
struct xfs_inode;
struct xfs_perag;
struct xfs_iocore;
struct xfs_bmbt_irec;
struct xfs_bmap_free;
......@@ -335,8 +332,6 @@ typedef struct xfs_mount {
xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
lock_t m_agirotor_lock;/* .. and lock protecting it */
xfs_agnumber_t m_maxagi; /* highest inode alloc group */
size_t m_ihsize; /* size of next field */
struct xfs_ihash *m_ihash; /* fs private inode hash table*/
struct xfs_inode *m_inodes; /* active inode list */
struct list_head m_del_inodes; /* inodes to reclaim */
mutex_t m_ilock; /* inode list mutex */
......@@ -458,7 +453,7 @@ typedef struct xfs_mount {
#define XFS_MOUNT_IDELETE (1ULL << 18) /* delete empty inode clusters*/
#define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width
* allocation */
#define XFS_MOUNT_IHASHSIZE (1ULL << 20) /* inode hash table size */
/* (1ULL << 20) -- currently unused */
#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
* I/O size in stat() */
......@@ -571,6 +566,21 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
/*
* perag get/put wrappers for eventual ref counting
*/
static inline xfs_perag_t *
xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino)
{
	/* XFS_INO_TO_AGNO() yields the index of this inode's entry in m_perag. */
	xfs_perag_t	*pag = mp->m_perag + XFS_INO_TO_AGNO(mp, ino);

	return pag;
}
/*
 * Counterpart to xfs_get_perag(). Currently a no-op: both arguments are
 * ignored and nothing is released.
 */
static inline void
xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
{
	/* nothing to see here, move along */
}
/*
* Per-cpu superblock locking functions
*/
......
......@@ -22,6 +22,7 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
......
......@@ -22,6 +22,7 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_trans_priv.h"
......
......@@ -22,6 +22,7 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_trans_priv.h"
......
......@@ -117,8 +117,8 @@ xfs_init(void)
xfs_ili_zone =
kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
KM_ZONE_SPREAD, NULL);
xfs_chashlist_zone =
kmem_zone_init_flags(sizeof(xfs_chashlist_t), "xfs_chashlist",
xfs_icluster_zone =
kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster",
KM_ZONE_SPREAD, NULL);
/*
......@@ -163,7 +163,7 @@ xfs_cleanup(void)
extern kmem_zone_t *xfs_efd_zone;
extern kmem_zone_t *xfs_efi_zone;
extern kmem_zone_t *xfs_buf_item_zone;
extern kmem_zone_t *xfs_chashlist_zone;
extern kmem_zone_t *xfs_icluster_zone;
xfs_cleanup_procfs();
xfs_sysctl_unregister();
......@@ -199,7 +199,7 @@ xfs_cleanup(void)
kmem_zone_destroy(xfs_efi_zone);
kmem_zone_destroy(xfs_ifork_zone);
kmem_zone_destroy(xfs_ili_zone);
kmem_zone_destroy(xfs_chashlist_zone);
kmem_zone_destroy(xfs_icluster_zone);
}
/*
......@@ -246,7 +246,6 @@ xfs_start_flags(
ap->logbufsize);
return XFS_ERROR(EINVAL);
}
mp->m_ihsize = ap->ihashsize;
mp->m_logbsize = ap->logbufsize;
mp->m_fsname_len = strlen(ap->fsname) + 1;
mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP);
......@@ -293,8 +292,6 @@ xfs_start_flags(
mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
}
if (ap->flags & XFSMNT_IHASHSIZE)
mp->m_flags |= XFS_MOUNT_IHASHSIZE;
if (ap->flags & XFSMNT_IDELETE)
mp->m_flags |= XFS_MOUNT_IDELETE;
if (ap->flags & XFSMNT_DIRSYNC)
......@@ -1673,7 +1670,6 @@ xfs_vget(
#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */
#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */
#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */
#define MNTOPT_IHASHSIZE "ihashsize" /* size of inode hash table */
#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */
#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
* unwritten extent conversion */
......@@ -1799,15 +1795,6 @@ xfs_parseargs(
iosize = suffix_strtoul(value, &eov, 10);
args->flags |= XFSMNT_IOSIZE;
args->iosizelog = ffs(iosize) - 1;
} else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) {
if (!value || !*value) {
cmn_err(CE_WARN,
"XFS: %s option requires an argument",
this_char);
return EINVAL;
}
args->flags |= XFSMNT_IHASHSIZE;
args->ihashsize = simple_strtoul(value, &eov, 10);
} else if (!strcmp(this_char, MNTOPT_GRPID) ||
!strcmp(this_char, MNTOPT_BSDGROUPS)) {
vfsp->vfs_flag |= VFS_GRPID;
......@@ -1876,6 +1863,9 @@ xfs_parseargs(
args->flags &= ~XFSMNT_ATTR2;
} else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
args->flags2 |= XFSMNT2_FILESTREAMS;
} else if (!strcmp(this_char, "ihashsize")) {
cmn_err(CE_WARN,
"XFS: ihashsize no longer used, option is deprecated.");
} else if (!strcmp(this_char, "osyncisdsync")) {
/* no-op, this is now the default */
cmn_err(CE_WARN,
......@@ -1966,9 +1956,6 @@ xfs_showargs(
seq_puts(m, xfs_infop->str);
}
if (mp->m_flags & XFS_MOUNT_IHASHSIZE)
seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", (int)mp->m_ihsize);
if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
(int)(1 << mp->m_writeio_log) >> 10);
......
......@@ -3876,7 +3876,7 @@ xfs_finish_reclaim(
int locked,
int sync_mode)
{
xfs_ihash_t *ih = ip->i_hash;
xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
bhv_vnode_t *vp = XFS_ITOV_NULL(ip);
int error;
......@@ -3888,12 +3888,12 @@ xfs_finish_reclaim(
* Once we have the XFS_IRECLAIM flag set it will not touch
* us.
*/
write_lock(&ih->ih_lock);
write_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
(!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
spin_unlock(&ip->i_flags_lock);
write_unlock(&ih->ih_lock);
write_unlock(&pag->pag_ici_lock);
if (locked) {
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
......@@ -3902,7 +3902,8 @@ xfs_finish_reclaim(
}
__xfs_iflags_set(ip, XFS_IRECLAIM);
spin_unlock(&ip->i_flags_lock);
write_unlock(&ih->ih_lock);
write_unlock(&pag->pag_ici_lock);
xfs_put_perag(ip->i_mount, pag);
/*
* If the inode is still dirty, then flush it out. If the inode
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment