Commit fdd4e158 authored by Yan, Zheng's avatar Yan, Zheng Committed by Ilya Dryomov

ceph: rework dcache readdir

Previously our dcache readdir code relies on that child dentries in
directory dentry's d_subdir list are sorted by dentry's offset in
descending order. When adding dentries to the dcache, if a dentry
already exists, our readdir code moves it to head of directory
dentry's d_subdir list. This design relies on dcache internals.
Al Viro suggests using ncpfs's approach: keeping array of pointers
to dentries in page cache of directory inode. the validity of those
pointers are presented by directory inode's complete and ordered
flags. When a dentry gets pruned, we clear directory inode's complete
flag in the d_prune() callback. Before moving a dentry to other
directory, we clear the ordered flag for both old and new directory.
Signed-off-by: default avatarYan, Zheng <zyan@redhat.com>
parent b459be73
......@@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
used |= CEPH_CAP_PIN;
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
if (ci->i_rdcache_ref ||
(!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
ci->vfs_inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
......@@ -1651,9 +1653,10 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
* If we fail, it's because pages are locked.... try again later.
*/
if ((!is_delayed || mdsc->stopping) &&
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
(file_wanted == 0 || /* no open files */
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
(file_wanted == 0 || /* no open files */
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
!tried_invalidate) {
......@@ -2805,7 +2808,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
*/
if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!ci->i_wrbuffer_ref) {
if (try_nonblocking_invalidate(inode)) {
......
This diff is collapsed.
......@@ -96,6 +96,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
}
cf->fmode = fmode;
cf->next_offset = 2;
cf->readdir_cache_idx = -1;
file->private_data = cf;
BUG_ON(inode->i_fop->release != ceph_release);
break;
......@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file)
ceph_mdsc_put_request(cf->last_readdir);
kfree(cf->last_name);
kfree(cf->dir_info);
dput(cf->dentry);
kmem_cache_free(ceph_file_cachep, cf);
/* wake up anyone waiting for caps on this inode */
......
......@@ -390,9 +390,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_inline_version = 0;
ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0;
ci->i_ordered_count = 0;
atomic_set(&ci->i_release_count, 1);
atomic_set(&ci->i_complete_count, 0);
atomic64_set(&ci->i_ordered_count, 1);
atomic64_set(&ci->i_release_count, 1);
atomic64_set(&ci->i_complete_seq[0], 0);
atomic64_set(&ci->i_complete_seq[1], 0);
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
......@@ -860,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode);
i_size_write(inode, 0);
__ceph_dir_set_complete(ci,
atomic_read(&ci->i_release_count),
ci->i_ordered_count);
atomic64_read(&ci->i_release_count),
atomic64_read(&ci->i_ordered_count));
}
wake = true;
......@@ -1214,6 +1216,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
dout("fill_trace doing d_move %p -> %p\n",
req->r_old_dentry, dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
d_move(req->r_old_dentry, dn);
dout(" src %p '%pd' dst %p '%pd'\n",
req->r_old_dentry,
......@@ -1224,10 +1230,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
......@@ -1335,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
return err;
}
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
if (ctl->page) {
kunmap(ctl->page);
page_cache_release(ctl->page);
ctl->page = NULL;
}
}
static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
struct ceph_readdir_cache_control *ctl,
struct ceph_mds_request *req)
{
struct ceph_inode_info *ci = ceph_inode(dir);
unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
unsigned idx = ctl->index % nsize;
pgoff_t pgoff = ctl->index / nsize;
if (!ctl->page || pgoff != page_index(ctl->page)) {
ceph_readdir_cache_release(ctl);
ctl->page = grab_cache_page(&dir->i_data, pgoff);
if (!ctl->page) {
ctl->index = -1;
return -ENOMEM;
}
/* reading/filling the cache are serialized by
* i_mutex, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
}
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
dout("readdir cache dn %p idx %d\n", dn, ctl->index);
ctl->dentries[idx] = dn;
ctl->index++;
} else {
dout("disable readdir cache\n");
ctl->index = -1;
}
return 0;
}
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
......@@ -1347,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di;
u64 r_readdir_offset = req->r_readdir_offset;
u32 frag = le32_to_cpu(rhead->args.readdir.frag);
struct ceph_readdir_cache_control cache_ctl = {};
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
......@@ -1356,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
frag, le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag);
if (ceph_frag_is_leftmost(frag))
r_readdir_offset = 2;
req->r_readdir_offset = 2;
else
r_readdir_offset = 0;
req->r_readdir_offset = 0;
}
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
snapdir = ceph_get_snapdir(d_inode(parent));
parent = d_find_alias(snapdir);
......@@ -1376,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
}
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
req->r_readdir_cache_idx = 0;
}
cache_ctl.index = req->r_readdir_cache_idx;
/* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_vino vino;
......@@ -1415,13 +1471,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
d_delete(dn);
dput(dn);
goto retry_lookup;
} else {
/* reorder parent's d_subdirs */
spin_lock(&parent->d_lock);
spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
list_move(&dn->d_child, &parent->d_subdirs);
spin_unlock(&dn->d_lock);
spin_unlock(&parent->d_lock);
}
/* inode */
......@@ -1438,13 +1487,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
}
}
if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation) < 0) {
ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation);
if (ret < 0) {
pr_err("fill_inode badness on %p\n", in);
if (d_really_is_negative(dn))
iput(in);
d_drop(dn);
err = ret;
goto next_item;
}
......@@ -1460,19 +1511,28 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
}
di = dn->d_fsdata;
di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
update_dentry_lease(dn, rinfo->dir_dlease[i],
req->r_session,
req->r_request_started);
if (err == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn,
&cache_ctl, req);
if (ret < 0)
err = ret;
}
next_item:
if (dn)
dput(dn);
}
if (err == 0)
req->r_did_prepopulate = true;
out:
if (err == 0) {
req->r_did_prepopulate = true;
req->r_readdir_cache_idx = cache_ctl.index;
}
ceph_readdir_cache_release(&cache_ctl);
if (snapdir) {
iput(snapdir);
dput(parent);
......
......@@ -253,6 +253,9 @@ struct ceph_mds_request {
bool r_got_unsafe, r_got_safe, r_got_result;
bool r_did_prepopulate;
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation;
......
......@@ -282,9 +282,9 @@ struct ceph_inode_info {
u32 i_time_warp_seq;
unsigned i_ceph_flags;
int i_ordered_count;
atomic_t i_release_count;
atomic_t i_complete_count;
atomic64_t i_release_count;
atomic64_t i_ordered_count;
atomic64_t i_complete_seq[2];
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
......@@ -471,30 +471,36 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
int release_count, int ordered_count)
long long release_count,
long long ordered_count)
{
atomic_set(&ci->i_complete_count, release_count);
if (ci->i_ordered_count == ordered_count)
ci->i_ceph_flags |= CEPH_I_DIR_ORDERED;
else
ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
smp_mb__before_atomic();
atomic64_set(&ci->i_complete_seq[0], release_count);
atomic64_set(&ci->i_complete_seq[1], ordered_count);
}
static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
{
atomic_inc(&ci->i_release_count);
atomic64_inc(&ci->i_release_count);
}
static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci)
{
atomic64_inc(&ci->i_ordered_count);
}
static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
{
return atomic_read(&ci->i_complete_count) ==
atomic_read(&ci->i_release_count);
return atomic64_read(&ci->i_complete_seq[0]) ==
atomic64_read(&ci->i_release_count);
}
static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
{
return __ceph_dir_is_complete(ci) &&
(ci->i_ceph_flags & CEPH_I_DIR_ORDERED);
return atomic64_read(&ci->i_complete_seq[0]) ==
atomic64_read(&ci->i_release_count) &&
atomic64_read(&ci->i_complete_seq[1]) ==
atomic64_read(&ci->i_ordered_count);
}
static inline void ceph_dir_clear_complete(struct inode *inode)
......@@ -504,20 +510,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode)
static inline void ceph_dir_clear_ordered(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
spin_lock(&ci->i_ceph_lock);
ci->i_ordered_count++;
ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
spin_unlock(&ci->i_ceph_lock);
__ceph_dir_clear_ordered(ceph_inode(inode));
}
static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
bool ret;
spin_lock(&ci->i_ceph_lock);
ret = __ceph_dir_is_complete_ordered(ci);
spin_unlock(&ci->i_ceph_lock);
bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode));
smp_rmb();
return ret;
}
......@@ -636,16 +635,20 @@ struct ceph_file_info {
unsigned offset; /* offset of last chunk, adjusted for . and .. */
unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
struct dentry *dentry; /* next dentry (for dcache readdir) */
int dir_release_count;
int dir_ordered_count;
long long dir_release_count;
long long dir_ordered_count;
int readdir_cache_idx;
/* used for -o dirstat read() on directory thing */
char *dir_info;
int dir_info_len;
};
struct ceph_readdir_cache_control {
struct page *page;
struct dentry **dentries;
int index;
};
/*
* A "snap realm" describes a subset of the file hierarchy sharing
......@@ -944,6 +947,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
/*
* our d_ops vary depending on whether the inode is live,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment