Commit 0aecba61 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull vfs d_inode/d_flags memory ordering fixes from Al Viro:
 "Fallout from tree-wide audit for ->d_inode/->d_flags barriers use.
  Basically, the problem is that negative pinned dentries require
  careful treatment - unless ->d_lock is locked or parent is held at
  least shared, another thread can make them positive right under us.

  Most of the uses turned out to be safe - the main surprises as far as
  filesystems are concerned were

   - race in dget_parent() fastpath, that might end up with the caller
     observing the returned dentry _negative_, due to insufficient
     barriers. It is positive in memory, but we could end up seeing the
     wrong value of ->d_inode in CPU cache. Fixed.

   - manual checks that result of lookup_one_len_unlocked() is positive
     (and rejection of negatives). Again, insufficient barriers (we
     might end up with inconsistent observed values of ->d_inode and
     ->d_flags). Fixed by switching to a new primitive that does the
     checks itself and returns ERR_PTR(-ENOENT) instead of a negative
     dentry. That way we get rid of boilerplate converting negatives
     into ERR_PTR(-ENOENT) in the callers and have a single place to
     deal with the barrier-related mess - inside fs/namei.c rather than
     in every caller out there.

  The guts of pathname resolution *do* need to be careful - the race
  found by Ritesh is real, as well as several similar races.
  Fortunately, it turns out that we can take care of that with fairly
  local changes in there.

  The tree-wide audit had not been fun, and I hate the idea of repeating
  it. I think the right approach would be to annotate the places where
  we are _not_ guaranteed ->d_inode/->d_flags stability and have sparse
  catch regressions. But I'm still not sure what would be the least
  invasive way of doing that and it's clearly the next cycle fodder"

* 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
  fs/namei.c: fix missing barriers when checking positivity
  fix dget_parent() fastpath race
  new helper: lookup_positive_unlocked()
  fs/namei.c: pull positivity check into follow_managed()
parents b0d4beaa 2fa6b1e0
......@@ -730,11 +730,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
struct inode *dir = d_inode(dentry);
struct dentry *child;
if (!dir) {
dput(dentry);
dentry = ERR_PTR(-ENOENT);
break;
}
if (!S_ISDIR(dir->i_mode)) {
dput(dentry);
dentry = ERR_PTR(-ENOTDIR);
......@@ -751,7 +746,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
while (*s && *s != sep)
s++;
child = lookup_one_len_unlocked(p, dentry, s - p);
child = lookup_positive_unlocked(p, dentry, s - p);
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
......
......@@ -319,7 +319,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
flags |= type_flags;
WRITE_ONCE(dentry->d_flags, flags);
smp_store_release(&dentry->d_flags, flags);
}
static inline void __d_clear_type_and_inode(struct dentry *dentry)
......@@ -903,17 +903,19 @@ struct dentry *dget_parent(struct dentry *dentry)
{
int gotref;
struct dentry *ret;
unsigned seq;
/*
* Do optimistic parent lookup without any
* locking.
*/
rcu_read_lock();
seq = raw_seqcount_begin(&dentry->d_seq);
ret = READ_ONCE(dentry->d_parent);
gotref = lockref_get_not_zero(&ret->d_lockref);
rcu_read_unlock();
if (likely(gotref)) {
if (likely(ret == READ_ONCE(dentry->d_parent)))
if (!read_seqcount_retry(&dentry->d_seq, seq))
return ret;
dput(ret);
}
......
......@@ -299,13 +299,9 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
dentry = lookup_one_len_unlocked(name, parent, strlen(name));
dentry = lookup_positive_unlocked(name, parent, strlen(name));
if (IS_ERR(dentry))
return NULL;
if (!d_really_is_positive(dentry)) {
dput(dentry);
return NULL;
}
return dentry;
}
EXPORT_SYMBOL_GPL(debugfs_lookup);
......
......@@ -223,7 +223,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
dput(dentry);
return ERR_PTR(-EINVAL);
}
dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
dtmp = lookup_positive_unlocked(kntmp->name, dentry,
strlen(kntmp->name));
dput(dentry);
if (IS_ERR(dtmp))
......
......@@ -1210,25 +1210,25 @@ static int follow_automount(struct path *path, struct nameidata *nd,
* - Flagged as automount point
*
* This may only be called in refwalk mode.
* On success path->dentry is known positive.
*
* Serialization is taken care of in namespace.c
*/
static int follow_managed(struct path *path, struct nameidata *nd)
{
struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
unsigned managed;
unsigned flags;
bool need_mntput = false;
int ret = 0;
/* Given that we're not holding a lock here, we retain the value in a
* local variable for each dentry as we look at it so that we don't see
* the components of that value change under us */
while (managed = READ_ONCE(path->dentry->d_flags),
managed &= DCACHE_MANAGED_DENTRY,
unlikely(managed != 0)) {
while (flags = smp_load_acquire(&path->dentry->d_flags),
unlikely(flags & DCACHE_MANAGED_DENTRY)) {
/* Allow the filesystem to manage the transit without i_mutex
* being held. */
if (managed & DCACHE_MANAGE_TRANSIT) {
if (flags & DCACHE_MANAGE_TRANSIT) {
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
ret = path->dentry->d_op->d_manage(path, false);
......@@ -1237,7 +1237,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
}
/* Transit to a mounted filesystem. */
if (managed & DCACHE_MOUNTED) {
if (flags & DCACHE_MOUNTED) {
struct vfsmount *mounted = lookup_mnt(path);
if (mounted) {
dput(path->dentry);
......@@ -1256,7 +1256,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
}
/* Handle an automount point */
if (managed & DCACHE_NEED_AUTOMOUNT) {
if (flags & DCACHE_NEED_AUTOMOUNT) {
ret = follow_automount(path, nd, &need_mntput);
if (ret < 0)
break;
......@@ -1269,10 +1269,12 @@ static int follow_managed(struct path *path, struct nameidata *nd)
if (need_mntput && path->mnt == mnt)
mntput(path->mnt);
if (ret == -EISDIR || !ret)
ret = 1;
if (need_mntput)
nd->flags |= LOOKUP_JUMPED;
if (ret == -EISDIR || !ret)
ret = 1;
if (ret > 0 && unlikely(d_flags_negative(flags)))
ret = -ENOENT;
if (unlikely(ret < 0))
path_put_conditional(path, nd);
return ret;
......@@ -1621,10 +1623,6 @@ static int lookup_fast(struct nameidata *nd,
dput(dentry);
return status;
}
if (unlikely(d_is_negative(dentry))) {
dput(dentry);
return -ENOENT;
}
path->mnt = mnt;
path->dentry = dentry;
......@@ -1811,11 +1809,6 @@ static int walk_component(struct nameidata *nd, int flags)
if (unlikely(err < 0))
return err;
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
}
seq = 0; /* we are already out of RCU mode */
inode = d_backing_inode(path.dentry);
}
......@@ -2568,6 +2561,26 @@ struct dentry *lookup_one_len_unlocked(const char *name,
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
/*
* Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
* on negatives. Returns known positive or ERR_PTR(); that's what
* most of the users want. Note that pinned negative with unlocked parent
* _can_ become positive at any time, so callers of lookup_one_len_unlocked()
* need to be very careful; pinned positives have ->d_inode stable, so
* this one avoids such problems.
*/
struct dentry *lookup_positive_unlocked(const char *name,
struct dentry *base, int len)
{
struct dentry *ret = lookup_one_len_unlocked(name, base, len);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
dput(ret);
ret = ERR_PTR(-ENOENT);
}
return ret;
}
EXPORT_SYMBOL(lookup_positive_unlocked);
#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
......@@ -2662,7 +2675,7 @@ mountpoint_last(struct nameidata *nd)
return PTR_ERR(path.dentry);
}
}
if (d_is_negative(path.dentry)) {
if (d_flags_negative(smp_load_acquire(&path.dentry->d_flags))) {
dput(path.dentry);
return -ENOENT;
}
......@@ -3356,11 +3369,6 @@ static int do_last(struct nameidata *nd,
if (unlikely(error < 0))
return error;
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
}
/*
* create/update audit record if it already exists.
*/
......
......@@ -863,13 +863,11 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
dchild = lookup_one_len_unlocked(name, dparent, namlen);
dchild = lookup_positive_unlocked(name, dparent, namlen);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
goto out;
if (d_really_is_negative(dchild))
goto out;
if (dchild->d_inode->i_ino != ino)
goto out;
rv = fh_compose(fhp, exp, dchild, &cd->fh);
......
......@@ -2991,18 +2991,9 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
__be32 nfserr;
int ignore_crossmnt = 0;
dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
if (d_really_is_negative(dentry)) {
/*
* we're not holding the i_mutex here, so there's
* a window where this directory entry could have gone
* away.
*/
dput(dentry);
return nfserr_noent;
}
exp_get(exp);
/*
......
......@@ -200,7 +200,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
int err;
bool last_element = !post[0];
this = lookup_one_len_unlocked(name, base, namelen);
this = lookup_positive_unlocked(name, base, namelen);
if (IS_ERR(this)) {
err = PTR_ERR(this);
this = NULL;
......@@ -208,8 +208,6 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
goto out;
goto out_err;
}
if (!this->d_inode)
goto put_and_out;
if (ovl_dentry_weird(this)) {
/* Don't support traversing automounts and other weirdness */
......@@ -651,7 +649,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh)
if (err)
return ERR_PTR(err);
index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len);
index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len);
kfree(name.name);
if (IS_ERR(index)) {
if (PTR_ERR(index) == -ENOENT)
......@@ -659,9 +657,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh)
return index;
}
if (d_is_negative(index))
err = 0;
else if (ovl_is_whiteout(index))
if (ovl_is_whiteout(index))
err = -ESTALE;
else if (ovl_dentry_weird(index))
err = -EIO;
......@@ -685,7 +681,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
if (err)
return ERR_PTR(err);
index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len);
index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
if (err == -ENOENT) {
......@@ -700,9 +696,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
}
inode = d_inode(index);
if (d_is_negative(index)) {
goto out_dput;
} else if (ovl_is_whiteout(index) && !verify) {
if (ovl_is_whiteout(index) && !verify) {
/*
* When index lookup is called with !verify for decoding an
* overlay file handle, a whiteout index implies that decode
......@@ -1131,7 +1125,7 @@ bool ovl_lower_positive(struct dentry *dentry)
struct dentry *this;
struct dentry *lowerdir = poe->lowerstack[i].dentry;
this = lookup_one_len_unlocked(name->name, lowerdir,
this = lookup_positive_unlocked(name->name, lowerdir,
name->len);
if (IS_ERR(this)) {
switch (PTR_ERR(this)) {
......@@ -1148,10 +1142,8 @@ bool ovl_lower_positive(struct dentry *dentry)
break;
}
} else {
if (this->d_inode) {
positive = !ovl_is_whiteout(this);
done = true;
}
dput(this);
}
}
......
......@@ -2487,21 +2487,15 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
struct dentry *dentry;
int error;
dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name));
dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name));
if (IS_ERR(dentry))
return PTR_ERR(dentry);
if (d_really_is_negative(dentry)) {
error = -ENOENT;
goto out;
}
error = security_quota_on(dentry);
if (!error)
error = dquot_load_quota_inode(d_inode(dentry), type, format_id,
DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
out:
dput(dentry);
return error;
}
......
......@@ -440,6 +440,11 @@ static inline bool d_is_negative(const struct dentry *dentry)
return d_is_miss(dentry);
}
static inline bool d_flags_negative(unsigned flags)
{
return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE;
}
static inline bool d_is_positive(const struct dentry *dentry)
{
return !d_is_negative(dentry);
......
......@@ -60,6 +60,7 @@ extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int);
extern int follow_down_one(struct path *);
extern int follow_down(struct path *);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment