Commit 1d9d7cbf authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'ceph-for-5.2-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "On the filesystem side we have:

   - a fix to enforce quotas set above the mount point (Luis Henriques)

   - support for exporting snapshots through NFS (Zheng Yan)

   - proper statx implementation (Jeff Layton). statx flags are mapped
     to MDS caps, with AT_STATX_{DONT,FORCE}_SYNC taken into account.

   - some follow-up dentry name handling fixes, in particular
     elimination of our hand-rolled helper and the switch to __getname()
     as suggested by Al (Jeff Layton)

   - a set of MDS client cleanups in preparation for async MDS requests
     in the future (Jeff Layton)

   - a fix to sync the filesystem before remounting (Jeff Layton)

  On the rbd side, work is on-going on object-map and fast-diff image
  features"

* tag 'ceph-for-5.2-rc1' of git://github.com/ceph/ceph-client: (29 commits)
  ceph: flush dirty inodes before proceeding with remount
  ceph: fix unaligned access in ceph_send_cap_releases
  libceph: make ceph_pr_addr take an struct ceph_entity_addr pointer
  libceph: fix unaligned accesses in ceph_entity_addr handling
  rbd: don't assert on writes to snapshots
  rbd: client_mutex is never nested
  ceph: print inode number in __caps_issued_mask debugging messages
  ceph: just call get_session in __ceph_lookup_mds_session
  ceph: simplify arguments and return semantics of try_get_cap_refs
  ceph: fix comment over ceph_drop_caps_for_unlink
  ceph: move wait for mds request into helper function
  ceph: have ceph_mdsc_do_request call ceph_mdsc_submit_request
  ceph: after an MDS request, do callback and completions
  ceph: use pathlen values returned by set_request_path_attr
  ceph: use __getname/__putname in ceph_mdsc_build_path
  ceph: use ceph_mdsc_build_path instead of clone_dentry_name
  ceph: fix potential use-after-free in ceph_mdsc_build_path
  ceph: dump granular cap info in "caps" debugfs file
  ceph: make iterate_session_caps a public symbol
  ceph: fix NULL pointer deref when debugging is enabled
  ...
parents 2c45e7fb 00abf69d
......@@ -934,7 +934,7 @@ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
struct rbd_client *rbdc;
int ret;
mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
mutex_lock(&client_mutex);
rbdc = rbd_client_find(ceph_opts);
if (rbdc) {
ceph_destroy_options(ceph_opts);
......@@ -1326,7 +1326,7 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
zero_bvecs(&obj_req->bvec_pos, off, bytes);
break;
default:
rbd_assert(0);
BUG();
}
}
......@@ -1581,7 +1581,7 @@ static void rbd_obj_request_destroy(struct kref *kref)
kfree(obj_request->bvec_pos.bvecs);
break;
default:
rbd_assert(0);
BUG();
}
kfree(obj_request->img_extents);
......@@ -1781,7 +1781,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
&obj_req->bvec_pos);
break;
default:
rbd_assert(0);
BUG();
}
}
......@@ -2036,7 +2036,7 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req)
ret = rbd_obj_setup_zeroout(obj_req);
break;
default:
rbd_assert(0);
BUG();
}
if (ret < 0)
return ret;
......@@ -2383,7 +2383,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
&obj_req->bvec_pos);
break;
default:
rbd_assert(0);
BUG();
}
} else {
ret = rbd_img_fill_from_bvecs(child_img_req,
......@@ -2515,7 +2515,7 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
num_osd_ops += count_zeroout_ops(obj_req);
break;
default:
rbd_assert(0);
BUG();
}
obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
......@@ -2542,7 +2542,7 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
__rbd_obj_setup_zeroout(obj_req, which);
break;
default:
rbd_assert(0);
BUG();
}
ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
......@@ -3842,8 +3842,12 @@ static void rbd_queue_workfn(struct work_struct *work)
goto err_rq;
}
rbd_assert(op_type == OBJ_OP_READ ||
rbd_dev->spec->snap_id == CEPH_NOSNAP);
if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) {
rbd_warn(rbd_dev, "%s on read-only snapshot",
obj_op_name(op_type));
result = -EIO;
goto err;
}
/*
* Quit early if the mapped snapshot no longer exists. It's
......
......@@ -892,8 +892,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int have = ci->i_snap_caps;
if ((have & mask) == mask) {
dout("__ceph_caps_issued_mask %p snap issued %s"
" (mask %s)\n", &ci->vfs_inode,
dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
" (mask %s)\n", ci->vfs_inode.i_ino,
ceph_cap_string(have),
ceph_cap_string(mask));
return 1;
......@@ -904,8 +904,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (!__cap_is_valid(cap))
continue;
if ((cap->issued & mask) == mask) {
dout("__ceph_caps_issued_mask %p cap %p issued %s"
" (mask %s)\n", &ci->vfs_inode, cap,
dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
" (mask %s)\n", ci->vfs_inode.i_ino, cap,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch)
......@@ -916,8 +916,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/* does a combination of caps satisfy mask? */
have |= cap->issued;
if ((have & mask) == mask) {
dout("__ceph_caps_issued_mask %p combo issued %s"
" (mask %s)\n", &ci->vfs_inode,
dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
" (mask %s)\n", ci->vfs_inode.i_ino,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch) {
......@@ -2257,8 +2257,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (datasync)
goto out;
inode_lock(inode);
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
......@@ -2273,7 +2271,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = wait_event_interruptible(ci->i_cap_wq,
caps_are_flushed(inode, flush_tid));
}
inode_unlock(inode);
out:
dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
return ret;
......@@ -2528,9 +2525,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
* to (when applicable), and check against max_size here as well.
* Note that caller is responsible for ensuring max_size increases are
* requested from the MDS.
*
* Returns 0 if caps were not able to be acquired (yet), a 1 if they were,
* or a negative error code.
*
* FIXME: how does a 0 return differ from -EAGAIN?
*/
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, bool nonblock, int *got, int *err)
loff_t endoff, bool nonblock, int *got)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
......@@ -2550,8 +2552,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
if ((file_wanted & need) != need) {
dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
ceph_cap_string(need), ceph_cap_string(file_wanted));
*err = -EBADF;
ret = 1;
ret = -EBADF;
goto out_unlock;
}
......@@ -2572,10 +2573,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size) {
*err = -EAGAIN;
ret = 1;
}
if (endoff > ci->i_requested_max_size)
ret = -EAGAIN;
goto out_unlock;
}
/*
......@@ -2610,8 +2609,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
* task isn't in TASK_RUNNING state
*/
if (nonblock) {
*err = -EAGAIN;
ret = 1;
ret = -EAGAIN;
goto out_unlock;
}
......@@ -2640,8 +2638,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
if (session_readonly) {
dout("get_cap_refs %p needed %s but mds%d readonly\n",
inode, ceph_cap_string(need), ci->i_auth_cap->mds);
*err = -EROFS;
ret = 1;
ret = -EROFS;
goto out_unlock;
}
......@@ -2650,16 +2647,14 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
if (READ_ONCE(mdsc->fsc->mount_state) ==
CEPH_MOUNT_SHUTDOWN) {
dout("get_cap_refs %p forced umount\n", inode);
*err = -EIO;
ret = 1;
ret = -EIO;
goto out_unlock;
}
mds_wanted = __ceph_caps_mds_wanted(ci, false);
if (need & ~(mds_wanted & need)) {
dout("get_cap_refs %p caps were dropped"
" (session killed?)\n", inode);
*err = -ESTALE;
ret = 1;
ret = -ESTALE;
goto out_unlock;
}
if (!(file_wanted & ~mds_wanted))
......@@ -2710,7 +2705,7 @@ static void check_max_size(struct inode *inode, loff_t endoff)
int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
bool nonblock, int *got)
{
int ret, err = 0;
int ret;
BUG_ON(need & ~CEPH_CAP_FILE_RD);
BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
......@@ -2718,15 +2713,8 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
if (ret < 0)
return ret;
ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err);
if (ret) {
if (err == -EAGAIN) {
ret = 0;
} else if (err < 0) {
ret = err;
}
}
return ret;
ret = try_get_cap_refs(ci, need, want, 0, nonblock, got);
return ret == -EAGAIN ? 0 : ret;
}
/*
......@@ -2737,7 +2725,7 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page)
{
int _got, ret, err = 0;
int _got, ret;
ret = ceph_pool_perm_check(ci, need);
if (ret < 0)
......@@ -2747,21 +2735,19 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
if (endoff > 0)
check_max_size(&ci->vfs_inode, endoff);
err = 0;
_got = 0;
ret = try_get_cap_refs(ci, need, want, endoff,
false, &_got, &err);
if (ret) {
if (err == -EAGAIN)
continue;
if (err < 0)
ret = err;
} else {
false, &_got);
if (ret == -EAGAIN) {
continue;
} else if (!ret) {
int err;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
add_wait_queue(&ci->i_cap_wq, &wait);
while (!try_get_cap_refs(ci, need, want, endoff,
true, &_got, &err)) {
while (!(err = try_get_cap_refs(ci, need, want, endoff,
true, &_got))) {
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
......@@ -2770,19 +2756,14 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
}
remove_wait_queue(&ci->i_cap_wq, &wait);
if (err == -EAGAIN)
continue;
if (err < 0)
ret = err;
}
if (ret < 0) {
if (err == -ESTALE) {
/* session was killed, try renew caps */
ret = ceph_renew_caps(&ci->vfs_inode);
if (ret == 0)
continue;
}
if (ret == -ESTALE) {
/* session was killed, try renew caps */
ret = ceph_renew_caps(&ci->vfs_inode);
if (ret == 0)
continue;
return ret;
}
......@@ -4099,7 +4080,7 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
}
/*
* For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
* For a soon-to-be unlinked file, drop the LINK caps. If it
* looks like the link count will hit 0, drop any other caps (other
* than PIN) we don't specifically want (due to the file still being
* open).
......
......@@ -37,7 +37,7 @@ static int mdsmap_show(struct seq_file *s, void *p)
struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
int state = mdsmap->m_info[i].state;
seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
ceph_pr_addr(&addr->in_addr),
ceph_pr_addr(addr),
ceph_mds_state_name(state));
}
return 0;
......@@ -88,7 +88,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
kfree(path);
ceph_mdsc_free_path(path, pathlen);
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
......@@ -108,7 +108,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_old_dentry,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
kfree(path);
ceph_mdsc_free_path(path, pathlen);
} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
......@@ -124,18 +124,48 @@ static int mdsc_show(struct seq_file *s, void *p)
return 0;
}
static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
{
struct seq_file *s = p;
seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented));
return 0;
}
static int caps_show(struct seq_file *s, void *p)
{
struct ceph_fs_client *fsc = s->private;
int total, avail, used, reserved, min;
struct ceph_mds_client *mdsc = fsc->mdsc;
int total, avail, used, reserved, min, i;
ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
seq_printf(s, "total\t\t%d\n"
"avail\t\t%d\n"
"used\t\t%d\n"
"reserved\t%d\n"
"min\t%d\n",
"min\t\t%d\n\n",
total, avail, used, reserved, min);
seq_printf(s, "ino issued implemented\n");
seq_printf(s, "-----------------------------------------------\n");
mutex_lock(&mdsc->mutex);
for (i = 0; i < mdsc->max_sessions; i++) {
struct ceph_mds_session *session;
session = __ceph_lookup_mds_session(mdsc, i);
if (!session)
continue;
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
ceph_iterate_session_caps(session, caps_show_cb, s);
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
mutex_lock(&mdsc->mutex);
}
mutex_unlock(&mdsc->mutex);
return 0;
}
......
This diff is collapsed.
......@@ -929,7 +929,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
(write ? "write" : "read"), file, pos, (unsigned)count,
snapc, snapc->seq);
snapc, snapc ? snapc->seq : 0);
ret = filemap_write_and_wait_range(inode->i_mapping,
pos, pos + count - 1);
......
......@@ -2266,43 +2266,72 @@ int ceph_permission(struct inode *inode, int mask)
return err;
}
/* Craft a mask of needed caps given a set of requested statx attrs. */
static int statx_to_caps(u32 want)
{
int mask = 0;
if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME))
mask |= CEPH_CAP_AUTH_SHARED;
if (want & (STATX_NLINK|STATX_CTIME))
mask |= CEPH_CAP_LINK_SHARED;
if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|
STATX_BLOCKS))
mask |= CEPH_CAP_FILE_SHARED;
if (want & (STATX_CTIME))
mask |= CEPH_CAP_XATTR_SHARED;
return mask;
}
/*
* Get all attributes. Hopefully somedata we'll have a statlite()
* and can limit the fields we require to be accurate.
* Get all the attributes. If we have sufficient caps for the requested attrs,
* then we can avoid talking to the MDS at all.
*/
int ceph_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
struct ceph_inode_info *ci = ceph_inode(inode);
int err;
int err = 0;
err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false);
if (!err) {
generic_fillattr(inode, stat);
stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
if (ceph_snap(inode) == CEPH_NOSNAP)
stat->dev = inode->i_sb->s_dev;
/* Skip the getattr altogether if we're asked not to sync */
if (!(flags & AT_STATX_DONT_SYNC)) {
err = ceph_do_getattr(inode, statx_to_caps(request_mask),
flags & AT_STATX_FORCE_SYNC);
if (err)
return err;
}
generic_fillattr(inode, stat);
stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
if (ceph_snap(inode) == CEPH_NOSNAP)
stat->dev = inode->i_sb->s_dev;
else
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
if (S_ISDIR(inode->i_mode)) {
if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
RBYTES))
stat->size = ci->i_rbytes;
else
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
if (S_ISDIR(inode->i_mode)) {
if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
RBYTES))
stat->size = ci->i_rbytes;
else
stat->size = ci->i_files + ci->i_subdirs;
stat->blocks = 0;
stat->blksize = 65536;
/*
* Some applications rely on the number of st_nlink
* value on directories to be either 0 (if unlinked)
* or 2 + number of subdirectories.
*/
if (stat->nlink == 1)
/* '.' + '..' + subdirs */
stat->nlink = 1 + 1 + ci->i_subdirs;
}
stat->size = ci->i_files + ci->i_subdirs;
stat->blocks = 0;
stat->blksize = 65536;
/*
* Some applications rely on the number of st_nlink
* value on directories to be either 0 (if unlinked)
* or 2 + number of subdirectories.
*/
if (stat->nlink == 1)
/* '.' + '..' + subdirs */
stat->nlink = 1 + 1 + ci->i_subdirs;
}
/* Mask off any higher bits (e.g. btime) until we have support */
stat->result_mask = request_mask & STATX_BASIC_STATS;
return err;
}
......@@ -237,15 +237,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
err = -EIO;
} else if (op == CEPH_MDS_OP_SETFILELOCK) {
/*
* increasing i_filelock_ref closes race window between
* handling request reply and adding file_lock struct to
* inode. Otherwise, i_auth_cap may get trimmed in the
* window. Caller function will decrease the counter.
*/
fl->fl_ops = &ceph_fl_lock_ops;
atomic_inc(&ci->i_filelock_ref);
}
spin_unlock(&ci->i_ceph_lock);
if (err < 0) {
......@@ -299,10 +290,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
err = -EIO;
} else {
/* see comment in ceph_lock */
fl->fl_ops = &ceph_fl_lock_ops;
atomic_inc(&ci->i_filelock_ref);
}
spin_unlock(&ci->i_ceph_lock);
if (err < 0) {
......
This diff is collapsed.
......@@ -325,6 +325,18 @@ struct ceph_snapid_map {
unsigned long last_used;
};
/*
* node for list of quotarealm inodes that are not visible from the filesystem
* mountpoint, but required to handle, e.g. quotas.
*/
struct ceph_quotarealm_inode {
struct rb_node node;
u64 ino;
unsigned long timeout; /* last time a lookup failed for this inode */
struct mutex mutex;
struct inode *inode;
};
/*
* mds client state
*/
......@@ -344,6 +356,12 @@ struct ceph_mds_client {
int stopping; /* true if shutting down */
atomic64_t quotarealms_count; /* # realms with quota */
/*
* We keep a list of inodes we don't see in the mountpoint but that we
* need to track quota realms.
*/
struct rb_root quotarealms_inodes;
struct mutex quotarealms_inodes_mutex;
/*
* snap_rwsem will cover cap linkage into snaprealms, and
......@@ -447,8 +465,9 @@ extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct inode *dir);
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req);
extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
......@@ -468,8 +487,18 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
int (*cb)(struct inode *,
struct ceph_cap *, void *),
void *arg);
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
static inline void ceph_mdsc_free_path(char *path, int len)
{
if (path)
__putname(path - (PATH_MAX - 1 - len));
}
extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
int stop_on_nosnap);
......
......@@ -205,7 +205,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr.in_addr),
ceph_pr_addr(&addr),
ceph_mds_state_name(state));
if (mds < 0 || state <= 0)
......
......@@ -22,7 +22,16 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
static inline bool ceph_has_realms_with_quotas(struct inode *inode)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
return atomic64_read(&mdsc->quotarealms_count) > 0;
struct super_block *sb = mdsc->fsc->sb;
if (atomic64_read(&mdsc->quotarealms_count) > 0)
return true;
/* if root is the real CephFS root, we don't have quota realms */
if (sb->s_root->d_inode &&
(sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
return false;
/* otherwise, we can't know for sure */
return true;
}
void ceph_handle_quota(struct ceph_mds_client *mdsc,
......@@ -68,6 +77,108 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
iput(inode);
}
static struct ceph_quotarealm_inode *
find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
{
struct ceph_quotarealm_inode *qri = NULL;
struct rb_node **node, *parent = NULL;
mutex_lock(&mdsc->quotarealms_inodes_mutex);
node = &(mdsc->quotarealms_inodes.rb_node);
while (*node) {
parent = *node;
qri = container_of(*node, struct ceph_quotarealm_inode, node);
if (ino < qri->ino)
node = &((*node)->rb_left);
else if (ino > qri->ino)
node = &((*node)->rb_right);
else
break;
}
if (!qri || (qri->ino != ino)) {
/* Not found, create a new one and insert it */
qri = kmalloc(sizeof(*qri), GFP_KERNEL);
if (qri) {
qri->ino = ino;
qri->inode = NULL;
qri->timeout = 0;
mutex_init(&qri->mutex);
rb_link_node(&qri->node, parent, node);
rb_insert_color(&qri->node, &mdsc->quotarealms_inodes);
} else
pr_warn("Failed to alloc quotarealms_inode\n");
}
mutex_unlock(&mdsc->quotarealms_inodes_mutex);
return qri;
}
/*
* This function will try to lookup a realm inode which isn't visible in the
* filesystem mountpoint. A list of these kind of inodes (not visible) is
* maintained in the mdsc and freed only when the filesystem is umounted.
*
* Note that these inodes are kept in this list even if the lookup fails, which
* allows to prevent useless lookup requests.
*/
static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
struct super_block *sb,
struct ceph_snap_realm *realm)
{
struct ceph_quotarealm_inode *qri;
struct inode *in;
qri = find_quotarealm_inode(mdsc, realm->ino);
if (!qri)
return NULL;
mutex_lock(&qri->mutex);
if (qri->inode) {
/* A request has already returned the inode */
mutex_unlock(&qri->mutex);
return qri->inode;
}
/* Check if this inode lookup has failed recently */
if (qri->timeout &&
time_before_eq(jiffies, qri->timeout)) {
mutex_unlock(&qri->mutex);
return NULL;
}
in = ceph_lookup_inode(sb, realm->ino);
if (IS_ERR(in)) {
pr_warn("Can't lookup inode %llx (err: %ld)\n",
realm->ino, PTR_ERR(in));
qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
} else {
qri->timeout = 0;
qri->inode = in;
}
mutex_unlock(&qri->mutex);
return in;
}
void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
{
struct ceph_quotarealm_inode *qri;
struct rb_node *node;
/*
* It should now be safe to clean quotarealms_inode tree without holding
* mdsc->quotarealms_inodes_mutex...
*/
mutex_lock(&mdsc->quotarealms_inodes_mutex);
while (!RB_EMPTY_ROOT(&mdsc->quotarealms_inodes)) {
node = rb_first(&mdsc->quotarealms_inodes);
qri = rb_entry(node, struct ceph_quotarealm_inode, node);
rb_erase(node, &mdsc->quotarealms_inodes);
iput(qri->inode);
kfree(qri);
}
mutex_unlock(&mdsc->quotarealms_inodes_mutex);
}
/*
* This function walks through the snaprealm for an inode and returns the
* ceph_snap_realm for the first snaprealm that has quotas set (either max_files
......@@ -76,9 +187,15 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
*
* Note that the caller is responsible for calling ceph_put_snap_realm() on the
* returned realm.
*
* Callers of this function need to hold mdsc->snap_rwsem. However, if there's
* a need to do an inode lookup, this rwsem will be temporarily dropped. Hence
* the 'retry' argument: if rwsem needs to be dropped and 'retry' is 'false'
* this function will return -EAGAIN; otherwise, the snaprealms walk-through
* will be restarted.
*/
static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
struct inode *inode)
struct inode *inode, bool retry)
{
struct ceph_inode_info *ci = NULL;
struct ceph_snap_realm *realm, *next;
......@@ -88,6 +205,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
if (ceph_snap(inode) != CEPH_NOSNAP)
return NULL;
restart:
realm = ceph_inode(inode)->i_snap_realm;
if (realm)
ceph_get_snap_realm(mdsc, realm);
......@@ -95,11 +213,25 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
"null i_snap_realm\n", ceph_vinop(inode));
while (realm) {
bool has_inode;
spin_lock(&realm->inodes_with_caps_lock);
in = realm->inode ? igrab(realm->inode) : NULL;
has_inode = realm->inode;
in = has_inode ? igrab(realm->inode) : NULL;
spin_unlock(&realm->inodes_with_caps_lock);
if (!in)
if (has_inode && !in)
break;
if (!in) {
up_read(&mdsc->snap_rwsem);
in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
down_read(&mdsc->snap_rwsem);
if (IS_ERR_OR_NULL(in))
break;
ceph_put_snap_realm(mdsc, realm);
if (!retry)
return ERR_PTR(-EAGAIN);
goto restart;
}
ci = ceph_inode(in);
has_quota = __ceph_has_any_quota(ci);
......@@ -125,9 +257,22 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
struct ceph_snap_realm *old_realm, *new_realm;
bool is_same;
restart:
/*
* We need to lookup 2 quota realms atomically, i.e. with snap_rwsem.
* However, get_quota_realm may drop it temporarily. By setting the
* 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was
* dropped and we can then restart the whole operation.
*/
down_read(&mdsc->snap_rwsem);
old_realm = get_quota_realm(mdsc, old);
new_realm = get_quota_realm(mdsc, new);
old_realm = get_quota_realm(mdsc, old, true);
new_realm = get_quota_realm(mdsc, new, false);
if (PTR_ERR(new_realm) == -EAGAIN) {
up_read(&mdsc->snap_rwsem);
if (old_realm)
ceph_put_snap_realm(mdsc, old_realm);
goto restart;
}
is_same = (old_realm == new_realm);
up_read(&mdsc->snap_rwsem);
......@@ -166,6 +311,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
return false;
down_read(&mdsc->snap_rwsem);
restart:
realm = ceph_inode(inode)->i_snap_realm;
if (realm)
ceph_get_snap_realm(mdsc, realm);
......@@ -173,12 +319,23 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
"null i_snap_realm\n", ceph_vinop(inode));
while (realm) {
bool has_inode;
spin_lock(&realm->inodes_with_caps_lock);
in = realm->inode ? igrab(realm->inode) : NULL;
has_inode = realm->inode;
in = has_inode ? igrab(realm->inode) : NULL;
spin_unlock(&realm->inodes_with_caps_lock);
if (!in)
if (has_inode && !in)
break;
if (!in) {
up_read(&mdsc->snap_rwsem);
in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
down_read(&mdsc->snap_rwsem);
if (IS_ERR_OR_NULL(in))
break;
ceph_put_snap_realm(mdsc, realm);
goto restart;
}
ci = ceph_inode(in);
spin_lock(&ci->i_ceph_lock);
if (op == QUOTA_CHECK_MAX_FILES_OP) {
......@@ -314,7 +471,7 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
bool is_updated = false;
down_read(&mdsc->snap_rwsem);
realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root));
realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
up_read(&mdsc->snap_rwsem);
if (!realm)
return false;
......
......@@ -845,6 +845,12 @@ static void ceph_umount_begin(struct super_block *sb)
return;
}
static int ceph_remount(struct super_block *sb, int *flags, char *data)
{
sync_filesystem(sb);
return 0;
}
static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
.destroy_inode = ceph_destroy_inode,
......@@ -853,6 +859,7 @@ static const struct super_operations ceph_super_ops = {
.drop_inode = ceph_drop_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.remount_fs = ceph_remount,
.show_options = ceph_show_options,
.statfs = ceph_statfs,
.umount_begin = ceph_umount_begin,
......
......@@ -1083,6 +1083,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* export.c */
extern const struct export_operations ceph_export_ops;
struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino);
/* locks.c */
extern __init void ceph_flock_init(void);
......@@ -1133,5 +1134,6 @@ extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode,
loff_t newlen);
extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
struct kstatfs *buf);
extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc);
#endif /* _FS_CEPH_SUPER_H */
......@@ -436,6 +436,12 @@ union ceph_mds_request_args {
__le64 length; /* num bytes to lock from start */
__u8 wait; /* will caller wait for lock to become available? */
} __attribute__ ((packed)) filelock_change;
struct {
__le32 mask; /* CEPH_CAP_* */
__le64 snapid;
__le64 parent;
__le32 hash;
} __attribute__ ((packed)) lookupino;
} __attribute__ ((packed));
#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
......
......@@ -323,7 +323,8 @@ struct ceph_connection {
};
extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr);
extern int ceph_parse_ips(const char *c, const char *end,
struct ceph_entity_addr *addr,
int max_count, int *count);
......
......@@ -110,17 +110,16 @@ struct ceph_object_id {
int name_len;
};
#define __CEPH_OID_INITIALIZER(oid) { .name = (oid).inline_name }
#define CEPH_DEFINE_OID_ONSTACK(oid) \
struct ceph_object_id oid = __CEPH_OID_INITIALIZER(oid)
static inline void ceph_oid_init(struct ceph_object_id *oid)
{
oid->name = oid->inline_name;
oid->name_len = 0;
*oid = (struct ceph_object_id) __CEPH_OID_INITIALIZER(*oid);
}
#define CEPH_OID_INIT_ONSTACK(oid) \
({ ceph_oid_init(&oid); oid; })
#define CEPH_DEFINE_OID_ONSTACK(oid) \
struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid)
static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
{
return oid->name == oid->inline_name && !oid->name_len;
......
......@@ -271,7 +271,7 @@ static int decode_locker(void **p, void *end, struct ceph_locker *locker)
dout("%s %s%llu cookie %s addr %s\n", __func__,
ENTITY_NAME(locker->id.name), locker->id.cookie,
ceph_pr_addr(&locker->info.addr.in_addr));
ceph_pr_addr(&locker->info.addr));
return 0;
}
......
......@@ -46,7 +46,7 @@ static int monmap_show(struct seq_file *s, void *p)
seq_printf(s, "\t%s%lld\t%s\n",
ENTITY_NAME(inst->name),
ceph_pr_addr(&inst->addr.in_addr));
ceph_pr_addr(&inst->addr));
}
return 0;
}
......@@ -82,7 +82,7 @@ static int osdmap_show(struct seq_file *s, void *p)
char sb[64];
seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
i, ceph_pr_addr(&addr->in_addr),
i, ceph_pr_addr(addr),
((map->osd_weight[i]*100) >> 16),
ceph_osdmap_state_str(sb, sizeof(sb), state),
((ceph_get_primary_affinity(map, i)*100) >> 16));
......
This diff is collapsed.
......@@ -76,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
m->num_mon);
for (i = 0; i < m->num_mon; i++)
dout("monmap_decode mon%d is %s\n", i,
ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
ceph_pr_addr(&m->mon_inst[i].addr));
return m;
bad:
......@@ -203,7 +203,7 @@ static void reopen_session(struct ceph_mon_client *monc)
{
if (!monc->hunting)
pr_info("mon%d %s session lost, hunting for new mon\n",
monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr));
monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr));
__close_session(monc);
__open_session(monc);
......@@ -1178,7 +1178,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
__resend_generic_request(monc);
pr_info("mon%d %s session established\n", monc->cur_mon,
ceph_pr_addr(&monc->con.peer_addr.in_addr));
ceph_pr_addr(&monc->con.peer_addr));
}
out:
......
......@@ -4926,7 +4926,7 @@ static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
dout("%s %s%llu cookie %llu addr %s\n", __func__,
ENTITY_NAME(item->name), item->cookie,
ceph_pr_addr(&item->addr.in_addr));
ceph_pr_addr(&item->addr));
return 0;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment