Commit db72438c authored by Yishai Hadas, committed by Jason Gunthorpe

RDMA/mlx5: Cleanup the synchronize_srcu() from the ODP flow

Clean up the synchronize_srcu() call in the ODP flow, as it was found to be a
very heavy time consumer during dereg_mr.

For example, de-registration of 10000 ODP MRs, each backed by a 2M hugepage,
took 19.6 sec, compared to 172 ms for de-registration of the same number of
non-ODP MRs.

The new locking scheme uses a wait_event() mechanism that follows the use
count of the MR, instead of synchronize_srcu().

With that change, the above test took 95 ms, which is even better than the
non-ODP flow.

Once the SRCU usage was fully dropped, a lock was needed to protect the
XArray access.

As part of using the above mechanism, the num_deferred_work machinery could
also be removed in favor of following the use count.
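
For orientation before reading the diff, here is a condensed sketch of the
pattern. It mirrors the mlx5r_* helpers added to mlx5_ib.h below; the
page-fault fragment in the middle is illustrative rather than a verbatim
quote of the driver:

	/* Registration: publish the mkey with an initial use count of 1. */
	refcount_set(&mmkey->usecount, 1);
	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mmkey->key),
			      mmkey, GFP_KERNEL));

	/* Page-fault path: take a reference while working on the mkey,
	 * then drop it and wake any waiter once done.
	 */
	refcount_inc(&mmkey->usecount);
	/* ... handle the fault ... */
	if (refcount_dec_and_test(&mmkey->usecount))
		wake_up(&mmkey->wait);

	/* Deregistration: unpublish the mkey, drop the initial reference,
	 * then wait for all remaining users instead of synchronize_srcu().
	 */
	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmkey->key));
	if (refcount_dec_and_test(&mmkey->usecount))
		wake_up(&mmkey->wait);
	wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0);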

Link: https://lore.kernel.org/r/20210202071309.2057998-1-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
parent a5887d62
@@ -1310,9 +1310,9 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj,
 	mkey->size = MLX5_GET64(mkc, mkc, len);
 	mkey->pd = MLX5_GET(mkc, mkc, pd);
 	devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
+	init_waitqueue_head(&mkey->wait);
 
-	return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mkey->key), mkey,
-		       GFP_KERNEL));
+	return mlx5r_store_odp_mkey(dev, mkey);
 }
 
 static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
@@ -1385,16 +1385,15 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
 	int ret;
 
 	dev = mlx5_udata_to_mdev(&attrs->driver_udata);
-	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY &&
+	    xa_erase(&obj->ib_dev->odp_mkeys,
+		     mlx5_base_mkey(obj->devx_mr.mmkey.key)))
 		/*
 		 * The pagefault_single_data_segment() does commands against
 		 * the mmkey, we must wait for that to stop before freeing the
 		 * mkey, as another allocation could get the same mkey #.
 		 */
-		xa_erase(&obj->ib_dev->odp_mkeys,
-			 mlx5_base_mkey(obj->devx_mr.mmkey.key));
-		synchronize_srcu(&dev->odp_srcu);
-	}
+		mlx5r_deref_wait_odp_mkey(&obj->devx_mr.mmkey);
 
 	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
 		ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct);
...
@@ -3869,7 +3869,6 @@ static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_cleanup_multiport_master(dev);
 	WARN_ON(!xa_empty(&dev->odp_mkeys));
-	cleanup_srcu_struct(&dev->odp_srcu);
 	mutex_destroy(&dev->cap_mask_mutex);
 	WARN_ON(!xa_empty(&dev->sig_mrs));
 	WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
@@ -3914,10 +3913,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
 	dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_count(mdev);
 
-	err = init_srcu_struct(&dev->odp_srcu);
-	if (err)
-		goto err_mp;
-
 	mutex_init(&dev->cap_mask_mutex);
 	INIT_LIST_HEAD(&dev->qp_list);
 	spin_lock_init(&dev->reset_flow_resource_lock);
...
@@ -684,11 +684,8 @@ struct mlx5_ib_mr {
 	u64 pi_iova;
 
 	/* For ODP and implicit */
-	atomic_t num_deferred_work;
-	wait_queue_head_t q_deferred_work;
 	struct xarray implicit_children;
 	union {
-		struct rcu_head rcu;
 		struct list_head elm;
 		struct work_struct work;
 	} odp_destroy;
@@ -1068,11 +1065,6 @@ struct mlx5_ib_dev {
 	u64 odp_max_size;
 	struct mlx5_ib_pf_eq odp_pf_eq;
 
-	/*
-	 * Sleepable RCU that prevents destruction of MRs while they are still
-	 * being used by a page fault handler.
-	 */
-	struct srcu_struct odp_srcu;
 	struct xarray odp_mkeys;
 
 	u32 null_mkey;
@@ -1599,6 +1591,29 @@ static inline bool mlx5_ib_can_reconfig_with_umr(struct mlx5_ib_dev *dev,
 	return true;
 }
 
+static inline int mlx5r_store_odp_mkey(struct mlx5_ib_dev *dev,
+				       struct mlx5_core_mkey *mmkey)
+{
+	refcount_set(&mmkey->usecount, 1);
+
+	return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mmkey->key),
+			       mmkey, GFP_KERNEL));
+}
+
+/* deref an mkey that can participate in ODP flow */
+static inline void mlx5r_deref_odp_mkey(struct mlx5_core_mkey *mmkey)
+{
+	if (refcount_dec_and_test(&mmkey->usecount))
+		wake_up(&mmkey->wait);
+}
+
+/* deref an mkey that can participate in ODP flow and wait for relese */
+static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_core_mkey *mmkey)
+{
+	mlx5r_deref_odp_mkey(mmkey);
+	wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0);
+}
+
 int mlx5_ib_test_wc(struct mlx5_ib_dev *dev);
 
 static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev)
...
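The lookup side of this scheme lives in the ODP fault path (its diff is
collapsed further down). A minimal, hypothetical sketch of how such a lookup
could pair with the helpers above, assuming the XArray's lock now stands in
for the removed SRCU read side; the function name and shape are illustrative,
not the driver's verbatim code:

	/* Hypothetical: take a reference on an mkey found in the ODP XArray.
	 * The xa_lock replaces the old SRCU read-side protection; the entry
	 * holds an initial reference (set by mlx5r_store_odp_mkey()) for as
	 * long as it is stored, so refcount_inc() is safe here.
	 */
	static struct mlx5_core_mkey *odp_mkey_get(struct mlx5_ib_dev *dev,
						   u32 key)
	{
		struct mlx5_core_mkey *mmkey;

		xa_lock(&dev->odp_mkeys);
		mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
		if (mmkey)
			refcount_inc(&mmkey->usecount);
		xa_unlock(&dev->odp_mkeys);

		return mmkey;	/* caller drops it with mlx5r_deref_odp_mkey() */
	}

The key point is that a reference can only be taken while the mkey is still
present in the XArray, so once deregistration erases the entry and waits for
the count to reach zero, no new users can appear.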
@@ -158,6 +158,7 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	mr->mmkey.type = MLX5_MKEY_MR;
 	mr->mmkey.key |= mlx5_idx_to_mkey(
 		MLX5_GET(create_mkey_out, mr->out, mkey_index));
+	init_waitqueue_head(&mr->mmkey.wait);
 
 	WRITE_ONCE(dev->cache.last_add, jiffies);
@@ -1551,10 +1552,7 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
 	}
 
 	odp->private = mr;
-	init_waitqueue_head(&mr->q_deferred_work);
-	atomic_set(&mr->num_deferred_work, 0);
-	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
-			      &mr->mmkey, GFP_KERNEL));
+	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
 	if (err)
 		goto err_dereg_mr;
@@ -1651,10 +1649,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
 	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
 
 	umem_dmabuf->private = mr;
-	init_waitqueue_head(&mr->q_deferred_work);
-	atomic_set(&mr->num_deferred_work, 0);
-	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
-			      &mr->mmkey, GFP_KERNEL));
+	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
 	if (err)
 		goto err_dereg_mr;
@@ -2330,9 +2325,7 @@ int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
 	}
 
 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		err = xa_err(xa_store(&dev->odp_mkeys,
-				      mlx5_base_mkey(mw->mmkey.key), &mw->mmkey,
-				      GFP_KERNEL));
+		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
 		if (err)
 			goto free_mkey;
 	}
@@ -2352,14 +2345,13 @@ int mlx5_ib_dealloc_mw(struct ib_mw *mw)
 	struct mlx5_ib_dev *dev = to_mdev(mw->device);
 	struct mlx5_ib_mw *mmw = to_mmw(mw);
 
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key));
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
+	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
 		/*
-		 * pagefault_single_data_segment() may be accessing mmw under
-		 * SRCU if the user bound an ODP MR to this MW.
+		 * pagefault_single_data_segment() may be accessing mmw
+		 * if the user bound an ODP MR to this MW.
 		 */
-		synchronize_srcu(&dev->odp_srcu);
-	}
+		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
 
 	return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
 }
...
This diff is collapsed.
@@ -56,6 +56,7 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
 	mkey->size = MLX5_GET64(mkc, mkc, len);
 	mkey->key |= mlx5_idx_to_mkey(mkey_index);
 	mkey->pd = MLX5_GET(mkc, mkc, pd);
+	init_waitqueue_head(&mkey->wait);
 
 	mlx5_core_dbg(dev, "out 0x%x, mkey 0x%x\n", mkey_index, mkey->key);
 	return 0;
...
@@ -366,6 +366,8 @@ struct mlx5_core_mkey {
 	u32 key;
 	u32 pd;
 	u32 type;
+	struct wait_queue_head wait;
+	refcount_t usecount;
 };
 
 #define MLX5_24BIT_MASK ((1 << 24) - 1)
...