Commit a6bc3875 authored by Moni Shoua's avatar Moni Shoua Committed by Jason Gunthorpe

IB/mlx5: Protect against prefetch of invalid MR

When deferring a prefetch request we need to protect against MR or PD
being destroyed while the request is still enqueued.

The first step is to validate that PD owns the lkey that describes the MR
and that the MR that the lkey refers to is owned by that PD.

The second step is to dequeue all requests when MR is destroyed.

Since PD can't be destroyed while it owns MRs it is guaranteed that when a
worker wakes up the request it refers to is still valid.

Now, it is possible to refrain from taking a reference on the device since
it is assured to be present as pd.

While that, replace the dedicated ordered workqueue with the system
unbound workqueue to reuse an existing resource and improve
performance. This will also fix a bug of queueing to the wrong workqueue.

Fixes: 813e90b1 ("IB/mlx5: Add advise_mr() support")
Reported-by: default avatarParav Pandit <parav@mellanox.com>
Signed-off-by: default avatarMoni Shoua <monis@mellanox.com>
Signed-off-by: default avatarLeon Romanovsky <leonro@mellanox.com>
Signed-off-by: default avatarJason Gunthorpe <jgg@mellanox.com>
parent 25fd08eb
...@@ -5829,8 +5829,6 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) ...@@ -5829,8 +5829,6 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
srcu_barrier(&dev->mr_srcu); srcu_barrier(&dev->mr_srcu);
cleanup_srcu_struct(&dev->mr_srcu); cleanup_srcu_struct(&dev->mr_srcu);
drain_workqueue(dev->advise_mr_wq);
destroy_workqueue(dev->advise_mr_wq);
} }
kfree(dev->port); kfree(dev->port);
} }
...@@ -5885,18 +5883,9 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) ...@@ -5885,18 +5883,9 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
dev->memic.dev = mdev; dev->memic.dev = mdev;
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
dev->advise_mr_wq =
alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0);
if (!dev->advise_mr_wq) {
err = -ENOMEM;
goto err_mp;
}
err = init_srcu_struct(&dev->mr_srcu); err = init_srcu_struct(&dev->mr_srcu);
if (err) { if (err)
destroy_workqueue(dev->advise_mr_wq);
goto err_mp; goto err_mp;
}
} }
return 0; return 0;
......
...@@ -589,6 +589,7 @@ struct mlx5_ib_mr { ...@@ -589,6 +589,7 @@ struct mlx5_ib_mr {
atomic_t num_leaf_free; atomic_t num_leaf_free;
wait_queue_head_t q_leaf_free; wait_queue_head_t q_leaf_free;
struct mlx5_async_work cb_work; struct mlx5_async_work cb_work;
atomic_t num_pending_prefetch;
}; };
static inline bool is_odp_mr(struct mlx5_ib_mr *mr) static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
...@@ -929,7 +930,6 @@ struct mlx5_ib_dev { ...@@ -929,7 +930,6 @@ struct mlx5_ib_dev {
*/ */
struct srcu_struct mr_srcu; struct srcu_struct mr_srcu;
u32 null_mkey; u32 null_mkey;
struct workqueue_struct *advise_mr_wq;
struct mlx5_ib_flow_db *flow_db; struct mlx5_ib_flow_db *flow_db;
/* protect resources needed as part of reset flow */ /* protect resources needed as part of reset flow */
spinlock_t reset_flow_resource_lock; spinlock_t reset_flow_resource_lock;
......
...@@ -1339,8 +1339,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, ...@@ -1339,8 +1339,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
} }
} }
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
mr->live = 1; mr->live = 1;
atomic_set(&mr->num_pending_prefetch, 0);
}
return &mr->ibmr; return &mr->ibmr;
error: error:
...@@ -1576,8 +1578,16 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) ...@@ -1576,8 +1578,16 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
if (is_odp_mr(mr)) { if (is_odp_mr(mr)) {
struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem); struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
/* Prevent new page faults from succeeding */ /* Prevent new page faults and
* prefetch requests from succeeding
*/
mr->live = 0; mr->live = 0;
/* dequeue pending prefetch requests for the mr */
if (atomic_read(&mr->num_pending_prefetch))
flush_workqueue(system_unbound_wq);
WARN_ON(atomic_read(&mr->num_pending_prefetch));
/* Wait for all running page-fault handlers to finish. */ /* Wait for all running page-fault handlers to finish. */
synchronize_srcu(&dev->mr_srcu); synchronize_srcu(&dev->mr_srcu);
/* Destroy all page mappings */ /* Destroy all page mappings */
......
...@@ -535,6 +535,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, ...@@ -535,6 +535,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
imr->umem = umem; imr->umem = umem;
init_waitqueue_head(&imr->q_leaf_free); init_waitqueue_head(&imr->q_leaf_free);
atomic_set(&imr->num_leaf_free, 0); atomic_set(&imr->num_leaf_free, 0);
atomic_set(&imr->num_pending_prefetch, 0);
return imr; return imr;
} }
...@@ -1660,24 +1661,91 @@ struct prefetch_mr_work { ...@@ -1660,24 +1661,91 @@ struct prefetch_mr_work {
struct ib_sge sg_list[0]; struct ib_sge sg_list[0];
}; };
static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev,
struct ib_sge *sg_list, u32 num_sge,
u32 from)
{
u32 i;
int srcu_key;
srcu_key = srcu_read_lock(&dev->mr_srcu);
for (i = from; i < num_sge; ++i) {
struct mlx5_core_mkey *mmkey;
struct mlx5_ib_mr *mr;
mmkey = __mlx5_mr_lookup(dev->mdev,
mlx5_base_mkey(sg_list[i].lkey));
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
atomic_dec(&mr->num_pending_prefetch);
}
srcu_read_unlock(&dev->mr_srcu, srcu_key);
}
static bool num_pending_prefetch_inc(struct ib_pd *pd,
struct ib_sge *sg_list, u32 num_sge)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
bool ret = true;
u32 i;
for (i = 0; i < num_sge; ++i) {
struct mlx5_core_mkey *mmkey;
struct mlx5_ib_mr *mr;
mmkey = __mlx5_mr_lookup(dev->mdev,
mlx5_base_mkey(sg_list[i].lkey));
if (!mmkey || mmkey->key != sg_list[i].lkey) {
ret = false;
break;
}
if (mmkey->type != MLX5_MKEY_MR) {
ret = false;
break;
}
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
if (mr->ibmr.pd != pd) {
ret = false;
break;
}
if (!mr->live) {
ret = false;
break;
}
atomic_inc(&mr->num_pending_prefetch);
}
if (!ret)
num_pending_prefetch_dec(dev, sg_list, i, 0);
return ret;
}
static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags, static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags,
struct ib_sge *sg_list, u32 num_sge) struct ib_sge *sg_list, u32 num_sge)
{ {
int i; u32 i;
int ret = 0;
for (i = 0; i < num_sge; ++i) { for (i = 0; i < num_sge; ++i) {
struct ib_sge *sg = &sg_list[i]; struct ib_sge *sg = &sg_list[i];
int bytes_committed = 0; int bytes_committed = 0;
int ret;
ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr, ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr,
sg->length, sg->length,
&bytes_committed, NULL, &bytes_committed, NULL,
pf_flags); pf_flags);
if (ret < 0) if (ret < 0)
return ret; break;
} }
return 0;
return ret < 0 ? ret : 0;
} }
static void mlx5_ib_prefetch_mr_work(struct work_struct *work) static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
...@@ -1690,7 +1758,8 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *work) ...@@ -1690,7 +1758,8 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
w->num_sge); w->num_sge);
ib_device_put(&w->dev->ib_dev); ib_device_put(&w->dev->ib_dev);
} }
put_device(&w->dev->ib_dev.dev);
num_pending_prefetch_dec(w->dev, w->sg_list, w->num_sge, 0);
kfree(w); kfree(w);
} }
...@@ -1701,6 +1770,8 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, ...@@ -1701,6 +1770,8 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_dev *dev = to_mdev(pd->device);
u32 pf_flags = MLX5_PF_FLAGS_PREFETCH; u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
struct prefetch_mr_work *work; struct prefetch_mr_work *work;
bool valid_req;
int srcu_key;
if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
pf_flags |= MLX5_PF_FLAGS_DOWNGRADE; pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
...@@ -1715,12 +1786,21 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, ...@@ -1715,12 +1786,21 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge)); memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
get_device(&dev->ib_dev.dev);
work->dev = dev; work->dev = dev;
work->pf_flags = pf_flags; work->pf_flags = pf_flags;
work->num_sge = num_sge; work->num_sge = num_sge;
INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
schedule_work(&work->work);
return 0; srcu_key = srcu_read_lock(&dev->mr_srcu);
valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge);
if (valid_req)
queue_work(system_unbound_wq, &work->work);
else
kfree(work);
srcu_read_unlock(&dev->mr_srcu, srcu_key);
return valid_req ? 0 : -EINVAL;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment