Commit 8276ea13 authored by Aya Levin, committed by Saeed Mahameed

net/mlx5e: Report and recover from CQE with error on RQ

Add support for reporting and recovering from an error CQE on the RQ by
setting the queue back to ready state. Handle only errors whose syndrome
indicates the RQ might have entered error state and can be recovered.
(A minimal stand-alone sketch of this recovery flow follows the commit
metadata below.)
Signed-off-by: Aya Levin <ayal@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
parent 0a35ab3e
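
At its core, the recovery drives the RQ through the firmware state machine: on a
recoverable error CQE the queue is expected to be in ERR, and is moved
ERR -> RST -> RDY after its posted descriptors are drained. Below is a minimal
stand-alone C sketch of that sequence; the state names mirror MLX5_RQC_STATE_*,
and modify_rq_state() is an illustrative stub standing in for the MODIFY_RQ
firmware command, not the driver's real API.

#include <stdio.h>

/* Illustrative RQ states, mirroring MLX5_RQC_STATE_{RST,RDY,ERR}. */
enum rq_state { RQ_STATE_RST, RQ_STATE_RDY, RQ_STATE_ERR };

static enum rq_state hw_state = RQ_STATE_ERR; /* an error CQE was seen */

/* Stub for the MODIFY_RQ command: reject transitions from the wrong state. */
static int modify_rq_state(enum rq_state from, enum rq_state to)
{
	if (hw_state != from)
		return -1;
	hw_state = to;
	return 0;
}

/* Same two-step sequence as mlx5e_rq_to_ready() in the patch. */
static int rq_to_ready(enum rq_state curr_state)
{
	if (modify_rq_state(curr_state, RQ_STATE_RST))
		return -1;
	return modify_rq_state(RQ_STATE_RST, RQ_STATE_RDY);
}

int main(void)
{
	if (hw_state != RQ_STATE_ERR)
		return 0; /* nothing to recover, as in the recover callback */

	/* the driver deactivates the RQ and frees posted RX descriptors here */
	if (rq_to_ready(RQ_STATE_ERR))
		return 1;

	printf("RQ back to ready\n");
	return 0;
}

The two-step transition is why the patch adds mlx5e_rq_to_ready() rather than a
single modify call: the queue is brought back to ready via reset.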
@@ -300,6 +300,7 @@ struct mlx5e_dcbx_dp {
 enum {
 	MLX5E_RQ_STATE_ENABLED,
+	MLX5E_RQ_STATE_RECOVERING,
 	MLX5E_RQ_STATE_AM,
 	MLX5E_RQ_STATE_NO_CSUM_COMPLETE,
 	MLX5E_RQ_STATE_CSUM_FULL, /* cqe_csum_full hw bit is set */
@@ -672,6 +673,8 @@ struct mlx5e_rq {
 	struct zero_copy_allocator zca;
 	struct xdp_umem *umem;
+
+	struct work_struct recover_work;
 
 	/* control */
 	struct mlx5_wq_ctrl wq_ctrl;
 	__be32 mkey_be;
@@ -8,6 +8,14 @@
 #define MLX5E_RX_ERR_CQE(cqe) (get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)
 
+static inline bool cqe_syndrome_needs_recover(u8 syndrome)
+{
+	return syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR ||
+	       syndrome == MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR ||
+	       syndrome == MLX5_CQE_SYNDROME_LOCAL_PROT_ERR ||
+	       syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+}
+
 int mlx5e_reporter_tx_create(struct mlx5e_priv *priv);
 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv);
 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq);
@@ -21,6 +29,7 @@ int mlx5e_reporter_named_obj_nest_end(struct devlink_fmsg *fmsg);
 int mlx5e_reporter_rx_create(struct mlx5e_priv *priv);
 void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv);
 void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq);
+void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq);
 void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq);
 
 #define MLX5E_REPORTER_PER_Q_MAX_LEN 256
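
The prototype added above is the RQ counterpart of the existing icosq and tx
reporters: the driver formats an error string plus an err_ctx whose recover
callback the devlink health core later invokes through the reporter's .recover
op. Below is a rough stand-alone model of that dispatch, assuming the mlx5e
health plumbing of this driver era; all names here are illustrative stand-ins,
not the real devlink API.

#include <stdio.h>

struct err_ctx {
	int (*recover)(void *ctx);
	void *ctx;
};

static int rq_cqe_recover(void *ctx)
{
	printf("recovering RQ %s\n", (const char *)ctx);
	return 0;
}

/* Stand-in for the devlink health core calling back into the driver. */
static int health_report(const char *err_str, struct err_ctx *err_ctx)
{
	printf("health report: %s\n", err_str);
	return err_ctx->recover(err_ctx->ctx); /* .recover op dispatch */
}

int main(void)
{
	struct err_ctx err_ctx = { .recover = rq_cqe_recover, .ctx = "rq0" };

	return health_report("ERR CQE on RQ: 0x0", &err_ctx);
}

Keeping the recovery logic in a per-error callback lets a single rx reporter
serve several error types (icosq CQE errors, RQ CQE errors, RX timeouts).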
@@ -115,6 +115,75 @@ void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq)
 	mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
 }
 
+static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state)
+{
+	struct net_device *dev = rq->netdev;
+	int err;
+
+	err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST);
+	if (err) {
+		netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn);
+		return err;
+	}
+	err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+	if (err) {
+		netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn);
+		return err;
+	}
+
+	return 0;
+}
+
+static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx)
+{
+	struct mlx5_core_dev *mdev;
+	struct net_device *dev;
+	struct mlx5e_rq *rq;
+	u8 state;
+	int err;
+
+	rq = ctx;
+	mdev = rq->mdev;
+	dev = rq->netdev;
+
+	err = mlx5e_query_rq_state(mdev, rq->rqn, &state);
+	if (err) {
+		netdev_err(dev, "Failed to query RQ 0x%x state. err = %d\n",
+			   rq->rqn, err);
+		goto out;
+	}
+
+	if (state != MLX5_RQC_STATE_ERR)
+		goto out;
+
+	mlx5e_deactivate_rq(rq);
+	mlx5e_free_rx_descs(rq);
+
+	err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR);
+	if (err)
+		goto out;
+
+	clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
+	mlx5e_activate_rq(rq);
+	rq->stats->recover++;
+	return 0;
+out:
+	clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
+	return err;
+}
+
+void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq)
+{
+	struct mlx5e_priv *priv = rq->channel->priv;
+	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
+	struct mlx5e_err_ctx err_ctx = {};
+
+	err_ctx.ctx = rq;
+	err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover;
+	sprintf(err_str, "ERR CQE on RQ: 0x%x", rq->rqn);
+
+	mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
+}
+
 static int mlx5e_rx_reporter_timeout_recover(void *ctx)
 {
 	struct mlx5e_icosq *icosq;
@@ -362,6 +362,13 @@ static void mlx5e_free_di_list(struct mlx5e_rq *rq)
 	kvfree(rq->wqe.di);
 }
 
+static void mlx5e_rq_err_cqe_work(struct work_struct *recover_work)
+{
+	struct mlx5e_rq *rq = container_of(recover_work, struct mlx5e_rq, recover_work);
+
+	mlx5e_reporter_rq_cqe_err(rq);
+}
+
 static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			  struct mlx5e_params *params,
 			  struct mlx5e_xsk_param *xsk,
@@ -398,6 +405,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->stats = &c->priv->channel_stats[c->ix].xskrq;
 	else
 		rq->stats = &c->priv->channel_stats[c->ix].rq;
+	INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work);
 
 	rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
 	if (IS_ERR(rq->xdp_prog)) {
@@ -907,6 +915,7 @@ void mlx5e_close_rq(struct mlx5e_rq *rq)
 {
 	cancel_work_sync(&rq->dim.work);
 	cancel_work_sync(&rq->channel->icosq.recover_work);
+	cancel_work_sync(&rq->recover_work);
 	mlx5e_destroy_rq(rq);
 	mlx5e_free_rx_descs(rq);
 	mlx5e_free_rq(rq);
@@ -1130,6 +1130,15 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 	return skb;
 }
 
+static void trigger_report(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe;
+
+	if (cqe_syndrome_needs_recover(err_cqe->syndrome) &&
+	    !test_and_set_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state))
+		queue_work(rq->channel->priv->wq, &rq->recover_work);
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -1143,6 +1152,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
+		trigger_report(rq, cqe);
 		rq->stats->wqe_err++;
 		goto free_wqe;
 	}
@@ -1328,6 +1338,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi->consumed_strides += cstrides;
 
 	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
+		trigger_report(rq, cqe);
 		rq->stats->wqe_err++;
 		goto mpwrq_cqe_out;
 	}
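
trigger_report() in the hunk above also shows the deduplication pattern:
test_and_set_bit() on MLX5E_RQ_STATE_RECOVERING guarantees that a burst of
error CQEs queues the recovery work only once, and the bit is cleared when
recovery finishes (or fails). A small stand-alone model of that gating, using
a C11 atomic flag in place of the RQ state bit:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag recovering = ATOMIC_FLAG_INIT;

static void maybe_queue_recovery(void)
{
	/* mirrors !test_and_set_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state) */
	if (!atomic_flag_test_and_set(&recovering))
		printf("recovery work queued\n");
	else
		printf("recovery already in flight, skipping\n");
}

int main(void)
{
	maybe_queue_recovery();         /* first error CQE -> queues work */
	maybe_queue_recovery();         /* burst of errors -> ignored */
	atomic_flag_clear(&recovering); /* clear_bit() at end of recovery */
	maybe_queue_recovery();         /* next error after recovery -> queues again */
	return 0;
}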