Commit c1289d5d authored by Jack Wang's avatar Jack Wang Committed by Jason Gunthorpe

RDMA/rtrs-clt: Do stop and failover outside reconnect work.

We can't do instant reconnect, not to DDoS server, but we should stop and
failover earlier, so there is less service interruption.

To avoid deadlock, as error_recovery is called from different callback
like rdma event or hb error handler, add a new err recovery_work.

Link: https://lore.kernel.org/r/20220114154753.983568-6-haris.iqbal@ionos.comSigned-off-by: default avatarJack Wang <jinpu.wang@ionos.com>
Reviewed-by: default avatarAleksei Marov <aleksei.marov@ionos.com>
Reviewed-by: default avatarMd Haris Iqbal <haris.iqbal@ionos.com>
Signed-off-by: default avatarMd Haris Iqbal <haris.iqbal@ionos.com>
Signed-off-by: default avatarJason Gunthorpe <jgg@nvidia.com>
parent b962fee5
......@@ -297,6 +297,7 @@ static bool rtrs_clt_change_state_from_to(struct rtrs_clt_path *clt_path,
return changed;
}
static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path);
static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
{
struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
......@@ -304,16 +305,7 @@ static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
if (rtrs_clt_change_state_from_to(clt_path,
RTRS_CLT_CONNECTED,
RTRS_CLT_RECONNECTING)) {
struct rtrs_clt_sess *clt = clt_path->clt;
unsigned int delay_ms;
/*
* Normal scenario, reconnect if we were successfully connected
*/
delay_ms = clt->reconnect_delay_sec * 1000;
queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
msecs_to_jiffies(delay_ms +
prandom_u32() % RTRS_RECONNECT_SEED));
queue_work(rtrs_wq, &clt_path->err_recovery_work);
} else {
/*
* Error can happen just on establishing new connection,
......@@ -1511,6 +1503,22 @@ static void rtrs_clt_init_hb(struct rtrs_clt_path *clt_path)
static void rtrs_clt_reconnect_work(struct work_struct *work);
static void rtrs_clt_close_work(struct work_struct *work);
static void rtrs_clt_err_recovery_work(struct work_struct *work)
{
struct rtrs_clt_path *clt_path;
struct rtrs_clt_sess *clt;
int delay_ms;
clt_path = container_of(work, struct rtrs_clt_path, err_recovery_work);
clt = clt_path->clt;
delay_ms = clt->reconnect_delay_sec * 1000;
rtrs_clt_stop_and_destroy_conns(clt_path);
queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
msecs_to_jiffies(delay_ms +
prandom_u32() %
RTRS_RECONNECT_SEED));
}
static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
const struct rtrs_addr *path,
size_t con_num, u32 nr_poll_queues)
......@@ -1562,6 +1570,7 @@ static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
clt_path->state = RTRS_CLT_CONNECTING;
atomic_set(&clt_path->connected_cnt, 0);
INIT_WORK(&clt_path->close_work, rtrs_clt_close_work);
INIT_WORK(&clt_path->err_recovery_work, rtrs_clt_err_recovery_work);
INIT_DELAYED_WORK(&clt_path->reconnect_dwork, rtrs_clt_reconnect_work);
rtrs_clt_init_hb(clt_path);
......@@ -2326,6 +2335,7 @@ static void rtrs_clt_close_work(struct work_struct *work)
clt_path = container_of(work, struct rtrs_clt_path, close_work);
cancel_work_sync(&clt_path->err_recovery_work);
cancel_delayed_work_sync(&clt_path->reconnect_dwork);
rtrs_clt_stop_and_destroy_conns(clt_path);
rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSED, NULL);
......@@ -2638,7 +2648,6 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
{
struct rtrs_clt_path *clt_path;
struct rtrs_clt_sess *clt;
unsigned int delay_ms;
int err;
clt_path = container_of(to_delayed_work(work), struct rtrs_clt_path,
......@@ -2655,8 +2664,6 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
}
clt_path->reconnect_attempts++;
/* Stop everything */
rtrs_clt_stop_and_destroy_conns(clt_path);
msleep(RTRS_RECONNECT_BACKOFF);
if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING, NULL)) {
err = init_path(clt_path);
......@@ -2669,11 +2676,7 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
reconnect_again:
if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_RECONNECTING, NULL)) {
clt_path->stats->reconnects.fail_cnt++;
delay_ms = clt->reconnect_delay_sec * 1000;
queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
msecs_to_jiffies(delay_ms +
prandom_u32() %
RTRS_RECONNECT_SEED));
queue_work(rtrs_wq, &clt_path->err_recovery_work);
}
}
......@@ -2905,6 +2908,7 @@ int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_path *clt_path)
&old_state);
if (changed) {
clt_path->reconnect_attempts = 0;
rtrs_clt_stop_and_destroy_conns(clt_path);
queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, 0);
}
if (changed || old_state == RTRS_CLT_RECONNECTING) {
......
......@@ -134,6 +134,7 @@ struct rtrs_clt_path {
struct rtrs_clt_io_req *reqs;
struct delayed_work reconnect_dwork;
struct work_struct close_work;
struct work_struct err_recovery_work;
unsigned int reconnect_attempts;
bool established;
struct rtrs_rbuf *rbufs;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment