RDMA/rtrs-clt: Do stop and failover outside reconnect work.
We can't do instant reconnect, not to DDoS server, but we should stop and failover earlier, so there is less service interruption. To avoid deadlock, as error_recovery is called from different callback like rdma event or hb error handler, add a new err recovery_work. Link: https://lore.kernel.org/r/20220114154753.983568-6-haris.iqbal@ionos.com Signed-off-by: Jack Wang <jinpu.wang@ionos.com> Reviewed-by: Aleksei Marov <aleksei.marov@ionos.com> Reviewed-by: Md Haris Iqbal <haris.iqbal@ionos.com> Signed-off-by: Md Haris Iqbal <haris.iqbal@ionos.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
This commit is contained in:
parent
b962fee5c2
commit
c1289d5d85
|
@ -297,6 +297,7 @@ static bool rtrs_clt_change_state_from_to(struct rtrs_clt_path *clt_path,
|
||||||
return changed;
|
return changed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path);
|
||||||
static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
|
static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
|
||||||
{
|
{
|
||||||
struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
|
struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
|
||||||
|
@ -304,16 +305,7 @@ static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
|
||||||
if (rtrs_clt_change_state_from_to(clt_path,
|
if (rtrs_clt_change_state_from_to(clt_path,
|
||||||
RTRS_CLT_CONNECTED,
|
RTRS_CLT_CONNECTED,
|
||||||
RTRS_CLT_RECONNECTING)) {
|
RTRS_CLT_RECONNECTING)) {
|
||||||
struct rtrs_clt_sess *clt = clt_path->clt;
|
queue_work(rtrs_wq, &clt_path->err_recovery_work);
|
||||||
unsigned int delay_ms;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Normal scenario, reconnect if we were successfully connected
|
|
||||||
*/
|
|
||||||
delay_ms = clt->reconnect_delay_sec * 1000;
|
|
||||||
queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
|
|
||||||
msecs_to_jiffies(delay_ms +
|
|
||||||
prandom_u32() % RTRS_RECONNECT_SEED));
|
|
||||||
} else {
|
} else {
|
||||||
/*
|
/*
|
||||||
* Error can happen just on establishing new connection,
|
* Error can happen just on establishing new connection,
|
||||||
|
@ -1511,6 +1503,22 @@ static void rtrs_clt_init_hb(struct rtrs_clt_path *clt_path)
|
||||||
static void rtrs_clt_reconnect_work(struct work_struct *work);
|
static void rtrs_clt_reconnect_work(struct work_struct *work);
|
||||||
static void rtrs_clt_close_work(struct work_struct *work);
|
static void rtrs_clt_close_work(struct work_struct *work);
|
||||||
|
|
||||||
|
static void rtrs_clt_err_recovery_work(struct work_struct *work)
|
||||||
|
{
|
||||||
|
struct rtrs_clt_path *clt_path;
|
||||||
|
struct rtrs_clt_sess *clt;
|
||||||
|
int delay_ms;
|
||||||
|
|
||||||
|
clt_path = container_of(work, struct rtrs_clt_path, err_recovery_work);
|
||||||
|
clt = clt_path->clt;
|
||||||
|
delay_ms = clt->reconnect_delay_sec * 1000;
|
||||||
|
rtrs_clt_stop_and_destroy_conns(clt_path);
|
||||||
|
queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
|
||||||
|
msecs_to_jiffies(delay_ms +
|
||||||
|
prandom_u32() %
|
||||||
|
RTRS_RECONNECT_SEED));
|
||||||
|
}
|
||||||
|
|
||||||
static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
|
static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
|
||||||
const struct rtrs_addr *path,
|
const struct rtrs_addr *path,
|
||||||
size_t con_num, u32 nr_poll_queues)
|
size_t con_num, u32 nr_poll_queues)
|
||||||
|
@ -1562,6 +1570,7 @@ static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
|
||||||
clt_path->state = RTRS_CLT_CONNECTING;
|
clt_path->state = RTRS_CLT_CONNECTING;
|
||||||
atomic_set(&clt_path->connected_cnt, 0);
|
atomic_set(&clt_path->connected_cnt, 0);
|
||||||
INIT_WORK(&clt_path->close_work, rtrs_clt_close_work);
|
INIT_WORK(&clt_path->close_work, rtrs_clt_close_work);
|
||||||
|
INIT_WORK(&clt_path->err_recovery_work, rtrs_clt_err_recovery_work);
|
||||||
INIT_DELAYED_WORK(&clt_path->reconnect_dwork, rtrs_clt_reconnect_work);
|
INIT_DELAYED_WORK(&clt_path->reconnect_dwork, rtrs_clt_reconnect_work);
|
||||||
rtrs_clt_init_hb(clt_path);
|
rtrs_clt_init_hb(clt_path);
|
||||||
|
|
||||||
|
@ -2326,6 +2335,7 @@ static void rtrs_clt_close_work(struct work_struct *work)
|
||||||
|
|
||||||
clt_path = container_of(work, struct rtrs_clt_path, close_work);
|
clt_path = container_of(work, struct rtrs_clt_path, close_work);
|
||||||
|
|
||||||
|
cancel_work_sync(&clt_path->err_recovery_work);
|
||||||
cancel_delayed_work_sync(&clt_path->reconnect_dwork);
|
cancel_delayed_work_sync(&clt_path->reconnect_dwork);
|
||||||
rtrs_clt_stop_and_destroy_conns(clt_path);
|
rtrs_clt_stop_and_destroy_conns(clt_path);
|
||||||
rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSED, NULL);
|
rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSED, NULL);
|
||||||
|
@ -2638,7 +2648,6 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
|
||||||
{
|
{
|
||||||
struct rtrs_clt_path *clt_path;
|
struct rtrs_clt_path *clt_path;
|
||||||
struct rtrs_clt_sess *clt;
|
struct rtrs_clt_sess *clt;
|
||||||
unsigned int delay_ms;
|
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
clt_path = container_of(to_delayed_work(work), struct rtrs_clt_path,
|
clt_path = container_of(to_delayed_work(work), struct rtrs_clt_path,
|
||||||
|
@ -2655,8 +2664,6 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
|
||||||
}
|
}
|
||||||
clt_path->reconnect_attempts++;
|
clt_path->reconnect_attempts++;
|
||||||
|
|
||||||
/* Stop everything */
|
|
||||||
rtrs_clt_stop_and_destroy_conns(clt_path);
|
|
||||||
msleep(RTRS_RECONNECT_BACKOFF);
|
msleep(RTRS_RECONNECT_BACKOFF);
|
||||||
if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING, NULL)) {
|
if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING, NULL)) {
|
||||||
err = init_path(clt_path);
|
err = init_path(clt_path);
|
||||||
|
@ -2669,11 +2676,7 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
|
||||||
reconnect_again:
|
reconnect_again:
|
||||||
if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_RECONNECTING, NULL)) {
|
if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_RECONNECTING, NULL)) {
|
||||||
clt_path->stats->reconnects.fail_cnt++;
|
clt_path->stats->reconnects.fail_cnt++;
|
||||||
delay_ms = clt->reconnect_delay_sec * 1000;
|
queue_work(rtrs_wq, &clt_path->err_recovery_work);
|
||||||
queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
|
|
||||||
msecs_to_jiffies(delay_ms +
|
|
||||||
prandom_u32() %
|
|
||||||
RTRS_RECONNECT_SEED));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2905,6 +2908,7 @@ int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_path *clt_path)
|
||||||
&old_state);
|
&old_state);
|
||||||
if (changed) {
|
if (changed) {
|
||||||
clt_path->reconnect_attempts = 0;
|
clt_path->reconnect_attempts = 0;
|
||||||
|
rtrs_clt_stop_and_destroy_conns(clt_path);
|
||||||
queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, 0);
|
queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, 0);
|
||||||
}
|
}
|
||||||
if (changed || old_state == RTRS_CLT_RECONNECTING) {
|
if (changed || old_state == RTRS_CLT_RECONNECTING) {
|
||||||
|
|
|
@ -134,6 +134,7 @@ struct rtrs_clt_path {
|
||||||
struct rtrs_clt_io_req *reqs;
|
struct rtrs_clt_io_req *reqs;
|
||||||
struct delayed_work reconnect_dwork;
|
struct delayed_work reconnect_dwork;
|
||||||
struct work_struct close_work;
|
struct work_struct close_work;
|
||||||
|
struct work_struct err_recovery_work;
|
||||||
unsigned int reconnect_attempts;
|
unsigned int reconnect_attempts;
|
||||||
bool established;
|
bool established;
|
||||||
struct rtrs_rbuf *rbufs;
|
struct rtrs_rbuf *rbufs;
|
||||||
|
|
Loading…
Reference in New Issue