Commit fcd2b0da authored by David S. Miller

Merge branch 'rds-ha-failover-fixes'

Sowmini Varadhan says:

====================
RDS: TCP: HA/Failover fixes

This series contains a set of fixes for bugs exposed when
we ran the following in a loop between a test machine pair:

 while (1); do
   # modprobe rds-tcp on test nodes
   # run rds-stress in bi-dir mode between test machine pair
   # modprobe -r rds-tcp on test nodes
 done

rds-stress in bi-dir mode will cause both nodes to initiate
RDS-TCP connections at almost the same instant, exposing the
bugs fixed in this series.

Without the fixes, rds-stress reports sporadic packet drops,
and packets arriving out of sequence. After the fixes, we have
been able to run the test overnight, without any issues.

Each patch has a detailed description of the root-cause fixed
by the patch.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents b3e51069 1a0e100f
...@@ -605,10 +605,14 @@ static void rds_exit(void) ...@@ -605,10 +605,14 @@ static void rds_exit(void)
} }
module_exit(rds_exit); module_exit(rds_exit);
u32 rds_gen_num;
static int rds_init(void) static int rds_init(void)
{ {
int ret; int ret;
net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));
ret = rds_bind_lock_init(); ret = rds_bind_lock_init();
if (ret) if (ret)
goto out; goto out;
......
...@@ -269,6 +269,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, ...@@ -269,6 +269,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
kmem_cache_free(rds_conn_slab, conn); kmem_cache_free(rds_conn_slab, conn);
conn = found; conn = found;
} else { } else {
conn->c_my_gen_num = rds_gen_num;
conn->c_peer_gen_num = 0;
hlist_add_head_rcu(&conn->c_hash_node, head); hlist_add_head_rcu(&conn->c_hash_node, head);
rds_cong_add_conn(conn); rds_cong_add_conn(conn);
rds_conn_count++; rds_conn_count++;
...@@ -681,6 +683,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) ...@@ -681,6 +683,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
!test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
} }
EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
void rds_conn_connect_if_down(struct rds_connection *conn) void rds_conn_connect_if_down(struct rds_connection *conn)
{ {
......
...@@ -42,6 +42,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { ...@@ -42,6 +42,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
[RDS_EXTHDR_NPATHS] = sizeof(u16), [RDS_EXTHDR_NPATHS] = sizeof(u16),
[RDS_EXTHDR_GEN_NUM] = sizeof(u32),
}; };
......
...@@ -151,6 +151,9 @@ struct rds_connection { ...@@ -151,6 +151,9 @@ struct rds_connection {
struct rds_conn_path c_path[RDS_MPATH_WORKERS]; struct rds_conn_path c_path[RDS_MPATH_WORKERS];
wait_queue_head_t c_hs_waitq; /* handshake waitq */ wait_queue_head_t c_hs_waitq; /* handshake waitq */
u32 c_my_gen_num;
u32 c_peer_gen_num;
}; };
static inline static inline
...@@ -243,7 +246,8 @@ struct rds_ext_header_rdma_dest { ...@@ -243,7 +246,8 @@ struct rds_ext_header_rdma_dest {
/* Extension header announcing number of paths. /* Extension header announcing number of paths.
* Implicit length = 2 bytes. * Implicit length = 2 bytes.
*/ */
#define RDS_EXTHDR_NPATHS 4 #define RDS_EXTHDR_NPATHS 5
#define RDS_EXTHDR_GEN_NUM 6
#define __RDS_EXTHDR_MAX 16 /* for now */ #define __RDS_EXTHDR_MAX 16 /* for now */
...@@ -338,6 +342,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) ...@@ -338,6 +342,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
#define RDS_MSG_RETRANSMITTED 5 #define RDS_MSG_RETRANSMITTED 5
#define RDS_MSG_MAPPED 6 #define RDS_MSG_MAPPED 6
#define RDS_MSG_PAGEVEC 7 #define RDS_MSG_PAGEVEC 7
#define RDS_MSG_FLUSH 8
struct rds_message { struct rds_message {
atomic_t m_refcount; atomic_t m_refcount;
...@@ -664,6 +669,7 @@ void rds_cong_exit(void); ...@@ -664,6 +669,7 @@ void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */ /* conn.c */
extern u32 rds_gen_num;
int rds_conn_init(void); int rds_conn_init(void);
void rds_conn_exit(void); void rds_conn_exit(void);
struct rds_connection *rds_conn_create(struct net *net, struct rds_connection *rds_conn_create(struct net *net,
......
...@@ -120,6 +120,36 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, ...@@ -120,6 +120,36 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
/* do nothing if no change in cong state */ /* do nothing if no change in cong state */
} }
/*
 * Record the generation number announced by the peer for this connection.
 *
 * @conn:         connection whose c_peer_gen_num is updated
 * @peer_gen_num: generation number received from the peer; a value of 0
 *                is ignored and leaves the connection untouched
 *
 * If a non-zero peer generation number was already recorded and the new
 * one differs from it, every path of the connection has its send/receive
 * sequence numbers reset and every message on its cp_retrans list is
 * tagged with RDS_MSG_FLUSH.
 * NOTE(review): a changed generation number presumably means the peer
 * re-initialized its RDS module since the last handshake -- confirm.
 */
static void rds_conn_peer_gen_update(struct rds_connection *conn,
u32 peer_gen_num)
{
int i;
struct rds_message *rm, *tmp;
unsigned long flags;
/* Warn if this is reached for a connection whose transport is not TCP. */
WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
if (peer_gen_num != 0) {
/* Only a change from one non-zero value to another triggers a reset;
 * the first non-zero value seen is simply stored below.
 */
if (conn->c_peer_gen_num != 0 &&
peer_gen_num != conn->c_peer_gen_num) {
/* Walk all RDS_MPATH_WORKERS path slots of the connection. */
for (i = 0; i < RDS_MPATH_WORKERS; i++) {
struct rds_conn_path *cp;
cp = &conn->c_path[i];
/* Sequence numbers and the retransmit list are modified
 * together while holding cp_lock with interrupts disabled.
 */
spin_lock_irqsave(&cp->cp_lock, flags);
cp->cp_next_tx_seq = 1;
cp->cp_next_rx_seq = 0;
/* Tag each message on the retransmit list with RDS_MSG_FLUSH;
 * nothing is unlinked from the list here.
 */
list_for_each_entry_safe(rm, tmp,
&cp->cp_retrans,
m_conn_item) {
set_bit(RDS_MSG_FLUSH, &rm->m_flags);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
}
}
/* Remember the latest non-zero generation number from the peer. */
conn->c_peer_gen_num = peer_gen_num;
}
}
/* /*
* Process all extension headers that come with this message. * Process all extension headers that come with this message.
*/ */
...@@ -163,7 +193,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, ...@@ -163,7 +193,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
union { union {
struct rds_ext_header_version version; struct rds_ext_header_version version;
u16 rds_npaths; u16 rds_npaths;
u32 rds_gen_num;
} buffer; } buffer;
u32 new_peer_gen_num = 0;
while (1) { while (1) {
len = sizeof(buffer); len = sizeof(buffer);
...@@ -176,6 +208,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, ...@@ -176,6 +208,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
buffer.rds_npaths); buffer.rds_npaths);
break; break;
case RDS_EXTHDR_GEN_NUM:
new_peer_gen_num = buffer.rds_gen_num;
break;
default: default:
pr_warn_ratelimited("ignoring unknown exthdr type " pr_warn_ratelimited("ignoring unknown exthdr type "
"0x%x\n", type); "0x%x\n", type);
...@@ -183,6 +218,7 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, ...@@ -183,6 +218,7 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
} }
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
conn->c_npaths = max_t(int, conn->c_npaths, 1); conn->c_npaths = max_t(int, conn->c_npaths, 1);
rds_conn_peer_gen_update(conn, new_peer_gen_num);
} }
/* rds_start_mprds() will synchronously start multiple paths when appropriate. /* rds_start_mprds() will synchronously start multiple paths when appropriate.
......
...@@ -259,8 +259,9 @@ int rds_send_xmit(struct rds_conn_path *cp) ...@@ -259,8 +259,9 @@ int rds_send_xmit(struct rds_conn_path *cp)
* connection. * connection.
* Therefore, we never retransmit messages with RDMA ops. * Therefore, we never retransmit messages with RDMA ops.
*/ */
if (rm->rdma.op_active && if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) ||
test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { (rm->rdma.op_active &&
test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) {
spin_lock_irqsave(&cp->cp_lock, flags); spin_lock_irqsave(&cp->cp_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped); list_move(&rm->m_conn_item, &to_be_dropped);
...@@ -1209,6 +1210,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, ...@@ -1209,6 +1210,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
rds_message_add_extension(&rm->m_inc.i_hdr, rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_NPATHS, &npaths, RDS_EXTHDR_NPATHS, &npaths,
sizeof(npaths)); sizeof(npaths));
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_GEN_NUM,
&cp->cp_conn->c_my_gen_num,
sizeof(u32));
} }
spin_unlock_irqrestore(&cp->cp_lock, flags); spin_unlock_irqrestore(&cp->cp_lock, flags);
......
...@@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk) ...@@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_SYN_RECV: case TCP_SYN_RECV:
break; break;
case TCP_ESTABLISHED: case TCP_ESTABLISHED:
/* Force the peer to reconnect so that we have the
* TCP ports going from <smaller-ip>.<transient> to
* <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
* RDS connection as RDS_CONN_UP until the reconnect,
* to avoid RDS datagram loss.
*/
if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr &&
rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
RDS_CONN_ERROR)) {
rds_conn_path_drop(cp);
} else {
rds_connect_path_complete(cp, RDS_CONN_CONNECTING); rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
}
break; break;
case TCP_CLOSE_WAIT: case TCP_CLOSE_WAIT:
case TCP_CLOSE: case TCP_CLOSE:
......
...@@ -83,25 +83,20 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) ...@@ -83,25 +83,20 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
{ {
int i; int i;
bool peer_is_smaller = (conn->c_faddr < conn->c_laddr); bool peer_is_smaller = (conn->c_faddr < conn->c_laddr);
int npaths = conn->c_npaths; int npaths = max_t(int, 1, conn->c_npaths);
if (npaths <= 1) { /* for mprds, all paths MUST be initiated by the peer
struct rds_conn_path *cp = &conn->c_path[0];
int ret;
ret = rds_conn_path_transition(cp, RDS_CONN_DOWN,
RDS_CONN_CONNECTING);
if (!ret)
rds_conn_path_transition(cp, RDS_CONN_ERROR,
RDS_CONN_CONNECTING);
return cp->cp_transport_data;
}
/* for mprds, paths with cp_index > 0 MUST be initiated by the peer
* with the smaller address. * with the smaller address.
*/ */
if (!peer_is_smaller) if (!peer_is_smaller) {
/* Make sure we initiate at least one path if this
* has not already been done; rds_start_mprds() will
* take care of additional paths, if necessary.
*/
if (npaths == 1)
rds_conn_path_connect_if_down(&conn->c_path[0]);
return NULL; return NULL;
}
for (i = 0; i < npaths; i++) { for (i = 0; i < npaths; i++) {
struct rds_conn_path *cp = &conn->c_path[i]; struct rds_conn_path *cp = &conn->c_path[i];
...@@ -171,8 +166,8 @@ int rds_tcp_accept_one(struct socket *sock) ...@@ -171,8 +166,8 @@ int rds_tcp_accept_one(struct socket *sock)
mutex_lock(&rs_tcp->t_conn_path_lock); mutex_lock(&rs_tcp->t_conn_path_lock);
cp = rs_tcp->t_cpath; cp = rs_tcp->t_cpath;
conn_state = rds_conn_path_state(cp); conn_state = rds_conn_path_state(cp);
if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP && WARN_ON(conn_state == RDS_CONN_UP);
conn_state != RDS_CONN_ERROR) if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
goto rst_nsk; goto rst_nsk;
if (rs_tcp->t_sock) { if (rs_tcp->t_sock) {
/* Need to resolve a duelling SYN between peers. /* Need to resolve a duelling SYN between peers.
......
...@@ -100,6 +100,9 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -100,6 +100,9 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
tc->t_last_expected_una = rm->m_ack_seq + 1; tc->t_last_expected_una = rm->m_ack_seq + 1;
if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
rm, rds_tcp_snd_nxt(tc), rm, rds_tcp_snd_nxt(tc),
(unsigned long long)rm->m_ack_seq); (unsigned long long)rm->m_ack_seq);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment