Commit 448c907c authored by David S. Miller

Merge branch 'smc-next'

Ursula Braun says:

====================
smc fixes from 2018-04-17 - v3

In the meantime we have questioned the benefit of the CLC handshake
optimizations for the sockopts TCP_NODELAY and TCP_CORK, and decided
to give up on them for now, since SMC still works properly without
them.
This is now version 3 of the patch series, with patches 2-4
implementing sockopts that require special handling in SMC.

Version 3 changes:
   * no longer defer the setsockopts TCP_NODELAY and TCP_CORK
   * allow fallback to TCP for sockopts that preclude SMC usage
   * when setting TCP_NODELAY, always enforce data transmission
     (not only together with corked data); a usage sketch follows
     this list
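
For illustration only (not part of the patches): a minimal userspace
sketch of the call affected by the last item, assuming an AF_SMC
socket; the helper name smc_set_nodelay() is made up.

    /* hypothetical userspace sketch; AF_SMC is 43 in <linux/socket.h> */
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>

    #ifndef AF_SMC
    #define AF_SMC 43
    #endif

    static int smc_set_nodelay(int fd)
    {
            int one = 1;

            /* forwarded to the internal CLC socket; with this series
             * it now also forces out data queued in the SMC send buffer
             */
            return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
                              &one, sizeof(one));
    }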

Version 2 changes of patch 2/4 (and 3/4):
   * return error -EOPNOTSUPP for the TCP_FASTOPEN sockopts
   * fix a kernel_setsockopt() usage bug by switching the parameter
     variable from type "u8" to type "int"; a sketch of the corrected
     pattern follows this list
   * add return code validation when calling kernel_setsockopt()
   * propagate a setsockopt error on the internal CLC socket
     to the SMC socket
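
A minimal sketch of the corrected pattern, with an assumed helper
name: kernel_setsockopt() copies sizeof(int) bytes, so the option
value must live in an "int", and the return code must be checked.

    /* hedged sketch; smc_clcsock_set_nodelay() is hypothetical */
    #include <net/sock.h>
    #include <net/tcp.h>

    static int smc_clcsock_set_nodelay(struct socket *clcsock)
    {
            int val = 1;    /* was wrongly declared as "u8" */
            int rc;

            rc = kernel_setsockopt(clcsock, SOL_TCP, TCP_NODELAY,
                                   (char *)&val, sizeof(val));
            return rc;      /* validate/propagate instead of ignoring */
    }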
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 51dce24b abb190f1
@@ -391,6 +391,9 @@ static int smc_connect_rdma(struct smc_sock *smc)
 	sock_hold(&smc->sk); /* sock put in passive closing */
+	if (smc->use_fallback)
+		goto out_connected;
+
 	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
 		/* peer has not signalled SMC-capability */
 		smc->use_fallback = true;
@@ -790,6 +793,9 @@ static void smc_listen_work(struct work_struct *work)
 	int rc = 0;
 	u8 ibport;
 
+	if (new_smc->use_fallback)
+		goto out_connected;
+
 	/* check if peer is smc capable */
 	if (!tcp_sk(newclcsock->sk)->syn_smc) {
 		new_smc->use_fallback = true;
@@ -968,7 +974,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
 			continue;
 
 		new_smc->listen_smc = lsmc;
-		new_smc->use_fallback = false; /* assume rdma capability first*/
+		new_smc->use_fallback = lsmc->use_fallback;
 		sock_hold(lsk); /* sock_put in smc_listen_work */
 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
 		smc_copy_sock_settings_to_smc(new_smc);
@@ -1004,7 +1010,8 @@ static int smc_listen(struct socket *sock, int backlog)
 	 * them to the clc socket -- copy smc socket options to clc socket
 	 */
 	smc_copy_sock_settings_to_clc(smc);
-	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+	if (!smc->use_fallback)
+		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 	rc = kernel_listen(smc->clcsock, backlog);
 	if (rc)
@@ -1037,6 +1044,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 	if (lsmc->sk.sk_state != SMC_LISTEN) {
 		rc = -EINVAL;
+		release_sock(sk);
 		goto out;
 	}
@@ -1064,9 +1072,29 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 	if (!rc)
 		rc = sock_error(nsk);
+	release_sock(sk);
+	if (rc)
+		goto out;
+
+	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
+		/* wait till data arrives on the socket */
+		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
+					 MSEC_PER_SEC);
+		if (smc_sk(nsk)->use_fallback) {
+			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
+
+			lock_sock(clcsk);
+			if (skb_queue_empty(&clcsk->sk_receive_queue))
+				sk_wait_data(clcsk, &timeo, NULL);
+			release_sock(clcsk);
+		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
+			lock_sock(nsk);
+			smc_rx_wait_data(smc_sk(nsk), &timeo);
+			release_sock(nsk);
+		}
+	}
+
 out:
-	release_sock(sk);
 	sock_put(sk); /* sock_hold above */
 	return rc;
 }
@@ -1097,6 +1125,16 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
 	    (sk->sk_state != SMC_INIT))
 		goto out;
+
+	if (msg->msg_flags & MSG_FASTOPEN) {
+		if (sk->sk_state == SMC_INIT) {
+			smc->use_fallback = true;
+		} else {
+			rc = -EINVAL;
+			goto out;
+		}
+	}
+
 	if (smc->use_fallback)
 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
 	else
@@ -1274,14 +1312,64 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 {
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
+	int val, rc;
 
 	smc = smc_sk(sk);
 
 	/* generic setsockopts reaching us here always apply to the
 	 * CLC socket
 	 */
-	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
-					     optval, optlen);
+	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+					   optval, optlen);
+	if (smc->clcsock->sk->sk_err) {
+		sk->sk_err = smc->clcsock->sk->sk_err;
+		sk->sk_error_report(sk);
+	}
+	if (rc)
+		return rc;
+
+	if (optlen < sizeof(int))
+		return rc;
+	get_user(val, (int __user *)optval);
+
+	lock_sock(sk);
+	switch (optname) {
+	case TCP_ULP:
+	case TCP_FASTOPEN:
+	case TCP_FASTOPEN_CONNECT:
+	case TCP_FASTOPEN_KEY:
+	case TCP_FASTOPEN_NO_COOKIE:
+		/* option not supported by SMC */
+		if (sk->sk_state == SMC_INIT) {
+			smc->use_fallback = true;
+		} else {
+			if (!smc->use_fallback)
+				rc = -EINVAL;
+		}
+		break;
+	case TCP_NODELAY:
+		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+			if (val)
+				mod_delayed_work(system_wq, &smc->conn.tx_work,
+						 0);
+		}
+		break;
+	case TCP_CORK:
+		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+			if (!val)
+				mod_delayed_work(system_wq, &smc->conn.tx_work,
+						 0);
+		}
+		break;
+	case TCP_DEFER_ACCEPT:
+		smc->sockopt_defer_accept = val;
+		break;
+	default:
+		break;
+	}
+	release_sock(sk);
+
+	return rc;
 }
 
 static int smc_getsockopt(struct socket *sock, int level, int optname,
...
@@ -180,6 +180,10 @@ struct smc_sock {	/* smc sock container */
 	struct list_head	accept_q;	/* sockets to be accepted */
 	spinlock_t		accept_q_lock;	/* protects accept_q */
 	bool			use_fallback;	/* fallback to tcp */
+	int			sockopt_defer_accept;
+						/* sockopt TCP_DEFER_ACCEPT
+						 * value
+						 */
 	u8			wait_close_tx_prepared : 1;
 						/* shutdown wr or close
 						 * started, waiting for unsent
...
@@ -82,7 +82,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
 		sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
 		"must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
 	BUILD_BUG_ON_MSG(
-		offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+		sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE,
 		"must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
 	BUILD_BUG_ON_MSG(
 		sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
...
@@ -48,7 +48,7 @@ struct smc_cdc_msg {
 	struct smc_cdc_producer_flags	prod_flags;
 	struct smc_cdc_conn_state_flags	conn_state_flags;
 	u8				reserved[18];
-} __aligned(8);
+} __packed;			/* format defined in RFC7609 */
 
 static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
 {
...
@@ -51,7 +51,7 @@ static void smc_rx_data_ready(struct sock *sk)
  * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
  * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
  */
-static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct smc_connection *conn = &smc->conn;
...
@@ -20,5 +20,6 @@
 void smc_rx_init(struct smc_sock *smc);
 int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 		   int flags);
+int smc_rx_wait_data(struct smc_sock *smc, long *timeo);
 
 #endif /* SMC_RX_H */
@@ -19,6 +19,7 @@
 #include <linux/sched/signal.h>
 
 #include <net/sock.h>
+#include <net/tcp.h>
 
 #include "smc.h"
 #include "smc_wr.h"
@@ -26,6 +27,7 @@
 #include "smc_tx.h"
 
 #define SMC_TX_WORK_DELAY	HZ
+#define SMC_TX_CORK_DELAY	(HZ >> 2)	/* 250 ms */
 
 /***************************** sndbuf producer *******************************/
@@ -115,6 +117,13 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
 	return rc;
 }
 
+static bool smc_tx_is_corked(struct smc_sock *smc)
+{
+	struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);
+
+	return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
+}
+
 /* sndbuf producer: main API called by socket layer.
  * called under sock lock.
  */
@@ -209,7 +218,16 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		/* since we just produced more new data into sndbuf,
 		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
 		 */
-		smc_tx_sndbuf_nonempty(conn);
+		if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
+		    (atomic_read(&conn->sndbuf_space) >
+						(conn->sndbuf_size >> 1)))
+			/* for a corked socket defer the RDMA writes if there
+			 * is still sufficient sndbuf_space available
+			 */
+			schedule_delayed_work(&conn->tx_work,
+					      SMC_TX_CORK_DELAY);
+		else
+			smc_tx_sndbuf_nonempty(conn);
 	} /* while (msg_data_left(msg)) */
 
 	return send_done;
@@ -409,8 +427,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 		}
 		rc = 0;
 		if (conn->alert_token_local) /* connection healthy */
-			schedule_delayed_work(&conn->tx_work,
-					      SMC_TX_WORK_DELAY);
+			mod_delayed_work(system_wq, &conn->tx_work,
+					 SMC_TX_WORK_DELAY);
 	}
 	goto out_unlock;
 }
...