Commit 448c907c authored by David S. Miller

Merge branch 'smc-next'

Ursula Braun says:

====================
smc fixes from 2018-04-17 - v3

In the meantime we have reconsidered the benefit of these CLC handshake
optimizations for the sockopts TCP_NODELAY and TCP_CORK, and decided to
give up on them for now, since SMC still works properly without them.
This is now version 3 of the patch series, with patches 2-4 implementing
sockopts that require special handling in SMC.

Version 3 changes:
   * no deferring of setsockopts TCP_NODELAY and TCP_CORK anymore
   * allow fallback for some sockopts eliminating SMC usage
   * when setting TCP_NODELAY always enforce data transmission
     (not only together with corked data)

Version 2 changes of Patch 2/4 (and 3/4):
   * return error -EOPNOTSUPP for TCP_FASTOPEN sockopts
   * fix a kernel_setsockopt() usage bug by switching the parameter
     variable from type "u8" to "int" (see the sketch below)
   * add return code validation when calling kernel_setsockopt()
   * propagate a setsockopt error on the internal CLC socket
     to the SMC socket.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 51dce24b abb190f1
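
The kernel_setsockopt() type fix noted in the Version 2 changes deserves
a concrete shape. For int-valued TCP options, do_tcp_setsockopt() rejects
any optlen smaller than sizeof(int), so a "u8" variable (optlen == 1)
makes the call fail with -EINVAL. A minimal sketch of the corrected call,
with illustrative variable names rather than the exact patch code:

    int val = 1;    /* was "u8 val": an optlen of 1 is rejected */
    int rc;

    /* apply TCP_NODELAY to the internal CLC (TCP) socket */
    rc = kernel_setsockopt(smc->clcsock, SOL_TCP, TCP_NODELAY,
                           (char *)&val, sizeof(val));
    if (rc)
        return rc;  /* validate instead of ignoring the return code */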
@@ -391,6 +391,9 @@ static int smc_connect_rdma(struct smc_sock *smc)
         sock_hold(&smc->sk); /* sock put in passive closing */
 
+        if (smc->use_fallback)
+                goto out_connected;
+
         if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
                 /* peer has not signalled SMC-capability */
                 smc->use_fallback = true;
@@ -790,6 +793,9 @@ static void smc_listen_work(struct work_struct *work)
         int rc = 0;
         u8 ibport;
 
+        if (new_smc->use_fallback)
+                goto out_connected;
+
         /* check if peer is smc capable */
         if (!tcp_sk(newclcsock->sk)->syn_smc) {
                 new_smc->use_fallback = true;
@@ -968,7 +974,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
                         continue;
 
                 new_smc->listen_smc = lsmc;
-                new_smc->use_fallback = false; /* assume rdma capability first*/
+                new_smc->use_fallback = lsmc->use_fallback;
                 sock_hold(lsk); /* sock_put in smc_listen_work */
                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
                 smc_copy_sock_settings_to_smc(new_smc);
@@ -1004,7 +1010,8 @@ static int smc_listen(struct socket *sock, int backlog)
          * them to the clc socket -- copy smc socket options to clc socket
          */
         smc_copy_sock_settings_to_clc(smc);
-        tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+        if (!smc->use_fallback)
+                tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 
         rc = kernel_listen(smc->clcsock, backlog);
         if (rc)
@@ -1037,6 +1044,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
         if (lsmc->sk.sk_state != SMC_LISTEN) {
                 rc = -EINVAL;
+                release_sock(sk);
                 goto out;
         }
@@ -1064,9 +1072,29 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
         if (!rc)
                 rc = sock_error(nsk);
+        release_sock(sk);
+        if (rc)
+                goto out;
+
+        if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
+                /* wait till data arrives on the socket */
+                timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
+                                         MSEC_PER_SEC);
+                if (smc_sk(nsk)->use_fallback) {
+                        struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
+
+                        lock_sock(clcsk);
+                        if (skb_queue_empty(&clcsk->sk_receive_queue))
+                                sk_wait_data(clcsk, &timeo, NULL);
+                        release_sock(clcsk);
+                } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
+                        lock_sock(nsk);
+                        smc_rx_wait_data(smc_sk(nsk), &timeo);
+                        release_sock(nsk);
+                }
+        }
 
 out:
-        release_sock(sk);
         sock_put(sk); /* sock_hold above */
         return rc;
 }
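
For context, a hedged userspace sketch of what the defer-accept handling
above enables; the AF_SMC value comes from <linux/socket.h>, and error
handling is omitted:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>

    #ifndef AF_SMC
    #define AF_SMC 43
    #endif

    int lfd = socket(AF_SMC, SOCK_STREAM, 0);
    int secs = 5;    /* wait up to ~5s for the first data */

    /* stored by smc_setsockopt() in smc->sockopt_defer_accept */
    setsockopt(lfd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &secs, sizeof(secs));

    /* after bind()/listen(), accept() additionally waits (up to the
     * configured time) until data has arrived: via sk_wait_data() on
     * the TCP fallback path, via smc_rx_wait_data() on the SMC path
     */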
@@ -1097,6 +1125,16 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
             (sk->sk_state != SMC_INIT))
                 goto out;
+
+        if (msg->msg_flags & MSG_FASTOPEN) {
+                if (sk->sk_state == SMC_INIT) {
+                        smc->use_fallback = true;
+                } else {
+                        rc = -EINVAL;
+                        goto out;
+                }
+        }
+
         if (smc->use_fallback)
                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
         else
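
Viewed from userspace, the MSG_FASTOPEN check above behaves as in this
hedged sketch: TCP Fast Open carries data in the SYN and thus bypasses
the handshake that negotiates SMC, so it is only honoured by falling
back to TCP, and only while the socket is still unconnected (srv is
assumed to be a filled-in peer address):

    /* fd: unconnected AF_SMC socket; this forces TCP fallback */
    sendto(fd, buf, len, MSG_FASTOPEN,
           (struct sockaddr *)&srv, sizeof(srv));

    /* on an already connected SMC socket the same call yields -EINVAL */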
@@ -1274,14 +1312,64 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 {
         struct sock *sk = sock->sk;
         struct smc_sock *smc;
+        int val, rc;
 
         smc = smc_sk(sk);
 
         /* generic setsockopts reaching us here always apply to the
          * CLC socket
          */
-        return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
-                                             optval, optlen);
+        rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+                                           optval, optlen);
+        if (smc->clcsock->sk->sk_err) {
+                sk->sk_err = smc->clcsock->sk->sk_err;
+                sk->sk_error_report(sk);
+        }
+        if (rc)
+                return rc;
+
+        if (optlen < sizeof(int))
+                return rc;
+        get_user(val, (int __user *)optval);
+
+        lock_sock(sk);
+        switch (optname) {
+        case TCP_ULP:
+        case TCP_FASTOPEN:
+        case TCP_FASTOPEN_CONNECT:
+        case TCP_FASTOPEN_KEY:
+        case TCP_FASTOPEN_NO_COOKIE:
+                /* option not supported by SMC */
+                if (sk->sk_state == SMC_INIT) {
+                        smc->use_fallback = true;
+                } else {
+                        if (!smc->use_fallback)
+                                rc = -EINVAL;
+                }
+                break;
+        case TCP_NODELAY:
+                if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+                        if (val)
+                                mod_delayed_work(system_wq, &smc->conn.tx_work,
+                                                 0);
+                }
+                break;
+        case TCP_CORK:
+                if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+                        if (!val)
+                                mod_delayed_work(system_wq, &smc->conn.tx_work,
+                                                 0);
+                }
+                break;
+        case TCP_DEFER_ACCEPT:
+                smc->sockopt_defer_accept = val;
+                break;
+        default:
+                break;
+        }
+        release_sock(sk);
+
+        return rc;
 }
 
 static int smc_getsockopt(struct socket *sock, int level, int optname,
......
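
Taken together, a hedged userspace view of the new smc_setsockopt()
behaviour (option values are illustrative):

    int on = 1;

    /* TCP Fast Open options are unsupported by SMC: on an unconnected
     * socket this switches it to TCP fallback, on an SMC connection
     * it now fails with -EINVAL
     */
    setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &on, sizeof(on));

    /* on an active connection, TCP_NODELAY now also kicks conn.tx_work
     * with zero delay, forcing transmission of queued sndbuf data
     */
    setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));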
@@ -180,6 +180,10 @@ struct smc_sock {  /* smc sock container */
         struct list_head   accept_q;      /* sockets to be accepted */
         spinlock_t         accept_q_lock; /* protects accept_q */
         bool               use_fallback;  /* fallback to tcp */
+        int                sockopt_defer_accept;
+                                          /* sockopt TCP_DEFER_ACCEPT
+                                           * value
+                                           */
         u8                 wait_close_tx_prepared : 1;
                                           /* shutdown wr or close
                                            * started, waiting for unsent
......
@@ -82,7 +82,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
                 sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
                 "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
         BUILD_BUG_ON_MSG(
-                offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+                sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE,
                 "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
         BUILD_BUG_ON_MSG(
                 sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
......
@@ -48,7 +48,7 @@ struct smc_cdc_msg {
         struct smc_cdc_producer_flags   prod_flags;
         struct smc_cdc_conn_state_flags conn_state_flags;
         u8                              reserved[18];
-} __aligned(8);
+} __packed;  /* format defined in RFC7609 */
 
 static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
 {
......
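
The two smc_cdc changes belong together. __aligned(8) leaves normal
member padding in place and rounds sizeof up to a multiple of 8, whereas
__packed pins the exact on-wire layout that RFC 7609 prescribes; only
then can the BUILD_BUG_ON_MSG above compare sizeof against
SMC_WR_TX_SIZE for equality. A minimal illustration using kernel
shorthand types from <linux/types.h> (not the real smc_cdc_msg layout):

    struct hdr_aligned { u8 type; u16 len; u8 body[15]; } __aligned(8);
    struct hdr_packed  { u8 type; u16 len; u8 body[15]; } __packed;

    /* sizeof(struct hdr_aligned) == 24: a padding byte after 'type'
     * plus tail padding up to the 8-byte boundary;
     * sizeof(struct hdr_packed) == 18: the exact wire size, so the
     * protocol-defined size can be pinned at compile time:
     */
    BUILD_BUG_ON_MSG(sizeof(struct hdr_packed) != 18,
                     "wire format size changed");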
@@ -51,7 +51,7 @@ static void smc_rx_data_ready(struct sock *sk)
  * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
  * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
  */
-static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
 {
         DEFINE_WAIT_FUNC(wait, woken_wake_function);
         struct smc_connection *conn = &smc->conn;
......
@@ -20,5 +20,6 @@
 void smc_rx_init(struct smc_sock *smc);
 int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
                    int flags);
+int smc_rx_wait_data(struct smc_sock *smc, long *timeo);
 
 #endif /* SMC_RX_H */
@@ -19,6 +19,7 @@
 #include <linux/sched/signal.h>
 
 #include <net/sock.h>
+#include <net/tcp.h>
 
 #include "smc.h"
 #include "smc_wr.h"
@@ -26,6 +27,7 @@
 #include "smc_tx.h"
 
 #define SMC_TX_WORK_DELAY       HZ
+#define SMC_TX_CORK_DELAY       (HZ >> 2)  /* 250 ms */
 
 /***************************** sndbuf producer *******************************/
@@ -115,6 +117,13 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
         return rc;
 }
 
+static bool smc_tx_is_corked(struct smc_sock *smc)
+{
+        struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);
+
+        return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
+}
+
 /* sndbuf producer: main API called by socket layer.
  * called under sock lock.
  */
@@ -209,7 +218,16 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
                 /* since we just produced more new data into sndbuf,
                  * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
                  */
-                smc_tx_sndbuf_nonempty(conn);
+                if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
+                    (atomic_read(&conn->sndbuf_space) >
+                                                (conn->sndbuf_size >> 1)))
+                        /* for a corked socket defer the RDMA writes if there
+                         * is still sufficient sndbuf_space available
+                         */
+                        schedule_delayed_work(&conn->tx_work,
+                                              SMC_TX_CORK_DELAY);
+                else
+                        smc_tx_sndbuf_nonempty(conn);
         } /* while (msg_data_left(msg)) */
 
         return send_done;
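
Sketching the effect from userspace (hedged; buffer names are
illustrative): while the socket is corked, or sendmsg() passes MSG_MORE,
and more than half of the sndbuf is still free, the RDMA writes are
deferred for up to SMC_TX_CORK_DELAY so that small pieces are batched
into fewer writes, much like TCP's cork timer:

    int on = 1, off = 0;

    setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
    send(fd, hdr, hdr_len, 0);      /* staged in the SMC sndbuf */
    send(fd, body, body_len, 0);    /* still deferred: sndbuf half free */
    setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));

    /* uncorking kicks conn.tx_work with zero delay, flushing the batch */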
@@ -409,8 +427,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
                 }
                 rc = 0;
                 if (conn->alert_token_local) /* connection healthy */
-                        schedule_delayed_work(&conn->tx_work,
-                                              SMC_TX_WORK_DELAY);
+                        mod_delayed_work(system_wq, &conn->tx_work,
+                                         SMC_TX_WORK_DELAY);
         }
         goto out_unlock;
 }
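
The switch to mod_delayed_work() is deliberate: schedule_delayed_work()
is a no-op when the work item is already pending and keeps the old
expiry, whereas mod_delayed_work() re-arms the timer to the new delay.
That is what allows the TCP_NODELAY/TCP_CORK paths in smc_setsockopt()
to pass a delay of 0 and force an immediate flush even if tx_work was
queued with SMC_TX_CORK_DELAY. A sketch of the pattern:

    /* queued with a long delay ... */
    schedule_delayed_work(&conn->tx_work, SMC_TX_CORK_DELAY);

    /* ... later: run it now; a second schedule_delayed_work() would be
     * a no-op and keep the long delay
     */
    mod_delayed_work(system_wq, &conn->tx_work, 0);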
......