Commit c992fde9 authored by David S. Miller's avatar David S. Miller

Merge branch 'smc-fixes'

Gerd Bayer says:

====================
net/smc: Fix effective buffer size

commit 0227f058 ("net/smc: Unbind r/w buffer size from clcsock
and make them tunable") started to derive the effective buffer size for
SMC connections inconsistently in case a TCP fallback was used and
memory consumption of SMC with the default settings was doubled when
a connection negotiated SMC. That was not what we want.

This series consolidates the resulting effective buffer size that is
used with SMC sockets, which is based on Jan Karcher's effort (see
[1]). For all TCP exchanges (in particular in case of a fall back when
no SMC connection was possible) the values from net.ipv4.tcp_[rw]mem
are used. If SMC succeeds in establishing an SMC connection, the newly
introduced values from net.smc.[rw]mem are used.

net.smc.[rw]mem is initialized to 64kB, respectively. Internal tests
have shown this to be a good compromise between throughput/latency
and memory consumption. Also net.smc.[rw]mem is now decoupled completely
from any tuning through net.ipv4.tcp_[rw]mem.

If a user chooses to tune a socket's receive or send buffer size with
setsockopt, this tuning is now consistently applied to either fall-back
TCP or proper SMC connections over the socket.

Thanks,
Gerd

v2 - v3:
 - Rebase to and resolve conflict of second patch with latest net/master.
v1 - v2:
 - In second patch, use sock_net() helper as suggested by Tony and demanded
   by kernel test robot.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents d0378ae6 30c3c4a4
...@@ -378,8 +378,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, ...@@ -378,8 +378,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
sk->sk_state = SMC_INIT; sk->sk_state = SMC_INIT;
sk->sk_destruct = smc_destruct; sk->sk_destruct = smc_destruct;
sk->sk_protocol = protocol; sk->sk_protocol = protocol;
WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem));
WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem));
smc = smc_sk(sk); smc = smc_sk(sk);
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
INIT_WORK(&smc->connect_work, smc_connect_work); INIT_WORK(&smc->connect_work, smc_connect_work);
...@@ -436,13 +436,60 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, ...@@ -436,13 +436,60 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
return rc; return rc;
} }
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
* clc socket (since smc is not called for these options from net/core)
*/
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
(1UL << SOCK_KEEPOPEN) | \
(1UL << SOCK_LINGER) | \
(1UL << SOCK_BROADCAST) | \
(1UL << SOCK_TIMESTAMP) | \
(1UL << SOCK_DBG) | \
(1UL << SOCK_RCVTSTAMP) | \
(1UL << SOCK_RCVTSTAMPNS) | \
(1UL << SOCK_LOCALROUTE) | \
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
(1UL << SOCK_RXQ_OVFL) | \
(1UL << SOCK_WIFI_STATUS) | \
(1UL << SOCK_NOFCS) | \
(1UL << SOCK_FILTER_LOCKED) | \
(1UL << SOCK_TSTAMP_NEW))
/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */
static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk,
unsigned long mask)
{
struct net *nnet = sock_net(nsk);
nsk->sk_userlocks = osk->sk_userlocks;
if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) {
nsk->sk_sndbuf = osk->sk_sndbuf;
} else {
if (mask == SK_FLAGS_SMC_TO_CLC)
WRITE_ONCE(nsk->sk_sndbuf,
READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1]));
else
WRITE_ONCE(nsk->sk_sndbuf,
2 * READ_ONCE(nnet->smc.sysctl_wmem));
}
if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) {
nsk->sk_rcvbuf = osk->sk_rcvbuf;
} else {
if (mask == SK_FLAGS_SMC_TO_CLC)
WRITE_ONCE(nsk->sk_rcvbuf,
READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1]));
else
WRITE_ONCE(nsk->sk_rcvbuf,
2 * READ_ONCE(nnet->smc.sysctl_rmem));
}
}
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
unsigned long mask) unsigned long mask)
{ {
/* options we don't get control via setsockopt for */ /* options we don't get control via setsockopt for */
nsk->sk_type = osk->sk_type; nsk->sk_type = osk->sk_type;
nsk->sk_sndbuf = osk->sk_sndbuf;
nsk->sk_rcvbuf = osk->sk_rcvbuf;
nsk->sk_sndtimeo = osk->sk_sndtimeo; nsk->sk_sndtimeo = osk->sk_sndtimeo;
nsk->sk_rcvtimeo = osk->sk_rcvtimeo; nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
nsk->sk_mark = READ_ONCE(osk->sk_mark); nsk->sk_mark = READ_ONCE(osk->sk_mark);
...@@ -453,26 +500,10 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, ...@@ -453,26 +500,10 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
nsk->sk_flags &= ~mask; nsk->sk_flags &= ~mask;
nsk->sk_flags |= osk->sk_flags & mask; nsk->sk_flags |= osk->sk_flags & mask;
smc_adjust_sock_bufsizes(nsk, osk, mask);
} }
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
(1UL << SOCK_KEEPOPEN) | \
(1UL << SOCK_LINGER) | \
(1UL << SOCK_BROADCAST) | \
(1UL << SOCK_TIMESTAMP) | \
(1UL << SOCK_DBG) | \
(1UL << SOCK_RCVTSTAMP) | \
(1UL << SOCK_RCVTSTAMPNS) | \
(1UL << SOCK_LOCALROUTE) | \
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
(1UL << SOCK_RXQ_OVFL) | \
(1UL << SOCK_WIFI_STATUS) | \
(1UL << SOCK_NOFCS) | \
(1UL << SOCK_FILTER_LOCKED) | \
(1UL << SOCK_TSTAMP_NEW))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
* clc socket (since smc is not called for these options from net/core)
*/
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{ {
smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
...@@ -2479,8 +2510,6 @@ static void smc_tcp_listen_work(struct work_struct *work) ...@@ -2479,8 +2510,6 @@ static void smc_tcp_listen_work(struct work_struct *work)
sock_hold(lsk); /* sock_put in smc_listen_work */ sock_hold(lsk); /* sock_put in smc_listen_work */
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
smc_copy_sock_settings_to_smc(new_smc); smc_copy_sock_settings_to_smc(new_smc);
new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
sock_hold(&new_smc->sk); /* sock_put in passive closing */ sock_hold(&new_smc->sk); /* sock_put in passive closing */
if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
sock_put(&new_smc->sk); sock_put(&new_smc->sk);
......
...@@ -161,7 +161,7 @@ struct smc_connection { ...@@ -161,7 +161,7 @@ struct smc_connection {
struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
int rmbe_size_short;/* compressed notation */ int rmbe_size_comp; /* compressed notation */
int rmbe_update_limit; int rmbe_update_limit;
/* lower limit for consumer /* lower limit for consumer
* cursor update * cursor update
......
...@@ -1007,7 +1007,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, ...@@ -1007,7 +1007,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
clc->d0.gid = clc->d0.gid =
conn->lgr->smcd->ops->get_local_gid(conn->lgr->smcd); conn->lgr->smcd->ops->get_local_gid(conn->lgr->smcd);
clc->d0.token = conn->rmb_desc->token; clc->d0.token = conn->rmb_desc->token;
clc->d0.dmbe_size = conn->rmbe_size_short; clc->d0.dmbe_size = conn->rmbe_size_comp;
clc->d0.dmbe_idx = 0; clc->d0.dmbe_idx = 0;
memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
if (version == SMC_V1) { if (version == SMC_V1) {
...@@ -1050,7 +1050,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, ...@@ -1050,7 +1050,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu);
break; break;
} }
clc->r0.rmbe_size = conn->rmbe_size_short; clc->r0.rmbe_size = conn->rmbe_size_comp;
clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ?
cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) :
cpu_to_be64((u64)sg_dma_address cpu_to_be64((u64)sg_dma_address
......
...@@ -2309,31 +2309,30 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) ...@@ -2309,31 +2309,30 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
struct smc_connection *conn = &smc->conn; struct smc_connection *conn = &smc->conn;
struct smc_link_group *lgr = conn->lgr; struct smc_link_group *lgr = conn->lgr;
struct list_head *buf_list; struct list_head *buf_list;
int bufsize, bufsize_short; int bufsize, bufsize_comp;
struct rw_semaphore *lock; /* lock buffer list */ struct rw_semaphore *lock; /* lock buffer list */
bool is_dgraded = false; bool is_dgraded = false;
int sk_buf_size;
if (is_rmb) if (is_rmb)
/* use socket recv buffer size (w/o overhead) as start value */ /* use socket recv buffer size (w/o overhead) as start value */
sk_buf_size = smc->sk.sk_rcvbuf; bufsize = smc->sk.sk_rcvbuf / 2;
else else
/* use socket send buffer size (w/o overhead) as start value */ /* use socket send buffer size (w/o overhead) as start value */
sk_buf_size = smc->sk.sk_sndbuf; bufsize = smc->sk.sk_sndbuf / 2;
for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb);
bufsize_short >= 0; bufsize_short--) { bufsize_comp >= 0; bufsize_comp--) {
if (is_rmb) { if (is_rmb) {
lock = &lgr->rmbs_lock; lock = &lgr->rmbs_lock;
buf_list = &lgr->rmbs[bufsize_short]; buf_list = &lgr->rmbs[bufsize_comp];
} else { } else {
lock = &lgr->sndbufs_lock; lock = &lgr->sndbufs_lock;
buf_list = &lgr->sndbufs[bufsize_short]; buf_list = &lgr->sndbufs[bufsize_comp];
} }
bufsize = smc_uncompress_bufsize(bufsize_short); bufsize = smc_uncompress_bufsize(bufsize_comp);
/* check for reusable slot in the link group */ /* check for reusable slot in the link group */
buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list);
if (buf_desc) { if (buf_desc) {
buf_desc->is_dma_need_sync = 0; buf_desc->is_dma_need_sync = 0;
SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
...@@ -2377,8 +2376,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) ...@@ -2377,8 +2376,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
if (is_rmb) { if (is_rmb) {
conn->rmb_desc = buf_desc; conn->rmb_desc = buf_desc;
conn->rmbe_size_short = bufsize_short; conn->rmbe_size_comp = bufsize_comp;
smc->sk.sk_rcvbuf = bufsize; smc->sk.sk_rcvbuf = bufsize * 2;
atomic_set(&conn->bytes_to_rcv, 0); atomic_set(&conn->bytes_to_rcv, 0);
conn->rmbe_update_limit = conn->rmbe_update_limit =
smc_rmb_wnd_update_limit(buf_desc->len); smc_rmb_wnd_update_limit(buf_desc->len);
...@@ -2386,7 +2385,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) ...@@ -2386,7 +2385,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
} else { } else {
conn->sndbuf_desc = buf_desc; conn->sndbuf_desc = buf_desc;
smc->sk.sk_sndbuf = bufsize; smc->sk.sk_sndbuf = bufsize * 2;
atomic_set(&conn->sndbuf_space, bufsize); atomic_set(&conn->sndbuf_space, bufsize);
} }
return 0; return 0;
......
...@@ -21,6 +21,10 @@ ...@@ -21,6 +21,10 @@
static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_sndbuf = SMC_BUF_MIN_SIZE;
static int min_rcvbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE;
static int max_sndbuf = INT_MAX / 2;
static int max_rcvbuf = INT_MAX / 2;
static const int net_smc_wmem_init = (64 * 1024);
static const int net_smc_rmem_init = (64 * 1024);
static struct ctl_table smc_table[] = { static struct ctl_table smc_table[] = {
{ {
...@@ -53,6 +57,7 @@ static struct ctl_table smc_table[] = { ...@@ -53,6 +57,7 @@ static struct ctl_table smc_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_dointvec_minmax,
.extra1 = &min_sndbuf, .extra1 = &min_sndbuf,
.extra2 = &max_sndbuf,
}, },
{ {
.procname = "rmem", .procname = "rmem",
...@@ -61,6 +66,7 @@ static struct ctl_table smc_table[] = { ...@@ -61,6 +66,7 @@ static struct ctl_table smc_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_dointvec_minmax,
.extra1 = &min_rcvbuf, .extra1 = &min_rcvbuf,
.extra2 = &max_rcvbuf,
}, },
{ } { }
}; };
...@@ -88,8 +94,8 @@ int __net_init smc_sysctl_net_init(struct net *net) ...@@ -88,8 +94,8 @@ int __net_init smc_sysctl_net_init(struct net *net)
net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS;
net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME;
WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init);
WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init);
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment