Commit 4b01a967 authored by Kuniyuki Iwashima's avatar Kuniyuki Iwashima Committed by David S. Miller

tcp: bind(0) remove the SO_REUSEADDR restriction when ephemeral ports are exhausted.

Commit aacd9289 ("tcp: bind() use stronger
condition for bind_conflict") introduced a restriction to forbid to bind
SO_REUSEADDR enabled sockets to the same (addr, port) tuple in order to
assign ports dispersedly so that we can connect to the same remote host.

The change results in accelerating port depletion so that we fail to bind
sockets to the same local port even if we want to connect to the different
remote hosts.

You can reproduce this issue by following instructions below.

  1. # sysctl -w net.ipv4.ip_local_port_range="32768 32768"
  2. set SO_REUSEADDR to two sockets.
  3. bind two sockets to (localhost, 0) and the latter fails.

Therefore, when ephemeral ports are exhausted, bind(0) should fallback to
the legacy behaviour to enable the SO_REUSEADDR option and make it possible
to connect to different remote (addr, port) tuples.

This patch allows us to bind SO_REUSEADDR enabled sockets to the same
(addr, port) only when net.ipv4.ip_autobind_reuse is set 1 and all
ephemeral ports are exhausted. This also allows connect() and listen() to
share ports in the following way and may break some applications. So the
ip_autobind_reuse is 0 by default and disables the feature.

  1. setsockopt(sk1, SO_REUSEADDR)
  2. setsockopt(sk2, SO_REUSEADDR)
  3. bind(sk1, saddr, 0)
  4. bind(sk2, saddr, 0)
  5. connect(sk1, daddr)
  6. listen(sk2)

If it is set 1, we can fully utilize the 4-tuples, but we should use
IP_BIND_ADDRESS_NO_PORT for bind()+connect() as possible.

The notable thing is that if all sockets bound to the same port have
both SO_REUSEADDR and SO_REUSEPORT enabled, we can bind sockets to an
ephemeral port and also do listen().
Signed-off-by: default avatarKuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 16f6c251
...@@ -958,6 +958,15 @@ ip_nonlocal_bind - BOOLEAN ...@@ -958,6 +958,15 @@ ip_nonlocal_bind - BOOLEAN
which can be quite useful - but may break some applications. which can be quite useful - but may break some applications.
Default: 0 Default: 0
ip_autobind_reuse - BOOLEAN
By default, bind() does not select the ports automatically even if
the new socket and all sockets bound to the port have SO_REUSEADDR.
ip_autobind_reuse allows bind() to reuse the port and this is useful
when you use bind()+connect(), but may break some applications.
The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this
option should only be set by experts.
Default: 0
ip_dynaddr - BOOLEAN ip_dynaddr - BOOLEAN
If set non-zero, enables support for dynamic addresses. If set non-zero, enables support for dynamic addresses.
If set to a non-zero value larger than 1, a kernel log If set to a non-zero value larger than 1, a kernel log
......
...@@ -101,6 +101,7 @@ struct netns_ipv4 { ...@@ -101,6 +101,7 @@ struct netns_ipv4 {
int sysctl_ip_fwd_use_pmtu; int sysctl_ip_fwd_use_pmtu;
int sysctl_ip_fwd_update_priority; int sysctl_ip_fwd_update_priority;
int sysctl_ip_nonlocal_bind; int sysctl_ip_nonlocal_bind;
int sysctl_ip_autobind_reuse;
/* Shall we try to damage output packets if routing dev changes? */ /* Shall we try to damage output packets if routing dev changes? */
int sysctl_ip_dynaddr; int sysctl_ip_dynaddr;
int sysctl_ip_early_demux; int sysctl_ip_early_demux;
......
...@@ -174,12 +174,14 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int * ...@@ -174,12 +174,14 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
int port = 0; int port = 0;
struct inet_bind_hashbucket *head; struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
bool relax = false;
int i, low, high, attempt_half; int i, low, high, attempt_half;
struct inet_bind_bucket *tb; struct inet_bind_bucket *tb;
u32 remaining, offset; u32 remaining, offset;
int l3mdev; int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk); l3mdev = inet_sk_bound_l3mdev(sk);
ports_exhausted:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan: other_half_scan:
inet_get_local_port_range(net, &low, &high); inet_get_local_port_range(net, &low, &high);
...@@ -217,7 +219,7 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int * ...@@ -217,7 +219,7 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
inet_bind_bucket_for_each(tb, &head->chain) inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port) { tb->port == port) {
if (!inet_csk_bind_conflict(sk, tb, false, false)) if (!inet_csk_bind_conflict(sk, tb, relax, false))
goto success; goto success;
goto next_port; goto next_port;
} }
...@@ -237,6 +239,12 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int * ...@@ -237,6 +239,12 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
attempt_half = 2; attempt_half = 2;
goto other_half_scan; goto other_half_scan;
} }
if (net->ipv4.sysctl_ip_autobind_reuse && !relax) {
/* We still have a chance to connect to different destinations */
relax = true;
goto ports_exhausted;
}
return NULL; return NULL;
success: success:
*port_ret = port; *port_ret = port;
......
...@@ -763,6 +763,15 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -763,6 +763,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "ip_autobind_reuse",
.data = &init_net.ipv4.sysctl_ip_autobind_reuse,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{ {
.procname = "fwmark_reflect", .procname = "fwmark_reflect",
.data = &init_net.ipv4.sysctl_fwmark_reflect, .data = &init_net.ipv4.sysctl_fwmark_reflect,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment