Commit 3f17e16f authored by Jakub Kicinski

Merge branch 'add-ip_local_port_range-socket-option'

Jakub Sitnicki says:

====================
Add IP_LOCAL_PORT_RANGE socket option

This patch set is a follow up to the "How to share IPv4 addresses by
partitioning the port space" talk given at LPC 2022 [1].

Please see patch #1 for the motivation & the use case description.
Patch #2 adds tests exercising the new option in various scenarios.

Documentation
-------------

Proposed update to the ip(7) man-page:

       IP_LOCAL_PORT_RANGE (since Linux X.Y)
              Set or get the per-socket default local  port  range.  This
              option  can  be  used  to  clamp down the global local port
              range, defined by the ip_local_port_range  /proc  interface
              described below, for a given socket.

              The  option  takes  an uint32_t value with the high 16 bits
              set to the upper range bound, and the low 16  bits  set  to
              the  lower  range  bound.  Range  bounds are inclusive. The
              16-bit values should be in host byte order.

              The lower bound must not be greater than the upper bound when
              both  bounds  are  not  zero. Otherwise, setting the option
              fails with EINVAL.

              If either bound is outside of the global local port  range,
              or is zero, then that bound has no effect.

              To  reset  the setting, pass zero as both the upper and the
              lower bound.

Interaction with SELinux bind() hook
------------------------------------

SELinux bind() hook - selinux_socket_bind() - performs a permission check
if the requested local port number lies outside of the netns ephemeral port
range.

The proposed socket option cannot be used to change the ephemeral port range
to extend beyond the per-netns port range, as set by
net.ipv4.ip_local_port_range.

Hence, there is no interaction with SELinux, AFAICT.

RFC -> v1
RFC: https://lore.kernel.org/netdev/20220912225308.93659-1-jakub@cloudflare.com/

 * Allow either the high bound or the low bound, or both, to be zero
 * Add getsockopt support
 * Add selftests

Links:
------

[1]: https://lpc.events/event/16/contributions/1349/
====================

Link: https://lore.kernel.org/r/20221221-sockopt-port-range-v6-0-be255cc0e51f@cloudflare.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 6a7a2c18 ae543965
......@@ -249,6 +249,10 @@ struct inet_sock {
__be32 mc_addr;
struct ip_mc_socklist __rcu *mc_list;
struct inet_cork_full cork;
struct {
__u16 lo;
__u16 hi;
} local_port_range;
};
#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
......
......@@ -340,7 +340,8 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o
} \
}
void inet_get_local_port_range(struct net *net, int *low, int *high);
void inet_get_local_port_range(const struct net *net, int *low, int *high);
void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
#ifdef CONFIG_SYSCTL
static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
......
......@@ -162,6 +162,7 @@ struct in_addr {
#define MCAST_MSFILTER 48
#define IP_MULTICAST_ALL 49
#define IP_UNICAST_IF 50
#define IP_LOCAL_PORT_RANGE 51
#define MCAST_EXCLUDE 0
#define MCAST_INCLUDE 1
......
......@@ -117,7 +117,7 @@ bool inet_rcv_saddr_any(const struct sock *sk)
return !sk->sk_rcv_saddr;
}
void inet_get_local_port_range(struct net *net, int *low, int *high)
void inet_get_local_port_range(const struct net *net, int *low, int *high)
{
unsigned int seq;
......@@ -130,6 +130,27 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
}
EXPORT_SYMBOL(inet_get_local_port_range);
/*
 * Compute the local port range to use for @sk and store it in @low/@high
 * (both inclusive).
 *
 * The starting point is the range of the socket's network namespace. Each
 * per-socket bound (set with IP_LOCAL_PORT_RANGE) replaces the matching
 * netns bound only when it falls inside the range computed so far; a bound
 * outside of it is ignored, so the result never extends past the netns
 * range.
 */
void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
{
	const struct inet_sock *inet = inet_sk(sk);
	int range_min, range_max;
	int req_min, req_max;

	inet_get_local_port_range(sock_net(sk), &range_min, &range_max);

	req_min = inet->local_port_range.lo;
	req_max = inet->local_port_range.hi;

	/* Raise the lower bound if the socket asks for one within range. */
	if (unlikely(req_min >= range_min && req_min <= range_max))
		range_min = req_min;

	/* The upper bound is checked against the (possibly raised) lower one. */
	if (unlikely(req_max >= range_min && req_max <= range_max))
		range_max = req_max;

	*low = range_min;
	*high = range_max;
}
EXPORT_SYMBOL(inet_sk_get_local_port_range);
static bool inet_use_bhash2_on_bind(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
......@@ -316,7 +337,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
ports_exhausted:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
inet_sk_get_local_port_range(sk, &low, &high);
high++; /* [32768, 60999] -> [32768, 61000[ */
if (high - low < 4)
attempt_half = 0;
......
......@@ -1016,7 +1016,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
l3mdev = inet_sk_bound_l3mdev(sk);
inet_get_local_port_range(net, &low, &high);
inet_sk_get_local_port_range(sk, &low, &high);
high++; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low;
if (likely(remaining > 1))
......
......@@ -923,6 +923,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
case IP_CHECKSUM:
case IP_RECVFRAGSIZE:
case IP_RECVERR_RFC4884:
case IP_LOCAL_PORT_RANGE:
if (optlen >= sizeof(int)) {
if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
......@@ -1365,6 +1366,20 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
WRITE_ONCE(inet->min_ttl, val);
break;
case IP_LOCAL_PORT_RANGE:
{
const __u16 lo = val;
const __u16 hi = val >> 16;
if (optlen != sizeof(__u32))
goto e_inval;
if (lo != 0 && hi != 0 && lo > hi)
goto e_inval;
inet->local_port_range.lo = lo;
inet->local_port_range.hi = hi;
break;
}
default:
err = -ENOPROTOOPT;
break;
......@@ -1743,6 +1758,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MINTTL:
val = inet->min_ttl;
break;
case IP_LOCAL_PORT_RANGE:
val = inet->local_port_range.hi << 16 | inet->local_port_range.lo;
break;
default:
sockopt_release_sock(sk);
return -ENOPROTOOPT;
......
......@@ -248,7 +248,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
int low, high, remaining;
unsigned int rand;
inet_get_local_port_range(net, &low, &high);
inet_sk_get_local_port_range(sk, &low, &high);
remaining = (high - low) + 1;
rand = get_random_u32();
......
......@@ -8322,7 +8322,7 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
int low, high, remaining, index;
unsigned int rover;
inet_get_local_port_range(net, &low, &high);
inet_sk_get_local_port_range(sk, &low, &high);
remaining = (high - low) + 1;
rover = get_random_u32_below(remaining) + low;
......
......@@ -45,6 +45,7 @@ TEST_PROGS += arp_ndisc_untracked_subnets.sh
TEST_PROGS += stress_reuseport_listen.sh
TEST_PROGS += l2_tos_ttl_inherit.sh
TEST_PROGS += bind_bhash.sh
TEST_PROGS += ip_local_port_range.sh
TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh
TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh
TEST_GEN_FILES = socket nettest
......@@ -76,6 +77,7 @@ TEST_PROGS += sctp_vrf.sh
TEST_GEN_FILES += sctp_hello
TEST_GEN_FILES += csum
TEST_GEN_FILES += nat6to4.o
TEST_GEN_FILES += ip_local_port_range
TEST_FILES := settings
......
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2023 Cloudflare
/* Test IP_LOCAL_PORT_RANGE socket option: IPv4 + IPv6, TCP + UDP + SCTP.
*
* Tests assume that net.ipv4.ip_local_port_range is [40000, 49999].
* Don't run these directly but with ip_local_port_range.sh script.
*/
#include <fcntl.h>
#include <netinet/ip.h>
#include "../kselftest_harness.h"
#ifndef IP_LOCAL_PORT_RANGE
#define IP_LOCAL_PORT_RANGE 51
#endif
/* Pack a port range into the 32-bit IP_LOCAL_PORT_RANGE option value:
 * upper bound in the high 16 bits, lower bound in the low 16 bits.
 *
 * The bounds are widened to __u32 before shifting. A __u16 operand would be
 * promoted to signed int, and shifting a value >= 0x8000 left by 16 then
 * overflows into the sign bit, which is undefined behaviour in C.
 */
static __u32 pack_port_range(__u16 lo, __u16 hi)
{
	return ((__u32)hi << 16) | (__u32)lo;
}
/* Split a packed IP_LOCAL_PORT_RANGE value into its two 16-bit bounds:
 * the low half goes to *lo, the high half to *hi.
 */
static void unpack_port_range(__u32 range, __u16 *lo, __u16 *hi)
{
	*lo = (__u16)range;
	*hi = (__u16)(range >> 16);
}
/* Return the socket's domain as reported by SO_DOMAIN, or -1 if the
 * getsockopt() call fails.
 */
static int get_so_domain(int fd)
{
	int domain;
	socklen_t len = sizeof(domain);

	if (getsockopt(fd, SOL_SOCKET, SO_DOMAIN, &domain, &len) != 0)
		return -1;

	return domain;
}
/* Bind fd to the loopback address of its own address family with port 0,
 * so that the kernel picks the local port.
 *
 * Returns the result of bind(), or -1 if the socket's domain is neither
 * AF_INET nor AF_INET6 (including when it cannot be queried).
 */
static int bind_to_loopback_any_port(int fd)
{
	union {
		struct sockaddr sa;
		struct sockaddr_in v4;
		struct sockaddr_in6 v6;
	} addr;
	socklen_t addr_len;
	int domain;

	memset(&addr, 0, sizeof(addr));
	domain = get_so_domain(fd);

	if (domain == AF_INET) {
		addr.v4.sin_family = AF_INET;
		addr.v4.sin_port = htons(0);
		addr.v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		addr_len = sizeof(addr.v4);
	} else if (domain == AF_INET6) {
		addr.v6.sin6_family = AF_INET6;
		addr.v6.sin6_port = htons(0);
		addr.v6.sin6_addr = in6addr_loopback;
		addr_len = sizeof(addr.v6);
	} else {
		return -1;
	}

	return bind(fd, &addr.sa, addr_len);
}
/* Return the local port of fd in host byte order, as reported by
 * getsockname().
 *
 * Returns -1 if getsockname() fails. For an address family other than
 * AF_INET / AF_INET6, sets errno to EAFNOSUPPORT and returns -1.
 */
static int get_sock_port(int fd)
{
	union {
		struct sockaddr sa;
		struct sockaddr_in v4;
		struct sockaddr_in6 v6;
	} addr;
	socklen_t addr_len = sizeof(addr);

	memset(&addr, 0, sizeof(addr));
	if (getsockname(fd, &addr.sa, &addr_len) != 0)
		return -1;

	if (addr.sa.sa_family == AF_INET)
		return ntohs(addr.v4.sin_port);
	if (addr.sa.sa_family == AF_INET6)
		return ntohs(addr.v6.sin6_port);

	errno = EAFNOSUPPORT;
	return -1;
}
/* Read the socket's IP_LOCAL_PORT_RANGE option into *range.
 *
 * Returns 0 on success. On getsockopt() failure returns -1 and leaves
 * *range untouched.
 */
static int get_ip_local_port_range(int fd, __u32 *range)
{
	__u32 val;
	socklen_t len = sizeof(val);

	if (getsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &val, &len) != 0)
		return -1;

	*range = val;
	return 0;
}
/* Empty fixture: the tests share no state, so setup and teardown have
 * nothing to do. Each test opens and closes its own sockets.
 */
FIXTURE(ip_local_port_range) {};
FIXTURE_SETUP(ip_local_port_range)
{
}
FIXTURE_TEARDOWN(ip_local_port_range)
{
}
/* Per-variant parameters: the three arguments the tests pass to socket(). */
FIXTURE_VARIANT(ip_local_port_range) {
int so_domain;
int so_type;
int so_protocol;
};
/* IPv4 stream socket, default protocol */
FIXTURE_VARIANT_ADD(ip_local_port_range, ip4_tcp) {
.so_domain = AF_INET,
.so_type = SOCK_STREAM,
.so_protocol = 0,
};
/* IPv4 datagram socket, default protocol */
FIXTURE_VARIANT_ADD(ip_local_port_range, ip4_udp) {
.so_domain = AF_INET,
.so_type = SOCK_DGRAM,
.so_protocol = 0,
};
/* IPv4 stream socket using SCTP (variant name is spelled "stcp") */
FIXTURE_VARIANT_ADD(ip_local_port_range, ip4_stcp) {
.so_domain = AF_INET,
.so_type = SOCK_STREAM,
.so_protocol = IPPROTO_SCTP,
};
/* IPv6 stream socket, default protocol */
FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_tcp) {
.so_domain = AF_INET6,
.so_type = SOCK_STREAM,
.so_protocol = 0,
};
/* IPv6 datagram socket, default protocol */
FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_udp) {
.so_domain = AF_INET6,
.so_type = SOCK_DGRAM,
.so_protocol = 0,
};
/* IPv6 stream socket using SCTP (variant name is spelled "stcp") */
FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_stcp) {
.so_domain = AF_INET6,
.so_type = SOCK_STREAM,
.so_protocol = IPPROTO_SCTP,
};
/* setsockopt(IP_LOCAL_PORT_RANGE) is expected to fail with EINVAL when the
 * option value is not exactly 4 bytes long, or when the low bound is
 * greater than the high bound.
 */
TEST_F(ip_local_port_range, invalid_option_value)
{
__u16 val16;
__u32 val32;
__u64 val64;
int fd, err;
fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
ASSERT_GE(fd, 0) TH_LOG("socket failed");
/* Too few bytes */
val16 = 40000;
err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &val16, sizeof(val16));
EXPECT_TRUE(err) TH_LOG("expected setsockopt(IP_LOCAL_PORT_RANGE) to fail");
EXPECT_EQ(errno, EINVAL);
/* Empty range: low port > high port */
val32 = pack_port_range(40222, 40111);
err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &val32, sizeof(val32));
EXPECT_TRUE(err) TH_LOG("expected setsockopt(IP_LOCAL_PORT_RANGE) to fail");
EXPECT_EQ(errno, EINVAL);
/* Too many bytes */
/* The packed range itself is well-formed; only the option length is wrong. */
val64 = pack_port_range(40333, 40444);
err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &val64, sizeof(val64));
EXPECT_TRUE(err) TH_LOG("expected setsockopt(IP_LOCAL_PORT_RANGE) to fail");
EXPECT_EQ(errno, EINVAL);
err = close(fd);
ASSERT_TRUE(!err) TH_LOG("close failed");
}
/* A per-socket range that lies entirely outside the netns range
 * [40000, 49999] is accepted by setsockopt() but has no effect: bind()
 * is still expected to pick a port from the netns range.
 */
TEST_F(ip_local_port_range, port_range_out_of_netns_range)
{
const struct test {
__u16 range_lo;
__u16 range_hi;
} tests[] = {
{ 30000, 39999 }, /* socket range below netns range */
{ 50000, 59999 }, /* socket range above netns range */
};
const struct test *t;
for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
/* Bind a couple of sockets, not just one, to check
* that the range wasn't clamped to a single port from
* the netns range. That is [40000, 40000] or [49999,
* 49999], respectively for each test case.
*/
int fds[2], i;
TH_LOG("lo %5hu, hi %5hu", t->range_lo, t->range_hi);
for (i = 0; i < ARRAY_SIZE(fds); i++) {
int fd, err, port;
__u32 range;
fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
ASSERT_GE(fd, 0) TH_LOG("#%d: socket failed", i);
range = pack_port_range(t->range_lo, t->range_hi);
err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
ASSERT_TRUE(!err) TH_LOG("#%d: setsockopt(IP_LOCAL_PORT_RANGE) failed", i);
err = bind_to_loopback_any_port(fd);
ASSERT_TRUE(!err) TH_LOG("#%d: bind failed", i);
/* Check that socket port range outside of ephemeral range is ignored */
port = get_sock_port(fd);
ASSERT_GE(port, 40000) TH_LOG("#%d: expected port within netns range", i);
ASSERT_LE(port, 49999) TH_LOG("#%d: expected port within netns range", i);
/* Keep the socket open so the next one cannot reuse its port. */
fds[i] = fd;
}
for (i = 0; i < ARRAY_SIZE(fds); i++)
ASSERT_TRUE(close(fds[i]) == 0) TH_LOG("#%d: close failed", i);
}
}
/* When the effective range contains exactly one port, bind() must return
 * that port. A zero bound means "unset", so the netns bound (40000 or
 * 49999) is used on that side.
 */
TEST_F(ip_local_port_range, single_port_range)
{
const struct test {
__u16 range_lo;
__u16 range_hi;
__u16 expected;
} tests[] = {
/* single port range within ephemeral range */
{ 45000, 45000, 45000 },
/* first port in the ephemeral range (clamp from above) */
{ 0, 40000, 40000 },
/* last port in the ephemeral range (clamp from below) */
{ 49999, 0, 49999 },
};
const struct test *t;
for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
int fd, err, port;
__u32 range;
TH_LOG("lo %5hu, hi %5hu, expected %5hu",
t->range_lo, t->range_hi, t->expected);
fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
ASSERT_GE(fd, 0) TH_LOG("socket failed");
range = pack_port_range(t->range_lo, t->range_hi);
err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
err = bind_to_loopback_any_port(fd);
ASSERT_TRUE(!err) TH_LOG("bind failed");
port = get_sock_port(fd);
ASSERT_EQ(port, t->expected) TH_LOG("unexpected local port");
err = close(fd);
ASSERT_TRUE(!err) TH_LOG("close failed");
}
}
/* Bind eight sockets restricted to the 8-port range [40000, 40007] and
 * check that each port is handed out exactly once, then check that a ninth
 * bind() with the same range fails with EADDRINUSE.
 */
TEST_F(ip_local_port_range, exhaust_8_port_range)
{
/* Bitmap of assigned ports: bit N is set once port 40000 + N was seen. */
__u8 port_set = 0;
int i, fd, err;
__u32 range;
__u16 port;
int fds[8];
for (i = 0; i < ARRAY_SIZE(fds); i++) {
fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
ASSERT_GE(fd, 0) TH_LOG("socket failed");
range = pack_port_range(40000, 40007);
err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
err = bind_to_loopback_any_port(fd);
ASSERT_TRUE(!err) TH_LOG("bind failed");
port = get_sock_port(fd);
ASSERT_GE(port, 40000) TH_LOG("expected port within sockopt range");
ASSERT_LE(port, 40007) TH_LOG("expected port within sockopt range");
port_set |= 1 << (port - 40000);
/* Keep the socket open so its port stays busy. */
fds[i] = fd;
}
/* Check that every port from the test range is in use */
ASSERT_EQ(port_set, 0xff) TH_LOG("expected all ports to be busy");
/* Check that bind() fails because the whole range is busy */
fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
ASSERT_GE(fd, 0) TH_LOG("socket failed");
range = pack_port_range(40000, 40007);
err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
err = bind_to_loopback_any_port(fd);
ASSERT_TRUE(err) TH_LOG("expected bind to fail");
ASSERT_EQ(errno, EADDRINUSE);
err = close(fd);
ASSERT_TRUE(!err) TH_LOG("close failed");
for (i = 0; i < ARRAY_SIZE(fds); i++) {
err = close(fds[i]);
ASSERT_TRUE(!err) TH_LOG("close failed");
}
}
/* With IP_BIND_ADDRESS_NO_PORT set, bind() is expected to leave the local
 * port unassigned (getsockname() reports 0). The port is then chosen at
 * connect() time and must fall within the socket's range [40100, 40199].
 * Skipped for the SCTP variants.
 */
TEST_F(ip_local_port_range, late_bind)
{
union {
struct sockaddr sa;
struct sockaddr_in v4;
struct sockaddr_in6 v6;
} addr;
socklen_t addr_len;
const int one = 1;
int fd, err;
__u32 range;
__u16 port;
if (variant->so_protocol == IPPROTO_SCTP)
SKIP(return, "SCTP doesn't support IP_BIND_ADDRESS_NO_PORT");
/* Protocol is hard-coded to 0: the SCTP variants were skipped above. */
fd = socket(variant->so_domain, variant->so_type, 0);
ASSERT_GE(fd, 0) TH_LOG("socket failed");
range = pack_port_range(40100, 40199);
err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
err = setsockopt(fd, SOL_IP, IP_BIND_ADDRESS_NO_PORT, &one, sizeof(one));
ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_BIND_ADDRESS_NO_PORT) failed");
err = bind_to_loopback_any_port(fd);
ASSERT_TRUE(!err) TH_LOG("bind failed");
/* No port should have been allocated by bind() yet. */
port = get_sock_port(fd);
ASSERT_EQ(port, 0) TH_LOG("getsockname failed");
/* Invalid destination */
memset(&addr, 0, sizeof(addr));
switch (variant->so_domain) {
case AF_INET:
addr.v4.sin_family = AF_INET;
addr.v4.sin_port = htons(0);
addr.v4.sin_addr.s_addr = htonl(INADDR_ANY);
addr_len = sizeof(addr.v4);
break;
case AF_INET6:
addr.v6.sin6_family = AF_INET6;
addr.v6.sin6_port = htons(0);
addr.v6.sin6_addr = in6addr_any;
addr_len = sizeof(addr.v6);
break;
default:
ASSERT_TRUE(false) TH_LOG("unsupported socket domain");
}
/* connect() doesn't need to succeed for late bind to happen */
connect(fd, &addr.sa, addr_len);
port = get_sock_port(fd);
ASSERT_GE(port, 40100);
ASSERT_LE(port, 40199);
err = close(fd);
ASSERT_TRUE(!err) TH_LOG("close failed");
}
/* getsockopt(IP_LOCAL_PORT_RANGE) round-trip: the option reads back as
 * (0, 0) on a fresh socket, returns the bounds exactly as they were set
 * (even when they extend past the netns range), and reads back as (0, 0)
 * again after being reset.
 */
TEST_F(ip_local_port_range, get_port_range)
{
__u16 lo, hi;
__u32 range;
int fd, err;
fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
ASSERT_GE(fd, 0) TH_LOG("socket failed");
/* Get range before it will be set */
err = get_ip_local_port_range(fd, &range);
ASSERT_TRUE(!err) TH_LOG("getsockopt(IP_LOCAL_PORT_RANGE) failed");
unpack_port_range(range, &lo, &hi);
ASSERT_EQ(lo, 0) TH_LOG("unexpected low port");
ASSERT_EQ(hi, 0) TH_LOG("unexpected high port");
range = pack_port_range(12345, 54321);
err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
/* Get range after it has been set */
err = get_ip_local_port_range(fd, &range);
ASSERT_TRUE(!err) TH_LOG("getsockopt(IP_LOCAL_PORT_RANGE) failed");
unpack_port_range(range, &lo, &hi);
ASSERT_EQ(lo, 12345) TH_LOG("unexpected low port");
ASSERT_EQ(hi, 54321) TH_LOG("unexpected high port");
/* Unset the port range */
range = pack_port_range(0, 0);
err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
/* Get range after it has been unset */
err = get_ip_local_port_range(fd, &range);
ASSERT_TRUE(!err) TH_LOG("getsockopt(IP_LOCAL_PORT_RANGE) failed");
unpack_port_range(range, &lo, &hi);
ASSERT_EQ(lo, 0) TH_LOG("unexpected low port");
ASSERT_EQ(hi, 0) TH_LOG("unexpected high port");
err = close(fd);
ASSERT_TRUE(!err) TH_LOG("close failed");
}
TEST_HARNESS_MAIN
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
# The test binary assumes net.ipv4.ip_local_port_range is [40000, 49999].
# Set the sysctl first and run the tests only if that succeeds. The whole
# command is handed to in_netns.sh (presumably so the sysctl change stays
# confined to a separate network namespace -- confirm against that script).
./in_netns.sh \
sh -c 'sysctl -q -w net.ipv4.ip_local_port_range="40000 49999" && ./ip_local_port_range'
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment