Commit b67fd3d9 authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'net-inet-retire-port-only-listening_hash'

Martin KaFai Lau says:

====================
net: inet: Retire port only listening_hash

This series is to retire the port only listening_hash.

The listen sk is currently stored in two hash tables,
listening_hash (hashed by port) and lhash2 (hashed by port and address).

After commit 0ee58dad ("net: tcp6: prefer listeners bound to an address")
and commit d9fbc7f6 ("net: tcp: prefer listeners bound to an address"),
the TCP-SYN lookup fast path does not use listening_hash.

The commit 05c0b357 ("tcp: seq_file: Replace listening_hash with lhash2")
also moved the seq_file (/proc/net/tcp) iteration usage from
listening_hash to lhash2.

There are still a few listening_hash usages left.
One of them is inet_reuseport_add_sock() which uses the listening_hash
to search a listen sk during the listen() system call.  This turns
out to be very slow on use cases that listen on many different
VIPs at a popular port (e.g. 443).  [ On top of the slowness in
adding to the tail in the IPv6 case ]. A latter patch has a
selftest to demonstrate this case.

This series takes this chance to move all remaining listening_hash
usages to lhash2 and then retire listening_hash.
====================

Link: https://lore.kernel.org/r/20220512000546.188616-1-kafai@fb.comSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 0c1822d9 ec8cb4f6
......@@ -66,7 +66,6 @@ struct inet_connection_sock_af_ops {
* @icsk_ulp_ops Pluggable ULP control hook
* @icsk_ulp_data ULP private data
* @icsk_clean_acked Clean acked data hook
* @icsk_listen_portaddr_node hash to the portaddr listener hashtable
* @icsk_ca_state: Congestion control state
* @icsk_retransmits: Number of unrecovered [RTO] timeouts
* @icsk_pending: Scheduled timer event
......@@ -96,7 +95,6 @@ struct inet_connection_sock {
const struct tcp_ulp_ops *icsk_ulp_ops;
void __rcu *icsk_ulp_data;
void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
struct hlist_node icsk_listen_portaddr_node;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state:5,
icsk_ca_initialized:1,
......
......@@ -111,11 +111,7 @@ struct inet_bind_hashbucket {
#define LISTENING_NULLS_BASE (1U << 29)
struct inet_listen_hashbucket {
spinlock_t lock;
unsigned int count;
union {
struct hlist_head head;
struct hlist_nulls_head nulls_head;
};
struct hlist_nulls_head nulls_head;
};
/* This is for listening sockets, thus all sockets which possess wildcards. */
......@@ -143,32 +139,8 @@ struct inet_hashinfo {
/* The 2nd listener table hashed by local port and address */
unsigned int lhash2_mask;
struct inet_listen_hashbucket *lhash2;
/* All the above members are written once at bootup and
* never written again _or_ are predominantly read-access.
*
* Now align to a new cache line as all the following members
* might be often dirty.
*/
/* All sockets in TCP_LISTEN state will be in listening_hash.
* This is the only table where wildcard'd TCP sockets can
* exist. listening_hash is only hashed by local port number.
* If lhash2 is initialized, the same socket will also be hashed
* to lhash2 by port and address.
*/
struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE]
____cacheline_aligned_in_smp;
};
#define inet_lhash2_for_each_icsk_continue(__icsk) \
hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node)
#define inet_lhash2_for_each_icsk(__icsk, list) \
hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node)
#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
static inline struct inet_listen_hashbucket *
inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
{
......@@ -230,23 +202,11 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum);
/* These can have wildcards, don't try too hard. */
static inline u32 inet_lhashfn(const struct net *net, const unsigned short num)
{
return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1);
}
static inline int inet_sk_listen_hashfn(const struct sock *sk)
{
return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num);
}
/* Caller must disable local BH processing. */
int __inet_inherit_port(const struct sock *sk, struct sock *child);
void inet_put_port(struct sock *sk);
void inet_hashinfo_init(struct inet_hashinfo *h);
void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
unsigned long numentries, int scale,
unsigned long low_limit,
......
......@@ -1110,7 +1110,6 @@ static int __init dccp_init(void)
BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
sizeof_field(struct sk_buff, cb));
inet_hashinfo_init(&dccp_hashinfo);
rc = inet_hashinfo2_init_mod(&dccp_hashinfo);
if (rc)
goto out_fail;
......
......@@ -1028,12 +1028,13 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
struct inet_listen_hashbucket *ilb;
struct hlist_nulls_node *node;
num = 0;
ilb = &hashinfo->listening_hash[i];
ilb = &hashinfo->lhash2[i];
spin_lock(&ilb->lock);
sk_nulls_for_each(sk, node, &ilb->nulls_head) {
struct inet_sock *inet = inet_sk(sk);
......
......@@ -193,42 +193,6 @@ inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
return inet_lhash2_bucket(h, hash);
}
static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
{
struct inet_listen_hashbucket *ilb2;
if (!h->lhash2)
return;
ilb2 = inet_lhash2_bucket_sk(h, sk);
spin_lock(&ilb2->lock);
if (sk->sk_reuseport && sk->sk_family == AF_INET6)
hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
&ilb2->head);
else
hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
&ilb2->head);
ilb2->count++;
spin_unlock(&ilb2->lock);
}
static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
{
struct inet_listen_hashbucket *ilb2;
if (!h->lhash2 ||
WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
return;
ilb2 = inet_lhash2_bucket_sk(h, sk);
spin_lock(&ilb2->lock);
hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
ilb2->count--;
spin_unlock(&ilb2->lock);
}
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
const int dif, const int sdif)
......@@ -282,12 +246,11 @@ static struct sock *inet_lhash2_lookup(struct net *net,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
struct hlist_nulls_node *node;
int score, hiscore = 0;
inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
sk = (struct sock *)icsk;
sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
result = lookup_reuseport(net, sk, skb, doff,
......@@ -633,7 +596,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
int __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
struct inet_listen_hashbucket *ilb2;
int err = 0;
if (sk->sk_state != TCP_LISTEN) {
......@@ -643,25 +606,23 @@ int __inet_hash(struct sock *sk, struct sock *osk)
return 0;
}
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
spin_lock(&ilb->lock);
spin_lock(&ilb2->lock);
if (sk->sk_reuseport) {
err = inet_reuseport_add_sock(sk, ilb);
err = inet_reuseport_add_sock(sk, ilb2);
if (err)
goto unlock;
}
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
sk->sk_family == AF_INET6)
__sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head);
__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
else
__sk_nulls_add_node_rcu(sk, &ilb->nulls_head);
inet_hash2(hashinfo, sk);
ilb->count++;
__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
sock_set_flag(sk, SOCK_RCU_FREE);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
spin_unlock(&ilb->lock);
spin_unlock(&ilb2->lock);
return err;
}
......@@ -678,23 +639,6 @@ int inet_hash(struct sock *sk)
}
EXPORT_SYMBOL_GPL(inet_hash);
static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb)
{
if (sk_unhashed(sk))
return;
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_stop_listen_sock(sk);
if (ilb) {
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
inet_unhash2(hashinfo, sk);
ilb->count--;
}
__sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
void inet_unhash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
......@@ -703,20 +647,34 @@ void inet_unhash(struct sock *sk)
return;
if (sk->sk_state == TCP_LISTEN) {
struct inet_listen_hashbucket *ilb;
struct inet_listen_hashbucket *ilb2;
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
/* Don't disable bottom halves while acquiring the lock to
* avoid circular locking dependency on PREEMPT_RT.
*/
spin_lock(&ilb->lock);
__inet_unhash(sk, ilb);
spin_unlock(&ilb->lock);
spin_lock(&ilb2->lock);
if (sk_unhashed(sk)) {
spin_unlock(&ilb2->lock);
return;
}
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_stop_listen_sock(sk);
__sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock(&ilb2->lock);
} else {
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock_bh(lock);
__inet_unhash(sk, NULL);
if (sk_unhashed(sk)) {
spin_unlock_bh(lock);
return;
}
__sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(lock);
}
}
......@@ -874,29 +832,14 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
}
EXPORT_SYMBOL_GPL(inet_hash_connect);
void inet_hashinfo_init(struct inet_hashinfo *h)
{
int i;
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head,
i + LISTENING_NULLS_BASE);
h->listening_hash[i].count = 0;
}
h->lhash2 = NULL;
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
int i;
for (i = 0; i <= h->lhash2_mask; i++) {
spin_lock_init(&h->lhash2[i].lock);
INIT_HLIST_HEAD(&h->lhash2[i].head);
h->lhash2[i].count = 0;
INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
i + LISTENING_NULLS_BASE);
}
}
......
......@@ -4595,7 +4595,6 @@ void __init tcp_init(void)
timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
inet_hashinfo_init(&tcp_hashinfo);
inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
thash_entries, 21, /* one slot per 2 MB*/
0, 64 * 1024);
......
......@@ -2283,16 +2283,15 @@ static void *listening_get_first(struct seq_file *seq)
st->offset = 0;
for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
struct inet_listen_hashbucket *ilb2;
struct inet_connection_sock *icsk;
struct hlist_nulls_node *node;
struct sock *sk;
ilb2 = &tcp_hashinfo.lhash2[st->bucket];
if (hlist_empty(&ilb2->head))
if (hlist_nulls_empty(&ilb2->nulls_head))
continue;
spin_lock(&ilb2->lock);
inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
sk = (struct sock *)icsk;
sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
if (seq_sk_match(seq, sk))
return sk;
}
......@@ -2311,15 +2310,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
{
struct tcp_iter_state *st = seq->private;
struct inet_listen_hashbucket *ilb2;
struct inet_connection_sock *icsk;
struct hlist_nulls_node *node;
struct sock *sk = cur;
++st->num;
++st->offset;
icsk = inet_csk(sk);
inet_lhash2_for_each_icsk_continue(icsk) {
sk = (struct sock *)icsk;
sk = sk_nulls_next(sk);
sk_nulls_for_each_from(sk, node) {
if (seq_sk_match(seq, sk))
return sk;
}
......@@ -2728,16 +2726,15 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
{
struct bpf_tcp_iter_state *iter = seq->private;
struct tcp_iter_state *st = &iter->state;
struct inet_connection_sock *icsk;
struct hlist_nulls_node *node;
unsigned int expected = 1;
struct sock *sk;
sock_hold(start_sk);
iter->batch[iter->end_sk++] = start_sk;
icsk = inet_csk(start_sk);
inet_lhash2_for_each_icsk_continue(icsk) {
sk = (struct sock *)icsk;
sk = sk_nulls_next(start_sk);
sk_nulls_for_each_from(sk, node) {
if (seq_sk_match(seq, sk)) {
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
......
......@@ -138,12 +138,11 @@ static struct sock *inet6_lhash2_lookup(struct net *net,
const __be16 sport, const struct in6_addr *daddr,
const unsigned short hnum, const int dif, const int sdif)
{
struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
struct hlist_nulls_node *node;
int score, hiscore = 0;
inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
sk = (struct sock *)icsk;
sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
result = lookup_reuseport(net, sk, skb, doff,
......
......@@ -83,13 +83,13 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
struct net *net = sock_net(skb->sk);
int i;
for (i = diag_ctx->l_slot; i < INET_LHTABLE_SIZE; i++) {
for (i = diag_ctx->l_slot; i <= tcp_hashinfo.lhash2_mask; i++) {
struct inet_listen_hashbucket *ilb;
struct hlist_nulls_node *node;
struct sock *sk;
int num = 0;
ilb = &tcp_hashinfo.listening_hash[i];
ilb = &tcp_hashinfo.lhash2[i];
rcu_read_lock();
spin_lock(&ilb->lock);
......
......@@ -38,6 +38,7 @@ TEST_PROGS += srv6_end_dt6_l3vpn_test.sh
TEST_PROGS += vrf_strict_mode_test.sh
TEST_PROGS += arp_ndisc_evict_nocarrier.sh
TEST_PROGS += ndisc_unsolicited_na_test.sh
TEST_PROGS += stress_reuseport_listen.sh
TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh
TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh
TEST_GEN_FILES = socket nettest
......@@ -56,6 +57,7 @@ TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
TEST_GEN_FILES += toeplitz
TEST_GEN_FILES += cmsg_sender
TEST_GEN_FILES += stress_reuseport_listen
TEST_PROGS += test_vxlan_vnifiltering.sh
TEST_FILES := settings
......
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
/* Test listening on the same port 443 with multiple VIPS.
* Each VIP:443 will have multiple sk listening on by using
* SO_REUSEPORT.
*/
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <error.h>
#include <errno.h>
#include <time.h>
#include <arpa/inet.h>
#define IP6_LADDR_START "2401:dead::1"
#define IP6_LPORT 443
#define NSEC_PER_SEC 1000000000L
#define NSEC_PER_USEC 1000L
static unsigned int nr_socks_per_vip;
static unsigned int nr_vips;
static int *bind_reuseport_sock6(void)
{
int *lfds, *cur_fd, err, optvalue = 1;
struct sockaddr_in6 sa6 = {};
unsigned int i, j;
sa6.sin6_family = AF_INET6;
sa6.sin6_port = htons(IP6_LPORT);
err = inet_pton(AF_INET6, IP6_LADDR_START, &sa6.sin6_addr);
if (err != 1)
error(1, err, "inet_pton(%s)", IP6_LADDR_START);
lfds = malloc(nr_vips * nr_socks_per_vip * sizeof(lfds[0]));
if (!lfds)
error(1, errno, "cannot alloc array of lfds");
cur_fd = lfds;
for (i = 0; i < nr_vips; i++) {
for (j = 0; j < nr_socks_per_vip; j++) {
*cur_fd = socket(AF_INET6, SOCK_STREAM, 0);
if (*cur_fd == -1)
error(1, errno,
"lfds[%u,%u] = socket(AF_INET6)", i, j);
err = setsockopt(*cur_fd, SOL_SOCKET, SO_REUSEPORT,
&optvalue, sizeof(optvalue));
if (err)
error(1, errno,
"setsockopt(lfds[%u,%u], SO_REUSEPORT)",
i, j);
err = bind(*cur_fd, (struct sockaddr *)&sa6,
sizeof(sa6));
if (err)
error(1, errno, "bind(lfds[%u,%u])", i, j);
cur_fd++;
}
sa6.sin6_addr.s6_addr32[3]++;
}
return lfds;
}
int main(int argc, const char *argv[])
{
struct timespec start_ts, end_ts;
unsigned long start_ns, end_ns;
unsigned int nr_lsocks;
int *lfds, i, err;
if (argc != 3 || atoi(argv[1]) <= 0 || atoi(argv[2]) <= 0)
error(1, 0, "Usage: %s <nr_vips> <nr_socks_per_vip>\n",
argv[0]);
nr_vips = atoi(argv[1]);
nr_socks_per_vip = atoi(argv[2]);
nr_lsocks = nr_vips * nr_socks_per_vip;
lfds = bind_reuseport_sock6();
clock_gettime(CLOCK_MONOTONIC, &start_ts);
for (i = 0; i < nr_lsocks; i++) {
err = listen(lfds[i], 0);
if (err)
error(1, errno, "listen(lfds[%d])", i);
}
clock_gettime(CLOCK_MONOTONIC, &end_ts);
start_ns = start_ts.tv_sec * NSEC_PER_SEC + start_ts.tv_nsec;
end_ns = end_ts.tv_sec * NSEC_PER_SEC + end_ts.tv_nsec;
printf("listen %d socks took %lu.%lu\n", nr_lsocks,
(end_ns - start_ns) / NSEC_PER_SEC,
(end_ns - start_ns) / NSEC_PER_USEC);
for (i = 0; i < nr_lsocks; i++)
close(lfds[i]);
free(lfds);
return 0;
}
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
NS='stress_reuseport_listen_ns'
NR_FILES=24100
SAVED_NR_FILES=$(ulimit -n)
setup() {
ip netns add $NS
ip netns exec $NS sysctl -q -w net.ipv6.ip_nonlocal_bind=1
ulimit -n $NR_FILES
}
cleanup() {
ip netns del $NS
ulimit -n $SAVED_NR_FILES
}
trap cleanup EXIT
setup
# 300 different vips listen on port 443
# Each vip:443 sockaddr has 80 LISTEN sock by using SO_REUSEPORT
# Total 24000 listening socks
ip netns exec $NS ./stress_reuseport_listen 300 80
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment