Commit dbca1596 authored by Eric Dumazet, committed by David S. Miller

ping: convert to RCU lookups, get rid of rwlock

Using rwlock in networking code is extremely risky.
Writers can starve if enough readers are constantly
grabbing the rwlock.

I thought rwlocks were at fault and sent this patch:

https://lkml.org/lkml/2022/6/17/272

But Peter and Linus essentially told me rwlock had to be unfair.

We need to get rid of rwlock in networking code.

Fixes: c319b4d7 ("net: ipv4: add IPPROTO_ICMP socket kind")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent f0623340
...@@ -50,7 +50,7 @@ ...@@ -50,7 +50,7 @@
struct ping_table { struct ping_table {
struct hlist_nulls_head hash[PING_HTABLE_SIZE]; struct hlist_nulls_head hash[PING_HTABLE_SIZE];
rwlock_t lock; spinlock_t lock;
}; };
static struct ping_table ping_table; static struct ping_table ping_table;
...@@ -82,7 +82,7 @@ int ping_get_port(struct sock *sk, unsigned short ident) ...@@ -82,7 +82,7 @@ int ping_get_port(struct sock *sk, unsigned short ident)
struct sock *sk2 = NULL; struct sock *sk2 = NULL;
isk = inet_sk(sk); isk = inet_sk(sk);
write_lock_bh(&ping_table.lock); spin_lock(&ping_table.lock);
if (ident == 0) { if (ident == 0) {
u32 i; u32 i;
u16 result = ping_port_rover + 1; u16 result = ping_port_rover + 1;
...@@ -128,14 +128,15 @@ int ping_get_port(struct sock *sk, unsigned short ident) ...@@ -128,14 +128,15 @@ int ping_get_port(struct sock *sk, unsigned short ident)
if (sk_unhashed(sk)) { if (sk_unhashed(sk)) {
pr_debug("was not hashed\n"); pr_debug("was not hashed\n");
sock_hold(sk); sock_hold(sk);
hlist_nulls_add_head(&sk->sk_nulls_node, hlist); sock_set_flag(sk, SOCK_RCU_FREE);
hlist_nulls_add_head_rcu(&sk->sk_nulls_node, hlist);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
} }
write_unlock_bh(&ping_table.lock); spin_unlock(&ping_table.lock);
return 0; return 0;
fail: fail:
write_unlock_bh(&ping_table.lock); spin_unlock(&ping_table.lock);
return 1; return 1;
} }
EXPORT_SYMBOL_GPL(ping_get_port); EXPORT_SYMBOL_GPL(ping_get_port);
...@@ -153,19 +154,19 @@ void ping_unhash(struct sock *sk) ...@@ -153,19 +154,19 @@ void ping_unhash(struct sock *sk)
struct inet_sock *isk = inet_sk(sk); struct inet_sock *isk = inet_sk(sk);
pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
write_lock_bh(&ping_table.lock); spin_lock(&ping_table.lock);
if (sk_hashed(sk)) { if (sk_hashed(sk)) {
hlist_nulls_del(&sk->sk_nulls_node); hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
sk_nulls_node_init(&sk->sk_nulls_node);
sock_put(sk); sock_put(sk);
isk->inet_num = 0; isk->inet_num = 0;
isk->inet_sport = 0; isk->inet_sport = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
} }
write_unlock_bh(&ping_table.lock); spin_unlock(&ping_table.lock);
} }
EXPORT_SYMBOL_GPL(ping_unhash); EXPORT_SYMBOL_GPL(ping_unhash);
/* Called under rcu_read_lock() */
static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
{ {
struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
...@@ -190,8 +191,6 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) ...@@ -190,8 +191,6 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
return NULL; return NULL;
} }
read_lock_bh(&ping_table.lock);
ping_portaddr_for_each_entry(sk, hnode, hslot) { ping_portaddr_for_each_entry(sk, hnode, hslot) {
isk = inet_sk(sk); isk = inet_sk(sk);
...@@ -230,13 +229,11 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) ...@@ -230,13 +229,11 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
sk->sk_bound_dev_if != sdif) sk->sk_bound_dev_if != sdif)
continue; continue;
sock_hold(sk);
goto exit; goto exit;
} }
sk = NULL; sk = NULL;
exit: exit:
read_unlock_bh(&ping_table.lock);
return sk; return sk;
} }
...@@ -588,7 +585,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) ...@@ -588,7 +585,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
sk->sk_err = err; sk->sk_err = err;
sk_error_report(sk); sk_error_report(sk);
out: out:
sock_put(sk); return;
} }
EXPORT_SYMBOL_GPL(ping_err); EXPORT_SYMBOL_GPL(ping_err);
...@@ -994,7 +991,6 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb) ...@@ -994,7 +991,6 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb)
reason = __ping_queue_rcv_skb(sk, skb2); reason = __ping_queue_rcv_skb(sk, skb2);
else else
reason = SKB_DROP_REASON_NOMEM; reason = SKB_DROP_REASON_NOMEM;
sock_put(sk);
} }
if (reason) if (reason)
...@@ -1080,13 +1076,13 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) ...@@ -1080,13 +1076,13 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
} }
void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
__acquires(ping_table.lock) __acquires(RCU)
{ {
struct ping_iter_state *state = seq->private; struct ping_iter_state *state = seq->private;
state->bucket = 0; state->bucket = 0;
state->family = family; state->family = family;
read_lock_bh(&ping_table.lock); rcu_read_lock();
return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
} }
...@@ -1112,9 +1108,9 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) ...@@ -1112,9 +1108,9 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL_GPL(ping_seq_next); EXPORT_SYMBOL_GPL(ping_seq_next);
void ping_seq_stop(struct seq_file *seq, void *v) void ping_seq_stop(struct seq_file *seq, void *v)
__releases(ping_table.lock) __releases(RCU)
{ {
read_unlock_bh(&ping_table.lock); rcu_read_unlock();
} }
EXPORT_SYMBOL_GPL(ping_seq_stop); EXPORT_SYMBOL_GPL(ping_seq_stop);
...@@ -1198,5 +1194,5 @@ void __init ping_init(void) ...@@ -1198,5 +1194,5 @@ void __init ping_init(void)
for (i = 0; i < PING_HTABLE_SIZE; i++) for (i = 0; i < PING_HTABLE_SIZE; i++)
INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i); INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);
rwlock_init(&ping_table.lock); spin_lock_init(&ping_table.lock);
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment