Commit 2121c43f authored by Paolo Abeni's avatar Paolo Abeni

Merge branch 'inet_diag-remove-three-mutexes-in-diag-dumps'

Eric Dumazet says:

====================
inet_diag: remove three mutexes in diag dumps

Surprisingly, inet_diag operations are serialized over a stack
of three mutexes, giving legacy /proc based files an unfair
advantage on modern hosts.

This series removes all of them, making inet_diag operations
(eg iproute2/ss) fully parallel.

1-2) Two first patches are adding data-race annotations
     and can be backported to stable kernels.

3-4) inet_diag_table_mutex can be replaced with RCU protection,
     if we add corresponding protection against module unload.

5-7) sock_diag_table_mutex can be replaced with RCU protection,
     if we add corresponding protection against module unload.

 8)  sock_diag_mutex is removed, as the old bug it was
     working around has been fixed more elegantly.

 9)  inet_diag_dump_icsk() can skip over empty buckets to reduce
     spinlock contention.
====================

Link: https://lore.kernel.org/r/20240122112603.3270097-1-edumazet@google.comSigned-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
parents 736b5545 622a08e8
......@@ -8,6 +8,7 @@
struct inet_hashinfo;
struct inet_diag_handler {
struct module *owner;
void (*dump)(struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r);
......
......@@ -13,6 +13,7 @@ struct nlmsghdr;
struct sock;
struct sock_diag_handler {
struct module *owner;
__u8 family;
int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh);
int (*get_info)(struct sk_buff *skb, struct sock *sk);
......@@ -22,8 +23,13 @@ struct sock_diag_handler {
int sock_diag_register(const struct sock_diag_handler *h);
void sock_diag_unregister(const struct sock_diag_handler *h);
void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
struct sock_diag_inet_compat {
struct module *owner;
int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh);
};
void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr);
void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr);
u64 __sock_gen_cookie(struct sock *sk);
......
......@@ -16,9 +16,10 @@
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX];
static struct sock_diag_inet_compat __rcu *inet_rcv_compat;
static struct workqueue_struct *broadcast_wq;
DEFINE_COOKIE(sock_cookie);
......@@ -122,6 +123,24 @@ static size_t sock_diag_nlmsg_size(void)
+ nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
}
static const struct sock_diag_handler *sock_diag_lock_handler(int family)
{
const struct sock_diag_handler *handler;
rcu_read_lock();
handler = rcu_dereference(sock_diag_handlers[family]);
if (handler && !try_module_get(handler->owner))
handler = NULL;
rcu_read_unlock();
return handler;
}
static void sock_diag_unlock_handler(const struct sock_diag_handler *handler)
{
module_put(handler->owner);
}
static void sock_diag_broadcast_destroy_work(struct work_struct *work)
{
struct broadcast_sk *bsk =
......@@ -138,12 +157,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work)
if (!skb)
goto out;
mutex_lock(&sock_diag_table_mutex);
hndl = sock_diag_handlers[sk->sk_family];
if (hndl && hndl->get_info)
hndl = sock_diag_lock_handler(sk->sk_family);
if (hndl) {
if (hndl->get_info)
err = hndl->get_info(skb, sk);
mutex_unlock(&sock_diag_table_mutex);
sock_diag_unlock_handler(hndl);
}
if (!err)
nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group,
GFP_KERNEL);
......@@ -166,51 +185,45 @@ void sock_diag_broadcast_destroy(struct sock *sk)
queue_work(broadcast_wq, &bsk->work);
}
void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr)
{
mutex_lock(&sock_diag_table_mutex);
inet_rcv_compat = fn;
mutex_unlock(&sock_diag_table_mutex);
xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat,
ptr);
}
EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr)
{
mutex_lock(&sock_diag_table_mutex);
inet_rcv_compat = NULL;
mutex_unlock(&sock_diag_table_mutex);
const struct sock_diag_inet_compat *old;
old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat,
NULL);
WARN_ON_ONCE(old != ptr);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
int sock_diag_register(const struct sock_diag_handler *hndl)
{
int err = 0;
int family = hndl->family;
if (hndl->family >= AF_MAX)
if (family >= AF_MAX)
return -EINVAL;
mutex_lock(&sock_diag_table_mutex);
if (sock_diag_handlers[hndl->family])
err = -EBUSY;
else
sock_diag_handlers[hndl->family] = hndl;
mutex_unlock(&sock_diag_table_mutex);
return err;
return !cmpxchg((const struct sock_diag_handler **)
&sock_diag_handlers[family],
NULL, hndl) ? 0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(sock_diag_register);
void sock_diag_unregister(const struct sock_diag_handler *hnld)
void sock_diag_unregister(const struct sock_diag_handler *hndl)
{
int family = hnld->family;
int family = hndl->family;
if (family >= AF_MAX)
return;
mutex_lock(&sock_diag_table_mutex);
BUG_ON(sock_diag_handlers[family] != hnld);
sock_diag_handlers[family] = NULL;
mutex_unlock(&sock_diag_table_mutex);
xchg((const struct sock_diag_handler **)&sock_diag_handlers[family],
NULL);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);
......@@ -227,20 +240,20 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
if (sock_diag_handlers[req->sdiag_family] == NULL)
if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family]))
sock_load_diag_module(req->sdiag_family, 0);
mutex_lock(&sock_diag_table_mutex);
hndl = sock_diag_handlers[req->sdiag_family];
hndl = sock_diag_lock_handler(req->sdiag_family);
if (hndl == NULL)
err = -ENOENT;
else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
return -ENOENT;
if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
err = hndl->dump(skb, nlh);
else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy)
err = hndl->destroy(skb, nlh);
else
err = -EOPNOTSUPP;
mutex_unlock(&sock_diag_table_mutex);
sock_diag_unlock_handler(hndl);
return err;
}
......@@ -248,20 +261,27 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
const struct sock_diag_inet_compat *ptr;
int ret;
switch (nlh->nlmsg_type) {
case TCPDIAG_GETSOCK:
case DCCPDIAG_GETSOCK:
if (inet_rcv_compat == NULL)
if (!rcu_access_pointer(inet_rcv_compat))
sock_load_diag_module(AF_INET, 0);
mutex_lock(&sock_diag_table_mutex);
if (inet_rcv_compat != NULL)
ret = inet_rcv_compat(skb, nlh);
else
rcu_read_lock();
ptr = rcu_dereference(inet_rcv_compat);
if (ptr && !try_module_get(ptr->owner))
ptr = NULL;
rcu_read_unlock();
ret = -EOPNOTSUPP;
mutex_unlock(&sock_diag_table_mutex);
if (ptr) {
ret = ptr->fn(skb, nlh);
module_put(ptr->owner);
}
return ret;
case SOCK_DIAG_BY_FAMILY:
......@@ -272,13 +292,9 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
}
}
static DEFINE_MUTEX(sock_diag_mutex);
static void sock_diag_rcv(struct sk_buff *skb)
{
mutex_lock(&sock_diag_mutex);
netlink_rcv_skb(skb, &sock_diag_rcv_msg);
mutex_unlock(&sock_diag_mutex);
}
static int sock_diag_bind(struct net *net, int group)
......@@ -286,12 +302,12 @@ static int sock_diag_bind(struct net *net, int group)
switch (group) {
case SKNLGRP_INET_TCP_DESTROY:
case SKNLGRP_INET_UDP_DESTROY:
if (!sock_diag_handlers[AF_INET])
if (!rcu_access_pointer(sock_diag_handlers[AF_INET]))
sock_load_diag_module(AF_INET, 0);
break;
case SKNLGRP_INET6_TCP_DESTROY:
case SKNLGRP_INET6_UDP_DESTROY:
if (!sock_diag_handlers[AF_INET6])
if (!rcu_access_pointer(sock_diag_handlers[AF_INET6]))
sock_load_diag_module(AF_INET6, 0);
break;
}
......
......@@ -58,6 +58,7 @@ static int dccp_diag_dump_one(struct netlink_callback *cb,
}
static const struct inet_diag_handler dccp_diag_handler = {
.owner = THIS_MODULE,
.dump = dccp_diag_dump,
.dump_one = dccp_diag_dump_one,
.idiag_get_info = dccp_diag_get_info,
......
......@@ -32,7 +32,7 @@
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
static const struct inet_diag_handler **inet_diag_table;
static const struct inet_diag_handler __rcu **inet_diag_table;
struct inet_diag_entry {
const __be32 *saddr;
......@@ -48,28 +48,28 @@ struct inet_diag_entry {
#endif
};
static DEFINE_MUTEX(inet_diag_table_mutex);
static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
{
if (proto < 0 || proto >= IPPROTO_MAX) {
mutex_lock(&inet_diag_table_mutex);
return ERR_PTR(-ENOENT);
}
const struct inet_diag_handler *handler;
if (!inet_diag_table[proto])
if (proto < 0 || proto >= IPPROTO_MAX)
return NULL;
if (!READ_ONCE(inet_diag_table[proto]))
sock_load_diag_module(AF_INET, proto);
mutex_lock(&inet_diag_table_mutex);
if (!inet_diag_table[proto])
return ERR_PTR(-ENOENT);
rcu_read_lock();
handler = rcu_dereference(inet_diag_table[proto]);
if (handler && !try_module_get(handler->owner))
handler = NULL;
rcu_read_unlock();
return inet_diag_table[proto];
return handler;
}
static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
{
mutex_unlock(&inet_diag_table_mutex);
module_put(handler->owner);
}
void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
......@@ -104,9 +104,12 @@ static size_t inet_sk_attr_size(struct sock *sk,
const struct inet_diag_handler *handler;
size_t aux = 0;
handler = inet_diag_table[req->sdiag_protocol];
rcu_read_lock();
handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]);
DEBUG_NET_WARN_ON_ONCE(!handler);
if (handler && handler->idiag_get_aux_size)
aux = handler->idiag_get_aux_size(sk, net_admin);
rcu_read_unlock();
return nla_total_size(sizeof(struct tcp_info))
+ nla_total_size(sizeof(struct inet_diag_msg))
......@@ -244,10 +247,16 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct nlmsghdr *nlh;
struct nlattr *attr;
void *info = NULL;
int protocol;
cb_data = cb->data;
handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)];
BUG_ON(!handler);
protocol = inet_diag_get_protocol(req, cb_data);
/* inet_diag_lock_handler() made sure inet_diag_table[] is stable. */
handler = rcu_dereference_protected(inet_diag_table[protocol], 1);
DEBUG_NET_WARN_ON_ONCE(!handler);
if (!handler)
return -ENXIO;
nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
......@@ -605,9 +614,10 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
protocol = inet_diag_get_protocol(req, &dump_data);
handler = inet_diag_lock_handler(protocol);
if (IS_ERR(handler)) {
err = PTR_ERR(handler);
} else if (cmd == SOCK_DIAG_BY_FAMILY) {
if (!handler)
return -ENOENT;
if (cmd == SOCK_DIAG_BY_FAMILY) {
struct netlink_callback cb = {
.nlh = nlh,
.skb = in_skb,
......@@ -1035,6 +1045,10 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
num = 0;
ilb = &hashinfo->lhash2[i];
if (hlist_nulls_empty(&ilb->nulls_head)) {
s_num = 0;
continue;
}
spin_lock(&ilb->lock);
sk_nulls_for_each(sk, node, &ilb->nulls_head) {
struct inet_sock *inet = inet_sk(sk);
......@@ -1099,6 +1113,10 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
accum = 0;
ibb = &hashinfo->bhash2[i];
if (hlist_empty(&ibb->chain)) {
s_num = 0;
continue;
}
spin_lock_bh(&ibb->lock);
inet_bind_bucket_for_each(tb2, &ibb->chain) {
if (!net_eq(ib2_net(tb2), net))
......@@ -1259,12 +1277,12 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
again:
prev_min_dump_alloc = cb->min_dump_alloc;
handler = inet_diag_lock_handler(protocol);
if (!IS_ERR(handler))
if (handler) {
handler->dump(skb, cb, r);
else
err = PTR_ERR(handler);
inet_diag_unlock_handler(handler);
} else {
err = -ENOENT;
}
/* The skb is not large enough to fit one sk info and
* inet_sk_diag_fill() has requested for a larger skb.
*/
......@@ -1457,10 +1475,9 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
}
handler = inet_diag_lock_handler(sk->sk_protocol);
if (IS_ERR(handler)) {
inet_diag_unlock_handler(handler);
if (!handler) {
nlmsg_cancel(skb, nlh);
return PTR_ERR(handler);
return -ENOENT;
}
attr = handler->idiag_info_size
......@@ -1479,6 +1496,7 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
}
static const struct sock_diag_handler inet_diag_handler = {
.owner = THIS_MODULE,
.family = AF_INET,
.dump = inet_diag_handler_cmd,
.get_info = inet_diag_handler_get_info,
......@@ -1486,6 +1504,7 @@ static const struct sock_diag_handler inet_diag_handler = {
};
static const struct sock_diag_handler inet6_diag_handler = {
.owner = THIS_MODULE,
.family = AF_INET6,
.dump = inet_diag_handler_cmd,
.get_info = inet_diag_handler_get_info,
......@@ -1495,20 +1514,12 @@ static const struct sock_diag_handler inet6_diag_handler = {
int inet_diag_register(const struct inet_diag_handler *h)
{
const __u16 type = h->idiag_type;
int err = -EINVAL;
if (type >= IPPROTO_MAX)
goto out;
return -EINVAL;
mutex_lock(&inet_diag_table_mutex);
err = -EEXIST;
if (!inet_diag_table[type]) {
inet_diag_table[type] = h;
err = 0;
}
mutex_unlock(&inet_diag_table_mutex);
out:
return err;
return !cmpxchg((const struct inet_diag_handler **)&inet_diag_table[type],
NULL, h) ? 0 : -EEXIST;
}
EXPORT_SYMBOL_GPL(inet_diag_register);
......@@ -1519,12 +1530,16 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
if (type >= IPPROTO_MAX)
return;
mutex_lock(&inet_diag_table_mutex);
inet_diag_table[type] = NULL;
mutex_unlock(&inet_diag_table_mutex);
xchg((const struct inet_diag_handler **)&inet_diag_table[type],
NULL);
}
EXPORT_SYMBOL_GPL(inet_diag_unregister);
static const struct sock_diag_inet_compat inet_diag_compat = {
.owner = THIS_MODULE,
.fn = inet_diag_rcv_msg_compat,
};
static int __init inet_diag_init(void)
{
const int inet_diag_table_size = (IPPROTO_MAX *
......@@ -1543,7 +1558,7 @@ static int __init inet_diag_init(void)
if (err)
goto out_free_inet;
sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
sock_diag_register_inet_compat(&inet_diag_compat);
out:
return err;
......@@ -1558,7 +1573,7 @@ static void __exit inet_diag_exit(void)
{
sock_diag_unregister(&inet6_diag_handler);
sock_diag_unregister(&inet_diag_handler);
sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
sock_diag_unregister_inet_compat(&inet_diag_compat);
kfree(inet_diag_table);
}
......
......@@ -213,6 +213,7 @@ static int raw_diag_destroy(struct sk_buff *in_skb,
#endif
static const struct inet_diag_handler raw_diag_handler = {
.owner = THIS_MODULE,
.dump = raw_diag_dump,
.dump_one = raw_diag_dump_one,
.idiag_get_info = raw_diag_get_info,
......
......@@ -222,6 +222,7 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
#endif
static const struct inet_diag_handler tcp_diag_handler = {
.owner = THIS_MODULE,
.dump = tcp_diag_dump,
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
......
......@@ -237,6 +237,7 @@ static int udplite_diag_destroy(struct sk_buff *in_skb,
#endif
static const struct inet_diag_handler udp_diag_handler = {
.owner = THIS_MODULE,
.dump = udp_diag_dump,
.dump_one = udp_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
......@@ -260,6 +261,7 @@ static int udplite_diag_dump_one(struct netlink_callback *cb,
}
static const struct inet_diag_handler udplite_diag_handler = {
.owner = THIS_MODULE,
.dump = udplite_diag_dump,
.dump_one = udplite_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
......
......@@ -225,6 +225,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
}
static const struct inet_diag_handler mptcp_diag_handler = {
.owner = THIS_MODULE,
.dump = mptcp_diag_dump,
.dump_one = mptcp_diag_dump_one,
.idiag_get_info = mptcp_diag_get_info,
......
......@@ -241,6 +241,7 @@ static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler netlink_diag_handler = {
.owner = THIS_MODULE,
.family = AF_NETLINK,
.dump = netlink_diag_handler_dump,
};
......
......@@ -245,6 +245,7 @@ static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler packet_diag_handler = {
.owner = THIS_MODULE,
.family = AF_PACKET,
.dump = packet_diag_handler_dump,
};
......
......@@ -507,6 +507,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
}
static const struct inet_diag_handler sctp_diag_handler = {
.owner = THIS_MODULE,
.dump = sctp_diag_dump,
.dump_one = sctp_diag_dump_one,
.idiag_get_info = sctp_diag_get_info,
......
......@@ -255,6 +255,7 @@ static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler smc_diag_handler = {
.owner = THIS_MODULE,
.family = AF_SMC,
.dump = smc_diag_handler_dump,
};
......
......@@ -95,6 +95,7 @@ static int tipc_sock_diag_handler_dump(struct sk_buff *skb,
}
static const struct sock_diag_handler tipc_sock_diag_handler = {
.owner = THIS_MODULE,
.family = AF_TIPC,
.dump = tipc_sock_diag_handler_dump,
};
......
......@@ -322,6 +322,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler unix_diag_handler = {
.owner = THIS_MODULE,
.family = AF_UNIX,
.dump = unix_diag_handler_dump,
};
......
......@@ -157,6 +157,7 @@ static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler vsock_diag_handler = {
.owner = THIS_MODULE,
.family = AF_VSOCK,
.dump = vsock_diag_handler_dump,
};
......
......@@ -194,6 +194,7 @@ static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr)
}
static const struct sock_diag_handler xsk_diag_handler = {
.owner = THIS_MODULE,
.family = AF_XDP,
.dump = xsk_diag_handler_dump,
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment