Commit e67f88dd authored by Eric Dumazet, committed by David S. Miller

net: don't hold rtnl mutex during netlink dump callbacks

Four years ago, Patrick made a change to hold rtnl mutex during netlink
dump callbacks.

I believe it was a wrong move. This slows down concurrent dumps, making
good old /proc/net/ files faster than rtnetlink in some situations.

This occurred to me because one "ip link show dev ..." was _very_ slow
on a workload adding/removing network devices in background.

All dump callbacks are able to use RCU locking now, so this patch is
roughly a revert of commits:

1c2d670f : [RTNETLINK]: Hold rtnl_mutex during netlink dump callbacks
6313c1e0 : [RTNETLINK]: Remove unnecessary locking in dump callbacks

This lets writers fight for the rtnl mutex while readers go full speed.

It also takes care of phonet: phonet_route_get() is now called from an RCU
read-side section, so I renamed it to phonet_route_get_rcu().
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent dcfd9cdc
...@@ -51,7 +51,7 @@ void phonet_address_notify(int event, struct net_device *dev, u8 addr); ...@@ -51,7 +51,7 @@ void phonet_address_notify(int event, struct net_device *dev, u8 addr);
int phonet_route_add(struct net_device *dev, u8 daddr); int phonet_route_add(struct net_device *dev, u8 daddr);
int phonet_route_del(struct net_device *dev, u8 daddr); int phonet_route_del(struct net_device *dev, u8 daddr);
void rtm_phonet_notify(int event, struct net_device *dev, u8 dst); void rtm_phonet_notify(int event, struct net_device *dev, u8 dst);
struct net_device *phonet_route_get(struct net *net, u8 daddr); struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr);
struct net_device *phonet_route_output(struct net *net, u8 daddr); struct net_device *phonet_route_output(struct net *net, u8 daddr);
#define PN_NO_ADDR 0xff #define PN_NO_ADDR 0xff
......
...@@ -120,8 +120,9 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -120,8 +120,9 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
int idx; int idx;
idx = 0; idx = 0;
for_each_netdev(net, dev) { rcu_read_lock();
struct net_bridge_port *port = br_port_get_rtnl(dev); for_each_netdev_rcu(net, dev) {
struct net_bridge_port *port = br_port_get_rcu(dev);
/* not a bridge port */ /* not a bridge port */
if (!port || idx < cb->args[0]) if (!port || idx < cb->args[0])
...@@ -135,7 +136,7 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -135,7 +136,7 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
skip: skip:
++idx; ++idx;
} }
rcu_read_unlock();
cb->args[0] = idx; cb->args[0] = idx;
return skb->len; return skb->len;
......
...@@ -590,7 +590,8 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, ...@@ -590,7 +590,8 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
int idx = 0; int idx = 0;
struct fib_rule *rule; struct fib_rule *rule;
list_for_each_entry(rule, &ops->rules_list, list) { rcu_read_lock();
list_for_each_entry_rcu(rule, &ops->rules_list, list) {
if (idx < cb->args[1]) if (idx < cb->args[1])
goto skip; goto skip;
......
...@@ -1007,10 +1007,11 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -1007,10 +1007,11 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
s_h = cb->args[0]; s_h = cb->args[0];
s_idx = cb->args[1]; s_idx = cb->args[1];
rcu_read_lock();
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0; idx = 0;
head = &net->dev_index_head[h]; head = &net->dev_index_head[h];
hlist_for_each_entry(dev, node, head, index_hlist) { hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
if (idx < s_idx) if (idx < s_idx)
goto cont; goto cont;
if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
...@@ -1023,6 +1024,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -1023,6 +1024,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
} }
} }
out: out:
rcu_read_unlock();
cb->args[1] = idx; cb->args[1] = idx;
cb->args[0] = h; cb->args[0] = h;
...@@ -1879,7 +1881,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) ...@@ -1879,7 +1881,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
int min_len; int min_len;
int family; int family;
int type; int type;
int err;
type = nlh->nlmsg_type; type = nlh->nlmsg_type;
if (type > RTM_MAX) if (type > RTM_MAX)
...@@ -1906,11 +1907,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) ...@@ -1906,11 +1907,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (dumpit == NULL) if (dumpit == NULL)
return -EOPNOTSUPP; return -EOPNOTSUPP;
__rtnl_unlock();
rtnl = net->rtnl; rtnl = net->rtnl;
err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); return netlink_dump_start(rtnl, skb, nlh, dumpit, NULL);
rtnl_lock();
return err;
} }
memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *)));
...@@ -1980,7 +1978,7 @@ static int __net_init rtnetlink_net_init(struct net *net) ...@@ -1980,7 +1978,7 @@ static int __net_init rtnetlink_net_init(struct net *net)
{ {
struct sock *sk; struct sock *sk;
sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX,
rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); rtnetlink_rcv, NULL, THIS_MODULE);
if (!sk) if (!sk)
return -ENOMEM; return -ENOMEM;
net->rtnl = sk; net->rtnl = sk;
......
...@@ -752,7 +752,8 @@ static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -752,7 +752,8 @@ static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
skip_naddr = cb->args[1]; skip_naddr = cb->args[1];
idx = 0; idx = 0;
for_each_netdev(&init_net, dev) { rcu_read_lock();
for_each_netdev_rcu(&init_net, dev) {
if (idx < skip_ndevs) if (idx < skip_ndevs)
goto cont; goto cont;
else if (idx > skip_ndevs) { else if (idx > skip_ndevs) {
...@@ -761,11 +762,11 @@ static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -761,11 +762,11 @@ static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
skip_naddr = 0; skip_naddr = 0;
} }
if ((dn_db = rtnl_dereference(dev->dn_ptr)) == NULL) if ((dn_db = rcu_dereference(dev->dn_ptr)) == NULL)
goto cont; goto cont;
for (ifa = rtnl_dereference(dn_db->ifa_list), dn_idx = 0; ifa; for (ifa = rcu_dereference(dn_db->ifa_list), dn_idx = 0; ifa;
ifa = rtnl_dereference(ifa->ifa_next), dn_idx++) { ifa = rcu_dereference(ifa->ifa_next), dn_idx++) {
if (dn_idx < skip_naddr) if (dn_idx < skip_naddr)
continue; continue;
...@@ -778,6 +779,7 @@ static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -778,6 +779,7 @@ static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
idx++; idx++;
} }
done: done:
rcu_read_unlock();
cb->args[0] = idx; cb->args[0] = idx;
cb->args[1] = dn_idx; cb->args[1] = dn_idx;
......
...@@ -394,10 +394,11 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -394,10 +394,11 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
arg.net = net; arg.net = net;
w->args = &arg; w->args = &arg;
rcu_read_lock();
for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
e = 0; e = 0;
head = &net->ipv6.fib_table_hash[h]; head = &net->ipv6.fib_table_hash[h];
hlist_for_each_entry(tb, node, head, tb6_hlist) { hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) {
if (e < s_e) if (e < s_e)
goto next; goto next;
res = fib6_dump_table(tb, skb, cb); res = fib6_dump_table(tb, skb, cb);
...@@ -408,6 +409,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -408,6 +409,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
} }
} }
out: out:
rcu_read_unlock();
cb->args[1] = e; cb->args[1] = e;
cb->args[0] = h; cb->args[0] = h;
......
...@@ -426,18 +426,14 @@ int phonet_route_del(struct net_device *dev, u8 daddr) ...@@ -426,18 +426,14 @@ int phonet_route_del(struct net_device *dev, u8 daddr)
return 0; return 0;
} }
struct net_device *phonet_route_get(struct net *net, u8 daddr) struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr)
{ {
struct phonet_net *pnn = phonet_pernet(net); struct phonet_net *pnn = phonet_pernet(net);
struct phonet_routes *routes = &pnn->routes; struct phonet_routes *routes = &pnn->routes;
struct net_device *dev; struct net_device *dev;
ASSERT_RTNL(); /* no need to hold the device */
daddr >>= 2; daddr >>= 2;
rcu_read_lock();
dev = rcu_dereference(routes->table[daddr]); dev = rcu_dereference(routes->table[daddr]);
rcu_read_unlock();
return dev; return dev;
} }
......
...@@ -264,10 +264,11 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -264,10 +264,11 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
u8 addr, addr_idx = 0, addr_start_idx = cb->args[0]; u8 addr, addr_idx = 0, addr_start_idx = cb->args[0];
rcu_read_lock();
for (addr = 0; addr < 64; addr++) { for (addr = 0; addr < 64; addr++) {
struct net_device *dev; struct net_device *dev;
dev = phonet_route_get(net, addr << 2); dev = phonet_route_get_rcu(net, addr << 2);
if (!dev) if (!dev)
continue; continue;
...@@ -279,6 +280,7 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -279,6 +280,7 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
} }
out: out:
rcu_read_unlock();
cb->args[0] = addr_idx; cb->args[0] = addr_idx;
cb->args[1] = 0; cb->args[1] = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment