Commit 9f2f27a9 authored by David S. Miller

Merge branch 'icmp-reply-optimize'

Jesper Dangaard Brouer says:

====================
net: optimize ICMP-reply code path

This patchset optimizes the ICMP-reply code path for ICMP packets
that get rate limited. A remote party can easily trigger this code
path by sending packets to a port number with no listening service.

Generally the patchset moves the sysctl_icmp_msgs_per_sec ratelimit
checking to earlier in the code path and removes an allocation.

Use-case: The specific case where I experienced this being a bottleneck is
sending UDP packets to a port with no listener, which obviously results
in the kernel replying with ICMP Destination Unreachable (type:3), Port
Unreachable (code:3), which causes the bottleneck.

After Eric and Paolo optimized the UDP socket code, the kernel's PPS
processing capability is lower for no-listen ports than for normal UDP
sockets.  This is bad for capacity planning when restarting a service.

UDP no-listen benchmark 8xCPUs using pktgen_sample04_many_flows.sh:
 Baseline: 6.6 Mpps
 Patch:   14.7 Mpps
Driver mlx5 at 50Gbit/s.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents aaa9c107 7ba91ecb
...@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net) ...@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
return *this_cpu_ptr(net->ipv4.icmp_sk); return *this_cpu_ptr(net->ipv4.icmp_sk);
} }
/* Called with BH disabled */
static inline struct sock *icmp_xmit_lock(struct net *net) static inline struct sock *icmp_xmit_lock(struct net *net)
{ {
struct sock *sk; struct sock *sk;
local_bh_disable();
sk = icmp_sk(net); sk = icmp_sk(net);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path signals a /* This can happen if the output path signals a
* dst_link_failure() for an outgoing ICMP packet. * dst_link_failure() for an outgoing ICMP packet.
*/ */
local_bh_enable();
return NULL; return NULL;
} }
return sk; return sk;
...@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net) ...@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
static inline void icmp_xmit_unlock(struct sock *sk) static inline void icmp_xmit_unlock(struct sock *sk)
{ {
spin_unlock_bh(&sk->sk_lock.slock); spin_unlock(&sk->sk_lock.slock);
} }
int sysctl_icmp_msgs_per_sec __read_mostly = 1000; int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
...@@ -282,6 +280,33 @@ bool icmp_global_allow(void) ...@@ -282,6 +280,33 @@ bool icmp_global_allow(void)
} }
EXPORT_SYMBOL(icmp_global_allow); EXPORT_SYMBOL(icmp_global_allow);
static bool icmpv4_mask_allow(struct net *net, int type, int code)
{
if (type > NR_ICMP_TYPES)
return true;
/* Don't limit PMTU discovery. */
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
return true;
/* Limit if icmp type is enabled in ratemask. */
if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
return true;
return false;
}
static bool icmpv4_global_allow(struct net *net, int type, int code)
{
if (icmpv4_mask_allow(net, type, code))
return true;
if (icmp_global_allow())
return true;
return false;
}
/* /*
* Send an ICMP frame. * Send an ICMP frame.
*/ */
...@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, ...@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
struct flowi4 *fl4, int type, int code) struct flowi4 *fl4, int type, int code)
{ {
struct dst_entry *dst = &rt->dst; struct dst_entry *dst = &rt->dst;
struct inet_peer *peer;
bool rc = true; bool rc = true;
int vif;
if (type > NR_ICMP_TYPES) if (icmpv4_mask_allow(net, type, code))
goto out;
/* Don't limit PMTU discovery. */
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
goto out; goto out;
/* No rate limit on loopback */ /* No rate limit on loopback */
if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
goto out; goto out;
/* Limit if icmp type is enabled in ratemask. */ vif = l3mdev_master_ifindex(dst->dev);
if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
goto out;
rc = false;
if (icmp_global_allow()) {
int vif = l3mdev_master_ifindex(dst->dev);
struct inet_peer *peer;
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
rc = inet_peer_xrlim_allow(peer, rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
net->ipv4.sysctl_icmp_ratelimit);
if (peer) if (peer)
inet_putpeer(peer); inet_putpeer(peer);
}
out: out:
return rc; return rc;
} }
...@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) ...@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
struct inet_sock *inet; struct inet_sock *inet;
__be32 daddr, saddr; __be32 daddr, saddr;
u32 mark = IP4_REPLY_MARK(net, skb->mark); u32 mark = IP4_REPLY_MARK(net, skb->mark);
int type = icmp_param->data.icmph.type;
int code = icmp_param->data.icmph.code;
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
return; return;
/* Needed by both icmp_global_allow and icmp_xmit_lock */
local_bh_disable();
/* global icmp_msgs_per_sec */
if (!icmpv4_global_allow(net, type, code))
goto out_bh_enable;
sk = icmp_xmit_lock(net); sk = icmp_xmit_lock(net);
if (!sk) if (!sk)
return; goto out_bh_enable;
inet = inet_sk(sk); inet = inet_sk(sk);
icmp_param->data.icmph.checksum = 0; icmp_param->data.icmph.checksum = 0;
...@@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) ...@@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
rt = ip_route_output_key(net, &fl4); rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt)) if (IS_ERR(rt))
goto out_unlock; goto out_unlock;
if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
icmp_param->data.icmph.code))
icmp_push_reply(icmp_param, &fl4, &ipc, &rt); icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt); ip_rt_put(rt);
out_unlock: out_unlock:
icmp_xmit_unlock(sk); icmp_xmit_unlock(sk);
out_bh_enable:
local_bh_enable();
} }
#ifdef CONFIG_IP_ROUTE_MULTIPATH #ifdef CONFIG_IP_ROUTE_MULTIPATH
...@@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ...@@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{ {
struct iphdr *iph; struct iphdr *iph;
int room; int room;
struct icmp_bxm *icmp_param; struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in); struct rtable *rt = skb_rtable(skb_in);
struct ipcm_cookie ipc; struct ipcm_cookie ipc;
struct flowi4 fl4; struct flowi4 fl4;
...@@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ...@@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
} }
} }
icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); /* Needed by both icmp_global_allow and icmp_xmit_lock */
if (!icmp_param) local_bh_disable();
return;
/* Check global sysctl_icmp_msgs_per_sec ratelimit */
if (!icmpv4_global_allow(net, type, code))
goto out_bh_enable;
sk = icmp_xmit_lock(net); sk = icmp_xmit_lock(net);
if (!sk) if (!sk)
goto out_free; goto out_bh_enable;
/* /*
* Construct source address and options. * Construct source address and options.
...@@ -681,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ...@@ -681,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
iph->tos; iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark); mark = IP4_REPLY_MARK(net, skb_in->mark);
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
goto out_unlock; goto out_unlock;
...@@ -689,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ...@@ -689,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
* Prepare data for ICMP header. * Prepare data for ICMP header.
*/ */
icmp_param->data.icmph.type = type; icmp_param.data.icmph.type = type;
icmp_param->data.icmph.code = code; icmp_param.data.icmph.code = code;
icmp_param->data.icmph.un.gateway = info; icmp_param.data.icmph.un.gateway = info;
icmp_param->data.icmph.checksum = 0; icmp_param.data.icmph.checksum = 0;
icmp_param->skb = skb_in; icmp_param.skb = skb_in;
icmp_param->offset = skb_network_offset(skb_in); icmp_param.offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos; inet_sk(sk)->tos = tos;
sk->sk_mark = mark; sk->sk_mark = mark;
ipc.addr = iph->saddr; ipc.addr = iph->saddr;
ipc.opt = &icmp_param->replyopts.opt; ipc.opt = &icmp_param.replyopts.opt;
ipc.tx_flags = 0; ipc.tx_flags = 0;
ipc.ttl = 0; ipc.ttl = 0;
ipc.tos = -1; ipc.tos = -1;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
type, code, icmp_param); type, code, &icmp_param);
if (IS_ERR(rt)) if (IS_ERR(rt))
goto out_unlock; goto out_unlock;
/* peer icmp_ratelimit */
if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
goto ende; goto ende;
...@@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ...@@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
room = dst_mtu(&rt->dst); room = dst_mtu(&rt->dst);
if (room > 576) if (room > 576)
room = 576; room = 576;
room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen; room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr); room -= sizeof(struct icmphdr);
icmp_param->data_len = skb_in->len - icmp_param->offset; icmp_param.data_len = skb_in->len - icmp_param.offset;
if (icmp_param->data_len > room) if (icmp_param.data_len > room)
icmp_param->data_len = room; icmp_param.data_len = room;
icmp_param->head_len = sizeof(struct icmphdr); icmp_param.head_len = sizeof(struct icmphdr);
icmp_push_reply(icmp_param, &fl4, &ipc, &rt); icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
ende: ende:
ip_rt_put(rt); ip_rt_put(rt);
out_unlock: out_unlock:
icmp_xmit_unlock(sk); icmp_xmit_unlock(sk);
out_free: out_bh_enable:
kfree(icmp_param); local_bh_enable();
out:; out:;
} }
EXPORT_SYMBOL(icmp_send); EXPORT_SYMBOL(icmp_send);
......
...@@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = { ...@@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = {
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
}; };
/* Called with BH disabled */
static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
{ {
struct sock *sk; struct sock *sk;
local_bh_disable();
sk = icmpv6_sk(net); sk = icmpv6_sk(net);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path (f.e. SIT or /* This can happen if the output path (f.e. SIT or
* ip6ip6 tunnel) signals dst_link_failure() for an * ip6ip6 tunnel) signals dst_link_failure() for an
* outgoing ICMP6 packet. * outgoing ICMP6 packet.
*/ */
local_bh_enable();
return NULL; return NULL;
} }
return sk; return sk;
...@@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) ...@@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
static __inline__ void icmpv6_xmit_unlock(struct sock *sk) static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
{ {
spin_unlock_bh(&sk->sk_lock.slock); spin_unlock(&sk->sk_lock.slock);
} }
/* /*
...@@ -168,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb) ...@@ -168,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb)
return false; return false;
} }
static bool icmpv6_mask_allow(int type)
{
/* Informational messages are not limited. */
if (type & ICMPV6_INFOMSG_MASK)
return true;
/* Do not limit pmtu discovery, it would break it. */
if (type == ICMPV6_PKT_TOOBIG)
return true;
return false;
}
static bool icmpv6_global_allow(int type)
{
if (icmpv6_mask_allow(type))
return true;
if (icmp_global_allow())
return true;
return false;
}
/* /*
* Check the ICMP output rate limit * Check the ICMP output rate limit
*/ */
...@@ -178,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, ...@@ -178,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
struct dst_entry *dst; struct dst_entry *dst;
bool res = false; bool res = false;
/* Informational messages are not limited. */ if (icmpv6_mask_allow(type))
if (type & ICMPV6_INFOMSG_MASK)
return true;
/* Do not limit pmtu discovery, it would break it. */
if (type == ICMPV6_PKT_TOOBIG)
return true; return true;
/* /*
...@@ -200,21 +217,17 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, ...@@ -200,21 +217,17 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
} else { } else {
struct rt6_info *rt = (struct rt6_info *)dst; struct rt6_info *rt = (struct rt6_info *)dst;
int tmo = net->ipv6.sysctl.icmpv6_time; int tmo = net->ipv6.sysctl.icmpv6_time;
struct inet_peer *peer;
/* Give more bandwidth to wider prefixes. */ /* Give more bandwidth to wider prefixes. */
if (rt->rt6i_dst.plen < 128) if (rt->rt6i_dst.plen < 128)
tmo >>= ((128 - rt->rt6i_dst.plen)>>5); tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
if (icmp_global_allow()) { peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
struct inet_peer *peer;
peer = inet_getpeer_v6(net->ipv6.peers,
&fl6->daddr, 1);
res = inet_peer_xrlim_allow(peer, tmo); res = inet_peer_xrlim_allow(peer, tmo);
if (peer) if (peer)
inet_putpeer(peer); inet_putpeer(peer);
} }
}
dst_release(dst); dst_release(dst);
return res; return res;
} }
...@@ -474,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, ...@@ -474,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
return; return;
} }
/* Needed by both icmp_global_allow and icmpv6_xmit_lock */
local_bh_disable();
/* Check global sysctl_icmp_msgs_per_sec ratelimit */
if (!icmpv6_global_allow(type))
goto out_bh_enable;
mip6_addr_swap(skb); mip6_addr_swap(skb);
memset(&fl6, 0, sizeof(fl6)); memset(&fl6, 0, sizeof(fl6));
...@@ -492,7 +512,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, ...@@ -492,7 +512,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
sk = icmpv6_xmit_lock(net); sk = icmpv6_xmit_lock(net);
if (!sk) if (!sk)
return; goto out_bh_enable;
sk->sk_mark = mark; sk->sk_mark = mark;
np = inet6_sk(sk); np = inet6_sk(sk);
...@@ -552,6 +573,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, ...@@ -552,6 +573,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
dst_release(dst); dst_release(dst);
out: out:
icmpv6_xmit_unlock(sk); icmpv6_xmit_unlock(sk);
out_bh_enable:
local_bh_enable();
} }
/* Slightly more convenient version of icmp6_send. /* Slightly more convenient version of icmp6_send.
...@@ -665,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb) ...@@ -665,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.flowi6_uid = sock_net_uid(net, NULL); fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
local_bh_disable();
sk = icmpv6_xmit_lock(net); sk = icmpv6_xmit_lock(net);
if (!sk) if (!sk)
return; goto out_bh_enable;
sk->sk_mark = mark; sk->sk_mark = mark;
np = inet6_sk(sk); np = inet6_sk(sk);
...@@ -709,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb) ...@@ -709,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
dst_release(dst); dst_release(dst);
out: out:
icmpv6_xmit_unlock(sk); icmpv6_xmit_unlock(sk);
out_bh_enable:
local_bh_enable();
} }
void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment