Commit e43b76ab authored by Jakub Kicinski

Merge branch 'tcp-receive-path-optimizations'

Eric Dumazet says:

====================
tcp: receive path optimizations

This series aims to reduce cache line misses in RX path.

I am still working on better cache locality in tcp_sock but
this will wait few more weeks.
====================

Link: https://lore.kernel.org/r/20211025164825.259415-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents fd559a94 12c8691d
...@@ -282,7 +282,6 @@ struct ipv6_pinfo { ...@@ -282,7 +282,6 @@ struct ipv6_pinfo {
__be32 rcv_flowinfo; __be32 rcv_flowinfo;
__u32 dst_cookie; __u32 dst_cookie;
__u32 rx_dst_cookie;
struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_mc_socklist __rcu *ipv6_mc_list;
struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_ac_socklist *ipv6_ac_list;
......
...@@ -130,6 +130,7 @@ static inline void skb_mark_napi_id(struct sk_buff *skb, ...@@ -130,6 +130,7 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{ {
#ifdef CONFIG_NET_RX_BUSY_POLL #ifdef CONFIG_NET_RX_BUSY_POLL
if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id))
WRITE_ONCE(sk->sk_napi_id, skb->napi_id); WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
#endif #endif
sk_rx_queue_set(sk, skb); sk_rx_queue_set(sk, skb);
......
...@@ -207,11 +207,10 @@ struct inet_sock { ...@@ -207,11 +207,10 @@ struct inet_sock {
__be32 inet_saddr; __be32 inet_saddr;
__s16 uc_ttl; __s16 uc_ttl;
__u16 cmsg_flags; __u16 cmsg_flags;
struct ip_options_rcu __rcu *inet_opt;
__be16 inet_sport; __be16 inet_sport;
__u16 inet_id; __u16 inet_id;
struct ip_options_rcu __rcu *inet_opt;
int rx_dst_ifindex;
__u8 tos; __u8 tos;
__u8 min_ttl; __u8 min_ttl;
__u8 mc_ttl; __u8 mc_ttl;
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <linux/skbuff.h> #include <linux/skbuff.h>
#include <linux/jhash.h> #include <linux/jhash.h>
#include <linux/sockptr.h> #include <linux/sockptr.h>
#include <linux/static_key.h>
#include <net/inet_sock.h> #include <net/inet_sock.h>
#include <net/route.h> #include <net/route.h>
...@@ -750,6 +751,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, ...@@ -750,6 +751,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, int tlen, int offset); struct sk_buff *skb, int tlen, int offset);
int ip_cmsg_send(struct sock *sk, struct msghdr *msg, int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
struct ipcm_cookie *ipc, bool allow_ipv6); struct ipcm_cookie *ipc, bool allow_ipv6);
DECLARE_STATIC_KEY_FALSE(ip4_min_ttl);
int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen); unsigned int optlen);
int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
......
...@@ -1092,6 +1092,7 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, ...@@ -1092,6 +1092,7 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
/* /*
* socket options (ipv6_sockglue.c) * socket options (ipv6_sockglue.c)
*/ */
DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount);
int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen); unsigned int optlen);
......
...@@ -259,6 +259,8 @@ struct bpf_local_storage; ...@@ -259,6 +259,8 @@ struct bpf_local_storage;
* @sk_rcvbuf: size of receive buffer in bytes * @sk_rcvbuf: size of receive buffer in bytes
* @sk_wq: sock wait queue and async head * @sk_wq: sock wait queue and async head
* @sk_rx_dst: receive input route used by early demux * @sk_rx_dst: receive input route used by early demux
* @sk_rx_dst_ifindex: ifindex for @sk_rx_dst
* @sk_rx_dst_cookie: cookie for @sk_rx_dst
* @sk_dst_cache: destination cache * @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour * @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy * @sk_policy: flow policy
...@@ -430,6 +432,9 @@ struct sock { ...@@ -430,6 +432,9 @@ struct sock {
struct xfrm_policy __rcu *sk_policy[2]; struct xfrm_policy __rcu *sk_policy[2];
#endif #endif
struct dst_entry *sk_rx_dst; struct dst_entry *sk_rx_dst;
int sk_rx_dst_ifindex;
u32 sk_rx_dst_cookie;
struct dst_entry __rcu *sk_dst_cache; struct dst_entry __rcu *sk_dst_cache;
atomic_t sk_omem_alloc; atomic_t sk_omem_alloc;
int sk_sndbuf; int sk_sndbuf;
...@@ -1911,10 +1916,8 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) ...@@ -1911,10 +1916,8 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
if (skb_rx_queue_recorded(skb)) { if (skb_rx_queue_recorded(skb)) {
u16 rx_queue = skb_get_rx_queue(skb); u16 rx_queue = skb_get_rx_queue(skb);
if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING)) if (unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue))
return; WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue);
sk->sk_rx_queue_mapping = rx_queue;
} }
#endif #endif
} }
...@@ -1922,15 +1925,19 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) ...@@ -1922,15 +1925,19 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
static inline void sk_rx_queue_clear(struct sock *sk) static inline void sk_rx_queue_clear(struct sock *sk)
{ {
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING; WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING);
#endif #endif
} }
static inline int sk_rx_queue_get(const struct sock *sk) static inline int sk_rx_queue_get(const struct sock *sk)
{ {
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING) if (sk) {
return sk->sk_rx_queue_mapping; int res = READ_ONCE(sk->sk_rx_queue_mapping);
if (res != NO_QUEUE_MAPPING)
return res;
}
#endif #endif
return -1; return -1;
......
...@@ -886,6 +886,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname, ...@@ -886,6 +886,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
return ip_mc_leave_group(sk, &mreq); return ip_mc_leave_group(sk, &mreq);
} }
DEFINE_STATIC_KEY_FALSE(ip4_min_ttl);
static int do_ip_setsockopt(struct sock *sk, int level, int optname, static int do_ip_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen) sockptr_t optval, unsigned int optlen)
{ {
...@@ -1352,7 +1354,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, ...@@ -1352,7 +1354,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
goto e_inval; goto e_inval;
if (val < 0 || val > 255) if (val < 0 || val > 255)
goto e_inval; goto e_inval;
inet->min_ttl = val;
if (val)
static_branch_enable(&ip4_min_ttl);
/* tcp_v4_err() and tcp_v4_rcv() might read min_ttl
* while we are changing it.
*/
WRITE_ONCE(inet->min_ttl, val);
break; break;
default: default:
......
...@@ -508,10 +508,13 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) ...@@ -508,10 +508,13 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
if (sk->sk_state == TCP_CLOSE) if (sk->sk_state == TCP_CLOSE)
goto out; goto out;
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { if (static_branch_unlikely(&ip4_min_ttl)) {
/* min_ttl can be changed concurrently from do_ip_setsockopt() */
if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out; goto out;
} }
}
tp = tcp_sk(sk); tp = tcp_sk(sk);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
...@@ -1703,7 +1706,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) ...@@ -1703,7 +1706,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb); sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb); sk_mark_napi_id(sk, skb);
if (dst) { if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
!INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
dst, 0)) { dst, 0)) {
dst_release(dst); dst_release(dst);
...@@ -1788,7 +1791,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) ...@@ -1788,7 +1791,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
if (dst) if (dst)
dst = dst_check(dst, 0); dst = dst_check(dst, 0);
if (dst && if (dst &&
inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst); skb_dst_set_noref(skb, dst);
} }
} }
...@@ -2068,10 +2071,14 @@ int tcp_v4_rcv(struct sk_buff *skb) ...@@ -2068,10 +2071,14 @@ int tcp_v4_rcv(struct sk_buff *skb)
return 0; return 0;
} }
} }
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
if (static_branch_unlikely(&ip4_min_ttl)) {
/* min_ttl can be changed concurrently from do_ip_setsockopt() */
if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse; goto discard_and_relse;
} }
}
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse; goto discard_and_relse;
...@@ -2195,7 +2202,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) ...@@ -2195,7 +2202,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
if (dst && dst_hold_safe(dst)) { if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst; sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; sk->sk_rx_dst_ifindex = skb->skb_iif;
} }
} }
EXPORT_SYMBOL(inet_sk_rx_dst_set); EXPORT_SYMBOL(inet_sk_rx_dst_set);
......
...@@ -55,6 +55,8 @@ ...@@ -55,6 +55,8 @@
struct ip6_ra_chain *ip6_ra_chain; struct ip6_ra_chain *ip6_ra_chain;
DEFINE_RWLOCK(ip6_ra_lock); DEFINE_RWLOCK(ip6_ra_lock);
DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount);
int ip6_ra_control(struct sock *sk, int sel) int ip6_ra_control(struct sock *sk, int sel)
{ {
struct ip6_ra_chain *ra, *new_ra, **rap; struct ip6_ra_chain *ra, *new_ra, **rap;
...@@ -950,7 +952,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, ...@@ -950,7 +952,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
goto e_inval; goto e_inval;
if (val < 0 || val > 255) if (val < 0 || val > 255)
goto e_inval; goto e_inval;
np->min_hopcount = val;
if (val)
static_branch_enable(&ip6_min_hopcount);
/* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
* while we are changing it.
*/
WRITE_ONCE(np->min_hopcount, val);
retv = 0; retv = 0;
break; break;
case IPV6_DONTFRAG: case IPV6_DONTFRAG:
......
...@@ -108,8 +108,8 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) ...@@ -108,8 +108,8 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
const struct rt6_info *rt = (const struct rt6_info *)dst; const struct rt6_info *rt = (const struct rt6_info *)dst;
sk->sk_rx_dst = dst; sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; sk->sk_rx_dst_ifindex = skb->skb_iif;
tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
} }
} }
...@@ -414,10 +414,13 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, ...@@ -414,10 +414,13 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (sk->sk_state == TCP_CLOSE) if (sk->sk_state == TCP_CLOSE)
goto out; goto out;
if (ipv6_hdr(skb)->hop_limit < tcp_inet6_sk(sk)->min_hopcount) { if (static_branch_unlikely(&ip6_min_hopcount)) {
/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out; goto out;
} }
}
tp = tcp_sk(sk); tp = tcp_sk(sk);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
...@@ -569,7 +572,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -569,7 +572,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
static void tcp_v6_reqsk_destructor(struct request_sock *req) static void tcp_v6_reqsk_destructor(struct request_sock *req)
{ {
kfree(inet_rsk(req)->ipv6_opt); kfree(inet_rsk(req)->ipv6_opt);
kfree_skb(inet_rsk(req)->pktopts); consume_skb(inet_rsk(req)->pktopts);
} }
#ifdef CONFIG_TCP_MD5SIG #ifdef CONFIG_TCP_MD5SIG
...@@ -1509,9 +1512,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) ...@@ -1509,9 +1512,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb); sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb); sk_mark_napi_id(sk, skb);
if (dst) { if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
dst, np->rx_dst_cookie) == NULL) { dst, sk->sk_rx_dst_cookie) == NULL) {
dst_release(dst); dst_release(dst);
sk->sk_rx_dst = NULL; sk->sk_rx_dst = NULL;
} }
...@@ -1591,7 +1594,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) ...@@ -1591,7 +1594,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
} }
} }
kfree_skb(opt_skb); consume_skb(opt_skb);
return 0; return 0;
} }
...@@ -1726,10 +1729,14 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) ...@@ -1726,10 +1729,14 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
return 0; return 0;
} }
} }
if (hdr->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
if (static_branch_unlikely(&ip6_min_hopcount)) {
/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse; goto discard_and_relse;
} }
}
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse; goto discard_and_relse;
...@@ -1872,9 +1879,9 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb) ...@@ -1872,9 +1879,9 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
if (dst) if (dst)
dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie); dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst && if (dst &&
inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst); skb_dst_set_noref(skb, dst);
} }
} }
......
...@@ -884,7 +884,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) ...@@ -884,7 +884,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
if (udp_sk_rx_dst_set(sk, dst)) { if (udp_sk_rx_dst_set(sk, dst)) {
const struct rt6_info *rt = (const struct rt6_info *)dst; const struct rt6_info *rt = (const struct rt6_info *)dst;
inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
} }
} }
...@@ -1073,7 +1073,7 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb) ...@@ -1073,7 +1073,7 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb)
dst = READ_ONCE(sk->sk_rx_dst); dst = READ_ONCE(sk->sk_rx_dst);
if (dst) if (dst)
dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst) { if (dst) {
/* set noref for now. /* set noref for now.
* any place which wants to hold dst has to call * any place which wants to hold dst has to call
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment