Commit d8c4ef76 authored by Jakub Kicinski

Merge branch 'ipv6-avoid-atomic-fragment-on-gso-output'

Yan Zhai says:

====================
ipv6: avoid atomic fragment on GSO output

When the ipv6 stack outputs a GSO packet, if its gso_size is larger than
the dst MTU, then all segments would be fragmented. However, it is possible
for a GSO packet to have a trailing segment whose actual size is smaller
than both gso_size and the MTU, which leads to an "atomic
fragment". Atomic fragments are considered harmful in RFC-8021. An
existing report from APNIC also shows that atomic fragments are more
likely to be dropped even though they are equivalent to a no-op [1].

The series contains the following changes:
* drop feature RTAX_FEATURE_ALLFRAG, which has been broken. This helps
  simplify other changes in this set.
* refactor __ip6_finish_output code to separate GSO and non-GSO packet
  processing, mirroring IPv4 side logic.
* avoid generating atomic fragment on GSO packets.

Link: https://www.potaroo.net/presentations/2022-03-01-ipv6-frag.pdf [1]

V4: https://lore.kernel.org/netdev/cover.1698114636.git.yan@cloudflare.com/
V3: https://lore.kernel.org/netdev/cover.1697779681.git.yan@cloudflare.com/
V2: https://lore.kernel.org/netdev/ZS1%2Fqtr0dZJ35VII@debian.debian/
====================

Link: https://lore.kernel.org/r/cover.1698156966.git.yan@cloudflare.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 8846f9a0 03d6c848
...@@ -222,13 +222,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr ...@@ -222,13 +222,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr
return msecs_to_jiffies(dst_metric(dst, metric)); return msecs_to_jiffies(dst_metric(dst, metric));
} }
/* Query the RTAX_FEATURE_ALLFRAG feature bit on @dst.
 *
 * Returns whatever dst_feature() reports for RTAX_FEATURE_ALLFRAG; the
 * value passes through an int before being returned as u32.
 * NOTE(review): presumably non-zero means "always add a fragment header
 * on this route" -- confirm against dst_feature().
 *
 * This helper appears only on the removed side of this diff: the series
 * drops RTAX_FEATURE_ALLFRAG support.
 */
static inline u32
dst_allfrag(const struct dst_entry *dst)
{
int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG);
return ret;
}
static inline int static inline int
dst_metric_locked(const struct dst_entry *dst, int metric) dst_metric_locked(const struct dst_entry *dst, int metric)
{ {
......
...@@ -44,7 +44,6 @@ struct inet_connection_sock_af_ops { ...@@ -44,7 +44,6 @@ struct inet_connection_sock_af_ops {
struct request_sock *req_unhash, struct request_sock *req_unhash,
bool *own_req); bool *own_req);
u16 net_header_len; u16 net_header_len;
u16 net_frag_header_len;
u16 sockaddr_len; u16 sockaddr_len;
int (*setsockopt)(struct sock *sk, int level, int optname, int (*setsockopt)(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen); sockptr_t optval, unsigned int optlen);
......
...@@ -244,7 +244,6 @@ struct inet_sock { ...@@ -244,7 +244,6 @@ struct inet_sock {
}; };
#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */
enum { enum {
INET_FLAGS_PKTINFO = 0, INET_FLAGS_PKTINFO = 0,
......
...@@ -505,7 +505,7 @@ enum { ...@@ -505,7 +505,7 @@ enum {
#define RTAX_FEATURE_ECN (1 << 0) #define RTAX_FEATURE_ECN (1 << 0)
#define RTAX_FEATURE_SACK (1 << 1) /* unused */ #define RTAX_FEATURE_SACK (1 << 1) /* unused */
#define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */
#define RTAX_FEATURE_ALLFRAG (1 << 3) #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */
#define RTAX_FEATURE_TCP_USEC_TS (1 << 4) #define RTAX_FEATURE_TCP_USEC_TS (1 << 4)
#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \
......
...@@ -1698,14 +1698,6 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) ...@@ -1698,14 +1698,6 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
*/ */
mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
if (icsk->icsk_af_ops->net_frag_header_len) {
const struct dst_entry *dst = __sk_dst_get(sk);
if (dst && dst_allfrag(dst))
mss_now -= icsk->icsk_af_ops->net_frag_header_len;
}
/* Clamp it (mss_clamp does not include tcp options) */ /* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp) if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp; mss_now = tp->rx_opt.mss_clamp;
...@@ -1733,21 +1725,11 @@ int tcp_mss_to_mtu(struct sock *sk, int mss) ...@@ -1733,21 +1725,11 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
{ {
const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock *icsk = inet_csk(sk);
int mtu;
mtu = mss + return mss +
tp->tcp_header_len + tp->tcp_header_len +
icsk->icsk_ext_hdr_len + icsk->icsk_ext_hdr_len +
icsk->icsk_af_ops->net_header_len; icsk->icsk_af_ops->net_header_len;
/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
if (icsk->icsk_af_ops->net_frag_header_len) {
const struct dst_entry *dst = __sk_dst_get(sk);
if (dst && dst_allfrag(dst))
mtu += icsk->icsk_af_ops->net_frag_header_len;
}
return mtu;
} }
EXPORT_SYMBOL(tcp_mss_to_mtu); EXPORT_SYMBOL(tcp_mss_to_mtu);
......
...@@ -164,7 +164,13 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, ...@@ -164,7 +164,13 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
int err; int err;
skb_mark_not_on_list(segs); skb_mark_not_on_list(segs);
err = ip6_fragment(net, sk, segs, ip6_finish_output2); /* Last GSO segment can be smaller than gso_size (and MTU).
* Adding a fragment header would produce an "atomic fragment",
* which is considered harmful (RFC-8021). Avoid that.
*/
err = segs->len > mtu ?
ip6_fragment(net, sk, segs, ip6_finish_output2) :
ip6_finish_output2(net, sk, segs);
if (err && ret == 0) if (err && ret == 0)
ret = err; ret = err;
} }
...@@ -172,6 +178,16 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, ...@@ -172,6 +178,16 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
return ret; return ret;
} }
/* Finish output for a GSO skb; __ip6_finish_output() calls this when
 * skb_is_gso(skb) is true.
 *
 * @net, @sk, @skb: forwarded unchanged to the callee.
 * @mtu: MTU the packet is checked against.
 *
 * Returns the result of whichever output routine is invoked.
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
struct sk_buff *skb, unsigned int mtu)
{
/* Packets flagged IP6SKB_FAKEJUMBO skip the length validation entirely
 * (short-circuit); all others take the slow path only when
 * skb_gso_validate_network_len() fails for @mtu.
 * NOTE(review): presumably that check fails when some resulting segment
 * would exceed @mtu -- confirm against its definition.
 */
if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
!skb_gso_validate_network_len(skb, mtu))
return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
/* Otherwise the skb is handed to the regular output routine as-is. */
return ip6_finish_output2(net, sk, skb);
}
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{ {
unsigned int mtu; unsigned int mtu;
...@@ -185,17 +201,14 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff ...@@ -185,17 +201,14 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff
#endif #endif
mtu = ip6_skb_dst_mtu(skb); mtu = ip6_skb_dst_mtu(skb);
if (skb_is_gso(skb) && if (skb_is_gso(skb))
!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && return ip6_finish_output_gso(net, sk, skb, mtu);
!skb_gso_validate_network_len(skb, mtu))
return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
if ((skb->len > mtu && !skb_is_gso(skb)) || if (skb->len > mtu ||
dst_allfrag(skb_dst(skb)) ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
return ip6_fragment(net, sk, skb, ip6_finish_output2); return ip6_fragment(net, sk, skb, ip6_finish_output2);
else
return ip6_finish_output2(net, sk, skb); return ip6_finish_output2(net, sk, skb);
} }
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
...@@ -1017,9 +1030,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ...@@ -1017,9 +1030,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
return err; return err;
fail_toobig: fail_toobig:
if (skb->sk && dst_allfrag(skb_dst(skb)))
sk_gso_disable(skb->sk);
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
err = -EMSGSIZE; err = -EMSGSIZE;
...@@ -1384,10 +1394,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, ...@@ -1384,10 +1394,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
cork->base.mark = ipc6->sockc.mark; cork->base.mark = ipc6->sockc.mark;
sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
cork->base.flags |= IPCORK_ALLFRAG;
cork->base.length = 0; cork->base.length = 0;
cork->base.transmit_time = ipc6->sockc.transmit_time; cork->base.transmit_time = ipc6->sockc.transmit_time;
return 0; return 0;
...@@ -1444,8 +1451,6 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1444,8 +1451,6 @@ static int __ip6_append_data(struct sock *sk,
headersize = sizeof(struct ipv6hdr) + headersize = sizeof(struct ipv6hdr) +
(opt ? opt->opt_flen + opt->opt_nflen : 0) + (opt ? opt->opt_flen + opt->opt_nflen : 0) +
(dst_allfrag(&rt->dst) ?
sizeof(struct frag_hdr) : 0) +
rt->rt6i_nfheader_len; rt->rt6i_nfheader_len;
if (mtu <= fragheaderlen || if (mtu <= fragheaderlen ||
...@@ -1555,7 +1560,7 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1555,7 +1560,7 @@ static int __ip6_append_data(struct sock *sk,
while (length > 0) { while (length > 0) {
/* Check if the remaining data fits into current packet. */ /* Check if the remaining data fits into current packet. */
copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
if (copy < length) if (copy < length)
copy = maxfraglen - skb->len; copy = maxfraglen - skb->len;
...@@ -1586,7 +1591,7 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1586,7 +1591,7 @@ static int __ip6_append_data(struct sock *sk,
*/ */
datalen = length + fraggap; datalen = length + fraggap;
if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
fraglen = datalen + fragheaderlen; fraglen = datalen + fragheaderlen;
pagedlen = 0; pagedlen = 0;
...@@ -1835,7 +1840,6 @@ static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) ...@@ -1835,7 +1840,6 @@ static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
struct dst_entry *dst = cork->base.dst; struct dst_entry *dst = cork->base.dst;
cork->base.dst = NULL; cork->base.dst = NULL;
cork->base.flags &= ~IPCORK_ALLFRAG;
skb_dst_set(skb, dst); skb_dst_set(skb, dst);
} }
...@@ -1856,7 +1860,6 @@ static void ip6_cork_release(struct inet_cork_full *cork, ...@@ -1856,7 +1860,6 @@ static void ip6_cork_release(struct inet_cork_full *cork,
if (cork->base.dst) { if (cork->base.dst) {
dst_release(cork->base.dst); dst_release(cork->base.dst);
cork->base.dst = NULL; cork->base.dst = NULL;
cork->base.flags &= ~IPCORK_ALLFRAG;
} }
} }
......
...@@ -1895,7 +1895,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = { ...@@ -1895,7 +1895,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = {
.conn_request = tcp_v6_conn_request, .conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock, .syn_recv_sock = tcp_v6_syn_recv_sock,
.net_header_len = sizeof(struct ipv6hdr), .net_header_len = sizeof(struct ipv6hdr),
.net_frag_header_len = sizeof(struct frag_hdr),
.setsockopt = ipv6_setsockopt, .setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt, .getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr, .addr2sockaddr = inet6_csk_addr2sockaddr,
......
...@@ -95,7 +95,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) ...@@ -95,7 +95,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
return -EMSGSIZE; return -EMSGSIZE;
} }
if (toobig || dst_allfrag(skb_dst(skb))) if (toobig)
return ip6_fragment(net, sk, skb, return ip6_fragment(net, sk, skb,
__xfrm6_output_finish); __xfrm6_output_finish);
......
...@@ -2051,7 +2051,6 @@ void __init mptcp_subflow_init(void) ...@@ -2051,7 +2051,6 @@ void __init mptcp_subflow_init(void)
subflow_v6m_specific.send_check = ipv4_specific.send_check; subflow_v6m_specific.send_check = ipv4_specific.send_check;
subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
subflow_v6m_specific.net_frag_header_len = 0;
subflow_v6m_specific.rebuild_header = subflow_rebuild_header; subflow_v6m_specific.rebuild_header = subflow_rebuild_header;
tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override = tcpv6_prot;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment