Commit 2ac24d6d authored by David S. Miller's avatar David S. Miller

Merge branch 'Support-PMTU-discovery-with-bridged-UDP-tunnels'

Stefano Brivio says:

====================
Support PMTU discovery with bridged UDP tunnels

Currently, PMTU discovery for UDP tunnels only works if packets are
routed to the encapsulating interfaces, not bridged.

This results from the fact that we generally don't have valid routes
to the senders we can use to relay ICMP and ICMPv6 errors, and makes
PMTU discovery completely non-functional for VXLAN and GENEVE ports of
both regular bridges and Open vSwitch instances.

If the sender is local, and packets are forwarded to the port by a
regular bridge, all it takes is to generate a corresponding route
exception on the encapsulating device. The bridge then finds the route
exception carrying the PMTU value estimate as it forwards frames, and
relays ICMP messages back to the socket of the local sender. Patch 1/6
fixes this case.

If the sender resides on another node, we actually need to reply to
IP and IPv6 packets ourselves and send these ICMP or ICMPv6 errors
back, using the same encapsulating device. Patch 2/6, based on an
original idea by Florian Westphal, adds the needed functionality,
while patches 3/6 and 4/6 add matching support for VXLAN and GENEVE.

Finally, 5/6 and 6/6 introduce selftests for all combinations of
inner and outer IP versions, covering both VXLAN and GENEVE, with
both regular bridges and Open vSwitch instances.

v2: Add helper to check for any bridge port, skip oif check for PMTU
    routes for bridge ports only, split IPv4 and IPv6 helpers and
    functions (all suggested by David Ahern)
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents cabf06e5 7b53682c
......@@ -308,7 +308,7 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
return PTR_ERR(rt);
skb_tunnel_check_pmtu(skb, &rt->dst,
BAREUDP_IPV4_HLEN + info->options_len);
BAREUDP_IPV4_HLEN + info->options_len, false);
sport = udp_flow_src_port(bareudp->net, skb,
bareudp->sport_min, USHRT_MAX,
......@@ -369,7 +369,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
if (IS_ERR(dst))
return PTR_ERR(dst);
skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len);
skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len,
false);
sport = udp_flow_src_port(bareudp->net, skb,
bareudp->sport_min, USHRT_MAX,
......
......@@ -893,8 +893,31 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
if (IS_ERR(rt))
return PTR_ERR(rt);
skb_tunnel_check_pmtu(skb, &rt->dst,
GENEVE_IPV4_HLEN + info->options_len);
err = skb_tunnel_check_pmtu(skb, &rt->dst,
GENEVE_IPV4_HLEN + info->options_len,
netif_is_any_bridge_port(dev));
if (err < 0) {
dst_release(&rt->dst);
return err;
} else if (err) {
struct ip_tunnel_info *info;
info = skb_tunnel_info(skb);
if (info) {
info->key.u.ipv4.dst = fl4.saddr;
info->key.u.ipv4.src = fl4.daddr;
}
if (!pskb_may_pull(skb, ETH_HLEN)) {
dst_release(&rt->dst);
return -EINVAL;
}
skb->protocol = eth_type_trans(skb, geneve->dev);
netif_rx(skb);
dst_release(&rt->dst);
return -EMSGSIZE;
}
sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
if (geneve->cfg.collect_md) {
......@@ -955,7 +978,30 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
if (IS_ERR(dst))
return PTR_ERR(dst);
skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len);
err = skb_tunnel_check_pmtu(skb, dst,
GENEVE_IPV6_HLEN + info->options_len,
netif_is_any_bridge_port(dev));
if (err < 0) {
dst_release(dst);
return err;
} else if (err) {
struct ip_tunnel_info *info = skb_tunnel_info(skb);
if (info) {
info->key.u.ipv6.dst = fl6.saddr;
info->key.u.ipv6.src = fl6.daddr;
}
if (!pskb_may_pull(skb, ETH_HLEN)) {
dst_release(dst);
return -EINVAL;
}
skb->protocol = eth_type_trans(skb, geneve->dev);
netif_rx(skb);
dst_release(dst);
return -EMSGSIZE;
}
sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
if (geneve->cfg.collect_md) {
......@@ -1012,7 +1058,8 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
if (likely(!err))
return NETDEV_TX_OK;
dev_kfree_skb(skb);
if (err != -EMSGSIZE)
dev_kfree_skb(skb);
if (err == -ELOOP)
dev->stats.collisions++;
......
......@@ -2500,7 +2500,8 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
/* Bypass encapsulation if the destination is local */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
struct vxlan_dev *dst_vxlan, __be32 vni)
struct vxlan_dev *dst_vxlan, __be32 vni,
bool snoop)
{
struct pcpu_sw_netstats *tx_stats, *rx_stats;
union vxlan_addr loopback;
......@@ -2532,7 +2533,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
goto drop;
}
if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop)
vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);
u64_stats_update_begin(&tx_stats->syncp);
......@@ -2581,7 +2582,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
return -ENOENT;
}
vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
return 1;
}
......@@ -2617,7 +2618,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
if (vxlan_addr_any(dst)) {
if (did_rsc) {
/* short-circuited back to local bridge */
vxlan_encap_bypass(skb, vxlan, vxlan, default_vni);
vxlan_encap_bypass(skb, vxlan, vxlan,
default_vni, true);
return;
}
goto drop;
......@@ -2720,7 +2722,23 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
}
ndst = &rt->dst;
skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
err = skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM,
netif_is_any_bridge_port(dev));
if (err < 0) {
goto tx_error;
} else if (err) {
if (info) {
struct in_addr src, dst;
src = remote_ip.sin.sin_addr;
dst = local_ip.sin.sin_addr;
info->key.u.ipv4.src = src.s_addr;
info->key.u.ipv4.dst = dst.s_addr;
}
vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
dst_release(ndst);
goto out_unlock;
}
tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
......@@ -2760,7 +2778,24 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
goto out_unlock;
}
skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
err = skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM,
netif_is_any_bridge_port(dev));
if (err < 0) {
goto tx_error;
} else if (err) {
if (info) {
struct in6_addr src, dst;
src = remote_ip.sin6.sin6_addr;
dst = local_ip.sin6.sin6_addr;
info->key.u.ipv6.src = src;
info->key.u.ipv6.dst = dst;
}
vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
dst_release(ndst);
goto out_unlock;
}
tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
ttl = ttl ? : ip6_dst_hoplimit(ndst);
......
......@@ -4840,6 +4840,11 @@ static inline bool netif_is_ovs_port(const struct net_device *dev)
return dev->priv_flags & IFF_OVS_DATAPATH;
}
static inline bool netif_is_any_bridge_port(const struct net_device *dev)
{
return netif_is_bridge_port(dev) || netif_is_ovs_port(dev);
}
static inline bool netif_is_team_master(const struct net_device *dev)
{
return dev->priv_flags & IFF_TEAM;
......
......@@ -535,14 +535,4 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}
static inline void skb_tunnel_check_pmtu(struct sk_buff *skb,
struct dst_entry *encap_dst,
int headroom)
{
u32 encap_mtu = dst_mtu(encap_dst);
if (skb->len > encap_mtu - headroom)
skb_dst_update_pmtu_no_confirm(skb, encap_mtu - headroom);
}
#endif /* _NET_DST_H */
......@@ -420,6 +420,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
u8 tos, u8 ttl, __be16 df, bool xnet);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
gfp_t flags);
int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
int headroom, bool reply);
int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask);
......
......@@ -184,6 +184,250 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
/**
* iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
* @skb: Original packet with L2 header
* @mtu: MTU value for ICMP error
*
* Return: length on success, negative error code if message couldn't be built.
*/
static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
{
const struct iphdr *iph = ip_hdr(skb);
struct icmphdr *icmph;
struct iphdr *niph;
struct ethhdr eh;
int len, err;
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
return -EINVAL;
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
skb_reset_network_header(skb);
err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
if (err)
return err;
len = skb->len + sizeof(*icmph);
err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
if (err)
return err;
icmph = skb_push(skb, sizeof(*icmph));
*icmph = (struct icmphdr) {
.type = ICMP_DEST_UNREACH,
.code = ICMP_FRAG_NEEDED,
.checksum = 0,
.un.frag.__unused = 0,
.un.frag.mtu = ntohs(mtu),
};
icmph->checksum = ip_compute_csum(icmph, len);
skb_reset_transport_header(skb);
niph = skb_push(skb, sizeof(*niph));
*niph = (struct iphdr) {
.ihl = sizeof(*niph) / 4u,
.version = 4,
.tos = 0,
.tot_len = htons(len + sizeof(*niph)),
.id = 0,
.frag_off = htons(IP_DF),
.ttl = iph->ttl,
.protocol = IPPROTO_ICMP,
.saddr = iph->daddr,
.daddr = iph->saddr,
};
ip_send_check(niph);
skb_reset_network_header(skb);
skb->ip_summed = CHECKSUM_NONE;
eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
skb_reset_mac_header(skb);
return skb->len;
}
/**
* iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
* @skb: Buffer being sent by encapsulation, L2 headers expected
* @mtu: Network MTU for path
*
* Return: 0 for no ICMP reply, length if built, negative value on error.
*/
static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
{
const struct icmphdr *icmph = icmp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
if (mtu <= 576 || iph->frag_off != htons(IP_DF))
return 0;
if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) ||
ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr) ||
ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr))
return 0;
if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type))
return 0;
return iptunnel_pmtud_build_icmp(skb, mtu);
}
#if IS_ENABLED(CONFIG_IPV6)
/**
* iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD
* @skb: Original packet with L2 header
* @mtu: MTU value for ICMPv6 error
*
* Return: length on success, negative error code if message couldn't be built.
*/
static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
{
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct icmp6hdr *icmp6h;
struct ipv6hdr *nip6h;
struct ethhdr eh;
int len, err;
__wsum csum;
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
return -EINVAL;
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
skb_reset_network_header(skb);
err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
if (err)
return err;
len = skb->len + sizeof(*icmp6h);
err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
if (err)
return err;
icmp6h = skb_push(skb, sizeof(*icmp6h));
*icmp6h = (struct icmp6hdr) {
.icmp6_type = ICMPV6_PKT_TOOBIG,
.icmp6_code = 0,
.icmp6_cksum = 0,
.icmp6_mtu = htonl(mtu),
};
skb_reset_transport_header(skb);
nip6h = skb_push(skb, sizeof(*nip6h));
*nip6h = (struct ipv6hdr) {
.priority = 0,
.version = 6,
.flow_lbl = { 0 },
.payload_len = htons(len),
.nexthdr = IPPROTO_ICMPV6,
.hop_limit = ip6h->hop_limit,
.saddr = ip6h->daddr,
.daddr = ip6h->saddr,
};
skb_reset_network_header(skb);
csum = csum_partial(icmp6h, len, 0);
icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len,
IPPROTO_ICMPV6, csum);
skb->ip_summed = CHECKSUM_NONE;
eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
skb_reset_mac_header(skb);
return skb->len;
}
/**
* iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed
* @skb: Buffer being sent by encapsulation, L2 headers expected
* @mtu: Network MTU for path
*
* Return: 0 for no ICMPv6 reply, length if built, negative value on error.
*/
static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
{
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
int stype = ipv6_addr_type(&ip6h->saddr);
u8 proto = ip6h->nexthdr;
__be16 frag_off;
int offset;
if (mtu <= IPV6_MIN_MTU)
return 0;
if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
stype == IPV6_ADDR_LOOPBACK)
return 0;
offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
&frag_off);
if (offset < 0 || (frag_off & htons(~0x7)))
return 0;
if (proto == IPPROTO_ICMPV6) {
struct icmp6hdr *icmp6h;
if (!pskb_may_pull(skb, skb_network_header(skb) +
offset + 1 - skb->data))
return 0;
icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset);
if (icmpv6_is_err(icmp6h->icmp6_type) ||
icmp6h->icmp6_type == NDISC_REDIRECT)
return 0;
}
return iptunnel_pmtud_build_icmpv6(skb, mtu);
}
#endif /* IS_ENABLED(CONFIG_IPV6) */
/**
* skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
* @skb: Buffer being sent by encapsulation, L2 headers expected
* @encap_dst: Destination for tunnel encapsulation (outer IP)
* @headroom: Encapsulation header size, bytes
* @reply: Build matching ICMP or ICMPv6 message as a result
*
* L2 tunnel implementations that can carry IP and can be directly bridged
* (currently UDP tunnels) can't always rely on IP forwarding paths to handle
* PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built
* based on payload and sent back by the encapsulation itself.
*
* For routable interfaces, we just need to update the PMTU for the destination.
*
* Return: 0 if ICMP error not needed, length if built, negative value on error
*/
int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
int headroom, bool reply)
{
u32 mtu = dst_mtu(encap_dst) - headroom;
if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
(!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu))
return 0;
skb_dst_update_pmtu_no_confirm(skb, mtu);
if (!reply || skb->pkt_type == PACKET_HOST)
return 0;
if (skb->protocol == htons(ETH_P_IP))
return iptunnel_pmtud_check_icmp(skb, mtu);
#if IS_ENABLED(CONFIG_IPV6)
if (skb->protocol == htons(ETH_P_IPV6))
return iptunnel_pmtud_check_icmpv6(skb, mtu);
#endif
return 0;
}
EXPORT_SYMBOL(skb_tunnel_check_pmtu);
/* Often modified stats are per cpu, other are shared (netdev->stats) */
void ip_tunnel_get_stats64(struct net_device *dev,
struct rtnl_link_stats64 *tot)
......
......@@ -1050,6 +1050,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct flowi4 fl4;
ip_rt_build_flow_key(&fl4, sk, skb);
/* Don't make lookup fail for bridged encapsulations */
if (skb && netif_is_any_bridge_port(skb->dev))
fl4.flowi4_oif = 0;
__ip_rt_update_pmtu(rt, &fl4, mtu);
}
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment