Commit 2ac24d6d authored by David S. Miller

Merge branch 'Support-PMTU-discovery-with-bridged-UDP-tunnels'

Stefano Brivio says:

====================
Support PMTU discovery with bridged UDP tunnels

Currently, PMTU discovery for UDP tunnels only works if packets are
routed to the encapsulating interfaces, not bridged.

This results from the fact that we generally don't have valid routes
to the senders we can use to relay ICMP and ICMPv6 errors, and makes
PMTU discovery completely non-functional for VXLAN and GENEVE ports of
both regular bridges and Open vSwitch instances.

If the sender is local, and packets are forwarded to the port by a
regular bridge, all it takes is to generate a corresponding route
exception on the encapsulating device. The bridge then finds the route
exception carrying the PMTU value estimate as it forwards frames, and
relays ICMP messages back to the socket of the local sender. Patch 1/6
fixes this case.

If the sender resides on another node, we actually need to reply to
IP and IPv6 packets ourselves and send these ICMP or ICMPv6 errors
back, using the same encapsulating device. Patch 2/6, based on an
original idea by Florian Westphal, adds the needed functionality,
while patches 3/6 and 4/6 add matching support for VXLAN and GENEVE.

Finally, 5/6 and 6/6 introduce selftests for all combinations of
inner and outer IP versions, covering both VXLAN and GENEVE, with
both regular bridges and Open vSwitch instances.

v2: Add helper to check for any bridge port, skip oif check for PMTU
    routes for bridge ports only, split IPv4 and IPv6 helpers and
    functions (all suggested by David Ahern)
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents cabf06e5 7b53682c
......@@ -308,7 +308,7 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
return PTR_ERR(rt);
skb_tunnel_check_pmtu(skb, &rt->dst,
BAREUDP_IPV4_HLEN + info->options_len);
BAREUDP_IPV4_HLEN + info->options_len, false);
sport = udp_flow_src_port(bareudp->net, skb,
bareudp->sport_min, USHRT_MAX,
......@@ -369,7 +369,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
if (IS_ERR(dst))
return PTR_ERR(dst);
skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len);
skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len,
false);
sport = udp_flow_src_port(bareudp->net, skb,
bareudp->sport_min, USHRT_MAX,
......
......@@ -893,8 +893,31 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
if (IS_ERR(rt))
return PTR_ERR(rt);
skb_tunnel_check_pmtu(skb, &rt->dst,
GENEVE_IPV4_HLEN + info->options_len);
err = skb_tunnel_check_pmtu(skb, &rt->dst,
GENEVE_IPV4_HLEN + info->options_len,
netif_is_any_bridge_port(dev));
if (err < 0) {
dst_release(&rt->dst);
return err;
} else if (err) {
struct ip_tunnel_info *info;
info = skb_tunnel_info(skb);
if (info) {
info->key.u.ipv4.dst = fl4.saddr;
info->key.u.ipv4.src = fl4.daddr;
}
if (!pskb_may_pull(skb, ETH_HLEN)) {
dst_release(&rt->dst);
return -EINVAL;
}
skb->protocol = eth_type_trans(skb, geneve->dev);
netif_rx(skb);
dst_release(&rt->dst);
return -EMSGSIZE;
}
sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
if (geneve->cfg.collect_md) {
......@@ -955,7 +978,30 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
if (IS_ERR(dst))
return PTR_ERR(dst);
skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len);
err = skb_tunnel_check_pmtu(skb, dst,
GENEVE_IPV6_HLEN + info->options_len,
netif_is_any_bridge_port(dev));
if (err < 0) {
dst_release(dst);
return err;
} else if (err) {
struct ip_tunnel_info *info = skb_tunnel_info(skb);
if (info) {
info->key.u.ipv6.dst = fl6.saddr;
info->key.u.ipv6.src = fl6.daddr;
}
if (!pskb_may_pull(skb, ETH_HLEN)) {
dst_release(dst);
return -EINVAL;
}
skb->protocol = eth_type_trans(skb, geneve->dev);
netif_rx(skb);
dst_release(dst);
return -EMSGSIZE;
}
sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
if (geneve->cfg.collect_md) {
......@@ -1012,7 +1058,8 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
if (likely(!err))
return NETDEV_TX_OK;
dev_kfree_skb(skb);
if (err != -EMSGSIZE)
dev_kfree_skb(skb);
if (err == -ELOOP)
dev->stats.collisions++;
......
......@@ -2500,7 +2500,8 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
/* Bypass encapsulation if the destination is local */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
struct vxlan_dev *dst_vxlan, __be32 vni)
struct vxlan_dev *dst_vxlan, __be32 vni,
bool snoop)
{
struct pcpu_sw_netstats *tx_stats, *rx_stats;
union vxlan_addr loopback;
......@@ -2532,7 +2533,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
goto drop;
}
if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop)
vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);
u64_stats_update_begin(&tx_stats->syncp);
......@@ -2581,7 +2582,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
return -ENOENT;
}
vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
return 1;
}
......@@ -2617,7 +2618,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
if (vxlan_addr_any(dst)) {
if (did_rsc) {
/* short-circuited back to local bridge */
vxlan_encap_bypass(skb, vxlan, vxlan, default_vni);
vxlan_encap_bypass(skb, vxlan, vxlan,
default_vni, true);
return;
}
goto drop;
......@@ -2720,7 +2722,23 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
}
ndst = &rt->dst;
skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
err = skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM,
netif_is_any_bridge_port(dev));
if (err < 0) {
goto tx_error;
} else if (err) {
if (info) {
struct in_addr src, dst;
src = remote_ip.sin.sin_addr;
dst = local_ip.sin.sin_addr;
info->key.u.ipv4.src = src.s_addr;
info->key.u.ipv4.dst = dst.s_addr;
}
vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
dst_release(ndst);
goto out_unlock;
}
tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
......@@ -2760,7 +2778,24 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
goto out_unlock;
}
skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
err = skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM,
netif_is_any_bridge_port(dev));
if (err < 0) {
goto tx_error;
} else if (err) {
if (info) {
struct in6_addr src, dst;
src = remote_ip.sin6.sin6_addr;
dst = local_ip.sin6.sin6_addr;
info->key.u.ipv6.src = src;
info->key.u.ipv6.dst = dst;
}
vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
dst_release(ndst);
goto out_unlock;
}
tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
ttl = ttl ? : ip6_dst_hoplimit(ndst);
......
......@@ -4840,6 +4840,11 @@ static inline bool netif_is_ovs_port(const struct net_device *dev)
return dev->priv_flags & IFF_OVS_DATAPATH;
}
static inline bool netif_is_any_bridge_port(const struct net_device *dev)
{
return netif_is_bridge_port(dev) || netif_is_ovs_port(dev);
}
static inline bool netif_is_team_master(const struct net_device *dev)
{
return dev->priv_flags & IFF_TEAM;
......
......@@ -535,14 +535,4 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}
/* Lower the PMTU recorded on the skb's dst when the packet is longer than
 * what the encapsulating route can carry once @headroom bytes of tunnel
 * headers are accounted for.
 */
static inline void skb_tunnel_check_pmtu(struct sk_buff *skb,
					 struct dst_entry *encap_dst,
					 int headroom)
{
	u32 payload_mtu = dst_mtu(encap_dst) - headroom;

	if (skb->len <= payload_mtu)
		return;

	skb_dst_update_pmtu_no_confirm(skb, payload_mtu);
}
#endif /* _NET_DST_H */
......@@ -420,6 +420,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
u8 tos, u8 ttl, __be16 df, bool xnet);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
gfp_t flags);
int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
int headroom, bool reply);
int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask);
......
......@@ -184,6 +184,250 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
/**
* iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
* @skb: Original packet with L2 header
* @mtu: MTU value for ICMP error
*
* Return: length on success, negative error code if message couldn't be built.
*/
static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
{
const struct iphdr *iph = ip_hdr(skb);
struct icmphdr *icmph;
struct iphdr *niph;
struct ethhdr eh;
int len, err;
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
return -EINVAL;
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
skb_reset_network_header(skb);
err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
if (err)
return err;
len = skb->len + sizeof(*icmph);
err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
if (err)
return err;
icmph = skb_push(skb, sizeof(*icmph));
*icmph = (struct icmphdr) {
.type = ICMP_DEST_UNREACH,
.code = ICMP_FRAG_NEEDED,
.checksum = 0,
.un.frag.__unused = 0,
.un.frag.mtu = ntohs(mtu),
};
icmph->checksum = ip_compute_csum(icmph, len);
skb_reset_transport_header(skb);
niph = skb_push(skb, sizeof(*niph));
*niph = (struct iphdr) {
.ihl = sizeof(*niph) / 4u,
.version = 4,
.tos = 0,
.tot_len = htons(len + sizeof(*niph)),
.id = 0,
.frag_off = htons(IP_DF),
.ttl = iph->ttl,
.protocol = IPPROTO_ICMP,
.saddr = iph->daddr,
.daddr = iph->saddr,
};
ip_send_check(niph);
skb_reset_network_header(skb);
skb->ip_summed = CHECKSUM_NONE;
eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
skb_reset_mac_header(skb);
return skb->len;
}
/**
 * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 * @mtu:	Network MTU for path
 *
 * No reply is generated when @mtu is below the 576-byte floor, when the
 * original datagram does not have exactly DF set, when source or destination
 * addresses are not suitable for an error reply, or when the original packet
 * is itself an ICMP error.
 *
 * Return: 0 for no ICMP reply, length if built, negative value on error.
 */
static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
{
	const struct icmphdr *icmph = icmp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);

	/* 576 itself is a valid MTU to report: only values below it are
	 * rejected, otherwise a path MTU of exactly 576 never gets an error.
	 */
	if (mtu < 576 || iph->frag_off != htons(IP_DF))
		return 0;

	if (ipv4_is_lbcast(iph->daddr)  || ipv4_is_multicast(iph->daddr) ||
	    ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr)  ||
	    ipv4_is_lbcast(iph->saddr)  || ipv4_is_multicast(iph->saddr))
		return 0;

	/* Never answer an ICMP error with another ICMP error */
	if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type))
		return 0;

	return iptunnel_pmtud_build_icmp(skb, mtu);
}
#if IS_ENABLED(CONFIG_IPV6)
/**
* iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD
* @skb: Original packet with L2 header
* @mtu: MTU value for ICMPv6 error
*
* Return: length on success, negative error code if message couldn't be built.
*/
static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
{
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct icmp6hdr *icmp6h;
struct ipv6hdr *nip6h;
struct ethhdr eh;
int len, err;
__wsum csum;
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
return -EINVAL;
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
skb_reset_network_header(skb);
err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
if (err)
return err;
len = skb->len + sizeof(*icmp6h);
err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
if (err)
return err;
icmp6h = skb_push(skb, sizeof(*icmp6h));
*icmp6h = (struct icmp6hdr) {
.icmp6_type = ICMPV6_PKT_TOOBIG,
.icmp6_code = 0,
.icmp6_cksum = 0,
.icmp6_mtu = htonl(mtu),
};
skb_reset_transport_header(skb);
nip6h = skb_push(skb, sizeof(*nip6h));
*nip6h = (struct ipv6hdr) {
.priority = 0,
.version = 6,
.flow_lbl = { 0 },
.payload_len = htons(len),
.nexthdr = IPPROTO_ICMPV6,
.hop_limit = ip6h->hop_limit,
.saddr = ip6h->daddr,
.daddr = ip6h->saddr,
};
skb_reset_network_header(skb);
csum = csum_partial(icmp6h, len, 0);
icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len,
IPPROTO_ICMPV6, csum);
skb->ip_summed = CHECKSUM_NONE;
eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
skb_reset_mac_header(skb);
return skb->len;
}
/**
 * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed
 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 * @mtu:	Network MTU for path
 *
 * No reply is generated when @mtu is below IPV6_MIN_MTU, when the source
 * address is unspecified, multicast or loopback, when extension headers
 * can't be parsed or the packet is a non-first fragment, or when the
 * original packet is itself an ICMPv6 error or an NDISC redirect.
 *
 * Return: 0 for no ICMPv6 reply, length if built, negative value on error.
 */
static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
	int stype = ipv6_addr_type(&ip6h->saddr);
	u8 proto = ip6h->nexthdr;
	__be16 frag_off;
	int offset;

	/* IPV6_MIN_MTU itself is a valid MTU to report: only values below it
	 * are rejected, otherwise a path MTU of exactly IPV6_MIN_MTU never
	 * gets a Packet Too Big reply.
	 */
	if (mtu < IPV6_MIN_MTU)
		return 0;

	if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
	    stype == IPV6_ADDR_LOOPBACK)
		return 0;

	offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
				  &frag_off);
	/* Bail out on parse failure or on a non-zero fragment offset */
	if (offset < 0 || (frag_off & htons(~0x7)))
		return 0;

	if (proto == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6h;

		/* Only the type byte is read below, hence offset + 1 */
		if (!pskb_may_pull(skb, skb_network_header(skb) +
					offset + 1 - skb->data))
			return 0;

		icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset);
		if (icmpv6_is_err(icmp6h->icmp6_type) ||
		    icmp6h->icmp6_type == NDISC_REDIRECT)
			return 0;
	}

	return iptunnel_pmtud_build_icmpv6(skb, mtu);
}
#endif /* IS_ENABLED(CONFIG_IPV6) */
/**
 * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 * @encap_dst:	Destination for tunnel encapsulation (outer IP)
 * @headroom:	Encapsulation header size, bytes
 * @reply:	Build matching ICMP or ICMPv6 message as a result
 *
 * Bridged L2 tunnels carrying IP (currently UDP tunnels) cannot always count
 * on an IP forwarding path to take care of PMTU discovery, so in that case
 * the ICMP or ICMPv6 error has to be built from the payload and sent back by
 * the encapsulation itself.
 *
 * For routable interfaces, updating the PMTU for the destination is enough.
 *
 * Return: 0 if ICMP error not needed, length if built, negative value on error
 */
int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
			  int headroom, bool reply)
{
	u32 mtu = dst_mtu(encap_dst) - headroom;
	bool fits;

	/* GSO packets are judged by their segment size, others by the L3
	 * length, that is, the total length without the MAC header.
	 */
	if (skb_is_gso(skb))
		fits = skb_gso_validate_network_len(skb, mtu);
	else
		fits = (skb->len - skb_mac_header_len(skb)) <= mtu;

	if (fits)
		return 0;

	skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (!reply)
		return 0;

	if (skb->pkt_type == PACKET_HOST)
		return 0;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
		return iptunnel_pmtud_check_icmp(skb, mtu);
#if IS_ENABLED(CONFIG_IPV6)
	case htons(ETH_P_IPV6):
		return iptunnel_pmtud_check_icmpv6(skb, mtu);
#endif
	default:
		return 0;
	}
}
EXPORT_SYMBOL(skb_tunnel_check_pmtu);
/* Often modified stats are per cpu, other are shared (netdev->stats) */
void ip_tunnel_get_stats64(struct net_device *dev,
struct rtnl_link_stats64 *tot)
......
......@@ -1050,6 +1050,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct flowi4 fl4;
ip_rt_build_flow_key(&fl4, sk, skb);
/* Don't make lookup fail for bridged encapsulations */
if (skb && netif_is_any_bridge_port(skb->dev))
fl4.flowi4_oif = 0;
__ip_rt_update_pmtu(rt, &fl4, mtu);
}
......
......@@ -59,6 +59,45 @@
# Same as pmtu_ipv6_vxlan6_exception, but using a GENEVE tunnel instead of
# VXLAN
#
# - pmtu_ipv{4,6}_br_vxlan{4,6}_exception
# Set up three namespaces, A, B, and C, with routing between A and B over
# R1. R2 is unused in these tests. A has a veth connection to C, and is
# connected to B via a VXLAN endpoint, which is directly bridged to C.
# MTU on the B-R1 link is lower than other MTUs.
#
# Check that both C and A are able to communicate with B over the VXLAN
# tunnel, and that PMTU exceptions with the correct values are created.
#
# segment a_r1 segment b_r1 b_r1: 4000
# .--------------R1--------------. everything
# C---veth A B else: 5000
# ' bridge |
# '---- - - - - - VXLAN - - - - - - - '
#
# - pmtu_ipv{4,6}_br_geneve{4,6}_exception
# Same as pmtu_ipv{4,6}_br_vxlan{4,6}_exception, with a GENEVE tunnel
# instead.
#
# - pmtu_ipv{4,6}_ovs_vxlan{4,6}_exception
# Set up two namespaces, B, and C, with routing between the init namespace
# and B over R1. A and R2 are unused in these tests. The init namespace
# has a veth connection to C, and is connected to B via a VXLAN endpoint,
# which is handled by Open vSwitch and bridged to C. MTU on the B-R1 link
# is lower than other MTUs.
#
# Check that C is able to communicate with B over the VXLAN tunnel, and
# that PMTU exceptions with the correct values are created.
#
# segment a_r1 segment b_r1 b_r1: 4000
# .--------------R1--------------. everything
# C---veth init B else: 5000
# '- ovs |
# '---- - - - - - VXLAN - - - - - - - '
#
# - pmtu_ipv{4,6}_ovs_geneve{4,6}_exception
# Same as pmtu_ipv{4,6}_ovs_vxlan{4,6}_exception, with a GENEVE tunnel
# instead.
#
# - pmtu_ipv{4,6}_fou{4,6}_exception
# Same as pmtu_ipv4_vxlan4, but using a direct IPv4/IPv6 encapsulation
# (FoU) over IPv4/IPv6, instead of VXLAN
......@@ -147,6 +186,22 @@ tests="
pmtu_ipv6_geneve4_exception IPv6 over geneve4: PMTU exceptions 1
pmtu_ipv4_geneve6_exception IPv4 over geneve6: PMTU exceptions 1
pmtu_ipv6_geneve6_exception IPv6 over geneve6: PMTU exceptions 1
pmtu_ipv4_br_vxlan4_exception IPv4, bridged vxlan4: PMTU exceptions 1
pmtu_ipv6_br_vxlan4_exception IPv6, bridged vxlan4: PMTU exceptions 1
pmtu_ipv4_br_vxlan6_exception IPv4, bridged vxlan6: PMTU exceptions 1
pmtu_ipv6_br_vxlan6_exception IPv6, bridged vxlan6: PMTU exceptions 1
pmtu_ipv4_br_geneve4_exception IPv4, bridged geneve4: PMTU exceptions 1
pmtu_ipv6_br_geneve4_exception IPv6, bridged geneve4: PMTU exceptions 1
pmtu_ipv4_br_geneve6_exception IPv4, bridged geneve6: PMTU exceptions 1
pmtu_ipv6_br_geneve6_exception IPv6, bridged geneve6: PMTU exceptions 1
pmtu_ipv4_ovs_vxlan4_exception IPv4, OVS vxlan4: PMTU exceptions 1
pmtu_ipv6_ovs_vxlan4_exception IPv6, OVS vxlan4: PMTU exceptions 1
pmtu_ipv4_ovs_vxlan6_exception IPv4, OVS vxlan6: PMTU exceptions 1
pmtu_ipv6_ovs_vxlan6_exception IPv6, OVS vxlan6: PMTU exceptions 1
pmtu_ipv4_ovs_geneve4_exception IPv4, OVS geneve4: PMTU exceptions 1
pmtu_ipv6_ovs_geneve4_exception IPv6, OVS geneve4: PMTU exceptions 1
pmtu_ipv4_ovs_geneve6_exception IPv4, OVS geneve6: PMTU exceptions 1
pmtu_ipv6_ovs_geneve6_exception IPv6, OVS geneve6: PMTU exceptions 1
pmtu_ipv4_fou4_exception IPv4 over fou4: PMTU exceptions 1
pmtu_ipv6_fou4_exception IPv6 over fou4: PMTU exceptions 1
pmtu_ipv4_fou6_exception IPv4 over fou6: PMTU exceptions 1
......@@ -173,10 +228,12 @@ tests="
NS_A="ns-A"
NS_B="ns-B"
NS_C="ns-C"
NS_R1="ns-R1"
NS_R2="ns-R2"
ns_a="ip netns exec ${NS_A}"
ns_b="ip netns exec ${NS_B}"
ns_c="ip netns exec ${NS_C}"
ns_r1="ip netns exec ${NS_R1}"
ns_r2="ip netns exec ${NS_R2}"
......@@ -239,9 +296,11 @@ routes_nh="
veth4_a_addr="192.168.1.1"
veth4_b_addr="192.168.1.2"
veth4_c_addr="192.168.2.10"
veth4_mask="24"
veth6_a_addr="fd00:1::a"
veth6_b_addr="fd00:1::b"
veth6_c_addr="fd00:2::c"
veth6_mask="64"
tunnel4_a_addr="192.168.2.1"
......@@ -428,7 +487,7 @@ setup_ip6ip6() {
}
setup_namespaces() {
for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do
for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do
ip netns add ${n} || return 1
# Disable DAD, so that we don't have to wait to use the
......@@ -484,6 +543,7 @@ setup_vxlan_or_geneve() {
a_addr="${2}"
b_addr="${3}"
opts="${4}"
br_if_a="${5}"
if [ "${type}" = "vxlan" ]; then
opts="${opts} ttl 64 dstport 4789"
......@@ -497,10 +557,16 @@ setup_vxlan_or_geneve() {
run_cmd ${ns_a} ip link add ${type}_a type ${type} id 1 ${opts_a} remote ${b_addr} ${opts} || return 1
run_cmd ${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts}
run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${type}_a
run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b
if [ -n "${br_if_a}" ]; then
run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${br_if_a}
run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${br_if_a}
run_cmd ${ns_a} ip link set ${type}_a master ${br_if_a}
else
run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${type}_a
run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${type}_a
fi
run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${type}_a
run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b
run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${type}_b
run_cmd ${ns_a} ip link set ${type}_a up
......@@ -516,11 +582,27 @@ setup_vxlan4() {
}
setup_geneve6() {
setup_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
setup_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 ""
}
setup_vxlan6() {
setup_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
setup_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 ""
}
# Bridged tunnel setups: each wrapper calls setup_vxlan_or_geneve with the
# tunnel type, the outer addresses of A and B on the R1 segments, extra link
# options, and "br0" as the bridge interface argument.

# GENEVE over IPv4, with "df set" passed as extra link options
setup_bridged_geneve4() {
setup_vxlan_or_geneve geneve ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 "df set" "br0"
}
# VXLAN over IPv4, with "df set" passed as extra link options
setup_bridged_vxlan4() {
setup_vxlan_or_geneve vxlan ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 "df set" "br0"
}
# GENEVE over IPv6, no extra link options
setup_bridged_geneve6() {
setup_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 "" "br0"
}
# VXLAN over IPv6, no extra link options
setup_bridged_vxlan6() {
setup_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 "" "br0"
}
setup_xfrm() {
......@@ -630,6 +712,80 @@ setup_routing() {
return 0
}
# Create bridge br0 in namespace A and a veth pair between C and A: the C end
# (veth_C-A) gets the veth4/veth6 "c" addresses, the A end (veth_A-C) is
# enslaved to br0. Returns 2 if the bridge can't be created.
setup_bridge() {
	run_cmd ${ns_a} ip link add br0 type bridge || return 2
	run_cmd ${ns_a} ip link set br0 up

	run_cmd ${ns_c} ip link add veth_C-A type veth peer name veth_A-C
	# Use the namespace name variable instead of a hard-coded "ns-A"
	run_cmd ${ns_c} ip link set veth_A-C netns ${NS_A}

	run_cmd ${ns_a} ip link set veth_A-C up
	run_cmd ${ns_c} ip link set veth_C-A up
	run_cmd ${ns_c} ip addr add ${veth4_c_addr}/${veth4_mask} dev veth_C-A
	run_cmd ${ns_c} ip addr add ${veth6_c_addr}/${veth6_mask} dev veth_C-A
	run_cmd ${ns_a} ip link set veth_A-C master br0
}
# Add a VXLAN or GENEVE port to the Open vSwitch bridge ovs_br0 and create the
# matching tunnel endpoint in namespace B.
#   $1: tunnel type, "vxlan" or "geneve"
#   $2: outer address of the local (OVS) side
#   $3: outer address of B
# Returns 1 if either tunnel endpoint can't be created.
setup_ovs_vxlan_or_geneve() {
	type="${1}"
	a_addr="${2}"
	b_addr="${3}"

	# Shell variables are global: reset these so that values left behind
	# by a previous tunnel setup don't leak into this one
	opts=""
	opts_b=""

	if [ "${type}" = "vxlan" ]; then
		opts="ttl 64 dstport 4789"
		opts_b="local ${b_addr}"
	fi

	run_cmd ovs-vsctl add-port ovs_br0 ${type}_a -- \
		set interface ${type}_a type=${type} \
		options:remote_ip=${b_addr} options:key=1 options:csum=true || return 1

	run_cmd ${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts} || return 1

	run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b
	run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${type}_b

	run_cmd ${ns_b} ip link set ${type}_b up
}
# Open vSwitch tunnel setups: each wrapper calls setup_ovs_vxlan_or_geneve
# with the tunnel type and the outer addresses on the a_r1 and b_r1 segments.

# GENEVE over IPv4
setup_ovs_geneve4() {
setup_ovs_vxlan_or_geneve geneve ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1
}
# VXLAN over IPv4
setup_ovs_vxlan4() {
setup_ovs_vxlan_or_geneve vxlan ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1
}
# GENEVE over IPv6
setup_ovs_geneve6() {
setup_ovs_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
}
# VXLAN over IPv6
setup_ovs_vxlan6() {
setup_ovs_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
}
# Create the Open vSwitch bridge ovs_br0 outside of the test namespaces (the
# commands below run without a netns prefix), connect it to namespace C with a
# veth pair, and take over veth_A-R1 together with the addresses and routes
# needed to reach B's outer addresses via R1.
# Returns 2 if the OVS bridge can't be created.
setup_ovs_bridge() {
run_cmd ovs-vsctl add-br ovs_br0 || return 2
run_cmd ip link set ovs_br0 up
# veth pair between C and the OVS bridge: the pair is created in C, then the
# veth_A-C end is moved out with "netns 1"
# NOTE(review): "netns 1" presumably selects the namespace of PID 1 -- confirm
run_cmd ${ns_c} ip link add veth_C-A type veth peer name veth_A-C
run_cmd ${ns_c} ip link set veth_A-C netns 1
run_cmd ip link set veth_A-C up
run_cmd ${ns_c} ip link set veth_C-A up
run_cmd ${ns_c} ip addr add ${veth4_c_addr}/${veth4_mask} dev veth_C-A
run_cmd ${ns_c} ip addr add ${veth6_c_addr}/${veth6_mask} dev veth_C-A
run_cmd ovs-vsctl add-port ovs_br0 veth_A-C
# Move veth_A-R1 to init
run_cmd ${ns_a} ip link set veth_A-R1 netns 1
# Re-add the a_r1 segment addresses on the moved interface
run_cmd ip addr add ${prefix4}.${a_r1}.1/${veth4_mask} dev veth_A-R1
run_cmd ip addr add ${prefix6}:${a_r1}::1/${veth6_mask} dev veth_A-R1
run_cmd ip link set veth_A-R1 up
# Routes to B's outer addresses, via the .2 / ::2 address on the a_r1 segment
run_cmd ip route add ${prefix4}.${b_r1}.1 via ${prefix4}.${a_r1}.2
run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
}
setup() {
[ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip
......@@ -657,9 +813,14 @@ cleanup() {
done
tcpdump_pids=
for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do
for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do
ip netns del ${n} 2> /dev/null
done
ip link del veth_A-C 2>/dev/null
ip link del veth_A-R1 2>/dev/null
ovs-vsctl --if-exists del-port vxlan_a 2>/dev/null
ovs-vsctl --if-exists del-br ovs_br0 2>/dev/null
}
mtu() {
......@@ -892,6 +1053,177 @@ test_pmtu_ipv6_geneve6_exception() {
test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 6 6
}
# Check that PMTU exceptions with the expected value are created, both in C
# and in A, when IP packets bridged to a VXLAN or GENEVE port exceed the MTU
# of the B-R1 link.
#   $1: tunnel type, "vxlan" or "geneve"
#   $2: inner IP version, 4 or 6
#   $3: outer IP version, 4 or 6
# Returns 2 if setup fails, 1 if a ping fails.
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() {
	type=${1}
	family=${2}
	outer_family=${3}
	ll_mtu=4000

	if [ ${outer_family} -eq 4 ]; then
		setup namespaces routing bridge bridged_${type}4 || return 2
		#                      IPv4 header   UDP header   VXLAN/GENEVE header   Ethernet header
		exp_mtu=$((${ll_mtu} - 20          - 8          - 8                   - 14))
	else
		setup namespaces routing bridge bridged_${type}6 || return 2
		#                      IPv6 header   UDP header   VXLAN/GENEVE header   Ethernet header
		exp_mtu=$((${ll_mtu} - 40          - 8          - 8                   - 14))
	fi

	# The bridge port towards C is named veth_A-C (see setup_bridge)
	trace "${ns_a}" ${type}_a    "${ns_b}"  ${type}_b \
	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B \
	      "${ns_a}" br0          "${ns_a}"  veth_A-C  \
	      "${ns_c}" veth_C-A

	if [ ${family} -eq 4 ]; then
		ping=ping
		dst=${tunnel4_b_addr}
	else
		ping=${ping6}
		dst=${tunnel6_b_addr}
	fi

	# Create route exception by exceeding link layer MTU
	mtu "${ns_a}"  veth_A-R1 $((${ll_mtu} + 1000))
	mtu "${ns_a}"  br0       $((${ll_mtu} + 1000))
	mtu "${ns_a}"  veth_A-C  $((${ll_mtu} + 1000))
	mtu "${ns_c}"  veth_C-A  $((${ll_mtu} + 1000))
	mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
	mtu "${ns_r1}" veth_R1-B ${ll_mtu}

	mtu "${ns_a}" ${type}_a $((${ll_mtu} + 1000))
	mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))

	# Sender behind the bridge (C), then sender local to the bridge (A)
	run_cmd ${ns_c} ${ping} -q -M want -i 0.1 -c 10 -s $((${ll_mtu} + 500)) ${dst} || return 1
	run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1  -s $((${ll_mtu} + 500)) ${dst} || return 1

	# Check that exceptions were created
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_c}" ${dst})"
	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on bridged ${type} interface"
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on locally bridged ${type} interface"
}
# Bridged tunnel test entry points: each calls
# test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception with the tunnel
# type, the inner IP version and the outer IP version, in this order.

# IPv4 over bridged VXLAN, IPv4 outer
test_pmtu_ipv4_br_vxlan4_exception() {
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 4 4
}
# IPv6 over bridged VXLAN, IPv4 outer
test_pmtu_ipv6_br_vxlan4_exception() {
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 6 4
}
# IPv4 over bridged GENEVE, IPv4 outer
test_pmtu_ipv4_br_geneve4_exception() {
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 4 4
}
# IPv6 over bridged GENEVE, IPv4 outer
test_pmtu_ipv6_br_geneve4_exception() {
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 6 4
}
# IPv4 over bridged VXLAN, IPv6 outer
test_pmtu_ipv4_br_vxlan6_exception() {
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 4 6
}
# IPv6 over bridged VXLAN, IPv6 outer
test_pmtu_ipv6_br_vxlan6_exception() {
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 6 6
}
# IPv4 over bridged GENEVE, IPv6 outer
test_pmtu_ipv4_br_geneve6_exception() {
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 4 6
}
# IPv6 over bridged GENEVE, IPv6 outer
test_pmtu_ipv6_br_geneve6_exception() {
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 6 6
}
# Check that a PMTU exception with the expected value is created in C when IP
# packets switched by Open vSwitch to a VXLAN or GENEVE port exceed the MTU of
# the B-R1 link.
#   $1: tunnel type, "vxlan" or "geneve"
#   $2: inner IP version, 4 or 6
#   $3: outer IP version, 4 or 6
# Returns 2 if setup fails, 1 if the ping fails.
test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception() {
	type=${1}
	family=${2}
	outer_family=${3}
	ll_mtu=4000

	if [ ${outer_family} -eq 4 ]; then
		setup namespaces routing ovs_bridge ovs_${type}4 || return 2
		#                      IPv4 header   UDP header   VXLAN/GENEVE header   Ethernet header
		exp_mtu=$((${ll_mtu} - 20          - 8          - 8                   - 14))
	else
		setup namespaces routing ovs_bridge ovs_${type}6 || return 2
		#                      IPv6 header   UDP header   VXLAN/GENEVE header   Ethernet header
		exp_mtu=$((${ll_mtu} - 40          - 8          - 8                   - 14))
	fi

	# Name of the tunnel device on the Open vSwitch side, per tunnel type
	if [ "${type}" = "vxlan" ]; then
		tun_a="vxlan_sys_4789"
	elif [ "${type}" = "geneve" ]; then
		tun_a="genev_sys_6081"
	fi

	# The OVS port towards C is named veth_A-C (see setup_ovs_bridge)
	trace ""        "${tun_a}"   "${ns_b}"  ${type}_b \
	      ""        veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B \
	      ""        ovs_br0      ""         veth_A-C  \
	      "${ns_c}" veth_C-A

	if [ ${family} -eq 4 ]; then
		ping=ping
		dst=${tunnel4_b_addr}
	else
		ping=${ping6}
		dst=${tunnel6_b_addr}
	fi

	# Create route exception by exceeding link layer MTU
	mtu ""         veth_A-R1 $((${ll_mtu} + 1000))
	mtu ""         ovs_br0   $((${ll_mtu} + 1000))
	mtu ""         veth_A-C  $((${ll_mtu} + 1000))
	mtu "${ns_c}"  veth_C-A  $((${ll_mtu} + 1000))
	mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
	mtu "${ns_r1}" veth_R1-B ${ll_mtu}

	mtu ""        ${tun_a}  $((${ll_mtu} + 1000))
	mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))

	run_cmd ${ns_c} ${ping} -q -M want -i 0.1 -c 20 -s $((${ll_mtu} + 500)) ${dst} || return 1

	# Check that exceptions were created
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_c}" ${dst})"
	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on Open vSwitch ${type} interface"
}
# Open vSwitch tunnel test entry points: each calls
# test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception with the tunnel type,
# the inner IP version and the outer IP version, in this order.

# IPv4 over OVS VXLAN, IPv4 outer
test_pmtu_ipv4_ovs_vxlan4_exception() {
test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 4 4
}
# IPv6 over OVS VXLAN, IPv4 outer
test_pmtu_ipv6_ovs_vxlan4_exception() {
test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 6 4
}
# IPv4 over OVS GENEVE, IPv4 outer
test_pmtu_ipv4_ovs_geneve4_exception() {
test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 4 4
}
# IPv6 over OVS GENEVE, IPv4 outer
test_pmtu_ipv6_ovs_geneve4_exception() {
test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 6 4
}
# IPv4 over OVS VXLAN, IPv6 outer
test_pmtu_ipv4_ovs_vxlan6_exception() {
test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 4 6
}
# IPv6 over OVS VXLAN, IPv6 outer
test_pmtu_ipv6_ovs_vxlan6_exception() {
test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 6 6
}
# IPv4 over OVS GENEVE, IPv6 outer
test_pmtu_ipv4_ovs_geneve6_exception() {
test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 4 6
}
# IPv6 over OVS GENEVE, IPv6 outer
test_pmtu_ipv6_ovs_geneve6_exception() {
test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 6 6
}
test_pmtu_ipvX_over_fouY_or_gueY() {
inner_family=${1}
outer_family=${2}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment