Commit f87c10a8, authored by Hannes Frederic Sowa, committed by David S. Miller

ipv4: introduce ip_dst_mtu_maybe_forward and protect forwarding path against pmtu spoofing

While forwarding we should not use the protocol path mtu to calculate
the mtu for a forwarded packet but instead use the interface mtu.

We mark forwarded skbs in ip_forward with IPSKB_FORWARDED, which was
introduced for multicast forwarding. But as it does not conflict with
our usage in unicast code path it is perfect for reuse.

I moved the functions ip_sk_accept_pmtu, ip_sk_use_pmtu and ip_skb_dst_mtu
along with the new ip_dst_mtu_maybe_forward to net/ip.h to fix circular
dependencies because of IPSKB_FORWARDED.

Because someone might have written software that probes
destinations manually and expects the kernel to honour those path MTUs,
I introduced a new per-namespace "ip_forward_use_pmtu" knob so that this
new behaviour can be disabled. We also still use MTUs which are locked on a
route for forwarding.

The reason for this change is that path MTU information can be injected
into the kernel via e.g. the icmp_err protocol handler without verification
of local sockets. As such, this could cause the IPv4 forwarding path to
wrongfully emit fragmentation needed notifications or start to fragment
packets along a path.

Tunnel and ipsec output paths clear IPCB again, thus IPSKB_FORWARDED
won't be set and further fragmentation logic will use the path mtu to
determine the fragmentation size. They also recheck packet size with
help of path mtu discovery and report appropriate errors.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: John Heffner <johnwheffner@gmail.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 6c76a07a
...@@ -32,6 +32,19 @@ ip_no_pmtu_disc - INTEGER ...@@ -32,6 +32,19 @@ ip_no_pmtu_disc - INTEGER
min_pmtu - INTEGER min_pmtu - INTEGER
default 552 - minimum discovered Path MTU default 552 - minimum discovered Path MTU
ip_forward_use_pmtu - BOOLEAN
By default we don't trust protocol path MTUs while forwarding
because they could be easily forged and can lead to unwanted
fragmentation by the router.
You only need to enable this if you have user-space software
which tries to discover path mtus by itself and depends on the
kernel honoring this information. This is normally not the
case.
Default: 0 (disabled)
Possible values:
0 - disabled
1 - enabled
route/max_size - INTEGER route/max_size - INTEGER
Maximum number of routes allowed in the kernel. Increase Maximum number of routes allowed in the kernel. Increase
this when using large numbers of interfaces and/or routes. this when using large numbers of interfaces and/or routes.
......
...@@ -263,6 +263,39 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst) ...@@ -263,6 +263,39 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
!(dst_metric_locked(dst, RTAX_MTU))); !(dst_metric_locked(dst, RTAX_MTU)));
} }
/*
 * Report whether the socket's pmtudisc mode is anything other than
 * IP_PMTUDISC_INTERFACE.
 *
 * @sk: socket whose inet pmtudisc setting is inspected.
 *
 * Returns false only for IP_PMTUDISC_INTERFACE, true for every other mode.
 */
static inline bool ip_sk_accept_pmtu(const struct sock *sk)
{
	if (inet_sk(sk)->pmtudisc == IP_PMTUDISC_INTERFACE)
		return false;

	return true;
}
/*
 * Report whether the socket's pmtudisc mode compares below
 * IP_PMTUDISC_PROBE.
 *
 * @sk: socket whose inet pmtudisc setting is inspected.
 *
 * Returns true when pmtudisc < IP_PMTUDISC_PROBE, false for
 * IP_PMTUDISC_PROBE and any numerically higher mode.
 */
static inline bool ip_sk_use_pmtu(const struct sock *sk)
{
	if (inet_sk(sk)->pmtudisc >= IP_PMTUDISC_PROBE)
		return false;

	return true;
}
/*
 * Pick the MTU to apply to a packet leaving via @dst.
 *
 * @dst:        route the packet is sent on.
 * @forwarding: true when the packet is being forwarded.
 *
 * dst_mtu(@dst) is returned when any of the following holds:
 *   - the namespace's sysctl_ip_fwd_use_pmtu is non-zero,
 *   - the RTAX_MTU metric is locked on @dst,
 *   - the packet is not being forwarded.
 * Otherwise the outgoing device's MTU is returned, capped at IP_MAX_MTU.
 */
static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
						    bool forwarding)
{
	struct net *ns = dev_net(dst->dev);
	bool use_dst_mtu;

	use_dst_mtu = ns->ipv4.sysctl_ip_fwd_use_pmtu ||
		      dst_metric_locked(dst, RTAX_MTU) ||
		      !forwarding;

	if (!use_dst_mtu)
		return min(dst->dev->mtu, IP_MAX_MTU);

	return dst_mtu(dst);
}
/*
 * Return the MTU that applies to @skb on its attached route.
 *
 * When a socket is attached and ip_sk_use_pmtu() is false for it, the
 * outgoing device's MTU (capped at IP_MAX_MTU) is returned.  In every
 * other case the decision is delegated to ip_dst_mtu_maybe_forward(),
 * passing along whether IPSKB_FORWARDED is set in the skb's IPCB flags.
 */
static inline unsigned int ip_skb_dst_mtu(const struct sk_buff *skb)
{
	bool is_forwarded;

	if (skb->sk && !ip_sk_use_pmtu(skb->sk))
		return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU);

	is_forwarded = IPCB(skb)->flags & IPSKB_FORWARDED;

	return ip_dst_mtu_maybe_forward(skb_dst(skb), is_forwarded);
}
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more); void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more);
static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk) static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk)
......
...@@ -70,6 +70,7 @@ struct netns_ipv4 { ...@@ -70,6 +70,7 @@ struct netns_ipv4 {
int sysctl_tcp_ecn; int sysctl_tcp_ecn;
int sysctl_ip_no_pmtu_disc; int sysctl_ip_no_pmtu_disc;
int sysctl_ip_fwd_use_pmtu;
kgid_t sysctl_ping_group_range[2]; kgid_t sysctl_ping_group_range[2];
......
...@@ -36,6 +36,9 @@ ...@@ -36,6 +36,9 @@
#include <linux/cache.h> #include <linux/cache.h>
#include <linux/security.h> #include <linux/security.h>
/* IPv4 datagram length is stored into 16bit field (tot_len) */
#define IP_MAX_MTU 0xFFFFU
#define RTO_ONLINK 0x01 #define RTO_ONLINK 0x01
#define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
...@@ -311,20 +314,4 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) ...@@ -311,20 +314,4 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
return hoplimit; return hoplimit;
} }
/*
 * Returns true unless the socket's pmtudisc setting is
 * IP_PMTUDISC_INTERFACE.
 */
static inline bool ip_sk_accept_pmtu(const struct sock *sk)
{
return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE;
}
/*
 * Returns true when the socket's pmtudisc setting compares below
 * IP_PMTUDISC_PROBE.
 */
static inline bool ip_sk_use_pmtu(const struct sock *sk)
{
return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE;
}
/*
 * Returns dst_mtu() of the skb's route when no socket is attached or
 * ip_sk_use_pmtu() is true for the attached socket; otherwise returns
 * the MTU of the route's output device.  The result is a plain int and
 * no upper bound is applied to the device MTU here.
 */
static inline int ip_skb_dst_mtu(const struct sk_buff *skb)
{
return (!skb->sk || ip_sk_use_pmtu(skb->sk)) ?
dst_mtu(skb_dst(skb)) : skb_dst(skb)->dev->mtu;
}
#endif /* _ROUTE_H */ #endif /* _ROUTE_H */
...@@ -54,6 +54,7 @@ static int ip_forward_finish(struct sk_buff *skb) ...@@ -54,6 +54,7 @@ static int ip_forward_finish(struct sk_buff *skb)
int ip_forward(struct sk_buff *skb) int ip_forward(struct sk_buff *skb)
{ {
u32 mtu;
struct iphdr *iph; /* Our header */ struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */ struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options *opt = &(IPCB(skb)->opt);
...@@ -88,11 +89,13 @@ int ip_forward(struct sk_buff *skb) ...@@ -88,11 +89,13 @@ int ip_forward(struct sk_buff *skb)
if (opt->is_strictroute && rt->rt_uses_gateway) if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed; goto sr_failed;
if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && IPCB(skb)->flags |= IPSKB_FORWARDED;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
if (unlikely(skb->len > mtu && !skb_is_gso(skb) &&
(ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(dst_mtu(&rt->dst))); htonl(mtu));
goto drop; goto drop;
} }
......
...@@ -449,6 +449,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) ...@@ -449,6 +449,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
__be16 not_last_frag; __be16 not_last_frag;
struct rtable *rt = skb_rtable(skb); struct rtable *rt = skb_rtable(skb);
int err = 0; int err = 0;
bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;
dev = rt->dst.dev; dev = rt->dst.dev;
...@@ -458,12 +459,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) ...@@ -458,12 +459,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
iph = ip_hdr(skb); iph = ip_hdr(skb);
mtu = ip_dst_mtu_maybe_forward(&rt->dst, forwarding);
if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
(IPCB(skb)->frag_max_size && (IPCB(skb)->frag_max_size &&
IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) { IPCB(skb)->frag_max_size > mtu))) {
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(ip_skb_dst_mtu(skb))); htonl(mtu));
kfree_skb(skb); kfree_skb(skb);
return -EMSGSIZE; return -EMSGSIZE;
} }
...@@ -473,7 +475,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) ...@@ -473,7 +475,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
*/ */
hlen = iph->ihl * 4; hlen = iph->ihl * 4;
mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ mtu = mtu - hlen; /* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER #ifdef CONFIG_BRIDGE_NETFILTER
if (skb->nf_bridge) if (skb->nf_bridge)
mtu -= nf_bridge_mtu_reduction(skb); mtu -= nf_bridge_mtu_reduction(skb);
......
...@@ -112,9 +112,6 @@ ...@@ -112,9 +112,6 @@
#define RT_FL_TOS(oldflp4) \ #define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
/* IPv4 datagram length is stored into 16bit field (tot_len) */
#define IP_MAX_MTU 0xFFFF
#define RT_GC_TIMEOUT (300*HZ) #define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_max_size; static int ip_rt_max_size;
......
...@@ -831,6 +831,13 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -831,6 +831,13 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "ip_forward_use_pmtu",
.data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ } { }
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment