Commit 230583c1 authored by David S. Miller's avatar David S. Miller

Merge branch 'udp-fix-early-demux-for-mcast-packets'

Paolo Abeni says:

====================
udp: fix early demux for mcast packets

Currently the early demux callbacks do not perform source address validation.
This is not an issue for TCP or UDP unicast, where the early demux
is only allowed for connected sockets and the source address is validated
for the first packet and never change.

The UDP protocol currently allows early demux also for unconnected multicast
sockets, and we are not currently doing any validation for them, after that
the first packet lands on the socket: beyond ignoring the rp_filter - if
enabled - any kind of martian sources are also allowed.

This series addresses the issue allowing the early demux callback to return an
error code, and performing the proper checks for unconnected UDP multicast
sockets before leveraging the rx dst cache.

Alternatively we could disable the early demux for unconnected mcast sockets,
but that would cause relevant performance regression - around 50% - while with
this series, with full rp_filter in place, we keep the regression to a more
moderate level.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents d41bb33b bc044e8d
......@@ -39,8 +39,8 @@
/* This is used to register protocols. */
struct net_protocol {
void (*early_demux)(struct sk_buff *skb);
void (*early_demux_handler)(struct sk_buff *skb);
int (*early_demux)(struct sk_buff *skb);
int (*early_demux_handler)(struct sk_buff *skb);
int (*handler)(struct sk_buff *skb);
void (*err_handler)(struct sk_buff *skb, u32 info);
unsigned int no_policy:1,
......
......@@ -175,7 +175,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
fl4->fl4_gre_key = gre_key;
return ip_route_output_key(net, fl4);
}
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev,
struct in_device *in_dev, u32 *itag);
int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
u8 tos, struct net_device *devin);
int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src,
......
......@@ -345,7 +345,7 @@ void tcp_v4_err(struct sk_buff *skb, u32);
void tcp_shutdown(struct sock *sk, int how);
void tcp_v4_early_demux(struct sk_buff *skb);
int tcp_v4_early_demux(struct sk_buff *skb);
int tcp_v4_rcv(struct sk_buff *skb);
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
......
......@@ -259,7 +259,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
}
void udp_v4_early_demux(struct sk_buff *skb);
int udp_v4_early_demux(struct sk_buff *skb);
bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
int udp_get_port(struct sock *sk, unsigned short snum,
int (*saddr_cmp)(const struct sock *,
......
......@@ -311,9 +311,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
int (*edemux)(struct sk_buff *skb);
struct net_device *dev = skb->dev;
void (*edemux)(struct sk_buff *skb);
struct rtable *rt;
int err;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
......@@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
edemux(skb);
err = edemux(skb);
if (unlikely(err))
goto drop_error;
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
......@@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
* how the packet travels inside Linux networking.
*/
if (!skb_valid_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, dev);
if (unlikely(err)) {
if (err == -EXDEV)
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
goto drop;
}
err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, dev);
if (unlikely(err))
goto drop_error;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
......@@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
drop:
kfree_skb(skb);
return NET_RX_DROP;
drop_error:
if (err == -EXDEV)
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
goto drop;
}
/*
......
......@@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
EXPORT_SYMBOL(rt_dst_alloc);
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, int our)
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev,
struct in_device *in_dev, u32 *itag)
{
struct rtable *rth;
struct in_device *in_dev = __in_dev_get_rcu(dev);
unsigned int flags = RTCF_MULTICAST;
u32 itag = 0;
int err;
/* Primary sanity checks. */
if (!in_dev)
return -EINVAL;
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
skb->protocol != htons(ETH_P_IP))
goto e_inval;
return -EINVAL;
if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
goto e_inval;
return -EINVAL;
if (ipv4_is_zeronet(saddr)) {
if (!ipv4_is_local_multicast(daddr))
goto e_inval;
return -EINVAL;
} else {
err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
in_dev, &itag);
in_dev, itag);
if (err < 0)
goto e_err;
return err;
}
return 0;
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, int our)
{
struct in_device *in_dev = __in_dev_get_rcu(dev);
unsigned int flags = RTCF_MULTICAST;
struct rtable *rth;
u32 itag = 0;
int err;
err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
if (err)
return err;
if (our)
flags |= RTCF_LOCAL;
rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
if (!rth)
goto e_nobufs;
return -ENOBUFS;
#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
......@@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
skb_dst_set(skb, &rth->dst);
return 0;
e_nobufs:
return -ENOBUFS;
e_inval:
return -EINVAL;
e_err:
return err;
}
......
......@@ -1503,23 +1503,23 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
void tcp_v4_early_demux(struct sk_buff *skb)
int tcp_v4_early_demux(struct sk_buff *skb)
{
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
if (skb->pkt_type != PACKET_HOST)
return;
return 0;
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
return;
return 0;
iph = ip_hdr(skb);
th = tcp_hdr(skb);
if (th->doff < sizeof(struct tcphdr) / 4)
return;
return 0;
sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
iph->saddr, th->source,
......@@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
skb_dst_set_noref(skb, dst);
}
}
return 0;
}
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
......
......@@ -2221,9 +2221,10 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
return NULL;
}
void udp_v4_early_demux(struct sk_buff *skb)
int udp_v4_early_demux(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
struct in_device *in_dev = NULL;
const struct iphdr *iph;
const struct udphdr *uh;
struct sock *sk = NULL;
......@@ -2234,24 +2235,24 @@ void udp_v4_early_demux(struct sk_buff *skb)
/* validate the packet */
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
return;
return 0;
iph = ip_hdr(skb);
uh = udp_hdr(skb);
if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
in_dev = __in_dev_get_rcu(skb->dev);
if (!in_dev)
return;
return 0;
/* we are supposed to accept bcast packets */
if (skb->pkt_type == PACKET_MULTICAST) {
ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
iph->protocol);
if (!ours)
return;
return 0;
}
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
......@@ -2263,7 +2264,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
}
if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
return;
return 0;
skb->sk = sk;
skb->destructor = sock_efree;
......@@ -2272,12 +2273,23 @@ void udp_v4_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, 0);
if (dst) {
u32 itag = 0;
/* set noref for now.
* any place which wants to hold dst has to call
* dst_hold_safe()
*/
skb_dst_set_noref(skb, dst);
/* for unconnected multicast sockets we need to validate
* the source on each packet
*/
if (!inet_sk(sk)->inet_daddr && in_dev)
return ip_mc_validate_source(skb, iph->daddr,
iph->saddr, iph->tos,
skb->dev, in_dev, &itag);
}
return 0;
}
int udp_rcv(struct sk_buff *skb)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment