Commit d0928c1c authored by Jakub Kicinski

Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next

Florian Westphal says:

====================
Netfilter updates for net-next

1. nf_tables 'brouting' support, from Sriram Yagnaraman.

2. Update bridge netfilter and ovs conntrack helpers to handle
   IPv6 Jumbo packets properly, i.e. fetch the packet length
   from hop-by-hop extension header, from Xin Long.

   This comes with a BIG TCP test case, added to
   tools/testing/selftests/net/.

3. Fix spelling and indentation in conntrack, from Jeremy Sowden.

* 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
  netfilter: nat: fix indentation of function arguments
  netfilter: conntrack: fix typo
  selftests: add a selftest for big tcp
  netfilter: use nf_ip6_check_hbh_len in nf_ct_skb_network_trim
  netfilter: move br_nf_check_hbh_len to utils
  netfilter: bridge: move pskb_trim_rcsum out of br_nf_check_hbh_len
  netfilter: bridge: check len before accessing more nh data
  netfilter: bridge: call pskb_may_pull in br_nf_check_hbh_len
  netfilter: bridge: introduce broute meta statement
====================

Link: https://lore.kernel.org/r/20230308193033.13965-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents b3a8df9f b0ca2000
...@@ -197,6 +197,8 @@ static inline int nf_cookie_v6_check(const struct ipv6hdr *iph, ...@@ -197,6 +197,8 @@ static inline int nf_cookie_v6_check(const struct ipv6hdr *iph,
__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
unsigned int dataoff, u_int8_t protocol); unsigned int dataoff, u_int8_t protocol);
int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen);
int ipv6_netfilter_init(void); int ipv6_netfilter_init(void);
void ipv6_netfilter_fini(void); void ipv6_netfilter_fini(void);
......
...@@ -931,6 +931,7 @@ enum nft_exthdr_attributes { ...@@ -931,6 +931,7 @@ enum nft_exthdr_attributes {
* @NFT_META_TIME_HOUR: hour of day (in seconds) * @NFT_META_TIME_HOUR: hour of day (in seconds)
* @NFT_META_SDIF: slave device interface index * @NFT_META_SDIF: slave device interface index
* @NFT_META_SDIFNAME: slave device interface name * @NFT_META_SDIFNAME: slave device interface name
* @NFT_META_BRI_BROUTE: packet br_netfilter_broute bit
*/ */
enum nft_meta_keys { enum nft_meta_keys {
NFT_META_LEN, NFT_META_LEN,
...@@ -969,6 +970,7 @@ enum nft_meta_keys { ...@@ -969,6 +970,7 @@ enum nft_meta_keys {
NFT_META_TIME_HOUR, NFT_META_TIME_HOUR,
NFT_META_SDIF, NFT_META_SDIF,
NFT_META_SDIFNAME, NFT_META_SDIFNAME,
NFT_META_BRI_BROUTE,
__NFT_META_IIFTYPE, __NFT_META_IIFTYPE,
}; };
......
...@@ -40,62 +40,6 @@ ...@@ -40,62 +40,6 @@
#include <linux/sysctl.h> #include <linux/sysctl.h>
#endif #endif
/* We only check the length. A bridge shouldn't do any hop-by-hop stuff
* anyway
*
* Walks the options of the header that directly follows the fixed IPv6
* header. Returns 0 when the options exactly fill the announced header
* length, -1 otherwise. When a jumbo payload option is accepted, the skb
* is trimmed to the jumbo length plus sizeof(struct ipv6hdr).
*
* NOTE(review): the option-length byte nh[off + 1] is read before the
* option type is inspected, and only the default case compares optlen
* against the remaining len - confirm this cannot read past the header.
*/
static int br_nf_check_hbh_len(struct sk_buff *skb)
{
/* raw points at the first byte after the fixed IPv6 header. */
unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
u32 pkt_len;
const unsigned char *nh = skb_network_header(skb);
/* off is the offset of that byte relative to the network header. */
int off = raw - nh;
/* Announced header size in bytes: (second byte + 1) * 8. */
int len = (raw[1] + 1) << 3;
/* The whole header must end within skb_headlen(skb) bytes of skb->data. */
if ((raw + len) - skb->data > skb_headlen(skb))
goto bad;
/* Step over the two leading bytes; the options start right after them. */
off += 2;
len -= 2;
while (len > 0) {
/* Default option size: type byte + length byte + data bytes. */
int optlen = nh[off + 1] + 2;
switch (nh[off]) {
case IPV6_TLV_PAD1:
/* Pad1 is a single byte and has no length field. */
optlen = 1;
break;
case IPV6_TLV_PADN:
break;
case IPV6_TLV_JUMBO:
/* Data length must be 4 and the option must sit at offset 4n + 2. */
if (nh[off + 1] != 4 || (off & 3) != 2)
goto bad;
pkt_len = ntohl(*(__be32 *)(nh + off + 2));
/* Jumbo length must exceed IPV6_MAXPLEN and payload_len must be zero. */
if (pkt_len <= IPV6_MAXPLEN ||
ipv6_hdr(skb)->payload_len)
goto bad;
/* The announced payload must not be larger than what the skb holds. */
if (pkt_len > skb->len - sizeof(struct ipv6hdr))
goto bad;
if (pskb_trim_rcsum(skb,
pkt_len + sizeof(struct ipv6hdr)))
goto bad;
/* Re-read the network header pointer after trimming the skb. */
nh = skb_network_header(skb);
break;
default:
/* Any other option only has to fit in the remaining header bytes. */
if (optlen > len)
goto bad;
break;
}
off += optlen;
len -= optlen;
}
/* Success only when the options consumed the header exactly. */
if (len == 0)
return 0;
bad:
return -1;
}
int br_validate_ipv6(struct net *net, struct sk_buff *skb) int br_validate_ipv6(struct net *net, struct sk_buff *skb)
{ {
const struct ipv6hdr *hdr; const struct ipv6hdr *hdr;
...@@ -115,8 +59,9 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb) ...@@ -115,8 +59,9 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb)
goto inhdr_error; goto inhdr_error;
pkt_len = ntohs(hdr->payload_len); pkt_len = ntohs(hdr->payload_len);
if (hdr->nexthdr == NEXTHDR_HOP && nf_ip6_check_hbh_len(skb, &pkt_len))
goto drop;
if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
if (pkt_len + ip6h_len > skb->len) { if (pkt_len + ip6h_len > skb->len) {
__IP6_INC_STATS(net, idev, __IP6_INC_STATS(net, idev,
IPSTATS_MIB_INTRUNCATEDPKTS); IPSTATS_MIB_INTRUNCATEDPKTS);
...@@ -127,10 +72,6 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb) ...@@ -127,10 +72,6 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb)
IPSTATS_MIB_INDISCARDS); IPSTATS_MIB_INDISCARDS);
goto drop; goto drop;
} }
hdr = ipv6_hdr(skb);
}
if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb))
goto drop;
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
/* No IP options in IPv6 header; however it should be /* No IP options in IPv6 header; however it should be
......
...@@ -8,6 +8,9 @@ ...@@ -8,6 +8,9 @@
#include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_meta.h> #include <net/netfilter/nft_meta.h>
#include <linux/if_bridge.h> #include <linux/if_bridge.h>
#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
#include "../br_private.h"
static const struct net_device * static const struct net_device *
nft_meta_get_bridge(const struct net_device *dev) nft_meta_get_bridge(const struct net_device *dev)
...@@ -102,6 +105,50 @@ static const struct nft_expr_ops nft_meta_bridge_get_ops = { ...@@ -102,6 +105,50 @@ static const struct nft_expr_ops nft_meta_bridge_get_ops = {
.reduce = nft_meta_get_reduce, .reduce = nft_meta_get_reduce,
}; };
/*
 * Evaluate a bridge "meta set" expression.
 *
 * For NFT_META_BRI_BROUTE the low byte of the source register is reduced
 * to 0/1 and stored in the skb's br_netfilter_broute control-block bit.
 * Every other key is passed on to nft_meta_set_eval().
 */
static void nft_meta_bridge_set_eval(const struct nft_expr *expr,
				     struct nft_regs *regs,
				     const struct nft_pktinfo *pkt)
{
	const struct nft_meta *priv = nft_expr_priv(expr);

	if (priv->key != NFT_META_BRI_BROUTE) {
		/* Not a bridge-specific key: use the generic handler. */
		nft_meta_set_eval(expr, regs, pkt);
		return;
	}

	BR_INPUT_SKB_CB(pkt->skb)->br_netfilter_broute =
		!!nft_reg_load8(&regs->data[priv->sreg]);
}
/*
 * Initialise a bridge "meta set" expression from its netlink attributes.
 *
 * The key is taken from NFTA_META_KEY. NFT_META_BRI_BROUTE is handled
 * here with a one-byte source register; all other keys are delegated to
 * nft_meta_set_init().
 *
 * Returns 0 on success or the negative value reported by
 * nft_parse_register_load().
 */
static int nft_meta_bridge_set_init(const struct nft_ctx *ctx,
				    const struct nft_expr *expr,
				    const struct nlattr * const tb[])
{
	struct nft_meta *meta = nft_expr_priv(expr);
	const unsigned int reg_len = sizeof(u8);
	int ret;

	meta->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
	if (meta->key != NFT_META_BRI_BROUTE)
		return nft_meta_set_init(ctx, expr, tb);

	meta->len = reg_len;
	ret = nft_parse_register_load(tb[NFTA_META_SREG], &meta->sreg,
				      reg_len);

	/* Only negative results are propagated to the caller. */
	return ret < 0 ? ret : 0;
}
static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track, static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track,
const struct nft_expr *expr) const struct nft_expr *expr)
{ {
...@@ -120,15 +167,33 @@ static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track, ...@@ -120,15 +167,33 @@ static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track,
return false; return false;
} }
static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nft_data **data)
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
switch (priv->key) {
case NFT_META_BRI_BROUTE:
hooks = 1 << NF_BR_PRE_ROUTING;
break;
default:
return nft_meta_set_validate(ctx, expr, data);
}
return nft_chain_validate_hooks(ctx->chain, hooks);
}
static const struct nft_expr_ops nft_meta_bridge_set_ops = { static const struct nft_expr_ops nft_meta_bridge_set_ops = {
.type = &nft_meta_bridge_type, .type = &nft_meta_bridge_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
.eval = nft_meta_set_eval, .eval = nft_meta_bridge_set_eval,
.init = nft_meta_set_init, .init = nft_meta_bridge_set_init,
.destroy = nft_meta_set_destroy, .destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump, .dump = nft_meta_set_dump,
.reduce = nft_meta_bridge_set_reduce, .reduce = nft_meta_bridge_set_reduce,
.validate = nft_meta_set_validate, .validate = nft_meta_bridge_set_validate,
}; };
static const struct nft_expr_ops * static const struct nft_expr_ops *
......
...@@ -1294,7 +1294,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) ...@@ -1294,7 +1294,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
} }
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
/* Returns true if a connection correspondings to the tuple (required /* Returns true if a connection corresponds to the tuple (required
for NAT). */ for NAT). */
int int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/ipv6_frag.h> #include <net/ipv6_frag.h>
#include <net/ip.h> #include <net/ip.h>
#include <linux/netfilter_ipv6.h>
/* 'skb' should already be pulled to nh_ofs. */ /* 'skb' should already be pulled to nh_ofs. */
int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
...@@ -120,8 +121,14 @@ int nf_ct_skb_network_trim(struct sk_buff *skb, int family) ...@@ -120,8 +121,14 @@ int nf_ct_skb_network_trim(struct sk_buff *skb, int family)
len = skb_ip_totlen(skb); len = skb_ip_totlen(skb);
break; break;
case NFPROTO_IPV6: case NFPROTO_IPV6:
len = sizeof(struct ipv6hdr) len = ntohs(ipv6_hdr(skb)->payload_len);
+ ntohs(ipv6_hdr(skb)->payload_len); if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) {
int err = nf_ip6_check_hbh_len(skb, &len);
if (err)
return err;
}
len += sizeof(struct ipv6hdr);
break; break;
default: default:
len = skb->len; len = skb->len;
......
...@@ -215,3 +215,55 @@ int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) ...@@ -215,3 +215,55 @@ int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
} }
return ret; return ret;
} }
/*
 * Validate the lengths inside the IPv6 hop-by-hop header of @skb without
 * acting on any option.
 *
 * @skb:  packet whose network header is an IPv6 header followed by the
 *        hop-by-hop header
 * @plen: overwritten with the jumbo payload length when a valid jumbo
 *        option is found; left untouched otherwise
 *
 * Returns 0 on success, -ENOMEM when the header cannot be pulled, and
 * -EBADMSG when an option or the jumbo length is malformed.
 */
int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen)
{
	int remaining, pos = sizeof(struct ipv6hdr);
	unsigned char *base;

	/* Make the first 8 bytes after the IPv6 header accessible. */
	if (!pskb_may_pull(skb, pos + 8))
		return -ENOMEM;
	base = (unsigned char *)(ipv6_hdr(skb) + 1);
	/* Announced header size in bytes: (second byte + 1) * 8. */
	remaining = (base[1] + 1) << 3;

	/* Now make the complete header accessible. */
	if (!pskb_may_pull(skb, pos + remaining))
		return -ENOMEM;
	/* Fetch the network header pointer again after the pull. */
	base = skb_network_header(skb);

	/* Step over the two leading bytes; options start after them. */
	pos += 2;
	remaining -= 2;

	while (remaining > 0) {
		unsigned char *opt = base + pos;
		int optlen;

		/* Pad1 is a lone byte without a length field. */
		if (opt[0] == IPV6_TLV_PAD1) {
			pos++;
			remaining--;
			continue;
		}

		/* Every other option needs a type and a length byte. */
		if (remaining < 2)
			return -EBADMSG;
		optlen = opt[1] + 2;
		if (optlen > remaining)
			return -EBADMSG;

		if (opt[0] == IPV6_TLV_JUMBO) {
			u32 jumbo_len;

			/* Data length must be 4, placed at offset 4n + 2. */
			if (opt[1] != 4 || (pos & 3) != 2)
				return -EBADMSG;
			jumbo_len = ntohl(*(__be32 *)(opt + 2));
			/* Must exceed IPV6_MAXPLEN with a zero payload_len. */
			if (jumbo_len <= IPV6_MAXPLEN ||
			    ipv6_hdr(skb)->payload_len)
				return -EBADMSG;
			/* Must not claim more data than the skb holds. */
			if (jumbo_len > skb->len - sizeof(struct ipv6hdr))
				return -EBADMSG;
			*plen = jumbo_len;
		}

		pos += optlen;
		remaining -= optlen;
	}

	if (remaining)
		return -EBADMSG;
	return 0;
}
EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len);
...@@ -48,6 +48,7 @@ TEST_PROGS += l2_tos_ttl_inherit.sh ...@@ -48,6 +48,7 @@ TEST_PROGS += l2_tos_ttl_inherit.sh
TEST_PROGS += bind_bhash.sh TEST_PROGS += bind_bhash.sh
TEST_PROGS += ip_local_port_range.sh TEST_PROGS += ip_local_port_range.sh
TEST_PROGS += rps_default_mask.sh TEST_PROGS += rps_default_mask.sh
TEST_PROGS += big_tcp.sh
TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh
TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh
TEST_GEN_FILES = socket nettest TEST_GEN_FILES = socket nettest
......
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Testing For IPv4 and IPv6 BIG TCP.
# TOPO: CLIENT_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) SERVER_NS
# Namespace names: "mktemp -u" only prints a unique name, it creates nothing.
# The namespaces themselves are created in setup().
CLIENT_NS=$(mktemp -u client-XXXXXXXX)
# Addresses assigned to link0 in the client namespace.
CLIENT_IP4="198.51.100.1"
CLIENT_IP6="2001:db8:1::1"
SERVER_NS=$(mktemp -u server-XXXXXXXX)
# Addresses assigned to link3 in the server namespace.
SERVER_IP4="203.0.113.1"
SERVER_IP6="2001:db8:2::1"
ROUTER_NS=$(mktemp -u router-XXXXXXXX)
# Router-side addresses: *_GW4/6 are put on link1 (client side) and
# link2 (server side) and used as next hops by client and server.
SERVER_GW4="203.0.113.2"
CLIENT_GW4="198.51.100.2"
SERVER_GW6="2001:db8:2::2"
CLIENT_GW6="2001:db8:1::2"
# Value used for every gso/gro max size setting in setup().
MAX_SIZE=128000
# The counter rules match packets whose length is outside 0:$CHK_SIZE.
CHK_SIZE=65535
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
# Build the CLIENT_NS <-> ROUTER_NS <-> SERVER_NS topology, raise the
# GSO/GRO size limits on every link to $MAX_SIZE, attach "action ct" tc
# filters on the router's link1 and start netserver in the server namespace.
setup() {
# Create the three namespaces.
ip netns add $CLIENT_NS
ip netns add $SERVER_NS
ip netns add $ROUTER_NS
# veth pairs: link1 (router) <-> link0 (client), link2 (router) <-> link3 (server).
ip -net $ROUTER_NS link add link1 type veth peer name link0 netns $CLIENT_NS
ip -net $ROUTER_NS link add link2 type veth peer name link3 netns $SERVER_NS
# Client side: bring link0 up, set its MTU and addresses, route to the server via the router.
ip -net $CLIENT_NS link set link0 up
ip -net $CLIENT_NS link set link0 mtu 1442
ip -net $CLIENT_NS addr add $CLIENT_IP4/24 dev link0
ip -net $CLIENT_NS addr add $CLIENT_IP6/64 dev link0 nodad
ip -net $CLIENT_NS route add $SERVER_IP4 dev link0 via $CLIENT_GW4
ip -net $CLIENT_NS route add $SERVER_IP6 dev link0 via $CLIENT_GW6
# Raise both the IPv4-specific and the generic GSO/GRO limits on link0.
ip -net $CLIENT_NS link set dev link0 \
gro_ipv4_max_size $MAX_SIZE gso_ipv4_max_size $MAX_SIZE
ip -net $CLIENT_NS link set dev link0 \
gro_max_size $MAX_SIZE gso_max_size $MAX_SIZE
ip net exec $CLIENT_NS sysctl -wq net.ipv4.tcp_window_scaling=10
# Router side: bring both links up, assign the gateway addresses.
ip -net $ROUTER_NS link set link1 up
ip -net $ROUTER_NS link set link2 up
ip -net $ROUTER_NS addr add $CLIENT_GW4/24 dev link1
ip -net $ROUTER_NS addr add $CLIENT_GW6/64 dev link1 nodad
ip -net $ROUTER_NS addr add $SERVER_GW4/24 dev link2
ip -net $ROUTER_NS addr add $SERVER_GW6/64 dev link2 nodad
# Same GSO/GRO limits on both router links.
ip -net $ROUTER_NS link set dev link1 \
gro_ipv4_max_size $MAX_SIZE gso_ipv4_max_size $MAX_SIZE
ip -net $ROUTER_NS link set dev link2 \
gro_ipv4_max_size $MAX_SIZE gso_ipv4_max_size $MAX_SIZE
ip -net $ROUTER_NS link set dev link1 \
gro_max_size $MAX_SIZE gso_max_size $MAX_SIZE
ip -net $ROUTER_NS link set dev link2 \
gro_max_size $MAX_SIZE gso_max_size $MAX_SIZE
# test for nf_ct_skb_network_trim in nf_conntrack_ovs used by TC ct action.
ip net exec $ROUTER_NS tc qdisc add dev link1 ingress
ip net exec $ROUTER_NS tc filter add dev link1 ingress \
proto ip flower ip_proto tcp action ct
ip net exec $ROUTER_NS tc filter add dev link1 ingress \
proto ipv6 flower ip_proto tcp action ct
# Let the router forward IPv4 and IPv6 traffic between the two links.
ip net exec $ROUTER_NS sysctl -wq net.ipv4.ip_forward=1
ip net exec $ROUTER_NS sysctl -wq net.ipv6.conf.all.forwarding=1
# Server side: bring link3 up, set addresses, route back to the client via the router.
ip -net $SERVER_NS link set link3 up
ip -net $SERVER_NS addr add $SERVER_IP4/24 dev link3
ip -net $SERVER_NS addr add $SERVER_IP6/64 dev link3 nodad
ip -net $SERVER_NS route add $CLIENT_IP4 dev link3 via $SERVER_GW4
ip -net $SERVER_NS route add $CLIENT_IP6 dev link3 via $SERVER_GW6
ip -net $SERVER_NS link set dev link3 \
gro_ipv4_max_size $MAX_SIZE gso_ipv4_max_size $MAX_SIZE
ip -net $SERVER_NS link set dev link3 \
gro_max_size $MAX_SIZE gso_max_size $MAX_SIZE
ip net exec $SERVER_NS sysctl -wq net.ipv4.tcp_window_scaling=10
# Start the netperf server; setup()'s exit status is that of this command.
# NOTE(review): "2>&1 >/dev/null" points stderr at the current stdout and
# then discards only stdout - confirm that keeping stderr visible is intended.
ip net exec $SERVER_NS netserver 2>&1 >/dev/null
}
# Tear the test bed down: stop netserver, delete the router-side veth
# links, then delete the three namespaces.
cleanup() {
	local dev
	local ns

	ip net exec $SERVER_NS pkill netserver

	for dev in link1 link2; do
		ip -net $ROUTER_NS link del $dev
	done

	for ns in "$CLIENT_NS" "$SERVER_NS" "$ROUTER_NS"; do
		ip netns del "$ns"
	done
}
# Append a raw/PREROUTING rule on interface $1 in namespace $2 that
# matches packets whose length is outside 0:$CHK_SIZE.
# Uses ip6tables when NF is "6", iptables otherwise.
start_counter() {
	local iface=$1
	local netns=$2
	local ipt

	case "$NF" in
	6) ipt="ip6tables" ;;
	*) ipt="iptables" ;;
	esac

	ip net exec $netns $ipt -t raw -A PREROUTING -i $iface \
		-m length ! --length 0:$CHK_SIZE -j ACCEPT
}
# Succeed when the first column of the raw-table listing line for
# interface $1 in namespace $2 is not "0".
# Uses ip6tables when NF is "6", iptables otherwise.
check_counter() {
	local iface=$1
	local netns=$2
	local ipt

	case "$NF" in
	6) ipt="ip6tables" ;;
	*) ipt="iptables" ;;
	esac

	# The substitution is deliberately left unquoted: an empty or
	# multi-word result makes "test" fail instead of comparing strings.
	test $(ip net exec $netns $ipt -t raw -L -v |grep $iface | awk '{print $1}') != "0"
}
# Delete the raw/PREROUTING length rule for interface $1 in namespace $2,
# using the same rule specification that start_counter appended.
# Uses ip6tables when NF is "6", iptables otherwise.
stop_counter() {
	local iface=$1
	local netns=$2
	local ipt

	case "$NF" in
	6) ipt="ip6tables" ;;
	*) ipt="iptables" ;;
	esac

	ip net exec $netns $ipt -t raw -D PREROUTING -i $iface \
		-m length ! --length 0:$CHK_SIZE -j ACCEPT
}
# Run a netperf TCP_STREAM test from namespace $1 towards the server,
# picking the IPv6 server address when NF is "6" and the IPv4 one otherwise.
do_netperf() {
	local netns=$1
	local serip

	case "$NF" in
	6) serip=$SERVER_IP6 ;;
	*) serip=$SERVER_IP4 ;;
	esac

	ip net exec $netns netperf -$NF -t TCP_STREAM -H $serip 2>&1 >/dev/null
}
# Run one offload combination and report the result.
#   $1 - client link0 tso (on/off)     $2 - router link1 gro (on/off)
#   $3 - router link2 tso (on/off)     $4 - server link3 gro (on/off)
# Prints one result line and succeeds only when both counters (router
# link1, then server link3) are non-zero after the netperf run.
do_test() {
	local cli_tso=$1 gw_gro=$2 gw_tso=$3 ser_gro=$4
	local ret="PASS"

	ip net exec $CLIENT_NS ethtool -K link0 tso $cli_tso
	ip net exec $ROUTER_NS ethtool -K link1 gro $gw_gro
	ip net exec $ROUTER_NS ethtool -K link2 tso $gw_tso
	ip net exec $SERVER_NS ethtool -K link3 gro $ser_gro

	start_counter link1 $ROUTER_NS
	start_counter link3 $SERVER_NS
	do_netperf $CLIENT_NS

	# link3 is only inspected when link1 already saw matching packets.
	if ! check_counter link1 $ROUTER_NS; then
		ret="FAIL_on_link1"
	elif ! check_counter link3 $SERVER_NS; then
		ret="FAIL_on_link3"
	fi

	stop_counter link1 $ROUTER_NS
	stop_counter link3 $SERVER_NS

	printf "%-9s %-8s %-8s %-8s: [%s]\n" \
		$cli_tso $gw_gro $gw_tso $ser_gro $ret
	[ $ret = "PASS" ]
}
# Print the table header and run every offload combination in order,
# stopping at the first failure and returning that failure's status.
testup() {
	local combo

	echo "CLI GSO | GW GRO | GW GSO | SER GRO" || return

	# Each entry is "cli_tso gw_gro gw_tso ser_gro"; it is word-split
	# on purpose when handed to do_test.
	for combo in \
		"on on on on" \
		"on off on off" \
		"off on on on" \
		"on on off on" \
		"off on off on"
	do
		do_test $combo || return
	done
}
# Skip when the netperf tool is not available.
netperf -V &> /dev/null || {
	echo "SKIP: Could not run test without netperf tool"
	exit $ksft_skip
}

# Skip when ip-link does not know the gso/gro_ipv4_max_size options.
ip link help 2>&1 | grep gso_ipv4_max_size &> /dev/null || {
	echo "SKIP: Could not run test without gso/gro_ipv4_max_size supported in ip-link"
	exit $ksft_skip
}

# Always tear the topology down, whatever the outcome.
trap cleanup EXIT

# Run the IPv4 pass, then the IPv6 pass; the first failing step ends the
# chain and its status becomes the script's exit status.
setup && echo "Testing for BIG TCP:" && \
	NF=4 testup && echo "***v4 Tests Done***" && \
	NF=6 testup && echo "***v6 Tests Done***"
rc=$?
exit $rc
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment