Commit 7fe7f318 authored by Jakub Kicinski

Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter/IPVS fixes for net

1) ipset limits the max allocatable memory via kvmalloc() to INT_MAX,
   from Jozsef Kadlecsik.

2) Check ip_vs_conn_tab_bits value to be in the range specified
   in Kconfig, from Andrea Claudi.

3) Initialize fragment offset in ip6tables, from Jeremy Sowden.

4) Make conntrack hash chain length random, from Florian Westphal.

5) Add zone ID to conntrack and NAT hashtuple again, also from Florian.

6) Add selftests for bidirectional zone support and colliding tuples,
   from Florian Westphal.

7) Unlink table before synchronize_rcu when cleaning tables with
   owner, from Florian.

8) Fix oversized kvmalloc() calls in nf_tables, limiting set
   allocations to INT_MAX.

9) Release conntrack entries via workqueue in masquerade, from Florian.

10) Fix bogus net_init in iptables raw table definition, also from Florian.

11) Work around missing softdep in log extensions, from Florian Westphal.

12) Serialize hash resizes and cleanups with mutex, from Eric Dumazet.

* git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf:
  netfilter: conntrack: serialize hash resizes and cleanups
  netfilter: log: work around missing softdep backend module
  netfilter: iptable_raw: drop bogus net_init annotation
  netfilter: nf_nat_masquerade: defer conntrack walk to work queue
  netfilter: nf_nat_masquerade: make async masq_inet6_event handling generic
  netfilter: nf_tables: Fix oversized kvmalloc() calls
  netfilter: nf_tables: unlink table before deleting it
  selftests: netfilter: add zone stress test with colliding tuples
  selftests: netfilter: add selftest for directional zone support
  netfilter: nat: include zone id in nat table hash again
  netfilter: conntrack: include zone id in tuple hash again
  netfilter: conntrack: make max chain length random
  netfilter: ip6_tables: zero-initialize fragment offset
  ipvs: check that ip_vs_conn_tab_bits is between 8 and 20
  netfilter: ipset: Fix oversized kvmalloc() calls
====================

Link: https://lore.kernel.org/r/20210924221113.348767-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 4526fe74 e9edc188
net/ipv4/netfilter/iptable_raw.c
@@ -42,7 +42,7 @@ iptable_raw_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
 
-static int __net_init iptable_raw_table_init(struct net *net)
+static int iptable_raw_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
 	const struct xt_table *table = &packet_raw;
net/ipv6/netfilter/ip6_tables.c
@@ -273,6 +273,7 @@ ip6t_do_table(struct sk_buff *skb,
 	 * things we don't know, ie. tcp syn flag or ports).  If the
 	 * rule is also a fragment-specific rule, non-fragments won't
 	 * match it. */
+	acpar.fragoff = 0;
 	acpar.hotdrop = false;
 	acpar.state   = state;
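The one-line fix above closes a classic uninitialized-stack-field hole: acpar lives on the stack, and fragment-specific rules key off acpar.fragoff. A minimal userspace sketch of the bug class (hypothetical struct and names, not the kernel code):

#include <stdio.h>
#include <string.h>

/* Stand-in for the per-packet parameter block (hypothetical). */
struct action_param {
	unsigned int fragoff;	/* non-zero means "non-first fragment" */
	int hotdrop;
};

/* A fragment-specific rule matches only when fragoff != 0. */
static int frag_rule_matches(const struct action_param *p)
{
	return p->fragoff != 0;
}

int main(void)
{
	struct action_param acpar;

	/* Without this clear (the analogue of "acpar.fragoff = 0;"),
	 * fragoff holds stack garbage and a non-fragment packet may
	 * spuriously match a fragment-specific rule. */
	memset(&acpar, 0, sizeof(acpar));
	acpar.hotdrop = 0;

	printf("fragment rule matches: %d\n", frag_rule_matches(&acpar));
	return 0;
}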
net/netfilter/ipset/ip_set_hash_gen.h
@@ -130,11 +130,11 @@ htable_size(u8 hbits)
 {
 	size_t hsize;
 
-	/* We must fit both into u32 in jhash and size_t */
+	/* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */
 	if (hbits > 31)
 		return 0;
 	hsize = jhash_size(hbits);
-	if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *)
+	if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *)
 	    < hsize)
 		return 0;
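To see what the new bound buys, here is a userspace re-creation of the check (simplified stand-in structs, 64-bit pointers assumed); 28 hash bits already push the bucket array past INT_MAX:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the ipset structures. */
struct hbucket { void *data; };
struct htable { unsigned int htable_bits; struct hbucket *bucket[]; };

#define jhash_size(n) ((uint32_t)1 << (n))

/* Mirrors the patched check: the bucket array plus header must fit
 * in INT_MAX, the ceiling enforced by kvmalloc_node(). */
static size_t htable_size(uint8_t hbits)
{
	size_t hsize;

	if (hbits > 31)		/* must fit a u32 for jhash */
		return 0;
	hsize = jhash_size(hbits);
	if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize)
		return 0;

	return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
}

int main(void)
{
	printf("20 bits -> %zu bytes\n", htable_size(20));
	printf("28 bits -> %zu bytes (0 = too big)\n", htable_size(28));
	return 0;
}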
net/netfilter/ipvs/ip_vs_conn.c
@@ -1468,6 +1468,10 @@ int __init ip_vs_conn_init(void)
 	int idx;
 
 	/* Compute size and mask */
+	if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
+		pr_info("conn_tab_bits not in [8, 20]. Using default value\n");
+		ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+	}
 	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
 	ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
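The effect of the clamp is easy to demonstrate; a toy sketch (12 is the usual CONFIG_IP_VS_TAB_BITS default) shows an out-of-range module parameter falling back rather than feeding a bogus shift count:

#include <stdio.h>

#define CONFIG_IP_VS_TAB_BITS 12	/* the usual Kconfig default */

int main(void)
{
	int ip_vs_conn_tab_bits = 24;	/* pretend module parameter */

	/* Mirrors the new range check: out-of-range values fall back
	 * to the Kconfig default instead of being used as-is. */
	if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20)
		ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;

	printf("conn_tab_size = %d buckets\n", 1 << ip_vs_conn_tab_bits);
	return 0;
}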
net/netfilter/nf_conntrack_core.c (large diff collapsed in this view)
net/netfilter/nf_nat_core.c
@@ -150,13 +150,16 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
 
 /* We keep an extra hash for each conntrack, for fast searching. */
 static unsigned int
-hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *net,
+	    const struct nf_conntrack_zone *zone,
+	    const struct nf_conntrack_tuple *tuple)
 {
 	unsigned int hash;
 	struct {
 		struct nf_conntrack_man src;
 		u32 net_mix;
 		u32 protonum;
+		u32 zone;
 	} __aligned(SIPHASH_ALIGNMENT) combined;
 
 	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
@@ -165,9 +168,13 @@ hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
 
 	/* Original src, to ensure we map it consistently if poss. */
 	combined.src = tuple->src;
-	combined.net_mix = net_hash_mix(n);
+	combined.net_mix = net_hash_mix(net);
 	combined.protonum = tuple->dst.protonum;
 
+	/* Zone ID can be used provided its valid for both directions */
+	if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
+		combined.zone = zone->id;
+
 	hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);
 
 	return reciprocal_scale(hash, nf_nat_htable_size);
@@ -272,7 +279,7 @@ find_appropriate_src(struct net *net,
 		     struct nf_conntrack_tuple *result,
 		     const struct nf_nat_range2 *range)
 {
-	unsigned int h = hash_by_src(net, tuple);
+	unsigned int h = hash_by_src(net, zone, tuple);
 	const struct nf_conn *ct;
 
 	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
@@ -619,7 +626,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 		unsigned int srchash;
 		spinlock_t *lock;
 
-		srchash = hash_by_src(net,
+		srchash = hash_by_src(net, nf_ct_zone(ct),
 				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
 		spin_lock_bh(lock);
@@ -788,7 +795,7 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
 {
 	unsigned int h;
 
-	h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 	spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
 	hlist_del_rcu(&ct->nat_bysource);
 	spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
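The underlying rule: hash every field that defines tuple identity, zone included, or else colliding tuples from different zones share one chain. A userspace sketch with FNV-1a standing in for the kernel's keyed siphash (all types and names here are illustrative stand-ins):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NF_CT_DEFAULT_ZONE_DIR 3	/* zone valid in both directions */

struct zone { uint16_t id; uint8_t dir; };

struct hash_key {	/* fields that define NAT source identity */
	uint32_t src_ip;
	uint16_t src_port;
	uint8_t protonum;
	uint32_t zone;
};

static uint32_t fnv1a(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t h = 2166136261u;

	while (len--)
		h = (h ^ *p++) * 16777619u;
	return h;
}

static uint32_t hash_by_src(const struct zone *zone, uint32_t ip,
			    uint16_t port, uint8_t proto)
{
	struct hash_key k;

	memset(&k, 0, sizeof(k));	/* no uninitialized padding */
	k.src_ip = ip;
	k.src_port = port;
	k.protonum = proto;
	/* The zone ID only participates when it covers both directions. */
	if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
		k.zone = zone->id;

	return fnv1a(&k, sizeof(k));
}

int main(void)
{
	struct zone z1 = { 1, NF_CT_DEFAULT_ZONE_DIR };
	struct zone z2 = { 2, NF_CT_DEFAULT_ZONE_DIR };

	/* Same tuple, different zones: now different hash values. */
	printf("%u\n%u\n", hash_by_src(&z1, 0x0a010003, 10000, 6),
			   hash_by_src(&z2, 0x0a010003, 10000, 6));
	return 0;
}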
net/netfilter/nf_nat_masquerade.c
@@ -9,8 +9,19 @@
 
 #include <net/netfilter/nf_nat_masquerade.h>
 
+struct masq_dev_work {
+	struct work_struct work;
+	struct net *net;
+	union nf_inet_addr addr;
+	int ifindex;
+	int (*iter)(struct nf_conn *i, void *data);
+};
+
+#define MAX_MASQ_WORKER_COUNT	16
+
 static DEFINE_MUTEX(masq_mutex);
 static unsigned int masq_refcnt __read_mostly;
+static atomic_t masq_worker_count __read_mostly;
 
 unsigned int
 nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
@@ -63,13 +74,71 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
 
-static int device_cmp(struct nf_conn *i, void *ifindex)
+static void iterate_cleanup_work(struct work_struct *work)
+{
+	struct masq_dev_work *w;
+
+	w = container_of(work, struct masq_dev_work, work);
+
+	nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0);
+
+	put_net(w->net);
+	kfree(w);
+	atomic_dec(&masq_worker_count);
+	module_put(THIS_MODULE);
+}
+
+/* Iterate conntrack table in the background and remove conntrack entries
+ * that use the device/address being removed.
+ *
+ * In case too many work items have been queued already or memory allocation
+ * fails iteration is skipped, conntrack entries will time out eventually.
+ */
+static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr,
+				 int ifindex,
+				 int (*iter)(struct nf_conn *i, void *data),
+				 gfp_t gfp_flags)
+{
+	struct masq_dev_work *w;
+
+	if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT)
+		return;
+
+	net = maybe_get_net(net);
+	if (!net)
+		return;
+
+	if (!try_module_get(THIS_MODULE))
+		goto err_module;
+
+	w = kzalloc(sizeof(*w), gfp_flags);
+	if (w) {
+		/* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */
+		atomic_inc(&masq_worker_count);
+
+		INIT_WORK(&w->work, iterate_cleanup_work);
+		w->ifindex = ifindex;
+		w->net = net;
+		w->iter = iter;
+		if (addr)
+			w->addr = *addr;
+		schedule_work(&w->work);
+		return;
+	}
+
+	module_put(THIS_MODULE);
+err_module:
+	put_net(net);
+}
+
+static int device_cmp(struct nf_conn *i, void *arg)
 {
 	const struct nf_conn_nat *nat = nfct_nat(i);
+	const struct masq_dev_work *w = arg;
 
 	if (!nat)
 		return 0;
-	return nat->masq_index == (int)(long)ifindex;
+	return nat->masq_index == w->ifindex;
 }
 
 static int masq_device_event(struct notifier_block *this,
@@ -85,8 +154,8 @@ static int masq_device_event(struct notifier_block *this,
 		 * and forget them.
 		 */
 
-		nf_ct_iterate_cleanup_net(net, device_cmp,
-					  (void *)(long)dev->ifindex, 0, 0);
+		nf_nat_masq_schedule(net, NULL, dev->ifindex,
+				     device_cmp, GFP_KERNEL);
 	}
 
 	return NOTIFY_DONE;
@@ -94,35 +163,45 @@ static int masq_device_event(struct notifier_block *this,
 
 static int inet_cmp(struct nf_conn *ct, void *ptr)
 {
-	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
-	struct net_device *dev = ifa->ifa_dev->dev;
 	struct nf_conntrack_tuple *tuple;
+	struct masq_dev_work *w = ptr;
 
-	if (!device_cmp(ct, (void *)(long)dev->ifindex))
+	if (!device_cmp(ct, ptr))
 		return 0;
 
 	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
 
-	return ifa->ifa_address == tuple->dst.u3.ip;
+	return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3);
}
 
 static int masq_inet_event(struct notifier_block *this,
			   unsigned long event,
			   void *ptr)
 {
-	struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
-	struct net *net = dev_net(idev->dev);
+	const struct in_ifaddr *ifa = ptr;
+	const struct in_device *idev;
+	const struct net_device *dev;
+	union nf_inet_addr addr;
+
+	if (event != NETDEV_DOWN)
+		return NOTIFY_DONE;
 
 	/* The masq_dev_notifier will catch the case of the device going
 	 * down.  So if the inetdev is dead and being destroyed we have
 	 * no work to do.  Otherwise this is an individual address removal
 	 * and we have to perform the flush.
 	 */
+	idev = ifa->ifa_dev;
 	if (idev->dead)
 		return NOTIFY_DONE;
 
-	if (event == NETDEV_DOWN)
-		nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0);
+	memset(&addr, 0, sizeof(addr));
+
+	addr.ip = ifa->ifa_address;
+
+	dev = idev->dev;
+	nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex,
+			     inet_cmp, GFP_KERNEL);
 
 	return NOTIFY_DONE;
 }
@@ -136,8 +215,6 @@ static struct notifier_block masq_inet_notifier = {
 };
 
 #if IS_ENABLED(CONFIG_IPV6)
-static atomic_t v6_worker_count __read_mostly;
-
 static int
 nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
 		       const struct in6_addr *daddr, unsigned int srcprefs,
@@ -187,40 +264,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);
 
-struct masq_dev_work {
-	struct work_struct work;
-	struct net *net;
-	struct in6_addr addr;
-	int ifindex;
-};
-
-static int inet6_cmp(struct nf_conn *ct, void *work)
-{
-	struct masq_dev_work *w = (struct masq_dev_work *)work;
-	struct nf_conntrack_tuple *tuple;
-
-	if (!device_cmp(ct, (void *)(long)w->ifindex))
-		return 0;
-
-	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-
-	return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6);
-}
-
-static void iterate_cleanup_work(struct work_struct *work)
-{
-	struct masq_dev_work *w;
-
-	w = container_of(work, struct masq_dev_work, work);
-
-	nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0);
-
-	put_net(w->net);
-	kfree(w);
-	atomic_dec(&v6_worker_count);
-	module_put(THIS_MODULE);
-}
-
 /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep).
  *
  * Defer it to the system workqueue.
@@ -233,36 +276,19 @@ static int masq_inet6_event(struct notifier_block *this,
 {
 	struct inet6_ifaddr *ifa = ptr;
 	const struct net_device *dev;
-	struct masq_dev_work *w;
-	struct net *net;
+	union nf_inet_addr addr;
 
-	if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16)
+	if (event != NETDEV_DOWN)
 		return NOTIFY_DONE;
 
 	dev = ifa->idev->dev;
-	net = maybe_get_net(dev_net(dev));
-	if (!net)
-		return NOTIFY_DONE;
 
-	if (!try_module_get(THIS_MODULE))
-		goto err_module;
+	memset(&addr, 0, sizeof(addr));
 
-	w = kmalloc(sizeof(*w), GFP_ATOMIC);
-	if (w) {
-		atomic_inc(&v6_worker_count);
-		INIT_WORK(&w->work, iterate_cleanup_work);
-		w->ifindex = dev->ifindex;
-		w->net = net;
-		w->addr = ifa->addr;
-		schedule_work(&w->work);
+	addr.in6 = ifa->addr;
 
-		return NOTIFY_DONE;
-	}
-
-	module_put(THIS_MODULE);
-err_module:
-	put_net(net);
+	nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp,
			     GFP_ATOMIC);
 
 	return NOTIFY_DONE;
 }
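Stripped of kernel specifics, the pattern is: the notifier only allocates and queues a request, a loosely-enforced counter caps concurrent walkers, and the expensive table walk runs where sleeping is allowed. A rough pthread analogue (userspace stand-ins throughout; compile with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define MAX_MASQ_WORKER_COUNT 16

static atomic_int masq_worker_count;

struct masq_dev_work {
	int ifindex;
};

/* The deferred walk: free to sleep, unlike the notifier. */
static void *iterate_cleanup_work(void *arg)
{
	struct masq_dev_work *w = arg;

	printf("walking conntrack entries for ifindex %d\n", w->ifindex);
	free(w);
	atomic_fetch_sub(&masq_worker_count, 1);
	return NULL;
}

static void nf_nat_masq_schedule(int ifindex)
{
	struct masq_dev_work *w;
	pthread_t tid;

	/* Too many pending walks: skip; entries time out eventually. */
	if (atomic_load(&masq_worker_count) > MAX_MASQ_WORKER_COUNT)
		return;

	w = calloc(1, sizeof(*w));
	if (!w)
		return;

	/* May overshoot the cap briefly, which is harmless. */
	atomic_fetch_add(&masq_worker_count, 1);
	w->ifindex = ifindex;
	pthread_create(&tid, NULL, iterate_cleanup_work, w);
	pthread_detach(tid);
}

int main(void)
{
	nf_nat_masq_schedule(3);
	nf_nat_masq_schedule(7);
	sleep(1);	/* let the detached workers run in this toy demo */
	return 0;
}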
net/netfilter/nf_tables_api.c
@@ -4336,7 +4336,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
 	if (ops->privsize != NULL)
 		size = ops->privsize(nla, &desc);
 	alloc_size = sizeof(*set) + size + udlen;
-	if (alloc_size < size)
+	if (alloc_size < size || alloc_size > INT_MAX)
 		return -ENOMEM;
 	set = kvzalloc(alloc_size, GFP_KERNEL);
 	if (!set)
@@ -9599,7 +9599,6 @@ static void __nft_release_table(struct net *net, struct nft_table *table)
 		table->use--;
 		nf_tables_chain_destroy(&ctx);
 	}
-	list_del(&table->list);
 	nf_tables_table_destroy(&ctx);
 }
 
@@ -9612,6 +9611,8 @@ static void __nft_release_tables(struct net *net)
 		if (nft_table_has_owner(table))
 			continue;
 
+		list_del(&table->list);
+
 		__nft_release_table(net, table);
 	}
 }
@@ -9619,31 +9620,38 @@ static void __nft_release_tables(struct net *net)
 static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
 			    void *ptr)
 {
+	struct nft_table *table, *to_delete[8];
 	struct nftables_pernet *nft_net;
 	struct netlink_notify *n = ptr;
-	struct nft_table *table, *nt;
 	struct net *net = n->net;
-	bool release = false;
+	unsigned int deleted;
+	bool restart = false;
 
 	if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
 		return NOTIFY_DONE;
 
 	nft_net = nft_pernet(net);
+	deleted = 0;
 	mutex_lock(&nft_net->commit_mutex);
+again:
 	list_for_each_entry(table, &nft_net->tables, list) {
 		if (nft_table_has_owner(table) &&
 		    n->portid == table->nlpid) {
 			__nft_release_hook(net, table);
-			release = true;
+			list_del_rcu(&table->list);
+			to_delete[deleted++] = table;
+			if (deleted >= ARRAY_SIZE(to_delete))
+				break;
 		}
 	}
-	if (release) {
+	if (deleted) {
+		restart = deleted >= ARRAY_SIZE(to_delete);
 		synchronize_rcu();
-		list_for_each_entry_safe(table, nt, &nft_net->tables, list) {
-			if (nft_table_has_owner(table) &&
-			    n->portid == table->nlpid)
-				__nft_release_table(net, table);
-		}
+		while (deleted)
+			__nft_release_table(net, to_delete[--deleted]);
+		if (restart)
+			goto again;
 	}
 	mutex_unlock(&nft_net->commit_mutex);
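The control flow deserves a standalone look: unlink matching entries first, wait out concurrent readers once per batch, only then free, and restart when the fixed-size batch fills. A compact sketch with synchronize_rcu() reduced to a stub and a plain singly linked list in place of the kernel's RCU list:

#include <stdio.h>
#include <stdlib.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct table {
	int owner;
	struct table *next;
};

static struct table *tables;

/* Stub: in the kernel this waits for all current RCU readers. */
static void synchronize_rcu(void) { }

static void release_owned(int portid)
{
	struct table *to_delete[8], **pp;
	unsigned int deleted;
	int restart;

again:
	deleted = 0;
	/* Phase 1: unlink matches, up to the batch size. */
	for (pp = &tables; *pp && deleted < ARRAY_SIZE(to_delete); ) {
		if ((*pp)->owner == portid) {
			to_delete[deleted++] = *pp;
			*pp = (*pp)->next;	/* unlink */
		} else {
			pp = &(*pp)->next;
		}
	}
	if (deleted) {
		restart = deleted >= ARRAY_SIZE(to_delete);
		synchronize_rcu();	/* readers can no longer see them */
		while (deleted)		/* Phase 2: actually free */
			free(to_delete[--deleted]);
		if (restart)
			goto again;	/* list may hold more matches */
	}
}

int main(void)
{
	for (int i = 0; i < 20; i++) {	/* 20 tables owned by portid 42 */
		struct table *t = calloc(1, sizeof(*t));
		t->owner = 42;
		t->next = tables;
		tables = t;
	}
	release_owned(42);
	printf("remaining: %s\n", tables ? "some" : "none");
	return 0;
}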
net/netfilter/nft_compat.c
@@ -19,6 +19,7 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_arp/arp_tables.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
 
 /* Used for matches where *info is larger than X byte */
 #define NFT_MATCH_LARGE_THRESH 192
@@ -257,8 +258,22 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	nft_compat_wait_for_destructors();
 
 	ret = xt_check_target(&par, size, proto, inv);
-	if (ret < 0)
+	if (ret < 0) {
+		if (ret == -ENOENT) {
+			const char *modname = NULL;
+
+			if (strcmp(target->name, "LOG") == 0)
+				modname = "nf_log_syslog";
+			else if (strcmp(target->name, "NFLOG") == 0)
+				modname = "nfnetlink_log";
+
+			if (modname &&
+			    nft_request_module(ctx->net, "%s", modname) == -EAGAIN)
+				return -EAGAIN;
+		}
 		return ret;
+	}
 
 	/* The standard target cannot be used */
 	if (!target->target)
net/netfilter/xt_LOG.c
@@ -44,6 +44,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
 static int log_tg_check(const struct xt_tgchk_param *par)
 {
 	const struct xt_log_info *loginfo = par->targinfo;
+	int ret;
 
 	if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6)
 		return -EINVAL;
@@ -58,7 +59,14 @@ static int log_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 	}
 
-	return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+	ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+	if (ret != 0 && !par->nft_compat) {
+		request_module("%s", "nf_log_syslog");
+
+		ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+	}
+
+	return ret;
 }
 
 static void log_tg_destroy(const struct xt_tgdtor_param *par)
net/netfilter/xt_NFLOG.c
@@ -42,13 +42,21 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
 static int nflog_tg_check(const struct xt_tgchk_param *par)
 {
 	const struct xt_nflog_info *info = par->targinfo;
+	int ret;
 
 	if (info->flags & ~XT_NFLOG_MASK)
 		return -EINVAL;
 	if (info->prefix[sizeof(info->prefix) - 1] != '\0')
 		return -EINVAL;
 
-	return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+	ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+	if (ret != 0 && !par->nft_compat) {
+		request_module("%s", "nfnetlink_log");
+
+		ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+	}
+
+	return ret;
 }
 
 static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
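The workaround boils down to "look up the logger, request the backend module, retry once". A toy model with module loading replaced by a flag (none of this is the kernel API, just the shape of the retry):

#include <stdio.h>
#include <string.h>

/* Userspace stand-ins: the lookup fails until the "module"
 * (here: a flag) has been loaded on demand. */
static int syslog_backend_loaded;

static int nf_logger_find_get(void)
{
	return syslog_backend_loaded ? 0 : -2;	/* -ENOENT */
}

static void request_module(const char *name)
{
	if (strcmp(name, "nf_log_syslog") == 0)
		syslog_backend_loaded = 1;
}

/* Mirrors the patched log_tg_check(): on failure, pull in the
 * backend explicitly, then retry the lookup once. */
static int log_tg_check(void)
{
	int ret = nf_logger_find_get();

	if (ret != 0) {
		request_module("nf_log_syslog");
		ret = nf_logger_find_get();
	}
	return ret;
}

int main(void)
{
	printf("log_tg_check() = %d\n", log_tg_check());
	return 0;
}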
tools/testing/selftests/netfilter/nft_nat_zones.sh (new file)

#!/bin/bash
#
# Test connection tracking zone and NAT source port reallocation support.
#
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
# Don't increase too much, 2000 clients should work
# just fine but script can then take several minutes with
# KASAN/debug builds.
maxclients=100
have_iperf=1
ret=0
# client1---.
# veth1-.
# |
# NAT Gateway --veth0--> Server
# | |
# veth2-' |
# client2---' |
# .... |
# clientX----vethX---'
# All clients share identical IP address.
# NAT Gateway uses policy routing and conntrack zones to isolate client
# namespaces. Each client connects to Server, each with colliding tuples:
# clientsaddr:10000 -> serveraddr:dport
# NAT Gateway is supposed to do port reallocation for each of the
# connections.
sfx=$(mktemp -u "XXXXXXXX")
gw="ns-gw-$sfx"
cl1="ns-cl1-$sfx"
cl2="ns-cl2-$sfx"
srv="ns-srv-$sfx"
v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null)
v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null)
v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null)
v6gc1=$(sysctl -n net.ipv6.neigh.default.gc_thresh1 2>/dev/null)
v6gc2=$(sysctl -n net.ipv6.neigh.default.gc_thresh2 2>/dev/null)
v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null)
cleanup()
{
ip netns del $gw
ip netns del $srv
for i in $(seq 1 $maxclients); do
ip netns del ns-cl$i-$sfx 2>/dev/null
done
sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null
sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null
sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null
sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null
sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null
sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null
}
nft --version > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without nft tool"
exit $ksft_skip
fi
ip -Version > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without ip tool"
exit $ksft_skip
fi
conntrack -V > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without conntrack tool"
exit $ksft_skip
fi
iperf3 -v >/dev/null 2>&1
if [ $? -ne 0 ];then
have_iperf=0
fi
ip netns add "$gw"
if [ $? -ne 0 ];then
echo "SKIP: Could not create net namespace $gw"
exit $ksft_skip
fi
ip -net "$gw" link set lo up
trap cleanup EXIT
ip netns add "$srv"
if [ $? -ne 0 ];then
echo "SKIP: Could not create server netns $srv"
exit $ksft_skip
fi
ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv"
ip -net "$gw" link set veth0 up
ip -net "$srv" link set lo up
ip -net "$srv" link set eth0 up
sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null
sysctl -q net.ipv6.neigh.default.gc_thresh2=1024 2>/dev/null
sysctl -q net.ipv6.neigh.default.gc_thresh3=4096 2>/dev/null
sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null
sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null
sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null
for i in $(seq 1 $maxclients);do
cl="ns-cl$i-$sfx"
ip netns add "$cl"
if [ $? -ne 0 ];then
echo "SKIP: Could not create client netns $cl"
exit $ksft_skip
fi
ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: No virtual ethernet pair device support in kernel"
exit $ksft_skip
fi
done
for i in $(seq 1 $maxclients);do
cl="ns-cl$i-$sfx"
echo netns exec "$cl" ip link set lo up
echo netns exec "$cl" ip link set eth0 up
echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2
echo netns exec "$gw" ip link set veth$i up
echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2
echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0
# clients have same IP addresses.
echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0
echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0
echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0
echo netns exec "$cl" ip route add default via dead:1::2 dev eth0
# NB: same addresses on client-facing interfaces.
echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i
echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i
# gw: policy routing
echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i))
echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i))
echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i))
echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i))
echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i))
done | ip -batch /dev/stdin
ip -net "$gw" addr add 10.3.0.1/24 dev veth0
ip -net "$gw" addr add dead:3::1/64 dev veth0
ip -net "$srv" addr add 10.3.0.99/24 dev eth0
ip -net "$srv" addr add dead:3::99/64 dev eth0
ip netns exec $gw nft -f /dev/stdin<<EOF
table inet raw {
map iiftomark {
type ifname : mark
}
map iiftozone {
typeof iifname : ct zone
}
set inicmp {
flags dynamic
type ipv4_addr . ifname . ipv4_addr
}
set inflows {
flags dynamic
type ipv4_addr . inet_service . ifname . ipv4_addr . inet_service
}
set inflows6 {
flags dynamic
type ipv6_addr . inet_service . ifname . ipv6_addr . inet_service
}
chain prerouting {
type filter hook prerouting priority -64000; policy accept;
ct original zone set meta iifname map @iiftozone
meta mark set meta iifname map @iiftomark
tcp flags & (syn|ack) == ack add @inflows { ip saddr . tcp sport . meta iifname . ip daddr . tcp dport counter }
add @inflows6 { ip6 saddr . tcp sport . meta iifname . ip6 daddr . tcp dport counter }
ip protocol icmp add @inicmp { ip saddr . meta iifname . ip daddr counter }
}
chain nat_postrouting {
type nat hook postrouting priority 0; policy accept;
ct mark set meta mark meta oifname veth0 masquerade
}
chain mangle_prerouting {
type filter hook prerouting priority -100; policy accept;
ct direction reply meta mark set ct mark
}
}
EOF
( echo add element inet raw iiftomark \{
for i in $(seq 1 $((maxclients-1))); do
echo \"veth$i\" : $i,
done
echo \"veth$maxclients\" : $maxclients \}
echo add element inet raw iiftozone \{
for i in $(seq 1 $((maxclients-1))); do
echo \"veth$i\" : $i,
done
echo \"veth$maxclients\" : $maxclients \}
) | ip netns exec $gw nft -f /dev/stdin
ip netns exec "$gw" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null
ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null
# useful for debugging: allows to use 'ping' from clients to gateway.
ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null
ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null
for i in $(seq 1 $maxclients); do
cl="ns-cl$i-$sfx"
ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 &
if [ $? -ne 0 ]; then
echo FAIL: Ping failure from $cl 1>&2
ret=1
break
fi
done
wait
for i in $(seq 1 $maxclients); do
ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }"
if [ $? -ne 0 ];then
ret=1
echo "FAIL: counter icmp mismatch for veth$i" 1>&2
ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2
break
fi
done
ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }"
if [ $? -ne 0 ];then
ret=1
echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }"
ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2
fi
if [ $ret -eq 0 ]; then
echo "PASS: ping test from all $maxclients namespaces"
fi
if [ $have_iperf -eq 0 ];then
echo "SKIP: iperf3 not installed"
if [ $ret -ne 0 ];then
exit $ret
fi
exit $ksft_skip
fi
ip netns exec $srv iperf3 -s > /dev/null 2>&1 &
iperfpid=$!
sleep 1
for i in $(seq 1 $maxclients); do
if [ $ret -ne 0 ]; then
break
fi
cl="ns-cl$i-$sfx"
ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null
if [ $? -ne 0 ]; then
echo FAIL: Failure to connect for $cl 1>&2
ip netns exec $gw conntrack -S 1>&2
ret=1
fi
done
if [ $ret -eq 0 ];then
echo "PASS: iperf3 connections for all $maxclients net namespaces"
fi
kill $iperfpid
wait
for i in $(seq 1 $maxclients); do
ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null
if [ $? -ne 0 ];then
ret=1
echo "FAIL: can't find expected tcp entry for veth$i" 1>&2
break
fi
done
if [ $ret -eq 0 ];then
echo "PASS: Found client connection for all $maxclients net namespaces"
fi
ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null
if [ $? -ne 0 ];then
ret=1
echo "FAIL: cannot find return entry on veth0" 1>&2
fi
exit $ret
tools/testing/selftests/netfilter/nft_zones_many.sh (new file)

#!/bin/bash
# Test insertion speed for packets with identical addresses/ports
# that are all placed in distinct conntrack zones.
sfx=$(mktemp -u "XXXXXXXX")
ns="ns-$sfx"
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
zones=20000
have_ct_tool=0
ret=0
cleanup()
{
ip netns del $ns
}
ip netns add $ns
if [ $? -ne 0 ];then
echo "SKIP: Could not create net namespace $gw"
exit $ksft_skip
fi
trap cleanup EXIT
conntrack -V > /dev/null 2>&1
if [ $? -eq 0 ];then
have_ct_tool=1
fi
ip -net "$ns" link set lo up
test_zones() {
local max_zones=$1
ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600
ip netns exec $ns nft -f /dev/stdin<<EOF
flush ruleset
table inet raw {
map rndzone {
typeof numgen inc mod $max_zones : ct zone
}
chain output {
type filter hook output priority -64000; policy accept;
udp dport 12345 ct zone set numgen inc mod 65536 map @rndzone
}
}
EOF
(
echo "add element inet raw rndzone {"
for i in $(seq 1 $max_zones);do
echo -n "$i : $i"
if [ $i -lt $max_zones ]; then
echo ","
else
echo "}"
fi
done
) | ip netns exec $ns nft -f /dev/stdin
local i=0
local j=0
local outerstart=$(date +%s%3N)
local stop=$outerstart
while [ $i -lt $max_zones ]; do
local start=$(date +%s%3N)
i=$((i + 10000))
j=$((j + 1))
dd if=/dev/zero of=/dev/stdout bs=8k count=10000 2>/dev/null | ip netns exec "$ns" nc -w 1 -q 1 -u -p 12345 127.0.0.1 12345 > /dev/null
if [ $? -ne 0 ] ;then
ret=1
break
fi
stop=$(date +%s%3N)
local duration=$((stop-start))
echo "PASS: added 10000 entries in $duration ms (now $i total, loop $j)"
done
if [ $have_ct_tool -eq 1 ]; then
local count=$(ip netns exec "$ns" conntrack -C)
local duration=$((stop-outerstart))
if [ $count -eq $max_zones ]; then
echo "PASS: inserted $count entries from packet path in $duration ms total"
else
ip netns exec $ns conntrack -S 1>&2
echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries"
ret=1
fi
fi
if [ $ret -ne 0 ];then
echo "FAIL: insert $max_zones entries from packet path" 1>&2
fi
}
test_conntrack_tool() {
local max_zones=$1
ip netns exec $ns conntrack -F >/dev/null 2>/dev/null
local outerstart=$(date +%s%3N)
local start=$(date +%s%3N)
local stop=$start
local i=0
while [ $i -lt $max_zones ]; do
i=$((i + 1))
ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \
--timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1
if [ $? -ne 0 ];then
ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \
--timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null
echo "FAIL: conntrack -I returned an error"
ret=1
break
fi
if [ $((i%10000)) -eq 0 ];then
stop=$(date +%s%3N)
local duration=$((stop-start))
echo "PASS: added 10000 entries in $duration ms (now $i total)"
start=$stop
fi
done
local count=$(ip netns exec "$ns" conntrack -C)
local duration=$((stop-outerstart))
if [ $count -eq $max_zones ]; then
echo "PASS: inserted $count entries via ctnetlink in $duration ms"
else
ip netns exec $ns conntrack -S 1>&2
echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)"
ret=1
fi
}
test_zones $zones
if [ $have_ct_tool -eq 1 ];then
test_conntrack_tool $zones
else
echo "SKIP: Could not run ctnetlink insertion test without conntrack tool"
if [ $ret -eq 0 ];then
exit $ksft_skip
fi
fi
exit $ret