Commit 8537f786 authored by Lukas Wunner's avatar Lukas Wunner Committed by Pablo Neira Ayuso

netfilter: Introduce egress hook

Commit e687ad60 ("netfilter: add netfilter ingress hook after
handle_ing() under unique static key") introduced the ability to
classify packets on ingress.

Allow the same on egress.  Position the hook immediately before a packet
is handed to tc and then sent out on an interface, thereby mirroring the
ingress order.  This order allows marking packets in the netfilter
egress hook and subsequently using the mark in tc.  Another benefit of
this order is consistency with a lot of existing documentation which
says that egress tc is performed after netfilter hooks.

Egress hooks already exist for the most common protocols, such as
NF_INET_LOCAL_OUT or NF_ARP_OUT, and those are to be preferred because
they are executed earlier during packet processing.  However for more
exotic protocols, there is currently no provision to apply netfilter on
egress.  A common workaround is to enslave the interface to a bridge and
use ebtables, or to resort to tc.  But when the ingress hook was
introduced, consensus was that users should be given the choice to use
netfilter or tc, whichever tool suits their needs best:
https://lore.kernel.org/netdev/20150430153317.GA3230@salvia/
This hook is also useful for NAT46/NAT64, tunneling and filtering of
locally generated af_packet traffic such as dhclient.

There have also been occasional user requests for a netfilter egress
hook in the past, e.g.:
https://www.spinics.net/lists/netfilter/msg50038.html

Performance measurements with pktgen surprisingly show a speedup rather
than a slowdown with this commit:

* Without this commit:
  Result: OK: 34240933(c34238375+d2558) usec, 100000000 (60byte,0frags)
  2920481pps 1401Mb/sec (1401830880bps) errors: 0

* With this commit:
  Result: OK: 33997299(c33994193+d3106) usec, 100000000 (60byte,0frags)
  2941410pps 1411Mb/sec (1411876800bps) errors: 0

* Without this commit + tc egress:
  Result: OK: 39022386(c39019547+d2839) usec, 100000000 (60byte,0frags)
  2562631pps 1230Mb/sec (1230062880bps) errors: 0

* With this commit + tc egress:
  Result: OK: 37604447(c37601877+d2570) usec, 100000000 (60byte,0frags)
  2659259pps 1276Mb/sec (1276444320bps) errors: 0

* With this commit + nft egress:
  Result: OK: 41436689(c41434088+d2600) usec, 100000000 (60byte,0frags)
  2413320pps 1158Mb/sec (1158393600bps) errors: 0

Tested on a bare-metal Core i7-3615QM, each measurement was performed
three times to verify that the numbers are stable.

Commands to perform a measurement:
modprobe pktgen
echo "add_device lo@3" > /proc/net/pktgen/kpktgend_3
samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i 'lo@3' -n 100000000

Commands for testing tc egress:
tc qdisc add dev lo clsact
tc filter add dev lo egress protocol ip prio 1 u32 match ip dst 4.3.2.1/32

Commands for testing nft egress:
nft add table netdev t
nft add chain netdev t co \{ type filter hook egress device lo priority 0 \; \}
nft add rule netdev t co ip daddr 4.3.2.1/32 drop

All testing was performed on the loopback interface to avoid distorting
measurements by the packet handling in the low-level Ethernet driver.
Signed-off-by: default avatarLukas Wunner <lukas@wunner.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: default avatarPablo Neira Ayuso <pablo@netfilter.org>
parent 5418d388
...@@ -1751,6 +1751,7 @@ enum netdev_priv_flags { ...@@ -1751,6 +1751,7 @@ enum netdev_priv_flags {
* @xps_maps: XXX: need comments on this one * @xps_maps: XXX: need comments on this one
* @miniq_egress: clsact qdisc specific data for * @miniq_egress: clsact qdisc specific data for
* egress processing * egress processing
* @nf_hooks_egress: netfilter hooks executed for egress packets
* @qdisc_hash: qdisc hash table * @qdisc_hash: qdisc hash table
* @watchdog_timeo: Represents the timeout that is used by * @watchdog_timeo: Represents the timeout that is used by
* the watchdog (see dev_watchdog()) * the watchdog (see dev_watchdog())
...@@ -2026,6 +2027,9 @@ struct net_device { ...@@ -2026,6 +2027,9 @@ struct net_device {
#ifdef CONFIG_NET_CLS_ACT #ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress; struct mini_Qdisc __rcu *miniq_egress;
#endif #endif
#ifdef CONFIG_NETFILTER_EGRESS
struct nf_hook_entries __rcu *nf_hooks_egress;
#endif
#ifdef CONFIG_NET_SCHED #ifdef CONFIG_NET_SCHED
DECLARE_HASHTABLE (qdisc_hash, 4); DECLARE_HASHTABLE (qdisc_hash, 4);
......
...@@ -47,6 +47,9 @@ static inline void nf_hook_netdev_init(struct net_device *dev) ...@@ -47,6 +47,9 @@ static inline void nf_hook_netdev_init(struct net_device *dev)
#ifdef CONFIG_NETFILTER_INGRESS #ifdef CONFIG_NETFILTER_INGRESS
RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
#endif #endif
#ifdef CONFIG_NETFILTER_EGRESS
RCU_INIT_POINTER(dev->nf_hooks_egress, NULL);
#endif
} }
#ifdef CONFIG_NETFILTER_INGRESS #ifdef CONFIG_NETFILTER_INGRESS
...@@ -72,4 +75,28 @@ static inline int nf_hook_ingress(struct sk_buff *skb) ...@@ -72,4 +75,28 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
return 0; return 0;
} }
#endif /* CONFIG_NETFILTER_INGRESS */ #endif /* CONFIG_NETFILTER_INGRESS */
#ifdef CONFIG_NETFILTER_EGRESS
static inline bool nf_hook_egress_active(const struct sk_buff *skb)
{
return nf_hook_netdev_active(NF_NETDEV_EGRESS,
skb->dev->nf_hooks_egress);
}
static inline int nf_hook_egress(struct sk_buff *skb)
{
return nf_hook_netdev(skb, NF_NETDEV_EGRESS,
skb->dev->nf_hooks_egress);
}
#else /* CONFIG_NETFILTER_EGRESS */
static inline int nf_hook_egress_active(struct sk_buff *skb)
{
return 0;
}
static inline int nf_hook_egress(struct sk_buff *skb)
{
return 0;
}
#endif /* CONFIG_NETFILTER_EGRESS */
#endif /* _NETFILTER_INGRESS_H_ */ #endif /* _NETFILTER_INGRESS_H_ */
...@@ -50,6 +50,7 @@ enum nf_inet_hooks { ...@@ -50,6 +50,7 @@ enum nf_inet_hooks {
enum nf_dev_hooks { enum nf_dev_hooks {
NF_NETDEV_INGRESS, NF_NETDEV_INGRESS,
NF_NETDEV_EGRESS,
NF_NETDEV_NUMHOOKS NF_NETDEV_NUMHOOKS
}; };
......
...@@ -3773,6 +3773,7 @@ EXPORT_SYMBOL(dev_loopback_xmit); ...@@ -3773,6 +3773,7 @@ EXPORT_SYMBOL(dev_loopback_xmit);
static struct sk_buff * static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{ {
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
struct tcf_result cl_res; struct tcf_result cl_res;
...@@ -3806,11 +3807,24 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) ...@@ -3806,11 +3807,24 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
default: default:
break; break;
} }
#endif /* CONFIG_NET_CLS_ACT */
return skb; return skb;
} }
#endif /* CONFIG_NET_EGRESS */ #endif /* CONFIG_NET_EGRESS */
static inline int nf_egress(struct sk_buff *skb)
{
if (nf_hook_egress_active(skb)) {
int ret;
rcu_read_lock();
ret = nf_hook_egress(skb);
rcu_read_unlock();
return ret;
}
return 0;
}
#ifdef CONFIG_XPS #ifdef CONFIG_XPS
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
struct xps_dev_maps *dev_maps, unsigned int tci) struct xps_dev_maps *dev_maps, unsigned int tci)
...@@ -3997,13 +4011,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) ...@@ -3997,13 +4011,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
qdisc_pkt_len_init(skb); qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT #ifdef CONFIG_NET_CLS_ACT
skb->tc_at_ingress = 0; skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS #endif
#ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) { if (static_branch_unlikely(&egress_needed_key)) {
if (nf_egress(skb) < 0)
goto out;
skb = sch_handle_egress(skb, &rc, dev); skb = sch_handle_egress(skb, &rc, dev);
if (!skb) if (!skb)
goto out; goto out;
} }
# endif
#endif #endif
/* If device/qdisc don't need skb->dst, release it right now while /* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache. * its hot in this cpu cache.
......
...@@ -10,6 +10,14 @@ config NETFILTER_INGRESS ...@@ -10,6 +10,14 @@ config NETFILTER_INGRESS
This allows you to classify packets from ingress using the Netfilter This allows you to classify packets from ingress using the Netfilter
infrastructure. infrastructure.
config NETFILTER_EGRESS
bool "Netfilter egress support"
default y
select NET_EGRESS
help
This allows you to classify packets before transmission using the
Netfilter infrastructure.
config NETFILTER_NETLINK config NETFILTER_NETLINK
tristate tristate
......
...@@ -306,6 +306,12 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, ...@@ -306,6 +306,12 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
if (dev && dev_net(dev) == net) if (dev && dev_net(dev) == net)
return &dev->nf_hooks_ingress; return &dev->nf_hooks_ingress;
} }
#endif
#ifdef CONFIG_NETFILTER_EGRESS
if (hooknum == NF_NETDEV_EGRESS) {
if (dev && dev_net(dev) == net)
return &dev->nf_hooks_egress;
}
#endif #endif
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
return NULL; return NULL;
...@@ -318,11 +324,13 @@ static int __nf_register_net_hook(struct net *net, int pf, ...@@ -318,11 +324,13 @@ static int __nf_register_net_hook(struct net *net, int pf,
struct nf_hook_entries __rcu **pp; struct nf_hook_entries __rcu **pp;
if (pf == NFPROTO_NETDEV) { if (pf == NFPROTO_NETDEV) {
#ifndef CONFIG_NETFILTER_INGRESS if ((!IS_ENABLED(CONFIG_NETFILTER_INGRESS) &&
if (reg->hooknum == NF_NETDEV_INGRESS) reg->hooknum == NF_NETDEV_INGRESS) ||
(!IS_ENABLED(CONFIG_NETFILTER_EGRESS) &&
reg->hooknum == NF_NETDEV_EGRESS))
return -EOPNOTSUPP; return -EOPNOTSUPP;
#endif if ((reg->hooknum != NF_NETDEV_INGRESS &&
if (reg->hooknum != NF_NETDEV_INGRESS || reg->hooknum != NF_NETDEV_EGRESS) ||
!reg->dev || dev_net(reg->dev) != net) !reg->dev || dev_net(reg->dev) != net)
return -EINVAL; return -EINVAL;
} }
...@@ -348,6 +356,10 @@ static int __nf_register_net_hook(struct net *net, int pf, ...@@ -348,6 +356,10 @@ static int __nf_register_net_hook(struct net *net, int pf,
if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
net_inc_ingress_queue(); net_inc_ingress_queue();
#endif #endif
#ifdef CONFIG_NETFILTER_EGRESS
if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS)
net_inc_egress_queue();
#endif
#ifdef CONFIG_JUMP_LABEL #ifdef CONFIG_JUMP_LABEL
static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]); static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]);
#endif #endif
...@@ -406,6 +418,10 @@ static void __nf_unregister_net_hook(struct net *net, int pf, ...@@ -406,6 +418,10 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
net_dec_ingress_queue(); net_dec_ingress_queue();
#endif #endif
#ifdef CONFIG_NETFILTER_EGRESS
if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS)
net_dec_egress_queue();
#endif
#ifdef CONFIG_JUMP_LABEL #ifdef CONFIG_JUMP_LABEL
static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]); static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
#endif #endif
......
...@@ -277,9 +277,11 @@ static const struct nft_chain_type nft_chain_filter_netdev = { ...@@ -277,9 +277,11 @@ static const struct nft_chain_type nft_chain_filter_netdev = {
.name = "filter", .name = "filter",
.type = NFT_CHAIN_T_DEFAULT, .type = NFT_CHAIN_T_DEFAULT,
.family = NFPROTO_NETDEV, .family = NFPROTO_NETDEV,
.hook_mask = (1 << NF_NETDEV_INGRESS), .hook_mask = (1 << NF_NETDEV_INGRESS) |
(1 << NF_NETDEV_EGRESS),
.hooks = { .hooks = {
[NF_NETDEV_INGRESS] = nft_do_chain_netdev, [NF_NETDEV_INGRESS] = nft_do_chain_netdev,
[NF_NETDEV_EGRESS] = nft_do_chain_netdev,
}, },
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment