Commit 452606d6 authored by David S. Miller's avatar David S. Miller

Merge branch 'bpf-cpumap-type-for-XDP_REDIRECT'

Jesper Dangaard Brouer says:

====================
net: New bpf cpumap type for XDP_REDIRECT

Introducing a new way to redirect XDP frames.  Notice how no driver
changes are necessary given the design of XDP_REDIRECT.

This redirect map type is called 'cpumap', as it allows redirection
XDP frames to remote CPUs.  The remote CPU will do the SKB allocation
and start the network stack invocation on that CPU.

This is a scalability and isolation mechanism, that allow separating
the early driver network XDP layer, from the rest of the netstack, and
assigning dedicated CPUs for this stage.  The sysadm control/configure
the RX-CPU to NIC-RX queue (as usual) via procfs smp_affinity and how
many queues are configured via ethtool --set-channels.  Benchmarks
show that a single CPU can handle approx 11Mpps.  Thus, only assigning
two NIC RX-queues (and two CPUs) is sufficient for handling 10Gbit/s
wirespeed smallest packet 14.88Mpps.  Reducing the number of queues
have the advantage that more packets being "bulk" available per hard
interrupt[1].

[1] https://www.netdevconf.org/2.1/papers/BusyPollingNextGen.pdf

Use-cases:

1. End-host based pre-filtering for DDoS mitigation.  This is fast
   enough to allow software to see and filter all packets wirespeed.
   Thus, no packets getting silently dropped by hardware.

2. Given NIC HW unevenly distributes packets across RX queue, this
   mechanism can be used for redistribution load across CPUs.  This
   usually happens when HW is unaware of a new protocol.  This
   resembles RPS (Receive Packet Steering), just faster, but with more
   responsibility placed on the BPF program for correct steering.

3. Auto-scaling or power saving via only activating the appropriate
   number of remote CPUs for handling the current load.  The cpumap
   tracepoints can function as a feedback loop for this purpose.

In V7, a --stress-mode was implemented for the samples program, which
between each stats update, adds + removes CPUs from the map
concurrently with traffic.  I did find and fix some concurrency issues
in the tear-down path, details in patch desc.  The stress test have
now been running for 15 hours without any issues, while being
bombarded with 11.6 Mpps via pktgen_sample04_many_flows.sh.

See individual patches for patchset-version changes.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 4b70c62b fad3917e
......@@ -355,6 +355,13 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
void __dev_map_flush(struct bpf_map *map);
struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
void __cpu_map_flush(struct bpf_map *map);
struct xdp_buff;
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
struct net_device *dev_rx);
/* Return map's numa specified by userspace */
static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
{
......@@ -362,7 +369,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
attr->numa_node : NUMA_NO_NODE;
}
#else
#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
{
return ERR_PTR(-EOPNOTSUPP);
......@@ -425,6 +432,28 @@ static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
static inline void __dev_map_flush(struct bpf_map *map)
{
}
static inline
struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
return NULL;
}
static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index)
{
}
static inline void __cpu_map_flush(struct bpf_map *map)
{
}
struct xdp_buff;
static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
struct xdp_buff *xdp,
struct net_device *dev_rx)
{
return 0;
}
#endif /* CONFIG_BPF_SYSCALL */
#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
......
......@@ -41,4 +41,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
#ifdef CONFIG_STREAM_PARSER
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
#endif
......@@ -3260,6 +3260,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);
int netif_rx_ni(struct sk_buff *skb);
int netif_receive_skb(struct sk_buff *skb);
int netif_receive_skb_core(struct sk_buff *skb);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
struct sk_buff *napi_get_frags(struct napi_struct *napi);
......
......@@ -136,14 +136,90 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
__entry->map_id, __entry->map_index)
);
#define devmap_ifindex(fwd, map) \
(!fwd ? 0 : \
(!map ? 0 : \
((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \
((struct net_device *)fwd)->ifindex : 0)))
#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \
trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0, \
trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \
0, map, idx)
#define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \
trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0, \
trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \
err, map, idx)
TRACE_EVENT(xdp_cpumap_kthread,
TP_PROTO(int map_id, unsigned int processed, unsigned int drops,
int sched),
TP_ARGS(map_id, processed, drops, sched),
TP_STRUCT__entry(
__field(int, map_id)
__field(u32, act)
__field(int, cpu)
__field(unsigned int, drops)
__field(unsigned int, processed)
__field(int, sched)
),
TP_fast_assign(
__entry->map_id = map_id;
__entry->act = XDP_REDIRECT;
__entry->cpu = smp_processor_id();
__entry->drops = drops;
__entry->processed = processed;
__entry->sched = sched;
),
TP_printk("kthread"
" cpu=%d map_id=%d action=%s"
" processed=%u drops=%u"
" sched=%d",
__entry->cpu, __entry->map_id,
__print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
__entry->processed, __entry->drops,
__entry->sched)
);
TRACE_EVENT(xdp_cpumap_enqueue,
TP_PROTO(int map_id, unsigned int processed, unsigned int drops,
int to_cpu),
TP_ARGS(map_id, processed, drops, to_cpu),
TP_STRUCT__entry(
__field(int, map_id)
__field(u32, act)
__field(int, cpu)
__field(unsigned int, drops)
__field(unsigned int, processed)
__field(int, to_cpu)
),
TP_fast_assign(
__entry->map_id = map_id;
__entry->act = XDP_REDIRECT;
__entry->cpu = smp_processor_id();
__entry->drops = drops;
__entry->processed = processed;
__entry->to_cpu = to_cpu;
),
TP_printk("enqueue"
" cpu=%d map_id=%d action=%s"
" processed=%u drops=%u"
" to_cpu=%d",
__entry->cpu, __entry->map_id,
__print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
__entry->processed, __entry->drops,
__entry->to_cpu)
);
#endif /* _TRACE_XDP_H */
#include <trace/define_trace.h>
......@@ -112,6 +112,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH_OF_MAPS,
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
BPF_MAP_TYPE_CPUMAP,
};
enum bpf_prog_type {
......
......@@ -5,6 +5,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
ifeq ($(CONFIG_STREAM_PARSER),y)
obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
endif
......
This diff is collapsed.
......@@ -592,6 +592,12 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
/* Need to create a kthread, thus must support schedule */
if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
}
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
* inside bpf map update or delete otherwise deadlocks are possible
*/
......@@ -622,7 +628,7 @@ static int map_update_elem(union bpf_attr *attr)
}
__this_cpu_dec(bpf_prog_active);
preempt_enable();
out:
if (!err)
trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
......
......@@ -1444,6 +1444,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
/* Restrict bpf side of cpumap, open when use-cases appear */
case BPF_MAP_TYPE_CPUMAP:
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
......@@ -1481,7 +1486,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
goto error;
break;
case BPF_FUNC_redirect_map:
if (map->map_type != BPF_MAP_TYPE_DEVMAP)
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
map->map_type != BPF_MAP_TYPE_CPUMAP)
goto error;
break;
case BPF_FUNC_sk_redirect_map:
......
......@@ -4492,6 +4492,33 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
return ret;
}
/**
* netif_receive_skb_core - special purpose version of netif_receive_skb
* @skb: buffer to process
*
* More direct receive version of netif_receive_skb(). It should
* only be used by callers that have a need to skip RPS and Generic XDP.
* Caller must also take care of handling if (page_is_)pfmemalloc.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb_core(struct sk_buff *skb)
{
int ret;
rcu_read_lock();
ret = __netif_receive_skb_core(skb, false);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(netif_receive_skb_core);
static int __netif_receive_skb(struct sk_buff *skb)
{
int ret;
......
......@@ -2526,10 +2526,36 @@ static int __bpf_tx_xdp(struct net_device *dev,
err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
if (err)
return err;
if (map)
dev->netdev_ops->ndo_xdp_flush(dev);
return 0;
}
static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_map *map,
struct xdp_buff *xdp,
u32 index)
{
int err;
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
struct net_device *dev = fwd;
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
if (err)
return err;
__dev_map_insert_ctx(map, index);
else
dev->netdev_ops->ndo_xdp_flush(dev);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
struct bpf_cpu_map_entry *rcpu = fwd;
err = cpu_map_enqueue(rcpu, xdp, dev_rx);
if (err)
return err;
__cpu_map_insert_ctx(map, index);
}
return 0;
}
......@@ -2539,11 +2565,33 @@ void xdp_do_flush_map(void)
struct bpf_map *map = ri->map_to_flush;
ri->map_to_flush = NULL;
if (map)
__dev_map_flush(map);
if (map) {
switch (map->map_type) {
case BPF_MAP_TYPE_DEVMAP:
__dev_map_flush(map);
break;
case BPF_MAP_TYPE_CPUMAP:
__cpu_map_flush(map);
break;
default:
break;
}
}
}
EXPORT_SYMBOL_GPL(xdp_do_flush_map);
static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
{
switch (map->map_type) {
case BPF_MAP_TYPE_DEVMAP:
return __dev_map_lookup_elem(map, index);
case BPF_MAP_TYPE_CPUMAP:
return __cpu_map_lookup_elem(map, index);
default:
return NULL;
}
}
static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
unsigned long aux)
{
......@@ -2556,8 +2604,8 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
unsigned long map_owner = ri->map_owner;
struct bpf_map *map = ri->map;
struct net_device *fwd = NULL;
u32 index = ri->ifindex;
void *fwd = NULL;
int err;
ri->ifindex = 0;
......@@ -2570,7 +2618,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
goto err;
}
fwd = __dev_map_lookup_elem(map, index);
fwd = __xdp_map_lookup_elem(map, index);
if (!fwd) {
err = -EINVAL;
goto err;
......@@ -2578,7 +2626,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
if (ri->map_to_flush && ri->map_to_flush != map)
xdp_do_flush_map();
err = __bpf_tx_xdp(fwd, map, xdp, index);
err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
if (unlikely(err))
goto err;
......@@ -2620,54 +2668,88 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
struct bpf_prog *xdp_prog)
static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
{
unsigned int len;
if (unlikely(!(fwd->flags & IFF_UP)))
return -ENETDOWN;
len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
if (skb->len > len)
return -EMSGSIZE;
return 0;
}
int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb,
struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
unsigned long map_owner = ri->map_owner;
struct bpf_map *map = ri->map;
struct net_device *fwd = NULL;
u32 index = ri->ifindex;
unsigned int len;
int err = 0;
ri->ifindex = 0;
ri->map = NULL;
ri->map_owner = 0;
if (map) {
if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
err = -EFAULT;
map = NULL;
goto err;
}
fwd = __dev_map_lookup_elem(map, index);
} else {
fwd = dev_get_by_index_rcu(dev_net(dev), index);
if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
err = -EFAULT;
map = NULL;
goto err;
}
fwd = __xdp_map_lookup_elem(map, index);
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
}
if (unlikely(!(fwd->flags & IFF_UP))) {
err = -ENETDOWN;
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
goto err;
skb->dev = fwd;
} else {
/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
err = -EBADRQC;
goto err;
}
len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
if (skb->len > len) {
err = -EMSGSIZE;
_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
return 0;
err:
_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
return err;
}
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
u32 index = ri->ifindex;
struct net_device *fwd;
int err = 0;
if (ri->map)
return xdp_do_generic_redirect_map(dev, skb, xdp_prog);
ri->ifindex = 0;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
}
if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
goto err;
skb->dev = fwd;
map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index)
: _trace_xdp_redirect(dev, xdp_prog, index);
_trace_xdp_redirect(dev, xdp_prog, index);
return 0;
err:
map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err)
: _trace_xdp_redirect_err(dev, xdp_prog, index, err);
_trace_xdp_redirect_err(dev, xdp_prog, index, err);
return err;
}
EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
......
......@@ -39,6 +39,7 @@ hostprogs-y += per_socket_stats_example
hostprogs-y += load_sock_ops
hostprogs-y += xdp_redirect
hostprogs-y += xdp_redirect_map
hostprogs-y += xdp_redirect_cpu
hostprogs-y += xdp_monitor
hostprogs-y += syscall_tp
......@@ -84,6 +85,7 @@ test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o
per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o
xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
......@@ -129,6 +131,7 @@ always += tcp_iw_kern.o
always += tcp_clamp_kern.o
always += xdp_redirect_kern.o
always += xdp_redirect_map_kern.o
always += xdp_redirect_cpu_kern.o
always += xdp_monitor_kern.o
always += syscall_tp_kern.o
......@@ -169,6 +172,7 @@ HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
HOSTLOADLIBES_test_map_in_map += -lelf
HOSTLOADLIBES_xdp_redirect += -lelf
HOSTLOADLIBES_xdp_redirect_map += -lelf
HOSTLOADLIBES_xdp_redirect_cpu += -lelf
HOSTLOADLIBES_xdp_monitor += -lelf
HOSTLOADLIBES_syscall_tp += -lelf
......
This diff is collapsed.
This diff is collapsed.
......@@ -112,6 +112,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH_OF_MAPS,
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
BPF_MAP_TYPE_CPUMAP,
};
enum bpf_prog_type {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment