Commit ba926603 authored by Alexei Starovoitov

Merge branch 'xdp_redirect-bulking'

Toke Høiland-Jørgensen says:

====================
Since commit 96360004 ("xdp: Make devmap flush_list common for all map
instances"), devmap flushing is a global operation instead of tied to a
particular map. This means that with a bit of refactoring, we can finally fix
the performance delta between the bpf_redirect_map() and bpf_redirect() helper
functions, by introducing bulking for the latter as well.

This series makes this change by moving the data structure used for the bulking
into struct net_device itself, so we can access it even when there is no
devmap. Once this is done, moving the bpf_redirect() helper to use the bulking
mechanism becomes quite trivial, and brings bpf_redirect() up to the same
performance as bpf_redirect_map():

                       Before:   After:
1 CPU:
bpf_redirect_map:      8.4 Mpps  8.4 Mpps  (no change)
bpf_redirect:          5.0 Mpps  8.4 Mpps  (+68%)
2 CPUs:
bpf_redirect_map:     15.9 Mpps  16.1 Mpps  (+1% or ~no change)
bpf_redirect:          9.5 Mpps  15.9 Mpps  (+67%)

After this patch series, the only semantic difference between the two variants
of the bpf() helper (apart from the absence of a map argument, obviously) is
that the _map() variant will return an error if passed an invalid map index,
whereas the bpf_redirect() helper will succeed, but drop packets on
xdp_do_redirect(). This is because the helper has no reference to the calling
netdev, so unfortunately we can't do the ifindex lookup directly in the helper.

Changelog:

v3:
  - Switch two more fields to avoid a list_head spanning two cache lines
  - Include Jesper's tracepoint patch
  - Also rename xdp_do_flush_map()
  - Fix a few nits from Maciej

v2:
  - Consolidate code paths and tracepoints for map and non-map redirect variants
    (Björn)
  - Add performance data for 2-CPU test (Jesper)
  - Move fields to avoid shifting cache lines in struct net_device (Eric)
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 20f21d98 58aa94f9
...@@ -1718,7 +1718,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, ...@@ -1718,7 +1718,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
if (err < 0) if (err < 0)
goto err_xdp; goto err_xdp;
if (err == XDP_REDIRECT) if (err == XDP_REDIRECT)
xdp_do_flush_map(); xdp_do_flush();
if (err != XDP_PASS) if (err != XDP_PASS)
goto out; goto out;
...@@ -2549,7 +2549,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) ...@@ -2549,7 +2549,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
} }
if (flush) if (flush)
xdp_do_flush_map(); xdp_do_flush();
rcu_read_unlock(); rcu_read_unlock();
local_bh_enable(); local_bh_enable();
......
...@@ -769,7 +769,7 @@ static int veth_poll(struct napi_struct *napi, int budget) ...@@ -769,7 +769,7 @@ static int veth_poll(struct napi_struct *napi, int budget)
if (xdp_xmit & VETH_XDP_TX) if (xdp_xmit & VETH_XDP_TX)
veth_xdp_flush(rq->dev, &bq); veth_xdp_flush(rq->dev, &bq);
if (xdp_xmit & VETH_XDP_REDIR) if (xdp_xmit & VETH_XDP_REDIR)
xdp_do_flush_map(); xdp_do_flush();
xdp_clear_return_frame_no_direct(); xdp_clear_return_frame_no_direct();
return done; return done;
......
...@@ -1432,7 +1432,7 @@ static int virtnet_poll(struct napi_struct *napi, int budget) ...@@ -1432,7 +1432,7 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
virtqueue_napi_complete(napi, rq->vq, received); virtqueue_napi_complete(napi, rq->vq, received);
if (xdp_xmit & VIRTIO_XDP_REDIR) if (xdp_xmit & VIRTIO_XDP_REDIR)
xdp_do_flush_map(); xdp_do_flush();
if (xdp_xmit & VIRTIO_XDP_TX) { if (xdp_xmit & VIRTIO_XDP_TX) {
sq = virtnet_xdp_sq(vi); sq = virtnet_xdp_sq(vi);
......
...@@ -1056,7 +1056,9 @@ struct sk_buff; ...@@ -1056,7 +1056,9 @@ struct sk_buff;
struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
void __dev_map_flush(void); void __dev_flush(void);
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx);
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx); struct net_device *dev_rx);
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
...@@ -1169,13 +1171,20 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map ...@@ -1169,13 +1171,20 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map
return NULL; return NULL;
} }
static inline void __dev_map_flush(void) static inline void __dev_flush(void)
{ {
} }
struct xdp_buff; struct xdp_buff;
struct bpf_dtab_netdev; struct bpf_dtab_netdev;
static inline
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
return 0;
}
static inline static inline
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx) struct net_device *dev_rx)
......
...@@ -918,7 +918,7 @@ static inline int xdp_ok_fwd_dev(const struct net_device *fwd, ...@@ -918,7 +918,7 @@ static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
return 0; return 0;
} }
/* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the /* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the
* same cpu context. Further for best results no more than a single map * same cpu context. Further for best results no more than a single map
* for the do_redirect/do_flush pair should be used. This limitation is * for the do_redirect/do_flush pair should be used. This limitation is
* because we only track one map and force a flush when the map changes. * because we only track one map and force a flush when the map changes.
...@@ -929,7 +929,13 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, ...@@ -929,7 +929,13 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
int xdp_do_redirect(struct net_device *dev, int xdp_do_redirect(struct net_device *dev,
struct xdp_buff *xdp, struct xdp_buff *xdp,
struct bpf_prog *prog); struct bpf_prog *prog);
void xdp_do_flush_map(void); void xdp_do_flush(void);
/* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as
* it is no longer only flushing maps. Keep this define for compatibility
* until all drivers are updated - do not use xdp_do_flush_map() in new code!
*/
#define xdp_do_flush_map xdp_do_flush
void bpf_warn_invalid_xdp_action(u32 act); void bpf_warn_invalid_xdp_action(u32 act);
......
...@@ -876,6 +876,7 @@ enum bpf_netdev_command { ...@@ -876,6 +876,7 @@ enum bpf_netdev_command {
struct bpf_prog_offload_ops; struct bpf_prog_offload_ops;
struct netlink_ext_ack; struct netlink_ext_ack;
struct xdp_umem; struct xdp_umem;
struct xdp_dev_bulk_queue;
struct netdev_bpf { struct netdev_bpf {
enum bpf_netdev_command command; enum bpf_netdev_command command;
...@@ -1986,12 +1987,10 @@ struct net_device { ...@@ -1986,12 +1987,10 @@ struct net_device {
unsigned int num_tx_queues; unsigned int num_tx_queues;
unsigned int real_num_tx_queues; unsigned int real_num_tx_queues;
struct Qdisc *qdisc; struct Qdisc *qdisc;
#ifdef CONFIG_NET_SCHED
DECLARE_HASHTABLE (qdisc_hash, 4);
#endif
unsigned int tx_queue_len; unsigned int tx_queue_len;
spinlock_t tx_global_lock; spinlock_t tx_global_lock;
int watchdog_timeo;
struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
#ifdef CONFIG_XPS #ifdef CONFIG_XPS
struct xps_dev_maps __rcu *xps_cpus_map; struct xps_dev_maps __rcu *xps_cpus_map;
...@@ -2001,11 +2000,15 @@ struct net_device { ...@@ -2001,11 +2000,15 @@ struct net_device {
struct mini_Qdisc __rcu *miniq_egress; struct mini_Qdisc __rcu *miniq_egress;
#endif #endif
#ifdef CONFIG_NET_SCHED
DECLARE_HASHTABLE (qdisc_hash, 4);
#endif
/* These may be needed for future network-power-down code. */ /* These may be needed for future network-power-down code. */
struct timer_list watchdog_timer; struct timer_list watchdog_timer;
int watchdog_timeo;
int __percpu *pcpu_refcnt;
struct list_head todo_list; struct list_head todo_list;
int __percpu *pcpu_refcnt;
struct list_head link_watch_list; struct list_head link_watch_list;
......
...@@ -79,14 +79,26 @@ TRACE_EVENT(xdp_bulk_tx, ...@@ -79,14 +79,26 @@ TRACE_EVENT(xdp_bulk_tx,
__entry->sent, __entry->drops, __entry->err) __entry->sent, __entry->drops, __entry->err)
); );
#ifndef __DEVMAP_OBJ_TYPE
#define __DEVMAP_OBJ_TYPE
struct _bpf_dtab_netdev {
struct net_device *dev;
};
#endif /* __DEVMAP_OBJ_TYPE */
#define devmap_ifindex(tgt, map) \
(((map->map_type == BPF_MAP_TYPE_DEVMAP || \
map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? \
((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0)
DECLARE_EVENT_CLASS(xdp_redirect_template, DECLARE_EVENT_CLASS(xdp_redirect_template,
TP_PROTO(const struct net_device *dev, TP_PROTO(const struct net_device *dev,
const struct bpf_prog *xdp, const struct bpf_prog *xdp,
int to_ifindex, int err, const void *tgt, int err,
const struct bpf_map *map, u32 map_index), const struct bpf_map *map, u32 index),
TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), TP_ARGS(dev, xdp, tgt, err, map, index),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(int, prog_id) __field(int, prog_id)
...@@ -103,90 +115,65 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, ...@@ -103,90 +115,65 @@ DECLARE_EVENT_CLASS(xdp_redirect_template,
__entry->act = XDP_REDIRECT; __entry->act = XDP_REDIRECT;
__entry->ifindex = dev->ifindex; __entry->ifindex = dev->ifindex;
__entry->err = err; __entry->err = err;
__entry->to_ifindex = to_ifindex; __entry->to_ifindex = map ? devmap_ifindex(tgt, map) :
index;
__entry->map_id = map ? map->id : 0; __entry->map_id = map ? map->id : 0;
__entry->map_index = map_index; __entry->map_index = map ? index : 0;
), ),
TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d", TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d"
" map_id=%d map_index=%d",
__entry->prog_id, __entry->prog_id,
__print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
__entry->ifindex, __entry->to_ifindex, __entry->ifindex, __entry->to_ifindex,
__entry->err) __entry->err, __entry->map_id, __entry->map_index)
); );
DEFINE_EVENT(xdp_redirect_template, xdp_redirect, DEFINE_EVENT(xdp_redirect_template, xdp_redirect,
TP_PROTO(const struct net_device *dev, TP_PROTO(const struct net_device *dev,
const struct bpf_prog *xdp, const struct bpf_prog *xdp,
int to_ifindex, int err, const void *tgt, int err,
const struct bpf_map *map, u32 map_index), const struct bpf_map *map, u32 index),
TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) TP_ARGS(dev, xdp, tgt, err, map, index)
); );
DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err,
TP_PROTO(const struct net_device *dev, TP_PROTO(const struct net_device *dev,
const struct bpf_prog *xdp, const struct bpf_prog *xdp,
int to_ifindex, int err, const void *tgt, int err,
const struct bpf_map *map, u32 map_index), const struct bpf_map *map, u32 index),
TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) TP_ARGS(dev, xdp, tgt, err, map, index)
); );
#define _trace_xdp_redirect(dev, xdp, to) \ #define _trace_xdp_redirect(dev, xdp, to) \
trace_xdp_redirect(dev, xdp, to, 0, NULL, 0); trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to);
#define _trace_xdp_redirect_err(dev, xdp, to, err) \ #define _trace_xdp_redirect_err(dev, xdp, to, err) \
trace_xdp_redirect_err(dev, xdp, to, err, NULL, 0); trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to);
DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map, #define _trace_xdp_redirect_map(dev, xdp, to, map, index) \
trace_xdp_redirect(dev, xdp, to, 0, map, index);
#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err) \
trace_xdp_redirect_err(dev, xdp, to, err, map, index);
/* not used anymore, but kept around so as not to break old programs */
DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map,
TP_PROTO(const struct net_device *dev, TP_PROTO(const struct net_device *dev,
const struct bpf_prog *xdp, const struct bpf_prog *xdp,
int to_ifindex, int err, const void *tgt, int err,
const struct bpf_map *map, u32 map_index), const struct bpf_map *map, u32 index),
TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), TP_ARGS(dev, xdp, tgt, err, map, index)
TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d"
" map_id=%d map_index=%d",
__entry->prog_id,
__print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
__entry->ifindex, __entry->to_ifindex,
__entry->err,
__entry->map_id, __entry->map_index)
); );
DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err,
TP_PROTO(const struct net_device *dev, TP_PROTO(const struct net_device *dev,
const struct bpf_prog *xdp, const struct bpf_prog *xdp,
int to_ifindex, int err, const void *tgt, int err,
const struct bpf_map *map, u32 map_index), const struct bpf_map *map, u32 index),
TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), TP_ARGS(dev, xdp, tgt, err, map, index)
TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d"
" map_id=%d map_index=%d",
__entry->prog_id,
__print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
__entry->ifindex, __entry->to_ifindex,
__entry->err,
__entry->map_id, __entry->map_index)
); );
#ifndef __DEVMAP_OBJ_TYPE
#define __DEVMAP_OBJ_TYPE
struct _bpf_dtab_netdev {
struct net_device *dev;
};
#endif /* __DEVMAP_OBJ_TYPE */
#define devmap_ifindex(fwd, map) \
((map->map_type == BPF_MAP_TYPE_DEVMAP || \
map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \
((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \
trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \
0, map, idx)
#define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \
trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \
err, map, idx)
TRACE_EVENT(xdp_cpumap_kthread, TRACE_EVENT(xdp_cpumap_kthread,
TP_PROTO(int map_id, unsigned int processed, unsigned int drops, TP_PROTO(int map_id, unsigned int processed, unsigned int drops,
...@@ -259,43 +246,38 @@ TRACE_EVENT(xdp_cpumap_enqueue, ...@@ -259,43 +246,38 @@ TRACE_EVENT(xdp_cpumap_enqueue,
TRACE_EVENT(xdp_devmap_xmit, TRACE_EVENT(xdp_devmap_xmit,
TP_PROTO(const struct bpf_map *map, u32 map_index, TP_PROTO(const struct net_device *from_dev,
int sent, int drops, const struct net_device *to_dev,
const struct net_device *from_dev, int sent, int drops, int err),
const struct net_device *to_dev, int err),
TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err), TP_ARGS(from_dev, to_dev, sent, drops, err),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(int, map_id) __field(int, from_ifindex)
__field(u32, act) __field(u32, act)
__field(u32, map_index) __field(int, to_ifindex)
__field(int, drops) __field(int, drops)
__field(int, sent) __field(int, sent)
__field(int, from_ifindex)
__field(int, to_ifindex)
__field(int, err) __field(int, err)
), ),
TP_fast_assign( TP_fast_assign(
__entry->map_id = map->id; __entry->from_ifindex = from_dev->ifindex;
__entry->act = XDP_REDIRECT; __entry->act = XDP_REDIRECT;
__entry->map_index = map_index; __entry->to_ifindex = to_dev->ifindex;
__entry->drops = drops; __entry->drops = drops;
__entry->sent = sent; __entry->sent = sent;
__entry->from_ifindex = from_dev->ifindex;
__entry->to_ifindex = to_dev->ifindex;
__entry->err = err; __entry->err = err;
), ),
TP_printk("ndo_xdp_xmit" TP_printk("ndo_xdp_xmit"
" map_id=%d map_index=%d action=%s" " from_ifindex=%d to_ifindex=%d action=%s"
" sent=%d drops=%d" " sent=%d drops=%d"
" from_ifindex=%d to_ifindex=%d err=%d", " err=%d",
__entry->map_id, __entry->map_index, __entry->from_ifindex, __entry->to_ifindex,
__print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
__entry->sent, __entry->drops, __entry->sent, __entry->drops,
__entry->from_ifindex, __entry->to_ifindex, __entry->err) __entry->err)
); );
/* Expect users already include <net/xdp.h>, but not xdp_priv.h */ /* Expect users already include <net/xdp.h>, but not xdp_priv.h */
......
...@@ -53,13 +53,11 @@ ...@@ -53,13 +53,11 @@
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
#define DEV_MAP_BULK_SIZE 16 #define DEV_MAP_BULK_SIZE 16
struct bpf_dtab_netdev; struct xdp_dev_bulk_queue {
struct xdp_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct xdp_frame *q[DEV_MAP_BULK_SIZE];
struct list_head flush_node; struct list_head flush_node;
struct net_device *dev;
struct net_device *dev_rx; struct net_device *dev_rx;
struct bpf_dtab_netdev *obj;
unsigned int count; unsigned int count;
}; };
...@@ -67,9 +65,8 @@ struct bpf_dtab_netdev { ...@@ -67,9 +65,8 @@ struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */ struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist; struct hlist_node index_hlist;
struct bpf_dtab *dtab; struct bpf_dtab *dtab;
struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu; struct rcu_head rcu;
unsigned int idx; /* keep track of map index for tracepoint */ unsigned int idx;
}; };
struct bpf_dtab { struct bpf_dtab {
...@@ -84,7 +81,7 @@ struct bpf_dtab { ...@@ -84,7 +81,7 @@ struct bpf_dtab {
u32 n_buckets; u32 n_buckets;
}; };
static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock); static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list); static LIST_HEAD(dev_map_list);
...@@ -219,7 +216,6 @@ static void dev_map_free(struct bpf_map *map) ...@@ -219,7 +216,6 @@ static void dev_map_free(struct bpf_map *map)
hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_for_each_entry_safe(dev, next, head, index_hlist) {
hlist_del_rcu(&dev->index_hlist); hlist_del_rcu(&dev->index_hlist);
free_percpu(dev->bulkq);
dev_put(dev->dev); dev_put(dev->dev);
kfree(dev); kfree(dev);
} }
...@@ -234,7 +230,6 @@ static void dev_map_free(struct bpf_map *map) ...@@ -234,7 +230,6 @@ static void dev_map_free(struct bpf_map *map)
if (!dev) if (!dev)
continue; continue;
free_percpu(dev->bulkq);
dev_put(dev->dev); dev_put(dev->dev);
kfree(dev); kfree(dev);
} }
...@@ -320,10 +315,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, ...@@ -320,10 +315,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT; return -ENOENT;
} }
static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{ {
struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = bq->dev;
struct net_device *dev = obj->dev;
int sent = 0, drops = 0, err = 0; int sent = 0, drops = 0, err = 0;
int i; int i;
...@@ -346,8 +340,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) ...@@ -346,8 +340,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags)
out: out:
bq->count = 0; bq->count = 0;
trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
sent, drops, bq->dev_rx, dev, err);
bq->dev_rx = NULL; bq->dev_rx = NULL;
__list_del_clearprev(&bq->flush_node); __list_del_clearprev(&bq->flush_node);
return 0; return 0;
...@@ -364,17 +357,17 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) ...@@ -364,17 +357,17 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags)
goto out; goto out;
} }
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled /* __dev_flush is called from xdp_do_flush() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll() * from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled * routine is called either from busy_poll context or net_rx_action signaled
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
* net device can be torn down. On devmap tear down we ensure the flush list * net device can be torn down. On devmap tear down we ensure the flush list
* is empty before completing to ensure all flush operations have completed. * is empty before completing to ensure all flush operations have completed.
*/ */
void __dev_map_flush(void) void __dev_flush(void)
{ {
struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_bulk_queue *bq, *tmp; struct xdp_dev_bulk_queue *bq, *tmp;
rcu_read_lock(); rcu_read_lock();
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
...@@ -401,12 +394,11 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) ...@@ -401,12 +394,11 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs under RCU-read-side, plus in softirq under NAPI protection. /* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access. * Thus, safe percpu variable access.
*/ */
static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx) struct net_device *dev_rx)
{ {
struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
bq_xmit_all(bq, 0); bq_xmit_all(bq, 0);
...@@ -426,10 +418,9 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, ...@@ -426,10 +418,9 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
return 0; return 0;
} }
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx) struct net_device *dev_rx)
{ {
struct net_device *dev = dst->dev;
struct xdp_frame *xdpf; struct xdp_frame *xdpf;
int err; int err;
...@@ -444,7 +435,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, ...@@ -444,7 +435,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
if (unlikely(!xdpf)) if (unlikely(!xdpf))
return -EOVERFLOW; return -EOVERFLOW;
return bq_enqueue(dst, xdpf, dev_rx); return bq_enqueue(dev, xdpf, dev_rx);
}
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
return __xdp_enqueue(dev, xdp, dev_rx);
}
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
struct net_device *dev = dst->dev;
return __xdp_enqueue(dev, xdp, dev_rx);
} }
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
...@@ -483,7 +488,6 @@ static void __dev_map_entry_free(struct rcu_head *rcu) ...@@ -483,7 +488,6 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
struct bpf_dtab_netdev *dev; struct bpf_dtab_netdev *dev;
dev = container_of(rcu, struct bpf_dtab_netdev, rcu); dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
free_percpu(dev->bulkq);
dev_put(dev->dev); dev_put(dev->dev);
kfree(dev); kfree(dev);
} }
...@@ -538,30 +542,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, ...@@ -538,30 +542,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
u32 ifindex, u32 ifindex,
unsigned int idx) unsigned int idx)
{ {
gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev; struct bpf_dtab_netdev *dev;
struct xdp_bulk_queue *bq;
int cpu;
dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
dtab->map.numa_node);
if (!dev) if (!dev)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
sizeof(void *), gfp);
if (!dev->bulkq) {
kfree(dev);
return ERR_PTR(-ENOMEM);
}
for_each_possible_cpu(cpu) {
bq = per_cpu_ptr(dev->bulkq, cpu);
bq->obj = dev;
}
dev->dev = dev_get_by_index(net, ifindex); dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) { if (!dev->dev) {
free_percpu(dev->bulkq);
kfree(dev); kfree(dev);
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
...@@ -721,9 +710,23 @@ static int dev_map_notification(struct notifier_block *notifier, ...@@ -721,9 +710,23 @@ static int dev_map_notification(struct notifier_block *notifier,
{ {
struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
struct bpf_dtab *dtab; struct bpf_dtab *dtab;
int i; int i, cpu;
switch (event) { switch (event) {
case NETDEV_REGISTER:
if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
break;
/* will be freed in free_netdev() */
netdev->xdp_bulkq =
__alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
sizeof(void *), GFP_ATOMIC);
if (!netdev->xdp_bulkq)
return NOTIFY_BAD;
for_each_possible_cpu(cpu)
per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
break;
case NETDEV_UNREGISTER: case NETDEV_UNREGISTER:
/* This rcu_read_lock/unlock pair is needed because /* This rcu_read_lock/unlock pair is needed because
* dev_map_list is an RCU list AND to ensure a delete * dev_map_list is an RCU list AND to ensure a delete
...@@ -771,7 +774,7 @@ static int __init dev_map_init(void) ...@@ -771,7 +774,7 @@ static int __init dev_map_init(void)
register_netdevice_notifier(&dev_map_notifier); register_netdevice_notifier(&dev_map_notifier);
for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0; return 0;
} }
......
...@@ -9847,6 +9847,8 @@ void free_netdev(struct net_device *dev) ...@@ -9847,6 +9847,8 @@ void free_netdev(struct net_device *dev)
free_percpu(dev->pcpu_refcnt); free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL; dev->pcpu_refcnt = NULL;
free_percpu(dev->xdp_bulkq);
dev->xdp_bulkq = NULL;
netdev_unregister_lockdep_key(dev); netdev_unregister_lockdep_key(dev);
......
...@@ -3458,58 +3458,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { ...@@ -3458,58 +3458,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.arg2_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING,
}; };
static int __bpf_tx_xdp(struct net_device *dev,
struct bpf_map *map,
struct xdp_buff *xdp,
u32 index)
{
struct xdp_frame *xdpf;
int err, sent;
if (!dev->netdev_ops->ndo_xdp_xmit) {
return -EOPNOTSUPP;
}
err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
if (unlikely(err))
return err;
xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
if (sent <= 0)
return sent;
return 0;
}
static noinline int
xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
{
struct net_device *fwd;
u32 index = ri->tgt_index;
int err;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
ri->tgt_index = 0;
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
}
err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
if (unlikely(err))
goto err;
_trace_xdp_redirect(dev, xdp_prog, index);
return 0;
err:
_trace_xdp_redirect_err(dev, xdp_prog, index, err);
return err;
}
static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_map *map, struct xdp_buff *xdp) struct bpf_map *map, struct xdp_buff *xdp)
{ {
...@@ -3527,13 +3475,13 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, ...@@ -3527,13 +3475,13 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
return 0; return 0;
} }
void xdp_do_flush_map(void) void xdp_do_flush(void)
{ {
__dev_map_flush(); __dev_flush();
__cpu_map_flush(); __cpu_map_flush();
__xsk_map_flush(); __xsk_map_flush();
} }
EXPORT_SYMBOL_GPL(xdp_do_flush_map); EXPORT_SYMBOL_GPL(xdp_do_flush);
static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
{ {
...@@ -3568,10 +3516,11 @@ void bpf_clear_redirect_map(struct bpf_map *map) ...@@ -3568,10 +3516,11 @@ void bpf_clear_redirect_map(struct bpf_map *map)
} }
} }
static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct bpf_map *map, struct bpf_prog *xdp_prog)
struct bpf_redirect_info *ri)
{ {
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = READ_ONCE(ri->map);
u32 index = ri->tgt_index; u32 index = ri->tgt_index;
void *fwd = ri->tgt_value; void *fwd = ri->tgt_value;
int err; int err;
...@@ -3580,7 +3529,18 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ...@@ -3580,7 +3529,18 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
ri->tgt_value = NULL; ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL); WRITE_ONCE(ri->map, NULL);
err = __bpf_tx_xdp_map(dev, fwd, map, xdp); if (unlikely(!map)) {
fwd = dev_get_by_index_rcu(dev_net(dev), index);
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
}
err = dev_xdp_enqueue(fwd, xdp, dev);
} else {
err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
}
if (unlikely(err)) if (unlikely(err))
goto err; goto err;
...@@ -3590,18 +3550,6 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ...@@ -3590,18 +3550,6 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
return err; return err;
} }
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = READ_ONCE(ri->map);
if (likely(map))
return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect); EXPORT_SYMBOL_GPL(xdp_do_redirect);
static int xdp_do_generic_redirect_map(struct net_device *dev, static int xdp_do_generic_redirect_map(struct net_device *dev,
......
...@@ -222,14 +222,12 @@ struct bpf_map_def SEC("maps") devmap_xmit_cnt = { ...@@ -222,14 +222,12 @@ struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
*/ */
struct devmap_xmit_ctx { struct devmap_xmit_ctx {
u64 __pad; // First 8 bytes are not accessible by bpf code u64 __pad; // First 8 bytes are not accessible by bpf code
int map_id; // offset:8; size:4; signed:1; int from_ifindex; // offset:8; size:4; signed:1;
u32 act; // offset:12; size:4; signed:0; u32 act; // offset:12; size:4; signed:0;
u32 map_index; // offset:16; size:4; signed:0; int to_ifindex; // offset:16; size:4; signed:1;
int drops; // offset:20; size:4; signed:1; int drops; // offset:20; size:4; signed:1;
int sent; // offset:24; size:4; signed:1; int sent; // offset:24; size:4; signed:1;
int from_ifindex; // offset:28; size:4; signed:1; int err; // offset:28; size:4; signed:1;
int to_ifindex; // offset:32; size:4; signed:1;
int err; // offset:36; size:4; signed:1;
}; };
SEC("tracepoint/xdp/xdp_devmap_xmit") SEC("tracepoint/xdp/xdp_devmap_xmit")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment