Commit 10f67868 authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'xdp_xmit-bulking'

Jesper Dangaard Brouer says:

====================
This patchset change ndo_xdp_xmit API to take a bulk of xdp frames.

When kernel is compiled with CONFIG_RETPOLINE, every indirect function
pointer (branch) call hurts performance. For XDP this have a huge
negative performance impact.

This patchset reduce the needed (indirect) calls to ndo_xdp_xmit, but
also prepares for further optimizations.  The DMA APIs use of indirect
function pointer calls is the primary source the regression.  It is
left for a followup patchset, to use bulking calls towards the DMA API
(via the scatter-gatter calls).

The other advantage of this API change is that drivers can easier
amortize the cost of any sync/locking scheme, over the bulk of
packets.  The assumption of the current API is that the driver
implemementing the NDO will also allocate a dedicated XDP TX queue for
every CPU in the system.  Which is not always possible or practical to
configure. E.g. ixgbe cannot load an XDP program on a machine with
more than 96 CPUs, due to limited hardware TX queues.  E.g. virtio_net
is hard to configure as it requires manually increasing the
queues. E.g. tun driver chooses to use a per XDP frame producer lock
modulo smp_processor_id over avail queues.

I'm considered adding 'flags' to ndo_xdp_xmit, but it's not part of
this patchset.  This will be a followup patchset, once we know if this
will be needed (e.g. for non-map xdp_redirect flush-flag, and if
AF_XDP chooses to use ndo_xdp_xmit for TX).

---
V5: Fixed up issues spotted by Daniel and John

V4: Splitout the patches from 4 to 8 patches.  I cannot split the
driver changes from the NDO change, but I've tried to isolated the NDO
change together with the driver change as much as possible.
====================
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents f80acbd2 a570e48f
......@@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
* @dev: netdev
* @xdp: XDP buffer
*
* Returns Zero if sent, else an error code
* Returns number of frames successfully sent. Frames that fail are
* free'ed via XDP return API.
*
* For error cases, a negative errno code is returned and no-frames
* are transmitted (caller must handle freeing frames).
**/
int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
{
struct i40e_netdev_priv *np = netdev_priv(dev);
unsigned int queue_index = smp_processor_id();
struct i40e_vsi *vsi = np->vsi;
int err;
int drops = 0;
int i;
if (test_bit(__I40E_VSI_DOWN, vsi->state))
return -ENETDOWN;
......@@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
return -ENXIO;
err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
if (err != I40E_XDP_TX)
return -ENOSPC;
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
int err;
return 0;
err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
if (err != I40E_XDP_TX) {
xdp_return_frame_rx_napi(xdpf);
drops++;
}
}
return n - drops;
}
/**
......
......@@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
void i40e_detect_recover_hung(struct i40e_vsi *vsi);
int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
bool __i40e_chk_linearize(struct sk_buff *skb);
int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
void i40e_xdp_flush(struct net_device *dev);
/**
......
......@@ -10017,11 +10017,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
}
}
static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
static int ixgbe_xdp_xmit(struct net_device *dev, int n,
struct xdp_frame **frames)
{
struct ixgbe_adapter *adapter = netdev_priv(dev);
struct ixgbe_ring *ring;
int err;
int drops = 0;
int i;
if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
return -ENETDOWN;
......@@ -10033,11 +10035,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
if (unlikely(!ring))
return -ENXIO;
err = ixgbe_xmit_xdp_ring(adapter, xdpf);
if (err != IXGBE_XDP_TX)
return -ENOSPC;
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
int err;
return 0;
err = ixgbe_xmit_xdp_ring(adapter, xdpf);
if (err != IXGBE_XDP_TX) {
xdp_return_frame_rx_napi(xdpf);
drops++;
}
}
return n - drops;
}
static void ixgbe_xdp_flush(struct net_device *dev)
......
......@@ -70,6 +70,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
......@@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = {
.ndo_get_stats64 = tun_net_get_stats64,
};
static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
{
struct tun_struct *tun = netdev_priv(dev);
struct tun_file *tfile;
u32 numqueues;
int ret = 0;
int drops = 0;
int cnt = n;
int i;
rcu_read_lock();
numqueues = READ_ONCE(tun->numqueues);
if (!numqueues) {
ret = -ENOSPC;
goto out;
rcu_read_unlock();
return -ENXIO; /* Caller will free/return all frames */
}
tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
numqueues]);
/* Encode the XDP flag into lowest bit for consumer to differ
* XDP buffer from sk_buff.
*/
if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
this_cpu_inc(tun->pcpu_stats->tx_dropped);
ret = -ENOSPC;
spin_lock(&tfile->tx_ring.producer_lock);
for (i = 0; i < n; i++) {
struct xdp_frame *xdp = frames[i];
/* Encode the XDP flag into lowest bit for consumer to differ
* XDP buffer from sk_buff.
*/
void *frame = tun_xdp_to_ptr(xdp);
if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
this_cpu_inc(tun->pcpu_stats->tx_dropped);
xdp_return_frame_rx_napi(xdp);
drops++;
}
}
spin_unlock(&tfile->tx_ring.producer_lock);
out:
rcu_read_unlock();
return ret;
return cnt - drops;
}
static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
......@@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
if (unlikely(!frame))
return -EOVERFLOW;
return tun_xdp_xmit(dev, frame);
return tun_xdp_xmit(dev, 1, &frame);
}
static void tun_xdp_flush(struct net_device *dev)
......
......@@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev)
virtqueue_kick(sq->vq);
}
static int __virtnet_xdp_xmit(struct virtnet_info *vi,
struct xdp_frame *xdpf)
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
struct send_queue *sq,
struct xdp_frame *xdpf)
{
struct virtio_net_hdr_mrg_rxbuf *hdr;
struct xdp_frame *xdpf_sent;
struct send_queue *sq;
unsigned int len;
unsigned int qp;
int err;
qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
sq = &vi->sq[qp];
/* Free up any pending old buffers before queueing new ones. */
while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
xdp_return_frame(xdpf_sent);
/* virtqueue want to use data area in-front of packet */
if (unlikely(xdpf->metasize > 0))
return -EOPNOTSUPP;
......@@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
return 0;
}
static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
struct xdp_frame *xdpf)
{
struct xdp_frame *xdpf_sent;
struct send_queue *sq;
unsigned int len;
unsigned int qp;
qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
sq = &vi->sq[qp];
/* Free up any pending old buffers before queueing new ones. */
while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
xdp_return_frame(xdpf_sent);
return __virtnet_xdp_xmit_one(vi, sq, xdpf);
}
static int virtnet_xdp_xmit(struct net_device *dev,
int n, struct xdp_frame **frames)
{
struct virtnet_info *vi = netdev_priv(dev);
struct receive_queue *rq = vi->rq;
struct xdp_frame *xdpf_sent;
struct bpf_prog *xdp_prog;
struct send_queue *sq;
unsigned int len;
unsigned int qp;
int drops = 0;
int err;
int i;
qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
sq = &vi->sq[qp];
/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
* indicate XDP resources have been successfully allocated.
......@@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
if (!xdp_prog)
return -ENXIO;
return __virtnet_xdp_xmit(vi, xdpf);
/* Free up any pending old buffers before queueing new ones. */
while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
xdp_return_frame(xdpf_sent);
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
if (err) {
xdp_return_frame_rx_napi(xdpf);
drops++;
}
}
return n - drops;
}
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
......@@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
xdpf = convert_to_xdp_frame(&xdp);
if (unlikely(!xdpf))
goto err_xdp;
err = __virtnet_xdp_xmit(vi, xdpf);
err = __virtnet_xdp_tx_xmit(vi, xdpf);
if (unlikely(err)) {
trace_xdp_exception(vi->dev, xdp_prog, act);
goto err_xdp;
......@@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
xdpf = convert_to_xdp_frame(&xdp);
if (unlikely(!xdpf))
goto err_xdp;
err = __virtnet_xdp_xmit(vi, xdpf);
err = __virtnet_xdp_tx_xmit(vi, xdpf);
if (unlikely(err)) {
trace_xdp_exception(vi->dev, xdp_prog, act);
if (unlikely(xdp_page != page))
......
......@@ -487,14 +487,17 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
/* Map specifics */
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
struct xdp_buff;
struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
void __dev_map_flush(struct bpf_map *map);
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx);
struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
void __cpu_map_flush(struct bpf_map *map);
struct xdp_buff;
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
struct net_device *dev_rx);
......@@ -573,6 +576,16 @@ static inline void __dev_map_flush(struct bpf_map *map)
{
}
struct xdp_buff;
struct bpf_dtab_netdev;
static inline
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
return 0;
}
static inline
struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
......@@ -587,7 +600,6 @@ static inline void __cpu_map_flush(struct bpf_map *map)
{
}
struct xdp_buff;
static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
struct xdp_buff *xdp,
struct net_device *dev_rx)
......
......@@ -1185,9 +1185,13 @@ struct dev_ifalias {
* This function is used to set or query state related to XDP on the
* netdevice and manage BPF offload. See definition of
* enum bpf_netdev_command for details.
* int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
* This function is used to submit a XDP packet for transmit on a
* netdevice.
* int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
* This function is used to submit @n XDP packets for transmit on a
* netdevice. Returns number of frames successfully transmitted, frames
* that got dropped are freed/returned via xdp_return_frame().
* Returns negative number, means general error invoking ndo, meaning
* no frames were xmit'ed and core-caller will free all frames.
* TODO: Consider add flag to allow sending flush operation.
* void (*ndo_xdp_flush)(struct net_device *dev);
* This function is used to inform the driver to flush a particular
* xdp tx queue. Must be called on same CPU as xdp_xmit.
......@@ -1375,8 +1379,8 @@ struct net_device_ops {
int needed_headroom);
int (*ndo_bpf)(struct net_device *dev,
struct netdev_bpf *bpf);
int (*ndo_xdp_xmit)(struct net_device *dev,
struct xdp_frame *xdp);
int (*ndo_xdp_xmit)(struct net_device *dev, int n,
struct xdp_frame **xdp);
void (*ndo_xdp_flush)(struct net_device *dev);
};
......
......@@ -115,13 +115,14 @@ void page_pool_destroy(struct page_pool *pool);
void __page_pool_put_page(struct page_pool *pool,
struct page *page, bool allow_direct);
static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
static inline void page_pool_put_page(struct page_pool *pool,
struct page *page, bool allow_direct)
{
/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
* allow registering MEM_TYPE_PAGE_POOL, but shield linker.
*/
#ifdef CONFIG_PAGE_POOL
__page_pool_put_page(pool, page, false);
__page_pool_put_page(pool, page, allow_direct);
#endif
}
/* Very limited use-cases allow recycle direct */
......
......@@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
}
void xdp_return_frame(struct xdp_frame *xdpf);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
void xdp_return_buff(struct xdp_buff *xdp);
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
......
......@@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
__entry->map_id, __entry->map_index)
);
#ifndef __DEVMAP_OBJ_TYPE
#define __DEVMAP_OBJ_TYPE
struct _bpf_dtab_netdev {
struct net_device *dev;
};
#endif /* __DEVMAP_OBJ_TYPE */
#define devmap_ifindex(fwd, map) \
(!fwd ? 0 : \
(!map ? 0 : \
((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \
((struct net_device *)fwd)->ifindex : 0)))
((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)))
#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \
trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \
......@@ -222,6 +229,47 @@ TRACE_EVENT(xdp_cpumap_enqueue,
__entry->to_cpu)
);
TRACE_EVENT(xdp_devmap_xmit,
TP_PROTO(const struct bpf_map *map, u32 map_index,
int sent, int drops,
const struct net_device *from_dev,
const struct net_device *to_dev, int err),
TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err),
TP_STRUCT__entry(
__field(int, map_id)
__field(u32, act)
__field(u32, map_index)
__field(int, drops)
__field(int, sent)
__field(int, from_ifindex)
__field(int, to_ifindex)
__field(int, err)
),
TP_fast_assign(
__entry->map_id = map->id;
__entry->act = XDP_REDIRECT;
__entry->map_index = map_index;
__entry->drops = drops;
__entry->sent = sent;
__entry->from_ifindex = from_dev->ifindex;
__entry->to_ifindex = to_dev->ifindex;
__entry->err = err;
),
TP_printk("ndo_xdp_xmit"
" map_id=%d map_index=%d action=%s"
" sent=%d drops=%d"
" from_ifindex=%d to_ifindex=%d err=%d",
__entry->map_id, __entry->map_index,
__print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
__entry->sent, __entry->drops,
__entry->from_ifindex, __entry->to_ifindex, __entry->err)
);
#endif /* _TRACE_XDP_H */
#include <trace/define_trace.h>
......@@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
err = __ptr_ring_produce(q, xdpf);
if (err) {
drops++;
xdp_return_frame(xdpf);
xdp_return_frame_rx_napi(xdpf);
}
processed++;
}
......
......@@ -48,15 +48,25 @@
* calls will fail at this point.
*/
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
#define DEV_MAP_BULK_SIZE 16
struct xdp_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
struct net_device *dev_rx;
unsigned int count;
};
struct bpf_dtab_netdev {
struct net_device *dev;
struct net_device *dev; /* must be first member, due to tracepoint */
struct bpf_dtab *dtab;
unsigned int bit;
struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu;
};
......@@ -206,6 +216,50 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
__set_bit(bit, bitmap);
}
static int bq_xmit_all(struct bpf_dtab_netdev *obj,
struct xdp_bulk_queue *bq)
{
struct net_device *dev = obj->dev;
int sent = 0, drops = 0, err = 0;
int i;
if (unlikely(!bq->count))
return 0;
for (i = 0; i < bq->count; i++) {
struct xdp_frame *xdpf = bq->q[i];
prefetch(xdpf);
}
sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
if (sent < 0) {
err = sent;
sent = 0;
goto error;
}
drops = bq->count - sent;
out:
bq->count = 0;
trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
sent, drops, bq->dev_rx, dev, err);
bq->dev_rx = NULL;
return 0;
error:
/* If ndo_xdp_xmit fails with an errno, no frames have been
* xmit'ed and it's our responsibility to them free all.
*/
for (i = 0; i < bq->count; i++) {
struct xdp_frame *xdpf = bq->q[i];
/* RX path under NAPI protection, can return frames faster */
xdp_return_frame_rx_napi(xdpf);
drops++;
}
goto out;
}
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
......@@ -221,6 +275,7 @@ void __dev_map_flush(struct bpf_map *map)
for_each_set_bit(bit, bitmap, map->max_entries) {
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
struct xdp_bulk_queue *bq;
struct net_device *netdev;
/* This is possible if the dev entry is removed by user space
......@@ -230,6 +285,9 @@ void __dev_map_flush(struct bpf_map *map)
continue;
__clear_bit(bit, bitmap);
bq = this_cpu_ptr(dev->bulkq);
bq_xmit_all(dev, bq);
netdev = dev->dev;
if (likely(netdev->netdev_ops->ndo_xdp_flush))
netdev->netdev_ops->ndo_xdp_flush(netdev);
......@@ -240,21 +298,61 @@ void __dev_map_flush(struct bpf_map *map)
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
*/
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev;
struct bpf_dtab_netdev *obj;
if (key >= map->max_entries)
return NULL;
dev = READ_ONCE(dtab->netdev_map[key]);
return dev ? dev->dev : NULL;
obj = READ_ONCE(dtab->netdev_map[key]);
return obj;
}
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
bq_xmit_all(obj, bq);
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
* from net_device drivers NAPI func end.
*/
if (!bq->dev_rx)
bq->dev_rx = dev_rx;
bq->q[bq->count++] = xdpf;
return 0;
}
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
struct net_device *dev = dst->dev;
struct xdp_frame *xdpf;
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
return bq_enqueue(dst, xdpf, dev_rx);
}
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
struct net_device *dev = dev = obj ? obj->dev : NULL;
return dev ? &dev->ifindex : NULL;
}
......@@ -263,13 +361,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
if (dev->dev->netdev_ops->ndo_xdp_flush) {
struct net_device *fl = dev->dev;
struct xdp_bulk_queue *bq;
unsigned long *bitmap;
int cpu;
for_each_online_cpu(cpu) {
bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
__clear_bit(dev->bit, bitmap);
bq = per_cpu_ptr(dev->bulkq, cpu);
bq_xmit_all(dev, bq);
fl->netdev_ops->ndo_xdp_flush(dev->dev);
}
}
......@@ -281,6 +384,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
dev_map_flush_old(dev);
free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
......@@ -313,6 +417,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct net *net = current->nsproxy->net_ns;
gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev, *old_dev;
u32 i = *(u32 *)key;
u32 ifindex = *(u32 *)value;
......@@ -327,13 +432,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
if (!ifindex) {
dev = NULL;
} else {
dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
map->numa_node);
dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
if (!dev)
return -ENOMEM;
dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
sizeof(void *), gfp);
if (!dev->bulkq) {
kfree(dev);
return -ENOMEM;
}
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
free_percpu(dev->bulkq);
kfree(dev);
return -EINVAL;
}
......@@ -405,6 +517,9 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
return 0;
}
......
......@@ -3039,7 +3039,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
u32 index)
{
struct xdp_frame *xdpf;
int err;
int sent;
if (!dev->netdev_ops->ndo_xdp_xmit) {
return -EOPNOTSUPP;
......@@ -3049,9 +3049,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
if (unlikely(!xdpf))
return -EOVERFLOW;
err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
if (err)
return err;
sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
if (sent <= 0)
return sent;
dev->netdev_ops->ndo_xdp_flush(dev);
return 0;
}
......@@ -3065,20 +3065,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
switch (map->map_type) {
case BPF_MAP_TYPE_DEVMAP: {
struct net_device *dev = fwd;
struct xdp_frame *xdpf;
struct bpf_dtab_netdev *dst = fwd;
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
/* TODO: move to inside map code instead, for bulk support
* err = dev_map_enqueue(dev, xdp);
*/
err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
err = dev_map_enqueue(dst, xdp, dev_rx);
if (err)
return err;
__dev_map_insert_ctx(map, index);
......
......@@ -308,7 +308,13 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
static void xdp_return(void *data, struct xdp_mem_info *mem)
/* XDP RX runs under NAPI protection, and in different delivery error
* scenarios (e.g. queue full), it is possible to return the xdp_frame
* while still leveraging this protection. The @napi_direct boolian
* is used for those calls sites. Thus, allowing for faster recycling
* of xdp_frames/pages in those cases.
*/
static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
{
struct xdp_mem_allocator *xa;
struct page *page;
......@@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
if (xa)
page_pool_put_page(xa->page_pool, page);
page_pool_put_page(xa->page_pool, page, napi_direct);
else
put_page(page);
rcu_read_unlock();
......@@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
void xdp_return_frame(struct xdp_frame *xdpf)
{
xdp_return(xdpf->data, &xdpf->mem);
__xdp_return(xdpf->data, &xdpf->mem, false);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
__xdp_return(xdpf->data, &xdpf->mem, true);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
void xdp_return_buff(struct xdp_buff *xdp)
{
xdp_return(xdp->data, &xdp->rxq->mem);
__xdp_return(xdp->data, &xdp->rxq->mem, true);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
......@@ -125,6 +125,7 @@ struct datarec {
u64 processed;
u64 dropped;
u64 info;
u64 err;
};
#define MAX_CPUS 64
......@@ -208,3 +209,51 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
return 0;
}
struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(struct datarec),
.max_entries = 1,
};
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
* Code in: kernel/include/trace/events/xdp.h
*/
struct devmap_xmit_ctx {
u64 __pad; // First 8 bytes are not accessible by bpf code
int map_id; // offset:8; size:4; signed:1;
u32 act; // offset:12; size:4; signed:0;
u32 map_index; // offset:16; size:4; signed:0;
int drops; // offset:20; size:4; signed:1;
int sent; // offset:24; size:4; signed:1;
int from_ifindex; // offset:28; size:4; signed:1;
int to_ifindex; // offset:32; size:4; signed:1;
int err; // offset:36; size:4; signed:1;
};
SEC("tracepoint/xdp/xdp_devmap_xmit")
int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
{
struct datarec *rec;
u32 key = 0;
rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key);
if (!rec)
return 0;
rec->processed += ctx->sent;
rec->dropped += ctx->drops;
/* Record bulk events, then userspace can calc average bulk size */
rec->info += 1;
/* Record error cases, where no frame were sent */
if (ctx->err)
rec->err++;
/* Catch API error of drv ndo_xdp_xmit sent more than count */
if (ctx->drops < 0)
rec->err++;
return 1;
}
......@@ -117,6 +117,7 @@ struct datarec {
__u64 processed;
__u64 dropped;
__u64 info;
__u64 err;
};
#define MAX_CPUS 64
......@@ -141,6 +142,7 @@ struct stats_record {
struct record_u64 xdp_exception[XDP_ACTION_MAX];
struct record xdp_cpumap_kthread;
struct record xdp_cpumap_enqueue[MAX_CPUS];
struct record xdp_devmap_xmit;
};
static bool map_collect_record(int fd, __u32 key, struct record *rec)
......@@ -151,6 +153,7 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
__u64 sum_processed = 0;
__u64 sum_dropped = 0;
__u64 sum_info = 0;
__u64 sum_err = 0;
int i;
if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
......@@ -169,10 +172,13 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
sum_dropped += values[i].dropped;
rec->cpu[i].info = values[i].info;
sum_info += values[i].info;
rec->cpu[i].err = values[i].err;
sum_err += values[i].err;
}
rec->total.processed = sum_processed;
rec->total.dropped = sum_dropped;
rec->total.info = sum_info;
rec->total.err = sum_err;
return true;
}
......@@ -273,6 +279,18 @@ static double calc_info(struct datarec *r, struct datarec *p, double period)
return pps;
}
static double calc_err(struct datarec *r, struct datarec *p, double period)
{
__u64 packets = 0;
double pps = 0;
if (period > 0) {
packets = r->err - p->err;
pps = packets / period;
}
return pps;
}
static void stats_print(struct stats_record *stats_rec,
struct stats_record *stats_prev,
bool err_only)
......@@ -397,7 +415,7 @@ static void stats_print(struct stats_record *stats_rec,
info = calc_info(r, p, t);
if (info > 0)
i_str = "sched";
if (pps > 0)
if (pps > 0 || drop > 0)
printf(fmt1, "cpumap-kthread",
i, pps, drop, info, i_str);
}
......@@ -409,6 +427,50 @@ static void stats_print(struct stats_record *stats_rec,
printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
}
/* devmap ndo_xdp_xmit stats */
{
char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n";
char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n";
struct record *rec, *prev;
double drop, info, err;
char *i_str = "";
char *err_str = "";
rec = &stats_rec->xdp_devmap_xmit;
prev = &stats_prev->xdp_devmap_xmit;
t = calc_period(rec, prev);
for (i = 0; i < nr_cpus; i++) {
struct datarec *r = &rec->cpu[i];
struct datarec *p = &prev->cpu[i];
pps = calc_pps(r, p, t);
drop = calc_drop(r, p, t);
info = calc_info(r, p, t);
err = calc_err(r, p, t);
if (info > 0) {
i_str = "bulk-average";
info = (pps+drop) / info; /* calc avg bulk */
}
if (err > 0)
err_str = "drv-err";
if (pps > 0 || drop > 0)
printf(fmt1, "devmap-xmit",
i, pps, drop, info, i_str, err_str);
}
pps = calc_pps(&rec->total, &prev->total, t);
drop = calc_drop(&rec->total, &prev->total, t);
info = calc_info(&rec->total, &prev->total, t);
err = calc_err(&rec->total, &prev->total, t);
if (info > 0) {
i_str = "bulk-average";
info = (pps+drop) / info; /* calc avg bulk */
}
if (err > 0)
err_str = "drv-err";
printf(fmt2, "devmap-xmit", "total", pps, drop,
info, i_str, err_str);
}
printf("\n");
}
......@@ -437,6 +499,9 @@ static bool stats_collect(struct stats_record *rec)
fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
return true;
}
......@@ -480,6 +545,7 @@ static struct stats_record *alloc_stats_record(void)
rec_sz = sizeof(struct datarec);
rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz);
for (i = 0; i < MAX_CPUS; i++)
rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
......@@ -498,6 +564,7 @@ static void free_stats_record(struct stats_record *r)
free(r->xdp_exception[i].cpu);
free(r->xdp_cpumap_kthread.cpu);
free(r->xdp_devmap_xmit.cpu);
for (i = 0; i < MAX_CPUS; i++)
free(r->xdp_cpumap_enqueue[i].cpu);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment