Commit aa7f1f03 authored by Daniel Borkmann's avatar Daniel Borkmann

Merge branch 'bpf-xdp-bcast'

Hangbin Liu says:

====================
This patchset is a new implementation for XDP multicast support based
on my previous 2 maps implementation[1]. The reason is that Daniel thinks
the exclude map implementation is missing proper bond support in XDP
context. And there is a plan to add native XDP bonding support. Adding
a exclude map in the helper also increases the complexity of verifier and
has drawbacks on performance.

The new implementation just add two new flags BPF_F_BROADCAST and
BPF_F_EXCLUDE_INGRESS to extend xdp_redirect_map for broadcast support.

With BPF_F_BROADCAST the packet will be broadcasted to all the interfaces
in the map. With BPF_F_EXCLUDE_INGRESS the ingress interface will be
excluded when doing the broadcast.

The patchv11 link is here [2].

  [1] https://lore.kernel.org/bpf/20210223125809.1376577-1-liuhangbin@gmail.com
  [2] https://lore.kernel.org/bpf/20210513070447.1878448-1-liuhangbin@gmail.com

v12: As Daniel pointed out:
  a) defined as const u64 for flag_mask and action_mask in
     __bpf_xdp_redirect_map()
  b) remove BPF_F_ACTION_MASK in uapi header
  c) remove EXPORT_SYMBOL_GPL for xdpf_clone()

v11:
  a) Use unlikely() when checking if this is for broadcast redirecting.
  b) Fix a tracepoint NULL pointer issue Jesper found
  c) Remove BPF_F_REDIR_MASK and just use OR flags to make the reader more
     clear about what's flags we are using
  d) Add the performance numbers with multiple veth interfaces in patch 01
     description.
  e) remove some sleeps to reduce the testing time in patch04. Re-struct the
     test and make clear what flags we are testing.

v10: use READ/WRITE_ONCE when read/write map instead of xchg()
v9: Update patch 01 commit description
v8: use hlist_for_each_entry_rcu() when looping the devmap hash objs
v7: No need to free xdpf in dev_map_enqueue_clone() if xdpf_clone failed.
v6: Fix a skb leak in the error path for generic XDP
v5: Just walk the map directly to get interfaces as get_next_key() of devmap
    hash may restart looping from the first key if the device get removed.
    After the update, performance has improved by 10% compared with v4.
v4: Fix flags never cleared issue in patch 02. Update selftest to cover this.
v3: Rebase the code based on latest bpf-next
v2: fix flag renaming issue in patch 02
====================
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parents 21703cf7 d2329247
......@@ -1501,8 +1501,13 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx);
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx);
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
struct bpf_map *map, bool exclude_ingress);
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
struct bpf_prog *xdp_prog);
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
struct bpf_prog *xdp_prog, struct bpf_map *map,
bool exclude_ingress);
bool dev_map_can_have_prog(struct bpf_map *map);
void __cpu_map_flush(void);
......@@ -1670,6 +1675,13 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
return 0;
}
/* Stub for configs without devmap support (the guarding #ifdef lies
 * outside this hunk - presumably !CONFIG_BPF_SYSCALL; confirm against
 * the full header). Reports success without forwarding anything so
 * callers need no conditional compilation.
 */
static inline
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
			  struct bpf_map *map, bool exclude_ingress)
{
	return 0;
}
struct sk_buff;
static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
......@@ -1679,6 +1691,14 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
return 0;
}
/* Stub counterpart of dev_map_redirect_multi() for configs without
 * devmap support (guarding #ifdef outside this hunk): report success
 * without redirecting the skb anywhere.
 */
static inline
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
			   struct bpf_prog *xdp_prog, struct bpf_map *map,
			   bool exclude_ingress)
{
	return 0;
}
/* No-op stub: nothing to flush when CPUMAP support is compiled out */
static inline void __cpu_map_flush(void)
{
}
......
......@@ -646,6 +646,7 @@ struct bpf_redirect_info {
u32 flags;
u32 tgt_index;
void *tgt_value;
struct bpf_map *map;
u32 map_id;
enum bpf_map_type map_type;
u32 kern_flags;
......@@ -1464,17 +1465,19 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
}
#endif /* IS_ENABLED(CONFIG_IPV6) */
static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex,
u64 flags, const u64 flag_mask,
void *lookup_elem(struct bpf_map *map, u32 key))
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
/* Lower bits of the flags are used as return code on lookup failure */
if (unlikely(flags > XDP_TX))
if (unlikely(flags & ~(action_mask | flag_mask)))
return XDP_ABORTED;
ri->tgt_value = lookup_elem(map, ifindex);
if (unlikely(!ri->tgt_value)) {
if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) {
/* If the lookup fails we want to clear out the state in the
* redirect_info struct completely, so that if an eBPF program
* performs multiple lookups, the last one always takes
......@@ -1482,13 +1485,21 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
*/
ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
ri->map_type = BPF_MAP_TYPE_UNSPEC;
return flags;
return flags & action_mask;
}
ri->tgt_index = ifindex;
ri->map_id = map->id;
ri->map_type = map->map_type;
if (flags & BPF_F_BROADCAST) {
WRITE_ONCE(ri->map, map);
ri->flags = flags;
} else {
WRITE_ONCE(ri->map, NULL);
ri->flags = 0;
}
return XDP_REDIRECT;
}
......
......@@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct net_device *dev);
int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
static inline
void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
......
......@@ -110,7 +110,11 @@ DECLARE_EVENT_CLASS(xdp_redirect_template,
u32 ifindex = 0, map_index = index;
if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
/* Just leave to_ifindex to 0 if do broadcast redirect,
* as tgt will be NULL.
*/
if (tgt)
ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
} else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
ifindex = index;
map_index = 0;
......
......@@ -2555,8 +2555,12 @@ union bpf_attr {
* The lower two bits of *flags* are used as the return code if
* the map lookup fails. This is so that the return value can be
* one of the XDP program return codes up to **XDP_TX**, as chosen
* by the caller. Any higher bits in the *flags* argument must be
* unset.
* by the caller. The higher bits of *flags* can be set to
* BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
*
 * 		With BPF_F_BROADCAST the packet will be broadcast to all
 * 		the interfaces in the map, and with BPF_F_EXCLUDE_INGRESS
 * 		the ingress interface will be excluded from the broadcast.
*
* See also **bpf_redirect**\ (), which only supports redirecting
* to an ifindex, but doesn't require a map to do so.
......@@ -5122,6 +5126,12 @@ enum {
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
};
/* Flags for bpf_redirect_map helper */
enum {
BPF_F_BROADCAST = (1ULL << 3),
BPF_F_EXCLUDE_INGRESS = (1ULL << 4),
};
#define __bpf_md_ptr(type, name) \
union { \
type name; \
......
......@@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
__cpu_map_lookup_elem);
}
static int cpu_map_btf_id;
......
This diff is collapsed.
......@@ -3930,6 +3930,23 @@ void xdp_do_flush(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
/* Drop any per-CPU bpf_redirect_info reference to @map so that a map
 * being torn down can no longer be picked up by a later
 * xdp_do_redirect() on any CPU.
 */
void bpf_clear_redirect_map(struct bpf_map *map)
{
	struct bpf_redirect_info *ri;
	int cpu;

	for_each_possible_cpu(cpu) {
		ri = per_cpu_ptr(&bpf_redirect_info, cpu);
		/* Avoid polluting remote cacheline due to writes if
		 * not needed. Once we pass this test, we need the
		 * cmpxchg() to make sure it hasn't been changed in
		 * the meantime by remote CPU.
		 */
		if (unlikely(READ_ONCE(ri->map) == map))
			cmpxchg(&ri->map, map, NULL);
	}
}
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
......@@ -3937,6 +3954,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
u32 map_id = ri->map_id;
struct bpf_map *map;
int err;
ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
......@@ -3946,7 +3964,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
err = dev_map_enqueue(fwd, xdp, dev);
map = READ_ONCE(ri->map);
if (unlikely(map)) {
WRITE_ONCE(ri->map, NULL);
err = dev_map_enqueue_multi(xdp, dev, map,
ri->flags & BPF_F_EXCLUDE_INGRESS);
} else {
err = dev_map_enqueue(fwd, xdp, dev);
}
break;
case BPF_MAP_TYPE_CPUMAP:
err = cpu_map_enqueue(fwd, xdp, dev);
......@@ -3988,13 +4013,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
enum bpf_map_type map_type, u32 map_id)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map;
int err;
switch (map_type) {
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
err = dev_map_generic_redirect(fwd, skb, xdp_prog);
map = READ_ONCE(ri->map);
if (unlikely(map)) {
WRITE_ONCE(ri->map, NULL);
err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
ri->flags & BPF_F_EXCLUDE_INGRESS);
} else {
err = dev_map_generic_redirect(fwd, skb, xdp_prog);
}
if (unlikely(err))
goto err;
break;
......
......@@ -584,3 +584,31 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
return __xdp_build_skb_from_frame(xdpf, skb, dev);
}
EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
/* Clone @xdpf into a freshly allocated order-0 page so the copy can be
 * redirected independently of the original (used for XDP broadcast).
 * Returns the new frame, or NULL if the frame does not fit in a single
 * page or the allocation fails. The caller owns the returned frame.
 */
struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
{
	unsigned int headroom, totalsize;
	struct xdp_frame *nxdpf;
	struct page *page;
	void *addr;

	/* The struct xdp_frame itself sits in front of the packet data;
	 * account for it plus the frame's own headroom.
	 */
	headroom = xdpf->headroom + sizeof(*xdpf);
	totalsize = headroom + xdpf->len;

	/* The clone is backed by exactly one page */
	if (unlikely(totalsize > PAGE_SIZE))
		return NULL;
	page = dev_alloc_page();
	if (!page)
		return NULL;
	addr = page_to_virt(page);

	/* One memcpy covers the metadata struct, headroom and packet data
	 * (assumes @xdpf is located at the start of its buffer, with the
	 * data following it contiguously).
	 */
	memcpy(addr, xdpf, totalsize);

	nxdpf = addr;
	nxdpf->data = addr + headroom;
	nxdpf->frame_sz = PAGE_SIZE;
	nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
	nxdpf->mem.id = 0;

	return nxdpf;
}
......@@ -226,7 +226,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem);
return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
__xsk_map_lookup_elem);
}
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
......
......@@ -41,6 +41,7 @@ tprogs-y += test_map_in_map
tprogs-y += per_socket_stats_example
tprogs-y += xdp_redirect
tprogs-y += xdp_redirect_map
tprogs-y += xdp_redirect_map_multi
tprogs-y += xdp_redirect_cpu
tprogs-y += xdp_monitor
tprogs-y += xdp_rxq_info
......@@ -99,6 +100,7 @@ test_map_in_map-objs := test_map_in_map_user.o
per_socket_stats_example-objs := cookie_uid_helper_example.o
xdp_redirect-objs := xdp_redirect_user.o
xdp_redirect_map-objs := xdp_redirect_map_user.o
xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o
xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o
xdp_monitor-objs := xdp_monitor_user.o
xdp_rxq_info-objs := xdp_rxq_info_user.o
......@@ -160,6 +162,7 @@ always-y += tcp_tos_reflect_kern.o
always-y += tcp_dumpstats_kern.o
always-y += xdp_redirect_kern.o
always-y += xdp_redirect_map_kern.o
always-y += xdp_redirect_map_multi_kern.o
always-y += xdp_redirect_cpu_kern.o
always-y += xdp_monitor_kern.o
always-y += xdp_rxq_info_kern.o
......
// SPDX-License-Identifier: GPL-2.0
#define KBUILD_MODNAME "foo"
#include <uapi/linux/bpf.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <bpf/bpf_helpers.h>
struct {
__uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(int));
__uint(max_entries, 32);
} forward_map_general SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(struct bpf_devmap_val));
__uint(max_entries, 32);
} forward_map_native SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, long);
__uint(max_entries, 1);
} rxcnt SEC(".maps");
/* Map to store egress interfaces' MAC addresses. max_entries is set
 * to 1 here and resized to the real size in the user-space program.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, __be64);
__uint(max_entries, 1);
} mac_map SEC(".maps");
/* Count the packet in the per-CPU rxcnt map, then broadcast it to all
 * interfaces in @forward_map except the ingress interface.
 */
static int xdp_redirect_map(struct xdp_md *ctx, void *forward_map)
{
	long *value;
	u32 key = 0;

	/* count packet in global counter */
	value = bpf_map_lookup_elem(&rxcnt, &key);
	if (value)
		*value += 1;

	/* With BPF_F_BROADCAST the key is not used as a lookup target:
	 * the whole map is the destination set.
	 */
	return bpf_redirect_map(forward_map, key,
				BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}
/* Entry point for generic (skb-mode) XDP: uses the int-valued devmap */
SEC("xdp_redirect_general")
int xdp_redirect_map_general(struct xdp_md *ctx)
{
	return xdp_redirect_map(ctx, &forward_map_general);
}
/* Entry point for native XDP: uses the bpf_devmap_val-valued devmap */
SEC("xdp_redirect_native")
int xdp_redirect_map_native(struct xdp_md *ctx)
{
	return xdp_redirect_map(ctx, &forward_map_native);
}
/* Egress (devmap) program: rewrite the forwarded frame's source MAC to
 * the egress interface's own MAC, looked up in mac_map by
 * egress_ifindex. Frames too short for an Ethernet header are dropped.
 */
SEC("xdp_devmap/map_prog")
int xdp_devmap_prog(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	u32 key = ctx->egress_ifindex;
	struct ethhdr *eth = data;
	__be64 *mac;
	u64 nh_off;

	nh_off = sizeof(*eth);
	/* Bounds check required by the verifier before touching eth */
	if (data + nh_off > data_end)
		return XDP_DROP;

	mac = bpf_map_lookup_elem(&mac_map, &key);
	if (mac)
		__builtin_memcpy(eth->h_source, mac, ETH_ALEN);

	return XDP_PASS;
}
char _license[] SEC("license") = "GPL";
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/if_link.h>
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <net/if.h>
#include <unistd.h>
#include <libgen.h>
#include <sys/resource.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "bpf_util.h"
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#define MAX_IFACE_NUM 32
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
static int ifaces[MAX_IFACE_NUM] = {};
static int rxcnt_map_fd;
/* SIGINT/SIGTERM handler: detach our XDP program from every interface
 * in ifaces[] (only when one is still attached), then exit.
 */
static void int_exit(int sig)
{
	__u32 prog_id = 0;
	int i;

	for (i = 0; ifaces[i] > 0; i++) {
		if (bpf_get_link_xdp_id(ifaces[i], &prog_id, xdp_flags)) {
			printf("bpf_get_link_xdp_id failed\n");
			exit(1);
		}
		if (prog_id)
			bpf_set_link_xdp_fd(ifaces[i], -1, xdp_flags);
	}

	exit(0);
}
/* Loop forever: every @interval seconds read the per-CPU rxcnt map and
 * print the aggregate packet rate since the previous read.
 */
static void poll_stats(int interval)
{
	unsigned int nr_cpus = bpf_num_possible_cpus();
	__u64 values[nr_cpus], prev[nr_cpus];

	memset(prev, 0, sizeof(prev));

	while (1) {
		__u64 sum = 0;
		__u32 key = 0;
		int i;

		sleep(interval);
		/* rxcnt is a PERCPU_ARRAY: one slot per possible CPU */
		assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0);
		for (i = 0; i < nr_cpus; i++)
			sum += (values[i] - prev[i]);
		if (sum)
			printf("Forwarding %10llu pkt/s\n", sum / interval);
		memcpy(prev, values, sizeof(values));
	}
}
static int get_mac_addr(unsigned int ifindex, void *mac_addr)
{
char ifname[IF_NAMESIZE];
struct ifreq ifr;
int fd, ret = -1;
fd = socket(AF_INET, SOCK_DGRAM, 0);
if (fd < 0)
return ret;
if (!if_indextoname(ifindex, ifname))
goto err_out;
strcpy(ifr.ifr_name, ifname);
if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
goto err_out;
memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char));
ret = 0;
err_out:
close(fd);
return ret;
}
/* Populate the BPF "mac_map" with each interface's MAC address, keyed
 * by ifindex, for every entry in the global ifaces[] array.
 * Returns 0 on success, a negative/non-zero value on failure.
 */
static int update_mac_map(struct bpf_object *obj)
{
	int i, ret = -1, mac_map_fd;
	unsigned char mac_addr[6];
	unsigned int ifindex;

	mac_map_fd = bpf_object__find_map_fd_by_name(obj, "mac_map");
	if (mac_map_fd < 0) {
		printf("find mac map fd failed\n");
		return ret;
	}

	for (i = 0; ifaces[i] > 0; i++) {
		ifindex = ifaces[i];

		ret = get_mac_addr(ifindex, mac_addr);
		if (ret < 0) {
			printf("get interface %d mac failed\n", ifindex);
			return ret;
		}

		ret = bpf_map_update_elem(mac_map_fd, &ifindex, mac_addr, 0);
		if (ret) {
			perror("bpf_update_elem mac_map_fd");
			return ret;
		}
	}

	return 0;
}
/* Print command-line help to stderr; @prog is the program name */
static void usage(const char *prog)
{
	fprintf(stderr,
		"usage: %s [OPTS] <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n"
		"OPTS:\n"
		" -S use skb-mode\n"
		" -N enforce native mode\n"
		" -F force loading prog\n"
		" -X load xdp program on egress\n",
		prog);
}
/* Load the xdp_redirect_map_multi BPF object, attach the matching
 * ingress program to every interface named on the command line,
 * optionally attach a second (egress/devmap) program, fill the forward
 * map with all interfaces, then poll forwarding stats forever.
 * Returns 0 (never, in practice) on success, 1 on setup failure.
 */
int main(int argc, char **argv)
{
	int i, ret, opt, forward_map_fd, max_ifindex = 0;
	struct bpf_program *ingress_prog, *egress_prog;
	int ingress_prog_fd, egress_prog_fd = 0;
	struct bpf_devmap_val devmap_val;
	bool attach_egress_prog = false;
	char ifname[IF_NAMESIZE];
	struct bpf_map *mac_map;
	struct bpf_object *obj;
	unsigned int ifindex;
	char filename[256];

	while ((opt = getopt(argc, argv, "SNFX")) != -1) {
		switch (opt) {
		case 'S':
			xdp_flags |= XDP_FLAGS_SKB_MODE;
			break;
		case 'N':
			/* default, set below */
			break;
		case 'F':
			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
			break;
		case 'X':
			attach_egress_prog = true;
			break;
		default:
			usage(basename(argv[0]));
			return 1;
		}
	}

	/* Native (driver) mode is the default; egress devmap programs are
	 * only supported in native mode.
	 */
	if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) {
		xdp_flags |= XDP_FLAGS_DRV_MODE;
	} else if (attach_egress_prog) {
		printf("Load xdp program on egress with SKB mode not supported yet\n");
		return 1;
	}

	if (optind == argc) {
		printf("usage: %s <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n", argv[0]);
		return 1;
	}

	printf("Get interfaces");
	for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) {
		/* Accept either an interface name or a raw ifindex */
		ifaces[i] = if_nametoindex(argv[optind + i]);
		if (!ifaces[i])
			ifaces[i] = strtoul(argv[optind + i], NULL, 0);
		if (!if_indextoname(ifaces[i], ifname)) {
			perror("Invalid interface name or i");
			return 1;
		}

		/* Find the largest index number */
		if (ifaces[i] > max_ifindex)
			max_ifindex = ifaces[i];

		printf(" %d", ifaces[i]);
	}
	printf("\n");

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
	obj = bpf_object__open(filename);
	if (libbpf_get_error(obj)) {
		printf("ERROR: opening BPF object file failed\n");
		obj = NULL;
		goto err_out;
	}

	/* Reset the map size to max ifindex + 1 */
	if (attach_egress_prog) {
		mac_map = bpf_object__find_map_by_name(obj, "mac_map");
		ret = bpf_map__resize(mac_map, max_ifindex + 1);
		if (ret < 0) {
			printf("ERROR: reset mac map size failed\n");
			goto err_out;
		}
	}

	/* load BPF program */
	if (bpf_object__load(obj)) {
		printf("ERROR: loading BPF object file failed\n");
		goto err_out;
	}

	/* skb-mode uses the int-valued map, native mode the
	 * bpf_devmap_val one - pick the matching prog/map pair.
	 */
	if (xdp_flags & XDP_FLAGS_SKB_MODE) {
		ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_general");
		forward_map_fd = bpf_object__find_map_fd_by_name(obj, "forward_map_general");
	} else {
		ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_native");
		forward_map_fd = bpf_object__find_map_fd_by_name(obj, "forward_map_native");
	}
	if (!ingress_prog || forward_map_fd < 0) {
		printf("finding ingress_prog/forward_map in obj file failed\n");
		goto err_out;
	}

	ingress_prog_fd = bpf_program__fd(ingress_prog);
	if (ingress_prog_fd < 0) {
		printf("find ingress_prog fd failed\n");
		goto err_out;
	}

	rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
	if (rxcnt_map_fd < 0) {
		printf("bpf_object__find_map_fd_by_name failed\n");
		goto err_out;
	}

	if (attach_egress_prog) {
		/* Update mac_map with all egress interfaces' mac addr */
		if (update_mac_map(obj) < 0) {
			printf("Error: update mac map failed");
			goto err_out;
		}

		/* Find egress prog fd */
		egress_prog = bpf_object__find_program_by_name(obj, "xdp_devmap_prog");
		if (!egress_prog) {
			printf("finding egress_prog in obj file failed\n");
			goto err_out;
		}
		egress_prog_fd = bpf_program__fd(egress_prog);
		if (egress_prog_fd < 0) {
			printf("find egress_prog fd failed\n");
			goto err_out;
		}
	}

	/* Remove attached program when program is interrupted or killed */
	signal(SIGINT, int_exit);
	signal(SIGTERM, int_exit);

	/* Init forward multicast groups */
	for (i = 0; ifaces[i] > 0; i++) {
		ifindex = ifaces[i];

		/* bind prog_fd to each interface */
		ret = bpf_set_link_xdp_fd(ifindex, ingress_prog_fd, xdp_flags);
		if (ret) {
			printf("Set xdp fd failed on %d\n", ifindex);
			goto err_out;
		}

		/* Add all the interfaces to forward group and attach
		 * egress devmap program if one was requested
		 */
		devmap_val.ifindex = ifindex;
		devmap_val.bpf_prog.fd = egress_prog_fd;
		ret = bpf_map_update_elem(forward_map_fd, &ifindex, &devmap_val, 0);
		if (ret) {
			perror("bpf_map_update_elem forward_map");
			goto err_out;
		}
	}

	/* Print pkt/s stats every 2 seconds; never returns */
	poll_stats(2);

	return 0;

err_out:
	return 1;
}
......@@ -2555,8 +2555,12 @@ union bpf_attr {
* The lower two bits of *flags* are used as the return code if
* the map lookup fails. This is so that the return value can be
* one of the XDP program return codes up to **XDP_TX**, as chosen
* by the caller. Any higher bits in the *flags* argument must be
* unset.
* by the caller. The higher bits of *flags* can be set to
* BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
*
 * 		With BPF_F_BROADCAST the packet will be broadcast to all
 * 		the interfaces in the map, and with BPF_F_EXCLUDE_INGRESS
 * 		the ingress interface will be excluded from the broadcast.
*
* See also **bpf_redirect**\ (), which only supports redirecting
* to an ifindex, but doesn't require a map to do so.
......@@ -5122,6 +5126,12 @@ enum {
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
};
/* Flags for bpf_redirect_map helper */
enum {
BPF_F_BROADCAST = (1ULL << 3),
BPF_F_EXCLUDE_INGRESS = (1ULL << 4),
};
#define __bpf_md_ptr(type, name) \
union { \
type name; \
......
......@@ -54,6 +54,7 @@ TEST_FILES = xsk_prereqs.sh \
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
test_xdp_redirect.sh \
test_xdp_redirect_multi.sh \
test_xdp_meta.sh \
test_xdp_veth.sh \
test_offload.py \
......@@ -84,7 +85,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
xdpxceiver
xdpxceiver xdp_redirect_multi
TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
......
// SPDX-License-Identifier: GPL-2.0
#define KBUILD_MODNAME "foo"
#include <string.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
/* One map use devmap, another one use devmap_hash for testing */
struct {
__uint(type, BPF_MAP_TYPE_DEVMAP);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(int));
__uint(max_entries, 1024);
} map_all SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(struct bpf_devmap_val));
__uint(max_entries, 128);
} map_egress SEC(".maps");
/* map to store egress interfaces mac addresses */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, __u32);
__type(value, __be64);
__uint(max_entries, 128);
} mac_map SEC(".maps");
/* Ingress program: choose the redirect flags by L3 protocol so the
 * selftest can exercise every flag combination with distinct traffic
 * (ARP -> BROADCAST, IPv4 -> BROADCAST|EXCLUDE_INGRESS, IPv6 -> none).
 */
SEC("xdp_redirect_map_multi")
int xdp_redirect_map_multi_prog(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	int if_index = ctx->ingress_ifindex;
	struct ethhdr *eth = data;
	__u16 h_proto;
	__u64 nh_off;

	nh_off = sizeof(*eth);
	/* Verifier-mandated bounds check before reading the eth header */
	if (data + nh_off > data_end)
		return XDP_DROP;

	h_proto = eth->h_proto;

	/* Using IPv4 for (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS) testing */
	if (h_proto == bpf_htons(ETH_P_IP))
		return bpf_redirect_map(&map_all, 0,
					BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
	/* Using IPv6 for none flag testing */
	else if (h_proto == bpf_htons(ETH_P_IPV6))
		return bpf_redirect_map(&map_all, if_index, 0);
	/* All others for BPF_F_BROADCAST testing */
	else
		return bpf_redirect_map(&map_all, 0, BPF_F_BROADCAST);
}
/* The following 2 progs are for 2nd devmap prog testing */
/* Broadcast to every interface in map_egress except the ingress one,
 * so each forwarded frame also runs the attached devmap program.
 */
SEC("xdp_redirect_map_ingress")
int xdp_redirect_map_all_prog(struct xdp_md *ctx)
{
	return bpf_redirect_map(&map_egress, 0,
				BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}
/* Egress (devmap) program: rewrite the forwarded frame's source MAC to
 * the egress interface's MAC, looked up in mac_map by egress_ifindex.
 */
SEC("xdp_devmap/map_prog")
int xdp_devmap_prog(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	__u32 key = ctx->egress_ifindex;
	struct ethhdr *eth = data;
	__u64 nh_off;
	__be64 *mac;

	nh_off = sizeof(*eth);
	/* Verifier-mandated bounds check before touching eth */
	if (data + nh_off > data_end)
		return XDP_DROP;

	mac = bpf_map_lookup_elem(&mac_map, &key);
	if (mac)
		__builtin_memcpy(eth->h_source, mac, ETH_ALEN);

	return XDP_PASS;
}
char _license[] SEC("license") = "GPL";
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Test topology:
# - - - - - - - - - - - - - - - - - - - - - - - - -
# | veth1 veth2 veth3 | ... init net
# - -| - - - - - - | - - - - - - | - -
# --------- --------- ---------
# | veth0 | | veth0 | | veth0 | ...
# --------- --------- ---------
# ns1 ns2 ns3
#
# Test modules:
# XDP modes: generic, native, native + egress_prog
#
# Test cases:
# ARP: Testing BPF_F_BROADCAST, the ingress interface also should receive
# the redirects.
# ns1 -> gw: ns1, ns2, ns3, should receive the arp request
# IPv4: Testing BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, the ingress
# interface should not receive the redirects.
# ns1 -> gw: ns1 should not receive, ns2, ns3 should receive redirects.
# IPv6: Testing none flag, all the pkts should be redirected back
# ping test: ns1 -> ns2 (block), echo requests will be redirect back
# egress_prog:
# all src mac should be egress interface's mac
# netns numbers
NUM=3
IFACES=""
DRV_MODE="xdpgeneric xdpdrv xdpegress"
PASS=0
FAIL=0
# Record a passing test case and print its description
test_pass()
{
	echo "Pass: $@"
	PASS=$((PASS + 1))
}
# Record a failing test case and print its description
test_fail()
{
	echo "fail: $@"
	FAIL=$((FAIL + 1))
}
# Delete all veth devices and netns created by setup_ns; errors are
# suppressed so this is safe to call at any stage (also used as trap).
clean_up()
{
	for i in $(seq $NUM); do
		ip link del veth$i 2> /dev/null
		ip netns del ns$i 2> /dev/null
	done
}
# Kselftest framework requirement - SKIP code is 4.
# Verify that `ip` supports xdpgeneric and tcpdump is installed;
# otherwise skip the whole test with the kselftest SKIP exit code.
check_env()
{
	ip link set dev lo xdpgeneric off &>/dev/null
	if [ $? -ne 0 ];then
		echo "selftests: [SKIP] Could not run test without the ip xdpgeneric support"
		exit 4
	fi

	which tcpdump &>/dev/null
	if [ $? -ne 0 ];then
		echo "selftests: [SKIP] Could not run test without tcpdump"
		exit 4
	fi
}
# Build the test topology: $NUM veth pairs with veth$i in the init ns
# and peer veth0 inside netns ns$i; assign IPv4/IPv6 addresses, add a
# static neigh entry for the IPv4 ping test, and load a dummy XDP prog
# on the ns side (NOTE(review): presumably required for XDP redirect
# into a veth - confirm). Records IFACES and each veth's MAC.
setup_ns()
{
	local mode=$1
	IFACES=""

	# xdpegress tests still attach the ingress side in driver mode
	if [ "$mode" = "xdpegress" ]; then
		mode="xdpdrv"
	fi

	for i in $(seq $NUM); do
		ip netns add ns$i
		ip link add veth$i type veth peer name veth0 netns ns$i
		ip link set veth$i up
		ip -n ns$i link set veth0 up

		ip -n ns$i addr add 192.0.2.$i/24 dev veth0
		ip -n ns$i addr add 2001:db8::$i/64 dev veth0
		# Add a neigh entry for IPv4 ping test
		ip -n ns$i neigh add 192.0.2.253 lladdr 00:00:00:00:00:01 dev veth0
		ip -n ns$i link set veth0 $mode obj \
			xdp_dummy.o sec xdp_dummy &> /dev/null || \
			{ test_fail "Unable to load dummy xdp" && exit 1; }
		IFACES="$IFACES veth$i"
		veth_mac[$i]=$(ip link show veth$i | awk '/link\/ether/ {print $2}')
	done
}
# Verify the egress devmap program rewrote the source MAC: capture in
# ns2/ns3 while ns1 broadcasts pings, then check the captured frames
# carry the receiving veth's own MAC as the Ethernet source.
do_egress_tests()
{
	local mode=$1

	# mac test
	ip netns exec ns2 tcpdump -e -i veth0 -nn -l -e &> mac_ns1-2_${mode}.log &
	ip netns exec ns3 tcpdump -e -i veth0 -nn -l -e &> mac_ns1-3_${mode}.log &
	sleep 0.5
	ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null
	sleep 0.5
	pkill -9 tcpdump

	# mac check
	grep -q "${veth_mac[2]} > ff:ff:ff:ff:ff:ff" mac_ns1-2_${mode}.log && \
		test_pass "$mode mac ns1-2" || test_fail "$mode mac ns1-2"
	grep -q "${veth_mac[3]} > ff:ff:ff:ff:ff:ff" mac_ns1-3_${mode}.log && \
		test_pass "$mode mac ns1-3" || test_fail "$mode mac ns1-3"
}
# Run the flag test matrix: ARP traffic tests BPF_F_BROADCAST, IPv4
# tests BPF_F_BROADCAST|BPF_F_EXCLUDE_INGRESS, IPv6 tests no flags.
# Each namespace's tcpdump log is then checked for which packets were
# (or were not) redirected to it.
do_ping_tests()
{
	local mode=$1

	# ping6 test: echo request should be redirect back to itself, not others
	ip netns exec ns1 ip neigh add 2001:db8::2 dev veth0 lladdr 00:00:00:00:00:02

	ip netns exec ns1 tcpdump -i veth0 -nn -l -e &> ns1-1_${mode}.log &
	ip netns exec ns2 tcpdump -i veth0 -nn -l -e &> ns1-2_${mode}.log &
	ip netns exec ns3 tcpdump -i veth0 -nn -l -e &> ns1-3_${mode}.log &
	sleep 0.5
	# ARP test
	ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null
	# IPv4 test
	ip netns exec ns1 ping 192.0.2.253 -i 0.1 -c 4 &> /dev/null
	# IPv6 test
	ip netns exec ns1 ping6 2001:db8::2 -i 0.1 -c 2 &> /dev/null
	sleep 0.5
	pkill -9 tcpdump

	# All netns should receive the redirect arp requests
	[ $(grep -c "who-has 192.0.2.254" ns1-1_${mode}.log) -gt 4 ] && \
		test_pass "$mode arp(F_BROADCAST) ns1-1" || \
		test_fail "$mode arp(F_BROADCAST) ns1-1"
	[ $(grep -c "who-has 192.0.2.254" ns1-2_${mode}.log) -le 4 ] && \
		test_pass "$mode arp(F_BROADCAST) ns1-2" || \
		test_fail "$mode arp(F_BROADCAST) ns1-2"
	[ $(grep -c "who-has 192.0.2.254" ns1-3_${mode}.log) -le 4 ] && \
		test_pass "$mode arp(F_BROADCAST) ns1-3" || \
		test_fail "$mode arp(F_BROADCAST) ns1-3"

	# ns1 should not receive the redirect echo request, others should
	[ $(grep -c "ICMP echo request" ns1-1_${mode}.log) -eq 4 ] && \
		test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1" || \
		test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1"
	[ $(grep -c "ICMP echo request" ns1-2_${mode}.log) -eq 4 ] && \
		test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2" || \
		test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2"
	[ $(grep -c "ICMP echo request" ns1-3_${mode}.log) -eq 4 ] && \
		test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3" || \
		test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3"

	# ns1 should receive the echo request, ns2 should not
	[ $(grep -c "ICMP6, echo request" ns1-1_${mode}.log) -eq 4 ] && \
		test_pass "$mode IPv6 (no flags) ns1-1" || \
		test_fail "$mode IPv6 (no flags) ns1-1"
	[ $(grep -c "ICMP6, echo request" ns1-2_${mode}.log) -eq 0 ] && \
		test_pass "$mode IPv6 (no flags) ns1-2" || \
		test_fail "$mode IPv6 (no flags) ns1-2"
}
# Run one full test round in the given mode: start the user-space
# loader in the background with the matching flag, give it a second to
# attach, run the corresponding test set, then stop it.
do_tests()
{
	local mode=$1
	local drv_p

	case ${mode} in
		xdpdrv) drv_p="-N";;
		xdpegress) drv_p="-X";;
		xdpgeneric) drv_p="-S";;
	esac

	./xdp_redirect_multi $drv_p $IFACES &> xdp_redirect_${mode}.log &
	xdp_pid=$!
	sleep 1

	if [ "$mode" = "xdpegress" ]; then
		do_egress_tests $mode
	else
		do_ping_tests $mode
	fi

	kill $xdp_pid
}
# Always tear the topology down on exit or on common fatal signals
trap clean_up 0 2 3 6 9

check_env

# Start from a clean slate of capture/log files
rm -f xdp_redirect_*.log ns*.log mac_ns*.log

# Run the whole test set once per XDP attach mode
for mode in ${DRV_MODE}; do
	setup_ns $mode
	do_tests $mode
	clean_up
done

echo "Summary: PASS $PASS, FAIL $FAIL"
[ $FAIL -eq 0 ] && exit 0 || exit 1
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/if_link.h>
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <net/if.h>
#include <unistd.h>
#include <libgen.h>
#include <sys/resource.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "bpf_util.h"
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#define MAX_IFACE_NUM 32
#define MAX_INDEX_NUM 1024
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
static int ifaces[MAX_IFACE_NUM] = {};
/* SIGINT/SIGTERM handler: detach our XDP program from every interface
 * in ifaces[] (only when one is still attached), then exit.
 */
static void int_exit(int sig)
{
	__u32 prog_id = 0;
	int i;

	for (i = 0; ifaces[i] > 0; i++) {
		if (bpf_get_link_xdp_id(ifaces[i], &prog_id, xdp_flags)) {
			printf("bpf_get_link_xdp_id failed\n");
			exit(1);
		}
		if (prog_id)
			bpf_set_link_xdp_fd(ifaces[i], -1, xdp_flags);
	}

	exit(0);
}
static int get_mac_addr(unsigned int ifindex, void *mac_addr)
{
char ifname[IF_NAMESIZE];
struct ifreq ifr;
int fd, ret = -1;
fd = socket(AF_INET, SOCK_DGRAM, 0);
if (fd < 0)
return ret;
if (!if_indextoname(ifindex, ifname))
goto err_out;
strcpy(ifr.ifr_name, ifname);
if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
goto err_out;
memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char));
ret = 0;
err_out:
close(fd);
return ret;
}
/* Print command-line help to stderr; @prog is the program name */
static void usage(const char *prog)
{
	fprintf(stderr,
		"usage: %s [OPTS] <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n"
		"OPTS:\n"
		" -S use skb-mode\n"
		" -N enforce native mode\n"
		" -F force loading prog\n"
		" -X load xdp program on egress\n",
		prog);
}
/* Selftest loader: load the BPF object, populate the group-all (and
 * optionally mac) maps, attach the ingress program to every interface
 * given on the command line, then sleep so the shell test can run.
 * Returns 0 on (timed-out) success, 1 on setup failure.
 *
 * Fixes vs. original: perror() messages must not embed '\n' (perror
 * appends ": <errno string>" itself, so the newline split the output),
 * the truncated "or i" message, and the "to large" typo.
 */
int main(int argc, char **argv)
{
	int prog_fd, group_all, mac_map;
	struct bpf_program *ingress_prog, *egress_prog;
	struct bpf_prog_load_attr prog_load_attr = {
		.prog_type = BPF_PROG_TYPE_UNSPEC,
	};
	int i, ret, opt, egress_prog_fd = 0;
	struct bpf_devmap_val devmap_val;
	bool attach_egress_prog = false;
	unsigned char mac_addr[6];
	char ifname[IF_NAMESIZE];
	struct bpf_object *obj;
	unsigned int ifindex;
	char filename[256];

	while ((opt = getopt(argc, argv, "SNFX")) != -1) {
		switch (opt) {
		case 'S':
			xdp_flags |= XDP_FLAGS_SKB_MODE;
			break;
		case 'N':
			/* default, set below */
			break;
		case 'F':
			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
			break;
		case 'X':
			attach_egress_prog = true;
			break;
		default:
			usage(basename(argv[0]));
			return 1;
		}
	}

	/* Native (driver) mode is the default; egress devmap programs are
	 * only supported in native mode.
	 */
	if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) {
		xdp_flags |= XDP_FLAGS_DRV_MODE;
	} else if (attach_egress_prog) {
		printf("Load xdp program on egress with SKB mode not supported yet\n");
		goto err_out;
	}

	if (optind == argc) {
		printf("usage: %s <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n", argv[0]);
		goto err_out;
	}

	printf("Get interfaces");
	for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) {
		/* Accept either an interface name or a raw ifindex */
		ifaces[i] = if_nametoindex(argv[optind + i]);
		if (!ifaces[i])
			ifaces[i] = strtoul(argv[optind + i], NULL, 0);
		if (!if_indextoname(ifaces[i], ifname)) {
			perror("Invalid interface name or index");
			goto err_out;
		}
		if (ifaces[i] > MAX_INDEX_NUM) {
			printf("Interface index too large\n");
			goto err_out;
		}
		printf(" %d", ifaces[i]);
	}
	printf("\n");

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
	prog_load_attr.file = filename;

	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
		goto err_out;

	/* The egress test redirects via map_egress so its devmap prog runs */
	if (attach_egress_prog)
		group_all = bpf_object__find_map_fd_by_name(obj, "map_egress");
	else
		group_all = bpf_object__find_map_fd_by_name(obj, "map_all");
	mac_map = bpf_object__find_map_fd_by_name(obj, "mac_map");

	if (group_all < 0 || mac_map < 0) {
		printf("bpf_object__find_map_fd_by_name failed\n");
		goto err_out;
	}

	if (attach_egress_prog) {
		/* Find ingress/egress prog for 2nd xdp prog */
		ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_all_prog");
		egress_prog = bpf_object__find_program_by_name(obj, "xdp_devmap_prog");
		if (!ingress_prog || !egress_prog) {
			printf("finding ingress/egress_prog in obj file failed\n");
			goto err_out;
		}
		prog_fd = bpf_program__fd(ingress_prog);
		egress_prog_fd = bpf_program__fd(egress_prog);
		if (prog_fd < 0 || egress_prog_fd < 0) {
			printf("find egress_prog fd failed\n");
			goto err_out;
		}
	}

	/* Detach XDP programs when interrupted or killed */
	signal(SIGINT, int_exit);
	signal(SIGTERM, int_exit);

	/* Init forward multicast groups and exclude group */
	for (i = 0; ifaces[i] > 0; i++) {
		ifindex = ifaces[i];

		if (attach_egress_prog) {
			ret = get_mac_addr(ifindex, mac_addr);
			if (ret < 0) {
				printf("get interface %d mac failed\n", ifindex);
				goto err_out;
			}
			ret = bpf_map_update_elem(mac_map, &ifindex, mac_addr, 0);
			if (ret) {
				perror("bpf_map_update_elem mac_map failed");
				goto err_out;
			}
		}

		/* Add all the interfaces to group all */
		devmap_val.ifindex = ifindex;
		devmap_val.bpf_prog.fd = egress_prog_fd;
		ret = bpf_map_update_elem(group_all, &ifindex, &devmap_val, 0);
		if (ret) {
			perror("bpf_map_update_elem");
			goto err_out;
		}

		/* bind prog_fd to each interface */
		ret = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
		if (ret) {
			printf("Set xdp fd failed on %d\n", ifindex);
			goto err_out;
		}
	}

	/* sleep some time for testing */
	sleep(999);

	return 0;

err_out:
	return 1;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment