Commit eefe06e8 authored by David S. Miller's avatar David S. Miller

Merge branch 'bpf-prog-testing-framework'

Alexei Starovoitov says:

====================
bpf: program testing framework

Development and testing of networking bpf programs is quite cumbersome.
Especially tricky are XDP programs that attach to real netdevices and
program development feels like working on the car engine while
the car is in motion.
Another problem is ongoing changes to upstream llvm core
that can introduce an optimization that verifier will not
recognize. llvm bpf backend tests have no ability to run the programs.
To improve this situation introduce BPF_PROG_TEST_RUN command
to test and performance benchmark bpf programs.
It achieves several goals:
- development of xdp and skb based bpf programs can be done
in a canned environment with unit tests
- program performance optimizations can be benchmarked outside of
networking core (without driver and skb costs)
- continuous testing of upstream changes is finally practical

Patches 4,5,6 add C based test cases of various complexity
to cover some sched_cls and xdp features. More tests will
be added in the future. The tests were run on centos7 only.

For now the framework supports only skb and xdp programs. In the future
it can be extended to socket_filter and tracing program types.

More details are in individual patches.

v1->v2:
- rename bpf_program_test_run->bpf_prog_test_run
- add missing #include <linux/bpf.h> since libbpf.h shouldn't depend
on prior includes
- reordered patches 3 and 4 to keep bisect clean
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 98cd1552 37821613
...@@ -169,6 +169,8 @@ struct bpf_verifier_ops { ...@@ -169,6 +169,8 @@ struct bpf_verifier_ops {
const struct bpf_insn *src, const struct bpf_insn *src,
struct bpf_insn *dst, struct bpf_insn *dst,
struct bpf_prog *prog); struct bpf_prog *prog);
int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr);
}; };
struct bpf_prog_type_list { struct bpf_prog_type_list {
...@@ -233,6 +235,11 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, ...@@ -233,6 +235,11 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr);
int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr);
#ifdef CONFIG_BPF_SYSCALL #ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active); DECLARE_PER_CPU(int, bpf_prog_active);
......
...@@ -81,6 +81,7 @@ enum bpf_cmd { ...@@ -81,6 +81,7 @@ enum bpf_cmd {
BPF_OBJ_GET, BPF_OBJ_GET,
BPF_PROG_ATTACH, BPF_PROG_ATTACH,
BPF_PROG_DETACH, BPF_PROG_DETACH,
BPF_PROG_TEST_RUN,
}; };
enum bpf_map_type { enum bpf_map_type {
...@@ -189,6 +190,17 @@ union bpf_attr { ...@@ -189,6 +190,17 @@ union bpf_attr {
__u32 attach_type; __u32 attach_type;
__u32 attach_flags; __u32 attach_flags;
}; };
struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
__u32 prog_fd;
__u32 retval;
__u32 data_size_in;
__u32 data_size_out;
__aligned_u64 data_in;
__aligned_u64 data_out;
__u32 repeat;
__u32 duration;
} test;
} __attribute__((aligned(8))); } __attribute__((aligned(8)));
/* BPF helper function descriptions: /* BPF helper function descriptions:
......
...@@ -973,6 +973,28 @@ static int bpf_prog_detach(const union bpf_attr *attr) ...@@ -973,6 +973,28 @@ static int bpf_prog_detach(const union bpf_attr *attr)
} }
#endif /* CONFIG_CGROUP_BPF */ #endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct bpf_prog *prog;
int ret = -ENOTSUPP;
if (CHECK_ATTR(BPF_PROG_TEST_RUN))
return -EINVAL;
prog = bpf_prog_get(attr->test.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
if (prog->aux->ops->test_run)
ret = prog->aux->ops->test_run(prog, attr, uattr);
bpf_prog_put(prog);
return ret;
}
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{ {
union bpf_attr attr = {}; union bpf_attr attr = {};
...@@ -1039,7 +1061,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz ...@@ -1039,7 +1061,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET: case BPF_OBJ_GET:
err = bpf_obj_get(&attr); err = bpf_obj_get(&attr);
break; break;
#ifdef CONFIG_CGROUP_BPF #ifdef CONFIG_CGROUP_BPF
case BPF_PROG_ATTACH: case BPF_PROG_ATTACH:
err = bpf_prog_attach(&attr); err = bpf_prog_attach(&attr);
...@@ -1048,7 +1069,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz ...@@ -1048,7 +1069,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_prog_detach(&attr); err = bpf_prog_detach(&attr);
break; break;
#endif #endif
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
break;
default: default:
err = -EINVAL; err = -EINVAL;
break; break;
......
...@@ -12,7 +12,7 @@ obj-$(CONFIG_NET) += $(tmp-y) ...@@ -12,7 +12,7 @@ obj-$(CONFIG_NET) += $(tmp-y)
# LLC has to be linked before the files in net/802/ # LLC has to be linked before the files in net/802/
obj-$(CONFIG_LLC) += llc/ obj-$(CONFIG_LLC) += llc/
obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/
obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_INET) += ipv4/
obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_XFRM) += xfrm/
......
obj-y := test_run.o
/* Copyright (c) 2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/bpf.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <linux/sched/signal.h>
static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx)
{
u32 ret;
preempt_disable();
rcu_read_lock();
ret = BPF_PROG_RUN(prog, ctx);
rcu_read_unlock();
preempt_enable();
return ret;
}
static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
{
u64 time_start, time_spent = 0;
u32 ret = 0, i;
if (!repeat)
repeat = 1;
time_start = ktime_get_ns();
for (i = 0; i < repeat; i++) {
ret = bpf_test_run_one(prog, ctx);
if (need_resched()) {
if (signal_pending(current))
break;
time_spent += ktime_get_ns() - time_start;
cond_resched();
time_start = ktime_get_ns();
}
}
time_spent += ktime_get_ns() - time_start;
do_div(time_spent, repeat);
*time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
return ret;
}
static int bpf_test_finish(union bpf_attr __user *uattr, const void *data,
u32 size, u32 retval, u32 duration)
{
void __user *data_out = u64_to_user_ptr(uattr->test.data_out);
int err = -EFAULT;
if (data_out && copy_to_user(data_out, data, size))
goto out;
if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size)))
goto out;
if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
goto out;
if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration)))
goto out;
err = 0;
out:
return err;
}
static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
u32 headroom, u32 tailroom)
{
void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
void *data;
if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom)
return ERR_PTR(-EINVAL);
data = kzalloc(size + headroom + tailroom, GFP_USER);
if (!data)
return ERR_PTR(-ENOMEM);
if (copy_from_user(data + headroom, data_in, size)) {
kfree(data);
return ERR_PTR(-EFAULT);
}
return data;
}
int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
bool is_l2 = false, is_direct_pkt_access = false;
u32 size = kattr->test.data_size_in;
u32 repeat = kattr->test.repeat;
u32 retval, duration;
struct sk_buff *skb;
void *data;
int ret;
data = bpf_test_init(kattr, size, NET_SKB_PAD,
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
if (IS_ERR(data))
return PTR_ERR(data);
switch (prog->type) {
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
is_l2 = true;
/* fall through */
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
case BPF_PROG_TYPE_LWT_XMIT:
is_direct_pkt_access = true;
break;
default:
break;
}
skb = build_skb(data, 0);
if (!skb) {
kfree(data);
return -ENOMEM;
}
skb_reserve(skb, NET_SKB_PAD);
__skb_put(skb, size);
skb->protocol = eth_type_trans(skb, current->nsproxy->net_ns->loopback_dev);
skb_reset_network_header(skb);
if (is_l2)
__skb_push(skb, ETH_HLEN);
if (is_direct_pkt_access)
bpf_compute_data_end(skb);
retval = bpf_test_run(prog, skb, repeat, &duration);
if (!is_l2)
__skb_push(skb, ETH_HLEN);
size = skb->len;
/* bpf program can never convert linear skb to non-linear */
if (WARN_ON_ONCE(skb_is_nonlinear(skb)))
size = skb_headlen(skb);
ret = bpf_test_finish(uattr, skb->data, size, retval, duration);
kfree_skb(skb);
return ret;
}
int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
u32 size = kattr->test.data_size_in;
u32 repeat = kattr->test.repeat;
struct xdp_buff xdp = {};
u32 retval, duration;
void *data;
int ret;
data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM, 0);
if (IS_ERR(data))
return PTR_ERR(data);
xdp.data_hard_start = data;
xdp.data = data + XDP_PACKET_HEADROOM;
xdp.data_end = xdp.data + size;
retval = bpf_test_run(prog, &xdp, repeat, &duration);
if (xdp.data != data + XDP_PACKET_HEADROOM)
size = xdp.data_end - xdp.data;
ret = bpf_test_finish(uattr, xdp.data, size, retval, duration);
kfree(data);
return ret;
}
...@@ -3309,24 +3309,28 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { ...@@ -3309,24 +3309,28 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
.is_valid_access = tc_cls_act_is_valid_access, .is_valid_access = tc_cls_act_is_valid_access,
.convert_ctx_access = tc_cls_act_convert_ctx_access, .convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue, .gen_prologue = tc_cls_act_prologue,
.test_run = bpf_prog_test_run_skb,
}; };
static const struct bpf_verifier_ops xdp_ops = { static const struct bpf_verifier_ops xdp_ops = {
.get_func_proto = xdp_func_proto, .get_func_proto = xdp_func_proto,
.is_valid_access = xdp_is_valid_access, .is_valid_access = xdp_is_valid_access,
.convert_ctx_access = xdp_convert_ctx_access, .convert_ctx_access = xdp_convert_ctx_access,
.test_run = bpf_prog_test_run_xdp,
}; };
static const struct bpf_verifier_ops cg_skb_ops = { static const struct bpf_verifier_ops cg_skb_ops = {
.get_func_proto = cg_skb_func_proto, .get_func_proto = cg_skb_func_proto,
.is_valid_access = sk_filter_is_valid_access, .is_valid_access = sk_filter_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access, .convert_ctx_access = bpf_convert_ctx_access,
.test_run = bpf_prog_test_run_skb,
}; };
static const struct bpf_verifier_ops lwt_inout_ops = { static const struct bpf_verifier_ops lwt_inout_ops = {
.get_func_proto = lwt_inout_func_proto, .get_func_proto = lwt_inout_func_proto,
.is_valid_access = lwt_is_valid_access, .is_valid_access = lwt_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access, .convert_ctx_access = bpf_convert_ctx_access,
.test_run = bpf_prog_test_run_skb,
}; };
static const struct bpf_verifier_ops lwt_xmit_ops = { static const struct bpf_verifier_ops lwt_xmit_ops = {
...@@ -3334,6 +3338,7 @@ static const struct bpf_verifier_ops lwt_xmit_ops = { ...@@ -3334,6 +3338,7 @@ static const struct bpf_verifier_ops lwt_xmit_ops = {
.is_valid_access = lwt_is_valid_access, .is_valid_access = lwt_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access, .convert_ctx_access = bpf_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue, .gen_prologue = tc_cls_act_prologue,
.test_run = bpf_prog_test_run_skb,
}; };
static const struct bpf_verifier_ops cg_sock_ops = { static const struct bpf_verifier_ops cg_sock_ops = {
......
...@@ -81,6 +81,7 @@ enum bpf_cmd { ...@@ -81,6 +81,7 @@ enum bpf_cmd {
BPF_OBJ_GET, BPF_OBJ_GET,
BPF_PROG_ATTACH, BPF_PROG_ATTACH,
BPF_PROG_DETACH, BPF_PROG_DETACH,
BPF_PROG_TEST_RUN,
}; };
enum bpf_map_type { enum bpf_map_type {
...@@ -189,6 +190,17 @@ union bpf_attr { ...@@ -189,6 +190,17 @@ union bpf_attr {
__u32 attach_type; __u32 attach_type;
__u32 attach_flags; __u32 attach_flags;
}; };
struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
__u32 prog_fd;
__u32 retval;
__u32 data_size_in;
__u32 data_size_out;
__aligned_u64 data_in;
__aligned_u64 data_out;
__u32 repeat;
__u32 duration;
} test;
} __attribute__((aligned(8))); } __attribute__((aligned(8)));
/* BPF helper function descriptions: /* BPF helper function descriptions:
...@@ -459,6 +471,18 @@ union bpf_attr { ...@@ -459,6 +471,18 @@ union bpf_attr {
* Return: * Return:
* > 0 length of the string including the trailing NUL on success * > 0 length of the string including the trailing NUL on success
* < 0 error * < 0 error
*
* u64 bpf_bpf_get_socket_cookie(skb)
* Get the cookie for the socket stored inside sk_buff.
* @skb: pointer to skb
* Return: 8 Bytes non-decreasing number on success or 0 if the socket
* field is missing inside sk_buff
*
* u32 bpf_get_socket_uid(skb)
* Get the owner uid of the socket stored inside sk_buff.
* @skb: pointer to skb
* Return: uid of the socket owner on success or 0 if the socket pointer
* inside sk_buff is NULL
*/ */
#define __BPF_FUNC_MAPPER(FN) \ #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \ FN(unspec), \
......
...@@ -209,3 +209,27 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type) ...@@ -209,3 +209,27 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
} }
int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
void *data_out, __u32 *size_out, __u32 *retval,
__u32 *duration)
{
union bpf_attr attr;
int ret;
bzero(&attr, sizeof(attr));
attr.test.prog_fd = prog_fd;
attr.test.data_in = ptr_to_u64(data);
attr.test.data_out = ptr_to_u64(data_out);
attr.test.data_size_in = size;
attr.test.repeat = repeat;
ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr));
if (size_out)
*size_out = attr.test.data_size_out;
if (retval)
*retval = attr.test.retval;
if (duration)
*duration = attr.test.duration;
return ret;
}
...@@ -47,6 +47,8 @@ int bpf_obj_get(const char *pathname); ...@@ -47,6 +47,8 @@ int bpf_obj_get(const char *pathname);
int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
unsigned int flags); unsigned int flags);
int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
void *data_out, __u32 *size_out, __u32 *retval,
__u32 *duration);
#endif #endif
...@@ -1618,8 +1618,7 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n) ...@@ -1618,8 +1618,7 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n)
return fd; return fd;
} }
static void bpf_program__set_type(struct bpf_program *prog, void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type)
enum bpf_prog_type type)
{ {
prog->type = type; prog->type = type;
} }
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include <stdint.h> #include <stdint.h>
#include <stdbool.h> #include <stdbool.h>
#include <sys/types.h> // for size_t #include <sys/types.h> // for size_t
#include <linux/bpf.h>
enum libbpf_errno { enum libbpf_errno {
__LIBBPF_ERRNO__START = 4000, __LIBBPF_ERRNO__START = 4000,
...@@ -185,6 +186,7 @@ int bpf_program__set_sched_cls(struct bpf_program *prog); ...@@ -185,6 +186,7 @@ int bpf_program__set_sched_cls(struct bpf_program *prog);
int bpf_program__set_sched_act(struct bpf_program *prog); int bpf_program__set_sched_act(struct bpf_program *prog);
int bpf_program__set_xdp(struct bpf_program *prog); int bpf_program__set_xdp(struct bpf_program *prog);
int bpf_program__set_perf_event(struct bpf_program *prog); int bpf_program__set_perf_event(struct bpf_program *prog);
void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type);
bool bpf_program__is_socket_filter(struct bpf_program *prog); bool bpf_program__is_socket_filter(struct bpf_program *prog);
bool bpf_program__is_tracepoint(struct bpf_program *prog); bool bpf_program__is_tracepoint(struct bpf_program *prog);
......
LIBDIR := ../../../lib LIBDIR := ../../../lib
BPFDIR := $(LIBDIR)/bpf BPFDIR := $(LIBDIR)/bpf
CFLAGS += -Wall -O2 -I../../../include/uapi -I$(LIBDIR) CFLAGS += -Wall -O2 -I../../../include/uapi -I$(LIBDIR) -I../../../include
LDLIBS += -lcap LDLIBS += -lcap -lelf
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o
TEST_PROGS := test_kmod.sh TEST_PROGS := test_kmod.sh
include ../lib.mk include ../lib.mk
BPFOBJ := $(OUTPUT)/bpf.o BPFOBJ := $(OUTPUT)/libbpf.a
$(TEST_GEN_PROGS): $(BPFOBJ) $(TEST_GEN_PROGS): $(BPFOBJ)
...@@ -21,3 +23,10 @@ force: ...@@ -21,3 +23,10 @@ force:
$(BPFOBJ): force $(BPFOBJ): force
$(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/
CLANG ?= clang
%.o: %.c
$(CLANG) -I../../../include/uapi -I../../../../samples/bpf/ \
-D__x86_64__ -Wno-compare-distinct-pointer-types \
-O2 -target bpf -c $< -o $@
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#ifndef _TEST_IPTNL_COMMON_H
#define _TEST_IPTNL_COMMON_H
#include <linux/types.h>
#define MAX_IPTNL_ENTRIES 256U
struct vip {
union {
__u32 v6[4];
__u32 v4;
} daddr;
__u16 dport;
__u16 family;
__u8 protocol;
};
struct iptnl_info {
union {
__u32 v6[4];
__u32 v4;
} saddr;
union {
__u32 v6[4];
__u32 v4;
} daddr;
__u16 family;
__u8 dmac[6];
};
#endif
/* Copyright (c) 2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <stddef.h>
#include <stdbool.h>
#include <string.h>
#include <linux/pkt_cls.h>
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include "bpf_helpers.h"
#include "test_iptunnel_common.h"
#define htons __builtin_bswap16
#define ntohs __builtin_bswap16
int _version SEC("version") = 1;
static inline __u32 rol32(__u32 word, unsigned int shift)
{
return (word << shift) | (word >> ((-shift) & 31));
}
/* copy paste of jhash from kernel sources to make sure llvm
* can compile it into valid sequence of bpf instructions
*/
#define __jhash_mix(a, b, c) \
{ \
a -= c; a ^= rol32(c, 4); c += b; \
b -= a; b ^= rol32(a, 6); a += c; \
c -= b; c ^= rol32(b, 8); b += a; \
a -= c; a ^= rol32(c, 16); c += b; \
b -= a; b ^= rol32(a, 19); a += c; \
c -= b; c ^= rol32(b, 4); b += a; \
}
#define __jhash_final(a, b, c) \
{ \
c ^= b; c -= rol32(b, 14); \
a ^= c; a -= rol32(c, 11); \
b ^= a; b -= rol32(a, 25); \
c ^= b; c -= rol32(b, 16); \
a ^= c; a -= rol32(c, 4); \
b ^= a; b -= rol32(a, 14); \
c ^= b; c -= rol32(b, 24); \
}
#define JHASH_INITVAL 0xdeadbeef
typedef unsigned int u32;
static inline u32 jhash(const void *key, u32 length, u32 initval)
{
u32 a, b, c;
const unsigned char *k = key;
a = b = c = JHASH_INITVAL + length + initval;
while (length > 12) {
a += *(u32 *)(k);
b += *(u32 *)(k + 4);
c += *(u32 *)(k + 8);
__jhash_mix(a, b, c);
length -= 12;
k += 12;
}
switch (length) {
case 12: c += (u32)k[11]<<24;
case 11: c += (u32)k[10]<<16;
case 10: c += (u32)k[9]<<8;
case 9: c += k[8];
case 8: b += (u32)k[7]<<24;
case 7: b += (u32)k[6]<<16;
case 6: b += (u32)k[5]<<8;
case 5: b += k[4];
case 4: a += (u32)k[3]<<24;
case 3: a += (u32)k[2]<<16;
case 2: a += (u32)k[1]<<8;
case 1: a += k[0];
__jhash_final(a, b, c);
case 0: /* Nothing left to add */
break;
}
return c;
}
static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
a += initval;
b += initval;
c += initval;
__jhash_final(a, b, c);
return c;
}
static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
{
return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
}
#define PCKT_FRAGMENTED 65343
#define IPV4_HDR_LEN_NO_OPT 20
#define IPV4_PLUS_ICMP_HDR 28
#define IPV6_PLUS_ICMP_HDR 48
#define RING_SIZE 2
#define MAX_VIPS 12
#define MAX_REALS 5
#define CTL_MAP_SIZE 16
#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE)
#define F_IPV6 (1 << 0)
#define F_HASH_NO_SRC_PORT (1 << 0)
#define F_ICMP (1 << 0)
#define F_SYN_SET (1 << 1)
struct packet_description {
union {
__be32 src;
__be32 srcv6[4];
};
union {
__be32 dst;
__be32 dstv6[4];
};
union {
__u32 ports;
__u16 port16[2];
};
__u8 proto;
__u8 flags;
};
struct ctl_value {
union {
__u64 value;
__u32 ifindex;
__u8 mac[6];
};
};
struct vip_meta {
__u32 flags;
__u32 vip_num;
};
struct real_definition {
union {
__be32 dst;
__be32 dstv6[4];
};
__u8 flags;
};
struct vip_stats {
__u64 bytes;
__u64 pkts;
};
struct eth_hdr {
unsigned char eth_dest[ETH_ALEN];
unsigned char eth_source[ETH_ALEN];
unsigned short eth_proto;
};
struct bpf_map_def SEC("maps") vip_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(struct vip),
.value_size = sizeof(struct vip_meta),
.max_entries = MAX_VIPS,
};
struct bpf_map_def SEC("maps") ch_rings = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = CH_RINGS_SIZE,
};
struct bpf_map_def SEC("maps") reals = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct real_definition),
.max_entries = MAX_REALS,
};
struct bpf_map_def SEC("maps") stats = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct vip_stats),
.max_entries = MAX_VIPS,
};
struct bpf_map_def SEC("maps") ctl_array = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct ctl_value),
.max_entries = CTL_MAP_SIZE,
};
static __always_inline __u32 get_packet_hash(struct packet_description *pckt,
bool ipv6)
{
if (ipv6)
return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS),
pckt->ports, CH_RINGS_SIZE);
else
return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE);
}
static __always_inline bool get_packet_dst(struct real_definition **real,
struct packet_description *pckt,
struct vip_meta *vip_info,
bool is_ipv6)
{
__u32 hash = get_packet_hash(pckt, is_ipv6) % RING_SIZE;
__u32 key = RING_SIZE * vip_info->vip_num + hash;
__u32 *real_pos;
real_pos = bpf_map_lookup_elem(&ch_rings, &key);
if (!real_pos)
return false;
key = *real_pos;
*real = bpf_map_lookup_elem(&reals, &key);
if (!(*real))
return false;
return true;
}
static __always_inline int parse_icmpv6(void *data, void *data_end, __u64 off,
struct packet_description *pckt)
{
struct icmp6hdr *icmp_hdr;
struct ipv6hdr *ip6h;
icmp_hdr = data + off;
if (icmp_hdr + 1 > data_end)
return TC_ACT_SHOT;
if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG)
return TC_ACT_OK;
off += sizeof(struct icmp6hdr);
ip6h = data + off;
if (ip6h + 1 > data_end)
return TC_ACT_SHOT;
pckt->proto = ip6h->nexthdr;
pckt->flags |= F_ICMP;
memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16);
memcpy(pckt->dstv6, ip6h->saddr.s6_addr32, 16);
return TC_ACT_UNSPEC;
}
static __always_inline int parse_icmp(void *data, void *data_end, __u64 off,
struct packet_description *pckt)
{
struct icmphdr *icmp_hdr;
struct iphdr *iph;
icmp_hdr = data + off;
if (icmp_hdr + 1 > data_end)
return TC_ACT_SHOT;
if (icmp_hdr->type != ICMP_DEST_UNREACH ||
icmp_hdr->code != ICMP_FRAG_NEEDED)
return TC_ACT_OK;
off += sizeof(struct icmphdr);
iph = data + off;
if (iph + 1 > data_end)
return TC_ACT_SHOT;
if (iph->ihl != 5)
return TC_ACT_SHOT;
pckt->proto = iph->protocol;
pckt->flags |= F_ICMP;
pckt->src = iph->daddr;
pckt->dst = iph->saddr;
return TC_ACT_UNSPEC;
}
static __always_inline bool parse_udp(void *data, __u64 off, void *data_end,
struct packet_description *pckt)
{
struct udphdr *udp;
udp = data + off;
if (udp + 1 > data_end)
return false;
if (!(pckt->flags & F_ICMP)) {
pckt->port16[0] = udp->source;
pckt->port16[1] = udp->dest;
} else {
pckt->port16[0] = udp->dest;
pckt->port16[1] = udp->source;
}
return true;
}
static __always_inline bool parse_tcp(void *data, __u64 off, void *data_end,
struct packet_description *pckt)
{
struct tcphdr *tcp;
tcp = data + off;
if (tcp + 1 > data_end)
return false;
if (tcp->syn)
pckt->flags |= F_SYN_SET;
if (!(pckt->flags & F_ICMP)) {
pckt->port16[0] = tcp->source;
pckt->port16[1] = tcp->dest;
} else {
pckt->port16[0] = tcp->dest;
pckt->port16[1] = tcp->source;
}
return true;
}
static __always_inline int process_packet(void *data, __u64 off, void *data_end,
bool is_ipv6, struct __sk_buff *skb)
{
void *pkt_start = (void *)(long)skb->data;
struct packet_description pckt = {};
struct eth_hdr *eth = pkt_start;
struct bpf_tunnel_key tkey = {};
struct vip_stats *data_stats;
struct real_definition *dst;
struct vip_meta *vip_info;
struct ctl_value *cval;
__u32 v4_intf_pos = 1;
__u32 v6_intf_pos = 2;
struct ipv6hdr *ip6h;
struct vip vip = {};
struct iphdr *iph;
int tun_flag = 0;
__u16 pkt_bytes;
__u64 iph_len;
__u32 ifindex;
__u8 protocol;
__u32 vip_num;
int action;
tkey.tunnel_ttl = 64;
if (is_ipv6) {
ip6h = data + off;
if (ip6h + 1 > data_end)
return TC_ACT_SHOT;
iph_len = sizeof(struct ipv6hdr);
protocol = ip6h->nexthdr;
pckt.proto = protocol;
pkt_bytes = ntohs(ip6h->payload_len);
off += iph_len;
if (protocol == IPPROTO_FRAGMENT) {
return TC_ACT_SHOT;
} else if (protocol == IPPROTO_ICMPV6) {
action = parse_icmpv6(data, data_end, off, &pckt);
if (action >= 0)
return action;
off += IPV6_PLUS_ICMP_HDR;
} else {
memcpy(pckt.srcv6, ip6h->saddr.s6_addr32, 16);
memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
}
} else {
iph = data + off;
if (iph + 1 > data_end)
return TC_ACT_SHOT;
if (iph->ihl != 5)
return TC_ACT_SHOT;
protocol = iph->protocol;
pckt.proto = protocol;
pkt_bytes = ntohs(iph->tot_len);
off += IPV4_HDR_LEN_NO_OPT;
if (iph->frag_off & PCKT_FRAGMENTED)
return TC_ACT_SHOT;
if (protocol == IPPROTO_ICMP) {
action = parse_icmp(data, data_end, off, &pckt);
if (action >= 0)
return action;
off += IPV4_PLUS_ICMP_HDR;
} else {
pckt.src = iph->saddr;
pckt.dst = iph->daddr;
}
}
protocol = pckt.proto;
if (protocol == IPPROTO_TCP) {
if (!parse_tcp(data, off, data_end, &pckt))
return TC_ACT_SHOT;
} else if (protocol == IPPROTO_UDP) {
if (!parse_udp(data, off, data_end, &pckt))
return TC_ACT_SHOT;
} else {
return TC_ACT_SHOT;
}
if (is_ipv6)
memcpy(vip.daddr.v6, pckt.dstv6, 16);
else
vip.daddr.v4 = pckt.dst;
vip.dport = pckt.port16[1];
vip.protocol = pckt.proto;
vip_info = bpf_map_lookup_elem(&vip_map, &vip);
if (!vip_info) {
vip.dport = 0;
vip_info = bpf_map_lookup_elem(&vip_map, &vip);
if (!vip_info)
return TC_ACT_SHOT;
pckt.port16[1] = 0;
}
if (vip_info->flags & F_HASH_NO_SRC_PORT)
pckt.port16[0] = 0;
if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6))
return TC_ACT_SHOT;
if (dst->flags & F_IPV6) {
cval = bpf_map_lookup_elem(&ctl_array, &v6_intf_pos);
if (!cval)
return TC_ACT_SHOT;
ifindex = cval->ifindex;
memcpy(tkey.remote_ipv6, dst->dstv6, 16);
tun_flag = BPF_F_TUNINFO_IPV6;
} else {
cval = bpf_map_lookup_elem(&ctl_array, &v4_intf_pos);
if (!cval)
return TC_ACT_SHOT;
ifindex = cval->ifindex;
tkey.remote_ipv4 = dst->dst;
}
vip_num = vip_info->vip_num;
data_stats = bpf_map_lookup_elem(&stats, &vip_num);
if (!data_stats)
return TC_ACT_SHOT;
data_stats->pkts++;
data_stats->bytes += pkt_bytes;
bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), tun_flag);
*(u32 *)eth->eth_dest = tkey.remote_ipv4;
return bpf_redirect(ifindex, 0);
}
SEC("l4lb-demo")
int balancer_ingress(struct __sk_buff *ctx)
{
void *data_end = (void *)(long)ctx->data_end;
void *data = (void *)(long)ctx->data;
struct eth_hdr *eth = data;
__u32 eth_proto;
__u32 nh_off;
nh_off = sizeof(struct eth_hdr);
if (data + nh_off > data_end)
return TC_ACT_SHOT;
eth_proto = eth->eth_proto;
if (eth_proto == htons(ETH_P_IP))
return process_packet(data, nh_off, data_end, false, ctx);
else if (eth_proto == htons(ETH_P_IPV6))
return process_packet(data, nh_off, data_end, true, ctx);
else
return TC_ACT_SHOT;
}
char _license[] SEC("license") = "GPL";
/* Copyright (c) 2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <stddef.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"
#define _htons __builtin_bswap16
#define barrier() __asm__ __volatile__("": : :"memory")
int _version SEC("version") = 1;
SEC("test1")
int process(struct __sk_buff *skb)
{
void *data_end = (void *)(long)skb->data_end;
void *data = (void *)(long)skb->data;
struct ethhdr *eth = (struct ethhdr *)(data);
struct tcphdr *tcp = NULL;
__u8 proto = 255;
__u64 ihl_len;
if (eth + 1 > data_end)
return TC_ACT_SHOT;
if (eth->h_proto == _htons(ETH_P_IP)) {
struct iphdr *iph = (struct iphdr *)(eth + 1);
if (iph + 1 > data_end)
return TC_ACT_SHOT;
ihl_len = iph->ihl * 4;
proto = iph->protocol;
tcp = (struct tcphdr *)((void *)(iph) + ihl_len);
} else if (eth->h_proto == _htons(ETH_P_IPV6)) {
struct ipv6hdr *ip6h = (struct ipv6hdr *)(eth + 1);
if (ip6h + 1 > data_end)
return TC_ACT_SHOT;
ihl_len = sizeof(*ip6h);
proto = ip6h->nexthdr;
tcp = (struct tcphdr *)((void *)(ip6h) + ihl_len);
}
if (tcp) {
if (((void *)(tcp) + 20) > data_end || proto != 6)
return TC_ACT_SHOT;
barrier(); /* to force ordering of checks */
if (((void *)(tcp) + 18) > data_end)
return TC_ACT_SHOT;
if (tcp->urg_ptr == 123)
return TC_ACT_OK;
}
return TC_ACT_UNSPEC;
}
/* Copyright (c) 2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <stdlib.h>
#include <linux/types.h>
typedef __u16 __sum16;
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <sys/wait.h>
#include <sys/resource.h>
#include <linux/bpf.h>
#include <linux/err.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "test_iptunnel_common.h"
#include "bpf_util.h"
#define _htons __builtin_bswap16
static int error_cnt, pass_cnt;
#define MAGIC_BYTES 123
/* ipv4 test vector */
static struct {
struct ethhdr eth;
struct iphdr iph;
struct tcphdr tcp;
} __packed pkt_v4 = {
.eth.h_proto = _htons(ETH_P_IP),
.iph.ihl = 5,
.iph.protocol = 6,
.iph.tot_len = _htons(MAGIC_BYTES),
.tcp.urg_ptr = 123,
};
/* ipv6 test vector */
static struct {
struct ethhdr eth;
struct ipv6hdr iph;
struct tcphdr tcp;
} __packed pkt_v6 = {
.eth.h_proto = _htons(ETH_P_IPV6),
.iph.nexthdr = 6,
.iph.payload_len = _htons(MAGIC_BYTES),
.tcp.urg_ptr = 123,
};
#define CHECK(condition, tag, format...) ({ \
int __ret = !!(condition); \
if (__ret) { \
error_cnt++; \
printf("%s:FAIL:%s ", __func__, tag); \
printf(format); \
} else { \
pass_cnt++; \
printf("%s:PASS:%s %d nsec\n", __func__, tag, duration);\
} \
})
static int bpf_prog_load(const char *file, enum bpf_prog_type type,
struct bpf_object **pobj, int *prog_fd)
{
struct bpf_program *prog;
struct bpf_object *obj;
int err;
obj = bpf_object__open(file);
if (IS_ERR(obj)) {
error_cnt++;
return -ENOENT;
}
prog = bpf_program__next(NULL, obj);
if (!prog) {
bpf_object__close(obj);
error_cnt++;
return -ENOENT;
}
bpf_program__set_type(prog, type);
err = bpf_object__load(obj);
if (err) {
bpf_object__close(obj);
error_cnt++;
return -EINVAL;
}
*pobj = obj;
*prog_fd = bpf_program__fd(prog);
return 0;
}
static int bpf_find_map(const char *test, struct bpf_object *obj,
const char *name)
{
struct bpf_map *map;
map = bpf_object__find_map_by_name(obj, name);
if (!map) {
printf("%s:FAIL:map '%s' not found\n", test, name);
error_cnt++;
return -1;
}
return bpf_map__fd(map);
}
static void test_pkt_access(void)
{
const char *file = "./test_pkt_access.o";
struct bpf_object *obj;
__u32 duration, retval;
int err, prog_fd;
err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
if (err)
return;
err = bpf_prog_test_run(prog_fd, 100000, &pkt_v4, sizeof(pkt_v4),
NULL, NULL, &retval, &duration);
CHECK(err || errno || retval, "ipv4",
"err %d errno %d retval %d duration %d\n",
err, errno, retval, duration);
err = bpf_prog_test_run(prog_fd, 100000, &pkt_v6, sizeof(pkt_v6),
NULL, NULL, &retval, &duration);
CHECK(err || errno || retval, "ipv6",
"err %d errno %d retval %d duration %d\n",
err, errno, retval, duration);
bpf_object__close(obj);
}
static void test_xdp(void)
{
struct vip key4 = {.protocol = 6, .family = AF_INET};
struct vip key6 = {.protocol = 6, .family = AF_INET6};
struct iptnl_info value4 = {.family = AF_INET};
struct iptnl_info value6 = {.family = AF_INET6};
const char *file = "./test_xdp.o";
struct bpf_object *obj;
char buf[128];
struct ipv6hdr *iph6 = (void *)buf + sizeof(struct ethhdr);
struct iphdr *iph = (void *)buf + sizeof(struct ethhdr);
__u32 duration, retval, size;
int err, prog_fd, map_fd;
err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
if (err)
return;
map_fd = bpf_find_map(__func__, obj, "vip2tnl");
if (map_fd < 0)
goto out;
bpf_map_update_elem(map_fd, &key4, &value4, 0);
bpf_map_update_elem(map_fd, &key6, &value6, 0);
err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration);
CHECK(err || errno || retval != XDP_TX || size != 74 ||
iph->protocol != IPPROTO_IPIP, "ipv4",
"err %d errno %d retval %d size %d\n",
err, errno, retval, size);
err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration);
CHECK(err || errno || retval != XDP_TX || size != 114 ||
iph6->nexthdr != IPPROTO_IPV6, "ipv6",
"err %d errno %d retval %d size %d\n",
err, errno, retval, size);
out:
bpf_object__close(obj);
}
#define MAGIC_VAL 0x1234
#define NUM_ITER 100000
#define VIP_NUM 5
static void test_l4lb(void)
{
unsigned int nr_cpus = bpf_num_possible_cpus();
const char *file = "./test_l4lb.o";
struct vip key = {.protocol = 6};
struct vip_meta {
__u32 flags;
__u32 vip_num;
} value = {.vip_num = VIP_NUM};
__u32 stats_key = VIP_NUM;
struct vip_stats {
__u64 bytes;
__u64 pkts;
} stats[nr_cpus];
struct real_definition {
union {
__be32 dst;
__be32 dstv6[4];
};
__u8 flags;
} real_def = {.dst = MAGIC_VAL};
__u32 ch_key = 11, real_num = 3;
__u32 duration, retval, size;
int err, i, prog_fd, map_fd;
__u64 bytes = 0, pkts = 0;
struct bpf_object *obj;
char buf[128];
u32 *magic = (u32 *)buf;
err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
if (err)
return;
map_fd = bpf_find_map(__func__, obj, "vip_map");
if (map_fd < 0)
goto out;
bpf_map_update_elem(map_fd, &key, &value, 0);
map_fd = bpf_find_map(__func__, obj, "ch_rings");
if (map_fd < 0)
goto out;
bpf_map_update_elem(map_fd, &ch_key, &real_num, 0);
map_fd = bpf_find_map(__func__, obj, "reals");
if (map_fd < 0)
goto out;
bpf_map_update_elem(map_fd, &real_num, &real_def, 0);
err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration);
CHECK(err || errno || retval != 7/*TC_ACT_REDIRECT*/ || size != 54 ||
*magic != MAGIC_VAL, "ipv4",
"err %d errno %d retval %d size %d magic %x\n",
err, errno, retval, size, *magic);
err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration);
CHECK(err || errno || retval != 7/*TC_ACT_REDIRECT*/ || size != 74 ||
*magic != MAGIC_VAL, "ipv6",
"err %d errno %d retval %d size %d magic %x\n",
err, errno, retval, size, *magic);
map_fd = bpf_find_map(__func__, obj, "stats");
if (map_fd < 0)
goto out;
bpf_map_lookup_elem(map_fd, &stats_key, stats);
for (i = 0; i < nr_cpus; i++) {
bytes += stats[i].bytes;
pkts += stats[i].pkts;
}
if (bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2) {
error_cnt++;
printf("test_l4lb:FAIL:stats %lld %lld\n", bytes, pkts);
}
out:
bpf_object__close(obj);
}
int main(void)
{
struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
setrlimit(RLIMIT_MEMLOCK, &rinf);
test_pkt_access();
test_xdp();
test_l4lb();
printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
return 0;
}
/* Copyright (c) 2016,2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <stddef.h>
#include <string.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/udp.h>
#include <linux/tcp.h>
#include <linux/pkt_cls.h>
#include <sys/socket.h>
#include "bpf_helpers.h"
#include "test_iptunnel_common.h"
#define htons __builtin_bswap16
#define ntohs __builtin_bswap16
int _version SEC("version") = 1;
struct bpf_map_def SEC("maps") rxcnt = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u64),
.max_entries = 256,
};
struct bpf_map_def SEC("maps") vip2tnl = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(struct vip),
.value_size = sizeof(struct iptnl_info),
.max_entries = MAX_IPTNL_ENTRIES,
};
static __always_inline void count_tx(__u32 protocol)
{
__u64 *rxcnt_count;
rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol);
if (rxcnt_count)
*rxcnt_count += 1;
}
static __always_inline int get_dport(void *trans_data, void *data_end,
__u8 protocol)
{
struct tcphdr *th;
struct udphdr *uh;
switch (protocol) {
case IPPROTO_TCP:
th = (struct tcphdr *)trans_data;
if (th + 1 > data_end)
return -1;
return th->dest;
case IPPROTO_UDP:
uh = (struct udphdr *)trans_data;
if (uh + 1 > data_end)
return -1;
return uh->dest;
default:
return 0;
}
}
static __always_inline void set_ethhdr(struct ethhdr *new_eth,
const struct ethhdr *old_eth,
const struct iptnl_info *tnl,
__be16 h_proto)
{
memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest));
new_eth->h_proto = h_proto;
}
static __always_inline int handle_ipv4(struct xdp_md *xdp)
{
void *data_end = (void *)(long)xdp->data_end;
void *data = (void *)(long)xdp->data;
struct iptnl_info *tnl;
struct ethhdr *new_eth;
struct ethhdr *old_eth;
struct iphdr *iph = data + sizeof(struct ethhdr);
__u16 *next_iph;
__u16 payload_len;
struct vip vip = {};
int dport;
__u32 csum = 0;
int i;
if (iph + 1 > data_end)
return XDP_DROP;
dport = get_dport(iph + 1, data_end, iph->protocol);
if (dport == -1)
return XDP_DROP;
vip.protocol = iph->protocol;
vip.family = AF_INET;
vip.daddr.v4 = iph->daddr;
vip.dport = dport;
payload_len = ntohs(iph->tot_len);
tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
/* It only does v4-in-v4 */
if (!tnl || tnl->family != AF_INET)
return XDP_PASS;
if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
return XDP_DROP;
data = (void *)(long)xdp->data;
data_end = (void *)(long)xdp->data_end;
new_eth = data;
iph = data + sizeof(*new_eth);
old_eth = data + sizeof(*iph);
if (new_eth + 1 > data_end ||
old_eth + 1 > data_end ||
iph + 1 > data_end)
return XDP_DROP;
set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP));
iph->version = 4;
iph->ihl = sizeof(*iph) >> 2;
iph->frag_off = 0;
iph->protocol = IPPROTO_IPIP;
iph->check = 0;
iph->tos = 0;
iph->tot_len = htons(payload_len + sizeof(*iph));
iph->daddr = tnl->daddr.v4;
iph->saddr = tnl->saddr.v4;
iph->ttl = 8;
next_iph = (__u16 *)iph;
#pragma clang loop unroll(full)
for (i = 0; i < sizeof(*iph) >> 1; i++)
csum += *next_iph++;
iph->check = ~((csum & 0xffff) + (csum >> 16));
count_tx(vip.protocol);
return XDP_TX;
}
static __always_inline int handle_ipv6(struct xdp_md *xdp)
{
void *data_end = (void *)(long)xdp->data_end;
void *data = (void *)(long)xdp->data;
struct iptnl_info *tnl;
struct ethhdr *new_eth;
struct ethhdr *old_eth;
struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
__u16 payload_len;
struct vip vip = {};
int dport;
if (ip6h + 1 > data_end)
return XDP_DROP;
dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
if (dport == -1)
return XDP_DROP;
vip.protocol = ip6h->nexthdr;
vip.family = AF_INET6;
memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr));
vip.dport = dport;
payload_len = ip6h->payload_len;
tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
/* It only does v6-in-v6 */
if (!tnl || tnl->family != AF_INET6)
return XDP_PASS;
if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
return XDP_DROP;
data = (void *)(long)xdp->data;
data_end = (void *)(long)xdp->data_end;
new_eth = data;
ip6h = data + sizeof(*new_eth);
old_eth = data + sizeof(*ip6h);
if (new_eth + 1 > data_end || old_eth + 1 > data_end ||
ip6h + 1 > data_end)
return XDP_DROP;
set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6));
ip6h->version = 6;
ip6h->priority = 0;
memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h));
ip6h->nexthdr = IPPROTO_IPV6;
ip6h->hop_limit = 8;
memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6));
count_tx(vip.protocol);
return XDP_TX;
}
SEC("xdp_tx_iptunnel")
int _xdp_tx_iptunnel(struct xdp_md *xdp)
{
void *data_end = (void *)(long)xdp->data_end;
void *data = (void *)(long)xdp->data;
struct ethhdr *eth = data;
__u16 h_proto;
if (eth + 1 > data_end)
return XDP_DROP;
h_proto = eth->h_proto;
if (h_proto == htons(ETH_P_IP))
return handle_ipv4(xdp);
else if (h_proto == htons(ETH_P_IPV6))
return handle_ipv6(xdp);
else
return XDP_DROP;
}
char _license[] SEC("license") = "GPL";
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment