Commit 5133a4a8 authored by David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2019-03-26

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) introduce bpf_tcp_check_syncookie() helper for XDP and tc, from Lorenz.

2) allow bpf_skb_ecn_set_ce() in tc, from Peter.

3) numerous bpf tc tunneling improvements, from Willem.

4) and other miscellaneous improvements from Adrian, Alan, Daniel, Ivan, Stanislav.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents fa7e428c b4b6aa83
......@@ -205,6 +205,7 @@ enum bpf_return_type {
RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */
RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */
RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */
RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
};
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
......
......@@ -1478,13 +1478,27 @@ union bpf_attr {
* Grow or shrink the room for data in the packet associated to
* *skb* by *len_diff*, and according to the selected *mode*.
*
* There is a single supported mode at this time:
* There are two supported modes at this time:
*
* * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
* (room space is added or removed below the layer 2 header).
*
* * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
* (room space is added or removed below the layer 3 header).
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
* The following flags are supported at this time:
*
* * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
* Adjusting mss in this way is not allowed for datagrams.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**:
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**:
* Any new space is reserved to hold a tunnel header.
* Configure skb offsets and other fields accordingly.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**:
* * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
* Use with ENCAP_L3 flags to further specify the tunnel type.
*
* A call to this helper is susceptible to change the underlaying
* packet buffer. Therefore, at load time, all checks on pointers
......@@ -2431,6 +2445,38 @@ union bpf_attr {
* Return
* A **struct bpf_sock** pointer on success, or **NULL** in
* case of failure.
*
* struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
* Description
* Look for TCP socket matching *tuple*, optionally in a child
* network namespace *netns*. The return value must be checked,
* and if non-**NULL**, released via **bpf_sk_release**\ ().
*
* This function is identical to bpf_sk_lookup_tcp, except that it
* also returns timewait or request sockets. Use bpf_sk_fullsock
* or bpf_tcp_sock to access the full structure.
*
* This helper is available only if the kernel was compiled with
* **CONFIG_NET** configuration option.
* Return
* Pointer to **struct bpf_sock**, or **NULL** in case of failure.
* For sockets with reuseport option, the **struct bpf_sock**
* result is from **reuse->socks**\ [] using the hash of the tuple.
*
* int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
* Description
* Check whether iph and th contain a valid SYN cookie ACK for
* the listening socket in sk.
*
* iph points to the start of the IPv4 or IPv6 header, while
* iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr).
*
* th points to the start of the TCP header, while th_len contains
* sizeof(struct tcphdr).
*
* Return
* 0 if iph and th are a valid SYN cookie ACK, or a negative error
* otherwise.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -2531,7 +2577,9 @@ union bpf_attr {
FN(sk_fullsock), \
FN(tcp_sock), \
FN(skb_ecn_set_ce), \
FN(get_listener_sock),
FN(get_listener_sock), \
FN(skc_lookup_tcp), \
FN(tcp_check_syncookie),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -2590,9 +2638,18 @@ enum bpf_func_id {
/* Current network namespace */
#define BPF_F_CURRENT_NETNS (-1L)
/* BPF_FUNC_skb_adjust_room flags. */
#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2)
#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3)
#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4)
/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET,
BPF_ADJ_ROOM_MAC,
};
/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
......
......@@ -369,7 +369,8 @@ static bool is_release_function(enum bpf_func_id func_id)
static bool is_acquire_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_lookup_tcp ||
func_id == BPF_FUNC_sk_lookup_udp;
func_id == BPF_FUNC_sk_lookup_udp ||
func_id == BPF_FUNC_skc_lookup_tcp;
}
static bool is_ptr_cast_function(enum bpf_func_id func_id)
......@@ -3147,19 +3148,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
if (is_acquire_function(func_id)) {
int id = acquire_reference_state(env, insn_idx);
if (id < 0)
return id;
/* For mark_ptr_or_null_reg() */
regs[BPF_REG_0].id = id;
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = id;
} else {
/* For mark_ptr_or_null_reg() */
regs[BPF_REG_0].id = ++env->id_gen;
}
} else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
regs[BPF_REG_0].id = ++env->id_gen;
} else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
......@@ -3170,9 +3163,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return -EINVAL;
}
if (is_ptr_cast_function(func_id))
if (is_ptr_cast_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
} else if (is_acquire_function(func_id)) {
int id = acquire_reference_state(env, insn_idx);
if (id < 0)
return id;
/* For mark_ptr_or_null_reg() */
regs[BPF_REG_0].id = id;
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = id;
}
do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
......
......@@ -2963,42 +2963,113 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
}
}
static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
u64 flags)
{
u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
u16 mac_len = 0, inner_net = 0, inner_trans = 0;
unsigned int gso_type = SKB_GSO_DODGY;
int ret;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
/* udp gso_size delineates datagrams, only allow if fixed */
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
return -ENOTSUPP;
}
ret = skb_cow(skb, len_diff);
ret = skb_cow_head(skb, len_diff);
if (unlikely(ret < 0))
return ret;
if (encap) {
if (skb->protocol != htons(ETH_P_IP) &&
skb->protocol != htons(ETH_P_IPV6))
return -ENOTSUPP;
if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
return -EINVAL;
if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
return -EINVAL;
if (skb->encapsulation)
return -EALREADY;
mac_len = skb->network_header - skb->mac_header;
inner_net = skb->network_header;
inner_trans = skb->transport_header;
}
ret = bpf_skb_net_hdr_push(skb, off, len_diff);
if (unlikely(ret < 0))
return ret;
if (encap) {
/* inner mac == inner_net on l3 encap */
skb->inner_mac_header = inner_net;
skb->inner_network_header = inner_net;
skb->inner_transport_header = inner_trans;
skb_set_inner_protocol(skb, skb->protocol);
skb->encapsulation = 1;
skb_set_network_header(skb, mac_len);
if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
gso_type |= SKB_GSO_UDP_TUNNEL;
else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
gso_type |= SKB_GSO_GRE;
else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
gso_type |= SKB_GSO_IPXIP6;
else
gso_type |= SKB_GSO_IPXIP4;
if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
sizeof(struct ipv6hdr) :
sizeof(struct iphdr);
skb_set_transport_header(skb, mac_len + nh_len);
}
}
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
/* Due to header grow, MSS needs to be downgraded. */
if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
skb_decrease_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY;
shinfo->gso_type |= gso_type;
shinfo->gso_segs = 0;
}
return 0;
}
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
u64 flags)
{
u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
int ret;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
/* udp gso_size delineates datagrams, only allow if fixed */
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
return -ENOTSUPP;
}
ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0))
......@@ -3012,7 +3083,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
struct skb_shared_info *shinfo = skb_shinfo(skb);
/* Due to header shrink, MSS can be upgraded. */
if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
skb_increase_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY;
shinfo->gso_segs = 0;
......@@ -3027,49 +3100,50 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb)
SKB_MAX_ALLOC;
}
static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
bool trans_same = skb->transport_header == skb->network_header;
u32 len_cur, len_diff_abs = abs(len_diff);
u32 len_min = bpf_skb_net_base_len(skb);
u32 len_max = __bpf_skb_max_len(skb);
__be16 proto = skb->protocol;
bool shrink = len_diff < 0;
u32 off;
int ret;
if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK))
return -EINVAL;
if (unlikely(len_diff_abs > 0xfffU))
return -EFAULT;
if (unlikely(proto != htons(ETH_P_IP) &&
proto != htons(ETH_P_IPV6)))
return -ENOTSUPP;
off = skb_mac_header_len(skb);
switch (mode) {
case BPF_ADJ_ROOM_NET:
off += bpf_skb_net_base_len(skb);
break;
case BPF_ADJ_ROOM_MAC:
break;
default:
return -ENOTSUPP;
}
len_cur = skb->len - skb_network_offset(skb);
if (skb_transport_header_was_set(skb) && !trans_same)
len_cur = skb_network_header_len(skb);
if ((shrink && (len_diff_abs >= len_cur ||
len_cur - len_diff_abs < len_min)) ||
(!shrink && (skb->len + len_diff_abs > len_max &&
!skb_is_gso(skb))))
return -ENOTSUPP;
ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
bpf_skb_net_grow(skb, len_diff_abs);
ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
bpf_skb_net_grow(skb, off, len_diff_abs, flags);
bpf_compute_data_pointers(skb);
return ret;
}
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
if (unlikely(flags))
return -EINVAL;
if (likely(mode == BPF_ADJ_ROOM_NET))
return bpf_skb_adjust_net(skb, len_diff);
return -ENOTSUPP;
}
static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
.func = bpf_skb_adjust_room,
.gpl_only = false,
......@@ -5156,13 +5230,13 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
return sk;
}
/* bpf_sk_lookup performs the core lookup for different types of sockets,
/* bpf_skc_lookup performs the core lookup for different types of sockets,
* taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
* Returns the socket as an 'unsigned long' to simplify the casting in the
* callers to satisfy BPF_CALL declarations.
*/
static unsigned long
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
static struct sock *
__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
u64 flags)
{
......@@ -5192,14 +5266,26 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
put_net(net);
}
out:
return sk;
}
static struct sock *
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
u64 flags)
{
struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
ifindex, proto, netns_id, flags);
if (sk)
sk = sk_to_full_sk(sk);
out:
return (unsigned long) sk;
return sk;
}
static unsigned long
bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
static struct sock *
bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
u8 proto, u64 netns_id, u64 flags)
{
struct net *caller_net;
......@@ -5213,14 +5299,47 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
ifindex = 0;
}
return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex,
proto, netns_id, flags);
return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
netns_id, flags);
}
static struct sock *
bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
u8 proto, u64 netns_id, u64 flags)
{
struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id,
flags);
if (sk)
sk = sk_to_full_sk(sk);
return sk;
}
BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP,
netns_id, flags);
}
static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
.func = bpf_skc_lookup_tcp,
.gpl_only = false,
.pkt_access = true,
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP,
netns_id, flags);
}
static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
......@@ -5238,7 +5357,8 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP,
netns_id, flags);
}
static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
......@@ -5273,8 +5393,9 @@ BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
struct net *caller_net = dev_net(ctx->rxq->dev);
int ifindex = ctx->rxq->dev->ifindex;
return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
IPPROTO_UDP, netns_id, flags);
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
ifindex, IPPROTO_UDP, netns_id,
flags);
}
static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
......@@ -5289,14 +5410,38 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
.arg5_type = ARG_ANYTHING,
};
BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
struct net *caller_net = dev_net(ctx->rxq->dev);
int ifindex = ctx->rxq->dev->ifindex;
return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
ifindex, IPPROTO_TCP, netns_id,
flags);
}
static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
.func = bpf_xdp_skc_lookup_tcp,
.gpl_only = false,
.pkt_access = true,
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
struct net *caller_net = dev_net(ctx->rxq->dev);
int ifindex = ctx->rxq->dev->ifindex;
return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
IPPROTO_TCP, netns_id, flags);
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
ifindex, IPPROTO_TCP, netns_id,
flags);
}
static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
......@@ -5311,13 +5456,33 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
.arg5_type = ARG_ANYTHING,
};
BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0,
return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
sock_net(ctx->sk), 0,
IPPROTO_TCP, netns_id, flags);
}
static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
.func = bpf_sock_addr_skc_lookup_tcp,
.gpl_only = false,
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
sock_net(ctx->sk), 0, IPPROTO_TCP,
netns_id, flags);
}
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
.func = bpf_sock_addr_sk_lookup_tcp,
.gpl_only = false,
......@@ -5332,8 +5497,9 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0,
IPPROTO_UDP, netns_id, flags);
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
sock_net(ctx->sk), 0, IPPROTO_UDP,
netns_id, flags);
}
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
......@@ -5461,6 +5627,74 @@ static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
/* bpf_tcp_check_syncookie: verify that the ACK described by @iph/@th
 * carries a valid SYN cookie for the listening socket @sk.
 *
 * @sk: must be a TCP socket in TCP_LISTEN state.
 * @iph/@iph_len: start and length of the IPv4 or IPv6 header.
 * @th/@th_len: start and length of the TCP header.
 *
 * Returns 0 when the ACK carries a valid cookie, -ENOENT when it does
 * not (or no recent syncookie overflow happened), -EINVAL on malformed
 * arguments or when syncookies are disabled, -EPROTONOSUPPORT for
 * address families other than IPv4/IPv6, and -ENOTSUPP when the kernel
 * was built without CONFIG_SYN_COOKIES.
 */
BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
u32 cookie;
int ret;
if (unlikely(th_len < sizeof(*th)))
return -EINVAL;
/* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
return -EINVAL;
if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
return -EINVAL;
/* a cookie ACK has exactly the ACK flag set among these three */
if (!th->ack || th->rst || th->syn)
return -ENOENT;
if (tcp_synq_no_recent_overflow(sk))
return -ENOENT;
/* the cookie was sent as the ISN of our SYN-ACK, i.e. ack_seq - 1 */
cookie = ntohl(th->ack_seq) - 1;
switch (sk->sk_family) {
case AF_INET:
if (unlikely(iph_len < sizeof(struct iphdr)))
return -EINVAL;
ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
break;
#if IS_BUILTIN(CONFIG_IPV6)
case AF_INET6:
if (unlikely(iph_len < sizeof(struct ipv6hdr)))
return -EINVAL;
ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
break;
#endif /* CONFIG_IPV6 */
default:
return -EPROTONOSUPPORT;
}
/* __cookie_v{4,6}_check() return a positive mss on success */
if (ret > 0)
return 0;
return -ENOENT;
#else
return -ENOTSUPP;
#endif
}
/* Verifier prototype for the bpf_tcp_check_syncookie() helper. */
static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
.func = bpf_tcp_check_syncookie,
.gpl_only = true,
.pkt_access = true,	/* iph/th may point into packet data */
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_SOCK_COMMON,	/* sk */
.arg2_type = ARG_PTR_TO_MEM,	/* iph */
.arg3_type = ARG_CONST_SIZE,	/* iph_len */
.arg4_type = ARG_PTR_TO_MEM,	/* th */
.arg5_type = ARG_CONST_SIZE,	/* th_len */
};
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
......@@ -5586,6 +5820,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_addr_sk_lookup_udp_proto;
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
case BPF_FUNC_skc_lookup_tcp:
return &bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
default:
return bpf_base_func_proto(func_id);
......@@ -5719,6 +5955,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_tcp_sock_proto;
case BPF_FUNC_get_listener_sock:
return &bpf_get_listener_sock_proto;
case BPF_FUNC_skc_lookup_tcp:
return &bpf_skc_lookup_tcp_proto;
case BPF_FUNC_tcp_check_syncookie:
return &bpf_tcp_check_syncookie_proto;
case BPF_FUNC_skb_ecn_set_ce:
return &bpf_skb_ecn_set_ce_proto;
#endif
default:
return bpf_base_func_proto(func_id);
......@@ -5754,6 +5996,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_sk_lookup_tcp_proto;
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
case BPF_FUNC_skc_lookup_tcp:
return &bpf_xdp_skc_lookup_tcp_proto;
case BPF_FUNC_tcp_check_syncookie:
return &bpf_tcp_check_syncookie_proto;
#endif
default:
return bpf_base_func_proto(func_id);
......@@ -5846,6 +6092,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_lookup_udp_proto;
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
case BPF_FUNC_skc_lookup_tcp:
return &bpf_skc_lookup_tcp_proto;
#endif
default:
return bpf_base_func_proto(func_id);
......
......@@ -44,5 +44,6 @@ xdp_redirect_cpu
xdp_redirect_map
xdp_router_ipv4
xdp_rxq_info
xdp_sample_pkts
xdp_tx_iptunnel
xdpsock
......@@ -1478,13 +1478,27 @@ union bpf_attr {
* Grow or shrink the room for data in the packet associated to
* *skb* by *len_diff*, and according to the selected *mode*.
*
* There is a single supported mode at this time:
* There are two supported modes at this time:
*
* * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
* (room space is added or removed below the layer 2 header).
*
* * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
* (room space is added or removed below the layer 3 header).
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
* The following flags are supported at this time:
*
* * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
* Adjusting mss in this way is not allowed for datagrams.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**:
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**:
* Any new space is reserved to hold a tunnel header.
* Configure skb offsets and other fields accordingly.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**:
* * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
* Use with ENCAP_L3 flags to further specify the tunnel type.
*
* A call to this helper is susceptible to change the underlaying
* packet buffer. Therefore, at load time, all checks on pointers
......@@ -2431,6 +2445,38 @@ union bpf_attr {
* Return
* A **struct bpf_sock** pointer on success, or **NULL** in
* case of failure.
*
* struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
* Description
* Look for TCP socket matching *tuple*, optionally in a child
* network namespace *netns*. The return value must be checked,
* and if non-**NULL**, released via **bpf_sk_release**\ ().
*
* This function is identical to bpf_sk_lookup_tcp, except that it
* also returns timewait or request sockets. Use bpf_sk_fullsock
* or bpf_tcp_sock to access the full structure.
*
* This helper is available only if the kernel was compiled with
* **CONFIG_NET** configuration option.
* Return
* Pointer to **struct bpf_sock**, or **NULL** in case of failure.
* For sockets with reuseport option, the **struct bpf_sock**
* result is from **reuse->socks**\ [] using the hash of the tuple.
*
* int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
* Description
* Check whether iph and th contain a valid SYN cookie ACK for
* the listening socket in sk.
*
* iph points to the start of the IPv4 or IPv6 header, while
* iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr).
*
* th points to the start of the TCP header, while th_len contains
* sizeof(struct tcphdr).
*
* Return
* 0 if iph and th are a valid SYN cookie ACK, or a negative error
* otherwise.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -2531,7 +2577,9 @@ union bpf_attr {
FN(sk_fullsock), \
FN(tcp_sock), \
FN(skb_ecn_set_ce), \
FN(get_listener_sock),
FN(get_listener_sock), \
FN(skc_lookup_tcp), \
FN(tcp_check_syncookie),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -2590,9 +2638,18 @@ enum bpf_func_id {
/* Current network namespace */
#define BPF_F_CURRENT_NETNS (-1L)
/* BPF_FUNC_skb_adjust_room flags. */
#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2)
#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3)
#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4)
/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET,
BPF_ADJ_ROOM_MAC,
};
/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
......
......@@ -30,4 +30,5 @@ test_netcnt
test_section_names
test_tcpnotify_user
test_libbpf
test_tcp_check_syncookie_user
alu32
......@@ -51,7 +51,10 @@ TEST_PROGS := test_kmod.sh \
test_skb_cgroup_id.sh \
test_flow_dissector.sh \
test_xdp_vlan.sh \
test_lwt_ip_encap.sh
test_lwt_ip_encap.sh \
test_tcp_check_syncookie.sh \
test_tc_tunnel.sh \
test_tc_edt.sh
TEST_PROGS_EXTENDED := with_addr.sh \
with_tunnels.sh \
......@@ -60,7 +63,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
# Compile but not part of 'make run_tests'
TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \
flow_dissector_load test_flow_dissector
flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user
include ../lib.mk
......@@ -69,7 +72,7 @@ TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
all: $(TEST_CUSTOM_PROGS)
$(OUTPUT)/urandom_read: $(OUTPUT)/%: %.c
$(CC) -o $@ -static $< -Wl,--build-id
$(CC) -o $@ $< -Wl,--build-id
BPFOBJ := $(OUTPUT)/libbpf.a
......
......@@ -159,6 +159,11 @@ static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx,
int size, unsigned long long netns_id,
unsigned long long flags) =
(void *) BPF_FUNC_sk_lookup_tcp;
static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx,
struct bpf_sock_tuple *tuple,
int size, unsigned long long netns_id,
unsigned long long flags) =
(void *) BPF_FUNC_skc_lookup_tcp;
static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
struct bpf_sock_tuple *tuple,
int size, unsigned long long netns_id,
......@@ -184,6 +189,9 @@ static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) =
(void *) BPF_FUNC_get_listener_sock;
static int (*bpf_skb_ecn_set_ce)(void *ctx) =
(void *) BPF_FUNC_skb_ecn_set_ce;
static int (*bpf_tcp_check_syncookie)(struct bpf_sock *sk,
void *ip, int ip_len, void *tcp, int tcp_len) =
(void *) BPF_FUNC_tcp_check_syncookie;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
......@@ -274,6 +282,9 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode,
#elif defined(__TARGET_ARCH_s930x)
#define bpf_target_s930x
#define bpf_target_defined
#elif defined(__TARGET_ARCH_arm)
#define bpf_target_arm
#define bpf_target_defined
#elif defined(__TARGET_ARCH_arm64)
#define bpf_target_arm64
#define bpf_target_defined
......@@ -296,6 +307,8 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode,
#define bpf_target_x86
#elif defined(__s390x__)
#define bpf_target_s930x
#elif defined(__arm__)
#define bpf_target_arm
#elif defined(__aarch64__)
#define bpf_target_arm64
#elif defined(__mips__)
......@@ -333,6 +346,19 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode,
#define PT_REGS_SP(x) ((x)->gprs[15])
#define PT_REGS_IP(x) ((x)->psw.addr)
#elif defined(bpf_target_arm)
#define PT_REGS_PARM1(x) ((x)->uregs[0])
#define PT_REGS_PARM2(x) ((x)->uregs[1])
#define PT_REGS_PARM3(x) ((x)->uregs[2])
#define PT_REGS_PARM4(x) ((x)->uregs[3])
#define PT_REGS_PARM5(x) ((x)->uregs[4])
#define PT_REGS_RET(x) ((x)->uregs[14])
#define PT_REGS_FP(x) ((x)->uregs[11]) /* Works only with CONFIG_FRAME_POINTER */
#define PT_REGS_RC(x) ((x)->uregs[0])
#define PT_REGS_SP(x) ((x)->uregs[13])
#define PT_REGS_IP(x) ((x)->uregs[12])
#elif defined(bpf_target_arm64)
#define PT_REGS_PARM1(x) ((x)->regs[0])
......
......@@ -23,3 +23,5 @@ CONFIG_LWTUNNEL=y
CONFIG_BPF_STREAM_PARSER=y
CONFIG_XDP_SOCKETS=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_IPV6_TUNNEL=y
CONFIG_IPV6_GRE=y
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
/* Read the kernel's maximum perf sample rate from procfs.
 *
 * Returns the value of /proc/sys/kernel/perf_event_max_sample_rate,
 * or the conservative fallback of 5000 when the file cannot be opened
 * or does not parse as an unsigned integer.
 */
static __u64 read_perf_max_sample_freq(void)
{
	__u64 sample_freq = 5000;	/* fallback to 5000 on error */
	FILE *f;

	f = fopen("/proc/sys/kernel/perf_event_max_sample_rate", "r");
	if (f == NULL)
		return sample_freq;
	/* keep the fallback if the file is empty or malformed; the
	 * original ignored fscanf's result (warn_unused_result).
	 */
	if (fscanf(f, "%llu", &sample_freq) != 1)
		sample_freq = 5000;
	fclose(f);
	return sample_freq;
}
void test_stacktrace_build_id_nmi(void)
{
int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
const char *file = "./test_stacktrace_build_id.o";
int err, pmu_fd, prog_fd;
struct perf_event_attr attr = {
.sample_freq = 5000,
.freq = 1,
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
......@@ -20,6 +32,8 @@ void test_stacktrace_build_id_nmi(void)
int build_id_matches = 0;
int retry = 1;
attr.sample_freq = read_perf_max_sample_freq();
retry:
err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
......
// SPDX-License-Identifier: GPL-2.0
#include <stdint.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"
/* the maximum delay we are willing to add (drop packets beyond that) */
#define TIME_HORIZON_NS (2000 * 1000 * 1000)
#define NS_PER_SEC 1000000000
#define ECN_HORIZON_NS 5000000
#define THROTTLE_RATE_BPS (5 * 1000 * 1000)
/* flow_key => last_tstamp timestamp used */
/* Single-entry hash map shared with throttle_flow(): key 0 maps to the
 * departure timestamp (ns) assigned to the last throttled packet.
 */
struct bpf_map_def SEC("maps") flow_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(uint32_t),	/* flow key; only key 0 is used */
.value_size = sizeof(uint64_t),	/* last scheduled tstamp, in ns */
.max_entries = 1,
};
/* Pace the flow to THROTTLE_RATE_BPS using earliest-departure-time:
 * each packet earns a per-byte delay, the next allowed departure time
 * is tracked in flow_map, and skb->tstamp is set to that time so the
 * qdisc releases the packet then. Packets that would queue for more
 * than TIME_HORIZON_NS are dropped; CE is set once the backlog exceeds
 * ECN_HORIZON_NS.
 */
static inline int throttle_flow(struct __sk_buff *skb)
{
int key = 0;
uint64_t *last_tstamp = bpf_map_lookup_elem(&flow_map, &key);
/* time this packet "costs" at the target rate */
uint64_t delay_ns = ((uint64_t)skb->len) * NS_PER_SEC /
THROTTLE_RATE_BPS;
uint64_t now = bpf_ktime_get_ns();
uint64_t tstamp, next_tstamp = 0;
if (last_tstamp)
next_tstamp = *last_tstamp + delay_ns;
/* never schedule into the past */
tstamp = skb->tstamp;
if (tstamp < now)
tstamp = now;
/* should we throttle? */
if (next_tstamp <= tstamp) {
if (bpf_map_update_elem(&flow_map, &key, &tstamp, BPF_ANY))
return TC_ACT_SHOT;
return TC_ACT_OK;
}
/* do not queue past the time horizon */
if (next_tstamp - now >= TIME_HORIZON_NS)
return TC_ACT_SHOT;
/* set ecn bit, if needed */
if (next_tstamp - now >= ECN_HORIZON_NS)
bpf_skb_ecn_set_ce(skb);
/* BPF_EXIST: the entry must already be there from a prior packet */
if (bpf_map_update_elem(&flow_map, &key, &next_tstamp, BPF_EXIST))
return TC_ACT_SHOT;
skb->tstamp = next_tstamp;
return TC_ACT_OK;
}
/* Throttle TCP traffic destined to port 9000; pass everything else.
 * The (tcp + 1) bounds check is required by the BPF verifier before
 * any field of *tcp may be read.
 */
static inline int handle_tcp(struct __sk_buff *skb, struct tcphdr *tcp)
{
void *data_end = (void *)(long)skb->data_end;
/* drop malformed packets */
if ((void *)(tcp + 1) > data_end)
return TC_ACT_SHOT;
if (tcp->dest == bpf_htons(9000))
return throttle_flow(skb);
return TC_ACT_OK;
}
/* Parse the IPv4 header (honoring IP options via ihl) and hand TCP
 * packets to handle_tcp(). Truncated headers are dropped; non-TCP
 * traffic passes through untouched.
 */
static inline int handle_ipv4(struct __sk_buff *skb)
{
void *data_end = (void *)(long)skb->data_end;
void *data = (void *)(long)skb->data;
struct iphdr *iph;
uint32_t ihl;
/* drop malformed packets */
if (data + sizeof(struct ethhdr) > data_end)
return TC_ACT_SHOT;
iph = (struct iphdr *)(data + sizeof(struct ethhdr));
if ((void *)(iph + 1) > data_end)
return TC_ACT_SHOT;
/* ihl counts 32-bit words; re-check bounds with options included */
ihl = iph->ihl * 4;
if (((void *)iph) + ihl > data_end)
return TC_ACT_SHOT;
if (iph->protocol == IPPROTO_TCP)
return handle_tcp(skb, (struct tcphdr *)(((void *)iph) + ihl));
return TC_ACT_OK;
}
/* TC entry point: only IPv4 is inspected; other protocols pass. */
SEC("cls_test") int tc_prog(struct __sk_buff *skb)
{
	if (skb->protocol == bpf_htons(ETH_P_IP))
		return handle_ipv4(skb);

	return TC_ACT_OK;
}
char __license[] SEC("license") = "GPL";
// SPDX-License-Identifier: GPL-2.0
/* In-place tunneling */
#include <stdbool.h>
#include <string.h>
#include <linux/stddef.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/pkt_cls.h>
#include <linux/types.h>
#include "bpf_endian.h"
#include "bpf_helpers.h"
/* TCP destination port the encap programs act on */
static const int cfg_port = 8000;

/* outer IPv4 + basic GRE header, laid out exactly as on the wire */
struct grev4hdr {
	struct iphdr ip;
	__be16 flags;
	__be16 protocol;
} __attribute__((packed));

/* outer IPv6 + basic GRE header, laid out exactly as on the wire */
struct grev6hdr {
	struct ipv6hdr ip;
	__be16 flags;
	__be16 protocol;
} __attribute__((packed));
/* Compute the IPv4 header checksum in place over the 20-byte header.
 *
 * RFC 1071: sum all 16-bit words, fold the carries back into the low
 * 16 bits, and store the one's complement. The fold is applied twice
 * (branch-free, so BPF-verifier-safe): a single fold of a ten-word sum
 * can itself carry (low part 0xffff plus a non-zero high part), which
 * the previous single-fold version silently dropped, yielding an
 * off-by-one checksum for such headers.
 */
static __always_inline void set_ipv4_csum(struct iphdr *iph)
{
	__u16 *iph16 = (__u16 *)iph;
	__u32 csum;
	int i;

	iph->check = 0;

#pragma clang loop unroll(full)
	for (i = 0, csum = 0; i < sizeof(*iph) >> 1; i++)
		csum += *iph16++;

	/* fold twice: after the first fold the value can still exceed
	 * 16 bits by at most one carry
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	iph->check = ~csum;
}
/* Encapsulate the inner IPv4|TCP packet, in place, in an outer IPv4
 * header (IPIP), optionally with a GRE header in between. Only TCP
 * flows to cfg_port are touched; everything else passes unchanged.
 */
static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre)
{
	struct grev4hdr h_outer;
	struct iphdr iph_inner;
	struct tcphdr tcph;
	__u64 flags;
	int olen;

	if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
			       sizeof(iph_inner)) < 0)
		return TC_ACT_OK;

	/* filter only packets we want */
	if (iph_inner.ihl != 5 || iph_inner.protocol != IPPROTO_TCP)
		return TC_ACT_OK;

	if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner),
			       &tcph, sizeof(tcph)) < 0)
		return TC_ACT_OK;

	if (tcph.dest != __bpf_constant_htons(cfg_port))
		return TC_ACT_OK;

	flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4;
	if (with_gre) {
		flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
		olen = sizeof(h_outer);
	} else {
		olen = sizeof(h_outer.ip);
	}

	/* add room between mac and network header */
	if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
		return TC_ACT_SHOT;

	/* prepare new outer network header: clone the inner header and
	 * grow tot_len by the inserted bytes. tot_len is big endian, so
	 * convert to host order before adding; the previous code used
	 * bpf_htons() for the inner conversion - the same byte swap, but
	 * bpf_ntohs() states the intent and matches encap_ipv6().
	 */
	h_outer.ip = iph_inner;
	h_outer.ip.tot_len = bpf_htons(olen +
				       bpf_ntohs(h_outer.ip.tot_len));
	if (with_gre) {
		h_outer.ip.protocol = IPPROTO_GRE;
		h_outer.protocol = bpf_htons(ETH_P_IP);
		h_outer.flags = 0;
	} else {
		h_outer.ip.protocol = IPPROTO_IPIP;
	}

	set_ipv4_csum((void *)&h_outer.ip);

	/* store new outer network header */
	if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
				BPF_F_INVALIDATE_HASH) < 0)
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}
/* Encapsulate the inner IPv6|TCP packet, in place, in an outer IPv6
 * header, optionally with a GRE header in between. Only TCP flows to
 * cfg_port are touched; everything else passes unchanged.
 */
static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre)
{
	struct ipv6hdr iph_inner;
	struct grev6hdr h_outer;
	struct tcphdr tcph;
	__u64 flags;
	int olen;

	if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
			       sizeof(iph_inner)) < 0)
		return TC_ACT_OK;

	/* filter only packets we want */
	if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner),
			       &tcph, sizeof(tcph)) < 0)
		return TC_ACT_OK;

	if (tcph.dest != __bpf_constant_htons(cfg_port))
		return TC_ACT_OK;

	flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6;
	if (with_gre) {
		flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
		olen = sizeof(h_outer);
	} else {
		olen = sizeof(h_outer.ip);
	}

	/* add room between mac and network header */
	if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
		return TC_ACT_SHOT;

	/* prepare new outer network header: clone the inner header and
	 * grow payload_len (big endian, hence the ntohs/htons round trip)
	 * by the inserted bytes
	 */
	h_outer.ip = iph_inner;
	h_outer.ip.payload_len = bpf_htons(olen +
					   bpf_ntohs(h_outer.ip.payload_len));
	if (with_gre) {
		h_outer.ip.nexthdr = IPPROTO_GRE;
		h_outer.protocol = bpf_htons(ETH_P_IPV6);
		h_outer.flags = 0;
	} else {
		h_outer.ip.nexthdr = IPPROTO_IPV6;
	}

	/* store new outer network header */
	if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
				BPF_F_INVALIDATE_HASH) < 0)
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}
/* IPIP encap: only plain IPv4 traffic is eligible. */
SEC("encap_ipip")
int __encap_ipip(struct __sk_buff *skb)
{
	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
		return TC_ACT_OK;

	return encap_ipv4(skb, false);
}
/* GRE-over-IPv4 encap: only plain IPv4 traffic is eligible. */
SEC("encap_gre")
int __encap_gre(struct __sk_buff *skb)
{
	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
		return TC_ACT_OK;

	return encap_ipv4(skb, true);
}
/* ip6tnl (IPv6-in-IPv6) encap: only IPv6 traffic is eligible. */
SEC("encap_ip6tnl")
int __encap_ip6tnl(struct __sk_buff *skb)
{
	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
		return TC_ACT_OK;

	return encap_ipv6(skb, false);
}
/* GRE-over-IPv6 encap: only IPv6 traffic is eligible. */
SEC("encap_ip6gre")
int __encap_ip6gre(struct __sk_buff *skb)
{
	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
		return TC_ACT_OK;

	return encap_ipv6(skb, true);
}
/* Shrink the packet by the outer encapsulation headers.
 * @off:   offset of the network header (currently unused; kept so the
 *         callers' argument lists stay symmetric)
 * @len:   length of the outer IP header
 * @proto: outer protocol; selects whether a GRE header follows
 *
 * Unknown protocols are passed through untouched. The unused scratch
 * buffer the original carried has been removed.
 */
static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
{
	int olen;

	switch (proto) {
	case IPPROTO_IPIP:
	case IPPROTO_IPV6:
		olen = len;
		break;
	case IPPROTO_GRE:
		olen = len + 4 /* gre hdr */;
		break;
	default:
		return TC_ACT_OK;
	}

	/* negative delta removes the room in front of the inner header */
	if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO))
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}
/* Read the outer IPv4 header and strip the encapsulation. Headers
 * carrying IP options (ihl != 5) are left alone.
 */
static int decap_ipv4(struct __sk_buff *skb)
{
	struct iphdr iph_outer;

	if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer,
			       sizeof(iph_outer)) < 0)
		return TC_ACT_OK;

	if (iph_outer.ihl != 5)
		return TC_ACT_OK;

	return decap_internal(skb, ETH_HLEN, sizeof(iph_outer),
			      iph_outer.protocol);
}
/* Read the outer IPv6 header and strip the encapsulation based on
 * its nexthdr value.
 */
static int decap_ipv6(struct __sk_buff *skb)
{
	struct ipv6hdr iph_outer;

	if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer,
			       sizeof(iph_outer)) < 0)
		return TC_ACT_OK;

	return decap_internal(skb, ETH_HLEN, sizeof(iph_outer),
			      iph_outer.nexthdr);
}
/* Ingress decap entry point: dispatch on the L3 protocol. */
SEC("decap")
int decap_f(struct __sk_buff *skb)
{
	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
		return decap_ipv4(skb);

	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
		return decap_ipv6(skb);

	/* does not match, ignore */
	return TC_ACT_OK;
}
char __license[] SEC("license") = "GPL";
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2018 Facebook
// Copyright (c) 2019 Cloudflare
#include <string.h>
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <sys/socket.h>
#include <linux/tcp.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"
/* results[0] is set to 1 once a packet with a valid SYN cookie has
 * been observed; the userspace harness reads it back
 */
struct bpf_map_def SEC("maps") results = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u64),
	.max_entries = 1,
};
/* Parse an Ethernet/IPv4-or-IPv6/TCP packet, look up its destination
 * socket and, while that socket is still listening, ask the kernel
 * whether the packet's ACK carries a valid SYN cookie. On success,
 * results[0] is set to 1. Shared by the TC and XDP entry points:
 * only data/data_end derived pointers and ctx-agnostic helpers are
 * used, so ctx may be either a __sk_buff or an xdp_md.
 */
static __always_inline void check_syncookie(void *ctx, void *data,
					    void *data_end)
{
	struct bpf_sock_tuple tup;
	struct bpf_sock *sk;
	struct ethhdr *ethh;
	struct iphdr *ipv4h;
	struct ipv6hdr *ipv6h;
	struct tcphdr *tcph;
	int ret;
	__u32 key = 0;
	__u64 value = 1;

	ethh = data;
	if (ethh + 1 > data_end)
		return;

	switch (bpf_ntohs(ethh->h_proto)) {
	case ETH_P_IP:
		ipv4h = data + sizeof(struct ethhdr);
		if (ipv4h + 1 > data_end)
			return;

		/* headers with IP options are not handled */
		if (ipv4h->ihl != 5)
			return;

		tcph = data + sizeof(struct ethhdr) + sizeof(struct iphdr);
		if (tcph + 1 > data_end)
			return;

		tup.ipv4.saddr = ipv4h->saddr;
		tup.ipv4.daddr = ipv4h->daddr;
		tup.ipv4.sport = tcph->source;
		tup.ipv4.dport = tcph->dest;

		/* skc lookup can return a non-full socket, which is all
		 * this check needs
		 */
		sk = bpf_skc_lookup_tcp(ctx, &tup, sizeof(tup.ipv4),
					BPF_F_CURRENT_NETNS, 0);
		if (!sk)
			return;

		if (sk->state != BPF_TCP_LISTEN)
			goto release;

		ret = bpf_tcp_check_syncookie(sk, ipv4h, sizeof(*ipv4h),
					      tcph, sizeof(*tcph));
		break;

	case ETH_P_IPV6:
		ipv6h = data + sizeof(struct ethhdr);
		if (ipv6h + 1 > data_end)
			return;

		/* extension headers are not handled */
		if (ipv6h->nexthdr != IPPROTO_TCP)
			return;

		tcph = data + sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
		if (tcph + 1 > data_end)
			return;

		memcpy(tup.ipv6.saddr, &ipv6h->saddr, sizeof(tup.ipv6.saddr));
		memcpy(tup.ipv6.daddr, &ipv6h->daddr, sizeof(tup.ipv6.daddr));
		tup.ipv6.sport = tcph->source;
		tup.ipv6.dport = tcph->dest;

		sk = bpf_skc_lookup_tcp(ctx, &tup, sizeof(tup.ipv6),
					BPF_F_CURRENT_NETNS, 0);
		if (!sk)
			return;

		if (sk->state != BPF_TCP_LISTEN)
			goto release;

		ret = bpf_tcp_check_syncookie(sk, ipv6h, sizeof(*ipv6h),
					      tcph, sizeof(*tcph));
		break;

	default:
		return;
	}

	/* 0 means the ACK passed syncookie validation */
	if (ret == 0)
		bpf_map_update_elem(&results, &key, &value, 0);

release:
	bpf_sk_release(sk);
}
/* TC entry point: inspect only, never drop - always TC_ACT_OK */
SEC("clsact/check_syncookie")
int check_syncookie_clsact(struct __sk_buff *skb)
{
	check_syncookie(skb, (void *)(long)skb->data,
			(void *)(long)skb->data_end);
	return TC_ACT_OK;
}
/* XDP entry point: inspect only, never drop - always XDP_PASS */
SEC("xdp/check_syncookie")
int check_syncookie_xdp(struct xdp_md *ctx)
{
	check_syncookie(ctx, (void *)(long)ctx->data,
			(void *)(long)ctx->data_end);
	return XDP_PASS;
}
char _license[] SEC("license") = "GPL";
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This test installs a TC bpf program that throttles a TCP flow
# with dst port = 9000 down to 5MBps. Then it measures actual
# throughput of the flow.
# namespaces and qdisc setup require root; bail out early otherwise
if [[ $EUID -ne 0 ]]; then
	echo "This script must be run as root"
	echo "FAIL"
	exit 1
fi

# check that nc, dd, and timeout are present
command -v nc >/dev/null 2>&1 || \
	{ echo >&2 "nc is not available"; exit 1; }
command -v dd >/dev/null 2>&1 || \
	{ echo >&2 "dd is not available"; exit 1; }
command -v timeout >/dev/null 2>&1 || \
	{ echo >&2 "timeout is not available"; exit 1; }
# unique per-run namespace names so concurrent runs don't collide
readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"

readonly IP_SRC="172.16.1.100"
readonly IP_DST="172.16.2.100"

# delete both namespaces (their veth pair goes with them) on any exit
cleanup()
{
	ip netns del ${NS_SRC}
	ip netns del ${NS_DST}
}

trap cleanup EXIT
set -e # exit on error

# wire the two namespaces together with a veth pair
ip netns add "${NS_SRC}"
ip netns add "${NS_DST}"
ip link add veth_src type veth peer name veth_dst
ip link set veth_src netns ${NS_SRC}
ip link set veth_dst netns ${NS_DST}

ip -netns ${NS_SRC} addr add ${IP_SRC}/24 dev veth_src
ip -netns ${NS_DST} addr add ${IP_DST}/24 dev veth_dst
ip -netns ${NS_SRC} link set dev veth_src up
ip -netns ${NS_DST} link set dev veth_dst up

ip -netns ${NS_SRC} route add ${IP_DST}/32 dev veth_src
ip -netns ${NS_DST} route add ${IP_SRC}/32 dev veth_dst

# set up TC on TX: the fq root qdisc honors the EDT timestamps that
# the BPF program writes into skb->tstamp
ip netns exec ${NS_SRC} tc qdisc add dev veth_src root fq
ip netns exec ${NS_SRC} tc qdisc add dev veth_src clsact
ip netns exec ${NS_SRC} tc filter add dev veth_src egress \
	bpf da obj test_tc_edt.o sec cls_test

# start the listener
ip netns exec ${NS_DST} bash -c \
	"nc -4 -l -s ${IP_DST} -p 9000 >/dev/null &"
# NOTE(review): nc is backgrounded inside the inner shell, so $! here
# is not its pid; NC_PID is never used below anyway
declare -i NC_PID=$!
sleep 1

declare -ir TIMEOUT=20
declare -ir EXPECTED_BPS=5000000

# run the load, capture RX bytes on DST
declare -ir RX_BYTES_START=$( ip netns exec ${NS_DST} \
	cat /sys/class/net/veth_dst/statistics/rx_bytes )

# timeout cutting off dd makes the pipeline fail; tolerate that
set +e
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero \
	bs=1000 count=1000000 > /dev/tcp/${IP_DST}/9000 2>/dev/null"
set -e

declare -ir RX_BYTES_END=$( ip netns exec ${NS_DST} \
	cat /sys/class/net/veth_dst/statistics/rx_bytes )

declare -ir ACTUAL_BPS=$(( ($RX_BYTES_END - $RX_BYTES_START) / $TIMEOUT ))

echo $TIMEOUT $ACTUAL_BPS $EXPECTED_BPS | \
	awk '{printf "elapsed: %d sec; bps difference: %.2f%%\n",
		$1, ($2-$3)*100.0/$3}'

# Pass the test if the actual bps is within 1% of the expected bps.
# The difference is usually about 0.1% on a 20-sec test, and ==> zero
# the longer the test runs.
declare -ir RES=$( echo $ACTUAL_BPS $EXPECTED_BPS | \
	awk 'function abs(x){return ((x < 0.0) ? -x : x)}
	     {if (abs(($1-$2)*100.0/$2) > 1.0) { print "1" }
		else { print "0"} }' )
if [ "${RES}" == "0" ] ; then
	echo "PASS"
else
	echo "FAIL"
	exit 1
fi
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# In-place tunneling
# must match the port that the bpf program filters on
# must match the port that the bpf program filters on
readonly port=8000

# per-invocation unique namespace names (the script re-execs itself)
readonly ns_prefix="ns-$$-"
readonly ns1="${ns_prefix}1"
readonly ns2="${ns_prefix}2"

readonly ns1_v4=192.168.1.1
readonly ns2_v4=192.168.1.2
readonly ns1_v6=fd::1
readonly ns2_v6=fd::2

# random payload sent client -> server and compared on receipt
readonly infile="$(mktemp)"
readonly outfile="$(mktemp)"
# build two namespaces connected by a veth pair with v4/v6 addresses,
# and generate the random test payload in ${infile}
setup() {
	ip netns add "${ns1}"
	ip netns add "${ns2}"

	ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \
		peer name veth2 mtu 1500 netns "${ns2}"

	# turn off TSO so large sends become GSO packets that exercise
	# bpf_skb_adjust_room on the segmentation path
	ip netns exec "${ns1}" ethtool -K veth1 tso off

	ip -netns "${ns1}" link set veth1 up
	ip -netns "${ns2}" link set veth2 up

	ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1
	ip -netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2
	ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad
	ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad

	# clamp route to reserve room for tunnel headers
	ip -netns "${ns1}" -4 route flush table main
	ip -netns "${ns1}" -6 route flush table main
	ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1476 dev veth1
	ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1456 dev veth1

	sleep 1

	dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none
}
# tear down namespaces and temporary payload files (EXIT trap)
cleanup() {
	ip netns del "${ns2}"
	ip netns del "${ns1}"

	if [[ -f "${outfile}" ]]; then
		rm "${outfile}"
	fi
	if [[ -f "${infile}" ]]; then
		rm "${infile}"
	fi
}
# background a netcat server in ns2 capturing into ${outfile}; the
# short sleep gives it time to bind before the client connects
server_listen() {
	ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" > "${outfile}" &
	server_pid=$!
	sleep 0.2
}
# send ${infile} from ns1 to the server; echo the exit status so the
# caller can assert success or (with !) expected failure
client_connect() {
	ip netns exec "${ns1}" timeout 2 nc "${netcat_opt}" -w 1 "${addr2}" "${port}" < "${infile}"
	echo $?
}
# wait for the background server, then compare payload checksums
verify_data() {
	wait "${server_pid}"
	# sha1sum returns two fields [sha1] [filepath]
	# convert to bash array and access first elem
	insum=($(sha1sum ${infile}))
	outsum=($(sha1sum ${outfile}))
	if [[ "${insum[0]}" != "${outsum[0]}" ]]; then
		echo "data mismatch"
		exit 1
	fi
}
set -e

# no arguments: automated test, run all
# (the script re-execs itself once per tunnel flavor; the 2000-byte
# runs exceed the clamped MTU and so exercise the GSO path)
if [[ "$#" -eq "0" ]]; then
	echo "ipip"
	$0 ipv4 ipip 100

	echo "ip6ip6"
	$0 ipv6 ip6tnl 100

	echo "ip gre"
	$0 ipv4 gre 100

	echo "ip6 gre"
	$0 ipv6 ip6gre 100

	echo "ip gre gso"
	$0 ipv4 gre 2000

	echo "ip6 gre gso"
	$0 ipv6 ip6gre 2000

	echo "OK. All tests passed"
	exit 0
fi

if [[ "$#" -ne "3" ]]; then
	echo "Usage: $0"
	echo " or: $0 <ipv4|ipv6> <tuntype> <data_len>"
	exit 1
fi
# select per-address-family parameters
case "$1" in
"ipv4")
	readonly addr1="${ns1_v4}"
	readonly addr2="${ns2_v4}"
	readonly netcat_opt=-4
	;;
"ipv6")
	readonly addr1="${ns1_v6}"
	readonly addr2="${ns2_v6}"
	readonly netcat_opt=-6
	;;
*)
	echo "unknown arg: $1"
	exit 1
	;;
esac

readonly tuntype=$2
readonly datalen=$3

echo "encap ${addr1} to ${addr2}, type ${tuntype}, len ${datalen}"
trap cleanup EXIT

setup

# basic communication works
echo "test basic connectivity"
server_listen
client_connect
verify_data

# clientside, insert bpf program to encap all TCP to port ${port}
# client can no longer connect
ip netns exec "${ns1}" tc qdisc add dev veth1 clsact
ip netns exec "${ns1}" tc filter add dev veth1 egress \
	bpf direct-action object-file ./test_tc_tunnel.o \
	section "encap_${tuntype}"
echo "test bpf encap without decap (expect failure)"
server_listen
! client_connect

# serverside, insert decap module
# server is still running
# client can connect again
ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \
	remote "${addr1}" local "${addr2}"
# Because packets are decapped by the tunnel they arrive on testtun0 from
# the IP stack perspective. Ensure reverse path filtering is disabled
# otherwise we drop the TCP SYN as arriving on testtun0 instead of the
# expected veth2 (veth2 is where 192.168.1.2 is configured).
ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0
# rp needs to be disabled for both all and testtun0 as the rp value is
# selected as the max of the "all" and device-specific values.
ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.testtun0.rp_filter=0

ip netns exec "${ns2}" ip link set dev testtun0 up
echo "test bpf encap with tunnel device decap"
client_connect
verify_data

# serverside, use BPF for decap
ip netns exec "${ns2}" ip link del dev testtun0
ip netns exec "${ns2}" tc qdisc add dev veth2 clsact
ip netns exec "${ns2}" tc filter add dev veth2 ingress \
	bpf direct-action object-file ./test_tc_tunnel.o section decap
server_listen
echo "test bpf encap with bpf decap"
client_connect
verify_data

echo OK
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2018 Facebook
# Copyright (c) 2019 Cloudflare
set -eu
# poll address $1 with ping (up to MAX_PING_TRIES attempts, one second
# apart); abort the whole script if it never answers
wait_for_ip()
{
	local _i
	printf "Wait for IP %s to become available " "$1"
	for _i in $(seq ${MAX_PING_TRIES}); do
		printf "."
		if ns1_exec ping -c 1 -W 1 "$1" >/dev/null 2>&1; then
			echo " OK"
			return
		fi
		sleep 1
	done
	echo 1>&2 "ERROR: Timeout waiting for test IP to become available."
	exit 1
}
# filter: extract the numeric prog id from `tc filter show` or
# `ip link show` output (strips everything up to the last " id ",
# then prints the following field)
get_prog_id()
{
	awk '/ id / {sub(/.* id /, "", $0); print($1)}'
}
# run a command inside the test namespace
ns1_exec()
{
	ip netns exec ns1 "$@"
}
# fresh namespace with loopback up; tcp_syncookies=2 makes the kernel
# send a SYN cookie for every SYN, not only under synflood pressure
setup()
{
	ip netns add ns1
	ns1_exec ip link set lo up

	ns1_exec sysctl -w net.ipv4.tcp_syncookies=2

	wait_for_ip 127.0.0.1
	wait_for_ip ::1
}
# best-effort namespace teardown (also invoked from signal traps)
cleanup()
{
	ip netns del ns1 2>/dev/null || :
}
# attach the BPF program first via TC clsact, then via XDP, and run
# the userspace checker against each attachment in turn
main()
{
	trap cleanup EXIT 2 3 6 15

	setup

	printf "Testing clsact..."
	ns1_exec tc qdisc add dev "${TEST_IF}" clsact
	ns1_exec tc filter add dev "${TEST_IF}" ingress \
		bpf obj "${BPF_PROG_OBJ}" sec "${CLSACT_SECTION}" da

	BPF_PROG_ID=$(ns1_exec tc filter show dev "${TEST_IF}" ingress | \
		      get_prog_id)
	ns1_exec "${PROG}" "${BPF_PROG_ID}"
	ns1_exec tc qdisc del dev "${TEST_IF}" clsact

	printf "Testing XDP..."
	ns1_exec ip link set "${TEST_IF}" xdp \
		object "${BPF_PROG_OBJ}" section "${XDP_SECTION}"
	BPF_PROG_ID=$(ns1_exec ip link show "${TEST_IF}" | get_prog_id)
	ns1_exec "${PROG}" "${BPF_PROG_ID}"
}
# configuration: run everything over loopback in a fresh namespace
DIR=$(dirname $0)
TEST_IF=lo
MAX_PING_TRIES=5
BPF_PROG_OBJ="${DIR}/test_tcp_check_syncookie_kern.o"
CLSACT_SECTION="clsact/check_syncookie"
XDP_SECTION="xdp/check_syncookie"
BPF_PROG_ID=0
PROG="${DIR}/test_tcp_check_syncookie_user"

main
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2018 Facebook
// Copyright (c) 2019 Cloudflare
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "bpf_rlimit.h"
#include "cgroup_helpers.h"
/* Create a listening TCP socket bound to @addr.
 * Returns the listening fd, or -1 on error (fd closed on failure).
 */
static int start_server(const struct sockaddr *addr, socklen_t len)
{
	int fd = socket(addr->sa_family, SOCK_STREAM, 0);

	if (fd == -1) {
		log_err("Failed to create server socket");
		return -1;
	}

	if (bind(fd, addr, len) == -1) {
		log_err("Failed to bind server socket");
		close(fd);
		return -1;
	}

	if (listen(fd, 128) == -1) {
		log_err("Failed to listen on server socket");
		close(fd);
		return -1;
	}

	return fd;
}
/* Connect a TCP client socket to the address server_fd is bound to.
 * Returns the connected fd, or -1 on error (fd closed on failure).
 */
static int connect_to_server(int server_fd)
{
	struct sockaddr_storage addr;
	socklen_t len = sizeof(addr);
	int fd;

	/* recover the (loopback, ephemeral port) address of the server */
	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
		log_err("Failed to get server addr");
		return -1;
	}

	fd = socket(addr.ss_family, SOCK_STREAM, 0);
	if (fd == -1) {
		log_err("Failed to create client socket");
		return -1;
	}

	if (connect(fd, (const struct sockaddr *)&addr, len) == -1) {
		log_err("Fail to connect to server");
		close(fd);
		return -1;
	}

	return fd;
}
/* Translate a BPF program id into an fd for the first (and only) map
 * the program uses. Returns the map fd, or -1 on error. The temporary
 * prog fd is always closed before returning.
 */
static int get_map_fd_by_prog_id(int prog_id)
{
	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info);
	__u32 map_ids[1];
	int prog_fd = -1;
	int map_fd = -1;

	prog_fd = bpf_prog_get_fd_by_id(prog_id);
	if (prog_fd < 0) {
		log_err("Failed to get fd by prog id %d", prog_id);
		goto err;
	}

	/* ask the kernel for at most one map id used by the program */
	info.nr_map_ids = 1;
	info.map_ids = (__u64)(unsigned long)map_ids;

	if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) {
		log_err("Failed to get info by prog fd %d", prog_fd);
		goto err;
	}

	if (!info.nr_map_ids) {
		log_err("No maps found for prog fd %d", prog_fd);
		goto err;
	}

	map_fd = bpf_map_get_fd_by_id(map_ids[0]);
	if (map_fd < 0)
		log_err("Failed to get fd by map id %d", map_ids[0]);
err:
	if (prog_fd >= 0)
		close(prog_fd);
	return map_fd;
}
/* Run one loopback connect cycle against server_fd and verify that
 * the attached BPF program recorded a validated SYN cookie in the
 * results map. Returns 0 on success, 1 on failure.
 */
static int run_test(int server_fd, int results_fd)
{
	int client = -1, srv_client = -1;
	int ret = 0;
	__u32 key = 0;
	__u64 value = 0;

	/* reset the result slot so a stale 1 cannot mask a failure */
	if (bpf_map_update_elem(results_fd, &key, &value, 0) < 0) {
		log_err("Can't clear results");
		goto err;
	}

	client = connect_to_server(server_fd);
	if (client == -1)
		goto err;

	srv_client = accept(server_fd, NULL, 0);
	if (srv_client == -1) {
		log_err("Can't accept connection");
		goto err;
	}

	if (bpf_map_lookup_elem(results_fd, &key, &value) < 0) {
		log_err("Can't lookup result");
		goto err;
	}

	/* the BPF program writes 1 on a valid cookie */
	if (value != 1) {
		log_err("Didn't match syncookie: %llu", value);
		goto err;
	}

	goto out;
err:
	ret = 1;
out:
	/* close(-1) on an early failure path is harmless (EBADF) */
	close(client);
	close(srv_client);
	return ret;
}
/* Usage: test_tcp_check_syncookie_user <prog_id>
 *
 * Finds the results map of the already-attached BPF program, then
 * runs one loopback connect test over IPv4 and one over IPv6.
 * Exits 0 only when both observed a valid SYN cookie.
 */
int main(int argc, char **argv)
{
	struct sockaddr_in addr4;
	struct sockaddr_in6 addr6;
	int server = -1;
	int server_v6 = -1;
	int results = -1;
	int err = 0;

	if (argc < 2) {
		fprintf(stderr, "Usage: %s prog_id\n", argv[0]);
		exit(1);
	}

	results = get_map_fd_by_prog_id(atoi(argv[1]));
	if (results < 0) {
		log_err("Can't get map");
		goto err;
	}

	/* bind to an ephemeral port (sin_port = 0) on each loopback */
	memset(&addr4, 0, sizeof(addr4));
	addr4.sin_family = AF_INET;
	addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr4.sin_port = 0;

	memset(&addr6, 0, sizeof(addr6));
	addr6.sin6_family = AF_INET6;
	addr6.sin6_addr = in6addr_loopback;
	addr6.sin6_port = 0;

	server = start_server((const struct sockaddr *)&addr4, sizeof(addr4));
	if (server == -1)
		goto err;

	server_v6 = start_server((const struct sockaddr *)&addr6,
				 sizeof(addr6));
	if (server_v6 == -1)
		goto err;

	if (run_test(server, results))
		goto err;

	if (run_test(server_v6, results))
		goto err;

	printf("ok\n");
	goto out;
err:
	err = 1;
out:
	close(server);
	close(server_v6);
	close(results);
	return err;
}
......@@ -198,7 +198,7 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self)
}
/* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */
#define BPF_SK_LOOKUP \
#define BPF_SK_LOOKUP(func) \
/* struct bpf_sock_tuple tuple = {} */ \
BPF_MOV64_IMM(BPF_REG_2, 0), \
BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), \
......@@ -207,13 +207,13 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self)
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -32), \
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -40), \
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -48), \
/* sk = sk_lookup_tcp(ctx, &tuple, sizeof tuple, 0, 0) */ \
/* sk = func(ctx, &tuple, sizeof tuple, 0, 0) */ \
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48), \
BPF_MOV64_IMM(BPF_REG_3, sizeof(struct bpf_sock_tuple)), \
BPF_MOV64_IMM(BPF_REG_4, 0), \
BPF_MOV64_IMM(BPF_REG_5, 0), \
BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp)
BPF_EMIT_CALL(BPF_FUNC_ ## func)
/* BPF_DIRECT_PKT_R2 contains 7 instructions, it initializes default return
* value into 0 and does necessary preparation for direct packet access
......
......@@ -7,11 +7,19 @@
#define BUF_SIZE 256
static __attribute__((noinline))
void urandom_read(int fd, int count)
{
char buf[BUF_SIZE];
int i;
for (i = 0; i < count; ++i)
read(fd, buf, BUF_SIZE);
}
int main(int argc, char *argv[])
{
int fd = open("/dev/urandom", O_RDONLY);
int i;
char buf[BUF_SIZE];
int count = 4;
if (fd < 0)
......@@ -20,8 +28,7 @@ int main(int argc, char *argv[])
if (argc == 2)
count = atoi(argv[1]);
for (i = 0; i < count; ++i)
read(fd, buf, BUF_SIZE);
urandom_read(fd, count);
close(fd);
return 0;
......
{
"reference tracking: leak potential reference",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.errstr = "Unreleased reference",
.result = REJECT,
},
{
"reference tracking: leak potential reference to sock_common",
.insns = {
BPF_SK_LOOKUP(skc_lookup_tcp),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */
BPF_EXIT_INSN(),
},
......@@ -12,7 +23,7 @@
{
"reference tracking: leak potential reference on stack",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
......@@ -26,7 +37,7 @@
{
"reference tracking: leak potential reference on stack 2",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
......@@ -41,7 +52,18 @@
{
"reference tracking: zero potential reference",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference */
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.errstr = "Unreleased reference",
.result = REJECT,
},
{
"reference tracking: zero potential reference to sock_common",
.insns = {
BPF_SK_LOOKUP(skc_lookup_tcp),
BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference */
BPF_EXIT_INSN(),
},
......@@ -52,7 +74,7 @@
{
"reference tracking: copy and zero potential references",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_MOV64_IMM(BPF_REG_7, 0), /* leak reference */
......@@ -65,7 +87,7 @@
{
"reference tracking: release reference without check",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
/* reference in r0 may be NULL */
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_MOV64_IMM(BPF_REG_2, 0),
......@@ -76,10 +98,36 @@
.errstr = "type=sock_or_null expected=sock",
.result = REJECT,
},
{
"reference tracking: release reference to sock_common without check",
.insns = {
BPF_SK_LOOKUP(skc_lookup_tcp),
/* reference in r0 may be NULL */
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_MOV64_IMM(BPF_REG_2, 0),
BPF_EMIT_CALL(BPF_FUNC_sk_release),
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.errstr = "type=sock_common_or_null expected=sock",
.result = REJECT,
},
{
"reference tracking: release reference",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
BPF_EMIT_CALL(BPF_FUNC_sk_release),
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.result = ACCEPT,
},
{
"reference tracking: release reference to sock_common",
.insns = {
BPF_SK_LOOKUP(skc_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
BPF_EMIT_CALL(BPF_FUNC_sk_release),
......@@ -91,7 +139,7 @@
{
"reference tracking: release reference 2",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
BPF_EXIT_INSN(),
......@@ -104,7 +152,7 @@
{
"reference tracking: release reference twice",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
......@@ -120,7 +168,7 @@
{
"reference tracking: release reference twice inside branch",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* goto end */
......@@ -147,7 +195,7 @@
BPF_EXIT_INSN(),
BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2,
offsetof(struct __sk_buff, mark)),
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 1), /* mark == 0? */
/* Leak reference in R0 */
BPF_EXIT_INSN(),
......@@ -175,7 +223,7 @@
BPF_EXIT_INSN(),
BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2,
offsetof(struct __sk_buff, mark)),
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4), /* mark == 0? */
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* sk NULL? */
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
......@@ -193,7 +241,7 @@
{
"reference tracking in call: free reference in subprog",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 0),
......@@ -211,7 +259,7 @@
{
"reference tracking in call: free reference in subprog and outside",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
......@@ -241,7 +289,7 @@
/* subprog 1 */
BPF_MOV64_REG(BPF_REG_6, BPF_REG_4),
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
/* spill unchecked sk_ptr into stack of caller */
BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
......@@ -262,7 +310,7 @@
BPF_EXIT_INSN(),
/* subprog 1 */
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_EXIT_INSN(), /* return sk */
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
......@@ -291,7 +339,7 @@
BPF_EXIT_INSN(),
/* subprog 2 */
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
......@@ -324,7 +372,7 @@
BPF_EXIT_INSN(),
/* subprog 2 */
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
......@@ -334,7 +382,7 @@
"reference tracking: allow LD_ABS",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
BPF_EMIT_CALL(BPF_FUNC_sk_release),
......@@ -350,7 +398,7 @@
"reference tracking: forbid LD_ABS while holding reference",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_LD_ABS(BPF_B, 0),
BPF_LD_ABS(BPF_H, 0),
BPF_LD_ABS(BPF_W, 0),
......@@ -367,7 +415,7 @@
"reference tracking: allow LD_IND",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
BPF_EMIT_CALL(BPF_FUNC_sk_release),
......@@ -384,7 +432,7 @@
"reference tracking: forbid LD_IND while holding reference",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_4, BPF_REG_0),
BPF_MOV64_IMM(BPF_REG_7, 1),
BPF_LD_IND(BPF_W, BPF_REG_7, -0x200000),
......@@ -402,7 +450,7 @@
"reference tracking: check reference or tail call",
.insns = {
BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
/* if (sk) bpf_sk_release() */
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 7),
......@@ -424,7 +472,7 @@
"reference tracking: release reference then tail call",
.insns = {
BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
/* if (sk) bpf_sk_release() */
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
......@@ -446,7 +494,7 @@
.insns = {
BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
/* Look up socket and store in REG_6 */
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
/* bpf_tail_call() */
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
BPF_MOV64_IMM(BPF_REG_3, 2),
......@@ -470,7 +518,7 @@
.insns = {
BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
/* Look up socket and store in REG_6 */
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
/* if (!sk) goto end */
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
......@@ -492,7 +540,7 @@
{
"reference tracking: mangle and release sock_or_null",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 5),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
......@@ -506,7 +554,7 @@
{
"reference tracking: mangle and release sock",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 5),
......@@ -520,7 +568,7 @@
{
"reference tracking: access member",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_0, 4),
......@@ -534,7 +582,7 @@
{
"reference tracking: write to member",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
......@@ -553,7 +601,7 @@
{
"reference tracking: invalid 64-bit access of member",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
......@@ -568,7 +616,7 @@
{
"reference tracking: access after release",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
BPF_EMIT_CALL(BPF_FUNC_sk_release),
......@@ -608,7 +656,7 @@
{
"reference tracking: use ptr from bpf_tcp_sock() after release",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
BPF_EXIT_INSN(),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
......@@ -631,7 +679,7 @@
{
"reference tracking: use ptr from bpf_sk_fullsock() after release",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
BPF_EXIT_INSN(),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
......@@ -654,7 +702,7 @@
{
"reference tracking: use ptr from bpf_sk_fullsock(tp) after release",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
BPF_EXIT_INSN(),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
......@@ -681,7 +729,7 @@
{
"reference tracking: use sk after bpf_sk_release(tp)",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
BPF_EXIT_INSN(),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
......@@ -703,7 +751,7 @@
{
"reference tracking: use ptr from bpf_get_listener_sock() after bpf_sk_release(sk)",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
BPF_EXIT_INSN(),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
......@@ -725,7 +773,7 @@
{
"reference tracking: bpf_sk_release(listen_sk)",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
BPF_EXIT_INSN(),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
......@@ -750,7 +798,7 @@
/* !bpf_sk_fullsock(sk) is checked but !bpf_tcp_sock(sk) is not checked */
"reference tracking: tp->snd_cwnd after bpf_sk_fullsock(sk) and bpf_tcp_sock(sk)",
.insns = {
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
BPF_EXIT_INSN(),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
......
......@@ -242,7 +242,7 @@
.insns = {
BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
/* struct bpf_sock *sock = bpf_sock_lookup(...); */
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
/* u64 foo; */
/* void *target = &foo; */
......@@ -276,7 +276,7 @@
.insns = {
BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
/* struct bpf_sock *sock = bpf_sock_lookup(...); */
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
/* u64 foo; */
/* void *target = &foo; */
......@@ -307,7 +307,7 @@
.insns = {
BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
/* struct bpf_sock *sock = bpf_sock_lookup(...); */
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
/* u64 foo; */
/* void *target = &foo; */
......@@ -339,7 +339,7 @@
.insns = {
BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
/* struct bpf_sock *sock = bpf_sock_lookup(...); */
BPF_SK_LOOKUP,
BPF_SK_LOOKUP(sk_lookup_tcp),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
/* u64 foo; */
/* void *target = &foo; */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment