Commit b92d44b5 authored by Alexei Starovoitov

Merge branch 'expand-cg_skb-helpers'

Andrey Ignatov says:

====================
v2->v3:
- better documentation for bpf_sk_cgroup_id in uapi (Yonghong Song)
- save/restore errno in network helpers (Yonghong Song)
- cleanup leftover after switching selftest to skeleton (Yonghong Song)
- switch from map to skel->bss in selftest (Yonghong Song)

v1->v2:
- switch selftests to skeleton.

This patch set allows a bunch of existing sk lookup and skb cgroup id
helpers, and adds two new bpf_sk_{,ancestor_}cgroup_id helpers to be used
in cgroup skb programs.

It fills the gap to cover a use-case to apply intra-host cgroup-bpf network
policy based on a source cgroup a packet comes from.

For example, there can be multiple containers A, B, C running on a host.
Every such container runs in its own cgroup that can have multiple
sub-cgroups. But all these containers can share some IP addresses.

At the same time container A wants to have a policy for a server S running
in it so that only clients from this same container can connect to S, but
not from other containers (such as B, C). Source IP address can't be used
to decide whether to allow or deny a packet, but it looks reasonable to
filter by cgroup id.

The patch set makes it possible to implement the following policy:
* when an ingress packet comes to container's cgroup, lookup peer (client)
  socket this packet comes from;
* having peer socket, get its cgroup id;
* compare peer cgroup id with self cgroup id and allow packet only if they
  match, i.e. it comes from same cgroup;
* the "sub-cgroup" part of the story can be addressed by getting not direct
  cgroup id of the peer socket, but ancestor cgroup id on specified level,
  similar to existing "ancestor" flavors of cgroup id helpers.

A newly introduced selftest implements such a policy in its basic form to
provide a better idea on the use-case.

Patch 1 allows existing sk lookup helpers in cgroup skb.
Patch 2 allows skb_ancestor_cgroup_id in cgroup skb.
Patch 3 introduces two new helpers to get cgroup id of socket.
Patch 4 extends network helpers to use them in the next patch.
Patch 5 adds selftest / example of use-case.
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 5b0004d9 68e916bc
...@@ -3121,6 +3121,38 @@ union bpf_attr { ...@@ -3121,6 +3121,38 @@ union bpf_attr {
* 0 on success, or a negative error in case of failure: * 0 on success, or a negative error in case of failure:
* *
* **-EOVERFLOW** if an overflow happened: The same object will be tried again. * **-EOVERFLOW** if an overflow happened: The same object will be tried again.
*
* u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
* Description
* Return the cgroup v2 id of the socket *sk*.
*
* *sk* must be a non-**NULL** pointer to a full socket, e.g. one
* returned from **bpf_sk_lookup_xxx**\ (),
* **bpf_sk_fullsock**\ (), etc. The format of returned id is
* same as in **bpf_skb_cgroup_id**\ ().
*
* This helper is available only if the kernel was compiled with
* the **CONFIG_SOCK_CGROUP_DATA** configuration option.
* Return
* The id is returned or 0 in case the id could not be retrieved.
*
* u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
* Description
* Return id of cgroup v2 that is ancestor of cgroup associated
* with the *sk* at the *ancestor_level*. The root cgroup is at
* *ancestor_level* zero and each step down the hierarchy
* increments the level. If *ancestor_level* == level of cgroup
* associated with *sk*, then return value will be same as that
* of **bpf_sk_cgroup_id**\ ().
*
* The helper is useful to implement policies based on cgroups
* that are upper in hierarchy than immediate cgroup associated
* with *sk*.
*
* The format of returned id and helper limitations are same as in
* **bpf_sk_cgroup_id**\ ().
* Return
* The id is returned or 0 in case the id could not be retrieved.
*/ */
#define __BPF_FUNC_MAPPER(FN) \ #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \ FN(unspec), \
...@@ -3250,7 +3282,9 @@ union bpf_attr { ...@@ -3250,7 +3282,9 @@ union bpf_attr {
FN(sk_assign), \ FN(sk_assign), \
FN(ktime_get_boot_ns), \ FN(ktime_get_boot_ns), \
FN(seq_printf), \ FN(seq_printf), \
FN(seq_write), FN(seq_write), \
FN(sk_cgroup_id), \
FN(sk_ancestor_cgroup_id),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper /* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call * function eBPF program intends to call
......
...@@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { ...@@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
}; };
#ifdef CONFIG_SOCK_CGROUP_DATA #ifdef CONFIG_SOCK_CGROUP_DATA
/* Return the id of the cgroup referenced by @sk's sk_cgrp_data.
 *
 * This function performs no NULL or full-socket check on @sk itself.
 */
static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
	return cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data));
}
BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
{ {
struct sock *sk = skb_to_full_sk(skb); struct sock *sk = skb_to_full_sk(skb);
struct cgroup *cgrp;
if (!sk || !sk_fullsock(sk)) if (!sk || !sk_fullsock(sk))
return 0; return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return __bpf_sk_cgroup_id(sk);
return cgroup_id(cgrp);
} }
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
...@@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { ...@@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX, .arg1_type = ARG_PTR_TO_CTX,
}; };
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
ancestor_level) int ancestor_level)
{ {
struct sock *sk = skb_to_full_sk(skb);
struct cgroup *ancestor; struct cgroup *ancestor;
struct cgroup *cgrp; struct cgroup *cgrp;
if (!sk || !sk_fullsock(sk))
return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
ancestor = cgroup_ancestor(cgrp, ancestor_level); ancestor = cgroup_ancestor(cgrp, ancestor_level);
if (!ancestor) if (!ancestor)
...@@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ...@@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
return cgroup_id(ancestor); return cgroup_id(ancestor);
} }
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
ancestor_level)
{
struct sock *sk = skb_to_full_sk(skb);
if (!sk || !sk_fullsock(sk))
return 0;
return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
}
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.func = bpf_skb_ancestor_cgroup_id, .func = bpf_skb_ancestor_cgroup_id,
.gpl_only = false, .gpl_only = false,
...@@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { ...@@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX, .arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING,
}; };
/* bpf_sk_cgroup_id() helper body: passes @sk straight to
 * __bpf_sk_cgroup_id() and returns its result. No NULL or full-socket check
 * is made in this function.
 * NOTE(review): presumably the ARG_PTR_TO_SOCKET argument type in the
 * matching proto is what makes the unchecked dereference safe - confirm.
 */
BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
{
return __bpf_sk_cgroup_id(sk);
}
/* Prototype describing bpf_sk_cgroup_id(): not GPL-only, returns an integer,
 * takes one argument of type ARG_PTR_TO_SOCKET.
 */
static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
.func = bpf_sk_cgroup_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_SOCKET,
};
/* bpf_sk_ancestor_cgroup_id() helper body: forwards @sk and @ancestor_level
 * to __bpf_sk_ancestor_cgroup_id() and returns its result. No NULL or
 * full-socket check is made in this function.
 */
BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
{
return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
}
/* Prototype describing bpf_sk_ancestor_cgroup_id(): not GPL-only, returns an
 * integer, takes an ARG_PTR_TO_SOCKET first argument and an unconstrained
 * (ARG_ANYTHING) second argument, the ancestor level.
 */
static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
.func = bpf_sk_ancestor_cgroup_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_SOCKET,
.arg2_type = ARG_ANYTHING,
};
#endif #endif
static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
...@@ -6157,8 +6195,22 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ...@@ -6157,8 +6195,22 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#ifdef CONFIG_SOCK_CGROUP_DATA #ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_cgroup_id: case BPF_FUNC_skb_cgroup_id:
return &bpf_skb_cgroup_id_proto; return &bpf_skb_cgroup_id_proto;
case BPF_FUNC_skb_ancestor_cgroup_id:
return &bpf_skb_ancestor_cgroup_id_proto;
case BPF_FUNC_sk_cgroup_id:
return &bpf_sk_cgroup_id_proto;
case BPF_FUNC_sk_ancestor_cgroup_id:
return &bpf_sk_ancestor_cgroup_id_proto;
#endif #endif
#ifdef CONFIG_INET #ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sk_lookup_tcp_proto;
case BPF_FUNC_sk_lookup_udp:
return &bpf_sk_lookup_udp_proto;
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
case BPF_FUNC_skc_lookup_tcp:
return &bpf_skc_lookup_tcp_proto;
case BPF_FUNC_tcp_sock: case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto; return &bpf_tcp_sock_proto;
case BPF_FUNC_get_listener_sock: case BPF_FUNC_get_listener_sock:
......
...@@ -3121,6 +3121,38 @@ union bpf_attr { ...@@ -3121,6 +3121,38 @@ union bpf_attr {
* 0 on success, or a negative error in case of failure: * 0 on success, or a negative error in case of failure:
* *
* **-EOVERFLOW** if an overflow happened: The same object will be tried again. * **-EOVERFLOW** if an overflow happened: The same object will be tried again.
*
* u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
* Description
* Return the cgroup v2 id of the socket *sk*.
*
* *sk* must be a non-**NULL** pointer to a full socket, e.g. one
* returned from **bpf_sk_lookup_xxx**\ (),
* **bpf_sk_fullsock**\ (), etc. The format of returned id is
* same as in **bpf_skb_cgroup_id**\ ().
*
* This helper is available only if the kernel was compiled with
* the **CONFIG_SOCK_CGROUP_DATA** configuration option.
* Return
* The id is returned or 0 in case the id could not be retrieved.
*
* u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
* Description
* Return id of cgroup v2 that is ancestor of cgroup associated
* with the *sk* at the *ancestor_level*. The root cgroup is at
* *ancestor_level* zero and each step down the hierarchy
* increments the level. If *ancestor_level* == level of cgroup
* associated with *sk*, then return value will be same as that
* of **bpf_sk_cgroup_id**\ ().
*
* The helper is useful to implement policies based on cgroups
* that are upper in hierarchy than immediate cgroup associated
* with *sk*.
*
* The format of returned id and helper limitations are same as in
* **bpf_sk_cgroup_id**\ ().
* Return
* The id is returned or 0 in case the id could not be retrieved.
*/ */
#define __BPF_FUNC_MAPPER(FN) \ #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \ FN(unspec), \
...@@ -3250,7 +3282,9 @@ union bpf_attr { ...@@ -3250,7 +3282,9 @@ union bpf_attr {
FN(sk_assign), \ FN(sk_assign), \
FN(ktime_get_boot_ns), \ FN(ktime_get_boot_ns), \
FN(seq_printf), \ FN(seq_printf), \
FN(seq_write), FN(seq_write), \
FN(sk_cgroup_id), \
FN(sk_ancestor_cgroup_id),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper /* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call * function eBPF program intends to call
......
...@@ -4,10 +4,14 @@ ...@@ -4,10 +4,14 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <unistd.h> #include <unistd.h>
#include <sys/epoll.h>
#include <linux/err.h> #include <linux/err.h>
#include <linux/in.h> #include <linux/in.h>
#include <linux/in6.h> #include <linux/in6.h>
#include "bpf_util.h"
#include "network_helpers.h" #include "network_helpers.h"
#define clean_errno() (errno == 0 ? "None" : strerror(errno)) #define clean_errno() (errno == 0 ? "None" : strerror(errno))
...@@ -77,9 +81,7 @@ static const size_t timeo_optlen = sizeof(timeo_sec); ...@@ -77,9 +81,7 @@ static const size_t timeo_optlen = sizeof(timeo_sec);
int connect_to_fd(int family, int type, int server_fd) int connect_to_fd(int family, int type, int server_fd)
{ {
struct sockaddr_storage addr; int fd, save_errno;
socklen_t len = sizeof(addr);
int fd;
fd = socket(family, type, 0); fd = socket(family, type, 0);
if (fd < 0) { if (fd < 0) {
...@@ -87,24 +89,70 @@ int connect_to_fd(int family, int type, int server_fd) ...@@ -87,24 +89,70 @@ int connect_to_fd(int family, int type, int server_fd)
return -1; return -1;
} }
if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, timeo_optlen)) { if (connect_fd_to_fd(fd, server_fd) < 0 && errno != EINPROGRESS) {
save_errno = errno;
close(fd);
errno = save_errno;
return -1;
}
return fd;
}
int connect_fd_to_fd(int client_fd, int server_fd)
{
struct sockaddr_storage addr;
socklen_t len = sizeof(addr);
int save_errno;
if (setsockopt(client_fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
timeo_optlen)) {
log_err("Failed to set SO_RCVTIMEO"); log_err("Failed to set SO_RCVTIMEO");
goto out; return -1;
} }
if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
log_err("Failed to get server addr"); log_err("Failed to get server addr");
goto out; return -1;
} }
if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { if (connect(client_fd, (const struct sockaddr *)&addr, len) < 0) {
log_err("Fail to connect to server with family %d", family); if (errno != EINPROGRESS) {
goto out; save_errno = errno;
log_err("Failed to connect to server");
errno = save_errno;
}
return -1;
} }
return fd; return 0;
}
out: int connect_wait(int fd)
close(fd); {
struct epoll_event ev = {}, events[2];
int timeout_ms = 1000;
int efd, nfd;
efd = epoll_create1(EPOLL_CLOEXEC);
if (efd < 0) {
log_err("Failed to open epoll fd");
return -1;
}
ev.events = EPOLLRDHUP | EPOLLOUT;
ev.data.fd = fd;
if (epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev) < 0) {
log_err("Failed to register fd=%d on epoll fd=%d", fd, efd);
close(efd);
return -1; return -1;
}
nfd = epoll_wait(efd, events, ARRAY_SIZE(events), timeout_ms);
if (nfd < 0)
log_err("Failed to wait for I/O event on epoll fd=%d", efd);
close(efd);
return nfd;
} }
...@@ -35,5 +35,7 @@ extern struct ipv6_packet pkt_v6; ...@@ -35,5 +35,7 @@ extern struct ipv6_packet pkt_v6;
int start_server(int family, int type); int start_server(int family, int type);
int connect_to_fd(int family, int type, int server_fd); int connect_to_fd(int family, int type, int server_fd);
int connect_fd_to_fd(int client_fd, int server_fd);
int connect_wait(int client_fd);
#endif #endif
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2020 Facebook
#include <test_progs.h>
#include "network_helpers.h"
#include "cgroup_skb_sk_lookup_kern.skel.h"
/* Start an IPv6 TCP server and check the cgroup ingress policy against it.
 *
 * @g_serv_port: location that receives the server's port exactly as stored in
 *               sockaddr_in6.sin6_port; the caller passes the BPF program's
 *               global so the program learns which port to police.
 * @out_sk:      client socket supplied by the caller. Its connect attempt is
 *               required to fail with EINPROGRESS and then see no events, so
 *               it is expected to be non-blocking.
 *
 * Failures are reported through CHECK(); nothing is returned.
 */
static void run_lookup_test(__u16 *g_serv_port, int out_sk)
{
	int serv_sk = -1, in_sk = -1, serv_in_sk = -1, err;
	struct sockaddr_in6 addr = {};
	socklen_t addr_len = sizeof(addr);
	__u32 duration = 0;	/* presumably consumed by the CHECK() macro */

	serv_sk = start_server(AF_INET6, SOCK_STREAM);
	if (CHECK(serv_sk < 0, "start_server", "failed to start server\n"))
		return;

	err = getsockname(serv_sk, (struct sockaddr *)&addr, &addr_len);
	if (CHECK(err, "getsockname", "errno %d\n", errno))
		goto cleanup;

	*g_serv_port = addr.sin6_port;

	/* Client outside of test cgroup should fail to connect by timeout. */
	err = connect_fd_to_fd(out_sk, serv_sk);
	if (CHECK(!err || errno != EINPROGRESS, "connect_fd_to_fd",
		  "unexpected result err %d errno %d\n", err, errno))
		goto cleanup;

	/* Zero ready events means the wait timed out, which is the expected
	 * outcome for the rejected client.
	 */
	err = connect_wait(out_sk);
	if (CHECK(err, "connect_wait", "unexpected result %d\n", err))
		goto cleanup;

	/* Client inside test cgroup should connect just fine. */
	in_sk = connect_to_fd(AF_INET6, SOCK_STREAM, serv_sk);
	if (CHECK(in_sk < 0, "connect_to_fd", "errno %d\n", errno))
		goto cleanup;

	serv_in_sk = accept(serv_sk, NULL, NULL);
	if (CHECK(serv_in_sk < 0, "accept", "errno %d\n", errno))
		goto cleanup;

cleanup:
	/* Early failures jump here with some descriptors still at -1; skip
	 * those instead of issuing close(-1). serv_sk is always valid here.
	 */
	if (serv_in_sk >= 0)
		close(serv_in_sk);
	if (in_sk >= 0)
		close(in_sk);
	close(serv_sk);
}
/* Load the BPF skeleton, join the test cgroup, attach the ingress program to
 * it and run the connection checks.
 *
 * @cg_path: path of the cgroup to join and attach the program to.
 * @out_sk:  client socket handed through to run_lookup_test().
 *
 * Failures are reported through CHECK(); nothing is returned.
 */
static void run_cgroup_bpf_test(const char *cg_path, int out_sk)
{
	struct cgroup_skb_sk_lookup_kern *skel;
	struct bpf_link *link;
	__u32 duration = 0;	/* presumably consumed by the CHECK() macro */
	int cgfd = -1;

	skel = cgroup_skb_sk_lookup_kern__open_and_load();
	if (CHECK(!skel, "skel_open_load", "open_load failed\n"))
		return;

	cgfd = test__join_cgroup(cg_path);
	if (CHECK(cgfd < 0, "cgroup_join", "cgroup setup failed\n"))
		goto cleanup;

	link = bpf_program__attach_cgroup(skel->progs.ingress_lookup, cgfd);
	if (CHECK(IS_ERR(link), "cgroup_attach", "err: %ld\n", PTR_ERR(link)))
		goto cleanup;

	/* The program reads the server port from its global g_serv_port. */
	run_lookup_test(&skel->bss->g_serv_port, out_sk);

	bpf_link__destroy(link);

cleanup:
	/* cgfd is still -1 when joining the cgroup failed; skip close(-1). */
	if (cgfd >= 0)
		close(cgfd);
	cgroup_skb_sk_lookup_kern__destroy(skel);
}
/* Test entry point: open the "outside" client socket, run the cgroup test
 * against cgroup "/foo", then close the socket again.
 */
void test_cgroup_skb_sk_lookup(void)
{
	/* The socket has to exist before the process joins the testing cgroup:
	 * moving the selftests process there later does not change the cgroup
	 * id of an already created socket, so this one keeps a cgroup id that
	 * differs from that of the testing cgroup.
	 */
	int out_sk = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0);

	if (CHECK_FAIL(out_sk < 0))
		return;

	run_cgroup_bpf_test("/foo", out_sk);
	close(out_sk);
}
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2020 Facebook
#include <linux/bpf.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <sys/types.h>
#include <sys/socket.h>
/* Program version, placed in the "version" section via SEC(). */
int _version SEC("version") = 1;
/* License string, placed in the "license" section via SEC(). */
char _license[] SEC("license") = "GPL";
/* TCP port of the server under test. It stays zero until user space writes it
 * through skel->bss, and ingress_lookup() compares it directly against
 * tcph.dest, so it is kept in the same byte order as the TCP header field.
 */
__u16 g_serv_port = 0;
/* Copy the four 32-bit words of the IPv6 address @src into @dst, unchanged
 * and in order. @dst must have room for four __u32 values.
 */
static inline void set_ip(__u32 *dst, const struct in6_addr *src)
{
	const __be32 *words = src->in6_u.u6_addr32;

	dst[0] = words[0];
	dst[1] = words[1];
	dst[2] = words[2];
	dst[3] = words[3];
}
/* Fill the IPv6 part of @tuple from a received packet's headers, with the two
 * endpoints swapped: the packet's destination address/port become the tuple's
 * source, and the packet's source address/port become the tuple's destination.
 */
static inline void set_tuple(struct bpf_sock_tuple *tuple,
			     const struct ipv6hdr *ip6h,
			     const struct tcphdr *tcph)
{
	/* Ports: copied as found in the TCP header, no byte swapping. */
	tuple->ipv6.dport = tcph->source;
	tuple->ipv6.sport = tcph->dest;

	/* Addresses. */
	set_ip(tuple->ipv6.daddr, &ip6h->saddr);
	set_ip(tuple->ipv6.saddr, &ip6h->daddr);
}
/* Decide whether the peer that sent @skb belongs to the same cgroup as @skb.
 *
 * The peer's TCP socket is looked up in the current netns using the swapped
 * tuple built from @ip6h / @tcph. Both the direct cgroup id and the ancestor
 * cgroup id at level 2 are then compared between @skb and the peer socket.
 *
 * Return: 1 when both ids are non-zero and match, 0 otherwise (including when
 * no peer socket is found).
 */
static inline int is_allowed_peer_cg(struct __sk_buff *skb,
				     const struct ipv6hdr *ip6h,
				     const struct tcphdr *tcph)
{
	__u64 own_id, own_anc_id, peer_id, peer_anc_id;
	struct bpf_sock_tuple tuple;
	struct bpf_sock *peer_sk;

	set_tuple(&tuple, ip6h, tcph);

	peer_sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv6),
				    BPF_F_CURRENT_NETNS, 0);
	if (!peer_sk)
		return 0;

	own_id = bpf_skb_cgroup_id(skb);
	peer_id = bpf_sk_cgroup_id(peer_sk);

	own_anc_id = bpf_skb_ancestor_cgroup_id(skb, 2);
	peer_anc_id = bpf_sk_ancestor_cgroup_id(peer_sk, 2);

	/* The looked-up socket reference is dropped before the verdict. */
	bpf_sk_release(peer_sk);

	if (!own_id || own_id != peer_id)
		return 0;

	return own_anc_id && own_anc_id == peer_anc_id;
}
/* cgroup skb ingress program.
 *
 * Returns 1 for anything that is not IPv6/TCP addressed to g_serv_port, and
 * for packets whose headers cannot be loaded. Returns 0 for IPv6/TCP packets
 * while g_serv_port is still unset. For packets to g_serv_port the verdict is
 * that of is_allowed_peer_cg().
 * NOTE(review): 1 is presumably "let the packet through" and 0 "drop it", as
 * the test expects rejected clients to time out - confirm.
 */
SEC("cgroup_skb/ingress")
int ingress_lookup(struct __sk_buff *skb)
{
	struct ipv6hdr ip6h;
	struct tcphdr tcph;

	if (skb->protocol != bpf_htons(ETH_P_IPV6))
		return 1;

	/* For SYN packets coming to listening socket skb->remote_port will be
	 * zero, so IPv6/TCP headers are loaded to identify remote peer
	 * instead.
	 */
	if (bpf_skb_load_bytes(skb, 0, &ip6h, sizeof(ip6h)))
		return 1;

	if (ip6h.nexthdr != IPPROTO_TCP)
		return 1;

	/* The TCP header is read right after the fixed-size IPv6 header. */
	if (bpf_skb_load_bytes(skb, sizeof(ip6h), &tcph, sizeof(tcph)))
		return 1;

	if (!g_serv_port)
		return 0;

	if (tcph.dest != g_serv_port)
		return 1;

	return is_allowed_peer_cg(skb, &ip6h, &tcph);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment