Commit cf51abcd authored by Alexei Starovoitov

Merge branch 'Link-based-attach-to-netns'

Jakub Sitnicki says:

====================
One of the pieces of feedback from recent review of BPF hooks for socket
lookup [0] was that new program types should use bpf_link-based
attachment.

This series introduces new bpf_link type for attaching to network
namespace. All link operations are supported. Errors returned from ops
follow cgroup example. Patch 4 description goes into error semantics.

The major change in v2 is a switch away from RCU to mutex-only
synchronization. Andrii pointed out that RCU is not needed here, and it
makes sense to keep locking straightforward.

Also, there were a couple of bugs in update_prog and fill_info initial
implementation, one picked up by kbuild. Those are now fixed. Tests have
been extended to cover them. Full changelog below.

The series is organized as follows:

Patches 1-3 prepare a space in struct net to keep state for attached BPF
programs, and massage the code in flow_dissector to make it attach type
agnostic, to finally move it under kernel/bpf/.

Patch 4, the most important one, introduces new bpf_link link type for
attaching to network namespace.

Patch 5 unifies the update error (ENOLINK) between BPF cgroup and netns.

Patches 6-8 make libbpf and bpftool aware of the new link type.

Patches 9-12 add and extend tests to check that the low- and high-level
link APIs for operating on links to netns work as intended.

Thanks to Alexei, Andrii, Lorenz, Marek, and Stanislav for feedback.

-jkbs

[0] https://lore.kernel.org/bpf/20200511185218.1422406-1-jakub@cloudflare.com/

Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Cc: Marek Majkowski <marek@cloudflare.com>
Cc: Stanislav Fomichev <sdf@google.com>

v1 -> v2:

- Switch to mutex-only synchronization. Don't rely on RCU grace period
  guarantee when accessing struct net from link release / update /
  fill_info, and when accessing bpf_link from pernet pre_exit
  callback. (Andrii)
- Drop patch 1, no longer needed with mutex-only synchronization.
- Don't leak uninitialized variable contents from fill_info callback
  when link is in defunct state. (kbuild)
- Make fill_info treat the link as defunct (i.e. no attached netns) when
  struct net refcount is 0, but the link has not yet been auto-detached.
- Add missing BPF_LINK_TYPE define in bpf_types.h for new link type.
- Fix link update_prog callback to update the prog that will run, and
  not just the link itself.
- Return EEXIST on prog attach when link already exists, and on link
  create when prog is already attached directly. (Andrii)
- Return EINVAL on prog detach when link is attached. (Andrii)
- Fold __netns_bpf_link_attach into its only caller. (Stanislav)
- Get rid of a wrapper around container_of() (Andrii)
- Use rcu_dereference_protected instead of rcu_access_pointer on
  update-side. (Stanislav)
- Make return-on-success from netns_bpf_link_create less
  confusing. (Andrii)
- Adapt bpf_link for cgroup to return ENOLINK when updating a defunct
  link. (Andrii, Alexei)
- Order new exported symbols in libbpf.map alphabetically (Andrii)
- Keep libbpf's "failed to attach link" warning message clear as to what
  we failed to attach to (cgroup vs netns). (Andrii)
- Extract helpers for printing link attach type. (bpftool, Andrii)
- Switch flow_dissector tests to BPF skeleton and extend them to
  exercise link-based flow dissector attachment. (Andrii)
- Harden flow dissector attachment tests with prog query checks after
  prog attach/detach, or link create/update/close.
- Extend flow dissector tests to cover fill_info for defunct links.
- Rebase onto recent bpf-next
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents febeb6df 06716e04
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BPF_NETNS_H
#define _BPF_NETNS_H
#include <linux/mutex.h>
#include <uapi/linux/bpf.h>
/* Kinds of BPF attachment points that live in a network namespace.
 * Non-negative values are used as indices into the per-netns arrays
 * in struct netns_bpf.
 */
enum netns_bpf_attach_type {
/* Returned by to_netns_bpf_attach_type() for unsupported attach types */
NETNS_BPF_INVALID = -1,
NETNS_BPF_FLOW_DISSECTOR = 0,
/* Number of valid attach types; sizes the per-netns arrays */
MAX_NETNS_BPF_ATTACH_TYPE
};
/* Translate a UAPI attach type into the netns-local attach type.
 * Anything other than BPF_FLOW_DISSECTOR maps to NETNS_BPF_INVALID.
 */
static inline enum netns_bpf_attach_type
to_netns_bpf_attach_type(enum bpf_attach_type attach_type)
{
	if (attach_type == BPF_FLOW_DISSECTOR)
		return NETNS_BPF_FLOW_DISSECTOR;

	return NETNS_BPF_INVALID;
}
/* Protects updates to netns_bpf */
extern struct mutex netns_bpf_mutex;
union bpf_attr;
struct bpf_prog;
/* Entry points for querying, attaching, detaching and link-creating
 * BPF programs on a network namespace. Each returns 0 or a negative
 * errno. Without CONFIG_NET every operation is stubbed out and fails
 * with -EOPNOTSUPP.
 */
#ifdef CONFIG_NET
int netns_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr);
int netns_bpf_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog);
int netns_bpf_prog_detach(const union bpf_attr *attr);
int netns_bpf_link_create(const union bpf_attr *attr,
struct bpf_prog *prog);
#else
/* Stub: netns BPF attachment is unavailable without CONFIG_NET */
static inline int netns_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return -EOPNOTSUPP;
}
/* Stub: netns BPF attachment is unavailable without CONFIG_NET */
static inline int netns_bpf_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog)
{
return -EOPNOTSUPP;
}
/* Stub: netns BPF attachment is unavailable without CONFIG_NET */
static inline int netns_bpf_prog_detach(const union bpf_attr *attr)
{
return -EOPNOTSUPP;
}
/* Stub: netns BPF attachment is unavailable without CONFIG_NET */
static inline int netns_bpf_link_create(const union bpf_attr *attr,
struct bpf_prog *prog)
{
return -EOPNOTSUPP;
}
#endif
#endif /* _BPF_NETNS_H */
......@@ -126,3 +126,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup)
#endif
BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
#ifdef CONFIG_NET
BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
#endif
......@@ -1283,32 +1283,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
const struct flow_dissector_key *key,
unsigned int key_count);
#ifdef CONFIG_NET
int skb_flow_dissector_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr);
int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog);
int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
#else
static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return -EOPNOTSUPP;
}
static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog)
{
return -EOPNOTSUPP;
}
static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
{
return -EOPNOTSUPP;
}
#endif
struct bpf_flow_dissector;
bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
__be16 proto, int nhoff, int hlen, unsigned int flags);
......
......@@ -8,6 +8,8 @@
#include <linux/string.h>
#include <uapi/linux/if_ether.h>
struct bpf_prog;
struct net;
struct sk_buff;
/**
......@@ -369,4 +371,8 @@ flow_dissector_init_keys(struct flow_dissector_key_control *key_control,
memset(key_basic, 0, sizeof(*key_basic));
}
#ifdef CONFIG_BPF_SYSCALL
int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog);
#endif /* CONFIG_BPF_SYSCALL */
#endif
......@@ -33,6 +33,7 @@
#include <net/netns/mpls.h>
#include <net/netns/can.h>
#include <net/netns/xdp.h>
#include <net/netns/bpf.h>
#include <linux/ns_common.h>
#include <linux/idr.h>
#include <linux/skbuff.h>
......@@ -162,7 +163,8 @@ struct net {
#endif
struct net_generic __rcu *gen;
struct bpf_prog __rcu *flow_dissector_prog;
/* Used to store attached BPF programs */
struct netns_bpf bpf;
/* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* BPF programs attached to network namespace
*/
#ifndef __NETNS_BPF_H__
#define __NETNS_BPF_H__
#include <linux/bpf-netns.h>
struct bpf_prog;
/* Per-netns BPF attachment state, one slot per netns_bpf_attach_type */
struct netns_bpf {
/* Program attached for each attach type, if any; RCU-managed pointer */
struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE];
/* Link attached to the slot, if any; NULL when the prog was attached
 * directly or the slot is empty
 */
struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE];
};
#endif /* __NETNS_BPF_H__ */
......@@ -237,6 +237,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_TRACING = 2,
BPF_LINK_TYPE_CGROUP = 3,
BPF_LINK_TYPE_ITER = 4,
BPF_LINK_TYPE_NETNS = 5,
MAX_BPF_LINK_TYPE,
};
......@@ -3839,6 +3840,10 @@ struct bpf_link_info {
__u64 cgroup_id;
__u32 attach_type;
} cgroup;
struct {
__u32 netns_ino;
__u32 attach_type;
} netns;
};
} __attribute__((aligned(8)));
......
......@@ -13,6 +13,7 @@ ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
obj-$(CONFIG_BPF_SYSCALL) += offload.o
obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
......
......@@ -595,7 +595,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
mutex_lock(&cgroup_mutex);
/* link might have been auto-released by dying cgroup, so fail */
if (!cg_link->cgroup) {
ret = -EINVAL;
ret = -ENOLINK;
goto out_unlock;
}
if (old_prog && link->prog != old_prog) {
......
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/filter.h>
#include <net/net_namespace.h>
/*
* Functions to manage BPF programs attached to netns
*/
/* A bpf_link attached to a network namespace */
struct bpf_netns_link {
/* Generic link; container_of() on this member recovers the netns link */
struct bpf_link link;
/* UAPI attach type, reported back to user space by fill_link_info */
enum bpf_attach_type type;
/* Index into the per-netns bpf.progs[] / bpf.links[] arrays */
enum netns_bpf_attach_type netns_type;
/* We don't hold a ref to net in order to auto-detach the link
* when netns is going away. Instead we rely on pernet
* pre_exit callback to clear this pointer. Must be accessed
* with netns_bpf_mutex held.
*/
struct net *net;
};
/* Protects updates to netns_bpf */
DEFINE_MUTEX(netns_bpf_mutex);
/* Sever the link's back-pointer to its netns so later link operations
 * see it as detached. Caller must hold netns_bpf_mutex.
 */
static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link)
{
	struct bpf_netns_link *nl;

	nl = container_of(link, struct bpf_netns_link, link);
	nl->net = NULL;
}
/* Detach the link's program from its netns when the link is released.
 *
 * net_link->net must only be accessed with netns_bpf_mutex held, so the
 * mutex is taken unconditionally rather than peeking at the pointer
 * first: an unlocked read would race with the pernet pre_exit callback
 * that clears it.
 */
static void bpf_netns_link_release(struct bpf_link *link)
{
	struct bpf_netns_link *net_link =
		container_of(link, struct bpf_netns_link, link);
	enum netns_bpf_attach_type type = net_link->netns_type;
	struct net *net;

	mutex_lock(&netns_bpf_mutex);

	/* We can race with cleanup_net here, but if we see a non-NULL
	 * struct net pointer, pre_exit has not happened yet and will
	 * block on netns_bpf_mutex until we are done.
	 */
	net = net_link->net;
	if (!net)
		goto out_unlock;	/* Link auto-detached by dying netns. */

	/* Free the slot for both link-based and direct attachment */
	net->bpf.links[type] = NULL;
	RCU_INIT_POINTER(net->bpf.progs[type], NULL);

out_unlock:
	mutex_unlock(&netns_bpf_mutex);
}
/* Free the memory backing a netns link. */
static void bpf_netns_link_dealloc(struct bpf_link *link)
{
	kfree(container_of(link, struct bpf_netns_link, link));
}
/* Replace the program behind a netns link.
 *
 * @link:     link whose program is being replaced
 * @new_prog: program to install; must have the same type as the current one
 * @old_prog: if non-NULL, the update only succeeds when this is the
 *            program currently held by the link
 *
 * Returns 0 on success, -EPERM when @old_prog does not match, -EINVAL on
 * a program type mismatch, or -ENOLINK when the link was auto-detached
 * or its netns is dying.
 *
 * All checks against link->prog are made under netns_bpf_mutex so that
 * the compare against @old_prog and the swap below form one atomic
 * step; otherwise two concurrent updates naming the same @old_prog
 * could both succeed.
 */
static int bpf_netns_link_update_prog(struct bpf_link *link,
				      struct bpf_prog *new_prog,
				      struct bpf_prog *old_prog)
{
	struct bpf_netns_link *net_link =
		container_of(link, struct bpf_netns_link, link);
	enum netns_bpf_attach_type type = net_link->netns_type;
	struct net *net;
	int ret = 0;

	mutex_lock(&netns_bpf_mutex);

	if (old_prog && old_prog != link->prog) {
		ret = -EPERM;
		goto out_unlock;
	}
	if (new_prog->type != link->prog->type) {
		ret = -EINVAL;
		goto out_unlock;
	}

	net = net_link->net;
	if (!net || !check_net(net)) {
		/* Link auto-detached or netns dying */
		ret = -ENOLINK;
		goto out_unlock;
	}

	/* Update both the link and the pointer the datapath reads */
	old_prog = xchg(&link->prog, new_prog);
	rcu_assign_pointer(net->bpf.progs[type], new_prog);
	bpf_prog_put(old_prog);

out_unlock:
	mutex_unlock(&netns_bpf_mutex);
	return ret;
}
/* Report the netns inode number and attach type of a link.
 * The inode number is 0 when the link has no netns or check_net()
 * fails for it. Always returns 0.
 */
static int bpf_netns_link_fill_info(const struct bpf_link *link,
				    struct bpf_link_info *info)
{
	const struct bpf_netns_link *nl =
		container_of(link, struct bpf_netns_link, link);
	unsigned int ino;
	struct net *ns;

	mutex_lock(&netns_bpf_mutex);
	ns = nl->net;
	ino = (ns && check_net(ns)) ? ns->ns.inum : 0;
	mutex_unlock(&netns_bpf_mutex);

	info->netns.netns_ino = ino;
	info->netns.attach_type = nl->type;

	return 0;
}
static void bpf_netns_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_link_info info = {};
bpf_netns_link_fill_info(link, &info);
seq_printf(seq,
"netns_ino:\t%u\n"
"attach_type:\t%u\n",
info.netns.netns_ino,
info.netns.attach_type);
}
/* Link operations for BPF_LINK_TYPE_NETNS links */
static const struct bpf_link_ops bpf_netns_link_ops = {
.release = bpf_netns_link_release,
.dealloc = bpf_netns_link_dealloc,
.update_prog = bpf_netns_link_update_prog,
.fill_link_info = bpf_netns_link_fill_info,
.show_fdinfo = bpf_netns_link_show_fdinfo,
};
/* Report which program, if any, is attached to a netns for the
 * requested attach type.
 *
 * Writes attach_flags (always 0) and prog_cnt (0 or 1) to @uattr, and
 * the program id to the user-supplied prog_ids buffer when one was
 * given and a program is attached. Returns 0 or a negative errno.
 */
int netns_bpf_prog_query(const union bpf_attr *attr,
			 union bpf_attr __user *uattr)
{
	__u32 __user *ids_out = u64_to_user_ptr(attr->query.prog_ids);
	u32 id, cnt = 0, attach_flags = 0;
	enum netns_bpf_attach_type type;
	struct bpf_prog *prog;
	struct net *net;

	if (attr->query.query_flags)
		return -EINVAL;

	type = to_netns_bpf_attach_type(attr->query.attach_type);
	if (type < 0)
		return -EINVAL;

	net = get_net_ns_by_fd(attr->query.target_fd);
	if (IS_ERR(net))
		return PTR_ERR(net);

	/* Snapshot the id under RCU; the prog pointer is not used afterwards */
	rcu_read_lock();
	prog = rcu_dereference(net->bpf.progs[type]);
	if (prog) {
		cnt = 1;
		id = prog->aux->id;
	}
	rcu_read_unlock();

	put_net(net);

	if (copy_to_user(&uattr->query.attach_flags, &attach_flags,
			 sizeof(attach_flags)) ||
	    copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
		return -EFAULT;

	/* No buffer supplied by the caller, or nothing attached */
	if (!attr->query.prog_cnt || !ids_out || !cnt)
		return 0;

	return copy_to_user(ids_out, &id, sizeof(u32)) ? -EFAULT : 0;
}
int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
enum netns_bpf_attach_type type;
struct net *net;
int ret;
type = to_netns_bpf_attach_type(attr->attach_type);
if (type < 0)
return -EINVAL;
net = current->nsproxy->net_ns;
mutex_lock(&netns_bpf_mutex);
/* Attaching prog directly is not compatible with links */
if (net->bpf.links[type]) {
ret = -EEXIST;
goto out_unlock;
}
switch (type) {
case NETNS_BPF_FLOW_DISSECTOR:
ret = flow_dissector_bpf_prog_attach(net, prog);
break;
default:
ret = -EINVAL;
break;
}
out_unlock:
mutex_unlock(&netns_bpf_mutex);
return ret;
}
/* Detach a directly attached program from @net and drop its reference.
 * Returns -EINVAL when the slot is owned by a link, -ENOENT when no
 * program is attached. Caller must hold netns_bpf_mutex.
 */
static int __netns_bpf_prog_detach(struct net *net,
				   enum netns_bpf_attach_type type)
{
	struct bpf_prog *old;

	/* Progs attached via links cannot be detached */
	if (net->bpf.links[type])
		return -EINVAL;

	old = rcu_dereference_protected(net->bpf.progs[type],
					lockdep_is_held(&netns_bpf_mutex));
	if (old) {
		RCU_INIT_POINTER(net->bpf.progs[type], NULL);
		bpf_prog_put(old);
		return 0;
	}

	return -ENOENT;
}
int netns_bpf_prog_detach(const union bpf_attr *attr)
{
enum netns_bpf_attach_type type;
int ret;
type = to_netns_bpf_attach_type(attr->attach_type);
if (type < 0)
return -EINVAL;
mutex_lock(&netns_bpf_mutex);
ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type);
mutex_unlock(&netns_bpf_mutex);
return ret;
}
/* Install @link and its program into the @type slot of @net.
 *
 * Returns -E2BIG when a link is already attached, -EEXIST when a
 * program was attached directly, -EINVAL for an unhandled type,
 * otherwise the result of the attach-type-specific handler.
 */
static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
				 enum netns_bpf_attach_type type)
{
	int err;

	mutex_lock(&netns_bpf_mutex);

	if (net->bpf.links[type]) {
		/* Allow attaching only one prog or link for now */
		err = -E2BIG;
	} else if (rcu_dereference_protected(net->bpf.progs[type],
					     lockdep_is_held(&netns_bpf_mutex))) {
		/* Links are not compatible with attaching prog directly */
		err = -EEXIST;
	} else if (type == NETNS_BPF_FLOW_DISSECTOR) {
		err = flow_dissector_bpf_prog_attach(net, link->prog);
	} else {
		err = -EINVAL;
	}

	/* Record the link only once the program is in place */
	if (!err)
		net->bpf.links[type] = link;

	mutex_unlock(&netns_bpf_mutex);

	return err;
}
/* Create a link attaching @prog to the netns referred to by
 * attr->link_create.target_fd.
 *
 * On success returns the value of bpf_link_settle(); on failure returns a
 * negative errno. The netns reference taken for the lookup is dropped on
 * every path before returning.
 */
int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
{
enum netns_bpf_attach_type netns_type;
struct bpf_link_primer link_primer;
struct bpf_netns_link *net_link;
enum bpf_attach_type type;
struct net *net;
int err;
/* No link-create flags are supported */
if (attr->link_create.flags)
return -EINVAL;
type = attr->link_create.attach_type;
netns_type = to_netns_bpf_attach_type(type);
if (netns_type < 0)
return -EINVAL;
net = get_net_ns_by_fd(attr->link_create.target_fd);
if (IS_ERR(net))
return PTR_ERR(net);
net_link = kzalloc(sizeof(*net_link), GFP_USER);
if (!net_link) {
err = -ENOMEM;
goto out_put_net;
}
bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS,
&bpf_netns_link_ops, prog);
net_link->net = net;
net_link->type = type;
net_link->netns_type = netns_type;
err = bpf_link_prime(&net_link->link, &link_primer);
if (err) {
/* Priming failed: free the link directly */
kfree(net_link);
goto out_put_net;
}
err = netns_bpf_link_attach(net, &net_link->link, netns_type);
if (err) {
/* Link is primed: undo through bpf_link_cleanup(), not kfree() */
bpf_link_cleanup(&link_primer);
goto out_put_net;
}
/* The link keeps only a bare pointer to net, not a reference */
put_net(net);
return bpf_link_settle(&link_primer);
out_put_net:
put_net(net);
return err;
}
/* pernet pre_exit callback: for every attach type, either auto-detach
 * the link occupying the slot or detach the directly attached program.
 */
static void __net_exit netns_bpf_pernet_pre_exit(struct net *net)
{
	enum netns_bpf_attach_type type = 0;

	mutex_lock(&netns_bpf_mutex);
	while (type < MAX_NETNS_BPF_ATTACH_TYPE) {
		struct bpf_link *link = net->bpf.links[type];

		if (!link)
			__netns_bpf_prog_detach(net, type);
		else
			bpf_netns_link_auto_detach(link);
		type++;
	}
	mutex_unlock(&netns_bpf_mutex);
}
/* Per-netns hooks: only pre_exit is needed, to detach programs and links */
static struct pernet_operations netns_bpf_pernet_ops __net_initdata = {
.pre_exit = netns_bpf_pernet_pre_exit,
};
/* Register the pernet operations; run as a subsys initcall */
static int __init netns_bpf_init(void)
{
return register_pernet_subsys(&netns_bpf_pernet_ops);
}
subsys_initcall(netns_bpf_init);
......@@ -27,6 +27,7 @@
#include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/bpf-netns.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
......@@ -2868,7 +2869,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ret = lirc_prog_attach(attr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
ret = netns_bpf_prog_attach(attr, prog);
break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
......@@ -2908,7 +2909,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_FLOW_DISSECTOR:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
return skb_flow_dissector_bpf_prog_detach(attr);
return netns_bpf_prog_detach(attr);
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
......@@ -2961,7 +2962,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
return skb_flow_dissector_prog_query(attr, uattr);
return netns_bpf_prog_query(attr, uattr);
default:
return -EINVAL;
}
......@@ -3886,6 +3887,9 @@ static int link_create(union bpf_attr *attr)
case BPF_PROG_TYPE_TRACING:
ret = tracing_bpf_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
ret = netns_bpf_link_create(attr, prog);
break;
default:
ret = -EINVAL;
}
......
......@@ -31,8 +31,7 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_labels.h>
#endif
static DEFINE_MUTEX(flow_dissector_mutex);
#include <linux/bpf-netns.h>
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
......@@ -70,54 +69,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
}
EXPORT_SYMBOL(skb_flow_dissector_init);
int skb_flow_dissector_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
u32 prog_id, prog_cnt = 0, flags = 0;
struct bpf_prog *attached;
struct net *net;
if (attr->query.query_flags)
return -EINVAL;
net = get_net_ns_by_fd(attr->query.target_fd);
if (IS_ERR(net))
return PTR_ERR(net);
rcu_read_lock();
attached = rcu_dereference(net->flow_dissector_prog);
if (attached) {
prog_cnt = 1;
prog_id = attached->aux->id;
}
rcu_read_unlock();
put_net(net);
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
return -EFAULT;
if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
return -EFAULT;
if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
return 0;
if (copy_to_user(prog_ids, &prog_id, sizeof(u32)))
return -EFAULT;
return 0;
}
int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog)
#ifdef CONFIG_BPF_SYSCALL
int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog)
{
enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
struct bpf_prog *attached;
struct net *net;
int ret = 0;
net = current->nsproxy->net_ns;
mutex_lock(&flow_dissector_mutex);
if (net == &init_net) {
/* BPF flow dissector in the root namespace overrides
......@@ -130,70 +86,29 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
for_each_net(ns) {
if (ns == &init_net)
continue;
if (rcu_access_pointer(ns->flow_dissector_prog)) {
ret = -EEXIST;
goto out;
}
if (rcu_access_pointer(ns->bpf.progs[type]))
return -EEXIST;
}
} else {
/* Make sure root flow dissector is not attached
* when attaching to the non-root namespace.
*/
if (rcu_access_pointer(init_net.flow_dissector_prog)) {
ret = -EEXIST;
goto out;
}
if (rcu_access_pointer(init_net.bpf.progs[type]))
return -EEXIST;
}
attached = rcu_dereference_protected(net->flow_dissector_prog,
lockdep_is_held(&flow_dissector_mutex));
if (attached == prog) {
attached = rcu_dereference_protected(net->bpf.progs[type],
lockdep_is_held(&netns_bpf_mutex));
if (attached == prog)
/* The same program cannot be attached twice */
ret = -EINVAL;
goto out;
}
rcu_assign_pointer(net->flow_dissector_prog, prog);
return -EINVAL;
rcu_assign_pointer(net->bpf.progs[type], prog);
if (attached)
bpf_prog_put(attached);
out:
mutex_unlock(&flow_dissector_mutex);
return ret;
}
static int flow_dissector_bpf_prog_detach(struct net *net)
{
struct bpf_prog *attached;
mutex_lock(&flow_dissector_mutex);
attached = rcu_dereference_protected(net->flow_dissector_prog,
lockdep_is_held(&flow_dissector_mutex));
if (!attached) {
mutex_unlock(&flow_dissector_mutex);
return -ENOENT;
}
RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
bpf_prog_put(attached);
mutex_unlock(&flow_dissector_mutex);
return 0;
}
int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
{
return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns);
}
static void __net_exit flow_dissector_pernet_pre_exit(struct net *net)
{
/* We're not racing with attach/detach because there are no
* references to netns left when pre_exit gets called.
*/
if (rcu_access_pointer(net->flow_dissector_prog))
flow_dissector_bpf_prog_detach(net);
}
static struct pernet_operations flow_dissector_pernet_ops __net_initdata = {
.pre_exit = flow_dissector_pernet_pre_exit,
};
#endif /* CONFIG_BPF_SYSCALL */
/**
* __skb_flow_get_ports - extract the upper layer ports and return them
......@@ -1044,11 +959,13 @@ bool __skb_flow_dissect(const struct net *net,
WARN_ON_ONCE(!net);
if (net) {
enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
rcu_read_lock();
attached = rcu_dereference(init_net.flow_dissector_prog);
attached = rcu_dereference(init_net.bpf.progs[type]);
if (!attached)
attached = rcu_dereference(net->flow_dissector_prog);
attached = rcu_dereference(net->bpf.progs[type]);
if (attached) {
struct bpf_flow_keys flow_keys;
......@@ -1869,7 +1786,6 @@ static int __init init_default_flow_dissectors(void)
skb_flow_dissector_init(&flow_keys_basic_dissector,
flow_keys_basic_dissector_keys,
ARRAY_SIZE(flow_keys_basic_dissector_keys));
return register_pernet_subsys(&flow_dissector_pernet_ops);
return 0;
}
core_initcall(init_default_flow_dissectors);
......@@ -17,6 +17,7 @@ static const char * const link_type_name[] = {
[BPF_LINK_TYPE_TRACING] = "tracing",
[BPF_LINK_TYPE_CGROUP] = "cgroup",
[BPF_LINK_TYPE_ITER] = "iter",
[BPF_LINK_TYPE_NETNS] = "netns",
};
static int link_parse_fd(int *argc, char ***argv)
......@@ -62,6 +63,15 @@ show_link_header_json(struct bpf_link_info *info, json_writer_t *wtr)
jsonw_uint_field(json_wtr, "prog_id", info->prog_id);
}
/* Emit the "attach_type" JSON field: the symbolic name when the value
 * indexes attach_type_name[], the raw number otherwise.
 */
static void show_link_attach_type_json(__u32 attach_type, json_writer_t *wtr)
{
	if (attach_type >= ARRAY_SIZE(attach_type_name)) {
		jsonw_uint_field(wtr, "attach_type", attach_type);
		return;
	}

	jsonw_string_field(wtr, "attach_type", attach_type_name[attach_type]);
}
static int get_prog_info(int prog_id, struct bpf_prog_info *info)
{
__u32 len = sizeof(*info);
......@@ -105,22 +115,18 @@ static int show_link_close_json(int fd, struct bpf_link_info *info)
jsonw_uint_field(json_wtr, "prog_type",
prog_info.type);
if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name))
jsonw_string_field(json_wtr, "attach_type",
attach_type_name[info->tracing.attach_type]);
else
jsonw_uint_field(json_wtr, "attach_type",
info->tracing.attach_type);
show_link_attach_type_json(info->tracing.attach_type,
json_wtr);
break;
case BPF_LINK_TYPE_CGROUP:
jsonw_lluint_field(json_wtr, "cgroup_id",
info->cgroup.cgroup_id);
if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name))
jsonw_string_field(json_wtr, "attach_type",
attach_type_name[info->cgroup.attach_type]);
else
jsonw_uint_field(json_wtr, "attach_type",
info->cgroup.attach_type);
show_link_attach_type_json(info->cgroup.attach_type, json_wtr);
break;
case BPF_LINK_TYPE_NETNS:
jsonw_uint_field(json_wtr, "netns_ino",
info->netns.netns_ino);
show_link_attach_type_json(info->netns.attach_type, json_wtr);
break;
default:
break;
......@@ -153,6 +159,14 @@ static void show_link_header_plain(struct bpf_link_info *info)
printf("prog %u ", info->prog_id);
}
/* Print the attach type in plain-text form: the symbolic name when the
 * value indexes attach_type_name[], the raw number otherwise.
 */
static void show_link_attach_type_plain(__u32 attach_type)
{
	if (attach_type >= ARRAY_SIZE(attach_type_name)) {
		printf("attach_type %u ", attach_type);
		return;
	}

	printf("attach_type %s ", attach_type_name[attach_type]);
}
static int show_link_close_plain(int fd, struct bpf_link_info *info)
{
struct bpf_prog_info prog_info;
......@@ -176,19 +190,15 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info)
else
printf("\n\tprog_type %u ", prog_info.type);
if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name))
printf("attach_type %s ",
attach_type_name[info->tracing.attach_type]);
else
printf("attach_type %u ", info->tracing.attach_type);
show_link_attach_type_plain(info->tracing.attach_type);
break;
case BPF_LINK_TYPE_CGROUP:
printf("\n\tcgroup_id %zu ", (size_t)info->cgroup.cgroup_id);
if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name))
printf("attach_type %s ",
attach_type_name[info->cgroup.attach_type]);
else
printf("attach_type %u ", info->cgroup.attach_type);
show_link_attach_type_plain(info->cgroup.attach_type);
break;
case BPF_LINK_TYPE_NETNS:
printf("\n\tnetns_ino %u ", info->netns.netns_ino);
show_link_attach_type_plain(info->netns.attach_type);
break;
default:
break;
......
......@@ -237,6 +237,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_TRACING = 2,
BPF_LINK_TYPE_CGROUP = 3,
BPF_LINK_TYPE_ITER = 4,
BPF_LINK_TYPE_NETNS = 5,
MAX_BPF_LINK_TYPE,
};
......@@ -3839,6 +3840,10 @@ struct bpf_link_info {
__u64 cgroup_id;
__u32 attach_type;
} cgroup;
struct {
__u32 netns_ino;
__u32 attach_type;
} netns;
};
} __attribute__((aligned(8)));
......
......@@ -7896,8 +7896,9 @@ static struct bpf_link *attach_iter(const struct bpf_sec_def *sec,
return bpf_program__attach_iter(prog, NULL);
}
struct bpf_link *
bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
static struct bpf_link *
bpf_program__attach_fd(struct bpf_program *prog, int target_fd,
const char *target_name)
{
enum bpf_attach_type attach_type;
char errmsg[STRERR_BUFSIZE];
......@@ -7917,12 +7918,12 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
link->detach = &bpf_link__detach_fd;
attach_type = bpf_program__get_expected_attach_type(prog);
link_fd = bpf_link_create(prog_fd, cgroup_fd, attach_type, NULL);
link_fd = bpf_link_create(prog_fd, target_fd, attach_type, NULL);
if (link_fd < 0) {
link_fd = -errno;
free(link);
pr_warn("program '%s': failed to attach to cgroup: %s\n",
bpf_program__title(prog, false),
pr_warn("program '%s': failed to attach to %s: %s\n",
bpf_program__title(prog, false), target_name,
libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg)));
return ERR_PTR(link_fd);
}
......@@ -7930,6 +7931,18 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
return link;
}
/* Attach prog to the cgroup referred to by cgroup_fd through the shared
 * fd-based attach helper; "cgroup" is the target name passed on for the
 * helper's failure warning.
 */
struct bpf_link *
bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd)
{
return bpf_program__attach_fd(prog, cgroup_fd, "cgroup");
}
/* Attach prog to the network namespace referred to by netns_fd through
 * the shared fd-based attach helper; "netns" is the target name passed
 * on for the helper's failure warning.
 */
struct bpf_link *
bpf_program__attach_netns(struct bpf_program *prog, int netns_fd)
{
return bpf_program__attach_fd(prog, netns_fd, "netns");
}
struct bpf_link *
bpf_program__attach_iter(struct bpf_program *prog,
const struct bpf_iter_attach_opts *opts)
......
......@@ -253,6 +253,8 @@ LIBBPF_API struct bpf_link *
bpf_program__attach_lsm(struct bpf_program *prog);
LIBBPF_API struct bpf_link *
bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd);
LIBBPF_API struct bpf_link *
bpf_program__attach_netns(struct bpf_program *prog, int netns_fd);
struct bpf_map;
......
......@@ -262,6 +262,7 @@ LIBBPF_0.0.9 {
bpf_link_get_fd_by_id;
bpf_link_get_next_id;
bpf_program__attach_iter;
bpf_program__attach_netns;
perf_buffer__consume;
ring_buffer__add;
ring_buffer__consume;
......
......@@ -6,6 +6,8 @@
#include <linux/if_tun.h>
#include <sys/uio.h>
#include "bpf_flow.skel.h"
#ifndef IP_MF
#define IP_MF 0x2000
#endif
......@@ -101,6 +103,7 @@ struct test {
#define VLAN_HLEN 4
static __u32 duration;
struct test tests[] = {
{
.name = "ipv4",
......@@ -444,17 +447,130 @@ static int ifup(const char *ifname)
return 0;
}
/* Fill every slot i of the prog array map with the fd of the program
 * whose section title is "flow_dissector/<i>".
 * Returns 0 on success, -1 on any lookup or map update failure.
 */
static int init_prog_array(struct bpf_object *obj, struct bpf_map *prog_array)
{
	struct bpf_program *prog;
	char prog_name[32];
	int i, map_fd, prog_fd;

	map_fd = bpf_map__fd(prog_array);
	if (map_fd < 0)
		return -1;

	for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
		snprintf(prog_name, sizeof(prog_name), "flow_dissector/%i", i);

		prog = bpf_object__find_program_by_title(obj, prog_name);
		if (!prog)
			return -1;

		prog_fd = bpf_program__fd(prog);
		if (prog_fd < 0)
			return -1;

		if (bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY))
			return -1;
	}

	return 0;
}
/* Run the skb-less variant of each test case: transmit the test packet
 * over the tap device, then read the dissected flow keys back from the
 * keys map and compare them against the expected keys.
 */
static void run_tests_skb_less(int tap_fd, struct bpf_map *keys)
{
int i, err, keys_fd;
keys_fd = bpf_map__fd(keys);
if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd))
return;
for (i = 0; i < ARRAY_SIZE(tests); i++) {
/* Keep in sync with 'flags' from eth_get_headlen. */
__u32 eth_get_headlen_flags =
BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
/* NOTE(review): tattr is presumably consumed by the CHECK_ATTR macro - confirm */
struct bpf_prog_test_run_attr tattr = {};
struct bpf_flow_keys flow_keys = {};
/* Map key: source port in the upper 16 bits, OR-ed with the dest port */
__u32 key = (__u32)(tests[i].keys.sport) << 16 |
tests[i].keys.dport;
/* For skb-less case we can't pass input flags; run
* only the tests that have a matching set of flags.
*/
if (tests[i].flags != eth_get_headlen_flags)
continue;
err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt));
CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno);
err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys);
CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err);
CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err);
CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys);
/* Remove the entry after checking it */
err = bpf_map_delete_elem(keys_fd, &key);
CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err);
}
}
/* Exercise the skb-less tests with the dissector attached directly via
 * bpf_prog_attach(), then detach it again.
 */
static void test_skb_less_prog_attach(struct bpf_flow *skel, int tap_fd)
{
int err, prog_fd;
prog_fd = bpf_program__fd(skel->progs._dissect);
if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd))
return;
err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0);
if (CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno))
return;
run_tests_skb_less(tap_fd, skel->maps.last_dissection);
err = bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR);
CHECK(err, "bpf_prog_detach", "err %d errno %d\n", err, errno);
}
/* Exercise the skb-less tests with the dissector attached to the
 * current netns through a link, then destroy the link.
 */
static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd)
{
struct bpf_link *link;
int err, net_fd;
/* fd referring to the calling process's own network namespace */
net_fd = open("/proc/self/ns/net", O_RDONLY);
if (CHECK(net_fd < 0, "open(/proc/self/ns/net)", "err %d\n", errno))
return;
link = bpf_program__attach_netns(skel->progs._dissect, net_fd);
if (CHECK(IS_ERR(link), "attach_netns", "err %ld\n", PTR_ERR(link)))
goto out_close;
run_tests_skb_less(tap_fd, skel->maps.last_dissection);
err = bpf_link__destroy(link);
CHECK(err, "bpf_link__destroy", "err %d\n", err);
out_close:
close(net_fd);
}
void test_flow_dissector(void)
{
int i, err, prog_fd, keys_fd = -1, tap_fd;
struct bpf_object *obj;
__u32 duration = 0;
struct bpf_flow *skel;
err = bpf_flow_load(&obj, "./bpf_flow.o", "flow_dissector",
"jmp_table", "last_dissection", &prog_fd, &keys_fd);
if (CHECK_FAIL(err))
skel = bpf_flow__open_and_load();
if (CHECK(!skel, "skel", "failed to open/load skeleton\n"))
return;
prog_fd = bpf_program__fd(skel->progs._dissect);
if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd))
goto out_destroy_skel;
keys_fd = bpf_map__fd(skel->maps.last_dissection);
if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd))
goto out_destroy_skel;
err = init_prog_array(skel->obj, skel->maps.jmp_table);
if (CHECK(err, "init_prog_array", "err %d\n", err))
goto out_destroy_skel;
for (i = 0; i < ARRAY_SIZE(tests); i++) {
struct bpf_flow_keys flow_keys;
struct bpf_prog_test_run_attr tattr = {
......@@ -487,43 +603,17 @@ void test_flow_dissector(void)
* via BPF map in this case.
*/
err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0);
CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno);
tap_fd = create_tap("tap0");
CHECK(tap_fd < 0, "create_tap", "tap_fd %d errno %d\n", tap_fd, errno);
err = ifup("tap0");
CHECK(err, "ifup", "err %d errno %d\n", err, errno);
for (i = 0; i < ARRAY_SIZE(tests); i++) {
/* Keep in sync with 'flags' from eth_get_headlen. */
__u32 eth_get_headlen_flags =
BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
struct bpf_prog_test_run_attr tattr = {};
struct bpf_flow_keys flow_keys = {};
__u32 key = (__u32)(tests[i].keys.sport) << 16 |
tests[i].keys.dport;
/* For skb-less case we can't pass input flags; run
* only the tests that have a matching set of flags.
*/
if (tests[i].flags != eth_get_headlen_flags)
continue;
err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt));
CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno);
err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys);
CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err);
CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err);
CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys);
err = bpf_map_delete_elem(keys_fd, &key);
CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err);
}
/* Test direct prog attachment */
test_skb_less_prog_attach(skel, tap_fd);
/* Test indirect prog attachment via link */
test_skb_less_link_create(skel, tap_fd);
bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR);
bpf_object__close(obj);
close(tap_fd);
out_destroy_skel:
bpf_flow__destroy(skel);
}
......@@ -20,20 +20,20 @@
#include <bpf/bpf_endian.h>
int _version SEC("version") = 1;
#define PROG(F) SEC(#F) int bpf_func_##F
#define PROG(F) PROG_(F, _##F)
#define PROG_(NUM, NAME) SEC("flow_dissector/"#NUM) int bpf_func##NAME
/* These are the identifiers of the BPF programs that will be used in tail
* calls. Name is limited to 16 characters, with the terminating character and
* bpf_func_ above, we have only 6 to work with, anything after will be cropped.
*/
enum {
IP,
IPV6,
IPV6OP, /* Destination/Hop-by-Hop Options IPv6 Extension header */
IPV6FR, /* Fragmentation IPv6 Extension Header */
MPLS,
VLAN,
};
#define IP 0
#define IPV6 1
#define IPV6OP 2 /* Destination/Hop-by-Hop Options IPv6 Ext. Header */
#define IPV6FR 3 /* Fragmentation IPv6 Extension Header */
#define MPLS 4
#define VLAN 5
#define MAX_PROG 6
#define IP_MF 0x2000
#define IP_OFFSET 0x1FFF
......@@ -59,7 +59,7 @@ struct frag_hdr {
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(max_entries, 8);
__uint(max_entries, MAX_PROG);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment