Merge branch 'ebpf-tail-call'

Alexei Starovoitov says: ==================== bpf: introduce bpf_tail_call() helper introduce bpf_tail_call(ctx, &jmp_table, index) helper function which can be used from BPF programs like: int bpf_prog(struct pt_regs *ctx) { ... bpf_tail_call(ctx, &jmp_table, index); ... } that is roughly equivalent to: int bpf_prog(struct pt_regs *ctx) { ... if (jmp_table[index]) return (*jmp_table[index])(ctx); ... } The important detail that it's not a normal call, but a tail call. The kernel stack is precious, so this helper reuses the current stack frame and jumps into another BPF program without adding extra call frame. It's trivially done in interpreter and a bit trickier in JITs. Use cases: - simplify complex programs - dispatch into other programs (for example: index in jump table can be syscall number or network protocol) - build dynamic chains of programs The chain of tail calls can form unpredictable dynamic loops therefore tail_call_cnt is used to limit the number of calls and currently is set to 32. patch 1 - support bpf_tail_call() in interpreter patch 2 - support in x64 JIT We've discussed what's neccessary to support it in arm64/s390 JITs and it looks fine. patch 3 - sample example for tracing patch 4 - sample example for networking More details in every patch. This set went through several iterations of reviews/fixes and older attempts can be seen: https://git.kernel.org/cgit/linux/kernel/git/ast/bpf.git/log/?h=tail_call_v[123456] - tail_call_v1 does it without touching JITs but introduces overhead for all programs that don't use this helper function. - tail_call_v2 still has some overhead and x64 JIT does full stack unwind (prologue skipping optimization wasn't there) - tail_call_v3 reuses 'call' instruction encoding and has interpreter overhead for every normal call - tail_call_v4 fixes above architectural shortcomings and v5,v6 fix few more bugs This last tail_call_v6 approach seems to be the best. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>

Merge branch 'ebpf-tail-call'
Alexei Starovoitov says: ==================== bpf: introduce bpf_tail_call() helper introduce bpf_tail_call(ctx, &jmp_table, index) helper function which can be used from BPF programs like: int bpf_prog(struct pt_regs *ctx) { ... bpf_tail_call(ctx, &jmp_table, index); ... } that is roughly equivalent to: int bpf_prog(struct pt_regs *ctx) { ... if (jmp_table[index]) return (*jmp_table[index])(ctx); ... } The important detail that it's not a normal call, but a tail call. The kernel stack is precious, so this helper reuses the current stack frame and jumps into another BPF program without adding extra call frame. It's trivially done in interpreter and a bit trickier in JITs. Use cases: - simplify complex programs - dispatch into other programs (for example: index in jump table can be syscall number or network protocol) - build dynamic chains of programs The chain of tail calls can form unpredictable dynamic loops therefore tail_call_cnt is used to limit the number of calls and currently is set to 32. patch 1 - support bpf_tail_call() in interpreter patch 2 - support in x64 JIT We've discussed what's neccessary to support it in arm64/s390 JITs and it looks fine. patch 3 - sample example for tracing patch 4 - sample example for networking More details in every patch. This set went through several iterations of reviews/fixes and older attempts can be seen: https://git.kernel.org/cgit/linux/kernel/git/ast/bpf.git/log/?h=tail_call_v[123456] - tail_call_v1 does it without touching JITs but introduces overhead for all programs that don't use this helper function. - tail_call_v2 still has some overhead and x64 JIT does full stack unwind (prologue skipping optimization wasn't there) - tail_call_v3 reuses 'call' instruction encoding and has interpreter overhead for every normal call - tail_call_v4 fixes above architectural shortcomings and v5,v6 fix few more bugs This last tail_call_v6 approach seems to be the best. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
3f55b7ed · David S. Miller · e7582bab · 530b2c86 · 3f55b7ed · 3f55b7ed
Commit 3f55b7ed authored May 21, 2015 by David S. Miller
17 changed files
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -12,6 +12,7 @@
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
 #include <asm/cacheflush.h>
+#include <linux/bpf.h>

 int bpf_jit_enable __read_mostly;

@@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 	return ptr + len;
 }

-#define EMIT(bytes, len)	do { prog = emit_code(prog, bytes, len); } while (0)
+#define EMIT(bytes, len) \
+	do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)

 #define EMIT1(b1)		EMIT(b1, 1)
 #define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
@@ -186,31 +188,31 @@ struct jit_context {
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64

-static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
-		  int oldproglen, struct jit_context *ctx)
+#define STACKSIZE \
+	(MAX_BPF_STACK + \
+	 32 /* space for rbx, r13, r14, r15 */ + \
+	 8 /* space for skb_copy_bits() buffer */)
+
+#define PROLOGUE_SIZE 51
+
+/* emit x64 prologue code for BPF program and check it's size.
+ * bpf_tail_call helper will skip it while jumping into another program
+ */
+static void emit_prologue(u8 **pprog)
 {
-	struct bpf_insn *insn = bpf_prog->insnsi;
-	int insn_cnt = bpf_prog->len;
-	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
-	bool seen_exit = false;
-	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
-	int i;
-	int proglen = 0;
-	u8 *prog = temp;
-	int stacksize = MAX_BPF_STACK +
-		32 /* space for rbx, r13, r14, r15 */ +
-		8 /* space for skb_copy_bits() buffer */;
+	u8 *prog = *pprog;
+	int cnt = 0;

 	EMIT1(0x55); /* push rbp */
 	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */

-	/* sub rsp, stacksize */
-	EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
+	/* sub rsp, STACKSIZE */
+	EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE);

 	/* all classic BPF filters use R6(rbx) save it */

 	/* mov qword ptr [rbp-X],rbx */
-	EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
+	EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE);

 	/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
 	 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
@@ -221,16 +223,112 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	 */

 	/* mov qword ptr [rbp-X],r13 */
-	EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
+	EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8);
 	/* mov qword ptr [rbp-X],r14 */
-	EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
+	EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16);
 	/* mov qword ptr [rbp-X],r15 */
-	EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
+	EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);

 	/* clear A and X registers */
 	EMIT2(0x31, 0xc0); /* xor eax, eax */
 	EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */

+	/* clear tail_cnt: mov qword ptr [rbp-X], rax */
+	EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);
+
+	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+	*pprog = prog;
+}
+
+/* generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ *   if (index >= array->map.max_entries)
+ *     goto out;
+ *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ *     goto out;
+ *   prog = array->prog[index];
+ *   if (prog == NULL)
+ *     goto out;
+ *   goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int label1, label2, label3;
+	int cnt = 0;
+
+	/* rdi - pointer to ctx
+	 * rsi - pointer to bpf_array
+	 * rdx - index in bpf_array
+	 */
+
+	/* if (index >= array->map.max_entries)
+	 *   goto out;
+	 */
+	EMIT4(0x48, 0x8B, 0x46,                   /* mov rax, qword ptr [rsi + 16] */
+	      offsetof(struct bpf_array, map.max_entries));
+	EMIT3(0x48, 0x39, 0xD0);                  /* cmp rax, rdx */
+#define OFFSET1 44 /* number of bytes to jump */
+	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
+	label1 = cnt;
+
+	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *   goto out;
+	 */
+	EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
+	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
+#define OFFSET2 33
+	EMIT2(X86_JA, OFFSET2);                   /* ja out */
+	label2 = cnt;
+	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
+	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
+
+	/* prog = array->prog[index]; */
+	EMIT4(0x48, 0x8D, 0x44, 0xD6);            /* lea rax, [rsi + rdx * 8 + 0x50] */
+	EMIT1(offsetof(struct bpf_array, prog));
+	EMIT3(0x48, 0x8B, 0x00);                  /* mov rax, qword ptr [rax] */
+
+	/* if (prog == NULL)
+	 *   goto out;
+	 */
+	EMIT4(0x48, 0x83, 0xF8, 0x00);            /* cmp rax, 0 */
+#define OFFSET3 10
+	EMIT2(X86_JE, OFFSET3);                   /* je out */
+	label3 = cnt;
+
+	/* goto *(prog->bpf_func + prologue_size); */
+	EMIT4(0x48, 0x8B, 0x40,                   /* mov rax, qword ptr [rax + 32] */
+	      offsetof(struct bpf_prog, bpf_func));
+	EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE);   /* add rax, prologue_size */
+
+	/* now we're ready to jump into next BPF program
+	 * rdi == ctx (1st arg)
+	 * rax == prog->bpf_func + prologue_size
+	 */
+	EMIT2(0xFF, 0xE0);                        /* jmp rax */
+
+	/* out: */
+	BUILD_BUG_ON(cnt - label1 != OFFSET1);
+	BUILD_BUG_ON(cnt - label2 != OFFSET2);
+	BUILD_BUG_ON(cnt - label3 != OFFSET3);
+	*pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+		  int oldproglen, struct jit_context *ctx)
+{
+	struct bpf_insn *insn = bpf_prog->insnsi;
+	int insn_cnt = bpf_prog->len;
+	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
+	bool seen_exit = false;
+	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	int i, cnt = 0;
+	int proglen = 0;
+	u8 *prog = temp;
+
+	emit_prologue(&prog);
+
 	if (seen_ld_abs) {
 		/* r9d : skb->len - skb->data_len (headlen)
 		 * r10 : skb->data
@@ -739,6 +837,10 @@ xadd:			if (is_imm8(insn->off))
 			}
 			break;

+		case BPF_JMP | BPF_CALL | BPF_X:
+			emit_bpf_tail_call(&prog);
+			break;
+
 			/* cond jump */
 		case BPF_JMP | BPF_JEQ | BPF_X:
 		case BPF_JMP | BPF_JNE | BPF_X:
@@ -891,13 +993,13 @@ xadd:			if (is_imm8(insn->off))
 			/* update cleanup_addr */
 			ctx->cleanup_addr = proglen;
 			/* mov rbx, qword ptr [rbp-X] */
-			EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
+			EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE);
 			/* mov r13, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
+			EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8);
 			/* mov r14, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
+			EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16);
 			/* mov r15, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
+			EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24);

 			EMIT1(0xC9); /* leave */
 			EMIT1(0xC3); /* ret */

--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -126,6 +126,27 @@ struct bpf_prog_aux {
 	struct work_struct work;
 };

+struct bpf_array {
+	struct bpf_map map;
+	u32 elem_size;
+	/* 'ownership' of prog_array is claimed by the first program that
+	 * is going to use this map or by the first program which FD is stored
+	 * in the map to make sure that all callers and callees have the same
+	 * prog_type and JITed flag
+	 */
+	enum bpf_prog_type owner_prog_type;
+	bool owner_jited;
+	union {
+		char value[0] __aligned(8);
+		struct bpf_prog *prog[0] __aligned(8);
+	};
+};
+#define MAX_TAIL_CALL_CNT 32
+
+u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
+void bpf_prog_array_map_clear(struct bpf_map *map);
+bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+
 #ifdef CONFIG_BPF_SYSCALL
 void bpf_register_prog_type(struct bpf_prog_type_list *tl);
 void bpf_register_map_type(struct bpf_map_type_list *tl);
@@ -160,5 +181,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto;

 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
+extern const struct bpf_func_proto bpf_tail_call_proto;

 #endif /* _LINUX_BPF_H */
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -378,7 +378,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)

 int sk_filter(struct sock *sk, struct sk_buff *skb);

-void bpf_prog_select_runtime(struct bpf_prog *fp);
+int bpf_prog_select_runtime(struct bpf_prog *fp);
 void bpf_prog_free(struct bpf_prog *fp);

 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);

--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -113,6 +113,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_UNSPEC,
 	BPF_MAP_TYPE_HASH,
 	BPF_MAP_TYPE_ARRAY,
+	BPF_MAP_TYPE_PROG_ARRAY,
 };

 enum bpf_prog_type {
@@ -210,6 +211,15 @@ enum bpf_func_id {
 	 * Return: 0 on success
 	 */
 	BPF_FUNC_l4_csum_replace,
+
+	/**
+	 * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program
+	 * @ctx: context pointer passed to next program
+	 * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
+	 * @index: index inside array that selects specific program to run
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_tail_call,
 	__BPF_FUNC_MAX_ID,
 };


--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,12 +14,7 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-
-struct bpf_array {
-	struct bpf_map map;
-	u32 elem_size;
-	char value[0] __aligned(8);
-};
+#include <linux/filter.h>

 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -154,3 +149,109 @@ static int __init register_array_map(void)
 	return 0;
 }
 late_initcall(register_array_map);
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	/* only bpf_prog file descriptors can be stored in prog_array map */
+	if (attr->value_size != sizeof(u32))
+		return ERR_PTR(-EINVAL);
+	return array_map_alloc(attr);
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* make sure it's empty */
+	for (i = 0; i < array->map.max_entries; i++)
+		BUG_ON(array->prog[i] != NULL);
+	kvfree(array);
+}
+
+static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+/* only called from syscall */
+static int prog_array_map_update_elem(struct bpf_map *map, void *key,
+				      void *value, u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *prog, *old_prog;
+	u32 index = *(u32 *)key, ufd;
+
+	if (map_flags != BPF_ANY)
+		return -EINVAL;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+
+	ufd = *(u32 *)value;
+	prog = bpf_prog_get(ufd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (!bpf_prog_array_compatible(array, prog)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	old_prog = xchg(array->prog + index, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *old_prog;
+	u32 index = *(u32 *)key;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+
+	old_prog = xchg(array->prog + index, NULL);
+	if (old_prog) {
+		bpf_prog_put(old_prog);
+		return 0;
+	} else {
+		return -ENOENT;
+	}
+}
+
+/* decrement refcnt of all bpf_progs that are stored in this map */
+void bpf_prog_array_map_clear(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		prog_array_map_delete_elem(map, &i);
+}
+
+static const struct bpf_map_ops prog_array_ops = {
+	.map_alloc = prog_array_map_alloc,
+	.map_free = prog_array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = prog_array_map_lookup_elem,
+	.map_update_elem = prog_array_map_update_elem,
+	.map_delete_elem = prog_array_map_delete_elem,
+};
+
+static struct bpf_map_type_list prog_array_type __read_mostly = {
+	.ops = &prog_array_ops,
+	.type = BPF_MAP_TYPE_PROG_ARRAY,
+};
+
+static int __init register_prog_array_map(void)
+{
+	bpf_register_map_type(&prog_array_type);
+	return 0;
+}
+late_initcall(register_prog_array_map);
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -176,6 +176,15 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return 0;
 }

+const struct bpf_func_proto bpf_tail_call_proto = {
+	.func = NULL,
+	.gpl_only = false,
+	.ret_type = RET_VOID,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_CONST_MAP_PTR,
+	.arg3_type = ARG_ANYTHING,
+};
+
 /**
 *	__bpf_prog_run - run eBPF program on a given context
 *	@ctx: is the data we are operating on
@@ -244,6 +253,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
 		/* Call instruction */
 		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+		[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
 		/* Jumps */
 		[BPF_JMP | BPF_JA] = &&JMP_JA,
 		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -286,6 +296,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
 		[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
 	};
+	u32 tail_call_cnt = 0;
 	void *ptr;
 	int off;

@@ -431,6 +442,30 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 						       BPF_R4, BPF_R5);
 		CONT;

+	JMP_TAIL_CALL: {
+		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+		struct bpf_prog *prog;
+		u64 index = BPF_R3;
+
+		if (unlikely(index >= array->map.max_entries))
+			goto out;
+
+		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+			goto out;
+
+		tail_call_cnt++;
+
+		prog = READ_ONCE(array->prog[index]);
+		if (unlikely(!prog))
+			goto out;
+
+		ARG1 = BPF_R1;
+		insn = prog->insnsi;
+		goto select_insn;
+out:
+		CONT;
+	}
 	/* JMP */
 	JMP_JA:
 		insn += insn->off;
@@ -619,6 +654,40 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
 }

+bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp)
+{
+	if (array->owner_prog_type) {
+		if (array->owner_prog_type != fp->type)
+			return false;
+		if (array->owner_jited != fp->jited)
+			return false;
+	} else {
+		array->owner_prog_type = fp->type;
+		array->owner_jited = fp->jited;
+	}
+	return true;
+}
+
+static int check_tail_call(const struct bpf_prog *fp)
+{
+	struct bpf_prog_aux *aux = fp->aux;
+	int i;
+
+	for (i = 0; i < aux->used_map_cnt; i++) {
+		struct bpf_array *array;
+		struct bpf_map *map;
+
+		map = aux->used_maps[i];
+		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+			continue;
+		array = container_of(map, struct bpf_array, map);
+		if (!bpf_prog_array_compatible(array, fp))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
 *	bpf_prog_select_runtime - select execution runtime for BPF program
 *	@fp: bpf_prog populated with internal BPF program
@@ -626,7 +695,7 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
 * try to JIT internal BPF program, if JIT is not available select interpreter
 * BPF program will be executed via BPF_PROG_RUN() macro
 */
-void bpf_prog_select_runtime(struct bpf_prog *fp)
+int bpf_prog_select_runtime(struct bpf_prog *fp)
 {
 	fp->bpf_func = (void *) __bpf_prog_run;

@@ -634,6 +703,8 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
 	bpf_int_jit_compile(fp);
 	/* Lock whole bpf_prog as read-only */
 	bpf_prog_lock_ro(fp);
+
+	return check_tail_call(fp);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);


--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_map *map = filp->private_data;

+	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+		/* prog_array stores refcnt-ed bpf_prog pointers
+		 * release them all when user space closes prog_array_fd
+		 */
+		bpf_prog_array_map_clear(map);
+
 	bpf_map_put(map);
 	return 0;
 }
@@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 			 */
 			BUG_ON(!prog->aux->ops->get_func_proto);

+			if (insn->imm == BPF_FUNC_tail_call) {
+				/* mark bpf_tail_call as different opcode
+				 * to avoid conditional branch in
+				 * interpeter for every normal call
+				 * and to prevent accidental JITing by
+				 * JIT compiler that doesn't support
+				 * bpf_tail_call yet
+				 */
+				insn->imm = 0;
+				insn->code |= BPF_X;
+				continue;
+			}
+
 			fn = prog->aux->ops->get_func_proto(insn->imm);
 			/* all functions that have prototype and verifier allowed
 			 * programs to call them, must be real in-kernel functions
@@ -532,7 +551,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	fixup_bpf_calls(prog);

 	/* eBPF program is ready to be JITed */
-	bpf_prog_select_runtime(prog);
+	err = bpf_prog_select_runtime(prog);
+	if (err < 0)
+		goto free_used_maps;

 	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
 	if (err < 0)

--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id)
 			fn->ret_type, func_id);
 		return -EINVAL;
 	}
+
+	if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+	    func_id != BPF_FUNC_tail_call)
+		/* prog_array map type needs extra care:
+		 * only allow to pass it into bpf_tail_call() for now.
+		 * bpf_map_delete_elem() can be allowed in the future,
+		 * while bpf_map_update_elem() must only be done via syscall
+		 */
+		return -EINVAL;
+
+	if (func_id == BPF_FUNC_tail_call &&
+	    map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+		/* don't allow any other map type to be passed into
+		 * bpf_tail_call()
+		 */
+		return -EINVAL;
+
 	return 0;
 }


--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_probe_read_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;

 	case BPF_FUNC_trace_printk:
 		/*

--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1421,6 +1421,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
 	default:
 		return NULL;
 	}

--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,29 +6,35 @@ hostprogs-y := test_verifier test_maps
 hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
+hostprogs-y += sockex3
 hostprogs-y += tracex1
 hostprogs-y += tracex2
 hostprogs-y += tracex3
 hostprogs-y += tracex4
+hostprogs-y += tracex5

 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+sockex3-objs := bpf_load.o libbpf.o sockex3_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
+tracex5-objs := bpf_load.o libbpf.o tracex5_user.o

 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
+always += sockex3_kern.o
 always += tracex1_kern.o
 always += tracex2_kern.o
 always += tracex3_kern.o
 always += tracex4_kern.o
+always += tracex5_kern.o
 always += tcbpf1_kern.o

 HOSTCFLAGS += -I$(objtree)/usr/include
@@ -36,10 +42,12 @@ HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_sockex3 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
 HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
+HOSTLOADLIBES_tracex5 += -lelf

 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc

--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -21,6 +21,10 @@ static unsigned long long (*bpf_ktime_get_ns)(void) =
 	(void *) BPF_FUNC_ktime_get_ns;
 static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
 	(void *) BPF_FUNC_trace_printk;
+static void (*bpf_tail_call)(void *ctx, void *map, int index) =
+	(void *) BPF_FUNC_tail_call;
+static unsigned long long (*bpf_get_smp_processor_id)(void) =
+	(void *) BPF_FUNC_get_smp_processor_id;

 /* llvm builtin functions that eBPF C program may use to
 * emit BPF_LD_ABS and BPF_LD_IND instructions

--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -16,6 +16,7 @@
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <poll.h>
+#include <ctype.h>
 #include "libbpf.h"
 #include "bpf_helpers.h"
 #include "bpf_load.h"
@@ -29,6 +30,19 @@ int map_fd[MAX_MAPS];
 int prog_fd[MAX_PROGS];
 int event_fd[MAX_PROGS];
 int prog_cnt;
+int prog_array_fd = -1;
+
+static int populate_prog_array(const char *event, int prog_fd)
+{
+	int ind = atoi(event), err;
+
+	err = bpf_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
+	if (err < 0) {
+		printf("failed to store prog_fd in prog_array\n");
+		return -1;
+	}
+	return 0;
+}

 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 {
@@ -54,12 +68,40 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		return -1;
 	}

+	fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
+	if (fd < 0) {
+		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
+		return -1;
+	}
+
+	prog_fd[prog_cnt++] = fd;
+
+	if (is_socket) {
+		event += 6;
+		if (*event != '/')
+			return 0;
+		event++;
+		if (!isdigit(*event)) {
+			printf("invalid prog number\n");
+			return -1;
+		}
+		return populate_prog_array(event, fd);
+	}
+
 	if (is_kprobe || is_kretprobe) {
 		if (is_kprobe)
 			event += 7;
 		else
 			event += 10;

+		if (*event == 0) {
+			printf("event name cannot be empty\n");
+			return -1;
+		}
+
+		if (isdigit(*event))
+			return populate_prog_array(event, fd);
+
 		snprintf(buf, sizeof(buf),
 			 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
 			 is_kprobe ? 'p' : 'r', event, event);
@@ -71,18 +113,6 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		}
 	}

-	fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
-
-	if (fd < 0) {
-		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
-		return -1;
-	}
-
-	prog_fd[prog_cnt++] = fd;
-
-	if (is_socket)
-		return 0;
-
 	strcpy(buf, DEBUGFS);
 	strcat(buf, "events/kprobes/");
 	strcat(buf, event);
@@ -130,6 +160,9 @@ static int load_maps(struct bpf_map_def *maps, int len)
 					   maps[i].max_entries);
 		if (map_fd[i] < 0)
 			return 1;
+
+		if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY)
+			prog_array_fd = map_fd[i];
 	}
 	return 0;
 }

--- a/samples/bpf/sockex3_kern.c
+++ b/samples/bpf/sockex3_kern.c
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+#include <uapi/linux/in.h>
+#include <uapi/linux/if.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/if_tunnel.h>
+#include <uapi/linux/mpls.h>
+#define IP_MF		0x2000
+#define IP_OFFSET	0x1FFF
+
+#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F
+
+struct bpf_map_def SEC("maps") jmp_table = {
+	.type = BPF_MAP_TYPE_PROG_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u32),
+	.max_entries = 8,
+};
+
+#define PARSE_VLAN 1
+#define PARSE_MPLS 2
+#define PARSE_IP 3
+#define PARSE_IPV6 4
+
+/* protocol dispatch routine.
+ * It tail-calls next BPF program depending on eth proto
+ * Note, we could have used:
+ * bpf_tail_call(skb, &jmp_table, proto);
+ * but it would need large prog_array
+ */
+static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto)
+{
+	switch (proto) {
+	case ETH_P_8021Q:
+	case ETH_P_8021AD:
+		bpf_tail_call(skb, &jmp_table, PARSE_VLAN);
+		break;
+	case ETH_P_MPLS_UC:
+	case ETH_P_MPLS_MC:
+		bpf_tail_call(skb, &jmp_table, PARSE_MPLS);
+		break;
+	case ETH_P_IP:
+		bpf_tail_call(skb, &jmp_table, PARSE_IP);
+		break;
+	case ETH_P_IPV6:
+		bpf_tail_call(skb, &jmp_table, PARSE_IPV6);
+		break;
+	}
+}
+
+struct vlan_hdr {
+	__be16 h_vlan_TCI;
+	__be16 h_vlan_encapsulated_proto;
+};
+
+struct flow_keys {
+	__be32 src;
+	__be32 dst;
+	union {
+		__be32 ports;
+		__be16 port16[2];
+	};
+	__u32 ip_proto;
+};
+
+static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
+{
+	return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
+		& (IP_MF | IP_OFFSET);
+}
+
+static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
+{
+	__u64 w0 = load_word(ctx, off);
+	__u64 w1 = load_word(ctx, off + 4);
+	__u64 w2 = load_word(ctx, off + 8);
+	__u64 w3 = load_word(ctx, off + 12);
+
+	return (__u32)(w0 ^ w1 ^ w2 ^ w3);
+}
+
+struct globals {
+	struct flow_keys flow;
+	__u32 nhoff;
+};
+
+struct bpf_map_def SEC("maps") percpu_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct globals),
+	.max_entries = 32,
+};
+
+/* user poor man's per_cpu until native support is ready */
+static struct globals *this_cpu_globals(void)
+{
+	u32 key = bpf_get_smp_processor_id();
+
+	return bpf_map_lookup_elem(&percpu_map, &key);
+}
+
+/* some simple stats for user space consumption */
+struct pair {
+	__u64 packets;
+	__u64 bytes;
+};
+
+struct bpf_map_def SEC("maps") hash_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(struct flow_keys),
+	.value_size = sizeof(struct pair),
+	.max_entries = 1024,
+};
+
+static void update_stats(struct __sk_buff *skb, struct globals *g)
+{
+	struct flow_keys key = g->flow;
+	struct pair *value;
+
+	value = bpf_map_lookup_elem(&hash_map, &key);
+	if (value) {
+		__sync_fetch_and_add(&value->packets, 1);
+		__sync_fetch_and_add(&value->bytes, skb->len);
+	} else {
+		struct pair val = {1, skb->len};
+
+		bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
+	}
+}
+
+static __always_inline void parse_ip_proto(struct __sk_buff *skb,
+					   struct globals *g, __u32 ip_proto)
+{
+	__u32 nhoff = g->nhoff;
+	int poff;
+
+	switch (ip_proto) {
+	case IPPROTO_GRE: {
+		struct gre_hdr {
+			__be16 flags;
+			__be16 proto;
+		};
+
+		__u32 gre_flags = load_half(skb,
+					    nhoff + offsetof(struct gre_hdr, flags));
+		__u32 gre_proto = load_half(skb,
+					    nhoff + offsetof(struct gre_hdr, proto));
+
+		if (gre_flags & (GRE_VERSION|GRE_ROUTING))
+			break;
+
+		nhoff += 4;
+		if (gre_flags & GRE_CSUM)
+			nhoff += 4;
+		if (gre_flags & GRE_KEY)
+			nhoff += 4;
+		if (gre_flags & GRE_SEQ)
+			nhoff += 4;
+
+		g->nhoff = nhoff;
+		parse_eth_proto(skb, gre_proto);
+		break;
+	}
+	case IPPROTO_IPIP:
+		parse_eth_proto(skb, ETH_P_IP);
+		break;
+	case IPPROTO_IPV6:
+		parse_eth_proto(skb, ETH_P_IPV6);
+		break;
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		g->flow.ports = load_word(skb, nhoff);
+	case IPPROTO_ICMP:
+		g->flow.ip_proto = ip_proto;
+		update_stats(skb, g);
+		break;
+	default:
+		break;
+	}
+}
+
+PROG(PARSE_IP)(struct __sk_buff *skb)
+{
+	struct globals *g = this_cpu_globals();
+	__u32 nhoff, verlen, ip_proto;
+
+	if (!g)
+		return 0;
+
+	nhoff = g->nhoff;
+
+	if (unlikely(ip_is_fragment(skb, nhoff)))
+		return 0;
+
+	ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol));
+
+	if (ip_proto != IPPROTO_GRE) {
+		g->flow.src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
+		g->flow.dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
+	}
+
+	verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
+	nhoff += (verlen & 0xF) << 2;
+
+	g->nhoff = nhoff;
+	parse_ip_proto(skb, g, ip_proto);
+	return 0;
+}
+
+PROG(PARSE_IPV6)(struct __sk_buff *skb)
+{
+	struct globals *g = this_cpu_globals();
+	__u32 nhoff, ip_proto;
+
+	if (!g)
+		return 0;
+
+	nhoff = g->nhoff;
+
+	ip_proto = load_byte(skb,
+			     nhoff + offsetof(struct ipv6hdr, nexthdr));
+	g->flow.src = ipv6_addr_hash(skb,
+				     nhoff + offsetof(struct ipv6hdr, saddr));
+	g->flow.dst = ipv6_addr_hash(skb,
+				     nhoff + offsetof(struct ipv6hdr, daddr));
+	nhoff += sizeof(struct ipv6hdr);
+
+	g->nhoff = nhoff;
+	parse_ip_proto(skb, g, ip_proto);
+	return 0;
+}
+
+PROG(PARSE_VLAN)(struct __sk_buff *skb)
+{
+	struct globals *g = this_cpu_globals();
+	__u32 nhoff, proto;
+
+	if (!g)
+		return 0;
+
+	nhoff = g->nhoff;
+
+	proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
+						h_vlan_encapsulated_proto));
+	nhoff += sizeof(struct vlan_hdr);
+	g->nhoff = nhoff;
+
+	parse_eth_proto(skb, proto);
+
+	return 0;
+}
+
+PROG(PARSE_MPLS)(struct __sk_buff *skb)
+{
+	struct globals *g = this_cpu_globals();
+	__u32 nhoff, label;
+
+	if (!g)
+		return 0;
+
+	nhoff = g->nhoff;
+
+	label = load_word(skb, nhoff);
+	nhoff += sizeof(struct mpls_label);
+	g->nhoff = nhoff;
+
+	if (label & MPLS_LS_S_MASK) {
+		__u8 verlen = load_byte(skb, nhoff);
+		if ((verlen & 0xF0) == 4)
+			parse_eth_proto(skb, ETH_P_IP);
+		else
+			parse_eth_proto(skb, ETH_P_IPV6);
+	} else {
+		parse_eth_proto(skb, ETH_P_MPLS_UC);
+	}
+
+	return 0;
+}
+
+SEC("socket/0")
+int main_prog(struct __sk_buff *skb)
+{
+	struct globals *g = this_cpu_globals();
+	__u32 nhoff = ETH_HLEN;
+	__u32 proto = load_half(skb, 12);
+
+	if (!g)
+		return 0;
+
+	g->nhoff = nhoff;
+	parse_eth_proto(skb, proto);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
--- a/samples/bpf/sockex3_user.c
+++ b/samples/bpf/sockex3_user.c
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include <unistd.h>
+#include <arpa/inet.h>
+
+struct flow_keys {
+	__be32 src;
+	__be32 dst;
+	union {
+		__be32 ports;
+		__be16 port16[2];
+	};
+	__u32 ip_proto;
+};
+
+struct pair {
+	__u64 packets;
+	__u64 bytes;
+};
+
+int main(int argc, char **argv)
+{
+	char filename[256];
+	FILE *f;
+	int i, sock;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	sock = open_raw_sock("lo");
+
+	assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4],
+			  sizeof(__u32)) == 0);
+
+	if (argc > 1)
+		f = popen("ping -c5 localhost", "r");
+	else
+		f = popen("netperf -l 4 localhost", "r");
+	(void) f;
+
+	for (i = 0; i < 5; i++) {
+		struct flow_keys key = {}, next_key;
+		struct pair value;
+
+		sleep(1);
+		printf("IP     src.port -> dst.port               bytes      packets\n");
+		while (bpf_get_next_key(map_fd[2], &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd[2], &next_key, &value);
+			printf("%s.%05d -> %s.%05d %12lld %12lld\n",
+			       inet_ntoa((struct in_addr){htonl(next_key.src)}),
+			       next_key.port16[0],
+			       inet_ntoa((struct in_addr){htonl(next_key.dst)}),
+			       next_key.port16[1],
+			       value.bytes, value.packets);
+			key = next_key;
+		}
+	}
+	return 0;
+}
--- a/samples/bpf/tracex5_kern.c
+++ b/samples/bpf/tracex5_kern.c
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/seccomp.h>
+#include "bpf_helpers.h"
+
+#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F
+
+struct bpf_map_def SEC("maps") progs = {
+	.type = BPF_MAP_TYPE_PROG_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u32),
+	.max_entries = 1024,
+};
+
+SEC("kprobe/seccomp_phase1")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	struct seccomp_data sd = {};
+
+	bpf_probe_read(&sd, sizeof(sd), (void *)ctx->di);
+
+	/* dispatch into next BPF program depending on syscall number */
+	bpf_tail_call(ctx, &progs, sd.nr);
+
+	/* fall through -> unknown syscall */
+	if (sd.nr >= __NR_getuid && sd.nr <= __NR_getsid) {
+		char fmt[] = "syscall=%d (one of get/set uid/pid/gid)\n";
+		bpf_trace_printk(fmt, sizeof(fmt), sd.nr);
+	}
+	return 0;
+}
+
+/* we jump here when syscall number == __NR_write */
+PROG(__NR_write)(struct pt_regs *ctx)
+{
+	struct seccomp_data sd = {};
+
+	bpf_probe_read(&sd, sizeof(sd), (void *)ctx->di);
+	if (sd.args[2] == 512) {
+		char fmt[] = "write(fd=%d, buf=%p, size=%d)\n";
+		bpf_trace_printk(fmt, sizeof(fmt),
+				 sd.args[0], sd.args[1], sd.args[2]);
+	}
+	return 0;
+}
+
+PROG(__NR_read)(struct pt_regs *ctx)
+{
+	struct seccomp_data sd = {};
+
+	bpf_probe_read(&sd, sizeof(sd), (void *)ctx->di);
+	if (sd.args[2] > 128 && sd.args[2] <= 1024) {
+		char fmt[] = "read(fd=%d, buf=%p, size=%d)\n";
+		bpf_trace_printk(fmt, sizeof(fmt),
+				 sd.args[0], sd.args[1], sd.args[2]);
+	}
+	return 0;
+}
+
+PROG(__NR_mmap)(struct pt_regs *ctx)
+{
+	char fmt[] = "mmap\n";
+	bpf_trace_printk(fmt, sizeof(fmt));
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
--- a/samples/bpf/tracex5_user.c
+++ b/samples/bpf/tracex5_user.c
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <sys/prctl.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+/* install fake seccomp program to enable seccomp code path inside the kernel,
+ * so that our kprobe attached to seccomp_phase1() can be triggered
+ */
+static void install_accept_all_seccomp(void)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+		.filter = filter,
+	};
+	if (prctl(PR_SET_SECCOMP, 2, &prog))
+		perror("prctl");
+}
+
+int main(int ac, char **argv)
+{
+	FILE *f;
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	install_accept_all_seccomp();
+
+	f = popen("dd if=/dev/zero of=/dev/null count=5", "r");
+	(void) f;
+
+	read_trace_pipe();
+
+	return 0;
+}