Merge branch 'bpf-nfp-jmp-memcpy-improvements'

Jiong Wang says: ==================== Currently, compiler will lower memcpy function call in XDP/eBPF C program into a sequence of eBPF load/store pairs for some scenarios. Compiler is thinking this "inline" optimiation is beneficial as it could avoid function call and also increase code locality. However, Netronome NPU is not an tranditional load/store architecture that doing a sequence of individual load/store actions are not efficient. This patch set tries to identify the load/store sequences composed of load/store pairs that comes from memcpy lowering, then accelerates them through NPU's Command Push Pull (CPP) instruction. This patch set registered an new optimization pass before doing the actual JIT work, it traverse through eBPF IR, once found candidate sequence then record the memory copy source, destination and length information in the first load instruction starting the sequence and marks all remaining instructions in the sequence into skipable status. Later, when JITing the first load instructoin, optimal instructions will be generated using those record information. For this safety of this transformation: - jump into the middle of the sequence will cancel the optimization. - overlapped memory access will cancel the optimization. - the load destination register still contains the same value as before the transformation. ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

Merge branch 'bpf-nfp-jmp-memcpy-improvements'
Jiong Wang says: ==================== Currently, compiler will lower memcpy function call in XDP/eBPF C program into a sequence of eBPF load/store pairs for some scenarios. Compiler is thinking this "inline" optimiation is beneficial as it could avoid function call and also increase code locality. However, Netronome NPU is not an tranditional load/store architecture that doing a sequence of individual load/store actions are not efficient. This patch set tries to identify the load/store sequences composed of load/store pairs that comes from memcpy lowering, then accelerates them through NPU's Command Push Pull (CPP) instruction. This patch set registered an new optimization pass before doing the actual JIT work, it traverse through eBPF IR, once found candidate sequence then record the memory copy source, destination and length information in the first load instruction starting the sequence and marks all remaining instructions in the sequence into skipable status. Later, when JITing the first load instructoin, optimal instructions will be generated using those record information. For this safety of this transformation: - jump into the middle of the sequence will cancel the optimization. - overlapped memory access will cancel the optimization. - the load destination register still contains the same value as before the transformation. ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
44851665 · Daniel Borkmann · 554b36bf · 6bc7103c · 44851665 · 44851665
Commit 44851665 authored Dec 01, 2017 by Daniel Borkmann
8 changed files
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
 *
 * This software is dual licensed under the GNU General License Version 2,
 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -89,23 +89,37 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
 #define nfp_meta_next(meta)	list_next_entry(meta, l)
 #define nfp_meta_prev(meta)	list_prev_entry(meta, l)

+#define FLAG_INSN_IS_JUMP_DST	BIT(0)
+
 /**
 * struct nfp_insn_meta - BPF instruction wrapper
 * @insn: BPF instruction
 * @ptr: pointer type for memory operations
+ * @ldst_gather_len: memcpy length gathered from load/store sequence
+ * @paired_st: the paired store insn at the head of the sequence
 * @ptr_not_const: pointer is not always constant
+ * @jmp_dst: destination info for jump instructions
 * @off: index of first generated machine instruction (in nfp_prog.prog)
 * @n: eBPF instruction number
+ * @flags: eBPF instruction extra optimization flags
 * @skip: skip this instruction (optimized out)
 * @double_cb: callback for second part of the instruction
 * @l: link on nfp_prog->insns list
 */
 struct nfp_insn_meta {
 	struct bpf_insn insn;
-	struct bpf_reg_state ptr;
-	bool ptr_not_const;
+	union {
+		struct {
+			struct bpf_reg_state ptr;
+			struct bpf_insn *paired_st;
+			s16 ldst_gather_len;
+			bool ptr_not_const;
+		};
+		struct nfp_insn_meta *jmp_dst;
+	};
 	unsigned int off;
 	unsigned short n;
+	unsigned short flags;
 	bool skip;
 	instr_cb_t double_cb;

@@ -134,6 +148,16 @@ static inline u8 mbpf_mode(const struct nfp_insn_meta *meta)
 	return BPF_MODE(meta->insn.code);
 }

+static inline bool is_mbpf_load(const struct nfp_insn_meta *meta)
+{
+	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM);
+}
+
+static inline bool is_mbpf_store(const struct nfp_insn_meta *meta)
+{
+	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM);
+}
+
 /**
 * struct nfp_prog - nfp BPF program
 * @prog: machine code
@@ -142,6 +166,7 @@ static inline u8 mbpf_mode(const struct nfp_insn_meta *meta)
 * @verifier_meta: temporary storage for verifier's insn meta
 * @type: BPF program type
 * @start_off: address of the first instruction in the memory
+ * @last_bpf_off: address of the last instruction translated from BPF
 * @tgt_out: jump target for normal exit
 * @tgt_abort: jump target for abort (e.g. access outside of packet buffer)
 * @tgt_done: jump target to get the next packet
@@ -160,6 +185,7 @@ struct nfp_prog {
 	enum bpf_prog_type type;

 	unsigned int start_off;
+	unsigned int last_bpf_off;
 	unsigned int tgt_out;
 	unsigned int tgt_abort;
 	unsigned int tgt_done;
@@ -189,4 +215,7 @@ int nfp_bpf_translate(struct nfp_app *app, struct nfp_net *nn,
 		      struct bpf_prog *prog);
 int nfp_bpf_destroy(struct nfp_app *app, struct nfp_net *nn,
 		    struct bpf_prog *prog);
+struct nfp_insn_meta *
+nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
+		  unsigned int insn_idx, unsigned int n_insns);
 #endif
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
 *
 * This software is dual licensed under the GNU General License Version 2,
 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -55,11 +55,10 @@ static int
 nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
 		 unsigned int cnt)
 {
+	struct nfp_insn_meta *meta;
 	unsigned int i;

 	for (i = 0; i < cnt; i++) {
-		struct nfp_insn_meta *meta;
-
 		meta = kzalloc(sizeof(*meta), GFP_KERNEL);
 		if (!meta)
 			return -ENOMEM;
@@ -70,6 +69,24 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
 		list_add_tail(&meta->l, &nfp_prog->insns);
 	}

+	/* Another pass to record jump information. */
+	list_for_each_entry(meta, &nfp_prog->insns, l) {
+		u64 code = meta->insn.code;
+
+		if (BPF_CLASS(code) == BPF_JMP && BPF_OP(code) != BPF_EXIT &&
+		    BPF_OP(code) != BPF_CALL) {
+			struct nfp_insn_meta *dst_meta;
+			unsigned short dst_indx;
+
+			dst_indx = meta->n + 1 + meta->insn.off;
+			dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_indx,
+						     cnt);
+
+			meta->jmp_dst = dst_meta;
+			dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
+		}
+	}
+
 	return 0;
 }


--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
 *
 * This software is dual licensed under the GNU General License Version 2,
 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -40,7 +40,7 @@

 #include "main.h"

-static struct nfp_insn_meta *
+struct nfp_insn_meta *
 nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 		  unsigned int insn_idx, unsigned int n_insns)
 {
@@ -180,10 +180,10 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	if (meta->insn.code == (BPF_JMP | BPF_EXIT))
 		return nfp_bpf_check_exit(nfp_prog, env);

-	if ((meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM))
+	if (is_mbpf_load(meta))
 		return nfp_bpf_check_ptr(nfp_prog, meta, env,
 					 meta->insn.src_reg);
-	if ((meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM))
+	if (is_mbpf_store(meta))
 		return nfp_bpf_check_ptr(nfp_prog, meta, env,
 					 meta->insn.dst_reg);


--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
@@ -41,6 +41,7 @@

 const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = {
 	[CMD_TGT_WRITE8_SWAP] =		{ 0x02, 0x42 },
+	[CMD_TGT_WRITE32_SWAP] =	{ 0x02, 0x5f },
 	[CMD_TGT_READ8] =		{ 0x01, 0x43 },
 	[CMD_TGT_READ32] =		{ 0x00, 0x5c },
 	[CMD_TGT_READ32_LE] =		{ 0x01, 0x5c },
@@ -120,7 +121,8 @@ int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg,
 	reg->dst = nfp_swreg_to_unreg(dst, true);

 	/* Decode source operands */
-	if (swreg_type(lreg) == swreg_type(rreg))
+	if (swreg_type(lreg) == swreg_type(rreg) &&
+	    swreg_type(lreg) != NN_REG_NONE)
 		return -EFAULT;

 	if (swreg_type(lreg) == NN_REG_GPR_B ||
@@ -200,7 +202,8 @@ int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg,
 	reg->dst = nfp_swreg_to_rereg(dst, true, false, NULL);

 	/* Decode source operands */
-	if (swreg_type(lreg) == swreg_type(rreg))
+	if (swreg_type(lreg) == swreg_type(rreg) &&
+	    swreg_type(lreg) != NN_REG_NONE)
 		return -EFAULT;

 	if (swreg_type(lreg) == NN_REG_GPR_B ||

--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
 *
 * This software is dual licensed under the GNU General License Version 2,
 * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -209,6 +209,7 @@ enum alu_dst_ab {
 #define OP_CMD_CNT		0x0000e000000ULL
 #define OP_CMD_SIG		0x000f0000000ULL
 #define OP_CMD_TGT_CMD		0x07f00000000ULL
+#define OP_CMD_INDIR		0x20000000000ULL
 #define OP_CMD_MODE	       0x1c0000000000ULL

 struct cmd_tgt_act {
@@ -219,6 +220,7 @@ struct cmd_tgt_act {
 enum cmd_tgt_map {
 	CMD_TGT_READ8,
 	CMD_TGT_WRITE8_SWAP,
+	CMD_TGT_WRITE32_SWAP,
 	CMD_TGT_READ32,
 	CMD_TGT_READ32_LE,
 	CMD_TGT_READ32_SWAP,
@@ -240,6 +242,9 @@ enum cmd_ctx_swap {
 	CMD_CTX_NO_SWAP = 3,
 };

+#define CMD_OVE_LEN	BIT(7)
+#define CMD_OV_LEN	GENMASK(12, 8)
+
 #define OP_LCSR_BASE		0x0fc00000000ULL
 #define OP_LCSR_A_SRC		0x000000003ffULL
 #define OP_LCSR_B_SRC		0x000000ffc00ULL

--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -548,6 +548,8 @@ struct nfp_net_dp {
 * @max_r_vecs:		Number of allocated interrupt vectors for RX/TX
 * @max_tx_rings:       Maximum number of TX rings supported by the Firmware
 * @max_rx_rings:       Maximum number of RX rings supported by the Firmware
+ * @stride_rx:		Queue controller RX queue spacing
+ * @stride_tx:		Queue controller TX queue spacing
 * @r_vecs:             Pre-allocated array of ring vectors
 * @irq_entries:        Pre-allocated array of MSI-X entries
 * @lsc_handler:        Handler for Link State Change interrupt

--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
@@ -372,8 +372,7 @@ nfp_cpp_area_alloc(struct nfp_cpp *cpp, u32 dest,
 * that it can be accessed directly.
 *
 * NOTE: @address and @size must be 32-bit aligned values.
- *
- * NOTE: The area must also be 'released' when the structure is freed.
+ * The area must also be 'released' when the structure is freed.
 *
 * Return: NFP CPP Area handle, or NULL
 */
@@ -536,8 +535,7 @@ void nfp_cpp_area_release_free(struct nfp_cpp_area *area)
 * Read data from indicated CPP region.
 *
 * NOTE: @offset and @length must be 32-bit aligned values.
- *
- * NOTE: Area must have been locked down with an 'acquire'.
+ * Area must have been locked down with an 'acquire'.
 *
 * Return: length of io, or -ERRNO
 */
@@ -558,8 +556,7 @@ int nfp_cpp_area_read(struct nfp_cpp_area *area,
 * Write data to indicated CPP region.
 *
 * NOTE: @offset and @length must be 32-bit aligned values.
- *
- * NOTE: Area must have been locked down with an 'acquire'.
+ * Area must have been locked down with an 'acquire'.
 *
 * Return: length of io, or -ERRNO
 */