Commit 0a14842f authored by Eric Dumazet, committed by David S. Miller

net: filter: Just In Time compiler for x86-64

In order to speed up packet filtering, here is an implementation of a
JIT compiler for x86_64.

It is disabled by default, and must be enabled by the admin.

echo 1 >/proc/sys/net/core/bpf_jit_enable

It uses module_alloc() and module_free() to get memory in the 2GB kernel
text range, since we call helper functions from the generated code.
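
The 2GB constraint comes from the emitted calls: the generated code reaches
helpers such as sk_load_word with CALL rel32 (the e8 bytes visible in the
trace below), and a signed 32-bit displacement only spans that range. A
minimal sketch of such an emit, with a hypothetical emit_call() helper:

#include <stdint.h>
#include <string.h>

/* Hypothetical sketch: CALL rel32 encodes its displacement relative to
 * the end of the 5-byte instruction, so the target must lie within
 * +/-2GB of the generated code; hence module_alloc(). */
static void emit_call(uint8_t **prog, const void *target)
{
	int32_t rel32 = (int32_t)((const uint8_t *)target - (*prog + 5));

	*(*prog)++ = 0xe8;		/* CALL rel32 opcode */
	memcpy(*prog, &rel32, 4);	/* little-endian displacement */
	*prog += 4;
}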

EAX : BPF A accumulator
EBX : BPF X accumulator
RDI : pointer to skb   (first argument given to JIT function)
RBP : frame pointer (even if CONFIG_FRAME_POINTER=n)
r9d : skb->len - skb->data_len (headlen)
r8  : skb->data
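
This assignment follows the x86-64 SysV calling convention: the image is
entered through the bpf_func pointer added to struct sk_filter below, so
skb, its first argument, arrives in RDI. In C terms (the typedef name is
illustrative, not from the commit):

struct sk_buff;
struct sock_filter;

/* Signature of the generated image, matching the bpf_func member
 * added to struct sk_filter further down. */
typedef unsigned int (*bpf_jit_image_t)(const struct sk_buff *skb,
					const struct sock_filter *filter);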

To get a trace of the generated code, use:

echo 2 >/proc/sys/net/core/bpf_jit_enable

Example of generated code:

# tcpdump -p -n -s 0 -i eth1 host 192.168.20.0/24

flen=18 proglen=147 pass=3 image=ffffffffa00b5000
JIT code: ffffffffa00b5000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 60
JIT code: ffffffffa00b5010: 44 2b 4f 64 4c 8b 87 b8 00 00 00 be 0c 00 00 00
JIT code: ffffffffa00b5020: e8 24 7b f7 e0 3d 00 08 00 00 75 28 be 1a 00 00
JIT code: ffffffffa00b5030: 00 e8 fe 7a f7 e0 24 00 3d 00 14 a8 c0 74 49 be
JIT code: ffffffffa00b5040: 1e 00 00 00 e8 eb 7a f7 e0 24 00 3d 00 14 a8 c0
JIT code: ffffffffa00b5050: 74 36 eb 3b 3d 06 08 00 00 74 07 3d 35 80 00 00
JIT code: ffffffffa00b5060: 75 2d be 1c 00 00 00 e8 c8 7a f7 e0 24 00 3d 00
JIT code: ffffffffa00b5070: 14 a8 c0 74 13 be 26 00 00 00 e8 b5 7a f7 e0 24
JIT code: ffffffffa00b5080: 00 3d 00 14 a8 c0 75 07 b8 ff ff 00 00 eb 02 31
JIT code: ffffffffa00b5090: c0 c9 c3

The BPF program is 144 bytes long (18 instructions x 8 bytes), so the native program is almost the same size ;)

(000) ldh      [12]
(001) jeq      #0x800           jt 2    jf 8
(002) ld       [26]
(003) and      #0xffffff00
(004) jeq      #0xc0a81400      jt 16   jf 5
(005) ld       [30]
(006) and      #0xffffff00
(007) jeq      #0xc0a81400      jt 16   jf 17
(008) jeq      #0x806           jt 10   jf 9
(009) jeq      #0x8035          jt 10   jf 17
(010) ld       [28]
(011) and      #0xffffff00
(012) jeq      #0xc0a81400      jt 16   jf 13
(013) ld       [38]
(014) and      #0xffffff00
(015) jeq      #0xc0a81400      jt 16   jf 17
(016) ret      #65535
(017) ret      #0
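
For readability, here is the same filter transcribed into plain C; the
match_net() and load32() helpers are hypothetical, not part of the commit,
and the explicit length checks stand in for classic BPF's implicit rule
that an out-of-range load ends the filter with a return value of 0:

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <arpa/inet.h>	/* ntohl(), ntohs() */

static uint32_t load32(const uint8_t *p)	/* BPF "ld" is big-endian */
{
	uint32_t v;

	memcpy(&v, p, 4);
	return ntohl(v);
}

/* Hypothetical C rendering of the filter above: accept IPv4/ARP/RARP
 * packets with a source or destination address in 192.168.20.0/24
 * (0xc0a81400), returning the BPF verdicts 65535 or 0. */
static unsigned int match_net(const uint8_t *pkt, size_t len)
{
	uint16_t proto;
	uint32_t src, dst;

	if (len < 14)
		return 0;
	memcpy(&proto, pkt + 12, 2);
	proto = ntohs(proto);				/* (000) ldh [12]  */
	if (proto == 0x0800) {				/* (001) IPv4      */
		if (len < 34)
			return 0;
		src = load32(pkt + 26);			/* (002) ld [26]   */
		dst = load32(pkt + 30);			/* (005) ld [30]   */
	} else if (proto == 0x0806 || proto == 0x8035) {/* (008),(009)     */
		if (len < 42)
			return 0;
		src = load32(pkt + 28);			/* (010) ld [28]   */
		dst = load32(pkt + 38);			/* (013) ld [38]   */
	} else {
		return 0;				/* (017) ret #0    */
	}
	if ((src & 0xffffff00) == 0xc0a81400 ||		/* (003)-(004)     */
	    (dst & 0xffffff00) == 0xc0a81400)		/* (006)-(007)     */
		return 65535;				/* (016) ret #65535 */
	return 0;					/* (017) ret #0    */
}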
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Hagen Paul Pfeifer <hagen@jauu.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent b678027c
@@ -32,6 +32,17 @@ Table : Subdirectories in /proc/sys/net
1. /proc/sys/net/core - Network core options
-------------------------------------------------------
bpf_jit_enable
--------------
This enables the Berkeley Packet Filter Just in Time compiler.
Currently supported on the x86_64 architecture, bpf_jit provides a framework
to speed up packet filtering, such as the filtering used by tcpdump/libpcap.
Values:
0 - disable the JIT (default value)
1 - enable the JIT
2 - enable the JIT and ask the compiler to emit traces on kernel log.
rmem_default
------------
@@ -4372,6 +4372,7 @@ S: Maintained
F: net/ipv4/
F: net/ipv6/
F: include/net/ip*
F: arch/x86/net/*
NETWORKING [LABELED] (NetLabel, CIPSO, Labeled IPsec, SECMARK)
M: Paul Moore <paul.moore@hp.com>
@@ -15,3 +15,4 @@ obj-y += vdso/
obj-$(CONFIG_IA32_EMULATION) += ia32/
obj-y += platform/
obj-y += net/
@@ -72,6 +72,7 @@ config X86
select IRQ_FORCED_THREADING
select USE_GENERIC_SMP_HELPERS if SMP
select ARCH_NO_SYSDEV_OPS
select HAVE_BPF_JIT if X86_64
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
#
# Arch-specific network modules
#
obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
/* bpf_jit.S : BPF JIT helper functions
*
* Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/linkage.h>
#include <asm/dwarf2.h>
/*
* Calling convention :
* rdi : skb pointer
* esi : offset of byte(s) to fetch in skb (can be scratched)
* r8 : copy of skb->data
* r9d : hlen = skb->len - skb->data_len
*/
#define SKBDATA %r8
sk_load_word_ind:
.globl sk_load_word_ind
add %ebx,%esi /* offset += X */
# test %esi,%esi /* if (offset < 0) goto bpf_error; */
js bpf_error
sk_load_word:
.globl sk_load_word
mov %r9d,%eax # hlen
sub %esi,%eax # hlen - offset
cmp $3,%eax
jle bpf_slow_path_word
mov (SKBDATA,%rsi),%eax
bswap %eax /* ntohl() */
ret
sk_load_half_ind:
.globl sk_load_half_ind
add %ebx,%esi /* offset += X */
js bpf_error
sk_load_half:
.globl sk_load_half
mov %r9d,%eax
sub %esi,%eax # hlen - offset
cmp $1,%eax
jle bpf_slow_path_half
movzwl (SKBDATA,%rsi),%eax
rol $8,%ax # ntohs()
ret
sk_load_byte_ind:
.globl sk_load_byte_ind
add %ebx,%esi /* offset += X */
js bpf_error
sk_load_byte:
.globl sk_load_byte
cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */
jle bpf_slow_path_byte
movzbl (SKBDATA,%rsi),%eax
ret
/**
* sk_load_byte_msh - BPF_S_LDX_B_MSH helper
*
* Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf)
* Must preserve A accumulator (%eax)
* Inputs : %esi is the offset value, already known positive
*/
ENTRY(sk_load_byte_msh)
CFI_STARTPROC
cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */
jle bpf_slow_path_byte_msh
movzbl (SKBDATA,%rsi),%ebx
and $15,%bl
shl $2,%bl
ret
CFI_ENDPROC
ENDPROC(sk_load_byte_msh)
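/*
 * Illustrative note, not part of the commit: in C terms this helper
 * computes X = (P[k] & 0x0f) << 2, i.e. 4 * (P[k] & 0xf), classically
 * the IPv4 header length in bytes taken from the IHL nibble.
 */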
bpf_error:
# force a return 0 from jit handler
xor %eax,%eax
mov -8(%rbp),%rbx
leaveq
ret
/* rsi contains offset and can be scratched */
#define bpf_slow_path_common(LEN) \
push %rdi; /* save skb */ \
push %r9; \
push SKBDATA; \
/* rsi already has offset */ \
mov $LEN,%ecx; /* len */ \
lea -12(%rbp),%rdx; \
call skb_copy_bits; \
test %eax,%eax; \
pop SKBDATA; \
pop %r9; \
pop %rdi
bpf_slow_path_word:
bpf_slow_path_common(4)
js bpf_error
mov -12(%rbp),%eax
bswap %eax
ret
bpf_slow_path_half:
bpf_slow_path_common(2)
js bpf_error
mov -12(%rbp),%ax
rol $8,%ax
movzwl %ax,%eax
ret
bpf_slow_path_byte:
bpf_slow_path_common(1)
js bpf_error
movzbl -12(%rbp),%eax
ret
bpf_slow_path_byte_msh:
xchg %eax,%ebx /* don't lose A; X is about to be scratched */
bpf_slow_path_common(1)
js bpf_error
movzbl -12(%rbp),%eax
and $15,%al
shl $2,%al
xchg %eax,%ebx
ret
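
For reference, here is a hedged C transcription of one slow path; the
slow_path_word() wrapper is illustrative only, while skb_copy_bits() is
the real kernel helper called above (it also gathers bytes from paged
fragments):

#include <linux/skbuff.h>	/* skb_copy_bits(), struct sk_buff */
#include <asm/byteorder.h>	/* ntohl() */

/* Illustrative C equivalent of bpf_slow_path_word: when fewer than four
 * bytes remain in the linear header, gather them with skb_copy_bits();
 * a negative return takes the bpf_error path. */
static int slow_path_word(const struct sk_buff *skb, int offset, u32 *a)
{
	u32 tmp;	/* the stack slot at -12(%rbp) */

	if (skb_copy_bits(skb, offset, &tmp, sizeof(tmp)) < 0)
		return -1;	/* bpf_error: the filter returns 0 */
	*a = ntohl(tmp);	/* matches the bswap %eax above */
	return 0;
}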
(This diff is collapsed in the page view; it contains the compiler proper, bpf_jit_comp.c, listed in the Makefile above.)
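
Even without that file, its overall shape can be inferred from the trace
above (flen=18 proglen=147 pass=3): the emitter runs repeatedly until the
generated length stops changing, since conditional jumps can shrink as
offsets settle, then allocates the image and writes it in a final pass. A
hedged sketch of that loop, not the committed implementation; emit_pass()
is a hypothetical stand-in for the real per-opcode emitter:

#include <linux/filter.h>	/* struct sk_filter */
#include <linux/moduleloader.h>	/* module_alloc() */

/* Hypothetical: returns the program's byte length, and writes the
 * instructions only when image is non-NULL. */
unsigned int emit_pass(u8 *image, const struct sock_filter *f, int flen);

static void bpf_jit_compile_sketch(struct sk_filter *fp)
{
	unsigned int proglen = 0, oldproglen = ~0U;
	u8 *image = NULL;
	int pass;

	for (pass = 0; pass < 10; pass++) {
		proglen = emit_pass(image, fp->insns, fp->len);
		if (image)
			break;			/* final pass wrote the code */
		if (proglen == oldproglen)	/* jump offsets have settled */
			image = module_alloc(proglen);
		oldproglen = proglen;
	}
	if (image)
		fp->bpf_func = (void *)image;	/* SK_RUN_FILTER now hits the JIT */
}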
@@ -135,6 +135,8 @@ struct sk_filter
{
atomic_t refcnt;
unsigned int len; /* Number of filter blocks */
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct sock_filter *filter);
struct rcu_head rcu;
struct sock_filter insns[0];
};
@@ -153,6 +155,80 @@ extern unsigned int sk_run_filter(const struct sk_buff *skb,
extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
extern int sk_detach_filter(struct sock *sk);
extern int sk_chk_filter(struct sock_filter *filter, int flen);
#ifdef CONFIG_BPF_JIT
extern void bpf_jit_compile(struct sk_filter *fp);
extern void bpf_jit_free(struct sk_filter *fp);
#define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns)
#else
static inline void bpf_jit_compile(struct sk_filter *fp)
{
}
static inline void bpf_jit_free(struct sk_filter *fp)
{
}
#define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns)
#endif
enum {
BPF_S_RET_K = 1,
BPF_S_RET_A,
BPF_S_ALU_ADD_K,
BPF_S_ALU_ADD_X,
BPF_S_ALU_SUB_K,
BPF_S_ALU_SUB_X,
BPF_S_ALU_MUL_K,
BPF_S_ALU_MUL_X,
BPF_S_ALU_DIV_X,
BPF_S_ALU_AND_K,
BPF_S_ALU_AND_X,
BPF_S_ALU_OR_K,
BPF_S_ALU_OR_X,
BPF_S_ALU_LSH_K,
BPF_S_ALU_LSH_X,
BPF_S_ALU_RSH_K,
BPF_S_ALU_RSH_X,
BPF_S_ALU_NEG,
BPF_S_LD_W_ABS,
BPF_S_LD_H_ABS,
BPF_S_LD_B_ABS,
BPF_S_LD_W_LEN,
BPF_S_LD_W_IND,
BPF_S_LD_H_IND,
BPF_S_LD_B_IND,
BPF_S_LD_IMM,
BPF_S_LDX_W_LEN,
BPF_S_LDX_B_MSH,
BPF_S_LDX_IMM,
BPF_S_MISC_TAX,
BPF_S_MISC_TXA,
BPF_S_ALU_DIV_K,
BPF_S_LD_MEM,
BPF_S_LDX_MEM,
BPF_S_ST,
BPF_S_STX,
BPF_S_JMP_JA,
BPF_S_JMP_JEQ_K,
BPF_S_JMP_JEQ_X,
BPF_S_JMP_JGE_K,
BPF_S_JMP_JGE_X,
BPF_S_JMP_JGT_K,
BPF_S_JMP_JGT_X,
BPF_S_JMP_JSET_K,
BPF_S_JMP_JSET_X,
/* Ancillary data */
BPF_S_ANC_PROTOCOL,
BPF_S_ANC_PKTTYPE,
BPF_S_ANC_IFINDEX,
BPF_S_ANC_NLATTR,
BPF_S_ANC_NLATTR_NEST,
BPF_S_ANC_MARK,
BPF_S_ANC_QUEUE,
BPF_S_ANC_HATYPE,
BPF_S_ANC_RXHASH,
BPF_S_ANC_CPU,
};
#endif /* __KERNEL__ */
#endif /* __LINUX_FILTER_H__ */
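
This enum previously lived in net/core/filter.c (it is removed there
below); hoisting it into the header lets the JIT switch on the dense
codes that sk_chk_filter() produces from the classic opcode encoding. A
hedged sketch of that mapping, with illustrative cases only (the real
code may well use a lookup table instead):

#include <linux/filter.h>	/* BPF_LD, BPF_W, BPF_ABS, BPF_S_*, struct sock_filter */

/* Illustrative: rewrite the classic (class|size|mode) opcode into the
 * dense BPF_S_* range so interpreter and JIT can switch on it directly. */
static void classify_insn(struct sock_filter *f)
{
	switch (f->code) {
	case BPF_LD | BPF_W | BPF_ABS:	f->code = BPF_S_LD_W_ABS; break;
	case BPF_LD | BPF_H | BPF_ABS:	f->code = BPF_S_LD_H_ABS; break;
	case BPF_RET | BPF_K:		f->code = BPF_S_RET_K;    break;
	/* ... one case per classic opcode ... */
	}
}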
@@ -2514,6 +2514,7 @@ extern struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
extern int netdev_max_backlog;
extern int netdev_tstamp_prequeue;
extern int weight_p;
extern int bpf_jit_enable;
extern int netdev_set_master(struct net_device *dev, struct net_device *master);
extern int netdev_set_bond_master(struct net_device *dev,
struct net_device *master);
@@ -391,8 +391,8 @@ struct sk_buff {
 	__u32 rxhash;
+	__u16 queue_mapping;
 	kmemcheck_bitfield_begin(flags2);
-	__u16 queue_mapping:16;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8 ndisc_nodetype:2;
 #endif
@@ -232,6 +232,19 @@ config XPS
depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
default y
config HAVE_BPF_JIT
bool
config BPF_JIT
bool "enable BPF Just In Time compiler"
depends on HAVE_BPF_JIT
---help---
Berkeley Packet Filter filtering capabilities are normally handled
by an interpreter. This option allows the kernel to generate native
code when a filter is loaded into memory. This should speed up
packet sniffing (libpcap/tcpdump). Note: the admin must enable
this feature by changing /proc/sys/net/core/bpf_jit_enable.
menu "Network testing"
config NET_PKTGEN
@@ -39,65 +39,6 @@
#include <linux/filter.h>
#include <linux/reciprocal_div.h>
enum {
BPF_S_RET_K = 1,
BPF_S_RET_A,
BPF_S_ALU_ADD_K,
BPF_S_ALU_ADD_X,
BPF_S_ALU_SUB_K,
BPF_S_ALU_SUB_X,
BPF_S_ALU_MUL_K,
BPF_S_ALU_MUL_X,
BPF_S_ALU_DIV_X,
BPF_S_ALU_AND_K,
BPF_S_ALU_AND_X,
BPF_S_ALU_OR_K,
BPF_S_ALU_OR_X,
BPF_S_ALU_LSH_K,
BPF_S_ALU_LSH_X,
BPF_S_ALU_RSH_K,
BPF_S_ALU_RSH_X,
BPF_S_ALU_NEG,
BPF_S_LD_W_ABS,
BPF_S_LD_H_ABS,
BPF_S_LD_B_ABS,
BPF_S_LD_W_LEN,
BPF_S_LD_W_IND,
BPF_S_LD_H_IND,
BPF_S_LD_B_IND,
BPF_S_LD_IMM,
BPF_S_LDX_W_LEN,
BPF_S_LDX_B_MSH,
BPF_S_LDX_IMM,
BPF_S_MISC_TAX,
BPF_S_MISC_TXA,
BPF_S_ALU_DIV_K,
BPF_S_LD_MEM,
BPF_S_LDX_MEM,
BPF_S_ST,
BPF_S_STX,
BPF_S_JMP_JA,
BPF_S_JMP_JEQ_K,
BPF_S_JMP_JEQ_X,
BPF_S_JMP_JGE_K,
BPF_S_JMP_JGE_X,
BPF_S_JMP_JGT_K,
BPF_S_JMP_JGT_X,
BPF_S_JMP_JSET_K,
BPF_S_JMP_JSET_X,
/* Ancillary data */
BPF_S_ANC_PROTOCOL,
BPF_S_ANC_PKTTYPE,
BPF_S_ANC_IFINDEX,
BPF_S_ANC_NLATTR,
BPF_S_ANC_NLATTR_NEST,
BPF_S_ANC_MARK,
BPF_S_ANC_QUEUE,
BPF_S_ANC_HATYPE,
BPF_S_ANC_RXHASH,
BPF_S_ANC_CPU,
};
/* No hurry in this branch */
static void *__load_pointer(const struct sk_buff *skb, int k, unsigned int size)
{
@@ -145,7 +86,7 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter) {
-		unsigned int pkt_len = sk_run_filter(skb, filter->insns);
+		unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
}
@@ -638,6 +579,7 @@ void sk_filter_release_rcu(struct rcu_head *rcu)
{
struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
bpf_jit_free(fp);
kfree(fp);
}
EXPORT_SYMBOL(sk_filter_release_rcu);
@@ -672,6 +614,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
atomic_set(&fp->refcnt, 1);
fp->len = fprog->len;
fp->bpf_func = sk_run_filter;
err = sk_chk_filter(fp->insns, fp->len);
if (err) {
@@ -679,6 +622,8 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
return err;
}
bpf_jit_compile(fp);
old_fp = rcu_dereference_protected(sk->sk_filter,
sock_owned_by_user(sk));
rcu_assign_pointer(sk->sk_filter, fp);
@@ -122,6 +122,15 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
#ifdef CONFIG_BPF_JIT
{
.procname = "bpf_jit_enable",
.data = &bpf_jit_enable,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
#endif
{
.procname = "netdev_tstamp_prequeue",
.data = &netdev_tstamp_prequeue,
@@ -538,7 +538,7 @@ static inline unsigned int run_filter(const struct sk_buff *skb,
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter != NULL)
-		res = sk_run_filter(skb, filter->insns);
+		res = SK_RUN_FILTER(filter, skb);
rcu_read_unlock();
return res;