• Alexei Starovoitov's avatar
    net: filter: x86: internal BPF JIT · 62258278
    Alexei Starovoitov authored
    Maps all internal BPF instructions into x86_64 instructions.
    This patch replaces original BPF x64 JIT with internal BPF x64 JIT.
    sysctl net.core.bpf_jit_enable is reused as on/off switch.
    
    Performance:
    
    1. old BPF JIT and internal BPF JIT generate equivalent x86_64 code.
      No performance difference is observed for filters that were JIT-able before.
    
    Example assembler code for BPF filter "tcpdump port 22"
    
    original BPF -> old JIT:            original BPF -> internal BPF -> new JIT:
       0:   push   %rbp                      0:     push   %rbp
       1:   mov    %rsp,%rbp                 1:     mov    %rsp,%rbp
       4:   sub    $0x60,%rsp                4:     sub    $0x228,%rsp
       8:   mov    %rbx,-0x8(%rbp)           b:     mov    %rbx,-0x228(%rbp) // prologue
                                            12:     mov    %r13,-0x220(%rbp)
                                            19:     mov    %r14,-0x218(%rbp)
                                            20:     mov    %r15,-0x210(%rbp)
                                            27:     xor    %eax,%eax         // clear A
       c:   xor    %ebx,%ebx                29:     xor    %r13,%r13         // clear X
       e:   mov    0x68(%rdi),%r9d          2c:     mov    0x68(%rdi),%r9d
      12:   sub    0x6c(%rdi),%r9d          30:     sub    0x6c(%rdi),%r9d
      16:   mov    0xd8(%rdi),%r8           34:     mov    0xd8(%rdi),%r10
                                            3b:     mov    %rdi,%rbx
      1d:   mov    $0xc,%esi                3e:     mov    $0xc,%esi
      22:   callq  0xffffffffe1021e15       43:     callq  0xffffffffe102bd75
      27:   cmp    $0x86dd,%eax             48:     cmp    $0x86dd,%rax
      2c:   jne    0x0000000000000069       4f:     jne    0x000000000000009a
      2e:   mov    $0x14,%esi               51:     mov    $0x14,%esi
      33:   callq  0xffffffffe1021e31       56:     callq  0xffffffffe102bd91
      38:   cmp    $0x84,%eax               5b:     cmp    $0x84,%rax
      3d:   je     0x0000000000000049       62:     je     0x0000000000000074
      3f:   cmp    $0x6,%eax                64:     cmp    $0x6,%rax
      42:   je     0x0000000000000049       68:     je     0x0000000000000074
      44:   cmp    $0x11,%eax               6a:     cmp    $0x11,%rax
      47:   jne    0x00000000000000c6       6e:     jne    0x0000000000000117
      49:   mov    $0x36,%esi               74:     mov    $0x36,%esi
      4e:   callq  0xffffffffe1021e15       79:     callq  0xffffffffe102bd75
      53:   cmp    $0x16,%eax               7e:     cmp    $0x16,%rax
      56:   je     0x00000000000000bf       82:     je     0x0000000000000110
      58:   mov    $0x38,%esi               88:     mov    $0x38,%esi
      5d:   callq  0xffffffffe1021e15       8d:     callq  0xffffffffe102bd75
      62:   cmp    $0x16,%eax               92:     cmp    $0x16,%rax
      65:   je     0x00000000000000bf       96:     je     0x0000000000000110
      67:   jmp    0x00000000000000c6       98:     jmp    0x0000000000000117
      69:   cmp    $0x800,%eax              9a:     cmp    $0x800,%rax
      6e:   jne    0x00000000000000c6       a1:     jne    0x0000000000000117
      70:   mov    $0x17,%esi               a3:     mov    $0x17,%esi
      75:   callq  0xffffffffe1021e31       a8:     callq  0xffffffffe102bd91
      7a:   cmp    $0x84,%eax               ad:     cmp    $0x84,%rax
      7f:   je     0x000000000000008b       b4:     je     0x00000000000000c2
      81:   cmp    $0x6,%eax                b6:     cmp    $0x6,%rax
      84:   je     0x000000000000008b       ba:     je     0x00000000000000c2
      86:   cmp    $0x11,%eax               bc:     cmp    $0x11,%rax
      89:   jne    0x00000000000000c6       c0:     jne    0x0000000000000117
      8b:   mov    $0x14,%esi               c2:     mov    $0x14,%esi
      90:   callq  0xffffffffe1021e15       c7:     callq  0xffffffffe102bd75
      95:   test   $0x1fff,%ax              cc:     test   $0x1fff,%rax
      99:   jne    0x00000000000000c6       d3:     jne    0x0000000000000117
                                            d5:     mov    %rax,%r14
      9b:   mov    $0xe,%esi                d8:     mov    $0xe,%esi
      a0:   callq  0xffffffffe1021e44       dd:     callq  0xffffffffe102bd91 // MSH
                                            e2:     and    $0xf,%eax
                                            e5:     shl    $0x2,%eax
                                            e8:     mov    %rax,%r13
                                            eb:     mov    %r14,%rax
                                            ee:     mov    %r13,%rsi
      a5:   lea    0xe(%rbx),%esi           f1:     add    $0xe,%esi
      a8:   callq  0xffffffffe1021e0d       f4:     callq  0xffffffffe102bd6d
      ad:   cmp    $0x16,%eax               f9:     cmp    $0x16,%rax
      b0:   je     0x00000000000000bf       fd:     je     0x0000000000000110
                                            ff:     mov    %r13,%rsi
      b2:   lea    0x10(%rbx),%esi         102:     add    $0x10,%esi
      b5:   callq  0xffffffffe1021e0d      105:     callq  0xffffffffe102bd6d
      ba:   cmp    $0x16,%eax              10a:     cmp    $0x16,%rax
      bd:   jne    0x00000000000000c6      10e:     jne    0x0000000000000117
      bf:   mov    $0xffff,%eax            110:     mov    $0xffff,%eax
      c4:   jmp    0x00000000000000c8      115:     jmp    0x000000000000011c
      c6:   xor    %eax,%eax               117:     mov    $0x0,%eax
      c8:   mov    -0x8(%rbp),%rbx         11c:     mov    -0x228(%rbp),%rbx // epilogue
      cc:   leaveq                         123:     mov    -0x220(%rbp),%r13
      cd:   retq                           12a:     mov    -0x218(%rbp),%r14
                                           131:     mov    -0x210(%rbp),%r15
                                           138:     leaveq
                                           139:     retq
    
    On fully cached SKBs both JITed functions take 12 nsec to execute.
    BPF interpreter executes the program in 30 nsec.
    
    The difference in generated assembler is due to the following:
    
    Old BPF JIT implements the LDX_MSH instruction via the sk_load_byte_msh()
    helper function inside bpf_jit.S.
    New JIT removes the helper and does it explicitly, so ldx_msh cost
    is the same for both JITs, but generated code looks longer.
    
    New JIT has 4 registers to save, so prologue/epilogue are larger,
    but the cost is within noise on x64.
    
    Old JIT checks whether first insn clears A and if not emits 'xor %eax,%eax'.
    New JIT clears %rax unconditionally.
    
    2. old BPF JIT doesn't support ANC_NLATTR, ANC_PAY_OFFSET, ANC_RANDOM
      extensions. New JIT supports all BPF extensions.
      Performance of such filters improves 2-4 times depending on a filter.
      The longer the filter the higher performance gain.
      Synthetic benchmarks with many ancillary loads see a 20x speedup,
      which seems to be the maximum gain from JIT.
    
    Notes:
    
    . net.core.bpf_jit_enable=2 + tools/net/bpf_jit_disasm is still functional
      and can be used to see generated assembler
    
    . there are two jit_compile() functions and code flow for classic filters is:
      sk_attach_filter() - load classic BPF
      bpf_jit_compile() - try to JIT from classic BPF
      sk_convert_filter() - convert classic to internal
      bpf_int_jit_compile() - JIT from internal BPF
    
      seccomp and tracing filters will just call bpf_int_jit_compile()
    Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>
    62258278
bpf_jit.S 3.41 KB