Commit e994cc24 authored by Linus Torvalds

Merge tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux

Pull seccomp updates from Kees Cook:
 "The major change here is finally gaining seccomp constant-action
  bitmaps, which internally reduces the seccomp overhead for many
  real-world syscall filters to O(1), as discussed at Plumbers this
  year.

   - Improve seccomp performance via constant-action bitmaps (YiFei Zhu
     & Kees Cook)

   - Fix bogus __user annotations (Jann Horn)

   - Add missed CONFIG for improved selftest coverage (Mickaël Salaün)"

* tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
  selftests/seccomp: Update kernel config
  seccomp: Remove bogus __user annotations
  seccomp/cache: Report cache data through /proc/pid/seccomp_cache
  xtensa: Enable seccomp architecture tracking
  sh: Enable seccomp architecture tracking
  s390: Enable seccomp architecture tracking
  riscv: Enable seccomp architecture tracking
  powerpc: Enable seccomp architecture tracking
  parisc: Enable seccomp architecture tracking
  csky: Enable seccomp architecture tracking
  arm: Enable seccomp architecture tracking
  arm64: Enable seccomp architecture tracking
  selftests/seccomp: Compare bitmap vs filter overhead
  x86: Enable seccomp architecture tracking
  seccomp/cache: Add "emulator" to check if filter is constant allow
  seccomp/cache: Lookup syscall allowlist bitmap for fast path
parents ba1d41a5 2c07343a
......@@ -486,6 +486,9 @@ config HAVE_ARCH_SECCOMP_FILTER
- secure_computing return value is checked and a return value of -1
results in the system call being skipped immediately.
- seccomp syscall wired up
- if !HAVE_SPARSE_SYSCALL_NR, have SECCOMP_ARCH_NATIVE,
SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE_NAME defined. If
COMPAT is supported, have the SECCOMP_ARCH_COMPAT* defines too.
config SECCOMP
prompt "Enable seccomp to safely execute untrusted bytecode"
......@@ -514,6 +517,20 @@ config SECCOMP_FILTER
See Documentation/userspace-api/seccomp_filter.rst for details.
config SECCOMP_CACHE_DEBUG
bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR
depends on PROC_FS
help
This enables the /proc/pid/seccomp_cache interface to monitor
seccomp cache data. The file format is subject to change. Reading
the file requires CAP_SYS_ADMIN.
This option is for debugging only. Enabling presents the risk that
an adversary may be able to infer the seccomp filter logic.
If unsure, say N.
config HAVE_ARCH_STACKLEAK
bool
help
......
......@@ -4,7 +4,6 @@ generic-y += extable.h
generic-y += flat.h
generic-y += local64.h
generic-y += parport.h
generic-y += seccomp.h
generated-y += mach-types.h
generated-y += unistd-nr.h
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_SECCOMP_H
#define _ASM_SECCOMP_H
#include <asm-generic/seccomp.h>
/*
 * Description of the native syscall ABI for seccomp: the audit
 * architecture value, the syscall-count bound, and a printable name.
 * NOTE(review): the consumer of these macros is not visible in this
 * header; presumably the seccomp core uses them for its per-architecture
 * action cache - confirm against kernel/seccomp.c.
 */
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_ARM
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "arm"
#endif /* _ASM_SECCOMP_H */
......@@ -19,4 +19,13 @@
#include <asm-generic/seccomp.h>
/*
 * Native ABI as seen by seccomp: audit architecture value, syscall-count
 * bound, and printable name.
 */
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_AARCH64
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "aarch64"
/*
 * When compat support is built in, describe the 32-bit ARM ABI as well.
 * It has its own syscall-count bound (__NR_compat_syscalls) rather than
 * sharing NR_syscalls with the native ABI.
 */
#ifdef CONFIG_COMPAT
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_ARM
# define SECCOMP_ARCH_COMPAT_NR __NR_compat_syscalls
# define SECCOMP_ARCH_COMPAT_NAME "arm"
#endif
#endif /* _ASM_SECCOMP_H */
......@@ -4,6 +4,5 @@ generic-y += gpio.h
generic-y += kvm_para.h
generic-y += local64.h
generic-y += qrwlock.h
generic-y += seccomp.h
generic-y += user.h
generic-y += vmlinux.lds.h
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_SECCOMP_H
#define _ASM_SECCOMP_H
#include <asm-generic/seccomp.h>
/*
 * Description of the native syscall ABI for seccomp: the audit
 * architecture value, the syscall-count bound, and a printable name.
 * No compat ABI is described here.
 */
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_CSKY
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "csky"
#endif /* _ASM_SECCOMP_H */
......@@ -5,5 +5,4 @@ generated-y += syscall_table_c32.h
generic-y += kvm_para.h
generic-y += local64.h
generic-y += mcs_spinlock.h
generic-y += seccomp.h
generic-y += user.h
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_SECCOMP_H
#define _ASM_SECCOMP_H
#include <asm-generic/seccomp.h>
/*
 * Describe the syscall ABIs for seccomp. Each ABI gets an audit
 * architecture value, a syscall-count bound, and a printable name.
 */
#ifdef CONFIG_64BIT
/* 64-bit kernel: the native ABI is parisc64. */
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC64
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "parisc64"
/* With compat enabled, 32-bit parisc is the secondary ABI; same NR bound. */
# ifdef CONFIG_COMPAT
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_PARISC
# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
# define SECCOMP_ARCH_COMPAT_NAME "parisc"
# endif
#else /* !CONFIG_64BIT */
/* 32-bit kernel: parisc is the native (and only) ABI. */
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "parisc"
#endif
#endif /* _ASM_SECCOMP_H */
......@@ -8,4 +8,27 @@
#include <asm-generic/seccomp.h>
/*
 * Endianness helpers. On little-endian builds the audit LE flag is OR-ed
 * into the architecture value and "le" is appended to the printable name.
 * On big-endian builds the flag is 0 and the name suffix expands to
 * nothing, so the adjacent string literal is used unchanged.
 */
#ifdef __LITTLE_ENDIAN__
#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE
#define __SECCOMP_ARCH_LE_NAME "le"
#else
#define __SECCOMP_ARCH_LE 0
#define __SECCOMP_ARCH_LE_NAME
#endif
#ifdef CONFIG_PPC64
/* 64-bit kernel: ppc64 is native. The name is built by literal concatenation. */
# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC64 | __SECCOMP_ARCH_LE)
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "ppc64" __SECCOMP_ARCH_LE_NAME
/* With compat enabled, 32-bit ppc is the secondary ABI with the same endianness. */
# ifdef CONFIG_COMPAT
# define SECCOMP_ARCH_COMPAT (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE)
# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
# define SECCOMP_ARCH_COMPAT_NAME "ppc" __SECCOMP_ARCH_LE_NAME
# endif
#else /* !CONFIG_PPC64 */
/* 32-bit kernel: ppc is the native (and only) ABI. */
# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE)
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "ppc" __SECCOMP_ARCH_LE_NAME
#endif
#endif /* _ASM_POWERPC_SECCOMP_H */
......@@ -7,4 +7,14 @@
#include <asm-generic/seccomp.h>
/*
 * Native ABI as seen by seccomp, selected by kernel word size: audit
 * architecture value, syscall-count bound, and printable name. No compat
 * ABI is described here.
 */
#ifdef CONFIG_64BIT
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV64
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "riscv64"
#else /* !CONFIG_64BIT */
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV32
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "riscv32"
#endif
#endif /* _ASM_SECCOMP_H */
......@@ -16,4 +16,13 @@
#include <asm-generic/seccomp.h>
/*
 * Native ABI as seen by seccomp: audit architecture value, syscall-count
 * bound, and printable name.
 */
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "s390x"
/* With compat enabled, 31-bit s390 is the secondary ABI; same NR bound. */
#ifdef CONFIG_COMPAT
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390
# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
# define SECCOMP_ARCH_COMPAT_NAME "s390"
#endif
#endif /* _ASM_S390_SECCOMP_H */
......@@ -8,4 +8,14 @@
#define __NR_seccomp_exit __NR_exit
#define __NR_seccomp_sigreturn __NR_rt_sigreturn
/*
 * Endianness helper: little-endian CPUs OR the audit LE flag into the
 * architecture value, big-endian CPUs contribute nothing.
 */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE
#else
#define __SECCOMP_ARCH_LE 0
#endif
/*
 * Native ABI as seen by seccomp: audit architecture value, syscall-count
 * bound, and printable name. The name is "sh" for both endiannesses.
 */
#define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_SH | __SECCOMP_ARCH_LE)
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "sh"
#endif /* __ASM_SECCOMP_H */
......@@ -16,6 +16,26 @@
#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
#endif
/*
 * Describe the syscall ABIs for seccomp. Each ABI gets an audit
 * architecture value, a syscall-count bound, and a printable name.
 */
#ifdef CONFIG_X86_64
/* 64-bit kernel: x86_64 is native. */
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "x86_64"
/* With compat enabled, ia32 is the secondary ABI with its own NR bound. */
# ifdef CONFIG_COMPAT
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386
# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls
# define SECCOMP_ARCH_COMPAT_NAME "ia32"
# endif
/*
 * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support
 * caching them and they are treated as out of range syscalls, which will
 * always pass through the BPF filter.
 */
#else /* !CONFIG_X86_64 */
/* 32-bit kernel: ia32 is the native (and only) ABI. */
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "ia32"
#endif
#include <asm-generic/seccomp.h>
#endif /* _ASM_X86_SECCOMP_H */
......@@ -7,5 +7,4 @@ generic-y += mcs_spinlock.h
generic-y += param.h
generic-y += qrwlock.h
generic-y += qspinlock.h
generic-y += seccomp.h
generic-y += user.h
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_SECCOMP_H
#define _ASM_SECCOMP_H
#include <asm-generic/seccomp.h>
/*
 * Description of the native syscall ABI for seccomp: the audit
 * architecture value, the syscall-count bound, and a printable name.
 * No compat ABI is described here.
 */
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_XTENSA
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "xtensa"
#endif /* _ASM_SECCOMP_H */
......@@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_PROC_PID_ARCH_STATUS
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
......@@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_PROC_PID_ARCH_STATUS
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
......
......@@ -121,4 +121,11 @@ static inline long seccomp_get_metadata(struct task_struct *task,
return -EINVAL;
}
#endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
/* Forward declaration so the prototype below needs no seq_file header. */
struct seq_file;
/*
 * Show callback for the per-task "seccomp_cache" procfs entry; only
 * declared when CONFIG_SECCOMP_CACHE_DEBUG is enabled. The definition
 * lives outside this header.
 */
int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
#endif
#endif /* _LINUX_SECCOMP_H */
This diff is collapsed.
CONFIG_PID_NS=y
CONFIG_SECCOMP=y
CONFIG_SECCOMP_FILTER=y
CONFIG_USER_NS=y
......@@ -4,12 +4,16 @@
*/
#define _GNU_SOURCE
#include <assert.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/param.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
......@@ -70,18 +74,74 @@ unsigned long long calibrate(void)
return samples * seconds;
}
/*
 * Decide whether two measurements are "approximately equal".
 *
 * The values match when they are identical, or when the larger one does
 * not exceed the smaller one by more than a tolerance. The tolerance is
 * 1% of the smaller value, but never less than 2.
 */
bool approx(int i_one, int i_two)
{
	const double lhs = i_one;
	const double rhs = i_two;
	double lhs_slack = lhs * 0.01;
	double rhs_slack = rhs * 0.01;

	/* Clamp each tolerance to a floor of 2 units. */
	if (!(lhs_slack > 2.0))
		lhs_slack = 2.0;
	if (!(rhs_slack > 2.0))
		rhs_slack = 2.0;

	if (lhs == rhs)
		return true;

	/* The larger value must sit within the smaller value's tolerance. */
	if (lhs > rhs)
		return lhs <= rhs + rhs_slack;

	return rhs <= lhs + lhs_slack;
}
/* Comparator: true when the first value is less than or equal to the second. */
bool le(int i_one, int i_two)
{
	return i_one <= i_two;
}
/*
 * Print one expectation line and evaluate it.
 *
 * The line shows both operand names, the relation symbol and the raw
 * values. If either value exceeds INT_MAX it is reported as a
 * miscalculation and the comparator is not called.
 *
 * Returns 0 when the expectation holds, 1 when it fails or when a value
 * is out of range.
 */
long compare(const char *name_one, const char *name_eval, const char *name_two,
	     unsigned long long one, bool (*eval)(int, int), unsigned long long two)
{
	const unsigned long long operands[2] = { one, two };
	int idx;

	printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two,
	       (long long)one, name_eval, (long long)two);

	/* Range-check the operands in order before narrowing them to int. */
	for (idx = 0; idx < 2; idx++) {
		if (operands[idx] <= INT_MAX)
			continue;
		printf("Miscalculation! Measurement went negative: %lld\n",
		       (long long)operands[idx]);
		return 1;
	}

	if (!eval(one, two)) {
		printf("%s\n", "❌");
		return 1;
	}

	printf("%s\n", "✔️");
	return 0;
}
int main(int argc, char *argv[])
{
struct sock_filter bitmap_filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog bitmap_prog = {
.len = (unsigned short)ARRAY_SIZE(bitmap_filter),
.filter = bitmap_filter,
};
struct sock_filter filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, args[0])),
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog = {
.len = (unsigned short)ARRAY_SIZE(filter),
.filter = filter,
};
long ret;
unsigned long long samples;
unsigned long long native, filter1, filter2;
long ret, bits;
unsigned long long samples, calc;
unsigned long long native, filter1, filter2, bitmap1, bitmap2;
unsigned long long entry, per_filter1, per_filter2;
printf("Current BPF sysctl settings:\n");
system("sysctl net.core.bpf_jit_enable");
......@@ -101,35 +161,82 @@ int main(int argc, char *argv[])
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
assert(ret == 0);
/* One filter */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
/* One filter resulting in a bitmap */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
assert(ret == 0);
filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1);
bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1);
/* Second filter resulting in a bitmap */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
assert(ret == 0);
if (filter1 == native)
printf("No overhead measured!? Try running again with more samples.\n");
bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2);
/* Two filters */
/* Third filter, can no longer be converted to bitmap */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
assert(ret == 0);
filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2);
/* Calculations */
printf("Estimated total seccomp overhead for 1 filter: %llu ns\n",
filter1 - native);
filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1);
printf("Estimated total seccomp overhead for 2 filters: %llu ns\n",
filter2 - native);
/* Fourth filter, can not be converted to bitmap because of filter 3 */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
assert(ret == 0);
printf("Estimated seccomp per-filter overhead: %llu ns\n",
filter2 - filter1);
filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2);
/* Estimations */
#define ESTIMATE(fmt, var, what) do { \
var = (what); \
printf("Estimated " fmt ": %llu ns\n", var); \
if (var > INT_MAX) \
goto more_samples; \
} while (0)
ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc,
bitmap1 - native);
ESTIMATE("total seccomp overhead for 2 bitmapped filters", calc,
bitmap2 - native);
ESTIMATE("total seccomp overhead for 3 full filters", calc,
filter1 - native);
ESTIMATE("total seccomp overhead for 4 full filters", calc,
filter2 - native);
ESTIMATE("seccomp entry overhead", entry,
bitmap1 - native - (bitmap2 - bitmap1));
ESTIMATE("seccomp per-filter overhead (last 2 diff)", per_filter1,
filter2 - filter1);
ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2,
(filter2 - native - entry) / 4);
printf("Expectations:\n");
ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1);
bits = compare("native", "≤", "1 filter", native, le, filter1);
if (bits)
goto more_samples;
ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)",
per_filter1, approx, per_filter2);
bits = compare("1 bitmapped", "≈", "2 bitmapped",
bitmap1 - native, approx, bitmap2 - native);
if (bits) {
printf("Skipping constant action bitmap expectations: they appear unsupported.\n");
goto out;
}
printf("Estimated seccomp entry overhead: %llu ns\n",
filter1 - native - (filter2 - filter1));
ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native);
ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native);
ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total",
entry + (per_filter1 * 4) + native, approx, filter2);
if (ret == 0)
goto out;
more_samples:
printf("Saw unexpected benchmark result. Try running again with more samples?\n");
out:
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment