summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-12-16 20:30:10 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2020-12-16 20:30:10 +0100
commite994cc240a3b75744c33ca9b8d74f71f0fcd8852 (patch)
tree10809f00d4cbb97bff138301b21edfacf8b129af
parentMerge tag 'pstore-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parentselftests/seccomp: Update kernel config (diff)
downloadlinux-e994cc240a3b75744c33ca9b8d74f71f0fcd8852.tar.xz
linux-e994cc240a3b75744c33ca9b8d74f71f0fcd8852.zip
Merge tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux
Pull seccomp updates from Kees Cook: "The major change here is finally gaining seccomp constant-action bitmaps, which internally reduces the seccomp overhead for many real-world syscall filters to O(1), as discussed at Plumbers this year. - Improve seccomp performance via constant-action bitmaps (YiFei Zhu & Kees Cook) - Fix bogus __user annotations (Jann Horn) - Add missed CONFIG for improved selftest coverage (Mickaël Salaün)" * tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux: selftests/seccomp: Update kernel config seccomp: Remove bogus __user annotations seccomp/cache: Report cache data through /proc/pid/seccomp_cache xtensa: Enable seccomp architecture tracking sh: Enable seccomp architecture tracking s390: Enable seccomp architecture tracking riscv: Enable seccomp architecture tracking powerpc: Enable seccomp architecture tracking parisc: Enable seccomp architecture tracking csky: Enable seccomp architecture tracking arm: Enable seccomp architecture tracking arm64: Enable seccomp architecture tracking selftests/seccomp: Compare bitmap vs filter overhead x86: Enable seccomp architecture tracking seccomp/cache: Add "emulator" to check if filter is constant allow seccomp/cache: Lookup syscall allowlist bitmap for fast path
-rw-r--r--arch/Kconfig17
-rw-r--r--arch/arm/include/asm/Kbuild1
-rw-r--r--arch/arm/include/asm/seccomp.h11
-rw-r--r--arch/arm64/include/asm/seccomp.h9
-rw-r--r--arch/csky/include/asm/Kbuild1
-rw-r--r--arch/csky/include/asm/seccomp.h11
-rw-r--r--arch/parisc/include/asm/Kbuild1
-rw-r--r--arch/parisc/include/asm/seccomp.h22
-rw-r--r--arch/powerpc/include/asm/seccomp.h23
-rw-r--r--arch/riscv/include/asm/seccomp.h10
-rw-r--r--arch/s390/include/asm/seccomp.h9
-rw-r--r--arch/sh/include/asm/seccomp.h10
-rw-r--r--arch/x86/include/asm/seccomp.h20
-rw-r--r--arch/xtensa/include/asm/Kbuild1
-rw-r--r--arch/xtensa/include/asm/seccomp.h11
-rw-r--r--fs/proc/base.c6
-rw-r--r--include/linux/seccomp.h7
-rw-r--r--kernel/seccomp.c296
-rw-r--r--tools/testing/selftests/seccomp/config1
-rw-r--r--tools/testing/selftests/seccomp/seccomp_benchmark.c151
-rw-r--r--tools/testing/selftests/seccomp/settings2
21 files changed, 590 insertions, 30 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 96992b01d806..d4bdc19ed3ad 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -486,6 +486,9 @@ config HAVE_ARCH_SECCOMP_FILTER
- secure_computing return value is checked and a return value of -1
results in the system call being skipped immediately.
- seccomp syscall wired up
+ - if !HAVE_SPARSE_SYSCALL_NR, have SECCOMP_ARCH_NATIVE,
+ SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE_NAME defined. If
+ COMPAT is supported, have the SECCOMP_ARCH_COMPAT* defines too.
config SECCOMP
prompt "Enable seccomp to safely execute untrusted bytecode"
@@ -514,6 +517,20 @@ config SECCOMP_FILTER
See Documentation/userspace-api/seccomp_filter.rst for details.
+config SECCOMP_CACHE_DEBUG
+ bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
+ depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR
+ depends on PROC_FS
+ help
+ This enables the /proc/pid/seccomp_cache interface to monitor
+ seccomp cache data. The file format is subject to change. Reading
+ the file requires CAP_SYS_ADMIN.
+
+ This option is for debugging only. Enabling presents the risk that
+ an adversary may be able to infer the seccomp filter logic.
+
+ If unsure, say N.
+
config HAVE_ARCH_STACKLEAK
bool
help
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 383635b68763..4a0848aef207 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -4,7 +4,6 @@ generic-y += extable.h
generic-y += flat.h
generic-y += local64.h
generic-y += parport.h
-generic-y += seccomp.h
generated-y += mach-types.h
generated-y += unistd-nr.h
diff --git a/arch/arm/include/asm/seccomp.h b/arch/arm/include/asm/seccomp.h
new file mode 100644
index 000000000000..e9ad0f37d2ba
--- /dev/null
+++ b/arch/arm/include/asm/seccomp.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_SECCOMP_H
+#define _ASM_SECCOMP_H
+
+#include <asm-generic/seccomp.h>
+
+#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_ARM
+#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+#define SECCOMP_ARCH_NATIVE_NAME "arm"
+
+#endif /* _ASM_SECCOMP_H */
diff --git a/arch/arm64/include/asm/seccomp.h b/arch/arm64/include/asm/seccomp.h
index c36387170936..30256233788b 100644
--- a/arch/arm64/include/asm/seccomp.h
+++ b/arch/arm64/include/asm/seccomp.h
@@ -19,4 +19,13 @@
#include <asm-generic/seccomp.h>
+#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_AARCH64
+#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+#define SECCOMP_ARCH_NATIVE_NAME "aarch64"
+#ifdef CONFIG_COMPAT
+# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_ARM
+# define SECCOMP_ARCH_COMPAT_NR __NR_compat_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME "arm"
+#endif
+
#endif /* _ASM_SECCOMP_H */
diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild
index 64876e59e2ef..93372255984d 100644
--- a/arch/csky/include/asm/Kbuild
+++ b/arch/csky/include/asm/Kbuild
@@ -4,6 +4,5 @@ generic-y += gpio.h
generic-y += kvm_para.h
generic-y += local64.h
generic-y += qrwlock.h
-generic-y += seccomp.h
generic-y += user.h
generic-y += vmlinux.lds.h
diff --git a/arch/csky/include/asm/seccomp.h b/arch/csky/include/asm/seccomp.h
new file mode 100644
index 000000000000..d33e758126fb
--- /dev/null
+++ b/arch/csky/include/asm/seccomp.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_SECCOMP_H
+#define _ASM_SECCOMP_H
+
+#include <asm-generic/seccomp.h>
+
+#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_CSKY
+#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+#define SECCOMP_ARCH_NATIVE_NAME "csky"
+
+#endif /* _ASM_SECCOMP_H */
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index e3ee5c0bfe80..f16c4db80116 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -5,5 +5,4 @@ generated-y += syscall_table_c32.h
generic-y += kvm_para.h
generic-y += local64.h
generic-y += mcs_spinlock.h
-generic-y += seccomp.h
generic-y += user.h
diff --git a/arch/parisc/include/asm/seccomp.h b/arch/parisc/include/asm/seccomp.h
new file mode 100644
index 000000000000..b058b2220322
--- /dev/null
+++ b/arch/parisc/include/asm/seccomp.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_SECCOMP_H
+#define _ASM_SECCOMP_H
+
+#include <asm-generic/seccomp.h>
+
+#ifdef CONFIG_64BIT
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC64
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "parisc64"
+# ifdef CONFIG_COMPAT
+# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_PARISC
+# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME "parisc"
+# endif
+#else /* !CONFIG_64BIT */
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "parisc"
+#endif
+
+#endif /* _ASM_SECCOMP_H */
diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
index 51209f6071c5..ac2033f134f0 100644
--- a/arch/powerpc/include/asm/seccomp.h
+++ b/arch/powerpc/include/asm/seccomp.h
@@ -8,4 +8,27 @@
#include <asm-generic/seccomp.h>
+#ifdef __LITTLE_ENDIAN__
+#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE
+#define __SECCOMP_ARCH_LE_NAME "le"
+#else
+#define __SECCOMP_ARCH_LE 0
+#define __SECCOMP_ARCH_LE_NAME
+#endif
+
+#ifdef CONFIG_PPC64
+# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC64 | __SECCOMP_ARCH_LE)
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "ppc64" __SECCOMP_ARCH_LE_NAME
+# ifdef CONFIG_COMPAT
+# define SECCOMP_ARCH_COMPAT (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE)
+# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME "ppc" __SECCOMP_ARCH_LE_NAME
+# endif
+#else /* !CONFIG_PPC64 */
+# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE)
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "ppc" __SECCOMP_ARCH_LE_NAME
+#endif
+
#endif /* _ASM_POWERPC_SECCOMP_H */
diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h
index bf7744ee3b3d..c7ee6a3507be 100644
--- a/arch/riscv/include/asm/seccomp.h
+++ b/arch/riscv/include/asm/seccomp.h
@@ -7,4 +7,14 @@
#include <asm-generic/seccomp.h>
+#ifdef CONFIG_64BIT
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV64
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "riscv64"
+#else /* !CONFIG_64BIT */
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV32
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "riscv32"
+#endif
+
#endif /* _ASM_SECCOMP_H */
diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h
index 795bbe0d7ca6..71d46f0ba97b 100644
--- a/arch/s390/include/asm/seccomp.h
+++ b/arch/s390/include/asm/seccomp.h
@@ -16,4 +16,13 @@
#include <asm-generic/seccomp.h>
+#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X
+#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+#define SECCOMP_ARCH_NATIVE_NAME "s390x"
+#ifdef CONFIG_COMPAT
+# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390
+# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME "s390"
+#endif
+
#endif /* _ASM_S390_SECCOMP_H */
diff --git a/arch/sh/include/asm/seccomp.h b/arch/sh/include/asm/seccomp.h
index 54111e4d32b8..d4578395fd66 100644
--- a/arch/sh/include/asm/seccomp.h
+++ b/arch/sh/include/asm/seccomp.h
@@ -8,4 +8,14 @@
#define __NR_seccomp_exit __NR_exit
#define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE
+#else
+#define __SECCOMP_ARCH_LE 0
+#endif
+
+#define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_SH | __SECCOMP_ARCH_LE)
+#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+#define SECCOMP_ARCH_NATIVE_NAME "sh"
+
#endif /* __ASM_SECCOMP_H */
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 2bd1338de236..fef16e398161 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -16,6 +16,26 @@
#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
#endif
+#ifdef CONFIG_X86_64
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "x86_64"
+# ifdef CONFIG_COMPAT
+# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386
+# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME "ia32"
+# endif
+/*
+ * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support
+ * caching them and they are treated as out of range syscalls, which will
+ * always pass through the BPF filter.
+ */
+#else /* !CONFIG_X86_64 */
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "ia32"
+#endif
+
#include <asm-generic/seccomp.h>
#endif /* _ASM_X86_SECCOMP_H */
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index c59c42a1221a..9718e9593564 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -7,5 +7,4 @@ generic-y += mcs_spinlock.h
generic-y += param.h
generic-y += qrwlock.h
generic-y += qspinlock.h
-generic-y += seccomp.h
generic-y += user.h
diff --git a/arch/xtensa/include/asm/seccomp.h b/arch/xtensa/include/asm/seccomp.h
new file mode 100644
index 000000000000..f1cb6b0a9e1f
--- /dev/null
+++ b/arch/xtensa/include/asm/seccomp.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_SECCOMP_H
+#define _ASM_SECCOMP_H
+
+#include <asm-generic/seccomp.h>
+
+#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_XTENSA
+#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+#define SECCOMP_ARCH_NATIVE_NAME "xtensa"
+
+#endif /* _ASM_SECCOMP_H */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e71040b68991..b3422cda2a91 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_PROC_PID_ARCH_STATUS
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+ ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_PROC_PID_ARCH_STATUS
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+ ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 47763f3999f7..0c564e5d40ff 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -121,4 +121,11 @@ static inline long seccomp_get_metadata(struct task_struct *task,
return -EINVAL;
}
#endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+struct seq_file;
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task);
+#endif
#endif /* _LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 15f47fc11d13..952dc1c90229 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,6 +143,38 @@ struct notification {
struct list_head notifications;
};
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * struct action_cache - per-filter cache of seccomp actions per
+ * arch/syscall pair
+ *
+ * @allow_native: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * native architecture.
+ * @allow_compat: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * compat architecture.
+ */
+struct action_cache {
+ DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
+#ifdef SECCOMP_ARCH_COMPAT
+ DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
+#endif
+};
+#else
+struct action_cache { };
+
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ return false;
+}
+
+static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* struct seccomp_filter - container for seccomp BPF programs
*
@@ -159,6 +191,7 @@ struct notification {
* this filter after reaching 0. The @users count is always smaller
* or equal to @refs. Hence, reaching 0 for @users does not mean
* the filter can be freed.
+ * @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
@@ -180,6 +213,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
+ struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
struct notification *notif;
@@ -298,6 +332,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
return 0;
}
+#ifdef SECCOMP_ARCH_NATIVE
+static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
+ size_t bitmap_size,
+ int syscall_nr)
+{
+ if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+ return false;
+ syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+ return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check_allow - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ int syscall_nr = sd->nr;
+ const struct action_cache *cache = &sfilter->cache;
+
+#ifndef SECCOMP_ARCH_COMPAT
+ /* A native-only architecture doesn't need to check sd->arch. */
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+#else
+ if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+ if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+ return seccomp_cache_check_allow_bitmap(cache->allow_compat,
+ SECCOMP_ARCH_COMPAT_NR,
+ syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ WARN_ON_ONCE(true);
+ return false;
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
@@ -320,6 +400,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;
+ if (seccomp_cache_check_allow(f, sd))
+ return SECCOMP_RET_ALLOW;
+
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
@@ -470,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
+ /* We are effectively holding the siglock by not having any sighand. */
+ WARN_ON(tsk->sighand != NULL);
+
/* Detach task from its filter tree. */
tsk->seccomp.filter = NULL;
__seccomp_filter_release(orig);
@@ -544,7 +630,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig =
+#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
+ true;
+#else
+ false;
+#endif
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -609,6 +700,148 @@ out:
return filter;
}
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * seccomp_is_const_allow - check if filter is constant allow with given data
+ * @fprog: The BPF programs
+ * @sd: The seccomp data to check against, only syscall number and arch
+ * number are considered constant.
+ */
+static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
+ struct seccomp_data *sd)
+{
+ unsigned int reg_value = 0;
+ unsigned int pc;
+ bool op_res;
+
+ if (WARN_ON_ONCE(!fprog))
+ return false;
+
+ for (pc = 0; pc < fprog->len; pc++) {
+ struct sock_filter *insn = &fprog->filter[pc];
+ u16 code = insn->code;
+ u32 k = insn->k;
+
+ switch (code) {
+ case BPF_LD | BPF_W | BPF_ABS:
+ switch (k) {
+ case offsetof(struct seccomp_data, nr):
+ reg_value = sd->nr;
+ break;
+ case offsetof(struct seccomp_data, arch):
+ reg_value = sd->arch;
+ break;
+ default:
+ /* can't optimize (non-constant value load) */
+ return false;
+ }
+ break;
+ case BPF_RET | BPF_K:
+ /* reached return with constant values only, check allow */
+ return k == SECCOMP_RET_ALLOW;
+ case BPF_JMP | BPF_JA:
+ pc += insn->k;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ switch (BPF_OP(code)) {
+ case BPF_JEQ:
+ op_res = reg_value == k;
+ break;
+ case BPF_JGE:
+ op_res = reg_value >= k;
+ break;
+ case BPF_JGT:
+ op_res = reg_value > k;
+ break;
+ case BPF_JSET:
+ op_res = !!(reg_value & k);
+ break;
+ default:
+ /* can't optimize (unknown jump) */
+ return false;
+ }
+
+ pc += op_res ? insn->jt : insn->jf;
+ break;
+ case BPF_ALU | BPF_AND | BPF_K:
+ reg_value &= k;
+ break;
+ default:
+ /* can't optimize (unknown insn) */
+ return false;
+ }
+ }
+
+ /* ran off the end of the filter?! */
+ WARN_ON(1);
+ return false;
+}
+
+static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
+ void *bitmap, const void *bitmap_prev,
+ size_t bitmap_size, int arch)
+{
+ struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+ struct seccomp_data sd;
+ int nr;
+
+ if (bitmap_prev) {
+ /* The new filter must be as restrictive as the last. */
+ bitmap_copy(bitmap, bitmap_prev, bitmap_size);
+ } else {
+ /* Before any filters, all syscalls are always allowed. */
+ bitmap_fill(bitmap, bitmap_size);
+ }
+
+ for (nr = 0; nr < bitmap_size; nr++) {
+ /* No bitmap change: not a cacheable action. */
+ if (!test_bit(nr, bitmap))
+ continue;
+
+ sd.nr = nr;
+ sd.arch = arch;
+
+ /* No bitmap change: continue to always allow. */
+ if (seccomp_is_const_allow(fprog, &sd))
+ continue;
+
+ /*
+ * Not a cacheable action: always run filters.
+ * atomic clear_bit() not needed, filter not visible yet.
+ */
+ __clear_bit(nr, bitmap);
+ }
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cachable syscalls
+ * @sfilter: The seccomp filter
+ *
+ * Returns 0 if successful or -errno if error occurred.
+ */
+static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+ struct action_cache *cache = &sfilter->cache;
+ const struct action_cache *cache_prev =
+ sfilter->prev ? &sfilter->prev->cache : NULL;
+
+ seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
+ cache_prev ? cache_prev->allow_native : NULL,
+ SECCOMP_ARCH_NATIVE_NR,
+ SECCOMP_ARCH_NATIVE);
+
+#ifdef SECCOMP_ARCH_COMPAT
+ seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
+ cache_prev ? cache_prev->allow_compat : NULL,
+ SECCOMP_ARCH_COMPAT_NR,
+ SECCOMP_ARCH_COMPAT);
+#endif /* SECCOMP_ARCH_COMPAT */
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
@@ -658,6 +891,7 @@ static long seccomp_attach_filter(unsigned int flags,
* task reference.
*/
filter->prev = current->seccomp.filter;
+ seccomp_cache_prepare(filter);
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);
@@ -1967,7 +2201,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
return true;
}
-static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos)
{
char names[sizeof(seccomp_actions_avail)];
@@ -1985,7 +2219,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
return proc_dostring(&table, 0, buffer, lenp, ppos);
}
-static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
char names[sizeof(seccomp_actions_avail)];
@@ -2103,3 +2337,59 @@ static int __init seccomp_sysctl_init(void)
device_initcall(seccomp_sysctl_init)
#endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
+static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
+ const void *bitmap, size_t bitmap_size)
+{
+ int nr;
+
+ for (nr = 0; nr < bitmap_size; nr++) {
+ bool cached = test_bit(nr, bitmap);
+ char *status = cached ? "ALLOW" : "FILTER";
+
+ seq_printf(m, "%s %d %s\n", name, nr, status);
+ }
+}
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct seccomp_filter *f;
+ unsigned long flags;
+
+ /*
+ * We don't want some sandboxed process to know what their seccomp
+ * filters consist of.
+ */
+ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!lock_task_sighand(task, &flags))
+ return -ESRCH;
+
+ f = READ_ONCE(task->seccomp.filter);
+ if (!f) {
+ unlock_task_sighand(task, &flags);
+ return 0;
+ }
+
+ /* prevent filter from being freed while we are printing it */
+ __get_seccomp_filter(f);
+ unlock_task_sighand(task, &flags);
+
+ proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
+ f->cache.allow_native,
+ SECCOMP_ARCH_NATIVE_NR);
+
+#ifdef SECCOMP_ARCH_COMPAT
+ proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
+ f->cache.allow_compat,
+ SECCOMP_ARCH_COMPAT_NR);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ __put_seccomp_filter(f);
+ return 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
diff --git a/tools/testing/selftests/seccomp/config b/tools/testing/selftests/seccomp/config
index 64c19d8eba79..ad431a5178fb 100644
--- a/tools/testing/selftests/seccomp/config
+++ b/tools/testing/selftests/seccomp/config
@@ -1,3 +1,4 @@
+CONFIG_PID_NS=y
CONFIG_SECCOMP=y
CONFIG_SECCOMP_FILTER=y
CONFIG_USER_NS=y
diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c
index 91f5a89cadac..fcc806585266 100644
--- a/tools/testing/selftests/seccomp/seccomp_benchmark.c
+++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c
@@ -4,12 +4,16 @@
*/
#define _GNU_SOURCE
#include <assert.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
+#include <sys/param.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
@@ -70,18 +74,74 @@ unsigned long long calibrate(void)
return samples * seconds;
}
+bool approx(int i_one, int i_two)
+{
+ double one = i_one, one_bump = one * 0.01;
+ double two = i_two, two_bump = two * 0.01;
+
+ one_bump = one + MAX(one_bump, 2.0);
+ two_bump = two + MAX(two_bump, 2.0);
+
+ /* Equal to, or within 1% or 2 digits */
+ if (one == two ||
+ (one > two && one <= two_bump) ||
+ (two > one && two <= one_bump))
+ return true;
+ return false;
+}
+
+bool le(int i_one, int i_two)
+{
+ if (i_one <= i_two)
+ return true;
+ return false;
+}
+
+long compare(const char *name_one, const char *name_eval, const char *name_two,
+ unsigned long long one, bool (*eval)(int, int), unsigned long long two)
+{
+ bool good;
+
+ printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two,
+ (long long)one, name_eval, (long long)two);
+ if (one > INT_MAX) {
+ printf("Miscalculation! Measurement went negative: %lld\n", (long long)one);
+ return 1;
+ }
+ if (two > INT_MAX) {
+ printf("Miscalculation! Measurement went negative: %lld\n", (long long)two);
+ return 1;
+ }
+
+ good = eval(one, two);
+ printf("%s\n", good ? "✔️" : "❌");
+
+ return good ? 0 : 1;
+}
+
int main(int argc, char *argv[])
{
+ struct sock_filter bitmap_filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog bitmap_prog = {
+ .len = (unsigned short)ARRAY_SIZE(bitmap_filter),
+ .filter = bitmap_filter,
+ };
struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, args[0])),
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog = {
.len = (unsigned short)ARRAY_SIZE(filter),
.filter = filter,
};
- long ret;
- unsigned long long samples;
- unsigned long long native, filter1, filter2;
+
+ long ret, bits;
+ unsigned long long samples, calc;
+ unsigned long long native, filter1, filter2, bitmap1, bitmap2;
+ unsigned long long entry, per_filter1, per_filter2;
printf("Current BPF sysctl settings:\n");
system("sysctl net.core.bpf_jit_enable");
@@ -101,35 +161,82 @@ int main(int argc, char *argv[])
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
assert(ret == 0);
- /* One filter */
- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ /* One filter resulting in a bitmap */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
assert(ret == 0);
- filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
- printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1);
+ bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+ printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1);
+
+ /* Second filter resulting in a bitmap */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
+ assert(ret == 0);
- if (filter1 == native)
- printf("No overhead measured!? Try running again with more samples.\n");
+ bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+ printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2);
- /* Two filters */
+ /* Third filter, can no longer be converted to bitmap */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
assert(ret == 0);
- filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
- printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2);
-
- /* Calculations */
- printf("Estimated total seccomp overhead for 1 filter: %llu ns\n",
- filter1 - native);
+ filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+ printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1);
- printf("Estimated total seccomp overhead for 2 filters: %llu ns\n",
- filter2 - native);
+ /* Fourth filter, can not be converted to bitmap because of filter 3 */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
+ assert(ret == 0);
- printf("Estimated seccomp per-filter overhead: %llu ns\n",
- filter2 - filter1);
+ filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+ printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2);
+
+ /* Estimations */
+#define ESTIMATE(fmt, var, what) do { \
+ var = (what); \
+ printf("Estimated " fmt ": %llu ns\n", var); \
+ if (var > INT_MAX) \
+ goto more_samples; \
+ } while (0)
+
+ ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc,
+ bitmap1 - native);
+ ESTIMATE("total seccomp overhead for 2 bitmapped filters", calc,
+ bitmap2 - native);
+ ESTIMATE("total seccomp overhead for 3 full filters", calc,
+ filter1 - native);
+ ESTIMATE("total seccomp overhead for 4 full filters", calc,
+ filter2 - native);
+ ESTIMATE("seccomp entry overhead", entry,
+ bitmap1 - native - (bitmap2 - bitmap1));
+ ESTIMATE("seccomp per-filter overhead (last 2 diff)", per_filter1,
+ filter2 - filter1);
+ ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2,
+ (filter2 - native - entry) / 4);
+
+ printf("Expectations:\n");
+ ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1);
+ bits = compare("native", "≤", "1 filter", native, le, filter1);
+ if (bits)
+ goto more_samples;
+
+ ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)",
+ per_filter1, approx, per_filter2);
+
+ bits = compare("1 bitmapped", "≈", "2 bitmapped",
+ bitmap1 - native, approx, bitmap2 - native);
+ if (bits) {
+ printf("Skipping constant action bitmap expectations: they appear unsupported.\n");
+ goto out;
+ }
- printf("Estimated seccomp entry overhead: %llu ns\n",
- filter1 - native - (filter2 - filter1));
+ ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native);
+ ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native);
+ ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total",
+ entry + (per_filter1 * 4) + native, approx, filter2);
+ if (ret == 0)
+ goto out;
+more_samples:
+ printf("Saw unexpected benchmark result. Try running again with more samples?\n");
+out:
return 0;
}
diff --git a/tools/testing/selftests/seccomp/settings b/tools/testing/selftests/seccomp/settings
index ba4d85f74cd6..6091b45d226b 100644
--- a/tools/testing/selftests/seccomp/settings
+++ b/tools/testing/selftests/seccomp/settings
@@ -1 +1 @@
-timeout=90
+timeout=120