author		Linus Torvalds <torvalds@linux-foundation.org>	2020-12-16 20:30:10 +0100
committer	Linus Torvalds <torvalds@linux-foundation.org>	2020-12-16 20:30:10 +0100
commit		e994cc240a3b75744c33ca9b8d74f71f0fcd8852 (patch)
tree		10809f00d4cbb97bff138301b21edfacf8b129af /kernel/seccomp.c
parent		Merge tag 'pstore-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parent		selftests/seccomp: Update kernel config (diff)
Merge tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux
Pull seccomp updates from Kees Cook:
 "The major change here is finally gaining seccomp constant-action
  bitmaps, which internally reduces the seccomp overhead for many
  real-world syscall filters to O(1), as discussed at Plumbers this
  year.

   - Improve seccomp performance via constant-action bitmaps (YiFei Zhu
     & Kees Cook)

   - Fix bogus __user annotations (Jann Horn)

   - Add missed CONFIG for improved selftest coverage (Mickaël Salaün)"

* tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
  selftests/seccomp: Update kernel config
  seccomp: Remove bogus __user annotations
  seccomp/cache: Report cache data through /proc/pid/seccomp_cache
  xtensa: Enable seccomp architecture tracking
  sh: Enable seccomp architecture tracking
  s390: Enable seccomp architecture tracking
  riscv: Enable seccomp architecture tracking
  powerpc: Enable seccomp architecture tracking
  parisc: Enable seccomp architecture tracking
  csky: Enable seccomp architecture tracking
  arm: Enable seccomp architecture tracking
  arm64: Enable seccomp architecture tracking
  selftests/seccomp: Compare bitmap vs filter overhead
  x86: Enable seccomp architecture tracking
  seccomp/cache: Add "emulator" to check if filter is constant allow
  seccomp/cache: Lookup syscall allowlist bitmap for fast path
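For context, a "constant-action" filter of the kind the new bitmap can cache looks like the sketch below. This is a hypothetical userspace example, not code from this series: every BPF_LD in it reads only seccomp_data->nr, so the kernel-side emulator can prove the program returns SECCOMP_RET_ALLOW for a given syscall number regardless of argument values. A production filter would also check seccomp_data->arch.

/* Hypothetical userspace sketch: a constant-action allowlist filter. */
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <stddef.h>
#include <sys/prctl.h>
#include <sys/syscall.h>

static int install_allowlist(void)
{
	struct sock_filter insns[] = {
		/* Load the syscall number; the only non-constant input used. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* Allow read(2) and write(2); kill anything else. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_read, 1, 0),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	/* Required before SECCOMP_MODE_FILTER without CAP_SYS_ADMIN. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}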
Diffstat (limited to 'kernel/seccomp.c')
-rw-r--r--	kernel/seccomp.c | 296
1 file changed, 293 insertions(+), 3 deletions(-)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 15f47fc11d13..952dc1c90229 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,6 +143,38 @@ struct notification {
struct list_head notifications;
};
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * struct action_cache - per-filter cache of seccomp actions per
+ * arch/syscall pair
+ *
+ * @allow_native: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * native architecture.
+ * @allow_compat: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * compat architecture.
+ */
+struct action_cache {
+ DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
+#ifdef SECCOMP_ARCH_COMPAT
+ DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
+#endif
+};
+#else
+struct action_cache { };
+
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ return false;
+}
+
+static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* struct seccomp_filter - container for seccomp BPF programs
*
@@ -159,6 +191,7 @@ struct notification {
* this filter after reaching 0. The @users count is always smaller
* or equal to @refs. Hence, reaching 0 for @users does not mean
* the filter can be freed.
+ * @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
@@ -180,6 +213,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
+ struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
struct notification *notif;
@@ -298,6 +332,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
return 0;
}
+#ifdef SECCOMP_ARCH_NATIVE
+static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
+ size_t bitmap_size,
+ int syscall_nr)
+{
+ if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+ return false;
+ syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+ return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check_allow - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ int syscall_nr = sd->nr;
+ const struct action_cache *cache = &sfilter->cache;
+
+#ifndef SECCOMP_ARCH_COMPAT
+ /* A native-only architecture doesn't need to check sd->arch. */
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+#else
+ if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+ if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+ return seccomp_cache_check_allow_bitmap(cache->allow_compat,
+ SECCOMP_ARCH_COMPAT_NR,
+ syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ WARN_ON_ONCE(true);
+ return false;
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
@@ -320,6 +400,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;
+ if (seccomp_cache_check_allow(f, sd))
+ return SECCOMP_RET_ALLOW;
+
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
@@ -470,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
+ /* We are effectively holding the siglock by not having any sighand. */
+ WARN_ON(tsk->sighand != NULL);
+
/* Detach task from its filter tree. */
tsk->seccomp.filter = NULL;
__seccomp_filter_release(orig);
@@ -544,7 +630,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig =
+#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
+ true;
+#else
+ false;
+#endif
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -609,6 +700,148 @@ out:
return filter;
}
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * seccomp_is_const_allow - check if filter is constant allow with given data
+ * @fprog: The BPF program
+ * @sd: The seccomp data to check against; only the syscall number and
+ *      arch are considered constant.
+ */
+static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
+ struct seccomp_data *sd)
+{
+ unsigned int reg_value = 0;
+ unsigned int pc;
+ bool op_res;
+
+ if (WARN_ON_ONCE(!fprog))
+ return false;
+
+ for (pc = 0; pc < fprog->len; pc++) {
+ struct sock_filter *insn = &fprog->filter[pc];
+ u16 code = insn->code;
+ u32 k = insn->k;
+
+ switch (code) {
+ case BPF_LD | BPF_W | BPF_ABS:
+ switch (k) {
+ case offsetof(struct seccomp_data, nr):
+ reg_value = sd->nr;
+ break;
+ case offsetof(struct seccomp_data, arch):
+ reg_value = sd->arch;
+ break;
+ default:
+ /* can't optimize (non-constant value load) */
+ return false;
+ }
+ break;
+ case BPF_RET | BPF_K:
+ /* reached return with constant values only, check allow */
+ return k == SECCOMP_RET_ALLOW;
+ case BPF_JMP | BPF_JA:
+ pc += insn->k;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ switch (BPF_OP(code)) {
+ case BPF_JEQ:
+ op_res = reg_value == k;
+ break;
+ case BPF_JGE:
+ op_res = reg_value >= k;
+ break;
+ case BPF_JGT:
+ op_res = reg_value > k;
+ break;
+ case BPF_JSET:
+ op_res = !!(reg_value & k);
+ break;
+ default:
+ /* can't optimize (unknown jump) */
+ return false;
+ }
+
+ pc += op_res ? insn->jt : insn->jf;
+ break;
+ case BPF_ALU | BPF_AND | BPF_K:
+ reg_value &= k;
+ break;
+ default:
+ /* can't optimize (unknown insn) */
+ return false;
+ }
+ }
+
+ /* ran off the end of the filter?! */
+ WARN_ON(1);
+ return false;
+}
+
+static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
+ void *bitmap, const void *bitmap_prev,
+ size_t bitmap_size, int arch)
+{
+ struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+ struct seccomp_data sd;
+ int nr;
+
+ if (bitmap_prev) {
+ /* The new filter must be as restrictive as the last. */
+ bitmap_copy(bitmap, bitmap_prev, bitmap_size);
+ } else {
+ /* Before any filters, all syscalls are always allowed. */
+ bitmap_fill(bitmap, bitmap_size);
+ }
+
+ for (nr = 0; nr < bitmap_size; nr++) {
+ /* No bitmap change: not a cacheable action. */
+ if (!test_bit(nr, bitmap))
+ continue;
+
+ sd.nr = nr;
+ sd.arch = arch;
+
+ /* No bitmap change: continue to always allow. */
+ if (seccomp_is_const_allow(fprog, &sd))
+ continue;
+
+ /*
+ * Not a cacheable action: always run filters.
+ * atomic clear_bit() not needed, filter not visible yet.
+ */
+ __clear_bit(nr, bitmap);
+ }
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cacheable syscalls
+ * @sfilter: The seccomp filter
+ */
+static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+ struct action_cache *cache = &sfilter->cache;
+ const struct action_cache *cache_prev =
+ sfilter->prev ? &sfilter->prev->cache : NULL;
+
+ seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
+ cache_prev ? cache_prev->allow_native : NULL,
+ SECCOMP_ARCH_NATIVE_NR,
+ SECCOMP_ARCH_NATIVE);
+
+#ifdef SECCOMP_ARCH_COMPAT
+ seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
+ cache_prev ? cache_prev->allow_compat : NULL,
+ SECCOMP_ARCH_COMPAT_NR,
+ SECCOMP_ARCH_COMPAT);
+#endif /* SECCOMP_ARCH_COMPAT */
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
@@ -658,6 +891,7 @@ static long seccomp_attach_filter(unsigned int flags,
* task reference.
*/
filter->prev = current->seccomp.filter;
+ seccomp_cache_prepare(filter);
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);
@@ -1967,7 +2201,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
return true;
}
-static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos)
{
char names[sizeof(seccomp_actions_avail)];
@@ -1985,7 +2219,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
return proc_dostring(&table, 0, buffer, lenp, ppos);
}
-static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
char names[sizeof(seccomp_actions_avail)];
@@ -2103,3 +2337,59 @@ static int __init seccomp_sysctl_init(void)
device_initcall(seccomp_sysctl_init)
#endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
+static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
+ const void *bitmap, size_t bitmap_size)
+{
+ int nr;
+
+ for (nr = 0; nr < bitmap_size; nr++) {
+ bool cached = test_bit(nr, bitmap);
+ char *status = cached ? "ALLOW" : "FILTER";
+
+ seq_printf(m, "%s %d %s\n", name, nr, status);
+ }
+}
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct seccomp_filter *f;
+ unsigned long flags;
+
+ /*
+ * We don't want some sandboxed process to know what their seccomp
+ * filters consist of.
+ */
+ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!lock_task_sighand(task, &flags))
+ return -ESRCH;
+
+ f = READ_ONCE(task->seccomp.filter);
+ if (!f) {
+ unlock_task_sighand(task, &flags);
+ return 0;
+ }
+
+ /* prevent filter from being freed while we are printing it */
+ __get_seccomp_filter(f);
+ unlock_task_sighand(task, &flags);
+
+ proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
+ f->cache.allow_native,
+ SECCOMP_ARCH_NATIVE_NR);
+
+#ifdef SECCOMP_ARCH_COMPAT
+ proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
+ f->cache.allow_compat,
+ SECCOMP_ARCH_COMPAT_NR);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ __put_seccomp_filter(f);
+ return 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
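
A minimal reader for the new proc interface, as a hypothetical usage sketch (assumes CONFIG_SECCOMP_CACHE_DEBUG=y and a CAP_SYS_ADMIN caller); output lines follow the seq_printf() format above, e.g. "x86_64 0 ALLOW".

/* Hypothetical reader for the new /proc/<pid>/seccomp_cache file. */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64], line[128];
	FILE *f;

	/* Default to the caller's own entry when no pid is given. */
	snprintf(path, sizeof(path), "/proc/%s/seccomp_cache",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	/* Lines are "<arch> <nr> <ALLOW|FILTER>", one per syscall. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}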