diff options
author | Luca Boccassi <bluca@debian.org> | 2022-04-20 11:27:58 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-20 11:27:58 +0200 |
commit | 7d4054464318d15ecd35c93fb477011aec63391e (patch) | |
tree | cf0dd1e0d568fc56492e3e19f302a1501cb29bb6 /src | |
parent | compression: add separate pre-processor definitions (diff) | |
parent | manager: prohibit clone3() in seccomp filters (diff) | |
download | systemd-7d4054464318d15ecd35c93fb477011aec63391e.tar.xz systemd-7d4054464318d15ecd35c93fb477011aec63391e.zip |
Merge pull request #23126 from keszybz/clone3-prohibit
Prohibit clone3() when RestrictNamespaces is used
Diffstat (limited to 'src')
-rw-r--r-- | src/shared/seccomp-util.c | 18 |
1 file changed, 18 insertions, 0 deletions
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c
index c465bf58dd..49044a45ae 100644
--- a/src/shared/seccomp-util.c
+++ b/src/shared/seccomp-util.c
@@ -718,6 +718,9 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 .value =
                 "capget\0"      /* Able to query arbitrary processes */
                 "clone\0"
+                /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
+                 * implement seccomp, so we don't need to list it at all. C.f.
+                 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
                 "clone3\0"
                 "execveat\0"
                 "fork\0"
@@ -1227,6 +1230,21 @@ int seccomp_restrict_namespaces(unsigned long retain) {
                 if (r < 0)
                         return r;
 
+                /* We cannot filter on individual flags to clone3(), and we need to disable the
+                 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
+                 * users shall fall back to clone(), as if on an older kernel.
+                 *
+                 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
+                 * https://github.com/moby/moby/issues/42680. */
+
+                r = seccomp_rule_add_exact(
+                                seccomp,
+                                SCMP_ACT_ERRNO(ENOSYS),
+                                SCMP_SYS(clone3),
+                                0);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
+
                 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
                         /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
                          * altogether. */