summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorLuke T. Shumaker <lukeshu@parabola.nu>2024-08-22 01:29:10 +0200
committerLuke T. Shumaker <lukeshu@parabola.nu>2024-09-07 18:18:35 +0200
commitdc3223919f663b7c8b8d8d1d6072b4487df7709b (patch)
treeb4192fbe82e73926a6e8bbde1d3e0e1ce272dfad /src
parentnspawn: register_machine() and allocate_scope() bools to flags (diff)
downloadsystemd-dc3223919f663b7c8b8d8d1d6072b4487df7709b.tar.xz
systemd-dc3223919f663b7c8b8d8d1d6072b4487df7709b.zip
nspawn: enable FUSE in containers
Linux kernel v4.18 (2018-08-12) added user-namespace support to FUSE, and bumped the FUSE version to 7.27 (see: da315f6e0398 (Merge tag 'fuse-update-4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse, Linus Torvalds, 2018-06-07)). This means that on such kernels it is safe to enable FUSE in nspawn containers. In outer_child(), before calling copy_devnodes(), check the FUSE version to decide whether to enable (>=7.27) or disable (<7.27) FUSE in the container. We look at the FUSE version instead of the kernel version in order to enable FUSE support on older-versioned kernels that may have the mentioned patchset backported ([as requested by @poettering][1]). However, I am not sure that this is safe; user-namespace support is not a documented part of the FUSE protocol, which is what FUSE_KERNEL_VERSION/FUSE_KERNEL_MINOR_VERSION are meant to capture. While the same patchset (1) added FUSE_ABORT_ERROR (which is all that the 7.27 version bump is documented as including), (2) bumped FUSE_KERNEL_MINOR_VERSION from 26 to 27, and (3) added user-namespace support, these 3 things are not inseparable; it is conceivable to me that a backport could include the first 2 of those things and exclude the 3rd; perhaps it would be safer to check the kernel version. Do note that our get_fuse_version() function uses the fsopen() family of syscalls, which were not added until Linux kernel v5.2 (2019-07-07); so if nothing has been backported, then the minimum kernel version for FUSE-in-nspawn is actually v5.2, not v4.18. Pass whether or not to enable FUSE to copy_devnodes(); have copy_devnodes() copy in /dev/fuse if enabled. Pass whether or not to enable FUSE back over fd_outer_socket to run_container() so that it can pass that to append_machine_properties() (via either register_machine() or allocate_scope()); have append_machine_properties() append "DeviceAllow=/dev/fuse rw" if enabled. 
For testing, simply check that /dev/fuse can be opened for reading and writing, but that actually reading from it fails with EPERM. The test assumes that if FUSE is supported (/dev/fuse exists), then the testsuite is running on a kernel with FUSE >= 7.27; I am unsure how to go about writing a test that validates that the version check disables FUSE on old kernels. [1]: https://github.com/systemd/systemd/issues/17607#issuecomment-745418835 Closes #17607
Diffstat (limited to 'src')
-rw-r--r--src/nspawn/nspawn-register.c9
-rw-r--r--src/nspawn/nspawn-register.h4
-rw-r--r--src/nspawn/nspawn.c100
3 files changed, 109 insertions, 4 deletions
diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c
index 855172c09c..52f7384468 100644
--- a/src/nspawn/nspawn-register.c
+++ b/src/nspawn/nspawn-register.c
@@ -15,6 +15,7 @@
static int append_machine_properties(
sd_bus_message *m,
+ bool enable_fuse,
CustomMount *mounts,
unsigned n_mounts,
int kill_signal,
@@ -40,6 +41,12 @@ static int append_machine_properties(
"char-pts", "rw");
if (r < 0)
return bus_log_create_error(r);
+ if (enable_fuse) {
+ r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
+ "/dev/fuse", "rw");
+ if (r < 0)
+ return bus_log_create_error(r);
+ }
for (j = 0; j < n_mounts; j++) {
CustomMount *cm = mounts + j;
@@ -200,6 +207,7 @@ int register_machine(
r = append_machine_properties(
m,
+ FLAGS_SET(flags, REGISTER_MACHINE_ENABLE_FUSE),
mounts,
n_mounts,
kill_signal,
@@ -320,6 +328,7 @@ int allocate_scope(
r = append_machine_properties(
m,
+ FLAGS_SET(flags, ALLOCATE_SCOPE_ENABLE_FUSE),
mounts,
n_mounts,
kill_signal,
diff --git a/src/nspawn/nspawn-register.h b/src/nspawn/nspawn-register.h
index 0effb40aa0..5e187e33bb 100644
--- a/src/nspawn/nspawn-register.h
+++ b/src/nspawn/nspawn-register.h
@@ -9,7 +9,8 @@
#include "nspawn-settings.h"
typedef enum RegisterMachineFlags {
- REGISTER_MACHINE_KEEP_UNIT = 1 << 0,
+ REGISTER_MACHINE_KEEP_UNIT = 1 << 0,
+ REGISTER_MACHINE_ENABLE_FUSE = 1 << 1,
} RegisterMachineFlags;
int register_machine(
@@ -31,6 +32,7 @@ int unregister_machine(sd_bus *bus, const char *machine_name);
typedef enum AllocateScopeFlags {
ALLOCATE_SCOPE_ALLOW_PIDFD = 1 << 0,
+ ALLOCATE_SCOPE_ENABLE_FUSE = 1 << 1,
} AllocateScopeFlags;
int allocate_scope(
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 8a26333364..f8bcf26b58 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -2,6 +2,7 @@
#include <errno.h>
#include <getopt.h>
+#include <linux/fuse.h>
#include <linux/loop.h>
#if HAVE_SELINUX
#include <selinux/selinux.h>
@@ -2147,7 +2148,85 @@ static int setup_boot_id(void) {
return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
}
-static int copy_devnodes(const char *dest) {
+static int get_fuse_version(uint32_t *ret_major, uint32_t *ret_minor) {
+ /* Reports the FUSE protocol version found in the first request the kernel sends on a freshly set up FUSE
+ * handle. Returns 0 and fills in *ret_major and *ret_minor on success, or a negative value on failure.
+ * Must be called with mount privileges, either via arg_privileged or by being uid=0 in new
+ * CLONE_NEWUSER/CLONE_NEWNS namespaces. This is true when called from outer_child(). */
+ ssize_t n; /* result of read() on the FUSE handle */
+ _cleanup_close_ int fuse_fd = -EBADF, mnt_fd = -EBADF;
+ _cleanup_free_ char *opts = NULL;
+ union {
+ char unstructured[FUSE_MIN_READ_BUFFER]; /* raw view; makes the union at least FUSE_MIN_READ_BUFFER bytes */
+ struct {
+ struct fuse_in_header header;
+ /* Don't use <linux/fuse.h>:`struct fuse_init_in` because a newer fuse.h might give
+ * us a bigger struct than what an older kernel actually gives us, and that would
+ * break our .header.len check. */
+ struct {
+ uint32_t major;
+ uint32_t minor;
+ } body;
+ } structured;
+ } request;
+
+ assert(ret_major);
+ assert(ret_minor);
+
+ /* Get a FUSE handle. */
+ fuse_fd = open("/dev/fuse", O_CLOEXEC|O_RDWR);
+ if (fuse_fd < 0)
+ return log_debug_errno(errno, "Failed to open /dev/fuse: %m");
+ if (asprintf(&opts, "fd=%i,rootmode=40000,user_id=0,group_id=0", fuse_fd) < 0) /* NOTE(review): rootmode=40000 looks like octal S_IFDIR - confirm */
+ return log_oom_debug();
+ mnt_fd = make_fsmount(LOG_DEBUG, "nspawn-fuse", "fuse.nspawn", 0, opts, -EBADF); /* presumably creates a detached FUSE mount bound to fuse_fd - verify against make_fsmount() */
+ if (mnt_fd < 0)
+ return mnt_fd;
+
+ /* Read a request from the FUSE handle. */
+ n = read(fuse_fd, &request.unstructured, sizeof request);
+ if (n < 0)
+ return log_debug_errno(errno, "Failed to read /dev/fuse: %m");
+ if ((size_t) n < sizeof request.structured.header || /* header.len is only trustworthy once a full header arrived */
+ (size_t) n < request.structured.header.len)
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to read /dev/fuse: Short read");
+
+ /* Assume that the request is a FUSE_INIT request, and return the version information from it. */
+ if (request.structured.header.opcode != FUSE_INIT)
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial request from /dev/fuse should have opcode=%i (FUSE_INIT), but has opcode=%"PRIu32,
+ FUSE_INIT, request.structured.header.opcode);
+ if (request.structured.header.len < sizeof request.structured) /* need the header plus the major and minor fields */
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial FUSE_INIT request from /dev/fuse is too short");
+ *ret_major = request.structured.body.major;
+ *ret_minor = request.structured.body.minor;
+ return 0;
+}
+
+static bool should_enable_fuse(void) { /* Returns true only if get_fuse_version() succeeds and reports FUSE >= 7.27; every failure disables FUSE. */
+ uint32_t fuse_major, fuse_minor;
+ int r;
+
+ r = get_fuse_version(&fuse_major, &fuse_minor);
+ if (r < 0) {
+ if (ERRNO_IS_NEG_DEVICE_ABSENT(r)) /* expected condition, so only logged at debug level */
+ log_debug_errno(r, "Disabling FUSE: FUSE appears to be disabled on the host: %m");
+ else if (r == -ENOSYS) /* likewise expected on kernels lacking these syscalls */
+ log_debug_errno(r, "Disabling FUSE: Kernel does not support the fsopen() family of syscalls: %m");
+ else /* anything else is unexpected, hence a warning; still not fatal */
+ log_warning_errno(r, "Disabling FUSE: Failed to determine FUSE version: %m");
+ return false;
+ }
+
+ /* FUSE is only userns-safe in FUSE version 7.27 and later.
+ * https://github.com/torvalds/linux/commit/da315f6e03988a7127680bbc26e1028991b899b8 */
+ if (fuse_major < 7 || (fuse_major == 7 && fuse_minor < 27)) {
+ log_debug("Disabling FUSE: FUSE version %" PRIu32 ".%" PRIu32 " is too old to support user namespaces",
+ fuse_major, fuse_minor);
+ return false;
+ }
+
+ return true;
+}
+
+static int copy_devnodes(const char *dest, bool enable_fuse) {
_cleanup_strv_free_ char **devnodes = NULL;
int r = 0;
@@ -2159,6 +2238,7 @@ static int copy_devnodes(const char *dest) {
"random",
"urandom",
"tty",
+ STRV_IFNOTNULL(enable_fuse ? "fuse" : NULL),
"net/tun");
if (!devnodes)
return log_oom();
@@ -3807,7 +3887,7 @@ static int outer_child(
_cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
_cleanup_strv_free_ char **os_release_pairs = NULL;
_cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
- bool idmap = false;
+ bool idmap = false, enable_fuse;
const char *p;
pid_t pid;
ssize_t l;
@@ -4090,7 +4170,12 @@ static int outer_child(
if (r < 0)
return r;
- r = copy_devnodes(directory);
+ enable_fuse = should_enable_fuse();
+ l = send(fd_outer_socket, &enable_fuse, sizeof enable_fuse, 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send whether to enable FUSE: %m");
+
+ r = copy_devnodes(directory, enable_fuse);
if (r < 0)
return r;
@@ -5048,6 +5133,7 @@ static int run_container(
ssize_t l;
sigset_t mask_chld;
_cleanup_close_ int child_netns_fd = -EBADF;
+ bool enable_fuse;
assert_se(sigemptyset(&mask_chld) == 0);
assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
@@ -5234,6 +5320,12 @@ static int run_container(
l, l == 0 ? " The child is most likely dead." : "");
}
+ l = recv(fd_outer_socket_pair[0], &enable_fuse, sizeof enable_fuse, 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to read whether to enable FUSE: %m");
+ if (l != sizeof enable_fuse)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading whether to enable FUSE.");
+
/* Wait for the outer child. */
r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
if (r < 0)
@@ -5386,6 +5478,7 @@ static int run_container(
if (arg_register) {
RegisterMachineFlags flags = 0;
SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit);
+ SET_FLAG(flags, REGISTER_MACHINE_ENABLE_FUSE, enable_fuse);
r = register_machine(
bus,
arg_machine,
@@ -5406,6 +5499,7 @@ static int run_container(
} else if (!arg_keep_unit) {
AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD;
+ SET_FLAG(flags, ALLOCATE_SCOPE_ENABLE_FUSE, enable_fuse);
r = allocate_scope(
bus,
arg_machine,