summaryrefslogtreecommitdiffstats
path: root/src/nspawn/nspawn-mount.c
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2022-11-28 12:36:47 +0100
committerChristian Brauner (Microsoft) <brauner@kernel.org>2022-12-05 18:34:25 +0100
commitb71a0192c040f585397cfc6fc2ca025bf839733d (patch)
tree0c880be80f885c92d71ee05383b9656ad626c550 /src/nspawn/nspawn-mount.c
parentnspawn: support pivot_root() (diff)
downloadsystemd-b71a0192c040f585397cfc6fc2ca025bf839733d.tar.xz
systemd-b71a0192c040f585397cfc6fc2ca025bf839733d.zip
nspawn: mount temporary visible procfs and sysfs instance
In order to mount procfs and sysfs in an unprivileged container, the kernel requires that a fully visible instance is already present in the target mount namespace. Mount one here so the inner child can mount its own instances. Later, we unmount the temporary instances created here before we actually exec the payload. Since the rootfs is shared, the unmount will propagate into the container. Note that the inner child wouldn't be able to unmount the instances on its own since it doesn't own the originating mount namespace. In other words, the outer child needs to do this. So far nspawn didn't run into this issue because it used MS_MOVE, which meant that the shadow mount tree pinned a procfs and sysfs instance which the kernel would find. The shadow mount tree is gone with proper pivot_root() semantics.

Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Diffstat (limited to 'src/nspawn/nspawn-mount.c')
-rw-r--r--src/nspawn/nspawn-mount.c65
1 files changed, 63 insertions, 2 deletions
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index a54f1464ba..0e8aaa1e3c 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -13,6 +13,7 @@
#include "mkdir-label.h"
#include "mount-util.h"
#include "mountpoint-util.h"
+#include "namespace-util.h"
#include "nspawn-mount.h"
#include "parse-util.h"
#include "path-util.h"
@@ -510,6 +511,9 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
}
+#define PROC_DEFAULT_MOUNT_FLAGS (MS_NOSUID|MS_NOEXEC|MS_NODEV) /* flags used for the "proc" entry of mount_all()'s table and for the temporary procfs pin */
+#define SYS_DEFAULT_MOUNT_FLAGS (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV) /* flags used for the read-only "sysfs" entry of mount_all()'s table and for the temporary sysfs pin */
+
int mount_all(const char *dest,
MountSettingsMask mount_settings,
uid_t uid_shift,
@@ -538,7 +542,7 @@ int mount_all(const char *dest,
static const MountPoint mount_table[] = {
/* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
- { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ { "proc", "/proc", "proc", NULL, PROC_DEFAULT_MOUNT_FLAGS,
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
@@ -576,7 +580,7 @@ int mount_all(const char *dest,
MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
{ "tmpfs", "/sys", "tmpfs", "mode=555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR },
- { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ { "sysfs", "/sys", "sysfs", NULL, SYS_DEFAULT_MOUNT_FLAGS,
MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */
@@ -1336,3 +1340,60 @@ done:
return r;
}
+
+#define NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS "/run/host/proc" /* mount point of the temporary procfs set up by pin_fully_visible_fs() */
+#define NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS "/run/host/sys" /* mount point of the temporary sysfs set up by pin_fully_visible_fs() */
+
+/* Mounts a temporary procfs instance on NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS and a temporary sysfs
+ * instance on NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, using the default mount flags for each.
+ *
+ * The procfs mount is attempted first; if it fails, the sysfs mount is not attempted.
+ *
+ * Returns 0 when both mounts succeed, otherwise the negative value returned by the mount call that
+ * failed. */
+int pin_fully_visible_fs(void) {
+        int ret;
+
+        /* Creating the mount points is best-effort: the results are deliberately ignored. */
+        (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, 0755);
+        (void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, 0755);
+
+        ret = mount_follow_verbose(LOG_ERR, "proc", NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, "proc", PROC_DEFAULT_MOUNT_FLAGS, NULL);
+        if (ret >= 0)
+                /* Only go on to sysfs once procfs is in place. */
+                ret = mount_follow_verbose(LOG_ERR, "sysfs", NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, "sysfs", SYS_DEFAULT_MOUNT_FLAGS, NULL);
+
+        /* Failures are passed through unchanged; any non-negative result is reported as 0. */
+        return ret < 0 ? ret : 0;
+}
+
+/* Detaches (MNT_DETACH) the temporary procfs and sysfs mounts and removes their mount point
+ * directories, handling procfs first and sysfs second.
+ *
+ * Acts on whichever mount namespace the calling process is in at the time of the call. Stops at the
+ * first step that fails, logs the error and returns the result of log_error_errno(); returns 0 when
+ * all four steps succeed. */
+static int do_wipe_fully_visible_fs(void) {
+        int r;
+
+        r = umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, MNT_DETACH);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to unmount temporary proc: %m");
+
+        r = rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to remove temporary proc mountpoint: %m");
+
+        r = umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, MNT_DETACH);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to unmount temporary sys: %m");
+
+        r = rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to remove temporary sys mountpoint: %m");
+
+        return 0;
+}
+
+/* Removes the temporary procfs/sysfs mounts from the mount namespace referred to by mntns_fd.
+ *
+ * Sequence: obtain an fd for a mount namespace via namespace_open() with pid 0 (presumably the
+ * calling process' own namespace, as the log message suggests), enter the namespace given by
+ * mntns_fd, run do_wipe_fully_visible_fs() there, then switch back to the saved namespace.
+ *
+ * Returns the result of do_wipe_fully_visible_fs() if the switch back succeeds. If opening or
+ * entering a namespace fails, the error is logged and the result of log_error_errno() is returned
+ * instead; a failure to switch back takes precedence over the wipe result. */
+int wipe_fully_visible_fs(int mntns_fd) {
+        _cleanup_close_ int saved_mntns_fd = -EBADF;
+        int wipe_result, ret;
+
+        ret = namespace_open(0, NULL, &saved_mntns_fd, NULL, NULL, NULL);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to pin originating mount namespace: %m");
+
+        ret = namespace_enter(-EBADF, mntns_fd, -EBADF, -EBADF, -EBADF);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to enter mount namespace: %m");
+
+        /* Keep the outcome for later: the switch back below is attempted regardless of whether
+         * the wipe succeeded. */
+        wipe_result = do_wipe_fully_visible_fs();
+
+        ret = namespace_enter(-EBADF, saved_mntns_fd, -EBADF, -EBADF, -EBADF);
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to enter original mount namespace: %m");
+
+        return wipe_result;
+}