diff options
author | Lennart Poettering <lennart@poettering.net> | 2023-12-12 11:00:19 +0100 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2024-04-06 16:08:24 +0200 |
commit | 0af7e294343b29d769c1bae6a8d390236560ec1a (patch) | |
tree | 610cd943568c1bc7e0fe3de6f448e75feb9d4de5 | |
parent | core: implement RootImage= via mountfsd in unprivileged environments (diff) | |
download | systemd-0af7e294343b29d769c1bae6a8d390236560ec1a.tar.xz systemd-0af7e294343b29d769c1bae6a8d390236560ec1a.zip |
nspawn: make nspawn work without privileges
-rw-r--r-- | man/systemd-nspawn.xml | 24 | ||||
-rw-r--r-- | src/nspawn/nspawn-cgroup.c | 87 | ||||
-rw-r--r-- | src/nspawn/nspawn-cgroup.h | 3 | ||||
-rw-r--r-- | src/nspawn/nspawn-mount.c | 70 | ||||
-rw-r--r-- | src/nspawn/nspawn-mount.h | 1 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 571 |
6 files changed, 517 insertions, 239 deletions
diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index 4b9c252784..c7359f298a 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -119,6 +119,28 @@ </refsect1> <refsect1> + <title>Unprivileged Operation</title> + + <para><command>systemd-nspawn</command> may be invoked with or without privileges. The full functionality + is currently only available when invoked with privileges. When invoked without privileges, various + limitations apply, including, but not limited to:</para> + + <itemizedlist> + <listitem><para>Only disk image based containers are supported (i.e. <option>--image=</option>). + Directory based ones (i.e. <option>--directory=</option>) are not supported.</para></listitem> + + <listitem><para>Machine registration via <option>--machine=</option> is not supported.</para></listitem> + + <listitem><para>Only <option>--private-network</option> and <option>--network-veth</option> networking modes are supported.</para></listitem> + </itemizedlist> + + <para>When running in unprivileged mode, some needed functionality is provided via + <citerefentry><refentrytitle>systemd-mountfsd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry> + and + <citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry></para> + </refsect1> + + <refsect1> <title>Options</title> <para>If option <option>--boot</option> is specified, the arguments @@ -1910,6 +1932,8 @@ After=sys-subsystem-net-devices-ens1.device</programlisting> <member><citerefentry><refentrytitle>systemd.slice</refentrytitle><manvolnum>5</manvolnum></citerefentry></member> <member><citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry></member> <member><citerefentry><refentrytitle>importctl</refentrytitle><manvolnum>1</manvolnum></citerefentry></member> + <member><citerefentry><refentrytitle>systemd-mountfsd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry></member> + 
<member><citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry></member> <member><citerefentry project='url'><refentrytitle url='https://btrfs.readthedocs.io/en/latest/btrfs.html'>btrfs</refentrytitle><manvolnum>8</manvolnum></citerefentry></member> </simplelist></para> </refsect1> diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index c4a784fd64..4f28b4a225 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -13,6 +13,7 @@ #include "mountpoint-util.h" #include "nspawn-cgroup.h" #include "nspawn-mount.h" +#include "nsresource.h" #include "path-util.h" #include "rm-rf.h" #include "string-util.h" @@ -46,38 +47,6 @@ static int chown_cgroup_path(const char *path, uid_t uid_shift) { return 0; } -int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) { - _cleanup_free_ char *path = NULL, *fs = NULL; - int r; - - r = cg_pid_get_path(NULL, pid, &path); - if (r < 0) - return log_error_errno(r, "Failed to get container cgroup path: %m"); - - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs); - if (r < 0) - return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); - - r = chown_cgroup_path(fs, uid_shift); - if (r < 0) - return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs); - - if (unified_requested == CGROUP_UNIFIED_SYSTEMD || (unified_requested == CGROUP_UNIFIED_NONE && cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)) { - _cleanup_free_ char *lfs = NULL; - /* Always propagate access rights from unified to legacy controller */ - - r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, NULL, &lfs); - if (r < 0) - return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); - - r = chown_cgroup_path(lfs, uid_shift); - if (r < 0) - return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs); - } - - return 0; -} - int sync_cgroup(pid_t pid, CGroupUnified 
unified_requested, uid_t uid_shift) { _cleanup_free_ char *cgroup = NULL; char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1]; @@ -142,7 +111,14 @@ finish: return r; } -int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) { +int create_subcgroup( + pid_t pid, + bool keep_unit, + CGroupUnified unified_requested, + uid_t uid_shift, + int userns_fd, + bool privileged) { + _cleanup_free_ char *cgroup = NULL, *payload = NULL; CGroupMask supported; char *e; @@ -185,13 +161,54 @@ int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) if (!payload) return log_oom(); - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid); + if (privileged) + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid); + else + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, payload); if (r < 0) return log_error_errno(r, "Failed to create %s subcgroup: %m", payload); + if (privileged) { + _cleanup_free_ char *fs = NULL; + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, payload, NULL, &fs); + if (r < 0) + return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); + + r = chown_cgroup_path(fs, uid_shift); + if (r < 0) + return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs); + + } else if (userns_fd >= 0) { + _cleanup_close_ int cgroup_fd = -EBADF; + + cgroup_fd = cg_path_open(SYSTEMD_CGROUP_CONTROLLER, payload); + if (cgroup_fd < 0) + return log_error_errno(cgroup_fd, "Failed to open cgroup %s: %m", payload); + + r = cg_fd_attach(cgroup_fd, pid); + if (r < 0) + return log_error_errno(r, "Failed to add process " PID_FMT " to cgroup %s: %m", pid, payload); + + r = nsresource_add_cgroup(userns_fd, cgroup_fd); + if (r < 0) + return log_error_errno(r, "Failed to add cgroup %s to userns: %m", payload); + } + + if (unified_requested == CGROUP_UNIFIED_SYSTEMD || (unified_requested == CGROUP_UNIFIED_NONE && cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)) { + _cleanup_free_ char 
*lfs = NULL; + /* Always propagate access rights from unified to legacy controller */ + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, payload, NULL, &lfs); + if (r < 0) + return log_error_errno(r, "Failed to get file system path for container cgroup: %m"); + + r = chown_cgroup_path(lfs, uid_shift); + if (r < 0) + return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs); + } + if (keep_unit) { _cleanup_free_ char *supervisor = NULL; - supervisor = path_join(cgroup, "supervisor"); if (!supervisor) return log_oom(); diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h index 3f5ba622d8..7e2cd53ddc 100644 --- a/src/nspawn/nspawn-cgroup.h +++ b/src/nspawn/nspawn-cgroup.h @@ -6,9 +6,8 @@ #include "cgroup-util.h" -int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift); int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift); -int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested); +int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested, uid_t uid_shift, int userns_fd, bool privileged); int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns); int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested); diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index e94ffd799e..c2bd4f6c30 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -444,22 +444,38 @@ int tmpfs_patch_options( } int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { - const char *full, *top; - int r; + _cleanup_free_ char *top = NULL, *full = NULL;; unsigned long extra_flags = 0; + int r; - top = prefix_roota(dest, "/sys"); - r = path_is_fs_type(top, SYSFS_MAGIC); + top = path_join(dest, "/sys"); + if (!top) + return log_oom(); + + r = path_is_mount_point(top); if (r < 0) - return log_error_errno(r, 
"Failed to determine filesystem type of %s: %m", top); - /* /sys might already be mounted as sysfs by the outer child in the - * !netns case. In this case, it's all good. Don't touch it because we - * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555. - */ - if (r > 0) - return 0; + return log_error_errno(r, "Failed to determine if '%s' is a mountpoint: %m", top); + if (r == 0) { + /* If this is not a mount point yet, then mount a tmpfs there */ + r = mount_nofollow_verbose(LOG_ERR, "tmpfs", top, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0555" TMPFS_LIMITS_SYS); + if (r < 0) + return r; + } else { + r = path_is_fs_type(top, SYSFS_MAGIC); + if (r < 0) + return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top); + + /* /sys/ might already be mounted as sysfs by the outer child in the !netns case. In this case, it's + * all good. Don't touch it because we don't have the right to do so, see + * https://github.com/systemd/systemd/issues/1555. + */ + if (r > 0) + return 0; + } - full = prefix_roota(top, "/full"); + full = path_join(top, "/full"); + if (!full) + return log_oom(); (void) mkdir(full, 0755); @@ -501,10 +517,11 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) { if (rmdir(full) < 0) return log_error_errno(errno, "Failed to remove %s: %m", full); - /* Create mountpoint for cgroups. Otherwise we are not allowed since we - * remount /sys read-only. - */ - const char *x = prefix_roota(top, "/fs/cgroup"); + /* Create mountpoint for cgroups. Otherwise we are not allowed since we remount /sys/ read-only. */ + _cleanup_free_ char *x = path_join(top, "/fs/cgroup"); + if (!x) + return log_oom(); + (void) mkdir_p(x, 0755); return mount_nofollow_verbose(LOG_ERR, NULL, top, NULL, @@ -541,7 +558,7 @@ int mount_all(const char *dest, } MountPoint; static const MountPoint mount_table[] = { - /* First we list inner child mounts (i.e. 
mounts applied *after* entering user namespacing) */ + /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing when we are privileged) */ { "proc", "/proc", "proc", NULL, PROC_DEFAULT_MOUNT_FLAGS, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */ @@ -575,15 +592,15 @@ int mount_all(const char *dest, { "mqueue", "/dev/mqueue", "mqueue", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_IN_USERNS|MOUNT_MKDIR }, - /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */ + /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing when we are privileged) */ { "tmpfs", "/tmp", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR }, { "tmpfs", "/sys", "tmpfs", "mode=0555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV, - MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR }, + MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR|MOUNT_PRIVILEGED }, { "sysfs", "/sys", "sysfs", NULL, SYS_DEFAULT_MOUNT_FLAGS, - MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */ + MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR|MOUNT_PRIVILEGED }, /* skipped if above was mounted */ { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, - MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */ + MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PRIVILEGED }, /* skipped if above was mounted */ { "tmpfs", "/dev", "tmpfs", "mode=0755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL|MOUNT_MKDIR }, { "tmpfs", "/dev/shm", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME, @@ -604,11 +621,11 @@ int mount_all(const char *dest, MOUNT_FATAL|MOUNT_IN_USERNS }, #if HAVE_SELINUX { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, - 
MOUNT_MKDIR }, /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */ + MOUNT_MKDIR|MOUNT_PRIVILEGED }, /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */ { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, - 0 }, /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */ + MOUNT_PRIVILEGED }, /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */ { NULL, "/sys/fs/selinux", NULL, NULL, MS_PRIVATE, - 0 }, /* Turn off propagation (we only want that for the mount propagation tunnel dir) */ + MOUNT_PRIVILEGED }, /* Turn off propagation (we only want that for the mount propagation tunnel dir) */ #endif }; @@ -617,6 +634,7 @@ int mount_all(const char *dest, bool ro = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO); bool in_userns = FLAGS_SET(mount_settings, MOUNT_IN_USERNS); bool tmpfs_tmp = FLAGS_SET(mount_settings, MOUNT_APPLY_TMPFS_TMP); + bool privileged = FLAGS_SET(mount_settings, MOUNT_PRIVILEGED); int r; for (size_t k = 0; k < ELEMENTSOF(mount_table); k++) { @@ -624,6 +642,10 @@ int mount_all(const char *dest, bool fatal = FLAGS_SET(mount_table[k].mount_settings, MOUNT_FATAL); const char *o; + /* If we are not privileged but the entry is marked as privileged and to be mounted outside the user namespace, then skip it */ + if (!privileged && FLAGS_SET(mount_table[k].mount_settings, MOUNT_PRIVILEGED) && !FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS)) + continue; + if (in_userns != FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS)) continue; diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h index bf5e47dce4..9112f24b94 100644 --- a/src/nspawn/nspawn-mount.h +++ b/src/nspawn/nspawn-mount.h @@ -20,6 +20,7 @@ typedef enum MountSettingsMask { MOUNT_TOUCH = 1 << 9, /* if set, touch file to mount over 
first */ MOUNT_PREFIX_ROOT = 1 << 10,/* if set, prefix the source path with the container's root directory */ MOUNT_FOLLOW_SYMLINKS = 1 << 11,/* if set, we'll follow symlinks for the mount target */ + MOUNT_PRIVILEGED = 1 << 12,/* if set, we'll only mount this in the outer child if we are running in privileged mode */ } MountSettingsMask; typedef enum CustomMountType { diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 3d0a9a73e3..7edb45dd17 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -84,6 +84,7 @@ #include "nspawn-stub-pid1.h" #include "nspawn-util.h" #include "nspawn.h" +#include "nsresource.h" #include "nulstr-util.h" #include "os-util.h" #include "pager.h" @@ -237,6 +238,7 @@ static char *arg_settings_filename = NULL; static Architecture arg_architecture = _ARCHITECTURE_INVALID; static ImagePolicy *arg_image_policy = NULL; static char *arg_background = NULL; +static bool arg_privileged = false; STATIC_DESTRUCTOR_REGISTER(arg_directory, freep); STATIC_DESTRUCTOR_REGISTER(arg_template, freep); @@ -518,6 +520,12 @@ static int detect_unified_cgroup_hierarchy_from_environment(void) { static int detect_unified_cgroup_hierarchy_from_image(const char *directory) { int r; + if (!arg_privileged) { + /* We only support the unified mode when running unprivileged */ + arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL; + return 0; + } + /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd * in the image actually supports. 
*/ r = cg_all_unified(); @@ -619,7 +627,6 @@ static int parse_mount_settings_env(void) { e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE"); if (streq_ptr(e, "network")) arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS; - else if (e) { r = parse_boolean(e); if (r < 0) @@ -1653,6 +1660,21 @@ static int parse_argv(int argc, char *argv[]) { static int verify_arguments(void) { int r; + SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged); + + if (!arg_privileged) { + /* machined is not accessible to unpriv clients */ + if (arg_register) { + log_notice("Automatically implying --register=no, since machined is not accessible to unprivileged clients."); + arg_register = false; + } + + if (!arg_private_network) { + log_notice("Automatically implying --private-network, since mounting /sys/ in an unprivileged user namespaces requires network namespacing."); + arg_private_network = true; + } + } + if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) { /* If we are running the stub init in the container, we don't need to look at what the init * in the container supports, because we are not using it. 
Let's immediately pick the right @@ -2692,6 +2714,9 @@ static int reset_audit_loginuid(void) { if ((arg_clone_ns_flags & CLONE_NEWPID) == 0) return 0; + if (!arg_privileged) + return 0; + r = read_one_line_file("/proc/self/loginuid", &p); if (r == -ENOENT) return 0; @@ -2721,6 +2746,11 @@ static int mount_tunnel_dig(const char *root) { const char *p, *q; int r; + if (!arg_privileged) { + log_debug("Not digging mount tunnel, because running unprivileged."); + return 0; + } + (void) mkdir_p("/run/systemd/nspawn/", 0755); (void) mkdir_p("/run/systemd/nspawn/propagate", 0600); p = strjoina("/run/systemd/nspawn/propagate/", arg_machine); @@ -2749,6 +2779,11 @@ static int mount_tunnel_dig(const char *root) { static int mount_tunnel_open(void) { int r; + if (!arg_privileged) { + log_debug("Not opening up mount tunnel, because running unprivileged."); + return 0; + } + r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL); if (r < 0) return r; @@ -3270,20 +3305,32 @@ static int inner_child( return r; if (!arg_network_namespace_path && arg_private_network) { - r = unshare(CLONE_NEWNET); + _cleanup_close_ int netns_fd = -EBADF; + + if (arg_privileged) { + if (unshare(CLONE_NEWNET) < 0) + return log_error_errno(errno, "Failed to unshare network namespace: %m"); + } + + netns_fd = namespace_open_by_type(NAMESPACE_NET); + if (netns_fd < 0) + return log_error_errno(netns_fd, "Failed to open newly allocate network namespace: %m"); + + r = send_one_fd(fd_inner_socket, netns_fd, 0); if (r < 0) - return log_error_errno(errno, "Failed to unshare network namespace: %m"); + return log_error_errno(r, "Failed to send network namespace to supervisor: %m"); /* Tell the parent that it can setup network interfaces. 
*/ (void) barrier_place(barrier); /* #3 */ } - r = mount_sysfs(NULL, arg_mount_settings); - if (r < 0) - return r; + if (arg_privileged) { + r = mount_sysfs(NULL, arg_mount_settings); + if (r < 0) + return r; + } - /* Wait until we are cgroup-ified, so that we - * can mount the right cgroup path writable */ + /* Wait until we are cgroup-ified, so that we can mount the right cgroup path writable */ if (!barrier_place_and_sync(barrier)) /* #4 */ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early"); @@ -3584,11 +3631,11 @@ static int inner_child( return log_error_errno(errno, "execv(%s) failed: %m", exec_target); } -static int setup_notify_child(void) { +static int setup_notify_child(const void *directory) { _cleanup_close_ int fd = -EBADF; - static const union sockaddr_union sa = { + _cleanup_free_ char *j = NULL; + union sockaddr_union sa = { .un.sun_family = AF_UNIX, - .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH, }; int r; @@ -3596,7 +3643,17 @@ static int setup_notify_child(void) { if (fd < 0) return log_error_errno(errno, "Failed to allocate notification socket: %m"); - (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755); + if (directory) { + j = path_join(directory, NSPAWN_NOTIFY_SOCKET_PATH); + if (!j) + return log_oom(); + } + + r = sockaddr_un_set_path(&sa.un, j ?: NSPAWN_NOTIFY_SOCKET_PATH); + if (r < 0) + return log_error_errno(r, "Failed to set AF_UNIX path to %s: %m", j ?: NSPAWN_NOTIFY_SOCKET_PATH); + + (void) mkdir_parents(sa.un.sun_path, 0755); (void) sockaddr_un_unlink(&sa.un); WITH_UMASK(0577) { /* only set "w" bit, which is all that's necessary for connecting from the container */ @@ -3605,7 +3662,7 @@ static int setup_notify_child(void) { return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m"); } - r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0); + r = userns_lchown(sa.un.sun_path, 0, 0); if (r < 0) return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m"); @@ -3621,6 +3678,11 @@ 
static int setup_unix_export_dir_outside(char **ret) { assert(ret); + if (!arg_privileged) { + log_debug("Not digging socket tunnel, because running unprivileged."); + return 0; + } + _cleanup_free_ char *p = NULL; p = path_join("/run/systemd/nspawn/unix-export", arg_machine); if (!p) @@ -3672,6 +3734,10 @@ static int setup_unix_export_host_inside(const char *directory, const char *unix int r; assert(directory); + + if (!arg_privileged) + return 0; + assert(unix_export_path); r = make_run_host(directory); @@ -3714,10 +3780,16 @@ static int setup_unix_export_host_inside(const char *directory, const char *unix static DissectImageFlags determine_dissect_image_flags(void) { return + DISSECT_IMAGE_GENERIC_ROOT | + DISSECT_IMAGE_REQUIRE_ROOT | + DISSECT_IMAGE_RELAX_VAR_CHECK | DISSECT_IMAGE_USR_NO_ROOT | DISSECT_IMAGE_DISCARD_ON_LOOP | + DISSECT_IMAGE_ADD_PARTITION_DEVICES | + DISSECT_IMAGE_PIN_PARTITION_DEVICES | (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS) | - DISSECT_IMAGE_ALLOW_USERSPACE_VERITY; + DISSECT_IMAGE_ALLOW_USERSPACE_VERITY | + (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0); } static int outer_child( @@ -4093,47 +4165,59 @@ static int outer_child( return r; } - /* Mark everything as shared so our mounts get propagated down. This is required to make new bind - * mounts available in systemd services inside the container that create a new mount namespace. See - * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this - * will inherit the shared propagation mode. - * - * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root - * directory mount to root later on. 
- * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251 - */ - r = mount_switch_root(directory, MS_SHARED); - if (r < 0) - return log_error_errno(r, "Failed to move root directory: %m"); + /* We have different codepaths here for privileged and non-privileged mode. In privileged mode we'll + * now switch into the target directory, and then do the final setup from there. If a user namespace + * is then allocated for the container, the root mount and everything else will be out of reach for + * it. For unprivileged containers we cannot do that however, since we couldn't mount a sysfs and + * procfs then anymore, since that only works if there's an unobstructed instance currently + * visible. Hence there we do it the other way round: we first allocate a new set of namespaces + * (and fork for it) for which we then mount sysfs/procfs, and only then switch root. */ - /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a - * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into - * the container. */ - r = mount_tunnel_open(); - if (r < 0) - return r; + if (arg_privileged) { + /* Mark everything as shared so our mounts get propagated down. This is required to make new + * bind mounts available in systemd services inside the container that create a new mount + * namespace. See https://github.com/systemd/systemd/issues/3860 Further submounts (such as + * /dev/) done after this will inherit the shared propagation mode. + * + * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root + * directory mount to root later on. 
+ * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251 + */ + r = mount_switch_root(directory, MS_SHARED); + if (r < 0) + return log_error_errno(r, "Failed to move root directory: %m"); - if (arg_userns_mode != USER_NAMESPACE_NO) { - /* In order to mount procfs and sysfs in an unprivileged container the kernel - * requires that a fully visible instance is already present in the target mount - * namespace. Mount one here so the inner child can mount its own instances. Later - * we umount the temporary instances created here before we actually exec the - * payload. Since the rootfs is shared the umount will propagate into the container. - * Note, the inner child wouldn't be able to unmount the instances on its own since - * it doesn't own the originating mount namespace. IOW, the outer child needs to do - * this. */ - r = pin_fully_visible_fs(); + /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a + * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into + * the container. */ + r = mount_tunnel_open(); if (r < 0) return r; - } - fd = setup_notify_child(); + if (arg_userns_mode != USER_NAMESPACE_NO) { + /* In order to mount procfs and sysfs in an unprivileged container the kernel + * requires that a fully visible instance is already present in the target mount + * namespace. Mount one here so the inner child can mount its own instances. Later + * we umount the temporary instances created here before we actually exec the + * payload. Since the rootfs is shared the umount will propagate into the container. + * Note, the inner child wouldn't be able to unmount the instances on its own since + * it doesn't own the originating mount namespace. IOW, the outer child needs to do + * this. 
*/ + r = pin_fully_visible_fs(); + if (r < 0) + return r; + } + + fd = setup_notify_child(NULL); + } else + fd = setup_notify_child(directory); if (fd < 0) return fd; pid = raw_clone(SIGCHLD|CLONE_NEWNS| arg_clone_ns_flags | - (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0)); + (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0) | + ((arg_private_network && !arg_privileged) ? CLONE_NEWNET : 0)); if (pid < 0) return log_error_errno(errno, "Failed to fork inner child: %m"); if (pid == 0) { @@ -4152,6 +4236,26 @@ static int outer_child( return log_error_errno(r, "Failed to join network namespace: %m"); } + if (!arg_privileged) { + /* In unprivileged operation, sysfs + procfs are special, we'll have to mount them + * inside the inner namespaces, but before we switch root. Hence do so here. */ + _cleanup_free_ char *j = path_join(directory, "/proc"); + if (!j) + return log_oom(); + + r = mount_follow_verbose(LOG_ERR, "proc", j, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); + if (r < 0) + return r; + + r = mount_sysfs(directory, arg_mount_settings); + if (r < 0) + return r; + + r = mount_switch_root(directory, MS_SHARED); + if (r < 0) + return log_error_errno(r, "Failed to move root directory: %m"); + } + r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs); if (r < 0) _exit(EXIT_FAILURE); @@ -4433,6 +4537,9 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) { int r; + if (fd < 0) + return 0; + r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid); if (r < 0) return log_error_errno(r, "Failed to allocate notify event source: %m"); @@ -4812,26 +4919,28 @@ static int load_settings(void) { return 0; /* We first look in the admin's directories in /etc and /run */ - FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") { - _cleanup_free_ 
char *j = NULL; + if (arg_privileged) { + FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") { + _cleanup_free_ char *j = NULL; - j = path_join(i, arg_settings_filename); - if (!j) - return log_oom(); + j = path_join(i, arg_settings_filename); + if (!j) + return log_oom(); - f = fopen(j, "re"); - if (f) { - p = TAKE_PTR(j); + f = fopen(j, "re"); + if (f) { + p = TAKE_PTR(j); - /* By default, we trust configuration from /etc and /run */ - if (arg_settings_trusted < 0) - arg_settings_trusted = true; + /* By default, we trust configuration from /etc and /run */ + if (arg_settings_trusted < 0) + arg_settings_trusted = true; - break; - } + break; + } - if (errno != ENOENT) - return log_error_errno(errno, "Failed to open %s: %m", j); + if (errno != ENOENT) + return log_error_errno(errno, "Failed to open %s: %m", j); + } } if (!f) { @@ -4891,10 +5000,14 @@ static int load_oci_bundle(void) { static int run_container( DissectedImage *dissected_image, + int userns_fd, FDSet *fds, - char veth_name[IFNAMSIZ], bool *veth_created, + char veth_name[IFNAMSIZ], + bool *veth_created, struct ExposeArgs *expose_args, - int *master, pid_t *pid, int *ret) { + int *master, + pid_t *pid, + int *ret) { static const struct sigaction sa = { .sa_handler = nop_signal_handler, @@ -4979,11 +5092,44 @@ static int run_container( "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path); } - *pid = raw_clone(SIGCHLD|CLONE_NEWNS); - if (*pid < 0) - return log_error_errno(errno, "clone() failed%s: %m", - errno == EINVAL ? - ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : ""); + if (arg_privileged) { + assert(userns_fd < 0); + + /* If we have no user namespace then we'll clone and create a new mount namepsace right-away. */ + + *pid = raw_clone(SIGCHLD|CLONE_NEWNS); + if (*pid < 0) + return log_error_errno(errno, "clone() failed%s: %m", + errno == EINVAL ? 
+ ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : ""); + } else { + assert(userns_fd >= 0); + + /* If we have a user namespace then we'll clone() first, and then join the user namespace, + * and then open the mount namespace, so that it is owned by the user namespace */ + + *pid = raw_clone(SIGCHLD); + if (*pid < 0) + return log_error_errno(errno, "clone() failed: %m"); + + if (*pid == 0) { + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + log_error_errno(errno, "Failed to join allocate user namespace: %m"); + _exit(EXIT_FAILURE); + } + + r = reset_uid_gid(); + if (r < 0) { + log_error_errno(r, "Failed to reset UID/GID to root: %m"); + _exit(EXIT_FAILURE); + } + + if (unshare(CLONE_NEWNS) < 0) { + log_error_errno(errno, "Failed to unshare file system namespace: %m"); + _exit(EXIT_FAILURE); + } + } + } if (*pid == 0) { /* The outer child only has a file system namespace. */ @@ -5120,19 +5266,13 @@ static int run_container( /* Wait until the child has unshared its network namespace. */ if (!barrier_place_and_sync(&barrier)) /* #3 */ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early"); - } - if (child_netns_fd < 0) { - /* Make sure we have an open file descriptor to the child's network - * namespace so it stays alive even if the child exits. */ - r = namespace_open(*pid, - /* ret_pidns_fd = */ NULL, - /* ret_mntns_fd = */ NULL, - &child_netns_fd, - /* ret_userns_fd = */ NULL, - /* ret_root_fd = */ NULL); - if (r < 0) - return log_error_errno(r, "Failed to open child network namespace: %m"); + /* Make sure we have an open file descriptor to the child's network namespace so it + * stays alive even if the child exits. 
*/ + assert(child_netns_fd < 0); + child_netns_fd = receive_one_fd(fd_inner_socket_pair[0], 0); + if (child_netns_fd < 0) + return log_error_errno(r, "Failed to receive child network namespace: %m"); } r = move_network_interfaces(child_netns_fd, arg_network_interfaces); @@ -5140,12 +5280,29 @@ static int run_container( return r; if (arg_network_veth) { - r = setup_veth(arg_machine, *pid, veth_name, - arg_network_bridge || arg_network_zone, &arg_network_provided_mac); - if (r < 0) - return r; - else if (r > 0) - ifi = r; + if (arg_privileged) { + r = setup_veth(arg_machine, *pid, veth_name, + arg_network_bridge || arg_network_zone, &arg_network_provided_mac); + if (r < 0) + return r; + else if (r > 0) + ifi = r; + } else { + _cleanup_free_ char *host_ifname = NULL; + + r = nsresource_add_netif(userns_fd, child_netns_fd, /* namespace_ifname= */ NULL, &host_ifname, /* ret_namespace_ifname= */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to add network interface to container: %m"); + + ifi = if_nametoindex(host_ifname); + if (ifi == 0) + return log_error_errno(errno, "Failed to resolve interface '%s': %m", host_ifname); + + if (strlen(host_ifname) >= IFNAMSIZ) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Host interface name too long?"); + + strcpy(veth_name, host_ifname); + } if (arg_network_bridge) { /* Add the interface to a bridge */ @@ -5184,9 +5341,12 @@ static int run_container( } if (arg_register || !arg_keep_unit) { - r = sd_bus_default_system(&bus); + if (arg_privileged) + r = sd_bus_default_system(&bus); + else + r = sd_bus_default_user(&bus); if (r < 0) - return log_error_errno(r, "Failed to open system bus: %m"); + return log_error_errno(r, "Failed to open bus: %m"); r = sd_bus_set_close_on_exit(bus, false); if (r < 0) @@ -5247,7 +5407,13 @@ static int run_container( } else if (arg_slice || arg_property) log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect."); - r = 
create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy); + r = create_subcgroup( + *pid, + arg_keep_unit, + arg_unified_cgroup_hierarchy, + arg_uid_shift, + userns_fd, + arg_privileged); if (r < 0) return r; @@ -5255,14 +5421,8 @@ static int run_container( if (r < 0) return r; - r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift); - if (r < 0) - return r; - - /* Notify the child that the parent is ready with all - * its setup (including cgroup-ification), and that - * the child can now hand over control to the code to - * run inside the container. */ + /* Notify the child that the parent is ready with all its setup (including cgroup-ification), and + * that the child can now hand over control to the code to run inside the container. */ (void) barrier_place(&barrier); /* #4 */ /* Block SIGCHLD here, before notifying child. @@ -5428,7 +5588,7 @@ static int run_container( fd_kmsg_fifo = safe_close(fd_kmsg_fifo); - if (arg_private_network) { + if (arg_private_network && arg_privileged) { r = move_back_network_interfaces(child_netns_fd, arg_network_interfaces); if (r < 0) return r; @@ -5569,6 +5729,10 @@ static int cant_be_in_netns(void) { if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r)) return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev."); + if (ERRNO_IS_NEG_PRIVILEGE(r)) { + log_debug_errno(r, "Can't connect to udev control socket, assuming we are in same netns."); + return 0; + } if (r < 0) return log_error_errno(r, "Failed to connect socket to udev control socket: %m"); @@ -5587,7 +5751,7 @@ static int cant_be_in_netns(void) { static int run(int argc, char *argv[]) { bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false; - _cleanup_close_ int master = -EBADF; + _cleanup_close_ int master = -EBADF, userns_fd = -EBADF; _cleanup_fdset_free_ FDSet *fds = NULL; int r, n_fd_passed, ret = EXIT_SUCCESS; 
char veth_name[IFNAMSIZ] = ""; @@ -5602,17 +5766,12 @@ static int run(int argc, char *argv[]) { log_parse_environment(); log_open(); + arg_privileged = getuid() == 0; + r = parse_argv(argc, argv); if (r <= 0) goto finish; - if (geteuid() != 0) { - r = log_warning_errno(SYNTHETIC_ERRNO(EPERM), - argc >= 2 ? "Need to be root." : - "Need to be root (and some arguments are usually required).\nHint: try --help"); - goto finish; - } - r = cant_be_in_netns(); if (r < 0) goto finish; @@ -5643,7 +5802,7 @@ static int run(int argc, char *argv[]) { if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0) arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE); - r = cg_unified(); + r = cg_unified(); /* initialize cache early */ if (r < 0) { log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m"); goto finish; @@ -5664,6 +5823,16 @@ static int run(int argc, char *argv[]) { /* Reapply environment settings. */ (void) detect_unified_cgroup_hierarchy_from_environment(); + if (!arg_privileged) { + r = cg_all_unified(); + if (r < 0) { + log_error_errno(r, "Failed to determine if we are in unified cgroupv2 mode: %m"); + goto finish; + } + if (r == 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unprivileged operation only supported in unified cgroupv2 mode."); + } + /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if * the result is closed. Note that the container payload child will reset signal mask+handler anyway, * so just turning this off here means we only turn it off in nspawn itself, not any children. */ @@ -5683,9 +5852,21 @@ static int run(int argc, char *argv[]) { * the child. Functions like copy_devnodes() change the umask temporarily. */ umask(0022); + if (arg_console_mode < 0) + arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ? 
+ CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY; + + if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */ + arg_quiet = true; + if (arg_directory) { assert(!arg_image); + if (!arg_privileged) { + r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is currently not supported if called without privileges."); + goto finish; + } + /* Safety precaution: let's not allow running images from the live host OS image, as long as * /var from the host will propagate into container dynamically (because bad things happen if * two systems write to the same /var). Let's allow it for the special cases where /var is @@ -5722,7 +5903,11 @@ static int run(int argc, char *argv[]) { /* We take an exclusive lock on this image, since it's our private, ephemeral copy * only owned by us and no one else. */ - r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock); + r = image_path_lock( + np, + LOCK_EX|LOCK_NB, + arg_privileged ? &tree_global_lock : NULL, + &tree_local_lock); if (r < 0) { log_error_errno(r, "Failed to lock %s: %m", np); goto finish; @@ -5754,7 +5939,11 @@ static int run(int argc, char *argv[]) { if (r < 0) goto finish; - r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock); + r = image_path_lock( + arg_directory, + (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, + arg_privileged ? 
&tree_global_lock : NULL, + &tree_local_lock); if (r == -EBUSY) { log_error_errno(r, "Directory tree %s is currently busy.", arg_directory); goto finish; @@ -5846,15 +6035,12 @@ static int run(int argc, char *argv[]) { } else { DissectImageFlags dissect_image_flags = - DISSECT_IMAGE_GENERIC_ROOT | - DISSECT_IMAGE_REQUIRE_ROOT | - DISSECT_IMAGE_RELAX_VAR_CHECK | - DISSECT_IMAGE_USR_NO_ROOT | - DISSECT_IMAGE_ADD_PARTITION_DEVICES | - DISSECT_IMAGE_PIN_PARTITION_DEVICES; + determine_dissect_image_flags(); + assert(arg_image); assert(!arg_template); + r = chase_and_update(&arg_image, 0); if (r < 0) goto finish; @@ -5869,7 +6055,11 @@ static int run(int argc, char *argv[]) { } /* Always take an exclusive lock on our own ephemeral copy. */ - r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock); + r = image_path_lock( + np, + LOCK_EX|LOCK_NB, + arg_privileged ? &tree_global_lock : NULL, + &tree_local_lock); if (r < 0) { log_error_errno(r, "Failed to create image lock: %m"); goto finish; @@ -5894,7 +6084,11 @@ static int run(int argc, char *argv[]) { free_and_replace(arg_image, np); remove_image = true; } else { - r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock); + r = image_path_lock( + arg_image, + (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, + arg_privileged ? &tree_global_lock : NULL, + &tree_local_lock); if (r == -EBUSY) { log_error_errno(r, "Disk image %s is currently busy.", arg_image); goto finish; @@ -5929,56 +6123,80 @@ static int run(int argc, char *argv[]) { goto finish; } - r = loop_device_make_by_path( - arg_image, - arg_read_only ? O_RDONLY : O_RDWR, - /* sector_size= */ UINT32_MAX, - FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 
0 : LO_FLAGS_PARTSCAN, - LOCK_SH, - &loop); - if (r < 0) { - log_error_errno(r, "Failed to set up loopback block device: %m"); - goto finish; - } + if (arg_privileged) { + r = loop_device_make_by_path( + arg_image, + arg_read_only ? O_RDONLY : O_RDWR, + /* sector_size= */ UINT32_MAX, + FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN, + LOCK_SH, + &loop); + if (r < 0) { + log_error_errno(r, "Failed to set up loopback block device: %m"); + goto finish; + } - r = dissect_loop_device_and_warn( - loop, - &arg_verity_settings, - /* mount_options=*/ NULL, - arg_image_policy ?: &image_policy_container, - dissect_image_flags, - &dissected_image); - if (r == -ENOPKG) { - /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */ - log_notice("Note that the disk image needs to\n" - " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n" - " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n" - " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n" - " d) or contain a file system without a partition table\n" - "in order to be bootable with systemd-nspawn."); - goto finish; - } - if (r < 0) - goto finish; + r = dissect_loop_device_and_warn( + loop, + &arg_verity_settings, + /* mount_options=*/ NULL, + arg_image_policy ?: &image_policy_container, + dissect_image_flags, + &dissected_image); + if (r == -ENOPKG) { + /* dissected_image_and_warn() already printed a brief error message. 
Extend on that with more details */ + log_notice("Note that the disk image needs to\n" + " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n" + " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n" + " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n" + " d) or contain a file system without a partition table\n" + "in order to be bootable with systemd-nspawn."); + goto finish; + } + if (r < 0) + goto finish; - r = dissected_image_load_verity_sig_partition( - dissected_image, - loop->fd, - &arg_verity_settings); - if (r < 0) - goto finish; + r = dissected_image_load_verity_sig_partition( + dissected_image, + loop->fd, + &arg_verity_settings); + if (r < 0) + goto finish; - if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig) - log_notice("Note: image %s contains verity information, but no root hash specified and no embedded " - "root hash signature found! Proceeding without integrity checking.", arg_image); + if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig) + log_notice("Note: image %s contains verity information, but no root hash specified and no embedded " + "root hash signature found! 
Proceeding without integrity checking.", arg_image); - r = dissected_image_decrypt_interactively( - dissected_image, - NULL, - &arg_verity_settings, - 0); - if (r < 0) - goto finish; + r = dissected_image_decrypt_interactively( + dissected_image, + NULL, + &arg_verity_settings, + dissect_image_flags); + if (r < 0) + goto finish; + } else { + _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine); + if (!userns_name) { + r = log_oom(); + goto finish; + } + + /* if we are unprivileged, let's allocate a 64K userns first */ + userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000)); + if (userns_fd < 0) { + r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m"); + goto finish; + } + + r = mountfsd_mount_image( + arg_image, + userns_fd, + arg_image_policy, + dissect_image_flags, + &dissected_image); + if (r < 0) + goto finish; + } /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */ if (remove_image && unlink(arg_image) >= 0) @@ -5992,13 +6210,6 @@ static int run(int argc, char *argv[]) { if (r < 0) goto finish; - if (arg_console_mode < 0) - arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ? 
- CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY; - - if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */ - arg_quiet = true; - if (!arg_quiet) { const char *t = arg_image ?: arg_directory; _cleanup_free_ char *u = NULL; @@ -6029,11 +6240,13 @@ static int run(int argc, char *argv[]) { expose_args.fw_ctx = fw_ctx; } for (;;) { - r = run_container(dissected_image, - fds, - veth_name, &veth_created, - &expose_args, &master, - &pid, &ret); + r = run_container( + dissected_image, + userns_fd, + fds, + veth_name, &veth_created, + &expose_args, &master, + &pid, &ret); if (r <= 0) break; } @@ -6075,7 +6288,7 @@ finish: log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir); } - if (arg_machine) { + if (arg_machine && arg_privileged) { const char *p; p = strjoina("/run/systemd/nspawn/propagate/", arg_machine); @@ -6089,9 +6302,11 @@ finish: expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4); expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6); - if (veth_created) - (void) remove_veth_links(veth_name, arg_network_veth_extra); - (void) remove_bridge(arg_network_zone); + if (arg_privileged) { + if (veth_created) + (void) remove_veth_links(veth_name, arg_network_veth_extra); + (void) remove_bridge(arg_network_zone); + } custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); expose_port_free_all(arg_expose_ports); |