nspawn: make nspawn work without privileges

author: Lennart Poettering <lennart@poettering.net> 2023-12-12 11:00:19 +0100
committer: Lennart Poettering <lennart@poettering.net> 2024-04-06 16:08:24 +0200
commit: 0af7e294343b29d769c1bae6a8d390236560ec1a (patch)
tree: 610cd943568c1bc7e0fe3de6f448e75feb9d4de5
parent: core: implement RootImage= via mountfsd in unprivileged environments (diff)
download: systemd-0af7e294343b29d769c1bae6a8d390236560ec1a.tar.xz
systemd-0af7e294343b29d769c1bae6a8d390236560ec1a.zip
6 files changed, 517 insertions, 239 deletions
diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml
index 4b9c252784..c7359f298a 100644
--- a/man/systemd-nspawn.xml
+++ b/man/systemd-nspawn.xml
@@ -119,6 +119,28 @@
   </refsect1>
 
   <refsect1>
+    <title>Unprivileged Operation</title>
+
+    <para><command>systemd-nspawn</command> may be invoked with or without privileges. The full functionality
+    is currently only available when invoked with privileges. When invoked without privileges, various
+    limitations apply, including, but not limited to:</para>
+
+    <itemizedlist>
+      <listitem><para>Only disk image based containers are supported (i.e. <option>--image=</option>).
+      Directory based ones (i.e. <option>--directory=</option>) are not supported.</para></listitem>
+
+      <listitem><para>Machine registration via <option>--machine=</option> is not supported.</para></listitem>
+
+      <listitem><para>Only <option>--private-network</option> and <option>--network-veth</option> networking modes are supported.</para></listitem>
+    </itemizedlist>
+
+    <para>When running in unprivileged mode, some needed functionality is provided via
+    <citerefentry><refentrytitle>systemd-mountfsd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
+    and
+    <citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry></para>
+  </refsect1>
+
+  <refsect1>
     <title>Options</title>
 
     <para>If option <option>--boot</option> is specified, the arguments
@@ -1910,6 +1932,8 @@ After=sys-subsystem-net-devices-ens1.device</programlisting>
       <member><citerefentry><refentrytitle>systemd.slice</refentrytitle><manvolnum>5</manvolnum></citerefentry></member>
       <member><citerefentry><refentrytitle>machinectl</refentrytitle><manvolnum>1</manvolnum></citerefentry></member>
       <member><citerefentry><refentrytitle>importctl</refentrytitle><manvolnum>1</manvolnum></citerefentry></member>
+      <member><citerefentry><refentrytitle>systemd-mountfsd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry></member>
+      <member><citerefentry><refentrytitle>systemd-nsresourced.service</refentrytitle><manvolnum>8</manvolnum></citerefentry></member>
       <member><citerefentry project='url'><refentrytitle url='https://btrfs.readthedocs.io/en/latest/btrfs.html'>btrfs</refentrytitle><manvolnum>8</manvolnum></citerefentry></member>
     </simplelist></para>
   </refsect1>
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index c4a784fd64..4f28b4a225 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -13,6 +13,7 @@
 #include "mountpoint-util.h"
 #include "nspawn-cgroup.h"
 #include "nspawn-mount.h"
+#include "nsresource.h"
 #include "path-util.h"
 #include "rm-rf.h"
 #include "string-util.h"
@@ -46,38 +47,6 @@ static int chown_cgroup_path(const char *path, uid_t uid_shift) {
         return 0;
 }
 
-int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) {
-        _cleanup_free_ char *path = NULL, *fs = NULL;
-        int r;
-
-        r = cg_pid_get_path(NULL, pid, &path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get container cgroup path: %m");
-
-        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
-        if (r < 0)
-                return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
-
-        r = chown_cgroup_path(fs, uid_shift);
-        if (r < 0)
-                return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs);
-
-        if (unified_requested == CGROUP_UNIFIED_SYSTEMD || (unified_requested == CGROUP_UNIFIED_NONE && cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)) {
-                _cleanup_free_ char *lfs = NULL;
-                /* Always propagate access rights from unified to legacy controller */
-
-                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, NULL, &lfs);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
-
-                r = chown_cgroup_path(lfs, uid_shift);
-                if (r < 0)
-                        return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs);
-        }
-
-        return 0;
-}
-
 int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) {
         _cleanup_free_ char *cgroup = NULL;
         char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
@@ -142,7 +111,14 @@ finish:
         return r;
 }
 
-int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) {
+int create_subcgroup(
+                pid_t pid,
+                bool keep_unit,
+                CGroupUnified unified_requested,
+                uid_t uid_shift,
+                int userns_fd,
+                bool privileged) {
+
         _cleanup_free_ char *cgroup = NULL, *payload = NULL;
         CGroupMask supported;
         char *e;
@@ -185,13 +161,54 @@ int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested)
         if (!payload)
                 return log_oom();
 
-        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
+        if (privileged)
+                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
+        else
+                r = cg_create(SYSTEMD_CGROUP_CONTROLLER, payload);
         if (r < 0)
                 return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
 
+        if (privileged) {
+                _cleanup_free_ char *fs = NULL;
+                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, payload, NULL, &fs);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+                r = chown_cgroup_path(fs, uid_shift);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs);
+
+        } else if (userns_fd >= 0) {
+                _cleanup_close_ int cgroup_fd = -EBADF;
+
+                cgroup_fd = cg_path_open(SYSTEMD_CGROUP_CONTROLLER, payload);
+                if (cgroup_fd < 0)
+                        return log_error_errno(cgroup_fd, "Failed to open cgroup %s: %m", payload);
+
+                r = cg_fd_attach(cgroup_fd, pid);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add process " PID_FMT " to cgroup %s: %m", pid, payload);
+
+                r = nsresource_add_cgroup(userns_fd, cgroup_fd);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add cgroup %s to userns: %m", payload);
+        }
+
+        if (unified_requested == CGROUP_UNIFIED_SYSTEMD || (unified_requested == CGROUP_UNIFIED_NONE && cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)) {
+                _cleanup_free_ char *lfs = NULL;
+                /* Always propagate access rights from unified to legacy controller */
+
+                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, payload, NULL, &lfs);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+
+                r = chown_cgroup_path(lfs, uid_shift);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs);
+        }
+
         if (keep_unit) {
                 _cleanup_free_ char *supervisor = NULL;
-
                 supervisor = path_join(cgroup, "supervisor");
                 if (!supervisor)
                         return log_oom();
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index 3f5ba622d8..7e2cd53ddc 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -6,9 +6,8 @@
 
 #include "cgroup-util.h"
 
-int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
 int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
-int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested);
+int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested, uid_t uid_shift, int userns_fd, bool privileged);
 
 int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
 int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index e94ffd799e..c2bd4f6c30 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -444,22 +444,38 @@ int tmpfs_patch_options(
 }
 
 int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
-        const char *full, *top;
-        int r;
+        _cleanup_free_ char *top = NULL, *full = NULL;;
         unsigned long extra_flags = 0;
+        int r;
 
-        top = prefix_roota(dest, "/sys");
-        r = path_is_fs_type(top, SYSFS_MAGIC);
+        top = path_join(dest, "/sys");
+        if (!top)
+                return log_oom();
+
+        r = path_is_mount_point(top);
         if (r < 0)
-                return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
-        /* /sys might already be mounted as sysfs by the outer child in the
-         * !netns case. In this case, it's all good. Don't touch it because we
-         * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
-         */
-        if (r > 0)
-                return 0;
+                return log_error_errno(r, "Failed to determine if '%s' is a mountpoint: %m", top);
+        if (r == 0) {
+                /* If this is not a mount point yet, then mount a tmpfs there */
+                r = mount_nofollow_verbose(LOG_ERR, "tmpfs", top, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0555" TMPFS_LIMITS_SYS);
+                if (r < 0)
+                        return r;
+        } else {
+                r = path_is_fs_type(top, SYSFS_MAGIC);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
+
+                /* /sys/ might already be mounted as sysfs by the outer child in the !netns case. In this case, it's
+                 * all good. Don't touch it because we don't have the right to do so, see
+                 * https://github.com/systemd/systemd/issues/1555.
+                 */
+                if (r > 0)
+                        return 0;
+        }
 
-        full = prefix_roota(top, "/full");
+        full = path_join(top, "/full");
+        if (!full)
+                return log_oom();
 
         (void) mkdir(full, 0755);
 
@@ -501,10 +517,11 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
         if (rmdir(full) < 0)
                 return log_error_errno(errno, "Failed to remove %s: %m", full);
 
-        /* Create mountpoint for cgroups. Otherwise we are not allowed since we
-         * remount /sys read-only.
-         */
-        const char *x = prefix_roota(top, "/fs/cgroup");
+        /* Create mountpoint for cgroups. Otherwise we are not allowed since we remount /sys/ read-only. */
+        _cleanup_free_ char *x = path_join(top, "/fs/cgroup");
+        if (!x)
+                return log_oom();
+
         (void) mkdir_p(x, 0755);
 
         return mount_nofollow_verbose(LOG_ERR, NULL, top, NULL,
@@ -541,7 +558,7 @@ int mount_all(const char *dest,
         } MountPoint;
 
         static const MountPoint mount_table[] = {
-                /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
+                /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing when we are privileged) */
                 { "proc",            "/proc",           "proc",  NULL,        PROC_DEFAULT_MOUNT_FLAGS,
                   MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */
 
@@ -575,15 +592,15 @@ int mount_all(const char *dest,
                 { "mqueue",                 "/dev/mqueue",                  "mqueue", NULL,                            MS_NOSUID|MS_NOEXEC|MS_NODEV,
                   MOUNT_IN_USERNS|MOUNT_MKDIR },
 
-                /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
+                /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing when we are privileged) */
                 { "tmpfs",                  "/tmp",                         "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
                   MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
                 { "tmpfs",                  "/sys",                         "tmpfs", "mode=0555" TMPFS_LIMITS_SYS,     MS_NOSUID|MS_NOEXEC|MS_NODEV,
-                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR },
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR|MOUNT_PRIVILEGED },
                 { "sysfs",                  "/sys",                         "sysfs", NULL,                             SYS_DEFAULT_MOUNT_FLAGS,
-                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR },    /* skipped if above was mounted */
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR|MOUNT_PRIVILEGED },    /* skipped if above was mounted */
                 { "sysfs",                  "/sys",                         "sysfs", NULL,                             MS_NOSUID|MS_NOEXEC|MS_NODEV,
-                  MOUNT_FATAL|MOUNT_MKDIR },                          /* skipped if above was mounted */
+                  MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PRIVILEGED },                          /* skipped if above was mounted */
                 { "tmpfs",                  "/dev",                         "tmpfs", "mode=0755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME,
                   MOUNT_FATAL|MOUNT_MKDIR },
                 { "tmpfs",                  "/dev/shm",                     "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
@@ -604,11 +621,11 @@ int mount_all(const char *dest,
                   MOUNT_FATAL|MOUNT_IN_USERNS },
 #if HAVE_SELINUX
                 { "/sys/fs/selinux",        "/sys/fs/selinux",              NULL,    NULL,                             MS_BIND,
-                  MOUNT_MKDIR },  /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
+                  MOUNT_MKDIR|MOUNT_PRIVILEGED },  /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
                 { NULL,                     "/sys/fs/selinux",              NULL,    NULL,                             MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
-                  0 },            /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
+                  MOUNT_PRIVILEGED },              /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
                 { NULL,                     "/sys/fs/selinux",              NULL,    NULL,                             MS_PRIVATE,
-                  0 },            /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
+                  MOUNT_PRIVILEGED },              /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
 #endif
         };
 
@@ -617,6 +634,7 @@ int mount_all(const char *dest,
         bool ro = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO);
         bool in_userns = FLAGS_SET(mount_settings, MOUNT_IN_USERNS);
         bool tmpfs_tmp = FLAGS_SET(mount_settings, MOUNT_APPLY_TMPFS_TMP);
+        bool privileged = FLAGS_SET(mount_settings, MOUNT_PRIVILEGED);
         int r;
 
         for (size_t k = 0; k < ELEMENTSOF(mount_table); k++) {
@@ -624,6 +642,10 @@ int mount_all(const char *dest,
                 bool fatal = FLAGS_SET(mount_table[k].mount_settings, MOUNT_FATAL);
                 const char *o;
 
+                /* If we are not privileged but the entry is marked as privileged and to be mounted outside the user namespace, then skip it */
+                if (!privileged && FLAGS_SET(mount_table[k].mount_settings, MOUNT_PRIVILEGED) && !FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS))
+                        continue;
+
                 if (in_userns != FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS))
                         continue;
 
diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h
index bf5e47dce4..9112f24b94 100644
--- a/src/nspawn/nspawn-mount.h
+++ b/src/nspawn/nspawn-mount.h
@@ -20,6 +20,7 @@ typedef enum MountSettingsMask {
         MOUNT_TOUCH              = 1 << 9, /* if set, touch file to mount over first */
         MOUNT_PREFIX_ROOT        = 1 << 10,/* if set, prefix the source path with the container's root directory */
         MOUNT_FOLLOW_SYMLINKS    = 1 << 11,/* if set, we'll follow symlinks for the mount target */
+        MOUNT_PRIVILEGED         = 1 << 12,/* if set, we'll only mount this in in the outer child if we are running in privileged mode */
 } MountSettingsMask;
 
 typedef enum CustomMountType {
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 3d0a9a73e3..7edb45dd17 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -84,6 +84,7 @@
 #include "nspawn-stub-pid1.h"
 #include "nspawn-util.h"
 #include "nspawn.h"
+#include "nsresource.h"
 #include "nulstr-util.h"
 #include "os-util.h"
 #include "pager.h"
@@ -237,6 +238,7 @@ static char *arg_settings_filename = NULL;
 static Architecture arg_architecture = _ARCHITECTURE_INVALID;
 static ImagePolicy *arg_image_policy = NULL;
 static char *arg_background = NULL;
+static bool arg_privileged = false;
 
 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
 STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
@@ -518,6 +520,12 @@ static int detect_unified_cgroup_hierarchy_from_environment(void) {
 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
         int r;
 
+        if (!arg_privileged) {
+                /* We only support the unified mode when running unprivileged */
+                arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
+                return 0;
+        }
+
         /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
          * in the image actually supports. */
         r = cg_all_unified();
@@ -619,7 +627,6 @@ static int parse_mount_settings_env(void) {
         e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
         if (streq_ptr(e, "network"))
                 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
-
         else if (e) {
                 r = parse_boolean(e);
                 if (r < 0)
@@ -1653,6 +1660,21 @@ static int parse_argv(int argc, char *argv[]) {
 static int verify_arguments(void) {
         int r;
 
+        SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged);
+
+        if (!arg_privileged) {
+                /* machined is not accessible to unpriv clients */
+                if (arg_register) {
+                        log_notice("Automatically implying --register=no, since machined is not accessible to unprivileged clients.");
+                        arg_register = false;
+                }
+
+                if (!arg_private_network) {
+                        log_notice("Automatically implying --private-network, since mounting /sys/ in an unprivileged user namespaces requires network namespacing.");
+                        arg_private_network = true;
+                }
+        }
+
         if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
                 /* If we are running the stub init in the container, we don't need to look at what the init
                  * in the container supports, because we are not using it. Let's immediately pick the right
@@ -2692,6 +2714,9 @@ static int reset_audit_loginuid(void) {
         if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
                 return 0;
 
+        if (!arg_privileged)
+                return 0;
+
         r = read_one_line_file("/proc/self/loginuid", &p);
         if (r == -ENOENT)
                 return 0;
@@ -2721,6 +2746,11 @@ static int mount_tunnel_dig(const char *root) {
         const char *p, *q;
         int r;
 
+        if (!arg_privileged) {
+                log_debug("Not digging mount tunnel, because running unprivileged.");
+                return 0;
+        }
+
         (void) mkdir_p("/run/systemd/nspawn/", 0755);
         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
@@ -2749,6 +2779,11 @@ static int mount_tunnel_dig(const char *root) {
 static int mount_tunnel_open(void) {
         int r;
 
+        if (!arg_privileged) {
+                log_debug("Not opening up mount tunnel, because running unprivileged.");
+                return 0;
+        }
+
         r = mount_follow_verbose(LOG_ERR, NULL, NSPAWN_MOUNT_TUNNEL, NULL, MS_SLAVE, NULL);
         if (r < 0)
                 return r;
@@ -3270,20 +3305,32 @@ static int inner_child(
                 return r;
 
         if (!arg_network_namespace_path && arg_private_network) {
-                r = unshare(CLONE_NEWNET);
+                _cleanup_close_ int netns_fd = -EBADF;
+
+                if (arg_privileged) {
+                        if (unshare(CLONE_NEWNET) < 0)
+                                return log_error_errno(errno, "Failed to unshare network namespace: %m");
+                }
+
+                netns_fd = namespace_open_by_type(NAMESPACE_NET);
+                if (netns_fd < 0)
+                        return log_error_errno(netns_fd, "Failed to open newly allocate network namespace: %m");
+
+                r = send_one_fd(fd_inner_socket, netns_fd, 0);
                 if (r < 0)
-                        return log_error_errno(errno, "Failed to unshare network namespace: %m");
+                        return log_error_errno(r, "Failed to send network namespace to supervisor: %m");
 
                 /* Tell the parent that it can setup network interfaces. */
                 (void) barrier_place(barrier); /* #3 */
         }
 
-        r = mount_sysfs(NULL, arg_mount_settings);
-        if (r < 0)
-                return r;
+        if (arg_privileged) {
+                r = mount_sysfs(NULL, arg_mount_settings);
+                if (r < 0)
+                        return r;
+        }
 
-        /* Wait until we are cgroup-ified, so that we
-         * can mount the right cgroup path writable */
+        /* Wait until we are cgroup-ified, so that we can mount the right cgroup path writable */
         if (!barrier_place_and_sync(barrier)) /* #4 */
                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
                                        "Parent died too early");
@@ -3584,11 +3631,11 @@ static int inner_child(
         return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
 }
 
-static int setup_notify_child(void) {
+static int setup_notify_child(const void *directory) {
         _cleanup_close_ int fd = -EBADF;
-        static const union sockaddr_union sa = {
+        _cleanup_free_ char *j = NULL;
+        union sockaddr_union sa = {
                 .un.sun_family = AF_UNIX,
-                .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
         };
         int r;
 
@@ -3596,7 +3643,17 @@ static int setup_notify_child(void) {
         if (fd < 0)
                 return log_error_errno(errno, "Failed to allocate notification socket: %m");
 
-        (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
+        if (directory) {
+                j = path_join(directory, NSPAWN_NOTIFY_SOCKET_PATH);
+                if (!j)
+                        return log_oom();
+        }
+
+        r = sockaddr_un_set_path(&sa.un, j ?: NSPAWN_NOTIFY_SOCKET_PATH);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set AF_UNIX path to %s: %m", j ?: NSPAWN_NOTIFY_SOCKET_PATH);
+
+        (void) mkdir_parents(sa.un.sun_path, 0755);
         (void) sockaddr_un_unlink(&sa.un);
 
         WITH_UMASK(0577) { /* only set "w" bit, which is all that's necessary for connecting from the container */
@@ -3605,7 +3662,7 @@ static int setup_notify_child(void) {
                         return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
         }
 
-        r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
+        r = userns_lchown(sa.un.sun_path, 0, 0);
         if (r < 0)
                 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
 
@@ -3621,6 +3678,11 @@ static int setup_unix_export_dir_outside(char **ret) {
 
         assert(ret);
 
+        if (!arg_privileged) {
+                log_debug("Not digging socket tunnel, because running unprivileged.");
+                return 0;
+        }
+
         _cleanup_free_ char *p = NULL;
         p = path_join("/run/systemd/nspawn/unix-export", arg_machine);
         if (!p)
@@ -3672,6 +3734,10 @@ static int setup_unix_export_host_inside(const char *directory, const char *unix
         int r;
 
         assert(directory);
+
+        if (!arg_privileged)
+                return 0;
+
         assert(unix_export_path);
 
         r = make_run_host(directory);
@@ -3714,10 +3780,16 @@ static int setup_unix_export_host_inside(const char *directory, const char *unix
 
 static DissectImageFlags determine_dissect_image_flags(void) {
         return
+                DISSECT_IMAGE_GENERIC_ROOT |
+                DISSECT_IMAGE_REQUIRE_ROOT |
+                DISSECT_IMAGE_RELAX_VAR_CHECK |
                 DISSECT_IMAGE_USR_NO_ROOT |
                 DISSECT_IMAGE_DISCARD_ON_LOOP |
+                DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+                DISSECT_IMAGE_PIN_PARTITION_DEVICES |
                 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS) |
-                DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
+                DISSECT_IMAGE_ALLOW_USERSPACE_VERITY |
+                (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0);
 }
 
 static int outer_child(
@@ -4093,47 +4165,59 @@ static int outer_child(
                         return r;
         }
 
-        /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
-         * mounts available in systemd services inside the container that create a new mount namespace.  See
-         * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
-         * will inherit the shared propagation mode.
-         *
-         * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
-         * directory mount to root later on.
-         * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
-         */
-        r = mount_switch_root(directory, MS_SHARED);
-        if (r < 0)
-                return log_error_errno(r, "Failed to move root directory: %m");
+        /* We have different codepaths here for privileged and non-privileged mode. In privileged mode we'll
+         * now switch into the target directory, and then do the final setup from there. If a user namespace
+         * is then allocated for the container, the root mount and everything else will be out of reach for
+         * it. For unprivileged containers we cannot do that however, since we couldn't mount a sysfs and
+         * procfs then anymore, since that only works if there's an unobstructed instance currently
+         * visible. Hence there we do it the other way round: we first allocate a new set set of namespaces
+         * (and fork for it) for which we then mount sysfs/procfs, and only then switch root. */
 
-        /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
-         * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
-         * the container. */
-        r = mount_tunnel_open();
-        if (r < 0)
-                return r;
+        if (arg_privileged) {
+                /* Mark everything as shared so our mounts get propagated down. This is required to make new
+                 * bind mounts available in systemd services inside the container that create a new mount
+                 * namespace.  See https://github.com/systemd/systemd/issues/3860 Further submounts (such as
+                 * /dev/) done after this will inherit the shared propagation mode.
+                 *
+                 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
+                 * directory mount to root later on.
+                 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
+                 */
+                r = mount_switch_root(directory, MS_SHARED);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to move root directory: %m");
 
-        if (arg_userns_mode != USER_NAMESPACE_NO) {
-                /* In order to mount procfs and sysfs in an unprivileged container the kernel
-                 * requires that a fully visible instance is already present in the target mount
-                 * namespace. Mount one here so the inner child can mount its own instances. Later
-                 * we umount the temporary instances created here before we actually exec the
-                 * payload. Since the rootfs is shared the umount will propagate into the container.
-                 * Note, the inner child wouldn't be able to unmount the instances on its own since
-                 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
-                 * this. */
-                r = pin_fully_visible_fs();
+                /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
+                 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
+                 * the container. */
+                r = mount_tunnel_open();
                 if (r < 0)
                         return r;
-        }
 
-        fd = setup_notify_child();
+                if (arg_userns_mode != USER_NAMESPACE_NO) {
+                        /* In order to mount procfs and sysfs in an unprivileged container the kernel
+                         * requires that a fully visible instance is already present in the target mount
+                         * namespace. Mount one here so the inner child can mount its own instances. Later
+                         * we umount the temporary instances created here before we actually exec the
+                         * payload. Since the rootfs is shared the umount will propagate into the container.
+                         * Note, the inner child wouldn't be able to unmount the instances on its own since
+                         * it doesn't own the originating mount namespace. IOW, the outer child needs to do
+                         * this. */
+                        r = pin_fully_visible_fs();
+                        if (r < 0)
+                                return r;
+                }
+
+                fd = setup_notify_child(NULL);
+        } else
+                fd = setup_notify_child(directory);
         if (fd < 0)
                 return fd;
 
         pid = raw_clone(SIGCHLD|CLONE_NEWNS|
                         arg_clone_ns_flags |
-                        (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
+                        (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0) |
+                        ((arg_private_network && !arg_privileged) ? CLONE_NEWNET : 0));
         if (pid < 0)
                 return log_error_errno(errno, "Failed to fork inner child: %m");
         if (pid == 0) {
@@ -4152,6 +4236,26 @@ static int outer_child(
                                 return log_error_errno(r, "Failed to join network namespace: %m");
                 }
 
+                if (!arg_privileged) {
+                        /* In unprivileged operation, sysfs + procfs are special, we'll have to mount them
+                         * inside the inner namespaces, but before we switch root. Hence do so here. */
+                        _cleanup_free_ char *j = path_join(directory, "/proc");
+                        if (!j)
+                                return log_oom();
+
+                        r = mount_follow_verbose(LOG_ERR, "proc", j, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+                        if (r < 0)
+                                return r;
+
+                        r = mount_sysfs(directory, arg_mount_settings);
+                        if (r < 0)
+                                return r;
+
+                        r = mount_switch_root(directory, MS_SHARED);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to move root directory: %m");
+                }
+
                 r = inner_child(barrier, fd_inner_socket, fds, os_release_pairs);
                 if (r < 0)
                         _exit(EXIT_FAILURE);
@@ -4433,6 +4537,9 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r
 static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
         int r;
 
+        if (fd < 0)
+                return 0;
+
         r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
         if (r < 0)
                 return log_error_errno(r, "Failed to allocate notify event source: %m");
@@ -4812,26 +4919,28 @@ static int load_settings(void) {
                 return 0;
 
         /* We first look in the admin's directories in /etc and /run */
-        FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
-                _cleanup_free_ char *j = NULL;
+        if (arg_privileged) {
+                FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
+                        _cleanup_free_ char *j = NULL;
 
-                j = path_join(i, arg_settings_filename);
-                if (!j)
-                        return log_oom();
+                        j = path_join(i, arg_settings_filename);
+                        if (!j)
+                                return log_oom();
 
-                f = fopen(j, "re");
-                if (f) {
-                        p = TAKE_PTR(j);
+                        f = fopen(j, "re");
+                        if (f) {
+                                p = TAKE_PTR(j);
 
-                        /* By default, we trust configuration from /etc and /run */
-                        if (arg_settings_trusted < 0)
-                                arg_settings_trusted = true;
+                                /* By default, we trust configuration from /etc and /run */
+                                if (arg_settings_trusted < 0)
+                                        arg_settings_trusted = true;
 
-                        break;
-                }
+                                break;
+                        }
 
-                if (errno != ENOENT)
-                        return log_error_errno(errno, "Failed to open %s: %m", j);
+                        if (errno != ENOENT)
+                                return log_error_errno(errno, "Failed to open %s: %m", j);
+                }
         }
 
         if (!f) {
@@ -4891,10 +5000,14 @@ static int load_oci_bundle(void) {
 
 static int run_container(
                DissectedImage *dissected_image,
+               int userns_fd,
                FDSet *fds,
-               char veth_name[IFNAMSIZ], bool *veth_created,
+               char veth_name[IFNAMSIZ],
+               bool *veth_created,
                struct ExposeArgs *expose_args,
-               int *master, pid_t *pid, int *ret) {
+               int *master,
+               pid_t *pid,
+               int *ret) {
 
         static const struct sigaction sa = {
                 .sa_handler = nop_signal_handler,
@@ -4979,11 +5092,44 @@ static int run_container(
                                                "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
         }
 
-        *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
-        if (*pid < 0)
-                return log_error_errno(errno, "clone() failed%s: %m",
-                                       errno == EINVAL ?
-                                       ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
+        if (arg_privileged) {
+                assert(userns_fd < 0);
+
+                /* If we have no user namespace then we'll clone and create a new mount namepsace right-away. */
+
+                *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
+                if (*pid < 0)
+                        return log_error_errno(errno, "clone() failed%s: %m",
+                                               errno == EINVAL ?
+                                               ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
+        } else {
+                assert(userns_fd >= 0);
+
+                /* If we have a user namespace then we'll clone() first, and then join the user namespace,
+                 * and then open the mount namespace, so that it is owned by the user namespace */
+
+                *pid = raw_clone(SIGCHLD);
+                if (*pid < 0)
+                        return log_error_errno(errno, "clone() failed: %m");
+
+                if (*pid == 0) {
+                        if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+                                log_error_errno(errno, "Failed to join allocate user namespace: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+
+                        r = reset_uid_gid();
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to reset UID/GID to root: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+
+                        if (unshare(CLONE_NEWNS) < 0) {
+                                log_error_errno(errno, "Failed to unshare file system namespace: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+                }
+        }
 
         if (*pid == 0) {
                 /* The outer child only has a file system namespace. */
@@ -5120,19 +5266,13 @@ static int run_container(
                         /* Wait until the child has unshared its network namespace. */
                         if (!barrier_place_and_sync(&barrier)) /* #3 */
                                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
-                }
 
-                if (child_netns_fd < 0) {
-                        /* Make sure we have an open file descriptor to the child's network
-                         * namespace so it stays alive even if the child exits. */
-                        r = namespace_open(*pid,
-                                           /* ret_pidns_fd = */ NULL,
-                                           /* ret_mntns_fd = */ NULL,
-                                           &child_netns_fd,
-                                           /* ret_userns_fd = */ NULL,
-                                           /* ret_root_fd = */ NULL);
-                        if (r < 0)
-                                return log_error_errno(r, "Failed to open child network namespace: %m");
+                        /* Make sure we have an open file descriptor to the child's network namespace so it
+                         * stays alive even if the child exits. */
+                        assert(child_netns_fd < 0);
+                        child_netns_fd = receive_one_fd(fd_inner_socket_pair[0], 0);
+                        if (child_netns_fd < 0)
+                                return log_error_errno(r, "Failed to receive child network namespace: %m");
                 }
 
                 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
@@ -5140,12 +5280,29 @@ static int run_container(
                         return r;
 
                 if (arg_network_veth) {
-                        r = setup_veth(arg_machine, *pid, veth_name,
-                                       arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
-                        if (r < 0)
-                                return r;
-                        else if (r > 0)
-                                ifi = r;
+                        if (arg_privileged) {
+                                r = setup_veth(arg_machine, *pid, veth_name,
+                                               arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
+                                if (r < 0)
+                                        return r;
+                                else if (r > 0)
+                                        ifi = r;
+                        } else {
+                                _cleanup_free_ char *host_ifname = NULL;
+
+                                r = nsresource_add_netif(userns_fd, child_netns_fd, /* namespace_ifname= */ NULL, &host_ifname, /* ret_namespace_ifname= */ NULL);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to add network interface to container: %m");
+
+                                ifi = if_nametoindex(host_ifname);
+                                if (ifi == 0)
+                                        return log_error_errno(errno, "Failed to resolve interface '%s': %m", host_ifname);
+
+                                if (strlen(host_ifname) >= IFNAMSIZ)
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Host interface name too long?");
+
+                                strcpy(veth_name, host_ifname);
+                        }
 
                         if (arg_network_bridge) {
                                 /* Add the interface to a bridge */
@@ -5184,9 +5341,12 @@ static int run_container(
         }
 
         if (arg_register || !arg_keep_unit) {
-                r = sd_bus_default_system(&bus);
+                if (arg_privileged)
+                        r = sd_bus_default_system(&bus);
+                else
+                        r = sd_bus_default_user(&bus);
                 if (r < 0)
-                        return log_error_errno(r, "Failed to open system bus: %m");
+                        return log_error_errno(r, "Failed to open bus: %m");
 
                 r = sd_bus_set_close_on_exit(bus, false);
                 if (r < 0)
@@ -5247,7 +5407,13 @@ static int run_container(
         } else if (arg_slice || arg_property)
                 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
 
-        r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
+        r = create_subcgroup(
+                        *pid,
+                        arg_keep_unit,
+                        arg_unified_cgroup_hierarchy,
+                        arg_uid_shift,
+                        userns_fd,
+                        arg_privileged);
         if (r < 0)
                 return r;
 
@@ -5255,14 +5421,8 @@ static int run_container(
         if (r < 0)
                 return r;
 
-        r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
-        if (r < 0)
-                return r;
-
-        /* Notify the child that the parent is ready with all
-         * its setup (including cgroup-ification), and that
-         * the child can now hand over control to the code to
-         * run inside the container. */
+        /* Notify the child that the parent is ready with all its setup (including cgroup-ification), and
+         * that the child can now hand over control to the code to run inside the container. */
         (void) barrier_place(&barrier); /* #4 */
 
         /* Block SIGCHLD here, before notifying child.
@@ -5428,7 +5588,7 @@ static int run_container(
 
         fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
 
-        if (arg_private_network) {
+        if (arg_private_network && arg_privileged) {
                 r = move_back_network_interfaces(child_netns_fd, arg_network_interfaces);
                 if (r < 0)
                         return r;
@@ -5569,6 +5729,10 @@ static int cant_be_in_netns(void) {
         if (r == -ENOENT || ERRNO_IS_NEG_DISCONNECT(r))
                 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                                        "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
+        if (ERRNO_IS_NEG_PRIVILEGE(r)) {
+                log_debug_errno(r, "Can't connect to udev control socket, assuming we are in same netns.");
+                return 0;
+        }
         if (r < 0)
                 return log_error_errno(r, "Failed to connect socket to udev control socket: %m");
 
@@ -5587,7 +5751,7 @@ static int cant_be_in_netns(void) {
 
 static int run(int argc, char *argv[]) {
         bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
-        _cleanup_close_ int master = -EBADF;
+        _cleanup_close_ int master = -EBADF, userns_fd = -EBADF;
         _cleanup_fdset_free_ FDSet *fds = NULL;
         int r, n_fd_passed, ret = EXIT_SUCCESS;
         char veth_name[IFNAMSIZ] = "";
@@ -5602,17 +5766,12 @@ static int run(int argc, char *argv[]) {
         log_parse_environment();
         log_open();
 
+        arg_privileged = getuid() == 0;
+
         r = parse_argv(argc, argv);
         if (r <= 0)
                 goto finish;
 
-        if (geteuid() != 0) {
-                r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
-                                      argc >= 2 ? "Need to be root." :
-                                      "Need to be root (and some arguments are usually required).\nHint: try --help");
-                goto finish;
-        }
-
         r = cant_be_in_netns();
         if (r < 0)
                 goto finish;
@@ -5643,7 +5802,7 @@ static int run(int argc, char *argv[]) {
         if (!arg_private_network && arg_userns_mode != USER_NAMESPACE_NO && arg_uid_shift > 0)
                 arg_caps_retain &= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE);
 
-        r = cg_unified();
+        r = cg_unified(); /* initialize cache early */
         if (r < 0) {
                 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
                 goto finish;
@@ -5664,6 +5823,16 @@ static int run(int argc, char *argv[]) {
         /* Reapply environment settings. */
         (void) detect_unified_cgroup_hierarchy_from_environment();
 
+        if (!arg_privileged) {
+                r = cg_all_unified();
+                if (r < 0) {
+                        log_error_errno(r, "Failed to determine if we are in unified cgroupv2 mode: %m");
+                        goto finish;
+                }
+                if (r == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unprivileged operation only supported in unified cgroupv2 mode.");
+        }
+
         /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
          * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
          * so just turning this off here means we only turn it off in nspawn itself, not any children. */
@@ -5683,9 +5852,21 @@ static int run(int argc, char *argv[]) {
         * the child. Functions like copy_devnodes() change the umask temporarily. */
         umask(0022);
 
+        if (arg_console_mode < 0)
+                arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ?
+                                   CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
+
+        if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
+                arg_quiet = true;
+
         if (arg_directory) {
                 assert(!arg_image);
 
+                if (!arg_privileged) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is currently not supported if called without privileges.");
+                        goto finish;
+                }
+
                 /* Safety precaution: let's not allow running images from the live host OS image, as long as
                  * /var from the host will propagate into container dynamically (because bad things happen if
                  * two systems write to the same /var). Let's allow it for the special cases where /var is
@@ -5722,7 +5903,11 @@ static int run(int argc, char *argv[]) {
 
                         /* We take an exclusive lock on this image, since it's our private, ephemeral copy
                          * only owned by us and no one else. */
-                        r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        r = image_path_lock(
+                                        np,
+                                        LOCK_EX|LOCK_NB,
+                                        arg_privileged ? &tree_global_lock : NULL,
+                                        &tree_local_lock);
                         if (r < 0) {
                                 log_error_errno(r, "Failed to lock %s: %m", np);
                                 goto finish;
@@ -5754,7 +5939,11 @@ static int run(int argc, char *argv[]) {
                         if (r < 0)
                                 goto finish;
 
-                        r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        r = image_path_lock(
+                                        arg_directory,
+                                        (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB,
+                                        arg_privileged ? &tree_global_lock : NULL,
+                                        &tree_local_lock);
                         if (r == -EBUSY) {
                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
                                 goto finish;
@@ -5846,15 +6035,12 @@ static int run(int argc, char *argv[]) {
 
         } else {
                 DissectImageFlags dissect_image_flags =
-                        DISSECT_IMAGE_GENERIC_ROOT |
-                        DISSECT_IMAGE_REQUIRE_ROOT |
-                        DISSECT_IMAGE_RELAX_VAR_CHECK |
-                        DISSECT_IMAGE_USR_NO_ROOT |
-                        DISSECT_IMAGE_ADD_PARTITION_DEVICES |
-                        DISSECT_IMAGE_PIN_PARTITION_DEVICES;
+                        determine_dissect_image_flags();
+
                 assert(arg_image);
                 assert(!arg_template);
 
+
                 r = chase_and_update(&arg_image, 0);
                 if (r < 0)
                         goto finish;
@@ -5869,7 +6055,11 @@ static int run(int argc, char *argv[]) {
                         }
 
                         /* Always take an exclusive lock on our own ephemeral copy. */
-                        r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        r = image_path_lock(
+                                        np,
+                                        LOCK_EX|LOCK_NB,
+                                        arg_privileged ? &tree_global_lock : NULL,
+                                        &tree_local_lock);
                         if (r < 0) {
                                 log_error_errno(r, "Failed to create image lock: %m");
                                 goto finish;
@@ -5894,7 +6084,11 @@ static int run(int argc, char *argv[]) {
                         free_and_replace(arg_image, np);
                         remove_image = true;
                 } else {
-                        r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
+                        r = image_path_lock(
+                                        arg_image,
+                                        (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB,
+                                        arg_privileged ? &tree_global_lock : NULL,
+                                        &tree_local_lock);
                         if (r == -EBUSY) {
                                 log_error_errno(r, "Disk image %s is currently busy.", arg_image);
                                 goto finish;
@@ -5929,56 +6123,80 @@ static int run(int argc, char *argv[]) {
                         goto finish;
                 }
 
-                r = loop_device_make_by_path(
-                                arg_image,
-                                arg_read_only ? O_RDONLY : O_RDWR,
-                                /* sector_size= */ UINT32_MAX,
-                                FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
-                                LOCK_SH,
-                                &loop);
-                if (r < 0) {
-                        log_error_errno(r, "Failed to set up loopback block device: %m");
-                        goto finish;
-                }
+                if (arg_privileged) {
+                        r = loop_device_make_by_path(
+                                        arg_image,
+                                        arg_read_only ? O_RDONLY : O_RDWR,
+                                        /* sector_size= */ UINT32_MAX,
+                                        FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+                                        LOCK_SH,
+                                        &loop);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to set up loopback block device: %m");
+                                goto finish;
+                        }
 
-                r = dissect_loop_device_and_warn(
-                                loop,
-                                &arg_verity_settings,
-                                /* mount_options=*/ NULL,
-                                arg_image_policy ?: &image_policy_container,
-                                dissect_image_flags,
-                                &dissected_image);
-                if (r == -ENOPKG) {
-                        /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
-                        log_notice("Note that the disk image needs to\n"
-                                   "    a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
-                                   "    b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
-                                   "    c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
-                                   "    d) or contain a file system without a partition table\n"
-                                   "in order to be bootable with systemd-nspawn.");
-                        goto finish;
-                }
-                if (r < 0)
-                        goto finish;
+                        r = dissect_loop_device_and_warn(
+                                        loop,
+                                        &arg_verity_settings,
+                                        /* mount_options=*/ NULL,
+                                        arg_image_policy ?: &image_policy_container,
+                                        dissect_image_flags,
+                                        &dissected_image);
+                        if (r == -ENOPKG) {
+                                /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
+                                log_notice("Note that the disk image needs to\n"
+                                           "    a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
+                                           "    b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
+                                           "    c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
+                                           "    d) or contain a file system without a partition table\n"
+                                           "in order to be bootable with systemd-nspawn.");
+                                goto finish;
+                        }
+                        if (r < 0)
+                                goto finish;
 
-                r = dissected_image_load_verity_sig_partition(
-                                dissected_image,
-                                loop->fd,
-                                &arg_verity_settings);
-                if (r < 0)
-                        goto finish;
+                        r = dissected_image_load_verity_sig_partition(
+                                        dissected_image,
+                                        loop->fd,
+                                        &arg_verity_settings);
+                        if (r < 0)
+                                goto finish;
 
-                if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
-                        log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
-                                   "root hash signature found! Proceeding without integrity checking.", arg_image);
+                        if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
+                                log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
+                                           "root hash signature found! Proceeding without integrity checking.", arg_image);
 
-                r = dissected_image_decrypt_interactively(
-                                dissected_image,
-                                NULL,
-                                &arg_verity_settings,
-                                0);
-                if (r < 0)
-                        goto finish;
+                        r = dissected_image_decrypt_interactively(
+                                        dissected_image,
+                                        NULL,
+                                        &arg_verity_settings,
+                                        dissect_image_flags);
+                        if (r < 0)
+                                goto finish;
+                } else {
+                        _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine);
+                        if (!userns_name) {
+                                r = log_oom();
+                                goto finish;
+                        }
+
+                        /* if we are unprivileged, let's allocate a 64K userns first */
+                        userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000));
+                        if (userns_fd < 0) {
+                                r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
+                                goto finish;
+                        }
+
+                        r = mountfsd_mount_image(
+                                        arg_image,
+                                        userns_fd,
+                                        arg_image_policy,
+                                        dissect_image_flags,
+                                        &dissected_image);
+                        if (r < 0)
+                                goto finish;
+                }
 
                 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
                 if (remove_image && unlink(arg_image) >= 0)
@@ -5992,13 +6210,6 @@ static int run(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
-        if (arg_console_mode < 0)
-                arg_console_mode = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO) ?
-                                   CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
-
-        if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
-                arg_quiet = true;
-
         if (!arg_quiet) {
                 const char *t = arg_image ?: arg_directory;
                 _cleanup_free_ char *u = NULL;
@@ -6029,11 +6240,13 @@ static int run(int argc, char *argv[]) {
                 expose_args.fw_ctx = fw_ctx;
         }
         for (;;) {
-                r = run_container(dissected_image,
-                                  fds,
-                                  veth_name, &veth_created,
-                                  &expose_args, &master,
-                                  &pid, &ret);
+                r = run_container(
+                                dissected_image,
+                                userns_fd,
+                                fds,
+                                veth_name, &veth_created,
+                                &expose_args, &master,
+                                &pid, &ret);
                 if (r <= 0)
                         break;
         }
@@ -6075,7 +6288,7 @@ finish:
                         log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
         }
 
-        if (arg_machine) {
+        if (arg_machine && arg_privileged) {
                 const char *p;
 
                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
@@ -6089,9 +6302,11 @@ finish:
         expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET,  &expose_args.address4);
         expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
 
-        if (veth_created)
-                (void) remove_veth_links(veth_name, arg_network_veth_extra);
-        (void) remove_bridge(arg_network_zone);
+        if (arg_privileged) {
+                if (veth_created)
+                        (void) remove_veth_links(veth_name, arg_network_veth_extra);
+                (void) remove_bridge(arg_network_zone);
+        }
 
         custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
         expose_port_free_all(arg_expose_ports);
author	Lennart Poettering <lennart@poettering.net>	2023-12-12 11:00:19 +0100
committer	Lennart Poettering <lennart@poettering.net>	2024-04-06 16:08:24 +0200
commit	0af7e294343b29d769c1bae6a8d390236560ec1a (patch)
tree	610cd943568c1bc7e0fe3de6f448e75feb9d4de5
parent	core: implement RootImage= via mountfsd in unprivileged environments (diff)
download	systemd-0af7e294343b29d769c1bae6a8d390236560ec1a.tar.xz systemd-0af7e294343b29d769c1bae6a8d390236560ec1a.zip