Merge pull request #8898 from poettering/nspawn-mount-block

some nspawn cgroup and mount lock-down fixes
author: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> 2018-05-08 12:54:58 +0200
committer: GitHub <noreply@github.com> 2018-05-08 12:54:58 +0200
commit: 6b1ca2a948180877481ef4fbbcad5762bf9f0600 (patch)
tree: c0bd34896cf23b63aa250c6e35212bd112d647cd
parent: Merge pull request #8926 from keszybz/man-journal-remote (diff)
parent: update TODO (diff)
download: systemd-6b1ca2a948180877481ef4fbbcad5762bf9f0600.tar.xz
systemd-6b1ca2a948180877481ef4fbbcad5762bf9f0600.zip
10 files changed, 215 insertions, 135 deletions
diff --git a/TODO b/TODO
index c8f7596390..5a0117260e 100644
--- a/TODO
+++ b/TODO
@@ -24,6 +24,10 @@ Janitorial Clean-ups:
 
 Features:
 
+* nspawn: greater control over hostname, resolv.conf, timezone, rlim
+
+* nspawn: when operating in a scope, also create /payload subcgroup
+
 * the error paths in usbffs_dispatch_ep() leak memory
 
 * cgroups: figure out if we can somehow communicate in a cleaner way whether a
@@ -52,9 +56,6 @@ Features:
 
 * add --vacuum-xyz options to coredumpctl, matching those journalctl already has.
 
-* list the exit codes from the BSD/glibc <sysexits.h> in our own
-  exit-codes.[ch] tables.
-
 * SuccessExitStatus= and friends should probably also accept symbolic exit
   codes names, i.e. error codes from the list maintained in exit-codes.[ch]
 
diff --git a/doc/CGROUP_DELEGATION.md b/doc/CGROUP_DELEGATION.md
index 412f0a5fa0..212283fd73 100644
--- a/doc/CGROUP_DELEGATION.md
+++ b/doc/CGROUP_DELEGATION.md
@@ -424,15 +424,16 @@ unified you (of course, I guess) need to provide only `/sys/fs/cgroup/` itself.
    cgroup tree of systemd itself is out of limits for you. It's fine to *read*
    from any attribute you like however. That's totally OK and welcome.
 
-4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a container
-   payload running systemd, then don't get the idea that you can bind mount
-   only a sub-tree of the host's cgroup tree into the container. Part of the
-   cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every
+4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a
+   container payload running systemd, then don't get the idea that you can bind
+   mount only a sub-tree of the host's cgroup tree into the container. Part of
+   the cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every
    process, and hence any path below `/sys/fs/cgroup/` needs to match what
    `/proc/$PID/cgroup` of the payload processes reports. What you can do safely
-   however, is mount the upper parts of the cgroup tree read-only or even
-   replace it with an intermediary `tmpfs`, as long as the path to the
-   delegated sub-tree remains accessible as-is.
+   however, is mount the upper parts of the cgroup tree read-only (or even
+   replace the middle bits with an intermediary `tmpfs` — but be careful not to
+   break the `statfs()` detection logic discussed above), as long as the path
+   to the delegated sub-tree remains accessible as-is.
 
 5. ⚡ Currently, the algorithm for mapping between slice/scope/service unit
    naming and their cgroup paths is not considered public API of systemd, and
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 4a7fea920e..3154cad58a 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -81,23 +81,26 @@ static const MountEntry apivfs_table[] = {
 
 /* ProtectKernelTunables= option and the related filesystem APIs */
 static const MountEntry protect_kernel_tunables_table[] = {
-        { "/proc/sys",           READONLY,     false },
-        { "/proc/sysrq-trigger", READONLY,     true  },
-        { "/proc/latency_stats", READONLY,     true  },
-        { "/proc/mtrr",          READONLY,     true  },
-        { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
         { "/proc/acpi",          READONLY,     true  },
-        { "/proc/timer_stats",   READONLY,     true  },
+        { "/proc/apm",           READONLY,     true  }, /* Obsolete API, there's no point in permitting access to this, ever */
         { "/proc/asound",        READONLY,     true  },
         { "/proc/bus",           READONLY,     true  },
         { "/proc/fs",            READONLY,     true  },
         { "/proc/irq",           READONLY,     true  },
+        { "/proc/kallsyms",      INACCESSIBLE, true  },
+        { "/proc/kcore",         INACCESSIBLE, true  },
+        { "/proc/latency_stats", READONLY,     true  },
+        { "/proc/mtrr",          READONLY,     true  },
+        { "/proc/scsi",          READONLY,     true  },
+        { "/proc/sys",           READONLY,     false },
+        { "/proc/sysrq-trigger", READONLY,     true  },
+        { "/proc/timer_stats",   READONLY,     true  },
         { "/sys",                READONLY,     false },
-        { "/sys/kernel/debug",   READONLY,     true  },
-        { "/sys/kernel/tracing", READONLY,     true  },
         { "/sys/fs/bpf",         READONLY,     true  },
         { "/sys/fs/cgroup",      READWRITE,    false }, /* READONLY is set by ProtectControlGroups= option */
         { "/sys/fs/selinux",     READWRITE,    true  },
+        { "/sys/kernel/debug",   READONLY,     true  },
+        { "/sys/kernel/tracing", READONLY,     true  },
 };
 
 /* ProtectKernelModules= option */
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 682ea65080..761d737dc9 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -141,44 +141,53 @@ finish:
         return r;
 }
 
-int create_subcgroup(pid_t pid, CGroupUnified unified_requested) {
+int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) {
         _cleanup_free_ char *cgroup = NULL;
-        const char *child;
-        int r;
         CGroupMask supported;
+        const char *payload;
+        int r;
 
-        /* In the unified hierarchy inner nodes may only contain
-         * subgroups, but not processes. Hence, if we running in the
-         * unified hierarchy and the container does the same, and we
-         * did not create a scope unit for the container move us and
-         * the container into two separate subcgroups. */
-
-        if (unified_requested == CGROUP_UNIFIED_NONE)
-                return 0;
-
-        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine whether the systemd controller is unified: %m");
-        if (r == 0)
-                return 0;
+        assert(pid > 1);
+
+        /* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we are running in
+         * the unified hierarchy and the container does the same, and we did not create a scope unit for the container
+         * move us and the container into two separate subcgroups.
+         *
+         * Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including
+         * its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
+         * delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
+         * host systemd directly to the payload, the host and payload systemd might fight for the cgroup
+         * attributes. Hence, let's insert an intermediary cgroup to cover that case too.
+         *
+         * Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup
+         * that's fine because there's only one hierarchy anyway and controllers are enabled directly on it. On the
+         * legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
+         * do it. */
 
         r = cg_mask_supported(&supported);
         if (r < 0)
                 return log_error_errno(r, "Failed to determine supported controllers: %m");
 
-        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+        if (keep_unit)
+                r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+        else
+                r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
         if (r < 0)
                 return log_error_errno(r, "Failed to get our control group: %m");
 
-        child = strjoina(cgroup, "/payload");
-        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
+        payload = strjoina(cgroup, "/payload");
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
         if (r < 0)
-                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
 
-        child = strjoina(cgroup, "/supervisor");
-        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
-        if (r < 0)
-                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+        if (keep_unit) {
+                const char *supervisor;
+
+                supervisor = strjoina(cgroup, "/supervisor");
+                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor);
+        }
 
         /* Try to enable as many controllers as possible for the new payload. */
         (void) cg_enable_everywhere(supported, supported, cgroup);
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index 3a8e98e122..7639b483ae 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -14,4 +14,4 @@
 
 int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
 int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
-int create_subcgroup(pid_t pid, CGroupUnified unified_requested);
+int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested);
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 1cc4a2da6b..8a4634f53e 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -27,7 +27,7 @@
 #include "user-util.h"
 #include "util.h"
 
-CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
+CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
         CustomMount *c, *ret;
 
         assert(l);
@@ -48,8 +48,8 @@ CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
         return ret;
 }
 
-void custom_mount_free_all(CustomMount *l, unsigned n) {
-        unsigned i;
+void custom_mount_free_all(CustomMount *l, size_t n) {
+        size_t i;
 
         for (i = 0; i < n; i++) {
                 CustomMount *m = l + i;
@@ -110,8 +110,8 @@ static char *resolve_source_path(const char *dest, const char *source) {
         return strdup(source);
 }
 
-int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
-        unsigned i;
+int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
+        size_t i;
         int r;
 
         /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
@@ -133,8 +133,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
                         if (!s)
                                 return log_oom();
 
-                        free(m->source);
-                        m->source = s;
+                        free_and_replace(m->source, s);
                 } else {
                         /* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
 
@@ -165,8 +164,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
                                 if (!s)
                                         return log_oom();
 
-                                free(*j);
-                                *j = s;
+                                free_and_replace(*j, s);
                         }
 
                         if (m->work_dir) {
@@ -176,8 +174,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
                                 if (!s)
                                         return log_oom();
 
-                                free(m->work_dir);
-                                m->work_dir = s;
+                                free_and_replace(m->work_dir, s);
                         } else {
                                 assert(m->source);
 
@@ -193,7 +190,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
         return 0;
 }
 
-int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
+int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
         _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
         const char *p = s;
         CustomMount *m;
@@ -239,7 +236,7 @@ int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only
         return 0;
 }
 
-int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
+int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
         _cleanup_free_ char *path = NULL, *opts = NULL;
         const char *p = s;
         CustomMount *m;
@@ -275,7 +272,7 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
         return 0;
 }
 
-int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
+int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
         _cleanup_free_ char *upper = NULL, *destination = NULL;
         _cleanup_strv_free_ char **lower = NULL;
         CustomMount *m;
@@ -511,6 +508,18 @@ int mount_all(const char *dest,
               uid_t uid_shift, uid_t uid_range,
               const char *selinux_apifs_context) {
 
+#define PROC_INACCESSIBLE(path)                                         \
+        { NULL, (path), NULL, NULL, MS_BIND,                            \
+          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
+        { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
+          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
+
+#define PROC_READ_ONLY(path)                                            \
+        { (path), (path), NULL, NULL, MS_BIND,                          \
+          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
+        { NULL,   (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
+          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
+
         typedef struct MountPoint {
                 const char *what;
                 const char *where;
@@ -521,39 +530,72 @@ int mount_all(const char *dest,
         } MountPoint;
 
         static const MountPoint mount_table[] = {
-                /* inner child mounts */
-                { "proc",                "/proc",               "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              MOUNT_FATAL|MOUNT_IN_USERNS },
-                { "/proc/sys",           "/proc/sys",           NULL,    NULL,        MS_BIND,                                                   MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* Bind mount first ... */
-                { "/proc/sys/net",       "/proc/sys/net",       NULL,    NULL,        MS_BIND,                                                   MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
-                { NULL,                  "/proc/sys",           NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* ... then, make it r/o */
-                { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND,                                                   MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },  /* Bind mount first ... */
-                { NULL,                  "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },  /* ... then, make it r/o */
-
-                /* outer child mounts */
-                { "tmpfs",               "/tmp",                "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         MOUNT_FATAL },
-                { "tmpfs",               "/sys",                "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,                              MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
-                { "sysfs",               "/sys",                "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO },    /* skipped if above was mounted */
-                { "sysfs",               "/sys",                "sysfs", NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              MOUNT_FATAL },                          /* skipped if above was mounted */
-
-                { "tmpfs",               "/dev",                "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  MOUNT_FATAL },
-                { "tmpfs",               "/dev/shm",            "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         MOUNT_FATAL },
-                { "tmpfs",               "/run",                "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         MOUNT_FATAL },
+                /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
+                { "proc",            "/proc",           "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                  MOUNT_FATAL|MOUNT_IN_USERNS },
+
+                { "/proc/sys",       "/proc/sys",       NULL,    NULL,        MS_BIND,
+                  MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* Bind mount first ... */
+
+                { "/proc/sys/net",   "/proc/sys/net",   NULL,    NULL,        MS_BIND,
+                  MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
+
+                { NULL,              "/proc/sys",       NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
+                  MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* ... then, make it r/o */
+
+                /* Make these files inaccessible to container payloads: they potentially leak information about kernel
+                 * internals or the host's execution environment to the container */
+                PROC_INACCESSIBLE("/proc/kallsyms"),
+                PROC_INACCESSIBLE("/proc/kcore"),
+                PROC_INACCESSIBLE("/proc/keys"),
+                PROC_INACCESSIBLE("/proc/sysrq-trigger"),
+                PROC_INACCESSIBLE("/proc/timer_list"),
+
+                /* Make these directories read-only to container payloads: they show hardware information, and in some
+                 * cases contain tunables the container really shouldn't have access to. */
+                PROC_READ_ONLY("/proc/acpi"),
+                PROC_READ_ONLY("/proc/apm"),
+                PROC_READ_ONLY("/proc/asound"),
+                PROC_READ_ONLY("/proc/bus"),
+                PROC_READ_ONLY("/proc/fs"),
+                PROC_READ_ONLY("/proc/irq"),
+                PROC_READ_ONLY("/proc/scsi"),
+
+                /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
+                { "tmpfs",           "/tmp",            "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+                  MOUNT_FATAL },
+                { "tmpfs",           "/sys",            "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
+                { "sysfs",           "/sys",            "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO },    /* skipped if above was mounted */
+                { "sysfs",           "/sys",            "sysfs", NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,
+                  MOUNT_FATAL },                          /* skipped if above was mounted */
+                { "tmpfs",           "/dev",            "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,
+                  MOUNT_FATAL },
+                { "tmpfs",           "/dev/shm",        "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+                  MOUNT_FATAL },
+                { "tmpfs",           "/run",            "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+                  MOUNT_FATAL },
+
 #if HAVE_SELINUX
-                { "/sys/fs/selinux",     "/sys/fs/selinux",     NULL,    NULL,        MS_BIND,                                                   0 },  /* Bind mount first */
-                { NULL,                  "/sys/fs/selinux",     NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 },  /* Then, make it r/o */
+                { "/sys/fs/selinux", "/sys/fs/selinux", NULL,    NULL,        MS_BIND,
+                  0 },  /* Bind mount first */
+                { NULL,              "/sys/fs/selinux", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
+                  0 },  /* Then, make it r/o */
 #endif
         };
 
-        unsigned k;
-        int r;
+        _cleanup_(unlink_and_freep) char *inaccessible = NULL;
         bool use_userns = (mount_settings & MOUNT_USE_USERNS);
         bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
         bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
         bool in_userns = (mount_settings & MOUNT_IN_USERNS);
+        size_t k;
+        int r;
 
         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                 _cleanup_free_ char *where = NULL, *options = NULL;
-                const char *o;
+                const char *o, *what;
                 bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
 
                 if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
@@ -569,12 +611,32 @@ int mount_all(const char *dest,
                 if (r < 0)
                         return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
 
+                if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) {
+
+                        if (!inaccessible) {
+                                _cleanup_free_ char *np = NULL;
+
+                                r = tempfn_random_child(NULL, "inaccessible", &np);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to generate inaccessible file node path: %m");
+
+                                r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np);
+
+                                inaccessible = TAKE_PTR(np);
+                        }
+
+                        what = inaccessible;
+                } else
+                        what = mount_table[k].what;
+
                 r = path_is_mount_point(where, NULL, 0);
                 if (r < 0 && r != -ENOENT)
                         return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
 
                 /* Skip this entry if it is not a remount. */
-                if (mount_table[k].what && r > 0)
+                if (what && r > 0)
                         continue;
 
                 r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
@@ -603,7 +665,7 @@ int mount_all(const char *dest,
                 }
 
                 r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
-                                  mount_table[k].what,
+                                  what,
                                   where,
                                   mount_table[k].type,
                                   mount_table[k].flags,
@@ -766,11 +828,11 @@ static int mount_overlay(const char *dest, CustomMount *m) {
 
 int mount_custom(
                 const char *dest,
-                CustomMount *mounts, unsigned n,
+                CustomMount *mounts, size_t n,
                 bool userns, uid_t uid_shift, uid_t uid_range,
                 const char *selinux_apifs_context) {
 
-        unsigned i;
+        size_t i;
         int r;
 
         assert(dest);
diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h
index 6d3aca76a0..db7aadc28e 100644
--- a/src/nspawn/nspawn-mount.h
+++ b/src/nspawn/nspawn-mount.h
@@ -13,12 +13,13 @@
 #include "volatile-util.h"
 
 typedef enum MountSettingsMask {
-        MOUNT_FATAL              = 1 << 0, /* if set, a mount error is considered fatal */
-        MOUNT_USE_USERNS         = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
-        MOUNT_IN_USERNS          = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
-        MOUNT_APPLY_APIVFS_RO    = 1 << 3, /* if set, /proc/sys, and /sysfs will be mounted read-only, otherwise read-write. */
-        MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write.
-                                              Works only if MOUNT_APPLY_APIVFS_RO is also set. */
+        MOUNT_FATAL              = 1U << 0, /* if set, a mount error is considered fatal */
+        MOUNT_USE_USERNS         = 1U << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
+        MOUNT_IN_USERNS          = 1U << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
+        MOUNT_APPLY_APIVFS_RO    = 1U << 3, /* if set, /proc/sys, and /sys will be mounted read-only, otherwise read-write. */
+        MOUNT_APPLY_APIVFS_NETNS = 1U << 4, /* if set, /proc/sys/net will be mounted read-write.
+                                               Works only if MOUNT_APPLY_APIVFS_RO is also set. */
+        MOUNT_INACCESSIBLE_REG   = 1U << 5, /* if set, create an inaccessible regular file first and use as bind mount source */
 } MountSettingsMask;
 
 typedef enum CustomMountType {
@@ -40,13 +41,13 @@ typedef struct CustomMount {
         char *rm_rf_tmpdir;
 } CustomMount;
 
-CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t);
-void custom_mount_free_all(CustomMount *l, unsigned n);
-int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n);
+CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t);
+void custom_mount_free_all(CustomMount *l, size_t n);
+int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n);
 
-int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only);
-int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s);
-int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only);
+int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
+int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s);
+int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
 
 int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
 int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
@@ -54,7 +55,7 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
 int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
 int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
 
-int mount_custom(const char *dest, CustomMount *mounts, unsigned n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
+int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
 
 int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
 int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c
index 643e00400f..c0cdb7c0d7 100644
--- a/src/nspawn/nspawn-register.c
+++ b/src/nspawn/nspawn-register.c
@@ -11,6 +11,7 @@
 #include "bus-unit-util.h"
 #include "bus-util.h"
 #include "nspawn-register.h"
+#include "special.h"
 #include "stat-util.h"
 #include "strv.h"
 #include "util.h"
@@ -309,7 +310,7 @@ int allocate_scope(
                                   "PIDs", "au", 1, pid,
                                   "Description", "s", description,
                                   "Delegate", "b", 1,
-                                  "Slice", "s", isempty(slice) ? "machine.slice" : slice);
+                                  "Slice", "s", isempty(slice) ? SPECIAL_MACHINE_SLICE : slice);
         if (r < 0)
                 return bus_log_create_error(r);
 
diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h
index bee7e9f530..731db87260 100644
--- a/src/nspawn/nspawn-settings.h
+++ b/src/nspawn/nspawn-settings.h
@@ -76,7 +76,7 @@ typedef struct Settings {
         int read_only;
         VolatileMode volatile_mode;
         CustomMount *custom_mounts;
-        unsigned n_custom_mounts;
+        size_t n_custom_mounts;
         int userns_chown;
 
         /* [Network] */
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 23bc9402a8..d7ceb4ed44 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -165,7 +165,7 @@ static uint64_t arg_caps_retain =
         (1ULL << CAP_SYS_RESOURCE) |
         (1ULL << CAP_SYS_TTY_CONFIG);
 static CustomMount *arg_custom_mounts = NULL;
-static unsigned arg_n_custom_mounts = 0;
+static size_t arg_n_custom_mounts = 0;
 static char **arg_setenv = NULL;
 static bool arg_quiet = false;
 static bool arg_register = true;
@@ -291,7 +291,7 @@ static void help(void) {
 }
 
 static int custom_mount_check_all(void) {
-        unsigned i;
+        size_t i;
 
         for (i = 0; i < arg_n_custom_mounts; i++) {
                 CustomMount *m = &arg_custom_mounts[i];
@@ -1470,31 +1470,35 @@ static int setup_resolv_conf(const char *dest) {
 }
 
 static int setup_boot_id(void) {
+        _cleanup_(unlink_and_freep) char *from = NULL;
+        _cleanup_free_ char *path = NULL;
         sd_id128_t rnd = SD_ID128_NULL;
-        const char *from, *to;
+        const char *to;
         int r;
 
         /* Generate a new randomized boot ID, so that each boot-up of
          * the container gets a new one */
 
-        from = "/run/proc-sys-kernel-random-boot-id";
-        to = "/proc/sys/kernel/random/boot_id";
+        r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate random boot ID path: %m");
 
         r = sd_id128_randomize(&rnd);
         if (r < 0)
                 return log_error_errno(r, "Failed to generate random boot id: %m");
 
-        r = id128_write(from, ID128_UUID, rnd, false);
+        r = id128_write(path, ID128_UUID, rnd, false);
         if (r < 0)
                 return log_error_errno(r, "Failed to write boot id: %m");
 
+        from = TAKE_PTR(path);
+        to = "/proc/sys/kernel/random/boot_id";
+
         r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
-        if (r >= 0)
-                r = mount_verbose(LOG_ERR, NULL, to, NULL,
-                                  MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
+        if (r < 0)
+                return r;
 
-        (void) unlink(from);
-        return r;
+        return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
 }
 
 static int copy_devnodes(const char *dest) {
@@ -1662,26 +1666,32 @@ static int setup_keyring(void) {
 }
 
 static int setup_kmsg(int kmsg_socket) {
-        const char *from, *to;
+        _cleanup_(unlink_and_freep) char *from = NULL;
+        _cleanup_free_ char *fifo = NULL;
+        _cleanup_close_ int fd = -1;
         _cleanup_umask_ mode_t u;
-        int fd, r;
+        const char *to;
+        int r;
 
         assert(kmsg_socket >= 0);
 
         u = umask(0000);
 
-        /* We create the kmsg FIFO as /run/kmsg, but immediately
-         * delete it after bind mounting it to /proc/kmsg. While FIFOs
-         * on the reading side behave very similar to /proc/kmsg,
-         * their writing side behaves differently from /dev/kmsg in
-         * that writing blocks when nothing is reading. In order to
-         * avoid any problems with containers deadlocking due to this
-         * we simply make /dev/kmsg unavailable to the container. */
-        from = "/run/kmsg";
-        to = "/proc/kmsg";
+        /* We create the kmsg FIFO as a temporary file in /tmp, but immediately delete it after bind mounting it to
+         * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
+         * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
+         * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
 
-        if (mkfifo(from, 0600) < 0)
+        r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate kmsg path: %m");
+
+        if (mkfifo(fifo, 0600) < 0)
                 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
+
+        from = TAKE_PTR(fifo);
+        to = "/proc/kmsg";
+
         r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
         if (r < 0)
                 return r;
@@ -1690,17 +1700,11 @@ static int setup_kmsg(int kmsg_socket) {
         if (fd < 0)
                 return log_error_errno(errno, "Failed to open fifo: %m");
 
-        /* Store away the fd in the socket, so that it stays open as
-         * long as we run the child */
+        /* Store away the fd in the socket, so that it stays open as long as we run the child */
         r = send_one_fd(kmsg_socket, fd, 0);
-        safe_close(fd);
-
         if (r < 0)
                 return log_error_errno(r, "Failed to send FIFO fd: %m");
 
-        /* And now make the FIFO unavailable as /run/kmsg... */
-        (void) unlink(from);
-
         return 0;
 }
 
@@ -2265,7 +2269,7 @@ static int inner_child(
 
         _cleanup_free_ char *home = NULL;
         char as_uuid[37];
-        unsigned n_env = 1;
+        size_t n_env = 1;
         const char *envp[] = {
                 "PATH=" DEFAULT_PATH_COMPAT,
                 NULL, /* container */
@@ -3639,11 +3643,9 @@ static int run(int master,
         if (r < 0)
                 return r;
 
-        if (arg_keep_unit) {
-                r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
-                if (r < 0)
-                        return r;
-        }
+        r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
+        if (r < 0)
+                return r;
 
         r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
         if (r < 0)
author	Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>	2018-05-08 12:54:58 +0200
committer	GitHub <noreply@github.com>	2018-05-08 12:54:58 +0200
commit	6b1ca2a948180877481ef4fbbcad5762bf9f0600 (patch)
tree	c0bd34896cf23b63aa250c6e35212bd112d647cd
parent	Merge pull request #8926 from keszybz/man-journal-remote (diff)
parent	update TODO (diff)
download	systemd-6b1ca2a948180877481ef4fbbcad5762bf9f0600.tar.xz systemd-6b1ca2a948180877481ef4fbbcad5762bf9f0600.zip