summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>2018-05-08 12:54:58 +0200
committerGitHub <noreply@github.com>2018-05-08 12:54:58 +0200
commit6b1ca2a948180877481ef4fbbcad5762bf9f0600 (patch)
treec0bd34896cf23b63aa250c6e35212bd112d647cd
parentMerge pull request #8926 from keszybz/man-journal-remote (diff)
parentupdate TODO (diff)
downloadsystemd-6b1ca2a948180877481ef4fbbcad5762bf9f0600.tar.xz
systemd-6b1ca2a948180877481ef4fbbcad5762bf9f0600.zip
Merge pull request #8898 from poettering/nspawn-mount-block
some nspawn cgroup and mount lock-down fixes
-rw-r--r--TODO7
-rw-r--r--doc/CGROUP_DELEGATION.md15
-rw-r--r--src/core/namespace.c19
-rw-r--r--src/nspawn/nspawn-cgroup.c59
-rw-r--r--src/nspawn/nspawn-cgroup.h2
-rw-r--r--src/nspawn/nspawn-mount.c142
-rw-r--r--src/nspawn/nspawn-mount.h27
-rw-r--r--src/nspawn/nspawn-register.c3
-rw-r--r--src/nspawn/nspawn-settings.h2
-rw-r--r--src/nspawn/nspawn.c74
10 files changed, 215 insertions, 135 deletions
diff --git a/TODO b/TODO
index c8f7596390..5a0117260e 100644
--- a/TODO
+++ b/TODO
@@ -24,6 +24,10 @@ Janitorial Clean-ups:
Features:
+* nspawn: greater control over hostname, resolv.conf, timezone, rlim
+
+* nspawn: when operating in a scope, also create /payload subcgroup
+
* the error paths in usbffs_dispatch_ep() leak memory
* cgroups: figure out if we can somehow communicate in a cleaner way whether a
@@ -52,9 +56,6 @@ Features:
* add --vacuum-xyz options to coredumpctl, matching those journalctl already has.
-* list the exit codes from the BSD/glibc <sysexits.h> in our own
- exit-codes.[ch] tables.
-
* SuccessExitStatus= and friends should probably also accept symbolic exit
codes names, i.e. error codes from the list maintained in exit-codes.[ch]
diff --git a/doc/CGROUP_DELEGATION.md b/doc/CGROUP_DELEGATION.md
index 412f0a5fa0..212283fd73 100644
--- a/doc/CGROUP_DELEGATION.md
+++ b/doc/CGROUP_DELEGATION.md
@@ -424,15 +424,16 @@ unified you (of course, I guess) need to provide only `/sys/fs/cgroup/` itself.
cgroup tree of systemd itself is out of limits for you. It's fine to *read*
from any attribute you like however. That's totally OK and welcome.
-4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a container
- payload running systemd, then don't get the idea that you can bind mount
- only a sub-tree of the host's cgroup tree into the container. Part of the
- cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every
+4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a
+ container payload running systemd, then don't get the idea that you can bind
+ mount only a sub-tree of the host's cgroup tree into the container. Part of
+ the cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every
process, and hence any path below `/sys/fs/cgroup/` needs to match what
`/proc/$PID/cgroup` of the payload processes reports. What you can do safely
- however, is mount the upper parts of the cgroup tree read-only or even
- replace it with an intermediary `tmpfs`, as long as the path to the
- delegated sub-tree remains accessible as-is.
+ however, is mount the upper parts of the cgroup tree read-only (or even
+ replace the middle bits with an intermediary `tmpfs` — but be careful not to
+ break the `statfs()` detection logic discussed above), as long as the path
+ to the delegated sub-tree remains accessible as-is.
5. ⚡ Currently, the algorithm for mapping between slice/scope/service unit
naming and their cgroup paths is not considered public API of systemd, and
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 4a7fea920e..3154cad58a 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -81,23 +81,26 @@ static const MountEntry apivfs_table[] = {
/* ProtectKernelTunables= option and the related filesystem APIs */
static const MountEntry protect_kernel_tunables_table[] = {
- { "/proc/sys", READONLY, false },
- { "/proc/sysrq-trigger", READONLY, true },
- { "/proc/latency_stats", READONLY, true },
- { "/proc/mtrr", READONLY, true },
- { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
{ "/proc/acpi", READONLY, true },
- { "/proc/timer_stats", READONLY, true },
+ { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
{ "/proc/asound", READONLY, true },
{ "/proc/bus", READONLY, true },
{ "/proc/fs", READONLY, true },
{ "/proc/irq", READONLY, true },
+ { "/proc/kallsyms", INACCESSIBLE, true },
+ { "/proc/kcore", INACCESSIBLE, true },
+ { "/proc/latency_stats", READONLY, true },
+ { "/proc/mtrr", READONLY, true },
+ { "/proc/scsi", READONLY, true },
+ { "/proc/sys", READONLY, false },
+ { "/proc/sysrq-trigger", READONLY, true },
+ { "/proc/timer_stats", READONLY, true },
{ "/sys", READONLY, false },
- { "/sys/kernel/debug", READONLY, true },
- { "/sys/kernel/tracing", READONLY, true },
{ "/sys/fs/bpf", READONLY, true },
{ "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
{ "/sys/fs/selinux", READWRITE, true },
+ { "/sys/kernel/debug", READONLY, true },
+ { "/sys/kernel/tracing", READONLY, true },
};
/* ProtectKernelModules= option */
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 682ea65080..761d737dc9 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -141,44 +141,53 @@ finish:
return r;
}
-int create_subcgroup(pid_t pid, CGroupUnified unified_requested) {
+int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) {
_cleanup_free_ char *cgroup = NULL;
- const char *child;
- int r;
CGroupMask supported;
+ const char *payload;
+ int r;
- /* In the unified hierarchy inner nodes may only contain
- * subgroups, but not processes. Hence, if we running in the
- * unified hierarchy and the container does the same, and we
- * did not create a scope unit for the container move us and
- * the container into two separate subcgroups. */
-
- if (unified_requested == CGROUP_UNIFIED_NONE)
- return 0;
-
- r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
- if (r < 0)
- return log_error_errno(r, "Failed to determine whether the systemd controller is unified: %m");
- if (r == 0)
- return 0;
+ assert(pid > 1);
+
+ /* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we are running in
+ * the unified hierarchy and the container does the same, and we did not create a scope unit for the container
+ * move us and the container into two separate subcgroups.
+ *
+ * Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including
+ * its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
+ * delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
+ * host systemd directly to the payload, the host and payload systemd might fight for the cgroup
+ * attributes. Hence, let's insert an intermediary cgroup to cover that case too.
+ *
+ * Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup
+ * that's fine because there's only one hierarchy anyway and controllers are enabled directly on it. On the
+ * legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
+ * do it. */
r = cg_mask_supported(&supported);
if (r < 0)
return log_error_errno(r, "Failed to determine supported controllers: %m");
- r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+ if (keep_unit)
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+ else
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
if (r < 0)
return log_error_errno(r, "Failed to get our control group: %m");
- child = strjoina(cgroup, "/payload");
- r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
+ payload = strjoina(cgroup, "/payload");
+ r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
if (r < 0)
- return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+ return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
- child = strjoina(cgroup, "/supervisor");
- r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
- if (r < 0)
- return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+ if (keep_unit) {
+ const char *supervisor;
+
+ supervisor = strjoina(cgroup, "/supervisor");
+ r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor);
+ }
/* Try to enable as many controllers as possible for the new payload. */
(void) cg_enable_everywhere(supported, supported, cgroup);
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index 3a8e98e122..7639b483ae 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -14,4 +14,4 @@
int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
-int create_subcgroup(pid_t pid, CGroupUnified unified_requested);
+int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested);
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index 1cc4a2da6b..8a4634f53e 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -27,7 +27,7 @@
#include "user-util.h"
#include "util.h"
-CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
+CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
CustomMount *c, *ret;
assert(l);
@@ -48,8 +48,8 @@ CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
return ret;
}
-void custom_mount_free_all(CustomMount *l, unsigned n) {
- unsigned i;
+void custom_mount_free_all(CustomMount *l, size_t n) {
+ size_t i;
for (i = 0; i < n; i++) {
CustomMount *m = l + i;
@@ -110,8 +110,8 @@ static char *resolve_source_path(const char *dest, const char *source) {
return strdup(source);
}
-int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
- unsigned i;
+int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
+ size_t i;
int r;
/* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
@@ -133,8 +133,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
if (!s)
return log_oom();
- free(m->source);
- m->source = s;
+ free_and_replace(m->source, s);
} else {
/* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
@@ -165,8 +164,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
if (!s)
return log_oom();
- free(*j);
- *j = s;
+ free_and_replace(*j, s);
}
if (m->work_dir) {
@@ -176,8 +174,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
if (!s)
return log_oom();
- free(m->work_dir);
- m->work_dir = s;
+ free_and_replace(m->work_dir, s);
} else {
assert(m->source);
@@ -193,7 +190,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
return 0;
}
-int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
+int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
_cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
const char *p = s;
CustomMount *m;
@@ -239,7 +236,7 @@ int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only
return 0;
}
-int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
+int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
_cleanup_free_ char *path = NULL, *opts = NULL;
const char *p = s;
CustomMount *m;
@@ -275,7 +272,7 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
return 0;
}
-int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
+int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
_cleanup_free_ char *upper = NULL, *destination = NULL;
_cleanup_strv_free_ char **lower = NULL;
CustomMount *m;
@@ -511,6 +508,18 @@ int mount_all(const char *dest,
uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
+#define PROC_INACCESSIBLE(path) \
+ { NULL, (path), NULL, NULL, MS_BIND, \
+ MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
+ { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
+ MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
+
+#define PROC_READ_ONLY(path) \
+ { (path), (path), NULL, NULL, MS_BIND, \
+ MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
+ { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
+ MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
+
typedef struct MountPoint {
const char *what;
const char *where;
@@ -521,39 +530,72 @@ int mount_all(const char *dest,
} MountPoint;
static const MountPoint mount_table[] = {
- /* inner child mounts */
- { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
- { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
- { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
- { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
- { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
- { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
-
- /* outer child mounts */
- { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
- { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
- { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
- { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
-
- { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
- { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
- { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
+ /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
+ { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ MOUNT_FATAL|MOUNT_IN_USERNS },
+
+ { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
+ MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
+
+ { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND,
+ MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
+
+ { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
+ MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
+
+ /* Make these files inaccessible to container payloads: they potentially leak information about kernel
+ * internals or the host's execution environment to the container */
+ PROC_INACCESSIBLE("/proc/kallsyms"),
+ PROC_INACCESSIBLE("/proc/kcore"),
+ PROC_INACCESSIBLE("/proc/keys"),
+ PROC_INACCESSIBLE("/proc/sysrq-trigger"),
+ PROC_INACCESSIBLE("/proc/timer_list"),
+
+ /* Make these directories read-only to container payloads: they show hardware information, and in some
+ * cases contain tunables the container really shouldn't have access to. */
+ PROC_READ_ONLY("/proc/acpi"),
+ PROC_READ_ONLY("/proc/apm"),
+ PROC_READ_ONLY("/proc/asound"),
+ PROC_READ_ONLY("/proc/bus"),
+ PROC_READ_ONLY("/proc/fs"),
+ PROC_READ_ONLY("/proc/irq"),
+ PROC_READ_ONLY("/proc/scsi"),
+
+ /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
+ { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+ MOUNT_FATAL },
+ { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
+ { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
+ { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ MOUNT_FATAL }, /* skipped if above was mounted */
+ { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
+ MOUNT_FATAL },
+ { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+ MOUNT_FATAL },
+ { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+ MOUNT_FATAL },
+
#if HAVE_SELINUX
- { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
- { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
+ { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
+ 0 }, /* Bind mount first */
+ { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
+ 0 }, /* Then, make it r/o */
#endif
};
- unsigned k;
- int r;
+ _cleanup_(unlink_and_freep) char *inaccessible = NULL;
bool use_userns = (mount_settings & MOUNT_USE_USERNS);
bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
bool in_userns = (mount_settings & MOUNT_IN_USERNS);
+ size_t k;
+ int r;
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
_cleanup_free_ char *where = NULL, *options = NULL;
- const char *o;
+ const char *o, *what;
bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
@@ -569,12 +611,32 @@ int mount_all(const char *dest,
if (r < 0)
return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
+ if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) {
+
+ if (!inaccessible) {
+ _cleanup_free_ char *np = NULL;
+
+ r = tempfn_random_child(NULL, "inaccessible", &np);
+ if (r < 0)
+ return log_error_errno(r, "Failed to generate inaccessible file node path: %m");
+
+ r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np);
+
+ inaccessible = TAKE_PTR(np);
+ }
+
+ what = inaccessible;
+ } else
+ what = mount_table[k].what;
+
r = path_is_mount_point(where, NULL, 0);
if (r < 0 && r != -ENOENT)
return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
/* Skip this entry if it is not a remount. */
- if (mount_table[k].what && r > 0)
+ if (what && r > 0)
continue;
r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
@@ -603,7 +665,7 @@ int mount_all(const char *dest,
}
r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
- mount_table[k].what,
+ what,
where,
mount_table[k].type,
mount_table[k].flags,
@@ -766,11 +828,11 @@ static int mount_overlay(const char *dest, CustomMount *m) {
int mount_custom(
const char *dest,
- CustomMount *mounts, unsigned n,
+ CustomMount *mounts, size_t n,
bool userns, uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
- unsigned i;
+ size_t i;
int r;
assert(dest);
diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h
index 6d3aca76a0..db7aadc28e 100644
--- a/src/nspawn/nspawn-mount.h
+++ b/src/nspawn/nspawn-mount.h
@@ -13,12 +13,13 @@
#include "volatile-util.h"
typedef enum MountSettingsMask {
- MOUNT_FATAL = 1 << 0, /* if set, a mount error is considered fatal */
- MOUNT_USE_USERNS = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
- MOUNT_IN_USERNS = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
- MOUNT_APPLY_APIVFS_RO = 1 << 3, /* if set, /proc/sys, and /sysfs will be mounted read-only, otherwise read-write. */
- MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write.
- Works only if MOUNT_APPLY_APIVFS_RO is also set. */
+ MOUNT_FATAL = 1U << 0, /* if set, a mount error is considered fatal */
+ MOUNT_USE_USERNS = 1U << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
+ MOUNT_IN_USERNS = 1U << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
+ MOUNT_APPLY_APIVFS_RO = 1U << 3, /* if set, /proc/sys, and /sys will be mounted read-only, otherwise read-write. */
+ MOUNT_APPLY_APIVFS_NETNS = 1U << 4, /* if set, /proc/sys/net will be mounted read-write.
+ Works only if MOUNT_APPLY_APIVFS_RO is also set. */
+ MOUNT_INACCESSIBLE_REG = 1U << 5, /* if set, create an inaccessible regular file first and use as bind mount source */
} MountSettingsMask;
typedef enum CustomMountType {
@@ -40,13 +41,13 @@ typedef struct CustomMount {
char *rm_rf_tmpdir;
} CustomMount;
-CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t);
-void custom_mount_free_all(CustomMount *l, unsigned n);
-int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n);
+CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t);
+void custom_mount_free_all(CustomMount *l, size_t n);
+int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n);
-int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only);
-int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s);
-int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only);
+int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
+int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s);
+int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
@@ -54,7 +55,7 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
-int mount_custom(const char *dest, CustomMount *mounts, unsigned n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
+int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c
index 643e00400f..c0cdb7c0d7 100644
--- a/src/nspawn/nspawn-register.c
+++ b/src/nspawn/nspawn-register.c
@@ -11,6 +11,7 @@
#include "bus-unit-util.h"
#include "bus-util.h"
#include "nspawn-register.h"
+#include "special.h"
#include "stat-util.h"
#include "strv.h"
#include "util.h"
@@ -309,7 +310,7 @@ int allocate_scope(
"PIDs", "au", 1, pid,
"Description", "s", description,
"Delegate", "b", 1,
- "Slice", "s", isempty(slice) ? "machine.slice" : slice);
+ "Slice", "s", isempty(slice) ? SPECIAL_MACHINE_SLICE : slice);
if (r < 0)
return bus_log_create_error(r);
diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h
index bee7e9f530..731db87260 100644
--- a/src/nspawn/nspawn-settings.h
+++ b/src/nspawn/nspawn-settings.h
@@ -76,7 +76,7 @@ typedef struct Settings {
int read_only;
VolatileMode volatile_mode;
CustomMount *custom_mounts;
- unsigned n_custom_mounts;
+ size_t n_custom_mounts;
int userns_chown;
/* [Network] */
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 23bc9402a8..d7ceb4ed44 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -165,7 +165,7 @@ static uint64_t arg_caps_retain =
(1ULL << CAP_SYS_RESOURCE) |
(1ULL << CAP_SYS_TTY_CONFIG);
static CustomMount *arg_custom_mounts = NULL;
-static unsigned arg_n_custom_mounts = 0;
+static size_t arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
@@ -291,7 +291,7 @@ static void help(void) {
}
static int custom_mount_check_all(void) {
- unsigned i;
+ size_t i;
for (i = 0; i < arg_n_custom_mounts; i++) {
CustomMount *m = &arg_custom_mounts[i];
@@ -1470,31 +1470,35 @@ static int setup_resolv_conf(const char *dest) {
}
static int setup_boot_id(void) {
+ _cleanup_(unlink_and_freep) char *from = NULL;
+ _cleanup_free_ char *path = NULL;
sd_id128_t rnd = SD_ID128_NULL;
- const char *from, *to;
+ const char *to;
int r;
/* Generate a new randomized boot ID, so that each boot-up of
* the container gets a new one */
- from = "/run/proc-sys-kernel-random-boot-id";
- to = "/proc/sys/kernel/random/boot_id";
+ r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to generate random boot ID path: %m");
r = sd_id128_randomize(&rnd);
if (r < 0)
return log_error_errno(r, "Failed to generate random boot id: %m");
- r = id128_write(from, ID128_UUID, rnd, false);
+ r = id128_write(path, ID128_UUID, rnd, false);
if (r < 0)
return log_error_errno(r, "Failed to write boot id: %m");
+ from = TAKE_PTR(path);
+ to = "/proc/sys/kernel/random/boot_id";
+
r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
- if (r >= 0)
- r = mount_verbose(LOG_ERR, NULL, to, NULL,
- MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
+ if (r < 0)
+ return r;
- (void) unlink(from);
- return r;
+ return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
}
static int copy_devnodes(const char *dest) {
@@ -1662,26 +1666,32 @@ static int setup_keyring(void) {
}
static int setup_kmsg(int kmsg_socket) {
- const char *from, *to;
+ _cleanup_(unlink_and_freep) char *from = NULL;
+ _cleanup_free_ char *fifo = NULL;
+ _cleanup_close_ int fd = -1;
_cleanup_umask_ mode_t u;
- int fd, r;
+ const char *to;
+ int r;
assert(kmsg_socket >= 0);
u = umask(0000);
- /* We create the kmsg FIFO as /run/kmsg, but immediately
- * delete it after bind mounting it to /proc/kmsg. While FIFOs
- * on the reading side behave very similar to /proc/kmsg,
- * their writing side behaves differently from /dev/kmsg in
- * that writing blocks when nothing is reading. In order to
- * avoid any problems with containers deadlocking due to this
- * we simply make /dev/kmsg unavailable to the container. */
- from = "/run/kmsg";
- to = "/proc/kmsg";
+ /* We create the kmsg FIFO as a temporary file in /tmp, but immediately delete it after bind mounting it to
+ * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
+ * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
+ * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
- if (mkfifo(from, 0600) < 0)
+ r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
+ if (r < 0)
+ return log_error_errno(r, "Failed to generate kmsg path: %m");
+
+ if (mkfifo(fifo, 0600) < 0)
return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
+
+ from = TAKE_PTR(fifo);
+ to = "/proc/kmsg";
+
r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
if (r < 0)
return r;
@@ -1690,17 +1700,11 @@ static int setup_kmsg(int kmsg_socket) {
if (fd < 0)
return log_error_errno(errno, "Failed to open fifo: %m");
- /* Store away the fd in the socket, so that it stays open as
- * long as we run the child */
+ /* Store away the fd in the socket, so that it stays open as long as we run the child */
r = send_one_fd(kmsg_socket, fd, 0);
- safe_close(fd);
-
if (r < 0)
return log_error_errno(r, "Failed to send FIFO fd: %m");
- /* And now make the FIFO unavailable as /run/kmsg... */
- (void) unlink(from);
-
return 0;
}
@@ -2265,7 +2269,7 @@ static int inner_child(
_cleanup_free_ char *home = NULL;
char as_uuid[37];
- unsigned n_env = 1;
+ size_t n_env = 1;
const char *envp[] = {
"PATH=" DEFAULT_PATH_COMPAT,
NULL, /* container */
@@ -3639,11 +3643,9 @@ static int run(int master,
if (r < 0)
return r;
- if (arg_keep_unit) {
- r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
- if (r < 0)
- return r;
- }
+ r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
+ if (r < 0)
+ return r;
r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
if (r < 0)