diff options
author | Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> | 2018-05-08 12:54:58 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-05-08 12:54:58 +0200 |
commit | 6b1ca2a948180877481ef4fbbcad5762bf9f0600 (patch) | |
tree | c0bd34896cf23b63aa250c6e35212bd112d647cd | |
parent | Merge pull request #8926 from keszybz/man-journal-remote (diff) | |
parent | update TODO (diff) | |
download | systemd-6b1ca2a948180877481ef4fbbcad5762bf9f0600.tar.xz systemd-6b1ca2a948180877481ef4fbbcad5762bf9f0600.zip |
Merge pull request #8898 from poettering/nspawn-mount-block
some nspawn cgroup and mount lock-down fixes
-rw-r--r-- | TODO | 7 | ||||
-rw-r--r-- | doc/CGROUP_DELEGATION.md | 15 | ||||
-rw-r--r-- | src/core/namespace.c | 19 | ||||
-rw-r--r-- | src/nspawn/nspawn-cgroup.c | 59 | ||||
-rw-r--r-- | src/nspawn/nspawn-cgroup.h | 2 | ||||
-rw-r--r-- | src/nspawn/nspawn-mount.c | 142 | ||||
-rw-r--r-- | src/nspawn/nspawn-mount.h | 27 | ||||
-rw-r--r-- | src/nspawn/nspawn-register.c | 3 | ||||
-rw-r--r-- | src/nspawn/nspawn-settings.h | 2 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 74 |
10 files changed, 215 insertions, 135 deletions
@@ -24,6 +24,10 @@ Janitorial Clean-ups: Features: +* nspawn: greater control over hostname, resolv.conf, timezone, rlim + +* nspawn: when operating in a scope, also create /payload subcgroup + * the error paths in usbffs_dispatch_ep() leak memory * cgroups: figure out if we can somehow communicate in a cleaner way whether a @@ -52,9 +56,6 @@ Features: * add --vacuum-xyz options to coredumpctl, matching those journalctl already has. -* list the exit codes from the BSD/glibc <sysexits.h> in our own - exit-codes.[ch] tables. - * SuccessExitStatus= and friends should probably also accept symbolic exit codes names, i.e. error codes from the list maintained in exit-codes.[ch] diff --git a/doc/CGROUP_DELEGATION.md b/doc/CGROUP_DELEGATION.md index 412f0a5fa0..212283fd73 100644 --- a/doc/CGROUP_DELEGATION.md +++ b/doc/CGROUP_DELEGATION.md @@ -424,15 +424,16 @@ unified you (of course, I guess) need to provide only `/sys/fs/cgroup/` itself. cgroup tree of systemd itself is out of limits for you. It's fine to *read* from any attribute you like however. That's totally OK and welcome. -4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a container - payload running systemd, then don't get the idea that you can bind mount - only a sub-tree of the host's cgroup tree into the container. Part of the - cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every +4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a + container payload running systemd, then don't get the idea that you can bind + mount only a sub-tree of the host's cgroup tree into the container. Part of + the cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every process, and hence any path below `/sys/fs/cgroup/` needs to match what `/proc/$PID/cgroup` of the payload processes reports. 
What you can do safely - however, is mount the upper parts of the cgroup tree read-only or even - replace it with an intermediary `tmpfs`, as long as the path to the - delegated sub-tree remains accessible as-is. + however, is mount the upper parts of the cgroup tree read-only (or even + replace the middle bits with an intermediary `tmpfs` — but be careful not to + break the `statfs()` detection logic discussed above), as long as the path + to the delegated sub-tree remains accessible as-is. 5. ⚡ Currently, the algorithm for mapping between slice/scope/service unit naming and their cgroup paths is not considered public API of systemd, and diff --git a/src/core/namespace.c b/src/core/namespace.c index 4a7fea920e..3154cad58a 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -81,23 +81,26 @@ static const MountEntry apivfs_table[] = { /* ProtectKernelTunables= option and the related filesystem APIs */ static const MountEntry protect_kernel_tunables_table[] = { - { "/proc/sys", READONLY, false }, - { "/proc/sysrq-trigger", READONLY, true }, - { "/proc/latency_stats", READONLY, true }, - { "/proc/mtrr", READONLY, true }, - { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */ { "/proc/acpi", READONLY, true }, - { "/proc/timer_stats", READONLY, true }, + { "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */ { "/proc/asound", READONLY, true }, { "/proc/bus", READONLY, true }, { "/proc/fs", READONLY, true }, { "/proc/irq", READONLY, true }, + { "/proc/kallsyms", INACCESSIBLE, true }, + { "/proc/kcore", INACCESSIBLE, true }, + { "/proc/latency_stats", READONLY, true }, + { "/proc/mtrr", READONLY, true }, + { "/proc/scsi", READONLY, true }, + { "/proc/sys", READONLY, false }, + { "/proc/sysrq-trigger", READONLY, true }, + { "/proc/timer_stats", READONLY, true }, { "/sys", READONLY, false }, - { "/sys/kernel/debug", READONLY, true }, - { "/sys/kernel/tracing", 
READONLY, true }, { "/sys/fs/bpf", READONLY, true }, { "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */ { "/sys/fs/selinux", READWRITE, true }, + { "/sys/kernel/debug", READONLY, true }, + { "/sys/kernel/tracing", READONLY, true }, }; /* ProtectKernelModules= option */ diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index 682ea65080..761d737dc9 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -141,44 +141,53 @@ finish: return r; } -int create_subcgroup(pid_t pid, CGroupUnified unified_requested) { +int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) { _cleanup_free_ char *cgroup = NULL; - const char *child; - int r; CGroupMask supported; + const char *payload; + int r; - /* In the unified hierarchy inner nodes may only contain - * subgroups, but not processes. Hence, if we running in the - * unified hierarchy and the container does the same, and we - * did not create a scope unit for the container move us and - * the container into two separate subcgroups. */ - - if (unified_requested == CGROUP_UNIFIED_NONE) - return 0; - - r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); - if (r < 0) - return log_error_errno(r, "Failed to determine whether the systemd controller is unified: %m"); - if (r == 0) - return 0; + assert(pid > 1); + + /* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we running in + * the unified hierarchy and the container does the same, and we did not create a scope unit for the container + * move us and the container into two separate subcgroups. + * + * Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including + * its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a + * delegation unit, instead of the cgroup itself. 
This means, if we'd pass on the cgroup allocated from the + * host systemd directly to the payload, the host and payload systemd might fight for the cgroup + * attributes. Hence, let's insert an intermediary cgroup to cover that case too. + * + * Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup + * that's fine because there's only one hierarchy anyway and controllers are enabled directly on it. On the + * legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't + * do it. */ r = cg_mask_supported(&supported); if (r < 0) return log_error_errno(r, "Failed to determine supported controllers: %m"); - r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); + if (keep_unit) + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); + else + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); if (r < 0) return log_error_errno(r, "Failed to get our control group: %m"); - child = strjoina(cgroup, "/payload"); - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid); + payload = strjoina(cgroup, "/payload"); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid); if (r < 0) - return log_error_errno(r, "Failed to create %s subcgroup: %m", child); + return log_error_errno(r, "Failed to create %s subcgroup: %m", payload); - child = strjoina(cgroup, "/supervisor"); - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0); - if (r < 0) - return log_error_errno(r, "Failed to create %s subcgroup: %m", child); + if (keep_unit) { + const char *supervisor; + + supervisor = strjoina(cgroup, "/supervisor"); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0); + if (r < 0) + return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor); + } /* Try to enable as many controllers as possible for the new payload. 
*/ (void) cg_enable_everywhere(supported, supported, cgroup); diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h index 3a8e98e122..7639b483ae 100644 --- a/src/nspawn/nspawn-cgroup.h +++ b/src/nspawn/nspawn-cgroup.h @@ -14,4 +14,4 @@ int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift); int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift); -int create_subcgroup(pid_t pid, CGroupUnified unified_requested); +int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested); diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c index 1cc4a2da6b..8a4634f53e 100644 --- a/src/nspawn/nspawn-mount.c +++ b/src/nspawn/nspawn-mount.c @@ -27,7 +27,7 @@ #include "user-util.h" #include "util.h" -CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) { +CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) { CustomMount *c, *ret; assert(l); @@ -48,8 +48,8 @@ CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) { return ret; } -void custom_mount_free_all(CustomMount *l, unsigned n) { - unsigned i; +void custom_mount_free_all(CustomMount *l, size_t n) { + size_t i; for (i = 0; i < n; i++) { CustomMount *m = l + i; @@ -110,8 +110,8 @@ static char *resolve_source_path(const char *dest, const char *source) { return strdup(source); } -int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) { - unsigned i; +int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) { + size_t i; int r; /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the @@ -133,8 +133,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) { if (!s) return log_oom(); - free(m->source); - m->source = s; + free_and_replace(m->source, s); } else { /* No source specified? 
In that case, use a throw-away temporary directory in /var/tmp */ @@ -165,8 +164,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) { if (!s) return log_oom(); - free(*j); - *j = s; + free_and_replace(*j, s); } if (m->work_dir) { @@ -176,8 +174,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) { if (!s) return log_oom(); - free(m->work_dir); - m->work_dir = s; + free_and_replace(m->work_dir, s); } else { assert(m->source); @@ -193,7 +190,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) { return 0; } -int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) { +int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) { _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL; const char *p = s; CustomMount *m; @@ -239,7 +236,7 @@ int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only return 0; } -int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) { +int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) { _cleanup_free_ char *path = NULL, *opts = NULL; const char *p = s; CustomMount *m; @@ -275,7 +272,7 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) { return 0; } -int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) { +int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) { _cleanup_free_ char *upper = NULL, *destination = NULL; _cleanup_strv_free_ char **lower = NULL; CustomMount *m; @@ -511,6 +508,18 @@ int mount_all(const char *dest, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context) { +#define PROC_INACCESSIBLE(path) \ + { NULL, (path), NULL, NULL, MS_BIND, \ + MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... 
*/ \ + { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \ + MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */ + +#define PROC_READ_ONLY(path) \ + { (path), (path), NULL, NULL, MS_BIND, \ + MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \ + { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \ + MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */ + typedef struct MountPoint { const char *what; const char *where; @@ -521,39 +530,72 @@ int mount_all(const char *dest, } MountPoint; static const MountPoint mount_table[] = { - /* inner child mounts */ - { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS }, - { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ - { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */ - { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */ - { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ - { NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... 
then, make it r/o */ - - /* outer child mounts */ - { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL }, - { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS }, - { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */ - { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */ - - { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL }, - { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL }, - { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL }, + /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */ + { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + MOUNT_FATAL|MOUNT_IN_USERNS }, + + { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, + MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ + + { "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, + MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */ + + { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, + MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... 
then, make it r/o */ + + /* Make these files inaccessible to container payloads: they potentially leak information about kernel + * internals or the host's execution environment to the container */ + PROC_INACCESSIBLE("/proc/kallsyms"), + PROC_INACCESSIBLE("/proc/kcore"), + PROC_INACCESSIBLE("/proc/keys"), + PROC_INACCESSIBLE("/proc/sysrq-trigger"), + PROC_INACCESSIBLE("/proc/timer_list"), + + /* Make these directories read-only to container payloads: they show hardware information, and in some + * cases contain tunables the container really shouldn't have access to. */ + PROC_READ_ONLY("/proc/acpi"), + PROC_READ_ONLY("/proc/apm"), + PROC_READ_ONLY("/proc/asound"), + PROC_READ_ONLY("/proc/bus"), + PROC_READ_ONLY("/proc/fs"), + PROC_READ_ONLY("/proc/irq"), + PROC_READ_ONLY("/proc/scsi"), + + /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */ + { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + MOUNT_FATAL }, + { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, + MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS }, + { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, + MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */ + { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, + MOUNT_FATAL }, /* skipped if above was mounted */ + { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, + MOUNT_FATAL }, + { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + MOUNT_FATAL }, + { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, + MOUNT_FATAL }, + #if HAVE_SELINUX - { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */ - { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */ + { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, + 0 }, /* Bind mount first */ + { NULL, 
"/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, + 0 }, /* Then, make it r/o */ #endif }; - unsigned k; - int r; + _cleanup_(unlink_and_freep) char *inaccessible = NULL; bool use_userns = (mount_settings & MOUNT_USE_USERNS); bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS); bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO); bool in_userns = (mount_settings & MOUNT_IN_USERNS); + size_t k; + int r; for (k = 0; k < ELEMENTSOF(mount_table); k++) { _cleanup_free_ char *where = NULL, *options = NULL; - const char *o; + const char *o, *what; bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL); if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS)) @@ -569,12 +611,32 @@ int mount_all(const char *dest, if (r < 0) return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where); + if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) { + + if (!inaccessible) { + _cleanup_free_ char *np = NULL; + + r = tempfn_random_child(NULL, "inaccessible", &np); + if (r < 0) + return log_error_errno(r, "Failed to generate inaccessible file node path: %m"); + + r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000); + if (r < 0) + return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np); + + inaccessible = TAKE_PTR(np); + } + + what = inaccessible; + } else + what = mount_table[k].what; + r = path_is_mount_point(where, NULL, 0); if (r < 0 && r != -ENOENT) return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where); /* Skip this entry if it is not a remount. */ - if (mount_table[k].what && r > 0) + if (what && r > 0) continue; r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift); @@ -603,7 +665,7 @@ int mount_all(const char *dest, } r = mount_verbose(fatal ? 
LOG_ERR : LOG_DEBUG, - mount_table[k].what, + what, where, mount_table[k].type, mount_table[k].flags, @@ -766,11 +828,11 @@ static int mount_overlay(const char *dest, CustomMount *m) { int mount_custom( const char *dest, - CustomMount *mounts, unsigned n, + CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context) { - unsigned i; + size_t i; int r; assert(dest); diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h index 6d3aca76a0..db7aadc28e 100644 --- a/src/nspawn/nspawn-mount.h +++ b/src/nspawn/nspawn-mount.h @@ -13,12 +13,13 @@ #include "volatile-util.h" typedef enum MountSettingsMask { - MOUNT_FATAL = 1 << 0, /* if set, a mount error is considered fatal */ - MOUNT_USE_USERNS = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */ - MOUNT_IN_USERNS = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */ - MOUNT_APPLY_APIVFS_RO = 1 << 3, /* if set, /proc/sys, and /sysfs will be mounted read-only, otherwise read-write. */ - MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write. - Works only if MOUNT_APPLY_APIVFS_RO is also set. */ + MOUNT_FATAL = 1U << 0, /* if set, a mount error is considered fatal */ + MOUNT_USE_USERNS = 1U << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */ + MOUNT_IN_USERNS = 1U << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */ + MOUNT_APPLY_APIVFS_RO = 1U << 3, /* if set, /proc/sys, and /sys will be mounted read-only, otherwise read-write. */ + MOUNT_APPLY_APIVFS_NETNS = 1U << 4, /* if set, /proc/sys/net will be mounted read-write. + Works only if MOUNT_APPLY_APIVFS_RO is also set. 
*/ + MOUNT_INACCESSIBLE_REG = 1U << 5, /* if set, create an inaccessible regular file first and use as bind mount source */ } MountSettingsMask; typedef enum CustomMountType { @@ -40,13 +41,13 @@ typedef struct CustomMount { char *rm_rf_tmpdir; } CustomMount; -CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t); -void custom_mount_free_all(CustomMount *l, unsigned n); -int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n); +CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t); +void custom_mount_free_all(CustomMount *l, size_t n); +int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n); -int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only); -int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s); -int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only); +int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only); +int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s); +int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only); int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); int mount_sysfs(const char *dest, MountSettingsMask mount_settings); @@ -54,7 +55,7 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings); int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns); int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested); -int mount_custom(const char *dest, CustomMount *mounts, unsigned n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); +int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char 
*selinux_apifs_context); int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context); diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c index 643e00400f..c0cdb7c0d7 100644 --- a/src/nspawn/nspawn-register.c +++ b/src/nspawn/nspawn-register.c @@ -11,6 +11,7 @@ #include "bus-unit-util.h" #include "bus-util.h" #include "nspawn-register.h" +#include "special.h" #include "stat-util.h" #include "strv.h" #include "util.h" @@ -309,7 +310,7 @@ int allocate_scope( "PIDs", "au", 1, pid, "Description", "s", description, "Delegate", "b", 1, - "Slice", "s", isempty(slice) ? "machine.slice" : slice); + "Slice", "s", isempty(slice) ? SPECIAL_MACHINE_SLICE : slice); if (r < 0) return bus_log_create_error(r); diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index bee7e9f530..731db87260 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -76,7 +76,7 @@ typedef struct Settings { int read_only; VolatileMode volatile_mode; CustomMount *custom_mounts; - unsigned n_custom_mounts; + size_t n_custom_mounts; int userns_chown; /* [Network] */ diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 23bc9402a8..d7ceb4ed44 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -165,7 +165,7 @@ static uint64_t arg_caps_retain = (1ULL << CAP_SYS_RESOURCE) | (1ULL << CAP_SYS_TTY_CONFIG); static CustomMount *arg_custom_mounts = NULL; -static unsigned arg_n_custom_mounts = 0; +static size_t arg_n_custom_mounts = 0; static char **arg_setenv = NULL; static bool arg_quiet = false; static bool arg_register = true; @@ -291,7 +291,7 @@ static void help(void) { } static int custom_mount_check_all(void) { - unsigned i; + size_t i; for (i = 0; i < arg_n_custom_mounts; i++) { CustomMount *m = 
&arg_custom_mounts[i]; @@ -1470,31 +1470,35 @@ static int setup_resolv_conf(const char *dest) { } static int setup_boot_id(void) { + _cleanup_(unlink_and_freep) char *from = NULL; + _cleanup_free_ char *path = NULL; sd_id128_t rnd = SD_ID128_NULL; - const char *from, *to; + const char *to; int r; /* Generate a new randomized boot ID, so that each boot-up of * the container gets a new one */ - from = "/run/proc-sys-kernel-random-boot-id"; - to = "/proc/sys/kernel/random/boot_id"; + r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path); + if (r < 0) + return log_error_errno(r, "Failed to generate random boot ID path: %m"); r = sd_id128_randomize(&rnd); if (r < 0) return log_error_errno(r, "Failed to generate random boot id: %m"); - r = id128_write(from, ID128_UUID, rnd, false); + r = id128_write(path, ID128_UUID, rnd, false); if (r < 0) return log_error_errno(r, "Failed to write boot id: %m"); + from = TAKE_PTR(path); + to = "/proc/sys/kernel/random/boot_id"; + r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL); - if (r >= 0) - r = mount_verbose(LOG_ERR, NULL, to, NULL, - MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL); + if (r < 0) + return r; - (void) unlink(from); - return r; + return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); } static int copy_devnodes(const char *dest) { @@ -1662,26 +1666,32 @@ static int setup_keyring(void) { } static int setup_kmsg(int kmsg_socket) { - const char *from, *to; + _cleanup_(unlink_and_freep) char *from = NULL; + _cleanup_free_ char *fifo = NULL; + _cleanup_close_ int fd = -1; _cleanup_umask_ mode_t u; - int fd, r; + const char *to; + int r; assert(kmsg_socket >= 0); u = umask(0000); - /* We create the kmsg FIFO as /run/kmsg, but immediately - * delete it after bind mounting it to /proc/kmsg. 
While FIFOs - * on the reading side behave very similar to /proc/kmsg, - * their writing side behaves differently from /dev/kmsg in - * that writing blocks when nothing is reading. In order to - * avoid any problems with containers deadlocking due to this - * we simply make /dev/kmsg unavailable to the container. */ - from = "/run/kmsg"; - to = "/proc/kmsg"; + /* We create the kmsg FIFO as a temporary file in /tmp, but immediately delete it after bind mounting it to + * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves + * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems + * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */ - if (mkfifo(from, 0600) < 0) + r = tempfn_random_child(NULL, "proc-kmsg", &fifo); + if (r < 0) + return log_error_errno(r, "Failed to generate kmsg path: %m"); + + if (mkfifo(fifo, 0600) < 0) return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m"); + + from = TAKE_PTR(fifo); + to = "/proc/kmsg"; + r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL); if (r < 0) return r; @@ -1690,17 +1700,11 @@ static int setup_kmsg(int kmsg_socket) { if (fd < 0) return log_error_errno(errno, "Failed to open fifo: %m"); - /* Store away the fd in the socket, so that it stays open as - * long as we run the child */ + /* Store away the fd in the socket, so that it stays open as long as we run the child */ r = send_one_fd(kmsg_socket, fd, 0); - safe_close(fd); - if (r < 0) return log_error_errno(r, "Failed to send FIFO fd: %m"); - /* And now make the FIFO unavailable as /run/kmsg... 
*/ - (void) unlink(from); - return 0; } @@ -2265,7 +2269,7 @@ static int inner_child( _cleanup_free_ char *home = NULL; char as_uuid[37]; - unsigned n_env = 1; + size_t n_env = 1; const char *envp[] = { "PATH=" DEFAULT_PATH_COMPAT, NULL, /* container */ @@ -3639,11 +3643,9 @@ static int run(int master, if (r < 0) return r; - if (arg_keep_unit) { - r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy); - if (r < 0) - return r; - } + r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy); + if (r < 0) + return r; r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift); if (r < 0) |