summaryrefslogtreecommitdiffstats
path: root/src/core/namespace.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/namespace.c')
-rw-r--r--src/core/namespace.c112
1 files changed, 97 insertions, 15 deletions
diff --git a/src/core/namespace.c b/src/core/namespace.c
index c45be457a8..a0d3dc0cbb 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -65,6 +65,7 @@ typedef enum MountMode {
MOUNT_PRIVATE_SYSFS,
MOUNT_BIND_SYSFS,
MOUNT_PROCFS,
+ MOUNT_PRIVATE_CGROUP2FS,
MOUNT_READ_ONLY,
MOUNT_READ_WRITE,
MOUNT_NOEXEC,
@@ -199,6 +200,22 @@ static const MountEntry protect_home_yes_table[] = {
{ "/root", MOUNT_INACCESSIBLE, true },
};
+/* ProtectControlGroups=yes table: a single entry that marks /sys/fs/cgroup as MOUNT_READ_ONLY. */
+static const MountEntry protect_control_groups_yes_table[] = {
+ { "/sys/fs/cgroup", MOUNT_READ_ONLY, false },
+};
+
+/* ProtectControlGroups=private table. Note that mount_private_apivfs() always uses MS_NOSUID|MS_NOEXEC|MS_NODEV, so
+ * flags is not set here. nsdelegate has been supported since kernel 4.13, so it is safe to use. */
+static const MountEntry protect_control_groups_private_table[] = {
+ { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
+};
+
+/* ProtectControlGroups=strict table: same entry as the private table, but with .read_only = true. */
+static const MountEntry protect_control_groups_strict_table[] = {
+ { "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true, .nosuid = true, .noexec = true, .options_const = "nsdelegate" },
+};
+
/* ProtectSystem=yes table */
static const MountEntry protect_system_yes_table[] = {
{ "/usr", MOUNT_READ_ONLY, false },
@@ -247,6 +264,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
[MOUNT_EMPTY_DIR] = "empty-dir",
[MOUNT_PRIVATE_SYSFS] = "private-sysfs",
[MOUNT_BIND_SYSFS] = "bind-sysfs",
+ [MOUNT_PRIVATE_CGROUP2FS] = "private-cgroup2fs",
[MOUNT_PROCFS] = "procfs",
[MOUNT_READ_ONLY] = "read-only",
[MOUNT_READ_WRITE] = "read-write",
@@ -727,6 +745,28 @@ static int append_static_mounts(MountList *ml, const MountEntry *mounts, size_t
return 0;
}
+static int append_protect_control_groups(MountList *ml, ProtectControlGroups protect_control_groups, bool ignore_protect) { /* Appends the static mount table selected by protect_control_groups to ml; returns 0 when nothing needs adding, otherwise whatever append_static_mounts() returns. */
+ assert(ml);
+
+ switch (protect_control_groups) {
+
+ case PROTECT_CONTROL_GROUPS_NO:
+ return 0; /* protection disabled: no mount entries are added */
+
+ case PROTECT_CONTROL_GROUPS_YES:
+ return append_static_mounts(ml, protect_control_groups_yes_table, ELEMENTSOF(protect_control_groups_yes_table), ignore_protect);
+
+ case PROTECT_CONTROL_GROUPS_PRIVATE:
+ return append_static_mounts(ml, protect_control_groups_private_table, ELEMENTSOF(protect_control_groups_private_table), ignore_protect);
+
+ case PROTECT_CONTROL_GROUPS_STRICT:
+ return append_static_mounts(ml, protect_control_groups_strict_table, ELEMENTSOF(protect_control_groups_strict_table), ignore_protect);
+
+ default:
+ assert_not_reached(); /* the four modes handled above are the only ones with a table in this file */
+ }
+}
+
static int append_protect_home(MountList *ml, ProtectHome protect_home, bool ignore_protect) {
assert(ml);
@@ -1269,10 +1309,14 @@ static int mount_private_apivfs(
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
if (r == -EINVAL && opts)
- /* If this failed with EINVAL then this likely means the textual hidepid= stuff for procfs is
- * not supported by the kernel, and thus the per-instance hidepid= neither, which means we
- * really don't want to use it, since it would affect our host's /proc mount. Hence let's
- * gracefully fallback to a classic, unrestricted version. */
+ /* If this failed with EINVAL then this likely means either:
+ * 1. the textual hidepid= stuff for procfs is not supported by the kernel, and thus the
+ * per-instance hidepid= neither, which means we really don't want to use it, since it
+ * would affect our host's /proc mount.
+ * 2. nsdelegate for cgroup2 is not supported by the kernel even though CLONE_NEWCGROUP
+ * is supported.
+ *
+ * Hence let's gracefully fall back to a classic, unrestricted version. */
r = mount_nofollow_verbose(LOG_DEBUG, fstype, temporary_mount, fstype, MS_NOSUID|MS_NOEXEC|MS_NODEV, /* opts = */ NULL);
if (ERRNO_IS_NEG_PRIVILEGE(r)) {
/* When we do not have enough privileges to mount a new instance, fall back to use an
@@ -1318,6 +1362,39 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p
return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
}
+static bool check_recursiveprot_supported(void) { /* Returns true only when mount_option_supported() reports that cgroup2 accepts 'memory_recursiveprot'. */
+ int r;
+
+ /* memory_recursiveprot is only supported for kernels >= 5.7. Note mount_option_supported uses fsopen()
+ * and fsconfig() which are supported for kernels >= 5.2. So if mount_option_supported() returns an
+ * error, we can assume memory_recursiveprot is not supported. */
+ r = mount_option_supported("cgroup2", "memory_recursiveprot", NULL);
+ if (r < 0)
+ log_debug_errno(r, "Failed to determine whether the 'memory_recursiveprot' mount option is supported, assuming not: %m");
+ else if (r == 0)
+ log_debug("This kernel version does not support 'memory_recursiveprot', not using mount option.");
+
+ return r > 0; /* both the error case (r < 0) and the unsupported case (r == 0) yield false */
+}
+
+static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) { /* Mounts cgroup2 for entry m via mount_private_apivfs(), adding 'memory_recursiveprot' to the entry's options when the kernel supports it; returns -ENOMEM on allocation failure. */
+ _cleanup_free_ char *opts = NULL;
+
+ assert(m);
+ assert(p);
+
+ if (check_recursiveprot_supported()) {
+ opts = strdup(strempty(mount_entry_options(m))); /* start from a private copy of the entry's options; strempty() presumably maps NULL to "" (confirm against its definition) */
+ if (!opts)
+ return -ENOMEM;
+
+ if (!strextend_with_separator(&opts, ",", "memory_recursiveprot")) /* append the extra option, comma-separated */
+ return -ENOMEM;
+ }
+
+ return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", opts ?: mount_entry_options(m), p->runtime_scope); /* opts is still NULL when memory_recursiveprot is unavailable, so the entry's options are passed through unchanged */
+}
+
static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
_cleanup_free_ char *opts = NULL;
@@ -1763,6 +1840,9 @@ static int apply_one_mount(
case MOUNT_PROCFS:
return mount_procfs(m, p);
+ case MOUNT_PRIVATE_CGROUP2FS:
+ return mount_private_cgroup2fs(m, p);
+
case MOUNT_RUN:
return mount_run(m);
@@ -1933,7 +2013,7 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
*/
return p->mount_apivfs ||
- p->protect_control_groups ||
+ p->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
p->protect_kernel_tunables ||
p->protect_proc != PROTECT_PROC_DEFAULT ||
p->proc_subset != PROC_SUBSET_ALL;
@@ -2490,16 +2570,9 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
return r;
}
- if (p->protect_control_groups) {
- MountEntry *me = mount_list_extend(&ml);
- if (!me)
- return log_oom_debug();
-
- *me = (MountEntry) {
- .path_const = "/sys/fs/cgroup",
- .mode = MOUNT_READ_ONLY,
- };
- }
+ r = append_protect_control_groups(&ml, p->protect_control_groups, false);
+ if (r < 0)
+ return r;
r = append_protect_home(&ml, p->protect_home, p->ignore_protect_paths);
if (r < 0)
@@ -3195,6 +3268,15 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_SYSTEM_YES);
+static const char *const protect_control_groups_table[_PROTECT_CONTROL_GROUPS_MAX] = { /* string name for each ProtectControlGroups value */
+ [PROTECT_CONTROL_GROUPS_NO] = "no",
+ [PROTECT_CONTROL_GROUPS_YES] = "yes",
+ [PROTECT_CONTROL_GROUPS_PRIVATE] = "private",
+ [PROTECT_CONTROL_GROUPS_STRICT] = "strict",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_control_groups, ProtectControlGroups, PROTECT_CONTROL_GROUPS_YES); /* NOTE(review): PROTECT_CONTROL_GROUPS_YES is presumably the value a plain boolean "true" maps to; confirm against the macro definition */
+
static const char* const namespace_type_table[] = {
[NAMESPACE_MOUNT] = "mnt",
[NAMESPACE_CGROUP] = "cgroup",