core: unified cgroup hierarchy support

This patch set adds full support for the new unified cgroup hierarchy logic of modern kernels. A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is added. If specified the unified hierarchy is mounted to /sys/fs/cgroup instead of a tmpfs. No further hierarchies are mounted. The kernel command line option defaults to off. We can turn it on by default as soon as the kernel's APIs regarding this are stabilized (but even then downstream distros might want to turn this off, as this will break any tools that access cgroupfs directly). It is possible to choose for each boot individually whether the unified or the legacy hierarchy is used. nspawn will by default provide the legacy hierarchy to containers if the host is using it, and the unified otherwise. However it is possible to run containers with the unified hierarchy on a legacy host and vice versa, by setting the $UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0, respectively. The unified hierarchy provides reliable cgroup empty notifications for the first time, via inotify. To make use of this we maintain one manager-wide inotify fd, and add each cgroup to it. This patch also removes cg_delete() which is unused now. On kernel 4.2 only the "memory" controller is compatible with the unified hierarchy, hence that's the only controller systemd exposes when booted in unified hierarchy mode. This introduces a new enum for enumerating supported controllers, plus a related enum for the mask bits mapping to it. The core is changed to make use of this everywhere. This moves PID 1 into a new "init.scope" implicit scope unit in the root slice. This is necessary since on the unified hierarchy cgroups may either contain subgroups or processes but not both. PID 1 hence has to move out of the root cgroup (strictly speaking the root cgroup is the only one where processes and subgroups are still allowed, but in order to support containers nicely, we move PID 1 into the new scope in all cases.)
This new unit is also used on legacy hierarchy setups. It's actually pretty useful on all systems, as it can then be used to filter journal messages coming from PID 1, and so on. The root slice ("-.slice") is now implicitly created and started (and does not require a unit file on disk anymore), since that's where "init.scope" is located and the slice needs to be started before the scope can. To check whether we are in unified or legacy hierarchy mode we use statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in legacy mode, if it reports cgroupfs we are in unified mode. This patch set carefully makes sure that cgls and cgtop continue to work as desired. When invoking nspawn as a service it will implicitly create two subcgroups in the cgroup it is using, one to move the nspawn process into, the other to move the actual container processes into. This is done because of the requirement that cgroups may either contain processes or other subgroups.
author: Lennart Poettering <lennart@poettering.net> 2015-09-01 19:22:36 +0200
committer: Lennart Poettering <lennart@poettering.net> 2015-09-01 23:52:27 +0200
commit: efdb02375beb0a940c3320865572913780b4d7de (patch)
tree: bffddfbb0344c1d7c2e1853f36b0acf3f1624d64 /src/nspawn
parent: Merge pull request #1107 from msekletar/selinux-get-raw-context (diff)
download: systemd-efdb02375beb0a940c3320865572913780b4d7de.tar.xz
systemd-efdb02375beb0a940c3320865572913780b4d7de.zip
1 files changed, 220 insertions, 13 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 8039847a72..a56960506c 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -204,6 +204,7 @@ static char **arg_property = NULL;
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
 static bool arg_userns = false;
 static int arg_kill_signal = 0;
+static bool arg_unified_cgroup_hierarchy = false;
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -385,6 +386,30 @@ static int set_sanitized_path(char **b, const char *path) {
         return 0;
 }
 
+static int detect_unified_cgroup_hierarchy(void) {
+        const char *e;
+        int r;
+
+        /* Allow the user to control whether the unified hierarchy is used */
+        e = getenv("UNIFIED_CGROUP_HIERARCHY");
+        if (e) {
+                r = parse_boolean(e);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
+
+                arg_unified_cgroup_hierarchy = r;
+                return 0;
+        }
+
+        /* Otherwise inherit the default from the host system */
+        r = cg_unified();
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+
+        arg_unified_cgroup_hierarchy = r;
+        return 0;
+}
+
 static int parse_argv(int argc, char *argv[]) {
 
         enum {
@@ -1037,6 +1062,10 @@ static int parse_argv(int argc, char *argv[]) {
         if (arg_boot && arg_kill_signal <= 0)
                 arg_kill_signal = SIGRTMIN+3;
 
+        r = detect_unified_cgroup_hierarchy();
+        if (r < 0)
+                return r;
+
         return 1;
 }
 
@@ -1095,7 +1124,6 @@ static int mount_all(const char *dest, bool userns) {
                 { "/proc/sys", "/proc/sys",      NULL,     NULL,        MS_BIND,                                                   true,  true  },   /* Bind mount first */
                 { NULL,        "/proc/sys",      NULL,     NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true  },   /* Then, make it r/o */
                 { "sysfs",     "/sys",           "sysfs",  NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false },
-                { "tmpfs",     "/sys/fs/cgroup", "tmpfs",  "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,               true,  false },
                 { "tmpfs",     "/dev",           "tmpfs",  "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false },
                 { "tmpfs",     "/dev/shm",       "tmpfs",  "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false },
                 { "tmpfs",     "/run",           "tmpfs",  "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false },
@@ -1381,7 +1409,7 @@ static int mount_custom(const char *dest) {
         return 0;
 }
 
-static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
+static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
         char *to;
         int r;
 
@@ -1409,11 +1437,31 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons
         return 1;
 }
 
-static int mount_cgroup(const char *dest) {
+static int mount_legacy_cgroups(const char *dest) {
         _cleanup_set_free_free_ Set *controllers = NULL;
         const char *cgroup_root;
         int r;
 
+        cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
+
+        /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
+        r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
+        if (r == 0) {
+                _cleanup_free_ char *options = NULL;
+
+                r = tmpfs_patch_options("mode=755", &options);
+                if (r < 0)
+                        return log_oom();
+
+                if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
+                        return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
+        }
+
+        if (cg_unified() > 0)
+                goto skip_controllers;
+
         controllers = set_new(&string_hash_ops);
         if (!controllers)
                 return log_oom();
@@ -1437,7 +1485,7 @@ static int mount_cgroup(const char *dest) {
                 if (r == -EINVAL) {
                         /* Not a symbolic link, but directly a single cgroup hierarchy */
 
-                        r = mount_cgroup_hierarchy(dest, controller, controller, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
                         if (r < 0)
                                 return r;
 
@@ -1457,7 +1505,7 @@ static int mount_cgroup(const char *dest) {
                                 continue;
                         }
 
-                        r = mount_cgroup_hierarchy(dest, combined, combined, true);
+                        r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
                         if (r < 0)
                                 return r;
 
@@ -1471,17 +1519,52 @@ static int mount_cgroup(const char *dest) {
                 }
         }
 
-        r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
+skip_controllers:
+        r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
         if (r < 0)
                 return r;
 
-        cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
 
         return 0;
 }
 
+static int mount_unified_cgroups(const char *dest) {
+        const char *p;
+        int r;
+
+        assert(dest);
+
+        p = strjoina(dest, "/sys/fs/cgroup");
+
+        r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
+        if (r > 0) {
+                p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs");
+                if (access(p, F_OK) >= 0)
+                        return 0;
+                if (errno != ENOENT)
+                        return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
+
+                log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
+                return -EINVAL;
+        }
+
+        if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
+                return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
+
+        return 0;
+}
+
+static int mount_cgroups(const char *dest) {
+        if (arg_unified_cgroup_hierarchy)
+                return mount_unified_cgroups(dest);
+        else
+                return mount_legacy_cgroups(dest);
+}
+
 static int mount_systemd_cgroup_writable(const char *dest) {
         _cleanup_free_ char *own_cgroup_path = NULL;
         const char *systemd_root, *systemd_own;
@@ -1493,13 +1576,23 @@ static int mount_systemd_cgroup_writable(const char *dest) {
         if (r < 0)
                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
 
+        /* If we are living in the top-level, then there's nothing to do... */
+        if (path_equal(own_cgroup_path, "/"))
+                return 0;
+
+        if (arg_unified_cgroup_hierarchy) {
+                systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
+                systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
+        } else {
+                systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
+                systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
+        }
+
         /* Make our own cgroup a (writable) bind mount */
-        systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
 
         /* And then remount the systemd cgroup root read-only */
-        systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
 
@@ -4187,6 +4280,8 @@ static int inner_child(
         assert(directory);
         assert(kmsg_socket >= 0);
 
+        cg_unified_flush();
+
         if (arg_userns) {
                 /* Tell the parent, that it now can write the UID map. */
                 (void) barrier_place(barrier); /* #1 */
@@ -4368,6 +4463,8 @@ static int outer_child(
         assert(pid_socket >= 0);
         assert(kmsg_socket >= 0);
 
+        cg_unified_flush();
+
         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
                 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
 
@@ -4484,7 +4581,7 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        r = mount_cgroup(directory);
+        r = mount_cgroups(directory);
         if (r < 0)
                 return r;
 
@@ -4499,7 +4596,6 @@ static int outer_child(
                         NULL);
         if (pid < 0)
                 return log_error_errno(errno, "Failed to fork inner child: %m");
-
         if (pid == 0) {
                 pid_socket = safe_close(pid_socket);
                 uid_shift_socket = safe_close(uid_shift_socket);
@@ -4567,9 +4663,112 @@ static int chown_cgroup(pid_t pid) {
         if (fd < 0)
                 return log_error_errno(errno, "Failed to open %s: %m", fs);
 
-        FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
+        FOREACH_STRING(fn,
+                       ".",
+                       "tasks",
+                       "notify_on_release",
+                       "cgroup.procs",
+                       "cgroup.clone_children",
+                       "cgroup.controllers",
+                       "cgroup.subtree_control",
+                       "cgroup.populated")
                 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
-                        log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
+                        log_full_errno(errno == ENOENT ? LOG_DEBUG :  LOG_WARNING, errno,
+                                       "Failed to chown() cgroup file %s, ignoring: %m", fn);
+
+        return 0;
+}
+
+static int sync_cgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup = NULL;
+        char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
+        bool undo_mount = false;
+        const char *fn;
+        int unified, r;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+
+        if ((unified > 0) == arg_unified_cgroup_hierarchy)
+                return 0;
+
+        /* When the host uses the legacy cgroup setup, but the
+         * container shall use the unified hierarchy, let's make sure
+         * we copy the path from the name=systemd hierarchy into the
+         * unified hierarchy. Similar for the reverse situation. */
+
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
+
+        /* In order to access the unified hierarchy we need to mount it */
+        if (!mkdtemp(tree))
+                return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
+
+        if (unified)
+                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+        else
+                r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
+        if (r < 0) {
+                r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
+                goto finish;
+        }
+
+        undo_mount = true;
+
+        fn = strjoina(tree, cgroup, "/cgroup.procs");
+        (void) mkdir_parents(fn, 0755);
+
+        sprintf(pid_string, PID_FMT, pid);
+        r = write_string_file(fn, pid_string, 0);
+        if (r < 0)
+                log_error_errno(r, "Failed to move process: %m");
+
+finish:
+        if (undo_mount)
+                (void) umount(tree);
+
+        (void) rmdir(tree);
+        return r;
+}
+
+static int create_subcgroup(pid_t pid) {
+        _cleanup_free_ char *cgroup = NULL;
+        const char *child;
+        int unified, r;
+
+        /* In the unified hierarchy inner nodes may only contain
+         * subgroups, but not processes. Hence, if we are running in the
+         * unified hierarchy and the container does the same, and we
+         * did not create a scope unit for the container, move us and
+         * the container into two separate subcgroups. */
+
+        if (!arg_keep_unit)
+                return 0;
+
+        if (!arg_unified_cgroup_hierarchy)
+                return 0;
+
+        unified = cg_unified();
+        if (unified < 0)
+                return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
+        if (unified == 0)
+                return 0;
+
+        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get our control group: %m");
+
+        child = strjoina(cgroup, "/payload");
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
+
+        child = strjoina(cgroup, "/supervisor");
+        r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
 
         return 0;
 }
@@ -4976,6 +5175,14 @@ int main(int argc, char *argv[]) {
                 if (r < 0)
                         goto finish;
 
+                r = sync_cgroup(pid);
+                if (r < 0)
+                        goto finish;
+
+                r = create_subcgroup(pid);
+                if (r < 0)
+                        goto finish;
+
                 r = chown_cgroup(pid);
                 if (r < 0)
                         goto finish;
author	Lennart Poettering <lennart@poettering.net>	2015-09-01 19:22:36 +0200
committer	Lennart Poettering <lennart@poettering.net>	2015-09-01 23:52:27 +0200
commit	efdb02375beb0a940c3320865572913780b4d7de (patch)
tree	bffddfbb0344c1d7c2e1853f36b0acf3f1624d64 /src/nspawn
parent	Merge pull request #1107 from msekletar/selinux-get-raw-context (diff)
download	systemd-efdb02375beb0a940c3320865572913780b4d7de.tar.xz systemd-efdb02375beb0a940c3320865572913780b4d7de.zip