summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2017-09-26 22:43:08 +0200
committerLennart Poettering <lennart@poettering.net>2017-09-27 18:26:18 +0200
commit09e2465407e6179d3833a68efd5ddb71bb3c3b23 (patch)
treed7f0d8c4cfea65217bd3a8a951004aecd0cbc831
parentcore: rename cgroup_queue → cgroup_realize_queue (diff)
downloadsystemd-09e2465407e6179d3833a68efd5ddb71bb3c3b23.tar.xz
systemd-09e2465407e6179d3833a68efd5ddb71bb3c3b23.zip
cgroup: after determining that a cgroup is empty, asynchronously dispatch this
This makes sure that if we learn via inotify or another event source that a cgroup is empty, and we checked that this is indeed the case (as we might get spurious notifications through inotify, as the inotify logic through the "cgroups.event" is pretty unspecific and might be trigger for a variety of reasons), then we'll enqueue a defer event for it, at a priority lower than SIGCHLD handling, so that we know for sure that if there's waitid() data for a process we used it before considering the cgroup empty notification. Fixes: #6608
-rw-r--r--src/core/cgroup.c113
-rw-r--r--src/core/cgroup.h3
-rw-r--r--src/core/manager.c2
-rw-r--r--src/core/manager.h9
-rw-r--r--src/core/service.c2
-rw-r--r--src/core/unit.c3
-rw-r--r--src/core/unit.h4
7 files changed, 112 insertions, 24 deletions
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index e5c7c7de61..c68ef5372c 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -1794,17 +1794,28 @@ int unit_watch_all_pids(Unit *u) {
return unit_watch_pids_in_path(u, u->cgroup_path);
}
-int unit_notify_cgroup_empty(Unit *u) {
+static int on_cgroup_empty_event(sd_event_source *s, void *userdata) {
+ Manager *m = userdata;
+ Unit *u;
int r;
- assert(u);
+ assert(s);
+ assert(m);
- if (!u->cgroup_path)
+ u = m->cgroup_empty_queue;
+ if (!u)
return 0;
- r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
- if (r <= 0)
- return r;
+ assert(u->in_cgroup_empty_queue);
+ u->in_cgroup_empty_queue = false;
+ LIST_REMOVE(cgroup_empty_queue, m->cgroup_empty_queue, u);
+
+ if (m->cgroup_empty_queue) {
+ /* More stuff queued, let's make sure we remain enabled */
+ r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
+ if (r < 0)
+ log_debug_errno(r, "Failed to reenable cgroup empty event source: %m");
+ }
unit_add_to_gc_queue(u);
@@ -1814,6 +1825,51 @@ int unit_notify_cgroup_empty(Unit *u) {
return 0;
}
+void unit_add_to_cgroup_empty_queue(Unit *u) {
+ int r;
+
+ assert(u);
+
+ /* Note that there are four different ways how cgroup empty events reach us:
+ *
+ * 1. On the unified hierarchy we get an inotify event on the cgroup
+ *
+ * 2. On the legacy hierarchy, when running in system mode, we get a datagram on the cgroup agent socket
+ *
+ * 3. On the legacy hierarchy, when running in user mode, we get a D-Bus signal on the system bus
+ *
+ * 4. On the legacy hierarchy, in service units we start watching all processes of the cgroup for SIGCHLD as
+ * soon as we get one SIGCHLD, to deal with unreliable cgroup notifications.
+ *
+ * Regardless which way we got the notification, we'll verify it here, and then add it to a separate
+ * queue. This queue will be dispatched at a lower priority than the SIGCHLD handler, so that we always use
+ * SIGCHLD if we can get it first, and only use the cgroup empty notifications if there's no SIGCHLD pending
+ * (which might happen if the cgroup doesn't contain processes that are our own child, which is typically the
+ * case for scope units). */
+
+ if (u->in_cgroup_empty_queue)
+ return;
+
+ /* Let's verify that the cgroup is really empty */
+ if (!u->cgroup_path)
+ return;
+ r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
+ if (r < 0) {
+ log_unit_debug_errno(u, r, "Failed to determine whether cgroup %s is empty: %m", u->cgroup_path);
+ return;
+ }
+ if (r == 0)
+ return;
+
+ LIST_PREPEND(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+ u->in_cgroup_empty_queue = true;
+
+ /* Trigger the defer event */
+ r = sd_event_source_set_enabled(u->manager->cgroup_empty_event_source, SD_EVENT_ONESHOT);
+ if (r < 0)
+ log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
+}
+
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
Manager *m = userdata;
@@ -1853,7 +1909,7 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents,
* this here safely. */
continue;
- (void) unit_notify_cgroup_empty(u);
+ unit_add_to_cgroup_empty_queue(u);
}
}
}
@@ -1918,11 +1974,26 @@ int manager_setup_cgroup(Manager *m) {
log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path);
}
- /* 3. Install agent */
+ /* 3. Allocate cgroup empty defer event source */
+ m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
+ r = sd_event_add_defer(m->event, &m->cgroup_empty_event_source, on_cgroup_empty_event, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create cgroup empty event source: %m");
+
+ r = sd_event_source_set_priority(m->cgroup_empty_event_source, SD_EVENT_PRIORITY_NORMAL-5);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of cgroup empty event source: %m");
+
+ r = sd_event_source_set_enabled(m->cgroup_empty_event_source, SD_EVENT_OFF);
+ if (r < 0)
+ return log_error_errno(r, "Failed to disable cgroup empty event source: %m");
+
+ (void) sd_event_source_set_description(m->cgroup_empty_event_source, "cgroup-empty");
+
+ /* 4. Install notifier inotify object, or agent */
if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
- /* In the unified hierarchy we can get
- * cgroup empty notifications via inotify. */
+ /* In the unified hierarchy we can get cgroup empty notifications via inotify. */
m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
safe_close(m->cgroup_inotify_fd);
@@ -1937,7 +2008,7 @@ int manager_setup_cgroup(Manager *m) {
/* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also
* see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */
- r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5);
+ r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-4);
if (r < 0)
return log_error_errno(r, "Failed to set priority of inotify event source: %m");
@@ -1957,33 +2028,31 @@ int manager_setup_cgroup(Manager *m) {
log_debug("Release agent already installed.");
}
- /* 4. Make sure we are in the special "init.scope" unit in the root slice. */
+ /* 5. Make sure we are in the special "init.scope" unit in the root slice. */
scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
if (r < 0)
return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
- /* also, move all other userspace processes remaining
- * in the root cgroup into that scope. */
+ /* Also, move all other userspace processes remaining in the root cgroup into that scope. */
r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
if (r < 0)
log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
- /* 5. And pin it, so that it cannot be unmounted */
+ /* 6. And pin it, so that it cannot be unmounted */
safe_close(m->pin_cgroupfs_fd);
m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
if (m->pin_cgroupfs_fd < 0)
return log_error_errno(errno, "Failed to open pin file: %m");
- /* 6. Always enable hierarchical support if it exists... */
+ /* 7. Always enable hierarchical support if it exists... */
if (!all_unified && m->test_run_flags == 0)
(void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
- /* 7. Figure out which controllers are supported */
+ /* 8. Figure out which controllers are supported, and log about it */
r = cg_mask_supported(&m->cgroup_supported);
if (r < 0)
return log_error_errno(r, "Failed to determine supported controllers: %m");
-
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
@@ -1998,6 +2067,8 @@ void manager_shutdown_cgroup(Manager *m, bool delete) {
if (delete && m->cgroup_root)
(void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
+ m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
+
m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
@@ -2079,13 +2150,17 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
assert(m);
assert(cgroup);
+ /* Called on the legacy hierarchy whenever we get an explicit cgroup notification from the cgroup agent process
+ * or from the --system instance */
+
log_debug("Got cgroup empty notification for: %s", cgroup);
u = manager_get_unit_by_cgroup(m, cgroup);
if (!u)
return 0;
- return unit_notify_cgroup_empty(u);
+ unit_add_to_cgroup_empty_queue(u);
+ return 1;
}
int unit_get_memory_current(Unit *u, uint64_t *ret) {
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
index e422708804..65245fbc43 100644
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@@ -172,6 +172,8 @@ void unit_release_cgroup(Unit *u);
void unit_prune_cgroup(Unit *u);
int unit_watch_cgroup(Unit *u);
+void unit_add_to_cgroup_empty_queue(Unit *u);
+
int unit_attach_pids_to_cgroup(Unit *u);
int manager_setup_cgroup(Manager *m);
@@ -200,7 +202,6 @@ int unit_reset_ip_accounting(Unit *u);
cc ? cc->name : false; \
})
-int unit_notify_cgroup_empty(Unit *u);
int manager_notify_cgroup_empty(Manager *m, const char *group);
void unit_invalidate_cgroup(Unit *u, CGroupMask m);
diff --git a/src/core/manager.c b/src/core/manager.c
index da2a8c4715..8ebea8d66b 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -859,7 +859,7 @@ static int manager_setup_cgroups_agent(Manager *m) {
* SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification,
* and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of
* cgroup inotify for the unified cgroup stuff. */
- r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-5);
+ r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-4);
if (r < 0)
return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m");
diff --git a/src/core/manager.h b/src/core/manager.h
index 9941dd10d8..c909916a1e 100644
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -121,6 +121,9 @@ struct Manager {
/* Units that should be realized */
LIST_HEAD(Unit, cgroup_realize_queue);
+ /* Units whose cgroup ran empty */
+ LIST_HEAD(Unit, cgroup_empty_queue);
+
sd_event *event;
/* We use two hash tables here, since the same PID might be
@@ -229,12 +232,14 @@ struct Manager {
CGroupMask cgroup_supported;
char *cgroup_root;
- /* Notifications from cgroups, when the unified hierarchy is
- * used is done via inotify. */
+ /* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */
int cgroup_inotify_fd;
sd_event_source *cgroup_inotify_event_source;
Hashmap *cgroup_inotify_wd_unit;
+ /* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */
+ sd_event_source *cgroup_empty_event_source;
+
/* Make sure the user cannot accidentally unmount our cgroup
* file system */
int pin_cgroupfs_fd;
diff --git a/src/core/service.c b/src/core/service.c
index 38a8010232..47fecfb34f 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -3213,7 +3213,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
/* If the PID set is empty now, then let's finish this off
(On unified we use proper notifications) */
if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) == 0 && set_isempty(u->pids))
- service_notify_cgroup_empty_event(u);
+ unit_add_to_cgroup_empty_queue(u);
}
static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
diff --git a/src/core/unit.c b/src/core/unit.c
index 190c05c98e..7ca08af291 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -591,6 +591,9 @@ void unit_free(Unit *u) {
if (u->in_cgroup_realize_queue)
LIST_REMOVE(cgroup_realize_queue, u->manager->cgroup_realize_queue, u);
+ if (u->in_cgroup_empty_queue)
+ LIST_REMOVE(cgroup_empty_queue, u->manager->cgroup_empty_queue, u);
+
unit_release_cgroup(u);
unit_unref_uid_gid(u, false);
diff --git a/src/core/unit.h b/src/core/unit.h
index da40a433a3..7c6fef27b3 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -168,6 +168,9 @@ struct Unit {
/* CGroup realize members queue */
LIST_FIELDS(Unit, cgroup_realize_queue);
+ /* cgroup empty queue */
+ LIST_FIELDS(Unit, cgroup_empty_queue);
+
/* PIDs we keep an eye on. Note that a unit might have many
* more, but these are the ones we care enough about to
* process SIGCHLD for */
@@ -264,6 +267,7 @@ struct Unit {
bool in_cleanup_queue:1;
bool in_gc_queue:1;
bool in_cgroup_realize_queue:1;
+ bool in_cgroup_empty_queue:1;
bool sent_dbus_new_signal:1;