From aaf7eb81be912e7bed939f31e3bc4c631b2552b3 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 1 Apr 2013 22:48:40 +0200 Subject: shutdown: correctly wait for processes we killed in the killall spree Previously we simply counted how many processes we killed and expected as many waitpid() calls to succeed. That however is incorrect to do. As we might kill processes that are not our immediate children, and as there might be left-over processes in the waitpid() queue from earlier the we might get more ore less waitpid() events that we expect. Hence: keep precise track of the processes we kill, remove the ones we get waitpid() for, and after each time we get SIGCHLD check if all others still exist. We use getpgid() to check if a PID still exists. This should fix issues with journald not setting journal files offline correctly on shutdown, because we'd too quickly proceed from SIGTERM to SIGKILL because some left-over process was in our waitpid() queue. --- src/core/killall.c | 85 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 27 deletions(-) (limited to 'src/core/killall.c') diff --git a/src/core/killall.c b/src/core/killall.c index 1eb3766f77..7f0dbb9edf 100644 --- a/src/core/killall.c +++ b/src/core/killall.c @@ -22,12 +22,14 @@ #include #include #include +#include #include "util.h" #include "def.h" #include "killall.h" +#include "set.h" -#define TIMEOUT_USEC (5 * USEC_PER_SEC) +#define TIMEOUT_USEC (10 * USEC_PER_SEC) static bool ignore_proc(pid_t pid) { char buf[PATH_MAX]; @@ -73,38 +75,68 @@ static bool ignore_proc(pid_t pid) { return false; } -static void wait_for_children(int n_processes, sigset_t *mask) { +static void wait_for_children(Set *pids, sigset_t *mask) { usec_t until; assert(mask); + if (set_isempty(pids)) + return; + until = now(CLOCK_MONOTONIC) + TIMEOUT_USEC; for (;;) { struct timespec ts; int k; usec_t n; + void *p; + Iterator i; + /* First, let the kernel inform us about killed + * children. Most processes will probably be our + * children, but some are not (might be our + * grandchildren instead...). */ for (;;) { - pid_t pid = waitpid(-1, NULL, WNOHANG); + pid_t pid; + pid = waitpid(-1, NULL, WNOHANG); if (pid == 0) break; + if (pid < 0) { + if (errno == ECHILD) + break; - if (pid < 0 && errno == ECHILD) + log_error("waitpid() failed: %m"); return; + } + + set_remove(pids, ULONG_TO_PTR(pid)); + } - if (n_processes > 0) - if (--n_processes == 0) - return; + /* Now explicitly check who might be remaining, who + * might not be our child. */ + SET_FOREACH(p, pids, i) { + + /* We misuse getpgid as a check whether a + * process still exists. */ + if (getpgid((pid_t) PTR_TO_ULONG(p)) >= 0) + continue; + + if (errno != ESRCH) + continue; + + set_remove(pids, p); } + if (set_isempty(pids)) + return; + n = now(CLOCK_MONOTONIC); if (n >= until) return; timespec_store(&ts, until - n); - - if ((k = sigtimedwait(mask, NULL, &ts)) != SIGCHLD) { + k = sigtimedwait(mask, NULL, &ts); + if (k != SIGCHLD) { if (k < 0 && errno != EAGAIN) { log_error("sigtimedwait() failed: %m"); @@ -117,10 +149,9 @@ static void wait_for_children(int n_processes, sigset_t *mask) { } } -static int killall(int sig) { - DIR *dir; +static int killall(int sig, Set *pids) { + _cleanup_closedir_ DIR *dir = NULL; struct dirent *d; - unsigned int n_processes = 0; dir = opendir("/proc"); if (!dir) @@ -143,23 +174,25 @@ static int killall(int sig) { _cleanup_free_ char *s; get_process_comm(pid, &s); - log_notice("Sending SIGKILL to PID %lu (%s)", (unsigned long) pid, strna(s)); + log_notice("Sending SIGKILL to PID %lu (%s).", (unsigned long) pid, strna(s)); } - if (kill(pid, sig) >= 0) - n_processes++; - else if (errno != ENOENT) + if (kill(pid, sig) >= 0) { + if (pids) + set_put(pids, ULONG_TO_PTR((unsigned long) pid)); + } else if (errno != ENOENT) log_warning("Could not kill %d: %m", pid); } - closedir(dir); - - return n_processes; + return set_size(pids); } void broadcast_signal(int sig, bool wait_for_exit) { sigset_t mask, oldmask; - int n_processes; + Set *pids; + + if (wait_for_exit) + pids = set_new(trivial_hash_func, trivial_compare_func); assert_se(sigemptyset(&mask) == 0); assert_se(sigaddset(&mask, SIGCHLD) == 0); @@ -168,17 +201,15 @@ void broadcast_signal(int sig, bool wait_for_exit) { if (kill(-1, SIGSTOP) < 0 && errno != ESRCH) log_warning("kill(-1, SIGSTOP) failed: %m"); - n_processes = killall(sig); + killall(sig, pids); if (kill(-1, SIGCONT) < 0 && errno != ESRCH) log_warning("kill(-1, SIGCONT) failed: %m"); - if (n_processes <= 0) - goto finish; - if (wait_for_exit) - wait_for_children(n_processes, &mask); + wait_for_children(pids, &mask); + + assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) == 0); -finish: - sigprocmask(SIG_SETMASK, &oldmask, NULL); + set_free(pids); } -- cgit v1.2.3