summaryrefslogtreecommitdiffstats
path: root/src/libsystemd
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2019-10-30 17:22:49 +0100
committerLennart Poettering <lennart@poettering.net>2019-12-04 10:34:41 +0100
commitf8f3f9263e51db180bd78a5f3b152aefd25427ee (patch)
tree8f061092101c39a9d0ecd82e511e95400748d164 /src/libsystemd
parentprocess-util: add helper pidfd_get_pid() (diff)
downloadsystemd-f8f3f9263e51db180bd78a5f3b152aefd25427ee.tar.xz
systemd-f8f3f9263e51db180bd78a5f3b152aefd25427ee.zip
sd-event: add pidfd support
This adds support for watching for process exits via Linux new pidfd concept. This makes watching processes and killing them race-free if properly used, fixing a long-standing UNIX misdesign. This patch adds implicit and explicit pidfd support to sd-event: if a process shall be watched and is specified by PID we will now internally create a pidfd for it and use that, if available. Alternatively a new constructor for child process event sources is added that takes pidfds as input. Besides mere watching of child processes via pidfd two additional features are added: → sd_event_source_send_child_signal() allows sending a signal to the process being watched in the safest way possible (wrapping the new pidfd_send_signal() syscall). → sd_event_source_set_child_process_own() allows marking a process watched for destruction as soon as the event source is freed. This is currently implemented in userspace, but hopefully will become a kernel feature eventually. Altogether this means an sd_event_source object is now a safe and stable concept for referencing processes in race-free way, with automatic fallback to pre-pidfd kernels. Note that this patch adds support for this only to sd-event, not to PID 1. That's because PID 1 needs to use waitid(P_ALL) for reaping any process that might get reparented to it. This currently semantically conflicts with pidfd use for watching processes since we P_ALL is undirected and thus might reap process earlier than the pidfd notifies process end, which is hard to handle. The kernel will likely gain a concept for excluding specific pidfds from P_ALL watching, as soon as that is around we can start making use of this in PID 1 too.
Diffstat (limited to 'src/libsystemd')
-rw-r--r--src/libsystemd/libsystemd.sym11
-rw-r--r--src/libsystemd/sd-event/event-source.h8
-rw-r--r--src/libsystemd/sd-event/sd-event.c414
3 files changed, 412 insertions, 21 deletions
diff --git a/src/libsystemd/libsystemd.sym b/src/libsystemd/libsystemd.sym
index 5ec42e0f1f..bed81bf173 100644
--- a/src/libsystemd/libsystemd.sym
+++ b/src/libsystemd/libsystemd.sym
@@ -682,3 +682,14 @@ global:
sd_bus_object_vtable_format;
sd_event_source_disable_unref;
} LIBSYSTEMD_241;
+
+LIBSYSTEMD_245 {
+global:
+ sd_event_add_child_pidfd;
+ sd_event_source_get_child_pidfd;
+ sd_event_source_get_child_pidfd_own;
+ sd_event_source_set_child_pidfd_own;
+ sd_event_source_get_child_process_own;
+ sd_event_source_set_child_process_own;
+ sd_event_source_send_child_signal;
+} LIBSYSTEMD_243;
diff --git a/src/libsystemd/sd-event/event-source.h b/src/libsystemd/sd-event/event-source.h
index 99ab8fc169..08eb9b6a61 100644
--- a/src/libsystemd/sd-event/event-source.h
+++ b/src/libsystemd/sd-event/event-source.h
@@ -34,7 +34,7 @@ typedef enum EventSourceType {
* we know how to dispatch it */
typedef enum WakeupType {
WAKEUP_NONE,
- WAKEUP_EVENT_SOURCE,
+ WAKEUP_EVENT_SOURCE, /* either I/O or pidfd wakeup */
WAKEUP_CLOCK_DATA,
WAKEUP_SIGNAL_DATA,
WAKEUP_INOTIFY_DATA,
@@ -96,6 +96,12 @@ struct sd_event_source {
siginfo_t siginfo;
pid_t pid;
int options;
+ int pidfd;
+ bool registered:1; /* whether the pidfd is registered in the epoll */
+ bool pidfd_owned:1; /* close pidfd when event source is freed */
+ bool process_owned:1; /* kill+reap process when event source is freed */
+ bool exited:1; /* true if process exited (i.e. if there's value in SIGKILLing it if we want to get rid of it) */
+ bool waited:1; /* true if process was waited for (i.e. if there's value in waitid(P_PID)'ing it if we want to get rid of it) */
} child;
struct {
sd_event_handler_t callback;
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index e348fb699d..4693049f4c 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -9,6 +9,7 @@
#include "sd-id128.h"
#include "alloc-util.h"
+#include "env-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "fs-util.h"
@@ -28,6 +29,14 @@
#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
+static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
+ /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
+ return s &&
+ s->type == SOURCE_CHILD &&
+ s->child.pidfd >= 0 &&
+ s->child.options == WEXITED;
+}
+
static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
[SOURCE_IO] = "io",
[SOURCE_TIME_REALTIME] = "realtime",
@@ -401,6 +410,51 @@ static int source_io_register(
return 0;
}
+static void source_child_pidfd_unregister(sd_event_source *s) {
+ assert(s);
+ assert(s->type == SOURCE_CHILD);
+
+ if (event_pid_changed(s->event))
+ return;
+
+ if (!s->child.registered)
+ return;
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s))
+ if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
+ log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll: %m",
+ strna(s->description), event_source_type_to_string(s->type));
+
+ s->child.registered = false;
+}
+
+static int source_child_pidfd_register(sd_event_source *s, int enabled) {
+ int r;
+
+ assert(s);
+ assert(s->type == SOURCE_CHILD);
+ assert(enabled != SD_EVENT_OFF);
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+ struct epoll_event ev;
+
+ ev = (struct epoll_event) {
+ .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
+ .data.ptr = s,
+ };
+
+ if (s->child.registered)
+ r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_MOD, s->child.pidfd, &ev);
+ else
+ r = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_ADD, s->child.pidfd, &ev);
+ if (r < 0)
+ return -errno;
+ }
+
+ s->child.registered = true;
+ return 0;
+}
+
static clockid_t event_source_type_to_clock(EventSourceType t) {
switch (t) {
@@ -611,9 +665,8 @@ static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig)
assert(e);
- /* Rechecks if the specified signal is still something we are
- * interested in. If not, we'll unmask it, and possibly drop
- * the signalfd for it. */
+ /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
+ * and possibly drop the signalfd for it. */
if (sig == SIGCHLD &&
e->n_enabled_child_sources > 0)
@@ -704,9 +757,13 @@ static void source_disconnect(sd_event_source *s) {
}
(void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
- event_gc_signal_data(s->event, &s->priority, SIGCHLD);
}
+ if (EVENT_SOURCE_WATCH_PIDFD(s))
+ source_child_pidfd_unregister(s);
+ else
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+
break;
case SOURCE_DEFER:
@@ -787,6 +844,44 @@ static void source_free(sd_event_source *s) {
if (s->type == SOURCE_IO && s->io.owned)
s->io.fd = safe_close(s->io.fd);
+ if (s->type == SOURCE_CHILD) {
+ /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
+
+ if (s->child.process_owned) {
+
+ if (!s->child.exited) {
+ bool sent = false;
+
+ if (s->child.pidfd >= 0) {
+ if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
+ if (errno == ESRCH) /* Already dead */
+ sent = true;
+ else if (!ERRNO_IS_NOT_SUPPORTED(errno))
+ log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
+ s->child.pid);
+ } else
+ sent = true;
+ }
+
+ if (!sent)
+ if (kill(s->child.pid, SIGKILL) < 0)
+ if (errno != ESRCH) /* Already dead */
+ log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
+ s->child.pid);
+ }
+
+ if (!s->child.waited) {
+ siginfo_t si = {};
+
+ /* Reap the child if we can */
+ (void) waitid(P_PID, s->child.pid, &si, WEXITED);
+ }
+ }
+
+ if (s->child.pidfd_owned)
+ s->child.pidfd = safe_close(s->child.pidfd);
+ }
+
if (s->destroy_callback)
s->destroy_callback(s->userdata);
@@ -1121,6 +1216,11 @@ _public_ int sd_event_add_signal(
return 0;
}
+static bool shall_use_pidfd(void) {
+ /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
+ return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
+}
+
_public_ int sd_event_add_child(
sd_event *e,
sd_event_source **ret,
@@ -1152,30 +1252,136 @@ _public_ int sd_event_add_child(
if (!s)
return -ENOMEM;
+ s->wakeup = WAKEUP_EVENT_SOURCE;
s->child.pid = pid;
s->child.options = options;
s->child.callback = callback;
s->userdata = userdata;
s->enabled = SD_EVENT_ONESHOT;
+ /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
+ * pin the PID, and make regular waitid() handling race-free. */
+
+ if (shall_use_pidfd()) {
+ s->child.pidfd = pidfd_open(s->child.pid, 0);
+ if (s->child.pidfd < 0) {
+ /* Propagate errors unless the syscall is not supported or blocked */
+ if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+ return -errno;
+ } else
+ s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
+ } else
+ s->child.pidfd = -1;
+
r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
if (r < 0)
return r;
e->n_enabled_child_sources++;
- r = event_make_signal_data(e, SIGCHLD, NULL);
- if (r < 0) {
- e->n_enabled_child_sources--;
- return r;
- }
+ if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+ /* We have a pidfd and we only want to watch for exit */
- e->need_process_child = true;
+ r = source_child_pidfd_register(s, s->enabled);
+ if (r < 0) {
+ e->n_enabled_child_sources--;
+ return r;
+ }
+ } else {
+ /* We have no pidfd or we shall wait for some other event than WEXITED */
+
+ r = event_make_signal_data(e, SIGCHLD, NULL);
+ if (r < 0) {
+ e->n_enabled_child_sources--;
+ return r;
+ }
+
+ e->need_process_child = true;
+ }
if (ret)
*ret = s;
+
TAKE_PTR(s);
+ return 0;
+}
+
+_public_ int sd_event_add_child_pidfd(
+ sd_event *e,
+ sd_event_source **ret,
+ int pidfd,
+ int options,
+ sd_event_child_handler_t callback,
+ void *userdata) {
+
+
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ pid_t pid;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(pidfd >= 0, -EBADF);
+ assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
+ assert_return(options != 0, -EINVAL);
+ assert_return(callback, -EINVAL);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ r = hashmap_ensure_allocated(&e->child_sources, NULL);
+ if (r < 0)
+ return r;
+
+ r = pidfd_get_pid(pidfd, &pid);
+ if (r < 0)
+ return r;
+
+ if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
+ return -EBUSY;
+
+ s = source_new(e, !ret, SOURCE_CHILD);
+ if (!s)
+ return -ENOMEM;
+
+ s->wakeup = WAKEUP_EVENT_SOURCE;
+ s->child.pidfd = pidfd;
+ s->child.pid = pid;
+ s->child.options = options;
+ s->child.callback = callback;
+ s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
+ s->userdata = userdata;
+ s->enabled = SD_EVENT_ONESHOT;
+
+ r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
+ if (r < 0)
+ return r;
+
+ e->n_enabled_child_sources++;
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+ /* We only want to watch for WEXITED */
+
+ r = source_child_pidfd_register(s, s->enabled);
+ if (r < 0) {
+ e->n_enabled_child_sources--;
+ return r;
+ }
+ } else {
+ /* We shall wait for some other event than WEXITED */
+
+ r = event_make_signal_data(e, SIGCHLD, NULL);
+ if (r < 0) {
+ e->n_enabled_child_sources--;
+ return r;
+ }
+ e->need_process_child = true;
+ }
+
+ if (ret)
+ *ret = s;
+
+ TAKE_PTR(s);
return 0;
}
@@ -2023,7 +2229,11 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
assert(s->event->n_enabled_child_sources > 0);
s->event->n_enabled_child_sources--;
- event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+ if (EVENT_SOURCE_WATCH_PIDFD(s))
+ source_child_pidfd_unregister(s);
+ else
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+
break;
case SOURCE_EXIT:
@@ -2097,12 +2307,25 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
s->enabled = m;
- r = event_make_signal_data(s->event, SIGCHLD, NULL);
- if (r < 0) {
- s->enabled = SD_EVENT_OFF;
- s->event->n_enabled_child_sources--;
- event_gc_signal_data(s->event, &s->priority, SIGCHLD);
- return r;
+ if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+ /* yes, we have pidfd */
+
+ r = source_child_pidfd_register(s, s->enabled);
+ if (r < 0) {
+ s->enabled = SD_EVENT_OFF;
+ s->event->n_enabled_child_sources--;
+ return r;
+ }
+ } else {
+ /* no pidfd, or something other to watch for than WEXITED */
+
+ r = event_make_signal_data(s->event, SIGCHLD, NULL);
+ if (r < 0) {
+ s->enabled = SD_EVENT_OFF;
+ s->event->n_enabled_child_sources--;
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+ return r;
+ }
}
break;
@@ -2225,6 +2448,98 @@ _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
return 0;
}
+_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ if (s->child.pidfd < 0)
+ return -EOPNOTSUPP;
+
+ return s->child.pidfd;
+}
+
+_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+ assert_return(SIGNAL_VALID(sig), -EINVAL);
+
+ /* If we already have seen indication the process exited refuse sending a signal early. This way we
+ * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
+ * available. */
+ if (s->child.exited)
+ return -ESRCH;
+
+ if (s->child.pidfd >= 0) {
+ siginfo_t copy;
+
+ /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
+ * structure here */
+ if (si)
+ copy = *si;
+
+ if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
+ /* Let's propagate the error only if the system call is not implemented or prohibited */
+ if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+ return -errno;
+ } else
+ return 0;
+ }
+
+ /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
+ * this here. */
+ if (flags != 0)
+ return -EOPNOTSUPP;
+
+ if (si) {
+ /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
+ siginfo_t copy = *si;
+
+ if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
+ return -errno;
+ } else if (kill(s->child.pid, sig) < 0)
+ return -errno;
+
+ return 0;
+}
+
+_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+ if (s->child.pidfd < 0)
+ return -EOPNOTSUPP;
+
+ return s->child.pidfd_owned;
+}
+
+_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+ if (s->child.pidfd < 0)
+ return -EOPNOTSUPP;
+
+ s->child.pidfd_owned = own;
+ return 0;
+}
+
+_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+ return s->child.process_owned;
+}
+
+_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+ s->child.process_owned = own;
+ return 0;
+}
+
_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
assert_return(s, -EINVAL);
assert_return(mask, -EINVAL);
@@ -2535,6 +2850,12 @@ static int process_child(sd_event *e) {
if (s->enabled == SD_EVENT_OFF)
continue;
+ if (s->child.exited)
+ continue;
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s)) /* There's a usable pidfd known for this event source? then don't waitid() for it here */
+ continue;
+
zero(s->child.siginfo);
r = waitid(P_PID, s->child.pid, &s->child.siginfo,
WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options);
@@ -2544,6 +2865,9 @@ static int process_child(sd_event *e) {
if (s->child.siginfo.si_pid != 0) {
bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
+ if (zombie)
+ s->child.exited = true;
+
if (!zombie && (s->child.options & WEXITED)) {
/* If the child isn't dead then let's
* immediately remove the state change
@@ -2563,6 +2887,33 @@ static int process_child(sd_event *e) {
return 0;
}
+static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
+ assert(e);
+ assert(s);
+ assert(s->type == SOURCE_CHILD);
+
+ if (s->pending)
+ return 0;
+
+ if (s->enabled == SD_EVENT_OFF)
+ return 0;
+
+ if (!EVENT_SOURCE_WATCH_PIDFD(s))
+ return 0;
+
+ zero(s->child.siginfo);
+ if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
+ return -errno;
+
+ if (s->child.siginfo.si_pid == 0)
+ return 0;
+
+ if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
+ s->child.exited = true;
+
+ return source_set_pending(s, true);
+}
+
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
bool read_one = false;
int r;
@@ -2847,8 +3198,10 @@ static int source_dispatch(sd_event_source *s) {
r = s->child.callback(s, &s->child.siginfo, s->userdata);
/* Now, reap the PID for good. */
- if (zombie)
+ if (zombie) {
(void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
+ s->child.waited = true;
+ }
break;
}
@@ -3144,12 +3497,33 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
switch (*t) {
- case WAKEUP_EVENT_SOURCE:
- r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events);
+ case WAKEUP_EVENT_SOURCE: {
+ sd_event_source *s = ev_queue[i].data.ptr;
+
+ assert(s);
+
+ switch (s->type) {
+
+ case SOURCE_IO:
+ r = process_io(e, s, ev_queue[i].events);
+ break;
+
+ case SOURCE_CHILD:
+ r = process_pidfd(e, s, ev_queue[i].events);
+ break;
+
+ default:
+ assert_not_reached("Unexpected event source type");
+ }
+
break;
+ }
case WAKEUP_CLOCK_DATA: {
struct clock_data *d = ev_queue[i].data.ptr;
+
+ assert(d);
+
r = flush_timer(e, d->fd, ev_queue[i].events, &d->next);
break;
}