diff options
-rw-r--r-- | NEWS | 5 | ||||
-rw-r--r-- | man/org.freedesktop.systemd1.xml | 41 | ||||
-rw-r--r-- | man/systemd.exec.xml | 24 | ||||
-rw-r--r-- | src/basic/process-util.c | 5 | ||||
-rw-r--r-- | src/basic/process-util.h | 27 | ||||
-rw-r--r-- | src/core/dbus-execute.c | 23 | ||||
-rw-r--r-- | src/core/exec-invoke.c | 169 | ||||
-rw-r--r-- | src/core/execute-serialize.c | 20 | ||||
-rw-r--r-- | src/core/execute.c | 11 | ||||
-rw-r--r-- | src/core/execute.h | 4 | ||||
-rw-r--r-- | src/core/load-fragment-gperf.gperf.in | 1 | ||||
-rw-r--r-- | src/core/load-fragment.c | 1 | ||||
-rw-r--r-- | src/core/load-fragment.h | 1 | ||||
-rw-r--r-- | src/core/manager.c | 195 | ||||
-rw-r--r-- | src/core/manager.h | 16 | ||||
-rw-r--r-- | src/core/namespace.c | 10 | ||||
-rw-r--r-- | src/core/namespace.h | 11 | ||||
-rw-r--r-- | src/core/service.c | 33 | ||||
-rw-r--r-- | src/core/unit.c | 5 | ||||
-rw-r--r-- | src/core/unit.h | 3 | ||||
-rw-r--r-- | src/shared/bus-unit-util.c | 3 | ||||
-rwxr-xr-x | test/TEST-07-PID1/test.sh | 5 | ||||
-rwxr-xr-x | test/units/TEST-07-PID1.private-pids.sh | 161 |
23 files changed, 741 insertions, 33 deletions
@@ -254,6 +254,11 @@ CHANGES WITH 257 in spe: the "nobody" user to the dynamic user, rather than via recursive chown()ing. + * A new service property PrivatePIDs= has been added that runs executed + processes as PID 1 - the init process - within their own PID namespace. + PrivatePIDs= also mounts /proc/ so only processes within the new PID + namespace are visible. + systemd-udevd: * udev rules now set 'uaccess' for /dev/udmabuf, giving locally diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 7ade8c3e8b..05afb93f9b 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -3263,6 +3263,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateIPC = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s PrivatePIDs = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s ProtectHome = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s ProtectSystem = '...'; @@ -4584,6 +4586,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { <variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/> + <variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/> + <variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/> <variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/> @@ -4870,6 +4874,11 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>. 
Unlike boolean <varname>ProtectControlGroups</varname>, <varname>ProtectControlGroupsEx</varname> is a string type.</para> + + <para><varname>PrivatePIDs</varname> implements the destination parameter of the + unit file setting <varname>PrivatePIDs=</varname> listed in + <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>. + Note <varname>PrivatePIDs</varname> is a string type to allow adding more values in the future.</para> </refsect2> </refsect1> @@ -5439,6 +5448,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateIPC = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s PrivatePIDs = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s ProtectHome = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s ProtectSystem = '...'; @@ -6744,6 +6755,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { <variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/> + <variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/> + <variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/> <variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/> @@ -7442,6 +7455,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateIPC = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s PrivatePIDs = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s ProtectHome = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s ProtectSystem = '...'; @@ -8585,6 +8600,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { <variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/> + <variablelist class="dbus-property" 
generated="True" extra-ref="PrivatePIDs"/> + <variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/> <variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/> @@ -9412,6 +9429,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b PrivateIPC = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s PrivatePIDs = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s ProtectHome = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s ProtectSystem = '...'; @@ -10527,6 +10546,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { <variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/> + <variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/> + <variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/> <variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/> @@ -12281,8 +12302,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ <varname>ExtraFileDescriptorNames</varname>, <varname>ManagedOOMMemoryPressureDurationUSec</varname>, <varname>BindLogSockets</varname>, - <varname>ProtectControlGroupsEx</varname>, and - <varname>PrivateUsersEx</varname> were added in version 257.</para> + <varname>ProtectControlGroupsEx</varname>, + <varname>PrivateUsersEx</varname>, and + <varname>PrivatePIDs</varname> were added in version 257.</para> </refsect2> <refsect2> <title>Socket Unit Objects</title> @@ -12323,8 +12345,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ <varname>ImportCredentialEx</varname>, <varname>BindLogSockets</varname>, <varname>PrivateUsersEx</varname>, - <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and - <varname>ProtectControlGroupsEx</varname> were added in version 257.</para> + <varname>ManagedOOMMemoryPressureDurationUSec</varname>, 
+ <varname>ProtectControlGroupsEx</varname>, and + <varname>PrivatePIDs</varname> were added in version 257.</para> </refsect2> <refsect2> <title>Mount Unit Objects</title> @@ -12362,8 +12385,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ <varname>ImportCredentialEx</varname>, <varname>BindLogSockets</varname>, <varname>PrivateUsersEx</varname>, - <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and - <varname>ProtectControlGroupsEx</varname> were added in version 257.</para> + <varname>ManagedOOMMemoryPressureDurationUSec</varname>, + <varname>ProtectControlGroupsEx</varname>, and + <varname>PrivatePIDs</varname> were added in version 257.</para> </refsect2> <refsect2> <title>Swap Unit Objects</title> @@ -12401,8 +12425,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ <varname>ImportCredentialEx</varname>, <varname>BindLogSockets</varname>, <varname>PrivateUsersEx</varname>, - <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and - <varname>ProtectControlGroupsEx</varname> were added in version 257.</para> + <varname>ManagedOOMMemoryPressureDurationUSec</varname>, + <varname>ProtectControlGroupsEx</varname>, and + <varname>PrivatePIDs</varname> were added in version 257.</para> </refsect2> <refsect2> <title>Slice Unit Objects</title> diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 30a926c9a0..b50f70ff42 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1977,6 +1977,30 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting> </varlistentry> <varlistentry> + <term><varname>PrivatePIDs=</varname></term> + + <listitem><para>Takes a boolean argument. Defaults to false. If enabled, sets up a new PID namespace + for the executed processes. Each executed process is now PID 1 - the init process - in the new namespace. + <filename>/proc/</filename> is mounted such that only processes in the PID namespace are visible. 
+ If <varname>PrivatePIDs=</varname> is set, <varname>MountAPIVFS=yes</varname> is implied.</para> + + <para><varname>PrivatePIDs=</varname> is only supported for service units. This setting is not supported + with <varname>Type=forking</varname> since the kernel will kill all processes in the PID namespace if + the init process terminates.</para> + + <para>This setting will be ignored if the kernel does not support PID namespaces.</para> + + <para>Note unprivileged user services (i.e. a service run by the per-user instance of the service manager) + will fail with <varname>PrivatePIDs=yes</varname> if <filename>/proc/</filename> is masked + (i.e. <filename>/proc/kmsg</filename> is over-mounted with <constant>tmpfs</constant> like + <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> does). + This is due to a kernel restriction not allowing unprivileged user namespaces to mount a less restrictive + instance of <filename>/proc/</filename>.</para> + + <xi:include href="version-info.xml" xpointer="v257"/></listitem> + </varlistentry> + + <varlistentry> <term><varname>PrivateUsers=</varname></term> <listitem><para>Takes a boolean argument or one of <literal>self</literal> or diff --git a/src/basic/process-util.c b/src/basic/process-util.c index a85a1b35f0..75bc65652e 100644 --- a/src/basic/process-util.c +++ b/src/basic/process-util.c @@ -1521,11 +1521,12 @@ int safe_fork_full( } } - if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS)) != 0) + if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS|FORK_NEW_PIDNS)) != 0) pid = raw_clone(SIGCHLD| (FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) | (FLAGS_SET(flags, FORK_NEW_USERNS) ? CLONE_NEWUSER : 0) | - (FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0)); + (FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0) | + (FLAGS_SET(flags, FORK_NEW_PIDNS) ? 
CLONE_NEWPID : 0)); else pid = fork(); if (pid < 0) diff --git a/src/basic/process-util.h b/src/basic/process-util.h index 05b7a69fc6..cb6d47a5bb 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -166,7 +166,7 @@ int must_be_root(void); pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata); -/* 💣 Note that FORK_NEW_USERNS, FORK_NEW_MOUNTNS, or FORK_NEW_NETNS should not be called in threaded +/* 💣 Note that FORK_NEW_USERNS, FORK_NEW_MOUNTNS, FORK_NEW_NETNS or FORK_NEW_PIDNS should not be called in threaded * programs, because they cause us to use raw_clone() which does not synchronize the glibc malloc() locks, * and thus will cause deadlocks if the parent uses threads and the child does memory allocations. Hence: if * the parent is threaded these flags may not be used. These flags cannot be used if the parent uses threads @@ -181,18 +181,19 @@ typedef enum ForkFlags { FORK_REOPEN_LOG = 1 << 6, /* Reopen log connection */ FORK_LOG = 1 << 7, /* Log above LOG_DEBUG log level about failures */ FORK_WAIT = 1 << 8, /* Wait until child exited */ - FORK_NEW_MOUNTNS = 1 << 9, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ - FORK_MOUNTNS_SLAVE = 1 << 10, /* Make child's mount namespace MS_SLAVE */ - FORK_PRIVATE_TMP = 1 << 11, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */ - FORK_RLIMIT_NOFILE_SAFE = 1 << 12, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */ - FORK_STDOUT_TO_STDERR = 1 << 13, /* Make stdout a copy of stderr */ - FORK_FLUSH_STDIO = 1 << 14, /* fflush() stdout (and stderr) before forking */ - FORK_NEW_USERNS = 1 << 15, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ - FORK_CLOEXEC_OFF = 1 << 16, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */ - FORK_KEEP_NOTIFY_SOCKET = 1 << 17, /* Unless this specified, $NOTIFY_SOCKET will be unset. 
*/ - FORK_DETACH = 1 << 18, /* Double fork if needed to ensure PID1/subreaper is parent */ - FORK_NEW_NETNS = 1 << 19, /* Run child in its own network namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ - FORK_PACK_FDS = 1 << 20, /* Rearrange the passed FDs to be FD 3,4,5,etc. Updates the array in place (combine with FORK_CLOSE_ALL_FDS!) */ + FORK_MOUNTNS_SLAVE = 1 << 9, /* Make child's mount namespace MS_SLAVE */ + FORK_PRIVATE_TMP = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */ + FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */ + FORK_STDOUT_TO_STDERR = 1 << 12, /* Make stdout a copy of stderr */ + FORK_FLUSH_STDIO = 1 << 13, /* fflush() stdout (and stderr) before forking */ + FORK_CLOEXEC_OFF = 1 << 14, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */ + FORK_KEEP_NOTIFY_SOCKET = 1 << 15, /* Unless this specified, $NOTIFY_SOCKET will be unset. */ + FORK_DETACH = 1 << 16, /* Double fork if needed to ensure PID1/subreaper is parent */ + FORK_PACK_FDS = 1 << 17, /* Rearrange the passed FDs to be FD 3,4,5,etc. Updates the array in place (combine with FORK_CLOSE_ALL_FDS!) */ + FORK_NEW_MOUNTNS = 1 << 18, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ + FORK_NEW_USERNS = 1 << 19, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ + FORK_NEW_NETNS = 1 << 20, /* Run child in its own network namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */ + FORK_NEW_PIDNS = 1 << 21, /* Run child in its own PID namespace 💣 DO NOT USE IN THREADED PROGRAMS! 
💣 */ } ForkFlags; int safe_fork_full( diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index a9a73b599b..e297323f1d 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -63,6 +63,7 @@ static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL); static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string); static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string); static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_pids, "s", PrivatePIDs, private_pids_to_string); static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI); static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC); static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa); @@ -1194,6 +1195,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_tristate, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateIPC", "b", bus_property_get_bool, offsetof(ExecContext, private_ipc), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivatePIDs", "s", property_get_private_pids, offsetof(ExecContext, private_pids), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, 
same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1970,6 +1972,27 @@ int bus_exec_context_set_transient_property( return 1; } + if (streq(name, "PrivatePIDs")) { + const char *s; + PrivatePIDs t; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + t = private_pids_from_string(s); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->private_pids = t; + (void) unit_write_settingf(u, flags, name, "%s=%s", + name, private_pids_to_string(c->private_pids)); + } + + return 1; + } + if (streq(name, "PrivateDevices")) return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error); diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 72ed53360b..120067a774 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2251,6 +2251,130 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi return 1; } +static int can_mount_proc(const ExecContext *c, ExecParameters *p) { + _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; + _cleanup_(sigkill_waitp) pid_t pid = 0; + ssize_t n; + int r; + + assert(c); + assert(p); + + /* If running via unprivileged user manager and /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs + * like systemd-nspawn does), then mounting /proc/ will fail with EPERM. This is due to a kernel restriction + * where unprivileged user namespaces cannot mount a less restrictive instance of /proc. */ + + /* Create a communication channel so that the child can tell the parent a proper error code in case it + * failed. */ + if (pipe2(errno_pipe, O_CLOEXEC) < 0) + return log_exec_debug_errno(c, p, errno, "Failed to create pipe for communicating with child process (sd-proc-check): %m"); + + /* Fork a child process into its own mount and PID namespace. Note safe_fork() already remounts / as SLAVE + * with FORK_MOUNTNS_SLAVE. 
*/ + r = safe_fork("(sd-proc-check)", + FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_PIDNS, &pid); + if (r < 0) + return log_exec_debug_errno(c, p, r, "Failed to fork child process (sd-proc-check): %m"); + if (r == 0) { + errno_pipe[0] = safe_close(errno_pipe[0]); + + /* Try mounting /proc on /dev/shm/. No need to clean up the mount since the mount + * namespace will be cleaned up once the process exits. */ + r = mount_follow_verbose(LOG_DEBUG, "proc", "/dev/shm/", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); + if (r < 0) { + (void) write(errno_pipe[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + errno_pipe[1] = safe_close(errno_pipe[1]); + + /* Try to read an error code from the child */ + n = read(errno_pipe[0], &r, sizeof(r)); + if (n < 0) + return log_exec_debug_errno(c, p, errno, "Failed to read errno from pipe with child process (sd-proc-check): %m"); + if (n == sizeof(r)) { /* an error code was sent to us */ + /* This is the expected case where proc cannot be mounted due to permissions. 
*/ + if (ERRNO_IS_NEG_PRIVILEGE(r)) + return 0; + if (r < 0) + return r; + + return -EIO; + } + if (n != 0) /* on success we should have read 0 bytes */ + return -EIO; + + r = wait_for_terminate_and_check("(sd-proc-check)", TAKE_PID(pid), 0 /* flags= */); + if (r < 0) + return log_exec_debug_errno(c, p, r, "Failed to wait for (sd-proc-check) child process to terminate: %m"); + if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */ + return log_exec_debug_errno(c, p, SYNTHETIC_ERRNO(EIO), "Child process (sd-proc-check) exited with unexpected exit status '%d'.", r); + + return 1; +} + +static int setup_private_pids(const ExecContext *c, ExecParameters *p) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; + ssize_t n; + int r, q; + + assert(c); + assert(p); + assert(p->pidref_transport_fd >= 0); + + /* The first process created after unsharing a pid namespace becomes PID 1 in the pid namespace, so + * we have to fork after unsharing the pid namespace to become PID 1. The parent sends the child + * pidref to the manager and exits while the child process continues with the rest of exec_invoke() + * and finally executes the actual payload. */ + + /* Create a communication channel so that the parent can tell the child a proper error code in case it + * failed to send child pidref to the manager. */ + if (pipe2(errno_pipe, O_CLOEXEC) < 0) + return log_exec_debug_errno(c, p, errno, "Failed to create pipe for communicating with parent process: %m"); + + r = pidref_safe_fork("(sd-pidns-child)", FORK_NEW_PIDNS, &pidref); + if (r < 0) + return log_exec_debug_errno(c, p, r, "Failed to fork child into new pid namespace: %m"); + if (r > 0) { + errno_pipe[0] = safe_close(errno_pipe[0]); + + /* In the parent process, we send the child pidref to the manager and exit. + * If PIDFD is not supported, only the child PID is sent. 
The server then + * uses the child PID to set the new exec main process. */ + q = send_one_fd_iov( + p->pidref_transport_fd, + pidref.fd, + &IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)), + /*iovlen=*/ 1, + /*flags=*/ 0); + /* Send error code to child process. */ + (void) write(errno_pipe[1], &q, sizeof(q)); + /* Exit here so we only go through the destructors in exec_invoke only once - in the child - as + * some destructors have external effects. The main codepaths continue in the child process. */ + _exit(q < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } + + errno_pipe[1] = safe_close(errno_pipe[1]); + p->pidref_transport_fd = safe_close(p->pidref_transport_fd); + + /* Try to read an error code from the parent. Note a child process cannot wait for the parent so we always + * receive an errno even on success. */ + n = read(errno_pipe[0], &r, sizeof(r)); + if (n < 0) + return log_exec_debug_errno(c, p, errno, "Failed to read errno from pipe with parent process: %m"); + if (n != sizeof(r)) + return log_exec_debug_errno(c, p, SYNTHETIC_ERRNO(EIO), "Failed to read enough bytes from pipe with parent process"); + if (r < 0) + return log_exec_debug_errno(c, p, r, "Failed to send child pidref to manager: %m"); + + /* NOTE! This function returns in the child process only. */ + return r; +} + static int create_many_symlinks(const char *root, const char *source, char **symlinks) { _cleanup_free_ char *src_abs = NULL; int r; @@ -3301,6 +3425,7 @@ static int apply_mount_namespace( .private_dev = needs_sandboxing && context->private_devices, .private_network = needs_sandboxing && exec_needs_network_namespace(context), .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context), + .private_pids = needs_sandboxing && exec_needs_pid_namespace(context) ? context->private_pids : PRIVATE_PIDS_NO, .private_tmp = needs_sandboxing ? 
context->private_tmp : false, .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context), @@ -3573,7 +3698,7 @@ static int close_remaining_fds( const int *fds, size_t n_fds) { size_t n_dont_close = 0; - int dont_close[n_fds + 16]; + int dont_close[n_fds + 17]; assert(params); @@ -3612,6 +3737,9 @@ static int close_remaining_fds( if (params->handoff_timestamp_fd >= 0) dont_close[n_dont_close++] = params->handoff_timestamp_fd; + if (params->pidref_transport_fd >= 0) + dont_close[n_dont_close++] = params->pidref_transport_fd; + assert(n_dont_close <= ELEMENTSOF(dont_close)); return close_all_fds(dont_close, n_dont_close); @@ -3934,6 +4062,7 @@ static bool exec_context_need_unprivileged_private_users( !strv_isempty(context->extension_directories) || context->protect_system != PROTECT_SYSTEM_NO || context->protect_home != PROTECT_HOME_NO || + exec_needs_pid_namespace(context) || context->protect_kernel_tunables || context->protect_kernel_modules || context->protect_kernel_logs || @@ -4139,6 +4268,7 @@ int exec_invoke( needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */ needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */ bool keep_seccomp_privileges = false; + bool has_cap_sys_admin = false; #if HAVE_SELINUX _cleanup_free_ char *mac_selinux_context_net = NULL; bool use_selinux = false; @@ -4790,6 +4920,9 @@ int exec_invoke( uint64_t capability_ambient_set = context->capability_ambient_set; + /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though it is masked. */ + has_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0; + if (needs_sandboxing) { /* MAC enablement checks need to be done before a new mount ns is created, as they rely on * /sys being present. 
The actual MAC context application will happen later, as late as @@ -4924,6 +5057,40 @@ int exec_invoke( } } + /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible. + * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */ + if (needs_sandboxing && exec_needs_pid_namespace(context)) { + if (params->pidref_transport_fd < 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m"); /* no call failed here, so 'r' carries no error: synthesize one */ + } + + /* If we had CAP_SYS_ADMIN prior to joining the user namespace, then we are privileged and don't need + * to check if we can mount /proc/. + * + * We need to check prior to entering the user namespace because if we're running unprivileged or in a + * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not + * once we unshare a mount namespace. */ + r = has_cap_sys_admin ? 1 : can_mount_proc(context, params); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, r, "Failed to detect if /proc/ can be remounted: %m"); + } + if (r == 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EPERM), + "PrivatePIDs=yes is configured, but /proc/ cannot be re-mounted due to lack of privileges, refusing."); + } + + r = setup_private_pids(context, params); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, r, "Failed to set up pid namespace: %m"); + } + } + + /* If PrivatePIDs=yes is configured, we're now running as pid 1 in a pid namespace! 
*/ + if (needs_mount_namespace) { _cleanup_free_ char *error_path = NULL; diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 6fa0b21968..bf6592faed 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -1391,6 +1391,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext if (r < 0) return r; + r = serialize_fd(f, fds, "exec-parameters-pidref-transport-fd", p->pidref_transport_fd); + if (r < 0) + return r; + if (c && exec_context_restrict_filesystems_set(c)) { r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd); if (r < 0) @@ -1660,6 +1664,14 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { continue; close_and_replace(p->handoff_timestamp_fd, fd); + } else if ((val = startswith(l, "exec-parameters-pidref-transport-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + close_and_replace(p->pidref_transport_fd, fd); } else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) { int fd; @@ -1926,6 +1938,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { if (r < 0) return r; + r = serialize_item(f, "exec-context-private-pids", private_pids_to_string(c->private_pids)); + if (r < 0) + return r; + r = serialize_bool_elide(f, "exec-context-remove-ipc", c->remove_ipc); if (r < 0) return r; @@ -2813,6 +2829,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { if (r < 0) return r; c->private_ipc = r; + } else if ((val = startswith(l, "exec-context-private-pids="))) { + c->private_pids = private_pids_from_string(val); + if (c->private_pids < 0) + return -EINVAL; } else if ((val = startswith(l, "exec-context-remove-ipc="))) { r = parse_boolean(val); if (r < 0) diff --git a/src/core/execute.c b/src/core/execute.c index 1c41b39a2f..2c5a5db10e 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -254,6 +254,12 @@ bool 
exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParame return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT); } +bool exec_needs_pid_namespace(const ExecContext *context) { + assert(context); + + return context->private_pids != PRIVATE_PIDS_NO && ns_type_supported(NAMESPACE_PID); +} + bool exec_needs_mount_namespace( const ExecContext *context, const ExecParameters *params, @@ -306,7 +312,8 @@ bool exec_needs_mount_namespace( exec_needs_cgroup_mount(context, params) || context->protect_proc != PROTECT_PROC_DEFAULT || context->proc_subset != PROC_SUBSET_ALL || - exec_needs_ipc_namespace(context)) + exec_needs_ipc_namespace(context) || + exec_needs_pid_namespace(context)) return true; if (context->root_directory) { @@ -1026,6 +1033,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { "%sProtectControlGroups: %s\n" "%sPrivateNetwork: %s\n" "%sPrivateUsers: %s\n" + "%sPrivatePIDs: %s\n" "%sProtectHome: %s\n" "%sProtectSystem: %s\n" "%sMountAPIVFS: %s\n" @@ -1052,6 +1060,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { prefix, protect_control_groups_to_string(c->protect_control_groups), prefix, yes_no(c->private_network), prefix, private_users_to_string(c->private_users), + prefix, private_pids_to_string(c->private_pids), prefix, protect_home_to_string(c->protect_home), prefix, protect_system_to_string(c->protect_system), prefix, yes_no(exec_context_get_effective_mount_apivfs(c)), diff --git a/src/core/execute.h b/src/core/execute.h index 7274c68d3d..32dabf177f 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -335,6 +335,7 @@ struct ExecContext { ProtectControlGroups protect_control_groups; ProtectSystem protect_system; ProtectHome protect_home; + PrivatePIDs private_pids; bool protect_hostname; bool dynamic_user; @@ -465,6 +466,7 @@ struct ExecParameters { char **files_env; int user_lookup_fd; int 
handoff_timestamp_fd; + int pidref_transport_fd; int bpf_restrict_fs_map_fd; @@ -486,6 +488,7 @@ struct ExecParameters { .bpf_restrict_fs_map_fd = -EBADF, \ .user_lookup_fd = -EBADF, \ .handoff_timestamp_fd = -EBADF, \ + .pidref_transport_fd = -EBADF, \ } #include "unit.h" @@ -623,6 +626,7 @@ ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_; bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime); bool exec_needs_network_namespace(const ExecContext *context); bool exec_needs_ipc_namespace(const ExecContext *context); +bool exec_needs_pid_namespace(const ExecContext *context); ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params); bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params); diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index f5cbb319d7..d7564b3767 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -133,6 +133,7 @@ {{type}}.PrivateUsers, config_parse_private_users, 0, offsetof({{type}}, exec_context.private_users) {{type}}.PrivateMounts, config_parse_tristate, 0, offsetof({{type}}, exec_context.private_mounts) {{type}}.PrivateIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.private_ipc) +{{type}}.PrivatePIDs, config_parse_private_pids, 0, offsetof({{type}}, exec_context.private_pids) {{type}}.ProtectSystem, config_parse_protect_system, 0, offsetof({{type}}, exec_context.protect_system) {{type}}.ProtectHome, config_parse_protect_home, 0, offsetof({{type}}, exec_context.protect_home) {{type}}.MountFlags, config_parse_exec_mount_propagation_flag, 0, offsetof({{type}}, exec_context.mount_propagation_flag) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 1d813332b1..f34c930f4e 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -135,6 +135,7 @@ 
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc); DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset); DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp); DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers); +DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs); DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_control_groups, protect_control_groups, ProtectControlGroups); DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode); DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode); diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 9b95f0c24e..8ac962a94b 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -114,6 +114,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv); CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems); CONFIG_PARSER_PROTOTYPE(config_parse_private_tmp); CONFIG_PARSER_PROTOTYPE(config_parse_private_users); +CONFIG_PARSER_PROTOTYPE(config_parse_private_pids); CONFIG_PARSER_PROTOTYPE(config_parse_protect_control_groups); CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota); CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset); diff --git a/src/core/manager.c b/src/core/manager.c index f58bc547a6..296d7416b1 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -126,6 +126,7 @@ static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_pidref_transport_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int 
manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata); static int manager_dispatch_run_queue(sd_event_source *source, void *userdata); static int manager_dispatch_sigchld(sd_event_source *source, void *userdata); @@ -913,6 +914,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, .signal_fd = -EBADF, .user_lookup_fds = EBADF_PAIR, .handoff_timestamp_fds = EBADF_PAIR, + .pidref_transport_fds = EBADF_PAIR, .private_listen_fd = -EBADF, .dev_autofs_fd = -EBADF, .cgroup_inotify_fd = -EBADF, @@ -1309,6 +1311,55 @@ static int manager_setup_handoff_timestamp_fd(Manager *m) { return 0; } +static int manager_setup_pidref_transport_fd(Manager *m) { + int r; + + assert(m); + + /* Set up the socket pair used for passing parent and child pidrefs back when the executor unshares + * a PID namespace and forks again when using PrivatePIDs=yes. */ + + if (m->pidref_transport_fds[0] < 0) { + m->pidref_event_source = sd_event_source_disable_unref(m->pidref_event_source); + safe_close_pair(m->pidref_transport_fds); + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->pidref_transport_fds) < 0) + return log_error_errno(errno, "Failed to allocate pidref socket: %m"); + + /* Make sure children never have to block */ + (void) fd_increase_rxbuf(m->pidref_transport_fds[0], MANAGER_SOCKET_RCVBUF_SIZE); + + r = setsockopt_int(m->pidref_transport_fds[0], SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "Failed to enable SO_PASSCRED for pidref socket: %m"); + + r = setsockopt_int(m->pidref_transport_fds[0], SOL_SOCKET, SO_PASSPIDFD, true); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + log_debug("SO_PASSPIDFD is not supported for pidref socket, ignoring."); + else if (r < 0) + log_warning_errno(r, "Failed to enable SO_PASSPIDFD for pidref socket, ignoring: %m"); + + /* Mark the receiving socket as O_NONBLOCK (but leave sending side as-is) */ + r = fd_nonblock(m->pidref_transport_fds[0], true); + if (r < 0) + 
return log_error_errno(r, "Failed to make pidref socket O_NONBLOCK: %m"); + } + + if (!m->pidref_event_source) { + r = sd_event_add_io(m->event, &m->pidref_event_source, m->pidref_transport_fds[0], EPOLLIN, manager_dispatch_pidref_transport_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate pidref event source: %m"); + + r = sd_event_source_set_priority(m->pidref_event_source, EVENT_PRIORITY_PIDREF); + if (r < 0) + return log_error_errno(r, "Failed to set priority of pidref event source: %m"); + + (void) sd_event_source_set_description(m->pidref_event_source, "pidref"); + } + + return 0; +} + static unsigned manager_dispatch_cleanup_queue(Manager *m) { Unit *u; unsigned n = 0; @@ -1724,6 +1775,7 @@ Manager* manager_free(Manager *m) { sd_event_source_unref(m->run_queue_event_source); sd_event_source_unref(m->user_lookup_event_source); sd_event_source_unref(m->handoff_timestamp_event_source); + sd_event_source_unref(m->pidref_event_source); sd_event_source_unref(m->memory_pressure_event_source); safe_close(m->signal_fd); @@ -1731,6 +1783,7 @@ Manager* manager_free(Manager *m) { safe_close(m->cgroups_agent_fd); safe_close_pair(m->user_lookup_fds); safe_close_pair(m->handoff_timestamp_fds); + safe_close_pair(m->pidref_transport_fds); manager_close_ask_password(m); @@ -2077,6 +2130,11 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo /* This shouldn't fail, except if things are really broken. */ return r; + r = manager_setup_pidref_transport_fd(m); + if (r < 0) + /* This shouldn't fail, except if things are really broken. */ + return r; + /* Connect to the bus if we are good for it */ manager_setup_bus(m); @@ -3747,6 +3805,7 @@ int manager_reload(Manager *m) { (void) manager_setup_cgroups_agent(m); (void) manager_setup_user_lookup_fd(m); (void) manager_setup_handoff_timestamp_fd(m); + (void) manager_setup_pidref_transport_fd(m); /* Third, fire things up! 
*/ manager_coldplug(m); @@ -5002,6 +5061,142 @@ static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd return 0; } +static int manager_dispatch_pidref_transport_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL, parent_pidref = PIDREF_NULL; + _cleanup_close_ int child_pidfd = -EBADF, parent_pidfd = -EBADF; + struct ucred *ucred = NULL; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int)) * 2) control; + pid_t child_pid; + struct msghdr msghdr = { + .msg_iov = &IOVEC_MAKE(&child_pid, sizeof(child_pid)), + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct cmsghdr *cmsg; + ssize_t n; + int r; + + assert(source); + + /* Server expects: + * - Parent PID in ucreds enabled via SO_PASSCRED + * - Parent PIDFD in SCM_PIDFD message enabled via SO_PASSPIDFD + * - Child PIDFD in SCM_RIGHTS in message body + * - Child PID in message IOV + * + * SO_PASSPIDFD may not be supported by the kernel so we fall back to using parent PID from ucreds + * and accept some raciness. 
*/ + n = recvmsg_safe(m->pidref_transport_fds[0], &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC); + if (ERRNO_IS_NEG_TRANSIENT(n)) + return 0; /* Spurious wakeup, try again */ + if (n == -ECHRNG) { + log_warning_errno(n, "Got message with truncated control data (unexpected fds sent?), ignoring."); + return 0; + } + if (n == -EXFULL) { + log_warning_errno(n, "Got message with truncated payload data, ignoring."); + return 0; + } + if (n < 0) + return log_error_errno(n, "Failed to receive pidref message: %m"); + + if (n != sizeof(child_pid)) { + log_warning("Got pidref message of unexpected size %zi (expected %zu), ignoring.", n, sizeof(child_pid)); + return 0; + } + + CMSG_FOREACH(cmsg, &msghdr) { + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + + if (cmsg->cmsg_type == SCM_CREDENTIALS && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) { + assert(!ucred); + ucred = CMSG_TYPED_DATA(cmsg, struct ucred); + } else if (cmsg->cmsg_type == SCM_PIDFD) { + assert(parent_pidfd < 0); + parent_pidfd = *CMSG_TYPED_DATA(cmsg, int); + } else if (cmsg->cmsg_type == SCM_RIGHTS) { + assert(child_pidfd < 0); + child_pidfd = *CMSG_TYPED_DATA(cmsg, int); + } + } + + /* Verify and set parent pidref. */ + if (!ucred || !pid_is_valid(ucred->pid)) { + log_warning("Received pidref message without valid credentials. Ignoring."); + return 0; + } + + /* Need to handle kernels without SO_PASSPIDFD where SCM_PIDFD will not be set. */ + if (parent_pidfd >= 0) + r = pidref_set_pidfd_consume(&parent_pidref, TAKE_FD(parent_pidfd)); + else + r = pidref_set_pid(&parent_pidref, ucred->pid); + if (r < 0) { + if (r == -ESRCH) + log_debug_errno(r, "PidRef child process died before message is processed. 
Ignoring."); + else + log_warning_errno(r, "Failed to pin pidref child process, ignoring message: %m"); + return 0; + } + + if (parent_pidref.pid != ucred->pid) { + assert(parent_pidref.fd >= 0); + log_warning("Got SCM_PIDFD for parent process " PID_FMT " but got SCM_CREDENTIALS for parent process " PID_FMT ". Ignoring.", + parent_pidref.pid, ucred->pid); + return 0; + } + + /* Verify and set child pidref. */ + if (!pid_is_valid(child_pid)) { + log_warning("Received pidref message without valid child PID. Ignoring."); + return 0; + } + + /* Need to handle kernels without PIDFD support. */ + if (child_pidfd >= 0) + r = pidref_set_pidfd_consume(&child_pidref, TAKE_FD(child_pidfd)); + else + r = pidref_set_pid(&child_pidref, child_pid); + if (r < 0) { + if (r == -ESRCH) + log_debug_errno(r, "PidRef child process died before message is processed. Ignoring."); + else + log_warning_errno(r, "Failed to pin pidref child process, ignoring message: %m"); + return 0; + } + + if (child_pidref.pid != child_pid) { + assert(child_pidref.fd >= 0); + log_warning("Got SCM_RIGHTS for child process " PID_FMT " but PID in IOV message is " PID_FMT ". Ignoring.", + child_pidref.pid, child_pid); + return 0; + } + + log_debug("Got pidref event with parent PID " PID_FMT " and child PID " PID_FMT ".", parent_pidref.pid, child_pidref.pid); + + /* Try finding cgroup of parent process. But if parent process exited and we're not using PIDFD, this could return NULL. + * Then fall back to finding cgroup of the child process. 
*/ + Unit *u = manager_get_unit_by_pidref_cgroup(m, &parent_pidref); + if (!u) + u = manager_get_unit_by_pidref_cgroup(m, &child_pidref); + if (!u) { + log_debug("Got pidref for parent process " PID_FMT " and child process " PID_FMT " we are not interested in, ignoring.", parent_pidref.pid, child_pidref.pid); + return 0; + } + + if (!UNIT_VTABLE(u)->notify_pidref) { + log_unit_warning(u, "Received pidref event from unexpected unit type '%s'.", unit_type_to_string(u->type)); + return 0; + } + + UNIT_VTABLE(u)->notify_pidref(u, &parent_pidref, &child_pidref); + + return 0; +} + void manager_ref_console(Manager *m) { assert(m); diff --git a/src/core/manager.h b/src/core/manager.h index c1f7f8c083..e4cada80ff 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -289,6 +289,9 @@ struct Manager { int handoff_timestamp_fds[2]; sd_event_source *handoff_timestamp_event_source; + int pidref_transport_fds[2]; + sd_event_source *pidref_event_source; + RuntimeScope runtime_scope; LookupPaths lookup_paths; @@ -678,12 +681,13 @@ void unit_defaults_done(UnitDefaults *defaults); enum { /* most important … */ - EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-11, - EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-10, - EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-10, - EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv1 */ - EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv2 */ - EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-8, + EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-12, + EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-11, + EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-11, + EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-10, /* cgroupv1 */ + EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-10, /* cgroupv2 */ + EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-9, + EVENT_PRIORITY_PIDREF = SD_EVENT_PRIORITY_NORMAL-8, EVENT_PRIORITY_HANDOFF_TIMESTAMP = 
SD_EVENT_PRIORITY_NORMAL-7, EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6, EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5, diff --git a/src/core/namespace.c b/src/core/namespace.c index 91c905f2fe..57dbbc4fc7 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -2061,7 +2061,8 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) { p->protect_control_groups != PROTECT_CONTROL_GROUPS_NO || p->protect_kernel_tunables || p->protect_proc != PROTECT_PROC_DEFAULT || - p->proc_subset != PROC_SUBSET_ALL; + p->proc_subset != PROC_SUBSET_ALL || + p->private_pids != PRIVATE_PIDS_NO; } /* Walk all mount entries and dropping any unused mounts. This affects all @@ -3366,3 +3367,10 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = { }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF); + +static const char* const private_pids_table[_PRIVATE_PIDS_MAX] = { + [PRIVATE_PIDS_NO] = "no", + [PRIVATE_PIDS_YES] = "yes", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_pids, PrivatePIDs, PRIVATE_PIDS_YES); diff --git a/src/core/namespace.h b/src/core/namespace.h index 7b6e892cc2..bd48aa31da 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -78,6 +78,13 @@ typedef enum ProtectControlGroups { _PROTECT_CONTROL_GROUPS_INVALID = -EINVAL, } ProtectControlGroups; +typedef enum PrivatePIDs { + PRIVATE_PIDS_NO, + PRIVATE_PIDS_YES, + _PRIVATE_PIDS_MAX, + _PRIVATE_PIDS_INVALID = -EINVAL, +} PrivatePIDs; + struct BindMount { char *source; char *destination; @@ -182,6 +189,7 @@ struct NamespaceParameters { ProtectProc protect_proc; ProcSubset proc_subset; PrivateTmp private_tmp; + PrivatePIDs private_pids; }; int setup_namespace(const NamespaceParameters *p, char **reterr_path); @@ -225,6 +233,9 @@ PrivateUsers private_users_from_string(const char *s) _pure_; const char* protect_control_groups_to_string(ProtectControlGroups i) _const_; ProtectControlGroups 
protect_control_groups_from_string(const char *s) _pure_; +const char* private_pids_to_string(PrivatePIDs i) _const_; +PrivatePIDs private_pids_from_string(const char *s) _pure_; + void bind_mount_free_many(BindMount *b, size_t n); int bind_mount_add(BindMount **b, size_t *n, const BindMount *item); diff --git a/src/core/service.c b/src/core/service.c index 737dc9905a..a9a64938b5 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -710,6 +710,9 @@ static int service_verify(Service *s) { if (s->type == SERVICE_DBUS && !s->bus_name) return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing."); + if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service of Type=forking does not support PrivatePIDs=yes. Refusing."); + if (s->usb_function_descriptors && !s->usb_function_strings) log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring."); @@ -4908,6 +4911,35 @@ static void service_handoff_timestamp( unit_add_to_dbus_queue(u); } +static void service_notify_pidref(Unit *u, PidRef *parent_pidref, PidRef *child_pidref) { + Service *s = ASSERT_PTR(SERVICE(u)); + int r; + + assert(pidref_is_set(parent_pidref)); + assert(pidref_is_set(child_pidref)); + + if (pidref_equal(&s->main_pid, parent_pidref)) { + r = service_set_main_pidref(s, TAKE_PIDREF(*child_pidref), /* start_timestamp = */ NULL); + if (r < 0) + return (void) log_unit_warning_errno(u, r, "Failed to set new main pid: %m"); + + /* Since the child process is PID 1 in a new PID namespace, it must be exclusive to this unit. 
*/ + r = unit_watch_pidref(u, &s->main_pid, /* exclusive= */ true); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to watch new main PID " PID_FMT ": %m", s->main_pid.pid); + } else if (pidref_equal(&s->control_pid, parent_pidref)) { + service_unwatch_control_pid(s); + s->control_pid = TAKE_PIDREF(*child_pidref); + + r = unit_watch_pidref(u, &s->control_pid, /* exclusive= */ true); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to watch new control PID " PID_FMT ": %m", s->control_pid.pid); + } else + return (void) log_unit_debug(u, "Parent process " PID_FMT " does not match main or control processes, ignoring.", parent_pidref->pid); + + unit_add_to_dbus_queue(u); +} + static int service_get_timeout(Unit *u, usec_t *timeout) { Service *s = ASSERT_PTR(SERVICE(u)); uint64_t t; @@ -5638,6 +5670,7 @@ const UnitVTable service_vtable = { .notify_cgroup_oom = service_notify_cgroup_oom_event, .notify_message = service_notify_message, .notify_handoff_timestamp = service_handoff_timestamp, + .notify_pidref = service_notify_pidref, .main_pid = service_main_pid, .control_pid = service_control_pid, diff --git a/src/core/unit.c b/src/core/unit.c index eec08a2fbf..71488a4555 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -4237,6 +4237,9 @@ static int unit_verify_contexts(const Unit *u) { exec_needs_mount_namespace(ec, /* params = */ NULL, /* runtime = */ NULL)) return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory= may not be below /proc/, /sys/ or /dev/ when using mount namespacing. Refusing."); + if (exec_needs_pid_namespace(ec) && !UNIT_VTABLE(u)->notify_pidref) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivatePIDs= setting is only supported for service units. 
Refusing."); + const KillContext *kc = unit_get_kill_context(u); if (ec->pam_name && kc && !IN_SET(kc->kill_mode, KILL_CONTROL_GROUP, KILL_MIXED)) @@ -5402,6 +5405,8 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) { p->user_lookup_fd = u->manager->user_lookup_fds[1]; p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1]; + if (UNIT_VTABLE(u)->notify_pidref) + p->pidref_transport_fd = u->manager->pidref_transport_fds[1]; p->cgroup_id = crt ? crt->cgroup_id : 0; p->invocation_id = u->invocation_id; diff --git a/src/core/unit.h b/src/core/unit.h index 01e1adf961..a8eb366337 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -640,6 +640,9 @@ typedef struct UnitVTable { /* Called whenever we learn a handoff timestamp */ void (*notify_handoff_timestamp)(Unit *u, const struct ucred *ucred, const dual_timestamp *ts); + /* Called whenever we learn about a child process */ + void (*notify_pidref)(Unit *u, PidRef *parent_pidref, PidRef *child_pidref); + /* Called whenever a name this Unit registered for comes or goes away. 
*/ void (*bus_name_owner_change)(Unit *u, const char *new_owner); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 90b6f233e2..06bfb90c8f 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -1061,7 +1061,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con "LogNamespace", "RootImagePolicy", "MountImagePolicy", - "ExtensionImagePolicy")) + "ExtensionImagePolicy", + "PrivatePIDs")) return bus_append_string(m, field, eq); if (STR_IN_SET(field, "IgnoreSIGPIPE", diff --git a/test/TEST-07-PID1/test.sh b/test/TEST-07-PID1/test.sh index 2513406e0d..66e1b684ea 100755 --- a/test/TEST-07-PID1/test.sh +++ b/test/TEST-07-PID1/test.sh @@ -6,12 +6,17 @@ TEST_DESCRIPTION="Tests for core PID1 functionality" # for testing PrivateNetwork=yes NSPAWN_ARGUMENTS="--capability=CAP_NET_ADMIN" +# for testing PrivatePIDs=yes +TEST_INSTALL_VERITY_MINIMAL=1 # shellcheck source=test/test-functions . "${TEST_BASE_DIR:?}/test-functions" test_append_files() { image_install logger socat + inst_binary mksquashfs + inst_binary unsquashfs + install_verity_minimal } do_test "$@" diff --git a/test/units/TEST-07-PID1.private-pids.sh b/test/units/TEST-07-PID1.private-pids.sh new file mode 100755 index 0000000000..6f16820aee --- /dev/null +++ b/test/units/TEST-07-PID1.private-pids.sh @@ -0,0 +1,161 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +# shellcheck disable=SC2016 +set -eux +set -o pipefail + +# shellcheck source=test/units/test-control.sh +. "$(dirname "$0")"/test-control.sh +# shellcheck source=test/units/util.sh +. 
"$(dirname "$0")"/util.sh + +HAS_EXISTING_SCSI_MOUNT=no +if findmnt --mountpoint /proc/scsi; then + HAS_EXISTING_SCSI_MOUNT=yes +fi + +at_exit() { + set +e + + # Unmount any file systems + if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then + umount /proc/scsi + fi + umount /tmp/TEST-07-PID1-private-pids-proc + rm -rf /tmp/TEST-07-PID1-private-pids-proc + # Remove any test files + rm -rf /tmp/TEST-07-PID1-private-pids-services + rm -rf /tmp/TEST-07-PID1-private-pids-root + # Stop any test services + systemctl kill --signal=KILL TEST-07-PID1-private-pid.service + # Remove any failed transient units + systemctl reset-failed +} + +trap at_exit EXIT + +testcase_basic() { + # Verify current process is PID1 in new namespace + assert_eq "$(systemd-run -p PrivatePIDs=yes --wait --pipe readlink /proc/self)" "1" + # Verify we are only processes in new namespace + assert_eq "$(systemd-run -p PrivatePIDs=yes --wait --pipe ps aux --no-heading | wc -l)" "1" + # Verify procfs mount + systemd-run -p PrivatePIDs=yes --wait --pipe \ + bash -xec '[[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ rw ]]; + [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ nosuid ]]; + [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ nodev ]]; + [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ noexec ]];' + + # Verify main PID is correct + systemd-run -p PrivatePIDs=yes --remain-after-exit --unit TEST-07-PID1-private-pid sleep infinity + # Wait for ExecMainPID to be correctly populated as there might be a race between spawning service + # and actual exec child process + sleep 2 + pid=$(systemctl show TEST-07-PID1-private-pid.service -p ExecMainPID --value) + kill -9 "$pid" + timeout 10s bash -xec 'while [[ "$(systemctl show -P SubState TEST-07-PID1-private-pid.service)" != "failed" ]]; do sleep .5; done' + assert_eq "$(systemctl show -P Result TEST-07-PID1-private-pid.service)" "signal" + assert_eq "$(systemctl show -P ExecMainStatus 
TEST-07-PID1-private-pid.service)" "9" + systemctl reset-failed +} + +testcase_analyze() { + mkdir -p /tmp/TEST-07-PID1-private-pids-services + + # Verify other services are compatible with PrivatePIDs=yes + cat <<EOF >/tmp/TEST-07-PID1-private-pids-services/oneshot-valid.service +[Service] +ExecStart=echo hello +PrivatePIDs=yes +Type=oneshot +EOF + + # Verify Type=forking services are not compatible with PrivatePIDs=yes + cat <<EOF >/tmp/TEST-07-PID1-private-pids-services/forking-invalid.service +[Service] +ExecStart=echo hello +PrivatePIDs=yes +Type=forking +EOF + + systemd-analyze --recursive-errors=no verify /tmp/TEST-07-PID1-private-pids-services/oneshot-valid.service + (! systemd-analyze --recursive-errors=no verify /tmp/TEST-07-PID1-private-pids-services/forking-invalid.service) + + + rm -rf /tmp/TEST-07-PID1-private-pids-services +} + +testcase_multiple_features() { + unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-private-pids-root /usr/share/minimal_0.raw + + systemd-run \ + -p PrivatePIDs=yes \ + -p RootDirectory=/tmp/TEST-07-PID1-private-pids-root \ + -p ProcSubset=pid \ + -p BindReadOnlyPaths=/usr/share \ + -p NoNewPrivileges=yes \ + -p ProtectSystem=strict \ + -p User=testuser\ + -p Group=testuser \ + -p RuntimeDirectory=abc \ + -p StateDirectory=qed \ + -p InaccessiblePaths=/usr/include \ + -p TemporaryFileSystem=/home \ + -p PrivateTmp=yes \ + -p PrivateDevices=yes \ + -p PrivateNetwork=yes \ + -p PrivateUsersEx=self \ + -p PrivateIPC=yes \ + -p ProtectHostname=yes \ + -p ProtectClock=yes \ + -p ProtectKernelTunables=yes \ + -p ProtectKernelModules=yes \ + -p ProtectKernelLogs=yes \ + -p ProtectControlGroupsEx=private \ + -p LockPersonality=yes \ + -p Environment=ABC=QED \ + --wait \ + --pipe \ + grep MARKER=1 /etc/os-release + + rm -rf /tmp/TEST-07-PID1-private-pids-root +} + +testcase_unpriv() { + if [ ! -f /usr/lib/systemd/user/dbus.socket ] && [ ! 
-f /etc/systemd/user/dbus.socket ]; then + echo "Per-user instances are not supported, skipping unprivileged PrivatePIDs=yes test" + return 0 + fi + + if [[ "$(sysctl -ne kernel.apparmor_restrict_unprivileged_userns)" -eq 1 ]]; then + echo "Cannot create unprivileged user namespaces, skipping unprivileged PrivatePIDs=yes test" + return 0 + fi + + # The kernel has a restriction for unprivileged user namespaces where they cannot mount a less restrictive + # instance of /proc/. So if /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs as systemd-nspawn does), + # then mounting a new /proc/ will fail and we will still see the host's /proc/. Thus, to allow tests to run in + # a VM or nspawn, we mount a new proc on a temporary directory with no masking to bypass this kernel restriction. + mkdir -p /tmp/TEST-07-PID1-private-pids-proc + mount -t proc proc /tmp/TEST-07-PID1-private-pids-proc + + # Verify running as unprivileged user can unshare PID namespace and mounts /proc properly. + assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1" + assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1" + + umount /tmp/TEST-07-PID1-private-pids-proc + rm -rf /tmp/TEST-07-PID1-private-pids-proc + + # Now verify the behavior with masking - units should fail as PrivatePIDs=yes has no graceful fallback. + if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then + mount -t tmpfs tmpfs /proc/scsi + fi + + (! runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes true) + + if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then + umount /proc/scsi + fi +} + +run_testcases |