summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--NEWS5
-rw-r--r--man/org.freedesktop.systemd1.xml41
-rw-r--r--man/systemd.exec.xml24
-rw-r--r--src/basic/process-util.c5
-rw-r--r--src/basic/process-util.h27
-rw-r--r--src/core/dbus-execute.c23
-rw-r--r--src/core/exec-invoke.c169
-rw-r--r--src/core/execute-serialize.c20
-rw-r--r--src/core/execute.c11
-rw-r--r--src/core/execute.h4
-rw-r--r--src/core/load-fragment-gperf.gperf.in1
-rw-r--r--src/core/load-fragment.c1
-rw-r--r--src/core/load-fragment.h1
-rw-r--r--src/core/manager.c195
-rw-r--r--src/core/manager.h16
-rw-r--r--src/core/namespace.c10
-rw-r--r--src/core/namespace.h11
-rw-r--r--src/core/service.c33
-rw-r--r--src/core/unit.c5
-rw-r--r--src/core/unit.h3
-rw-r--r--src/shared/bus-unit-util.c3
-rwxr-xr-xtest/TEST-07-PID1/test.sh5
-rwxr-xr-xtest/units/TEST-07-PID1.private-pids.sh161
23 files changed, 741 insertions, 33 deletions
diff --git a/NEWS b/NEWS
index 6d4b5b7461..03f61e99c7 100644
--- a/NEWS
+++ b/NEWS
@@ -254,6 +254,11 @@ CHANGES WITH 257 in spe:
the "nobody" user to the dynamic user, rather than via recursive
chown()ing.
+ * A new service property PrivatePIDs= has been added that runs executed
+ processes as PID 1 - the init process - within their own PID namespace.
+ PrivatePIDs= also mounts /proc/ so only processes within the new PID
+ namespace are visible.
+
systemd-udevd:
* udev rules now set 'uaccess' for /dev/udmabuf, giving locally
diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index 7ade8c3e8b..05afb93f9b 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -3263,6 +3263,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateIPC = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivatePIDs = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectHome = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectSystem = '...';
@@ -4584,6 +4586,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
@@ -4870,6 +4874,11 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Unlike boolean <varname>ProtectControlGroups</varname>, <varname>ProtectControlGroupsEx</varname>
is a string type.</para>
+
+ <para><varname>PrivatePIDs</varname> implements the destination parameter of the
+ unit file setting <varname>PrivatePIDs=</varname> listed in
+ <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
+ Note <varname>PrivatePIDs</varname> is a string type to allow adding more values in the future.</para>
</refsect2>
</refsect1>
@@ -5439,6 +5448,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateIPC = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivatePIDs = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectHome = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectSystem = '...';
@@ -6744,6 +6755,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
@@ -7442,6 +7455,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateIPC = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivatePIDs = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectHome = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectSystem = '...';
@@ -8585,6 +8600,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
@@ -9412,6 +9429,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateIPC = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivatePIDs = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectHome = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectSystem = '...';
@@ -10527,6 +10546,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
@@ -12281,8 +12302,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ExtraFileDescriptorNames</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>BindLogSockets</varname>,
- <varname>ProtectControlGroupsEx</varname>, and
- <varname>PrivateUsersEx</varname> were added in version 257.</para>
+ <varname>ProtectControlGroupsEx</varname>,
+ <varname>PrivateUsersEx</varname>, and
+ <varname>PrivatePIDs</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Socket Unit Objects</title>
@@ -12323,8 +12345,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
<varname>PrivateUsersEx</varname>,
- <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
- <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
+ <varname>ManagedOOMMemoryPressureDurationUSec</varname>,
+ <varname>ProtectControlGroupsEx</varname>, and
+ <varname>PrivatePIDs</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Mount Unit Objects</title>
@@ -12362,8 +12385,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
<varname>PrivateUsersEx</varname>,
- <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
- <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
+ <varname>ManagedOOMMemoryPressureDurationUSec</varname>,
+ <varname>ProtectControlGroupsEx</varname>, and
+ <varname>PrivatePIDs</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Swap Unit Objects</title>
@@ -12401,8 +12425,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
<varname>PrivateUsersEx</varname>,
- <varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
- <varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
+ <varname>ManagedOOMMemoryPressureDurationUSec</varname>,
+ <varname>ProtectControlGroupsEx</varname>, and
+ <varname>PrivatePIDs</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Slice Unit Objects</title>
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 30a926c9a0..b50f70ff42 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1977,6 +1977,30 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
</varlistentry>
<varlistentry>
+ <term><varname>PrivatePIDs=</varname></term>
+
+ <listitem><para>Takes a boolean argument. Defaults to false. If enabled, sets up a new PID namespace
+ for the executed processes. Each executed process is now PID 1 - the init process - in the new namespace.
+ <filename>/proc/</filename> is mounted such that only processes in the PID namespace are visible.
+ If <varname>PrivatePIDs=</varname> is set, <varname>MountAPIVFS=yes</varname> is implied.</para>
+
+ <para><varname>PrivatePIDs=</varname> is only supported for service units. This setting is not supported
+ with <varname>Type=forking</varname> since the kernel will kill all processes in the PID namespace if
+ the init process terminates.</para>
+
+ <para>This setting will be ignored if the kernel does not support PID namespaces.</para>
+
+ <para>Note unprivileged user services (i.e. a service run by the per-user instance of the service manager)
+ will fail with <varname>PrivatePIDs=yes</varname> if <filename>/proc/</filename> is masked
+ (i.e. <filename>/proc/kmsg</filename> is over-mounted with <constant>tmpfs</constant> like
+ <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> does).
+ This is due to a kernel restriction not allowing unprivileged user namespaces to mount a less restrictive
+ instance of <filename>/proc/</filename>.</para>
+
+ <xi:include href="version-info.xml" xpointer="v257"/></listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><varname>PrivateUsers=</varname></term>
<listitem><para>Takes a boolean argument or one of <literal>self</literal> or
diff --git a/src/basic/process-util.c b/src/basic/process-util.c
index a85a1b35f0..75bc65652e 100644
--- a/src/basic/process-util.c
+++ b/src/basic/process-util.c
@@ -1521,11 +1521,12 @@ int safe_fork_full(
}
}
- if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS)) != 0)
+ if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS|FORK_NEW_PIDNS)) != 0)
pid = raw_clone(SIGCHLD|
(FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) |
(FLAGS_SET(flags, FORK_NEW_USERNS) ? CLONE_NEWUSER : 0) |
- (FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0));
+ (FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0) |
+ (FLAGS_SET(flags, FORK_NEW_PIDNS) ? CLONE_NEWPID : 0));
else
pid = fork();
if (pid < 0)
diff --git a/src/basic/process-util.h b/src/basic/process-util.h
index 05b7a69fc6..cb6d47a5bb 100644
--- a/src/basic/process-util.h
+++ b/src/basic/process-util.h
@@ -166,7 +166,7 @@ int must_be_root(void);
pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata);
-/* 💣 Note that FORK_NEW_USERNS, FORK_NEW_MOUNTNS, or FORK_NEW_NETNS should not be called in threaded
+/* 💣 Note that FORK_NEW_USERNS, FORK_NEW_MOUNTNS, FORK_NEW_NETNS or FORK_NEW_PIDNS should not be used in threaded
* programs, because they cause us to use raw_clone() which does not synchronize the glibc malloc() locks,
* and thus will cause deadlocks if the parent uses threads and the child does memory allocations. Hence: if
* the parent is threaded these flags may not be used. These flags cannot be used if the parent uses threads
@@ -181,18 +181,19 @@ typedef enum ForkFlags {
FORK_REOPEN_LOG = 1 << 6, /* Reopen log connection */
FORK_LOG = 1 << 7, /* Log above LOG_DEBUG log level about failures */
FORK_WAIT = 1 << 8, /* Wait until child exited */
- FORK_NEW_MOUNTNS = 1 << 9, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
- FORK_MOUNTNS_SLAVE = 1 << 10, /* Make child's mount namespace MS_SLAVE */
- FORK_PRIVATE_TMP = 1 << 11, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
- FORK_RLIMIT_NOFILE_SAFE = 1 << 12, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
- FORK_STDOUT_TO_STDERR = 1 << 13, /* Make stdout a copy of stderr */
- FORK_FLUSH_STDIO = 1 << 14, /* fflush() stdout (and stderr) before forking */
- FORK_NEW_USERNS = 1 << 15, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
- FORK_CLOEXEC_OFF = 1 << 16, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
- FORK_KEEP_NOTIFY_SOCKET = 1 << 17, /* Unless this specified, $NOTIFY_SOCKET will be unset. */
- FORK_DETACH = 1 << 18, /* Double fork if needed to ensure PID1/subreaper is parent */
- FORK_NEW_NETNS = 1 << 19, /* Run child in its own network namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
- FORK_PACK_FDS = 1 << 20, /* Rearrange the passed FDs to be FD 3,4,5,etc. Updates the array in place (combine with FORK_CLOSE_ALL_FDS!) */
+ FORK_MOUNTNS_SLAVE = 1 << 9, /* Make child's mount namespace MS_SLAVE */
+ FORK_PRIVATE_TMP = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
+ FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
+ FORK_STDOUT_TO_STDERR = 1 << 12, /* Make stdout a copy of stderr */
+ FORK_FLUSH_STDIO = 1 << 13, /* fflush() stdout (and stderr) before forking */
+ FORK_CLOEXEC_OFF = 1 << 14, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
+ FORK_KEEP_NOTIFY_SOCKET = 1 << 15, /* Unless this is specified, $NOTIFY_SOCKET will be unset. */
+ FORK_DETACH = 1 << 16, /* Double fork if needed to ensure PID1/subreaper is parent */
+ FORK_PACK_FDS = 1 << 17, /* Rearrange the passed FDs to be FD 3,4,5,etc. Updates the array in place (combine with FORK_CLOSE_ALL_FDS!) */
+ FORK_NEW_MOUNTNS = 1 << 18, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
+ FORK_NEW_USERNS = 1 << 19, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
+ FORK_NEW_NETNS = 1 << 20, /* Run child in its own network namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
+ FORK_NEW_PIDNS = 1 << 21, /* Run child in its own PID namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
} ForkFlags;
int safe_fork_full(
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index a9a73b599b..e297323f1d 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -63,6 +63,7 @@ static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string);
+static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_pids, "s", PrivatePIDs, private_pids_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
@@ -1194,6 +1195,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_tristate, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateIPC", "b", bus_property_get_bool, offsetof(ExecContext, private_ipc), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivatePIDs", "s", property_get_private_pids, offsetof(ExecContext, private_pids), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1970,6 +1972,27 @@ int bus_exec_context_set_transient_property(
return 1;
}
+ if (streq(name, "PrivatePIDs")) {
+ const char *s;
+ PrivatePIDs t;
+
+ r = sd_bus_message_read(message, "s", &s);
+ if (r < 0)
+ return r;
+
+ t = private_pids_from_string(s);
+ if (t < 0)
+ return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s);
+
+ if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
+ c->private_pids = t;
+ (void) unit_write_settingf(u, flags, name, "%s=%s",
+ name, private_pids_to_string(c->private_pids));
+ }
+
+ return 1;
+ }
+
if (streq(name, "PrivateDevices"))
return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);
diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c
index 72ed53360b..120067a774 100644
--- a/src/core/exec-invoke.c
+++ b/src/core/exec-invoke.c
@@ -2251,6 +2251,130 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
return 1;
}
+static int can_mount_proc(const ExecContext *c, ExecParameters *p) {
+ _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
+ _cleanup_(sigkill_waitp) pid_t pid = 0;
+ ssize_t n;
+ int r;
+
+ assert(c);
+ assert(p);
+
+ /* Probes whether a new procfs instance can be mounted, by attempting the mount in a throw-away child
+ * that runs in its own mount and PID namespace.
+ *
+ * Returns 1 if the mount succeeded, 0 if the child reported a privilege error (the mount is not
+ * permitted), and a negative errno on any other failure.
+ *
+ * If running via unprivileged user manager and /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs
+ * like systemd-nspawn does), then mounting /proc/ will fail with EPERM. This is due to a kernel restriction
+ * where unprivileged user namespaces cannot mount a less restrictive instance of /proc. */
+
+ /* Create a communication channel so that the child can tell the parent a proper error code in case it
+ * failed. */
+ if (pipe2(errno_pipe, O_CLOEXEC) < 0)
+ return log_exec_debug_errno(c, p, errno, "Failed to create pipe for communicating with child process (sd-proc-check): %m");
+
+ /* Fork a child process into its own mount and PID namespace. Note safe_fork() already remounts / as SLAVE
+ * with FORK_MOUNTNS_SLAVE. */
+ r = safe_fork("(sd-proc-check)",
+ FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_PIDNS, &pid);
+ if (r < 0)
+ return log_exec_debug_errno(c, p, r, "Failed to fork child process (sd-proc-check): %m");
+ if (r == 0) { /* in the child: it never returns from this block, it always _exit()s */
+ errno_pipe[0] = safe_close(errno_pipe[0]);
+
+ /* Try mounting /proc on /dev/shm/. No need to clean up the mount since the mount
+ * namespace will be cleaned up once the process exits. */
+ r = mount_follow_verbose(LOG_DEBUG, "proc", "/dev/shm/", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+ if (r < 0) {
+ (void) write(errno_pipe[1], &r, sizeof(r)); /* best effort: the exit status signals failure, too */
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ errno_pipe[1] = safe_close(errno_pipe[1]); /* drop our copy of the write end, so that read() below can return 0 */
+
+ /* Try to read an error code from the child. The child only writes to the pipe if the mount failed,
+ * hence reading zero bytes is the success case. */
+ n = read(errno_pipe[0], &r, sizeof(r));
+ if (n < 0)
+ return log_exec_debug_errno(c, p, errno, "Failed to read errno from pipe with child process (sd-proc-check): %m");
+ if (n == sizeof(r)) { /* an error code was sent to us */
+ /* This is the expected case where proc cannot be mounted due to permissions. */
+ if (ERRNO_IS_NEG_PRIVILEGE(r))
+ return 0; /* "cannot mount" is a regular answer here, not a failure */
+ if (r < 0)
+ return r;
+
+ return -EIO; /* the child only ever sends negative values, anything else is bogus */
+ }
+ if (n != 0) /* on success we should have read 0 bytes */
+ return -EIO;
+
+ /* Nothing was reported via the pipe, hence collect the child and verify it exited cleanly. TAKE_PID()
+ * hands the PID over to the wait call. */
+ r = wait_for_terminate_and_check("(sd-proc-check)", TAKE_PID(pid), 0 /* flags= */);
+ if (r < 0)
+ return log_exec_debug_errno(c, p, r, "Failed to wait for (sd-proc-check) child process to terminate: %m");
+ if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
+ return log_exec_debug_errno(c, p, SYNTHETIC_ERRNO(EIO), "Child process (sd-proc-check) exited with unexpected exit status '%d'.", r);
+
+ return 1;
+}
+
+static int setup_private_pids(const ExecContext *c, ExecParameters *p) {
+ _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+ _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
+ ssize_t n;
+ int r, q;
+
+ assert(c);
+ assert(p);
+ assert(p->pidref_transport_fd >= 0);
+
+ /* The first process created in a new pid namespace becomes PID 1 in that pid namespace, so we fork a
+ * child directly into a new pid namespace (FORK_NEW_PIDNS) to become PID 1. The parent sends the child
+ * pidref to the manager and exits while the child process continues with the rest of exec_invoke()
+ * and finally executes the actual payload.
+ *
+ * Returns a negative errno on failure. Failures to create the pipe or to fork are returned in the
+ * calling process; after a successful fork the calling process always _exit()s and only the child
+ * returns, with the non-negative result the parent reported for the send operation. */
+
+ /* Create a communication channel so that the parent can tell the child a proper error code in case it
+ * failed to send child pidref to the manager. */
+ if (pipe2(errno_pipe, O_CLOEXEC) < 0)
+ return log_exec_debug_errno(c, p, errno, "Failed to create pipe for communicating with parent process: %m");
+
+ r = pidref_safe_fork("(sd-pidns-child)", FORK_NEW_PIDNS, &pidref);
+ if (r < 0)
+ return log_exec_debug_errno(c, p, r, "Failed to fork child into new pid namespace: %m");
+ if (r > 0) {
+ errno_pipe[0] = safe_close(errno_pipe[0]);
+
+ /* In the parent process, we send the child pidref to the manager and exit.
+ * If PIDFD is not supported, only the child PID is sent. The server then
+ * uses the child PID to set the new exec main process. */
+ q = send_one_fd_iov(
+ p->pidref_transport_fd,
+ pidref.fd,
+ &IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)),
+ /*iovlen=*/ 1,
+ /*flags=*/ 0);
+ /* Send error code to child process. This is done unconditionally, i.e. the result is written
+ * on success as well, since the child blocks on the pipe until it learns the outcome. */
+ (void) write(errno_pipe[1], &q, sizeof(q));
+ /* Exit here so we only go through the destructors in exec_invoke only once - in the child - as
+ * some destructors have external effects. The main codepaths continue in the child process. */
+ _exit(q < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
+ }
+
+ errno_pipe[1] = safe_close(errno_pipe[1]);
+ p->pidref_transport_fd = safe_close(p->pidref_transport_fd); /* only the parent talks to the manager over this socket */
+
+ /* Try to read an error code from the parent. Note a child process cannot wait for the parent so we always
+ * receive an errno even on success. A short read (including EOF) hence means the parent went away
+ * without reporting a result, which is treated as an I/O error. */
+ n = read(errno_pipe[0], &r, sizeof(r));
+ if (n < 0)
+ return log_exec_debug_errno(c, p, errno, "Failed to read errno from pipe with parent process: %m");
+ if (n != sizeof(r))
+ return log_exec_debug_errno(c, p, SYNTHETIC_ERRNO(EIO), "Failed to read enough bytes from pipe with parent process");
+ if (r < 0)
+ return log_exec_debug_errno(c, p, r, "Failed to send child pidref to manager: %m");
+
+ /* NOTE! This function returns in the child process only. */
+ return r;
+}
+
static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
_cleanup_free_ char *src_abs = NULL;
int r;
@@ -3301,6 +3425,7 @@ static int apply_mount_namespace(
.private_dev = needs_sandboxing && context->private_devices,
.private_network = needs_sandboxing && exec_needs_network_namespace(context),
.private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
+ .private_pids = needs_sandboxing && exec_needs_pid_namespace(context) ? context->private_pids : PRIVATE_PIDS_NO,
.private_tmp = needs_sandboxing ? context->private_tmp : false,
.mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
@@ -3573,7 +3698,7 @@ static int close_remaining_fds(
const int *fds, size_t n_fds) {
size_t n_dont_close = 0;
- int dont_close[n_fds + 16];
+ int dont_close[n_fds + 17];
assert(params);
@@ -3612,6 +3737,9 @@ static int close_remaining_fds(
if (params->handoff_timestamp_fd >= 0)
dont_close[n_dont_close++] = params->handoff_timestamp_fd;
+ if (params->pidref_transport_fd >= 0)
+ dont_close[n_dont_close++] = params->pidref_transport_fd;
+
assert(n_dont_close <= ELEMENTSOF(dont_close));
return close_all_fds(dont_close, n_dont_close);
@@ -3934,6 +4062,7 @@ static bool exec_context_need_unprivileged_private_users(
!strv_isempty(context->extension_directories) ||
context->protect_system != PROTECT_SYSTEM_NO ||
context->protect_home != PROTECT_HOME_NO ||
+ exec_needs_pid_namespace(context) ||
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
@@ -4139,6 +4268,7 @@ int exec_invoke(
needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
bool keep_seccomp_privileges = false;
+ bool has_cap_sys_admin = false;
#if HAVE_SELINUX
_cleanup_free_ char *mac_selinux_context_net = NULL;
bool use_selinux = false;
@@ -4790,6 +4920,9 @@ int exec_invoke(
uint64_t capability_ambient_set = context->capability_ambient_set;
+ /* Check CAP_SYS_ADMIN before we enter the user namespace to see if we can mount /proc even though it is masked. */
+ has_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
+
if (needs_sandboxing) {
/* MAC enablement checks need to be done before a new mount ns is created, as they rely on
* /sys being present. The actual MAC context application will happen later, as late as
@@ -4924,6 +5057,40 @@ int exec_invoke(
}
}
+ /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
+ * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
+        if (needs_sandboxing && exec_needs_pid_namespace(context)) {
+                /* The child forked into the new PID namespace is reported to the manager over this socket,
+                 * hence we cannot proceed without it. */
+                if (params->pidref_transport_fd < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        /* Do not pass 'r' here: nothing in this branch assigned it, so it is a leftover of an
+                         * earlier, unrelated call. Use a synthetic errno so that %m is meaningful and a
+                         * negative value is returned to our caller. */
+                        return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
+                }
+
+                /* If we had CAP_SYS_ADMIN prior to joining the user namespace, then we are privileged and don't need
+                 * to check if we can mount /proc/.
+                 *
+                 * We need to check prior to entering the user namespace because if we're running unprivileged or in a
+                 * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
+                 * once we unshare a mount namespace. */
+                r = has_cap_sys_admin ? 1 : can_mount_proc(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return log_exec_error_errno(context, params, r, "Failed to detect if /proc/ can be remounted: %m");
+                }
+                if (r == 0) {
+                        /* The probe ran fine but the mount was refused: fail explicitly rather than running
+                         * the service without the requested /proc/ isolation. */
+                        *exit_status = EXIT_NAMESPACE;
+                        return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EPERM),
+                                                    "PrivatePIDs=yes is configured, but /proc/ cannot be re-mounted due to lack of privileges, refusing.");
+                }
+
+                /* On success this returns in the newly forked child only; the original process has
+                 * reported the child to the manager and exited. */
+                r = setup_private_pids(context, params);
+                if (r < 0) {
+                        *exit_status = EXIT_NAMESPACE;
+                        return log_exec_error_errno(context, params, r, "Failed to set up pid namespace: %m");
+                }
+        }
+
+ /* If PrivatePIDs=yes is configured, we're now running as pid 1 in a pid namespace! */
+
if (needs_mount_namespace) {
_cleanup_free_ char *error_path = NULL;
diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c
index 6fa0b21968..bf6592faed 100644
--- a/src/core/execute-serialize.c
+++ b/src/core/execute-serialize.c
@@ -1391,6 +1391,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext
if (r < 0)
return r;
+ r = serialize_fd(f, fds, "exec-parameters-pidref-transport-fd", p->pidref_transport_fd);
+ if (r < 0)
+ return r;
+
if (c && exec_context_restrict_filesystems_set(c)) {
r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd);
if (r < 0)
@@ -1660,6 +1664,14 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
continue;
close_and_replace(p->handoff_timestamp_fd, fd);
+ } else if ((val = startswith(l, "exec-parameters-pidref-transport-fd="))) {
+ int fd;
+
+ fd = deserialize_fd(fds, val);
+ if (fd < 0)
+ continue;
+
+ close_and_replace(p->pidref_transport_fd, fd);
} else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) {
int fd;
@@ -1926,6 +1938,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
if (r < 0)
return r;
+ r = serialize_item(f, "exec-context-private-pids", private_pids_to_string(c->private_pids));
+ if (r < 0)
+ return r;
+
r = serialize_bool_elide(f, "exec-context-remove-ipc", c->remove_ipc);
if (r < 0)
return r;
@@ -2813,6 +2829,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
if (r < 0)
return r;
c->private_ipc = r;
+ } else if ((val = startswith(l, "exec-context-private-pids="))) {
+ c->private_pids = private_pids_from_string(val);
+ if (c->private_pids < 0)
+ return -EINVAL;
} else if ((val = startswith(l, "exec-context-remove-ipc="))) {
r = parse_boolean(val);
if (r < 0)
diff --git a/src/core/execute.c b/src/core/execute.c
index 1c41b39a2f..2c5a5db10e 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -254,6 +254,12 @@ bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParame
return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
}
+bool exec_needs_pid_namespace(const ExecContext *context) {
+        assert(context);
+
+        /* PrivatePIDs= is off: no PID namespace is wanted, so the kernel is not even probed. */
+        if (context->private_pids == PRIVATE_PIDS_NO)
+                return false;
+
+        /* Requested, but only honoured if PID namespaces are reported as supported. */
+        return ns_type_supported(NAMESPACE_PID);
+}
+
bool exec_needs_mount_namespace(
const ExecContext *context,
const ExecParameters *params,
@@ -306,7 +312,8 @@ bool exec_needs_mount_namespace(
exec_needs_cgroup_mount(context, params) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
- exec_needs_ipc_namespace(context))
+ exec_needs_ipc_namespace(context) ||
+ exec_needs_pid_namespace(context))
return true;
if (context->root_directory) {
@@ -1026,6 +1033,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
"%sProtectControlGroups: %s\n"
"%sPrivateNetwork: %s\n"
"%sPrivateUsers: %s\n"
+ "%sPrivatePIDs: %s\n"
"%sProtectHome: %s\n"
"%sProtectSystem: %s\n"
"%sMountAPIVFS: %s\n"
@@ -1052,6 +1060,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
prefix, protect_control_groups_to_string(c->protect_control_groups),
prefix, yes_no(c->private_network),
prefix, private_users_to_string(c->private_users),
+ prefix, private_pids_to_string(c->private_pids),
prefix, protect_home_to_string(c->protect_home),
prefix, protect_system_to_string(c->protect_system),
prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
diff --git a/src/core/execute.h b/src/core/execute.h
index 7274c68d3d..32dabf177f 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -335,6 +335,7 @@ struct ExecContext {
ProtectControlGroups protect_control_groups;
ProtectSystem protect_system;
ProtectHome protect_home;
+ PrivatePIDs private_pids;
bool protect_hostname;
bool dynamic_user;
@@ -465,6 +466,7 @@ struct ExecParameters {
char **files_env;
int user_lookup_fd;
int handoff_timestamp_fd;
+ int pidref_transport_fd;
int bpf_restrict_fs_map_fd;
@@ -486,6 +488,7 @@ struct ExecParameters {
.bpf_restrict_fs_map_fd = -EBADF, \
.user_lookup_fd = -EBADF, \
.handoff_timestamp_fd = -EBADF, \
+ .pidref_transport_fd = -EBADF, \
}
#include "unit.h"
@@ -623,6 +626,7 @@ ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_;
bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
bool exec_needs_network_namespace(const ExecContext *context);
bool exec_needs_ipc_namespace(const ExecContext *context);
+bool exec_needs_pid_namespace(const ExecContext *context);
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
index f5cbb319d7..d7564b3767 100644
--- a/src/core/load-fragment-gperf.gperf.in
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -133,6 +133,7 @@
{{type}}.PrivateUsers, config_parse_private_users, 0, offsetof({{type}}, exec_context.private_users)
{{type}}.PrivateMounts, config_parse_tristate, 0, offsetof({{type}}, exec_context.private_mounts)
{{type}}.PrivateIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.private_ipc)
+{{type}}.PrivatePIDs, config_parse_private_pids, 0, offsetof({{type}}, exec_context.private_pids)
{{type}}.ProtectSystem, config_parse_protect_system, 0, offsetof({{type}}, exec_context.protect_system)
{{type}}.ProtectHome, config_parse_protect_home, 0, offsetof({{type}}, exec_context.protect_home)
{{type}}.MountFlags, config_parse_exec_mount_propagation_flag, 0, offsetof({{type}}, exec_context.mount_propagation_flag)
diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c
index 1d813332b1..f34c930f4e 100644
--- a/src/core/load-fragment.c
+++ b/src/core/load-fragment.c
@@ -135,6 +135,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc);
DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers);
+DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs);
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_control_groups, protect_control_groups, ProtectControlGroups);
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode);
DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode);
diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h
index 9b95f0c24e..8ac962a94b 100644
--- a/src/core/load-fragment.h
+++ b/src/core/load-fragment.h
@@ -114,6 +114,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv);
CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems);
CONFIG_PARSER_PROTOTYPE(config_parse_private_tmp);
CONFIG_PARSER_PROTOTYPE(config_parse_private_users);
+CONFIG_PARSER_PROTOTYPE(config_parse_private_pids);
CONFIG_PARSER_PROTOTYPE(config_parse_protect_control_groups);
CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota);
CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset);
diff --git a/src/core/manager.c b/src/core/manager.c
index f58bc547a6..296d7416b1 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -126,6 +126,7 @@ static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint
static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_pidref_transport_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata);
static int manager_dispatch_run_queue(sd_event_source *source, void *userdata);
static int manager_dispatch_sigchld(sd_event_source *source, void *userdata);
@@ -913,6 +914,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
.signal_fd = -EBADF,
.user_lookup_fds = EBADF_PAIR,
.handoff_timestamp_fds = EBADF_PAIR,
+ .pidref_transport_fds = EBADF_PAIR,
.private_listen_fd = -EBADF,
.dev_autofs_fd = -EBADF,
.cgroup_inotify_fd = -EBADF,
@@ -1309,6 +1311,55 @@ static int manager_setup_handoff_timestamp_fd(Manager *m) {
return 0;
}
+/* Prepares the manager's pidref transport channel: allocates the AF_UNIX datagram socket pair when the
+ * receiving end (index 0) is not open, and attaches the receiving end to the event loop when no event
+ * source is registered yet. Returns 0 on success and a negative errno-style value when a mandatory step
+ * fails. */
+static int manager_setup_pidref_transport_fd(Manager *m) {
+ int r;
+
+ assert(m);
+
+ /* Set up the socket pair used for passing parent and child pidrefs back when the executor unshares
+ * a PID namespace and forks again when using PrivatePIDs=yes. */
+
+ if (m->pidref_transport_fds[0] < 0) {
+ /* Drop any stale event source and any half-open pair before allocating a fresh one. */
+ m->pidref_event_source = sd_event_source_disable_unref(m->pidref_event_source);
+ safe_close_pair(m->pidref_transport_fds);
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->pidref_transport_fds) < 0)
+ return log_error_errno(errno, "Failed to allocate pidref socket: %m");
+
+ /* Make sure children never have to block */
+ (void) fd_increase_rxbuf(m->pidref_transport_fds[0], MANAGER_SOCKET_RCVBUF_SIZE);
+
+ /* Mandatory: manager_dispatch_pidref_transport_fd() rejects messages without valid SCM_CREDENTIALS. */
+ r = setsockopt_int(m->pidref_transport_fds[0], SOL_SOCKET, SO_PASSCRED, true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to enable SO_PASSCRED for pidref socket: %m");
+
+ /* Best effort only: failures are logged and ignored, the dispatcher falls back to the PID from ucred. */
+ r = setsockopt_int(m->pidref_transport_fds[0], SOL_SOCKET, SO_PASSPIDFD, true);
+ if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
+ log_debug("SO_PASSPIDFD is not supported for pidref socket, ignoring.");
+ else if (r < 0)
+ log_warning_errno(r, "Failed to enable SO_PASSPIDFD for pidref socket, ignoring: %m");
+
+ /* Mark the receiving socket as O_NONBLOCK (but leave sending side as-is) */
+ r = fd_nonblock(m->pidref_transport_fds[0], true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to make pidref socket O_NONBLOCK: %m");
+ }
+
+ /* Checked separately from the socket allocation above, so an already open socket without an event
+ * source still gets one registered. */
+ if (!m->pidref_event_source) {
+ r = sd_event_add_io(m->event, &m->pidref_event_source, m->pidref_transport_fds[0], EPOLLIN, manager_dispatch_pidref_transport_fd, m);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate pidref event source: %m");
+
+ r = sd_event_source_set_priority(m->pidref_event_source, EVENT_PRIORITY_PIDREF);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set priority of pidref event source: %m");
+
+ (void) sd_event_source_set_description(m->pidref_event_source, "pidref");
+ }
+
+ return 0;
+}
+
static unsigned manager_dispatch_cleanup_queue(Manager *m) {
Unit *u;
unsigned n = 0;
@@ -1724,6 +1775,7 @@ Manager* manager_free(Manager *m) {
sd_event_source_unref(m->run_queue_event_source);
sd_event_source_unref(m->user_lookup_event_source);
sd_event_source_unref(m->handoff_timestamp_event_source);
+ sd_event_source_unref(m->pidref_event_source);
sd_event_source_unref(m->memory_pressure_event_source);
safe_close(m->signal_fd);
@@ -1731,6 +1783,7 @@ Manager* manager_free(Manager *m) {
safe_close(m->cgroups_agent_fd);
safe_close_pair(m->user_lookup_fds);
safe_close_pair(m->handoff_timestamp_fds);
+ safe_close_pair(m->pidref_transport_fds);
manager_close_ask_password(m);
@@ -2077,6 +2130,11 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo
/* This shouldn't fail, except if things are really broken. */
return r;
+ r = manager_setup_pidref_transport_fd(m);
+ if (r < 0)
+ /* This shouldn't fail, except if things are really broken. */
+ return r;
+
/* Connect to the bus if we are good for it */
manager_setup_bus(m);
@@ -3747,6 +3805,7 @@ int manager_reload(Manager *m) {
(void) manager_setup_cgroups_agent(m);
(void) manager_setup_user_lookup_fd(m);
(void) manager_setup_handoff_timestamp_fd(m);
+ (void) manager_setup_pidref_transport_fd(m);
/* Third, fire things up! */
manager_coldplug(m);
@@ -5002,6 +5061,142 @@ static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd
return 0;
}
+/* Event loop callback for the receiving end of the pidref transport socket.
+ *
+ * Reads one datagram announcing that a process forked a child (the executor does this after unsharing a
+ * PID namespace for PrivatePIDs=yes), pins both the announcing parent and the announced child as PidRefs,
+ * looks up the owning unit via the cgroup of either process, and forwards the pair to the unit's
+ * notify_pidref() vtable hook.
+ *
+ * Malformed or unexpected messages are logged and dropped (return 0) so that the event source stays
+ * enabled; only a hard receive error is propagated as a negative errno-style value. Every file descriptor
+ * received with a message is either consumed by a PidRef or closed before returning. */
+static int manager_dispatch_pidref_transport_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL, parent_pidref = PIDREF_NULL;
+        _cleanup_close_ int child_pidfd = -EBADF, parent_pidfd = -EBADF;
+        struct ucred *ucred = NULL;
+        CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int)) * 2) control;
+        pid_t child_pid = 0; /* initialized so a short datagram never leaves indeterminate bytes behind */
+        struct msghdr msghdr = {
+                .msg_iov = &IOVEC_MAKE(&child_pid, sizeof(child_pid)),
+                .msg_iovlen = 1,
+                .msg_control = &control,
+                .msg_controllen = sizeof(control),
+        };
+        struct cmsghdr *cmsg;
+        ssize_t n;
+        int r;
+
+        assert(source);
+
+        /* Server expects:
+         * - Parent PID in ucreds enabled via SO_PASSCRED
+         * - Parent PIDFD in SCM_PIDFD message enabled via SO_PASSPIDFD
+         * - Child PIDFD in SCM_RIGHTS in message body
+         * - Child PID in message IOV
+         *
+         * SO_PASSPIDFD may not be supported by the kernel so we fall back to using parent PID from ucreds
+         * and accept some raciness. */
+        n = recvmsg_safe(m->pidref_transport_fds[0], &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC);
+        if (ERRNO_IS_NEG_TRANSIENT(n))
+                return 0; /* Spurious wakeup, try again */
+        if (n == -ECHRNG) {
+                log_warning_errno(n, "Got message with truncated control data (unexpected fds sent?), ignoring.");
+                return 0;
+        }
+        if (n == -EXFULL) {
+                log_warning_errno(n, "Got message with truncated payload data, ignoring.");
+                return 0;
+        }
+        if (n < 0)
+                return log_error_errno(n, "Failed to receive pidref message: %m");
+
+        /* Parse the control data before validating the payload: this moves every received fd into a
+         * _cleanup_close_ variable (or closes it right away), so none of the early returns below can leak
+         * a file descriptor into the manager. */
+        CMSG_FOREACH(cmsg, &msghdr) {
+                if (cmsg->cmsg_level != SOL_SOCKET)
+                        continue;
+
+                if (cmsg->cmsg_type == SCM_CREDENTIALS && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
+                        assert(!ucred);
+                        ucred = CMSG_TYPED_DATA(cmsg, struct ucred);
+                } else if (cmsg->cmsg_type == SCM_PIDFD) {
+                        assert(parent_pidfd < 0);
+                        parent_pidfd = *CMSG_TYPED_DATA(cmsg, int);
+                } else if (cmsg->cmsg_type == SCM_RIGHTS) {
+                        int *received_fds = CMSG_TYPED_DATA(cmsg, int);
+                        size_t n_received_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+                        assert(child_pidfd < 0);
+
+                        /* Exactly one fd (the child's pidfd) is expected. The control buffer has room for
+                         * more than one though, hence close any surplus instead of silently leaking it. */
+                        for (size_t i = 0; i < n_received_fds; i++) {
+                                if (i == 0)
+                                        child_pidfd = received_fds[i];
+                                else
+                                        safe_close(received_fds[i]);
+                        }
+                }
+        }
+
+        if (n != sizeof(child_pid)) {
+                log_warning("Got pidref message of unexpected size %zi (expected %zu), ignoring.", n, sizeof(child_pid));
+                return 0;
+        }
+
+        /* Verify and set parent pidref. */
+        if (!ucred || !pid_is_valid(ucred->pid)) {
+                log_warning("Received pidref message without valid credentials. Ignoring.");
+                return 0;
+        }
+
+        /* Need to handle kernels without SO_PASSPIDFD where SCM_PIDFD will not be set. */
+        if (parent_pidfd >= 0)
+                r = pidref_set_pidfd_consume(&parent_pidref, TAKE_FD(parent_pidfd));
+        else
+                r = pidref_set_pid(&parent_pidref, ucred->pid);
+        if (r < 0) {
+                if (r == -ESRCH)
+                        log_debug_errno(r, "PidRef parent process died before message is processed. Ignoring.");
+                else
+                        log_warning_errno(r, "Failed to pin pidref parent process, ignoring message: %m");
+                return 0;
+        }
+
+        /* The pidfd and the credentials are delivered independently, make sure they name the same process. */
+        if (parent_pidref.pid != ucred->pid) {
+                assert(parent_pidref.fd >= 0);
+                log_warning("Got SCM_PIDFD for parent process " PID_FMT " but got SCM_CREDENTIALS for parent process " PID_FMT ". Ignoring.",
+                            parent_pidref.pid, ucred->pid);
+                return 0;
+        }
+
+        /* Verify and set child pidref. */
+        if (!pid_is_valid(child_pid)) {
+                log_warning("Received pidref message without valid child PID. Ignoring.");
+                return 0;
+        }
+
+        /* Need to handle kernels without PIDFD support. */
+        if (child_pidfd >= 0)
+                r = pidref_set_pidfd_consume(&child_pidref, TAKE_FD(child_pidfd));
+        else
+                r = pidref_set_pid(&child_pidref, child_pid);
+        if (r < 0) {
+                if (r == -ESRCH)
+                        log_debug_errno(r, "PidRef child process died before message is processed. Ignoring.");
+                else
+                        log_warning_errno(r, "Failed to pin pidref child process, ignoring message: %m");
+                return 0;
+        }
+
+        /* Same cross-check for the child: the pidfd passed via SCM_RIGHTS must match the PID in the payload. */
+        if (child_pidref.pid != child_pid) {
+                assert(child_pidref.fd >= 0);
+                log_warning("Got SCM_RIGHTS for child process " PID_FMT " but PID in IOV message is " PID_FMT ". Ignoring.",
+                            child_pidref.pid, child_pid);
+                return 0;
+        }
+
+        log_debug("Got pidref event with parent PID " PID_FMT " and child PID " PID_FMT ".", parent_pidref.pid, child_pidref.pid);
+
+        /* Try finding cgroup of parent process. But if parent process exited and we're not using PIDFD, this could return NULL.
+         * Then fall back to finding cgroup of the child process. */
+        Unit *u = manager_get_unit_by_pidref_cgroup(m, &parent_pidref);
+        if (!u)
+                u = manager_get_unit_by_pidref_cgroup(m, &child_pidref);
+        if (!u) {
+                log_debug("Got pidref for parent process " PID_FMT " and child process " PID_FMT " we are not interested in, ignoring.", parent_pidref.pid, child_pidref.pid);
+                return 0;
+        }
+
+        if (!UNIT_VTABLE(u)->notify_pidref) {
+                log_unit_warning(u, "Received pidref event from unexpected unit type '%s'.", unit_type_to_string(u->type));
+                return 0;
+        }
+
+        UNIT_VTABLE(u)->notify_pidref(u, &parent_pidref, &child_pidref);
+
+        return 0;
+}
+
void manager_ref_console(Manager *m) {
assert(m);
diff --git a/src/core/manager.h b/src/core/manager.h
index c1f7f8c083..e4cada80ff 100644
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -289,6 +289,9 @@ struct Manager {
int handoff_timestamp_fds[2];
sd_event_source *handoff_timestamp_event_source;
+ int pidref_transport_fds[2];
+ sd_event_source *pidref_event_source;
+
RuntimeScope runtime_scope;
LookupPaths lookup_paths;
@@ -678,12 +681,13 @@ void unit_defaults_done(UnitDefaults *defaults);
enum {
/* most important … */
- EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-11,
- EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-10,
- EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-10,
- EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv1 */
- EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv2 */
- EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-8,
+ EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-12,
+ EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-11,
+ EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-11,
+ EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-10, /* cgroupv1 */
+ EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-10, /* cgroupv2 */
+ EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-9,
+ EVENT_PRIORITY_PIDREF = SD_EVENT_PRIORITY_NORMAL-8,
EVENT_PRIORITY_HANDOFF_TIMESTAMP = SD_EVENT_PRIORITY_NORMAL-7,
EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6,
EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5,
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 91c905f2fe..57dbbc4fc7 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -2061,7 +2061,8 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
p->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
p->protect_kernel_tunables ||
p->protect_proc != PROTECT_PROC_DEFAULT ||
- p->proc_subset != PROC_SUBSET_ALL;
+ p->proc_subset != PROC_SUBSET_ALL ||
+ p->private_pids != PRIVATE_PIDS_NO;
}
/* Walk all mount entries and dropping any unused mounts. This affects all
@@ -3366,3 +3367,10 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = {
};
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF);
+
+/* Maps each PrivatePIDs value to the string used for the PrivatePIDs= setting. */
+static const char* const private_pids_table[_PRIVATE_PIDS_MAX] = {
+ [PRIVATE_PIDS_NO] = "no",
+ [PRIVATE_PIDS_YES] = "yes",
+};
+
+/* Generates the private_pids string lookup helpers from the table above.
+ * NOTE(review): presumably boolean "true" spellings parse as PRIVATE_PIDS_YES (the last macro argument) —
+ * confirm against the macro definition. */
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_pids, PrivatePIDs, PRIVATE_PIDS_YES);
diff --git a/src/core/namespace.h b/src/core/namespace.h
index 7b6e892cc2..bd48aa31da 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -78,6 +78,13 @@ typedef enum ProtectControlGroups {
_PROTECT_CONTROL_GROUPS_INVALID = -EINVAL,
} ProtectControlGroups;
+/* Values of the PrivatePIDs= setting; any value other than PRIVATE_PIDS_NO makes
+ * namespace_parameters_mount_apivfs() request API VFS mounts. */
+typedef enum PrivatePIDs {
+ PRIVATE_PIDS_NO,
+ PRIVATE_PIDS_YES,
+ _PRIVATE_PIDS_MAX, /* number of valid values, used to size private_pids_table[] */
+ _PRIVATE_PIDS_INVALID = -EINVAL,
+} PrivatePIDs;
+
struct BindMount {
char *source;
char *destination;
@@ -182,6 +189,7 @@ struct NamespaceParameters {
ProtectProc protect_proc;
ProcSubset proc_subset;
PrivateTmp private_tmp;
+ PrivatePIDs private_pids;
};
int setup_namespace(const NamespaceParameters *p, char **reterr_path);
@@ -225,6 +233,9 @@ PrivateUsers private_users_from_string(const char *s) _pure_;
const char* protect_control_groups_to_string(ProtectControlGroups i) _const_;
ProtectControlGroups protect_control_groups_from_string(const char *s) _pure_;
+const char* private_pids_to_string(PrivatePIDs i) _const_;
+PrivatePIDs private_pids_from_string(const char *s) _pure_;
+
void bind_mount_free_many(BindMount *b, size_t n);
int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
diff --git a/src/core/service.c b/src/core/service.c
index 737dc9905a..a9a64938b5 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -710,6 +710,9 @@ static int service_verify(Service *s) {
if (s->type == SERVICE_DBUS && !s->bus_name)
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
+ if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context))
+ return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service of Type=forking does not support PrivatePIDs=yes. Refusing.");
+
if (s->usb_function_descriptors && !s->usb_function_strings)
log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring.");
@@ -4908,6 +4911,35 @@ static void service_handoff_timestamp(
unit_add_to_dbus_queue(u);
}
+/* notify_pidref() hook of service units: called with a (parent, child) pair of process references. If the
+ * parent is the service's current main or control process, the child replaces it and is watched
+ * exclusively; any other parent is logged at debug level and ignored. Both references must be set. */
+static void service_notify_pidref(Unit *u, PidRef *parent_pidref, PidRef *child_pidref) {
+ Service *s = ASSERT_PTR(SERVICE(u));
+ int r;
+
+ assert(pidref_is_set(parent_pidref));
+ assert(pidref_is_set(child_pidref));
+
+ if (pidref_equal(&s->main_pid, parent_pidref)) {
+ /* NOTE(review): TAKE_PIDREF() presumably moves ownership of the child reference out of the
+ * caller's variable — confirm against the macro definition. */
+ r = service_set_main_pidref(s, TAKE_PIDREF(*child_pidref), /* start_timestamp = */ NULL);
+ if (r < 0)
+ return (void) log_unit_warning_errno(u, r, "Failed to set new main pid: %m");
+
+ /* Since the child process is PID 1 in a new PID namespace, it must be exclusive to this unit. */
+ r = unit_watch_pidref(u, &s->main_pid, /* exclusive= */ true);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to watch new main PID " PID_FMT ": %m", s->main_pid.pid);
+ } else if (pidref_equal(&s->control_pid, parent_pidref)) {
+ /* Stop tracking the old control process before the child takes its place. */
+ service_unwatch_control_pid(s);
+ s->control_pid = TAKE_PIDREF(*child_pidref);
+
+ r = unit_watch_pidref(u, &s->control_pid, /* exclusive= */ true);
+ if (r < 0)
+ log_unit_warning_errno(u, r, "Failed to watch new control PID " PID_FMT ": %m", s->control_pid.pid);
+ } else
+ return (void) log_unit_debug(u, "Parent process " PID_FMT " does not match main or control processes, ignoring.", parent_pidref->pid);
+
+ /* Reached only when the main or control PID was replaced, even if watching the new PID failed. */
+ unit_add_to_dbus_queue(u);
+}
+
static int service_get_timeout(Unit *u, usec_t *timeout) {
Service *s = ASSERT_PTR(SERVICE(u));
uint64_t t;
@@ -5638,6 +5670,7 @@ const UnitVTable service_vtable = {
.notify_cgroup_oom = service_notify_cgroup_oom_event,
.notify_message = service_notify_message,
.notify_handoff_timestamp = service_handoff_timestamp,
+ .notify_pidref = service_notify_pidref,
.main_pid = service_main_pid,
.control_pid = service_control_pid,
diff --git a/src/core/unit.c b/src/core/unit.c
index eec08a2fbf..71488a4555 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -4237,6 +4237,9 @@ static int unit_verify_contexts(const Unit *u) {
exec_needs_mount_namespace(ec, /* params = */ NULL, /* runtime = */ NULL))
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory= may not be below /proc/, /sys/ or /dev/ when using mount namespacing. Refusing.");
+ if (exec_needs_pid_namespace(ec) && !UNIT_VTABLE(u)->notify_pidref)
+ return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivatePIDs= setting is only supported for service units. Refusing.");
+
const KillContext *kc = unit_get_kill_context(u);
if (ec->pam_name && kc && !IN_SET(kc->kill_mode, KILL_CONTROL_GROUP, KILL_MIXED))
@@ -5402,6 +5405,8 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) {
p->user_lookup_fd = u->manager->user_lookup_fds[1];
p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1];
+ if (UNIT_VTABLE(u)->notify_pidref)
+ p->pidref_transport_fd = u->manager->pidref_transport_fds[1];
p->cgroup_id = crt ? crt->cgroup_id : 0;
p->invocation_id = u->invocation_id;
diff --git a/src/core/unit.h b/src/core/unit.h
index 01e1adf961..a8eb366337 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -640,6 +640,9 @@ typedef struct UnitVTable {
/* Called whenever we learn a handoff timestamp */
void (*notify_handoff_timestamp)(Unit *u, const struct ucred *ucred, const dual_timestamp *ts);
+ /* Called whenever we learn about a child process */
+ void (*notify_pidref)(Unit *u, PidRef *parent_pidref, PidRef *child_pidref);
+
/* Called whenever a name this Unit registered for comes or goes away. */
void (*bus_name_owner_change)(Unit *u, const char *new_owner);
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index 90b6f233e2..06bfb90c8f 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -1061,7 +1061,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
"LogNamespace",
"RootImagePolicy",
"MountImagePolicy",
- "ExtensionImagePolicy"))
+ "ExtensionImagePolicy",
+ "PrivatePIDs"))
return bus_append_string(m, field, eq);
if (STR_IN_SET(field, "IgnoreSIGPIPE",
diff --git a/test/TEST-07-PID1/test.sh b/test/TEST-07-PID1/test.sh
index 2513406e0d..66e1b684ea 100755
--- a/test/TEST-07-PID1/test.sh
+++ b/test/TEST-07-PID1/test.sh
@@ -6,12 +6,17 @@ TEST_DESCRIPTION="Tests for core PID1 functionality"
# for testing PrivateNetwork=yes
NSPAWN_ARGUMENTS="--capability=CAP_NET_ADMIN"
+# for testing PrivatePIDs=yes
+TEST_INSTALL_VERITY_MINIMAL=1
# shellcheck source=test/test-functions
. "${TEST_BASE_DIR:?}/test-functions"
test_append_files() {
image_install logger socat
+ inst_binary mksquashfs
+ inst_binary unsquashfs
+ install_verity_minimal
}
do_test "$@"
diff --git a/test/units/TEST-07-PID1.private-pids.sh b/test/units/TEST-07-PID1.private-pids.sh
new file mode 100755
index 0000000000..6f16820aee
--- /dev/null
+++ b/test/units/TEST-07-PID1.private-pids.sh
@@ -0,0 +1,161 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# shellcheck disable=SC2016
+set -eux
+set -o pipefail
+
+# shellcheck source=test/units/test-control.sh
+. "$(dirname "$0")"/test-control.sh
+# shellcheck source=test/units/util.sh
+. "$(dirname "$0")"/util.sh
+
+# Remember whether /proc/scsi was already a mount point before the tests ran, so that testcase_unpriv and
+# at_exit only mount/unmount the tmpfs this script put there itself.
+HAS_EXISTING_SCSI_MOUNT=no
+if findmnt --mountpoint /proc/scsi; then
+ HAS_EXISTING_SCSI_MOUNT=yes
+fi
+
+# EXIT trap handler: best-effort cleanup of the mounts, temporary directories and transient units the
+# testcases may have left behind.
+at_exit() {
+ # Cleanup steps may fail (e.g. nothing is mounted); do not let that abort the handler
+ set +e
+
+ # Unmount any file systems
+ if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then
+ umount /proc/scsi
+ fi
+ umount /tmp/TEST-07-PID1-private-pids-proc
+ rm -rf /tmp/TEST-07-PID1-private-pids-proc
+ # Remove any test files
+ rm -rf /tmp/TEST-07-PID1-private-pids-services
+ rm -rf /tmp/TEST-07-PID1-private-pids-root
+ # Stop any test services
+ systemctl kill --signal=KILL TEST-07-PID1-private-pid.service
+ # Remove any failed transient units
+ systemctl reset-failed
+}
+
+trap at_exit EXIT
+
+# Checks the basic PrivatePIDs=yes behavior: the command sees itself as PID 1, sees no other processes,
+# gets a /proc mount with the expected options, and the unit's ExecMainPID can be killed from outside.
+testcase_basic() {
+ # Verify current process is PID1 in new namespace
+ assert_eq "$(systemd-run -p PrivatePIDs=yes --wait --pipe readlink /proc/self)" "1"
+ # Verify we are the only process in the new namespace
+ assert_eq "$(systemd-run -p PrivatePIDs=yes --wait --pipe ps aux --no-heading | wc -l)" "1"
+ # Verify procfs mount
+ systemd-run -p PrivatePIDs=yes --wait --pipe \
+ bash -xec '[[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ rw ]];
+ [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ nosuid ]];
+ [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ nodev ]];
+ [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ noexec ]];'
+
+ # Verify main PID is correct
+ systemd-run -p PrivatePIDs=yes --remain-after-exit --unit TEST-07-PID1-private-pid sleep infinity
+ # Wait for ExecMainPID to be correctly populated as there might be a race between spawning service
+ # and actual exec child process
+ sleep 2
+ pid=$(systemctl show TEST-07-PID1-private-pid.service -p ExecMainPID --value)
+ kill -9 "$pid"
+ # Poll (for at most 10s) until the unit reports the "failed" sub-state, then check how it was recorded
+ timeout 10s bash -xec 'while [[ "$(systemctl show -P SubState TEST-07-PID1-private-pid.service)" != "failed" ]]; do sleep .5; done'
+ assert_eq "$(systemctl show -P Result TEST-07-PID1-private-pid.service)" "signal"
+ assert_eq "$(systemctl show -P ExecMainStatus TEST-07-PID1-private-pid.service)" "9"
+ systemctl reset-failed
+}
+
+# Checks unit verification via systemd-analyze: PrivatePIDs=yes is accepted for Type=oneshot and refused
+# for Type=forking.
+testcase_analyze() {
+ mkdir -p /tmp/TEST-07-PID1-private-pids-services
+
+ # Verify other services are compatible with PrivatePIDs=yes
+ cat <<EOF >/tmp/TEST-07-PID1-private-pids-services/oneshot-valid.service
+[Service]
+ExecStart=echo hello
+PrivatePIDs=yes
+Type=oneshot
+EOF
+
+ # Verify Type=forking services are not compatible with PrivatePIDs=yes
+ cat <<EOF >/tmp/TEST-07-PID1-private-pids-services/forking-invalid.service
+[Service]
+ExecStart=echo hello
+PrivatePIDs=yes
+Type=forking
+EOF
+
+ # The valid unit must verify cleanly; verification of the invalid one is expected to fail
+ systemd-analyze --recursive-errors=no verify /tmp/TEST-07-PID1-private-pids-services/oneshot-valid.service
+ (! systemd-analyze --recursive-errors=no verify /tmp/TEST-07-PID1-private-pids-services/forking-invalid.service)
+
+
+ rm -rf /tmp/TEST-07-PID1-private-pids-services
+}
+
+# Runs a command with PrivatePIDs=yes combined with a large set of other sandboxing properties inside an
+# unpacked minimal root image. The testcase passes if the transient unit starts and grep finds MARKER=1 in
+# the image's /etc/os-release.
+testcase_multiple_features() {
+    # Unpack the minimal image so that it can serve as RootDirectory= below
+    unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-private-pids-root /usr/share/minimal_0.raw
+
+    # Every continued line ends in " \" so that argument separation never depends on the indentation
+    # of the following line.
+    systemd-run \
+        -p PrivatePIDs=yes \
+        -p RootDirectory=/tmp/TEST-07-PID1-private-pids-root \
+        -p ProcSubset=pid \
+        -p BindReadOnlyPaths=/usr/share \
+        -p NoNewPrivileges=yes \
+        -p ProtectSystem=strict \
+        -p User=testuser \
+        -p Group=testuser \
+        -p RuntimeDirectory=abc \
+        -p StateDirectory=qed \
+        -p InaccessiblePaths=/usr/include \
+        -p TemporaryFileSystem=/home \
+        -p PrivateTmp=yes \
+        -p PrivateDevices=yes \
+        -p PrivateNetwork=yes \
+        -p PrivateUsersEx=self \
+        -p PrivateIPC=yes \
+        -p ProtectHostname=yes \
+        -p ProtectClock=yes \
+        -p ProtectKernelTunables=yes \
+        -p ProtectKernelModules=yes \
+        -p ProtectKernelLogs=yes \
+        -p ProtectControlGroupsEx=private \
+        -p LockPersonality=yes \
+        -p Environment=ABC=QED \
+        --wait \
+        --pipe \
+        grep MARKER=1 /etc/os-release
+
+    rm -rf /tmp/TEST-07-PID1-private-pids-root
+}
+
+# Checks PrivatePIDs=yes in an unprivileged per-user manager: first with an unmasked procfs instance
+# available (must work), then with /proc/ partially over-mounted (must fail). Skipped when per-user
+# instances or unprivileged user namespaces are unavailable.
+testcase_unpriv() {
+ if [ ! -f /usr/lib/systemd/user/dbus.socket ] && [ ! -f /etc/systemd/user/dbus.socket ]; then
+ echo "Per-user instances are not supported, skipping unprivileged PrivatePIDs=yes test"
+ return 0
+ fi
+
+ # sysctl -n prints only the value and -e ignores unknown keys, so a missing key yields an empty string
+ if [[ "$(sysctl -ne kernel.apparmor_restrict_unprivileged_userns)" -eq 1 ]]; then
+ echo "Cannot create unprivileged user namespaces, skipping unprivileged PrivatePIDs=yes test"
+ return 0
+ fi
+
+ # The kernel has a restriction for unprivileged user namespaces where they cannot mount a less restrictive
+ # instance of /proc/. So if /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs as systemd-nspawn does),
+ # then mounting a new /proc/ will fail and we will still see the host's /proc/. Thus, to allow tests to run in
+ # a VM or nspawn, we mount a new proc on a temporary directory with no masking to bypass this kernel restriction.
+ mkdir -p /tmp/TEST-07-PID1-private-pids-proc
+ mount -t proc proc /tmp/TEST-07-PID1-private-pids-proc
+
+ # Verify running as unprivileged user can unshare PID namespace and mounts /proc properly.
+ assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1"
+ assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1"
+
+ umount /tmp/TEST-07-PID1-private-pids-proc
+ rm -rf /tmp/TEST-07-PID1-private-pids-proc
+
+ # Now verify the behavior with masking - units should fail as PrivatePIDs=yes has no graceful fallback.
+ # Only add our own over-mount when /proc/scsi was not a mount point to begin with.
+ if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then
+ mount -t tmpfs tmpfs /proc/scsi
+ fi
+
+ (! runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes true)
+
+ if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then
+ umount /proc/scsi
+ fi
+}
+
+run_testcases