From 406f1775017a5631bc91a1f53ac5e50f4fbfac0c Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Thu, 29 Aug 2024 17:10:46 +0200 Subject: core: Introduce PrivatePIDs= This new setting allows unsharing the pid namespace in a unit. Because you have to fork to get a process into a pid namespace, we fork in systemd-executor to get into the new pid namespace. The parent then sends the pid of the child process back to the manager and exits while the child process continues on with the rest of exec_invoke() and then executes the actual payload. Communicating the child pid is done via a new pidref socket pair that is set up on manager startup. We unshare the PID namespace right before the mount namespace so we mount procfs correctly. Note PrivatePIDs=yes always implies MountAPIVFS=yes to mount procfs. When running unprivileged in a user session, user namespace is set up first to allow for PID namespace to be unshared. However, when running in privileged mode, we unshare the user namespace last to ensure the user namespace does not own the PID namespace and cannot break out of the sandbox. Note we disallow Type=forking services from using PrivatePIDs=yes since the init process inside the PID namespace must not exit for other processes in the namespace to exist. Note Daan De Meyer did the original work for this commit with Ryan Wilson addressing follow-ups. 
Co-authored-by: Daan De Meyer --- src/core/dbus-execute.c | 23 ++++ src/core/exec-invoke.c | 169 ++++++++++++++++++++++++++++- src/core/execute-serialize.c | 20 ++++ src/core/execute.c | 11 +- src/core/execute.h | 4 + src/core/load-fragment-gperf.gperf.in | 1 + src/core/load-fragment.c | 1 + src/core/load-fragment.h | 1 + src/core/manager.c | 195 ++++++++++++++++++++++++++++++++++ src/core/manager.h | 16 +-- src/core/namespace.c | 10 +- src/core/namespace.h | 11 ++ src/core/service.c | 33 ++++++ src/core/unit.c | 5 + src/core/unit.h | 3 + 15 files changed, 494 insertions(+), 9 deletions(-) (limited to 'src/core') diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index a9a73b599b..e297323f1d 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -63,6 +63,7 @@ static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL); static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string); static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string); static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string); +static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_pids, "s", PrivatePIDs, private_pids_to_string); static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI); static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC); static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa); @@ -1194,6 +1195,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_tristate, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateIPC", 
"b", bus_property_get_bool, offsetof(ExecContext, private_ipc), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivatePIDs", "s", property_get_private_pids, offsetof(ExecContext, private_pids), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1970,6 +1972,27 @@ int bus_exec_context_set_transient_property( return 1; } + if (streq(name, "PrivatePIDs")) { + const char *s; + PrivatePIDs t; + + r = sd_bus_message_read(message, "s", &s); + if (r < 0) + return r; + + t = private_pids_from_string(s); + if (t < 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->private_pids = t; + (void) unit_write_settingf(u, flags, name, "%s=%s", + name, private_pids_to_string(c->private_pids)); + } + + return 1; + } + if (streq(name, "PrivateDevices")) return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error); diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 72ed53360b..120067a774 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2251,6 +2251,130 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi return 1; } +static int can_mount_proc(const ExecContext *c, ExecParameters *p) { + _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; + _cleanup_(sigkill_waitp) pid_t pid = 0; + ssize_t n; + int r; + + assert(c); + assert(p); + + /* If running via unprivileged user manager and /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs + * like systemd-nspawn does), then mounting /proc/ will fail with EPERM. 
This is due to a kernel restriction + * where unprivileged user namespaces cannot mount a less restrictive instance of /proc. */ + + /* Create a communication channel so that the child can tell the parent a proper error code in case it + * failed. */ + if (pipe2(errno_pipe, O_CLOEXEC) < 0) + return log_exec_debug_errno(c, p, errno, "Failed to create pipe for communicating with child process (sd-proc-check): %m"); + + /* Fork a child process into its own mount and PID namespace. Note safe_fork() already remounts / as SLAVE + * with FORK_MOUNTNS_SLAVE. */ + r = safe_fork("(sd-proc-check)", + FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_PIDNS, &pid); + if (r < 0) + return log_exec_debug_errno(c, p, r, "Failed to fork child process (sd-proc-check): %m"); + if (r == 0) { + errno_pipe[0] = safe_close(errno_pipe[0]); + + /* Try mounting /proc on /dev/shm/. No need to clean up the mount since the mount + * namespace will be cleaned up once the process exits. */ + r = mount_follow_verbose(LOG_DEBUG, "proc", "/dev/shm/", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); + if (r < 0) { + (void) write(errno_pipe[1], &r, sizeof(r)); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + errno_pipe[1] = safe_close(errno_pipe[1]); + + /* Try to read an error code from the child */ + n = read(errno_pipe[0], &r, sizeof(r)); + if (n < 0) + return log_exec_debug_errno(c, p, errno, "Failed to read errno from pipe with child process (sd-proc-check): %m"); + if (n == sizeof(r)) { /* an error code was sent to us */ + /* This is the expected case where proc cannot be mounted due to permissions. 
*/ + if (ERRNO_IS_NEG_PRIVILEGE(r)) + return 0; + if (r < 0) + return r; + + return -EIO; + } + if (n != 0) /* on success we should have read 0 bytes */ + return -EIO; + + r = wait_for_terminate_and_check("(sd-proc-check)", TAKE_PID(pid), 0 /* flags= */); + if (r < 0) + return log_exec_debug_errno(c, p, r, "Failed to wait for (sd-proc-check) child process to terminate: %m"); + if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */ + return log_exec_debug_errno(c, p, SYNTHETIC_ERRNO(EIO), "Child process (sd-proc-check) exited with unexpected exit status '%d'.", r); + + return 1; +} + +static int setup_private_pids(const ExecContext *c, ExecParameters *p) { + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; + ssize_t n; + int r, q; + + assert(c); + assert(p); + assert(p->pidref_transport_fd >= 0); + + /* The first process created after unsharing a pid namespace becomes PID 1 in the pid namespace, so + * we have to fork after unsharing the pid namespace to become PID 1. The parent sends the child + * pidref to the manager and exits while the child process continues with the rest of exec_invoke() + * and finally executes the actual payload. */ + + /* Create a communication channel so that the parent can tell the child a proper error code in case it + * failed to send child pidref to the manager. */ + if (pipe2(errno_pipe, O_CLOEXEC) < 0) + return log_exec_debug_errno(c, p, errno, "Failed to create pipe for communicating with parent process: %m"); + + r = pidref_safe_fork("(sd-pidns-child)", FORK_NEW_PIDNS, &pidref); + if (r < 0) + return log_exec_debug_errno(c, p, r, "Failed to fork child into new pid namespace: %m"); + if (r > 0) { + errno_pipe[0] = safe_close(errno_pipe[0]); + + /* In the parent process, we send the child pidref to the manager and exit. + * If PIDFD is not supported, only the child PID is sent. 
The server then + * uses the child PID to set the new exec main process. */ + q = send_one_fd_iov( + p->pidref_transport_fd, + pidref.fd, + &IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)), + /*iovlen=*/ 1, + /*flags=*/ 0); + /* Send error code to child process. */ + (void) write(errno_pipe[1], &q, sizeof(q)); + /* Exit here so we only go through the destructors in exec_invoke only once - in the child - as + * some destructors have external effects. The main codepaths continue in the child process. */ + _exit(q < 0 ? EXIT_FAILURE : EXIT_SUCCESS); + } + + errno_pipe[1] = safe_close(errno_pipe[1]); + p->pidref_transport_fd = safe_close(p->pidref_transport_fd); + + /* Try to read an error code from the parent. Note a child process cannot wait for the parent so we always + * receive an errno even on success. */ + n = read(errno_pipe[0], &r, sizeof(r)); + if (n < 0) + return log_exec_debug_errno(c, p, errno, "Failed to read errno from pipe with parent process: %m"); + if (n != sizeof(r)) + return log_exec_debug_errno(c, p, SYNTHETIC_ERRNO(EIO), "Failed to read enough bytes from pipe with parent process"); + if (r < 0) + return log_exec_debug_errno(c, p, r, "Failed to send child pidref to manager: %m"); + + /* NOTE! This function returns in the child process only. */ + return r; +} + static int create_many_symlinks(const char *root, const char *source, char **symlinks) { _cleanup_free_ char *src_abs = NULL; int r; @@ -3301,6 +3425,7 @@ static int apply_mount_namespace( .private_dev = needs_sandboxing && context->private_devices, .private_network = needs_sandboxing && exec_needs_network_namespace(context), .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context), + .private_pids = needs_sandboxing && exec_needs_pid_namespace(context) ? context->private_pids : PRIVATE_PIDS_NO, .private_tmp = needs_sandboxing ? 
context->private_tmp : false, .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context), @@ -3573,7 +3698,7 @@ static int close_remaining_fds( const int *fds, size_t n_fds) { size_t n_dont_close = 0; - int dont_close[n_fds + 16]; + int dont_close[n_fds + 17]; assert(params); @@ -3612,6 +3737,9 @@ static int close_remaining_fds( if (params->handoff_timestamp_fd >= 0) dont_close[n_dont_close++] = params->handoff_timestamp_fd; + if (params->pidref_transport_fd >= 0) + dont_close[n_dont_close++] = params->pidref_transport_fd; + assert(n_dont_close <= ELEMENTSOF(dont_close)); return close_all_fds(dont_close, n_dont_close); @@ -3934,6 +4062,7 @@ static bool exec_context_need_unprivileged_private_users( !strv_isempty(context->extension_directories) || context->protect_system != PROTECT_SYSTEM_NO || context->protect_home != PROTECT_HOME_NO || + exec_needs_pid_namespace(context) || context->protect_kernel_tunables || context->protect_kernel_modules || context->protect_kernel_logs || @@ -4139,6 +4268,7 @@ int exec_invoke( needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */ needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */ bool keep_seccomp_privileges = false; + bool has_cap_sys_admin = false; #if HAVE_SELINUX _cleanup_free_ char *mac_selinux_context_net = NULL; bool use_selinux = false; @@ -4790,6 +4920,9 @@ int exec_invoke( uint64_t capability_ambient_set = context->capability_ambient_set; + /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though its masked. */ + has_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0; + if (needs_sandboxing) { /* MAC enablement checks need to be done before a new mount ns is created, as they rely on * /sys being present. 
The actual MAC context application will happen later, as late as @@ -4924,6 +5057,40 @@ int exec_invoke( } } + /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible. + * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */ + if (needs_sandboxing && exec_needs_pid_namespace(context)) { + if (params->pidref_transport_fd < 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, r, "PidRef socket is not set up: %m"); + } + + /* If we had CAP_SYS_ADMIN prior to joining the user namespace, then we are privileged and don't need + * to check if we can mount /proc/. + * + * We need to check prior to entering the user namespace because if we're running unprivileged or in a + * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not + * once we unshare a mount namespace. */ + r = has_cap_sys_admin ? 1 : can_mount_proc(context, params); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, r, "Failed to detect if /proc/ can be remounted: %m"); + } + if (r == 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EPERM), + "PrivatePIDs=yes is configured, but /proc/ cannot be re-mounted due to lack of privileges, refusing."); + } + + r = setup_private_pids(context, params); + if (r < 0) { + *exit_status = EXIT_NAMESPACE; + return log_exec_error_errno(context, params, r, "Failed to set up pid namespace: %m"); + } + } + + /* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! 
*/ + if (needs_mount_namespace) { _cleanup_free_ char *error_path = NULL; diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 6fa0b21968..bf6592faed 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -1391,6 +1391,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext if (r < 0) return r; + r = serialize_fd(f, fds, "exec-parameters-pidref-transport-fd", p->pidref_transport_fd); + if (r < 0) + return r; + if (c && exec_context_restrict_filesystems_set(c)) { r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd); if (r < 0) @@ -1660,6 +1664,14 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { continue; close_and_replace(p->handoff_timestamp_fd, fd); + } else if ((val = startswith(l, "exec-parameters-pidref-transport-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + close_and_replace(p->pidref_transport_fd, fd); } else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) { int fd; @@ -1926,6 +1938,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { if (r < 0) return r; + r = serialize_item(f, "exec-context-private-pids", private_pids_to_string(c->private_pids)); + if (r < 0) + return r; + r = serialize_bool_elide(f, "exec-context-remove-ipc", c->remove_ipc); if (r < 0) return r; @@ -2813,6 +2829,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { if (r < 0) return r; c->private_ipc = r; + } else if ((val = startswith(l, "exec-context-private-pids="))) { + c->private_pids = private_pids_from_string(val); + if (c->private_pids < 0) + return -EINVAL; } else if ((val = startswith(l, "exec-context-remove-ipc="))) { r = parse_boolean(val); if (r < 0) diff --git a/src/core/execute.c b/src/core/execute.c index 1c41b39a2f..2c5a5db10e 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -254,6 +254,12 @@ bool 
exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParame return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT); } +bool exec_needs_pid_namespace(const ExecContext *context) { + assert(context); + + return context->private_pids != PRIVATE_PIDS_NO && ns_type_supported(NAMESPACE_PID); +} + bool exec_needs_mount_namespace( const ExecContext *context, const ExecParameters *params, @@ -306,7 +312,8 @@ bool exec_needs_mount_namespace( exec_needs_cgroup_mount(context, params) || context->protect_proc != PROTECT_PROC_DEFAULT || context->proc_subset != PROC_SUBSET_ALL || - exec_needs_ipc_namespace(context)) + exec_needs_ipc_namespace(context) || + exec_needs_pid_namespace(context)) return true; if (context->root_directory) { @@ -1026,6 +1033,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { "%sProtectControlGroups: %s\n" "%sPrivateNetwork: %s\n" "%sPrivateUsers: %s\n" + "%sPrivatePIDs: %s\n" "%sProtectHome: %s\n" "%sProtectSystem: %s\n" "%sMountAPIVFS: %s\n" @@ -1052,6 +1060,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { prefix, protect_control_groups_to_string(c->protect_control_groups), prefix, yes_no(c->private_network), prefix, private_users_to_string(c->private_users), + prefix, private_pids_to_string(c->private_pids), prefix, protect_home_to_string(c->protect_home), prefix, protect_system_to_string(c->protect_system), prefix, yes_no(exec_context_get_effective_mount_apivfs(c)), diff --git a/src/core/execute.h b/src/core/execute.h index 7274c68d3d..32dabf177f 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -335,6 +335,7 @@ struct ExecContext { ProtectControlGroups protect_control_groups; ProtectSystem protect_system; ProtectHome protect_home; + PrivatePIDs private_pids; bool protect_hostname; bool dynamic_user; @@ -465,6 +466,7 @@ struct ExecParameters { char **files_env; int user_lookup_fd; int 
handoff_timestamp_fd; + int pidref_transport_fd; int bpf_restrict_fs_map_fd; @@ -486,6 +488,7 @@ struct ExecParameters { .bpf_restrict_fs_map_fd = -EBADF, \ .user_lookup_fd = -EBADF, \ .handoff_timestamp_fd = -EBADF, \ + .pidref_transport_fd = -EBADF, \ } #include "unit.h" @@ -623,6 +626,7 @@ ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_; bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime); bool exec_needs_network_namespace(const ExecContext *context); bool exec_needs_ipc_namespace(const ExecContext *context); +bool exec_needs_pid_namespace(const ExecContext *context); ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params); bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params); diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index f5cbb319d7..d7564b3767 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -133,6 +133,7 @@ {{type}}.PrivateUsers, config_parse_private_users, 0, offsetof({{type}}, exec_context.private_users) {{type}}.PrivateMounts, config_parse_tristate, 0, offsetof({{type}}, exec_context.private_mounts) {{type}}.PrivateIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.private_ipc) +{{type}}.PrivatePIDs, config_parse_private_pids, 0, offsetof({{type}}, exec_context.private_pids) {{type}}.ProtectSystem, config_parse_protect_system, 0, offsetof({{type}}, exec_context.protect_system) {{type}}.ProtectHome, config_parse_protect_home, 0, offsetof({{type}}, exec_context.protect_home) {{type}}.MountFlags, config_parse_exec_mount_propagation_flag, 0, offsetof({{type}}, exec_context.mount_propagation_flag) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 1d813332b1..f34c930f4e 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -135,6 +135,7 @@ 
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc); DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset); DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp); DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers); +DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs); DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_control_groups, protect_control_groups, ProtectControlGroups); DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode); DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode); diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 9b95f0c24e..8ac962a94b 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -114,6 +114,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv); CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems); CONFIG_PARSER_PROTOTYPE(config_parse_private_tmp); CONFIG_PARSER_PROTOTYPE(config_parse_private_users); +CONFIG_PARSER_PROTOTYPE(config_parse_private_pids); CONFIG_PARSER_PROTOTYPE(config_parse_protect_control_groups); CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota); CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset); diff --git a/src/core/manager.c b/src/core/manager.c index f58bc547a6..296d7416b1 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -126,6 +126,7 @@ static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_pidref_transport_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int 
manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata); static int manager_dispatch_run_queue(sd_event_source *source, void *userdata); static int manager_dispatch_sigchld(sd_event_source *source, void *userdata); @@ -913,6 +914,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, .signal_fd = -EBADF, .user_lookup_fds = EBADF_PAIR, .handoff_timestamp_fds = EBADF_PAIR, + .pidref_transport_fds = EBADF_PAIR, .private_listen_fd = -EBADF, .dev_autofs_fd = -EBADF, .cgroup_inotify_fd = -EBADF, @@ -1309,6 +1311,55 @@ static int manager_setup_handoff_timestamp_fd(Manager *m) { return 0; } +static int manager_setup_pidref_transport_fd(Manager *m) { + int r; + + assert(m); + + /* Set up the socket pair used for passing parent and child pidrefs back when the executor unshares + * a PID namespace and forks again when using PrivatePIDs=yes. */ + + if (m->pidref_transport_fds[0] < 0) { + m->pidref_event_source = sd_event_source_disable_unref(m->pidref_event_source); + safe_close_pair(m->pidref_transport_fds); + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->pidref_transport_fds) < 0) + return log_error_errno(errno, "Failed to allocate pidref socket: %m"); + + /* Make sure children never have to block */ + (void) fd_increase_rxbuf(m->pidref_transport_fds[0], MANAGER_SOCKET_RCVBUF_SIZE); + + r = setsockopt_int(m->pidref_transport_fds[0], SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "Failed to enable SO_PASSCRED for pidref socket: %m"); + + r = setsockopt_int(m->pidref_transport_fds[0], SOL_SOCKET, SO_PASSPIDFD, true); + if (ERRNO_IS_NEG_NOT_SUPPORTED(r)) + log_debug("SO_PASSPIDFD is not supported for pidref socket, ignoring."); + else if (r < 0) + log_warning_errno(r, "Failed to enable SO_PASSPIDFD for pidref socket, ignoring: %m"); + + /* Mark the receiving socket as O_NONBLOCK (but leave sending side as-is) */ + r = fd_nonblock(m->pidref_transport_fds[0], true); + if (r < 0) + 
return log_error_errno(r, "Failed to make pidref socket O_NONBLOCK: %m"); + } + + if (!m->pidref_event_source) { + r = sd_event_add_io(m->event, &m->pidref_event_source, m->pidref_transport_fds[0], EPOLLIN, manager_dispatch_pidref_transport_fd, m); + if (r < 0) + return log_error_errno(r, "Failed to allocate pidref event source: %m"); + + r = sd_event_source_set_priority(m->pidref_event_source, EVENT_PRIORITY_PIDREF); + if (r < 0) + return log_error_errno(r, "Failed to set priority of pidref event source: %m"); + + (void) sd_event_source_set_description(m->pidref_event_source, "pidref"); + } + + return 0; +} + static unsigned manager_dispatch_cleanup_queue(Manager *m) { Unit *u; unsigned n = 0; @@ -1724,6 +1775,7 @@ Manager* manager_free(Manager *m) { sd_event_source_unref(m->run_queue_event_source); sd_event_source_unref(m->user_lookup_event_source); sd_event_source_unref(m->handoff_timestamp_event_source); + sd_event_source_unref(m->pidref_event_source); sd_event_source_unref(m->memory_pressure_event_source); safe_close(m->signal_fd); @@ -1731,6 +1783,7 @@ Manager* manager_free(Manager *m) { safe_close(m->cgroups_agent_fd); safe_close_pair(m->user_lookup_fds); safe_close_pair(m->handoff_timestamp_fds); + safe_close_pair(m->pidref_transport_fds); manager_close_ask_password(m); @@ -2077,6 +2130,11 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo /* This shouldn't fail, except if things are really broken. */ return r; + r = manager_setup_pidref_transport_fd(m); + if (r < 0) + /* This shouldn't fail, except if things are really broken. */ + return r; + /* Connect to the bus if we are good for it */ manager_setup_bus(m); @@ -3747,6 +3805,7 @@ int manager_reload(Manager *m) { (void) manager_setup_cgroups_agent(m); (void) manager_setup_user_lookup_fd(m); (void) manager_setup_handoff_timestamp_fd(m); + (void) manager_setup_pidref_transport_fd(m); /* Third, fire things up! 
*/ manager_coldplug(m); @@ -5002,6 +5061,142 @@ static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd return 0; } +static int manager_dispatch_pidref_transport_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL, parent_pidref = PIDREF_NULL; + _cleanup_close_ int child_pidfd = -EBADF, parent_pidfd = -EBADF; + struct ucred *ucred = NULL; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int)) * 2) control; + pid_t child_pid; + struct msghdr msghdr = { + .msg_iov = &IOVEC_MAKE(&child_pid, sizeof(child_pid)), + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct cmsghdr *cmsg; + ssize_t n; + int r; + + assert(source); + + /* Server expects: + * - Parent PID in ucreds enabled via SO_PASSCRED + * - Parent PIDFD in SCM_PIDFD message enabled via SO_PASSPIDFD + * - Child PIDFD in SCM_RIGHTS in message body + * - Child PID in message IOV + * + * SO_PASSPIDFD may not be supported by the kernel so we fall back to using parent PID from ucreds + * and accept some raciness. 
*/ + n = recvmsg_safe(m->pidref_transport_fds[0], &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC); + if (ERRNO_IS_NEG_TRANSIENT(n)) + return 0; /* Spurious wakeup, try again */ + if (n == -ECHRNG) { + log_warning_errno(n, "Got message with truncated control data (unexpected fds sent?), ignoring."); + return 0; + } + if (n == -EXFULL) { + log_warning_errno(n, "Got message with truncated payload data, ignoring."); + return 0; + } + if (n < 0) + return log_error_errno(n, "Failed to receive pidref message: %m"); + + if (n != sizeof(child_pid)) { + log_warning("Got pidref message of unexpected size %zi (expected %zu), ignoring.", n, sizeof(child_pid)); + return 0; + } + + CMSG_FOREACH(cmsg, &msghdr) { + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + + if (cmsg->cmsg_type == SCM_CREDENTIALS && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) { + assert(!ucred); + ucred = CMSG_TYPED_DATA(cmsg, struct ucred); + } else if (cmsg->cmsg_type == SCM_PIDFD) { + assert(parent_pidfd < 0); + parent_pidfd = *CMSG_TYPED_DATA(cmsg, int); + } else if (cmsg->cmsg_type == SCM_RIGHTS) { + assert(child_pidfd < 0); + child_pidfd = *CMSG_TYPED_DATA(cmsg, int); + } + } + + /* Verify and set parent pidref. */ + if (!ucred || !pid_is_valid(ucred->pid)) { + log_warning("Received pidref message without valid credentials. Ignoring."); + return 0; + } + + /* Need to handle kernels without SO_PASSPIDFD where SCM_PIDFD will not be set. */ + if (parent_pidfd >= 0) + r = pidref_set_pidfd_consume(&parent_pidref, TAKE_FD(parent_pidfd)); + else + r = pidref_set_pid(&parent_pidref, ucred->pid); + if (r < 0) { + if (r == -ESRCH) + log_debug_errno(r, "PidRef child process died before message is processed. 
Ignoring."); + else + log_warning_errno(r, "Failed to pin pidref child process, ignoring message: %m"); + return 0; + } + + if (parent_pidref.pid != ucred->pid) { + assert(parent_pidref.fd >= 0); + log_warning("Got SCM_PIDFD for parent process " PID_FMT " but got SCM_CREDENTIALS for parent process " PID_FMT ". Ignoring.", + parent_pidref.pid, ucred->pid); + return 0; + } + + /* Verify and set child pidref. */ + if (!pid_is_valid(child_pid)) { + log_warning("Received pidref message without valid child PID. Ignoring."); + return 0; + } + + /* Need to handle kernels without PIDFD support. */ + if (child_pidfd >= 0) + r = pidref_set_pidfd_consume(&child_pidref, TAKE_FD(child_pidfd)); + else + r = pidref_set_pid(&child_pidref, child_pid); + if (r < 0) { + if (r == -ESRCH) + log_debug_errno(r, "PidRef child process died before message is processed. Ignoring."); + else + log_warning_errno(r, "Failed to pin pidref child process, ignoring message: %m"); + return 0; + } + + if (child_pidref.pid != child_pid) { + assert(child_pidref.fd >= 0); + log_warning("Got SCM_RIGHTS for child process " PID_FMT " but PID in IOV message is " PID_FMT ". Ignoring.", + child_pidref.pid, child_pid); + return 0; + } + + log_debug("Got pidref event with parent PID " PID_FMT " and child PID " PID_FMT ".", parent_pidref.pid, child_pidref.pid); + + /* Try finding cgroup of parent process. But if parent process exited and we're not using PIDFD, this could return NULL. + * Then fall back to finding cgroup of the child process. 
*/ + Unit *u = manager_get_unit_by_pidref_cgroup(m, &parent_pidref); + if (!u) + u = manager_get_unit_by_pidref_cgroup(m, &child_pidref); + if (!u) { + log_debug("Got pidref for parent process " PID_FMT " and child process " PID_FMT " we are not interested in, ignoring.", parent_pidref.pid, child_pidref.pid); + return 0; + } + + if (!UNIT_VTABLE(u)->notify_pidref) { + log_unit_warning(u, "Received pidref event from unexpected unit type '%s'.", unit_type_to_string(u->type)); + return 0; + } + + UNIT_VTABLE(u)->notify_pidref(u, &parent_pidref, &child_pidref); + + return 0; +} + void manager_ref_console(Manager *m) { assert(m); diff --git a/src/core/manager.h b/src/core/manager.h index c1f7f8c083..e4cada80ff 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -289,6 +289,9 @@ struct Manager { int handoff_timestamp_fds[2]; sd_event_source *handoff_timestamp_event_source; + int pidref_transport_fds[2]; + sd_event_source *pidref_event_source; + RuntimeScope runtime_scope; LookupPaths lookup_paths; @@ -678,12 +681,13 @@ void unit_defaults_done(UnitDefaults *defaults); enum { /* most important … */ - EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-11, - EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-10, - EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-10, - EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv1 */ - EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv2 */ - EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-8, + EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-12, + EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-11, + EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-11, + EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-10, /* cgroupv1 */ + EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-10, /* cgroupv2 */ + EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-9, + EVENT_PRIORITY_PIDREF = SD_EVENT_PRIORITY_NORMAL-8, EVENT_PRIORITY_HANDOFF_TIMESTAMP = 
SD_EVENT_PRIORITY_NORMAL-7, EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6, EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5, diff --git a/src/core/namespace.c b/src/core/namespace.c index 91c905f2fe..57dbbc4fc7 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -2061,7 +2061,8 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) { p->protect_control_groups != PROTECT_CONTROL_GROUPS_NO || p->protect_kernel_tunables || p->protect_proc != PROTECT_PROC_DEFAULT || - p->proc_subset != PROC_SUBSET_ALL; + p->proc_subset != PROC_SUBSET_ALL || + p->private_pids != PRIVATE_PIDS_NO; } /* Walk all mount entries and dropping any unused mounts. This affects all @@ -3366,3 +3367,10 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = { }; DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF); + +static const char* const private_pids_table[_PRIVATE_PIDS_MAX] = { + [PRIVATE_PIDS_NO] = "no", + [PRIVATE_PIDS_YES] = "yes", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_pids, PrivatePIDs, PRIVATE_PIDS_YES); diff --git a/src/core/namespace.h b/src/core/namespace.h index 7b6e892cc2..bd48aa31da 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -78,6 +78,13 @@ typedef enum ProtectControlGroups { _PROTECT_CONTROL_GROUPS_INVALID = -EINVAL, } ProtectControlGroups; +typedef enum PrivatePIDs { + PRIVATE_PIDS_NO, + PRIVATE_PIDS_YES, + _PRIVATE_PIDS_MAX, + _PRIVATE_PIDS_INVALID = -EINVAL, +} PrivatePIDs; + struct BindMount { char *source; char *destination; @@ -182,6 +189,7 @@ struct NamespaceParameters { ProtectProc protect_proc; ProcSubset proc_subset; PrivateTmp private_tmp; + PrivatePIDs private_pids; }; int setup_namespace(const NamespaceParameters *p, char **reterr_path); @@ -225,6 +233,9 @@ PrivateUsers private_users_from_string(const char *s) _pure_; const char* protect_control_groups_to_string(ProtectControlGroups i) _const_; ProtectControlGroups 
protect_control_groups_from_string(const char *s) _pure_; +const char* private_pids_to_string(PrivatePIDs i) _const_; +PrivatePIDs private_pids_from_string(const char *s) _pure_; + void bind_mount_free_many(BindMount *b, size_t n); int bind_mount_add(BindMount **b, size_t *n, const BindMount *item); diff --git a/src/core/service.c b/src/core/service.c index 737dc9905a..a9a64938b5 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -710,6 +710,9 @@ static int service_verify(Service *s) { if (s->type == SERVICE_DBUS && !s->bus_name) return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing."); + if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context)) + return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service of Type=forking does not support PrivatePIDs=yes. Refusing."); + if (s->usb_function_descriptors && !s->usb_function_strings) log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring."); @@ -4908,6 +4911,35 @@ static void service_handoff_timestamp( unit_add_to_dbus_queue(u); } +static void service_notify_pidref(Unit *u, PidRef *parent_pidref, PidRef *child_pidref) { + Service *s = ASSERT_PTR(SERVICE(u)); + int r; + + assert(pidref_is_set(parent_pidref)); + assert(pidref_is_set(child_pidref)); + + if (pidref_equal(&s->main_pid, parent_pidref)) { + r = service_set_main_pidref(s, TAKE_PIDREF(*child_pidref), /* start_timestamp = */ NULL); + if (r < 0) + return (void) log_unit_warning_errno(u, r, "Failed to set new main pid: %m"); + + /* Since the child process is PID 1 in a new PID namespace, it must be exclusive to this unit. 
*/ + r = unit_watch_pidref(u, &s->main_pid, /* exclusive= */ true); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to watch new main PID " PID_FMT ": %m", s->main_pid.pid); + } else if (pidref_equal(&s->control_pid, parent_pidref)) { + service_unwatch_control_pid(s); + s->control_pid = TAKE_PIDREF(*child_pidref); + + r = unit_watch_pidref(u, &s->control_pid, /* exclusive= */ true); + if (r < 0) + log_unit_warning_errno(u, r, "Failed to watch new control PID " PID_FMT ": %m", s->control_pid.pid); + } else + return (void) log_unit_debug(u, "Parent process " PID_FMT " does not match main or control processes, ignoring.", parent_pidref->pid); + + unit_add_to_dbus_queue(u); +} + static int service_get_timeout(Unit *u, usec_t *timeout) { Service *s = ASSERT_PTR(SERVICE(u)); uint64_t t; @@ -5638,6 +5670,7 @@ const UnitVTable service_vtable = { .notify_cgroup_oom = service_notify_cgroup_oom_event, .notify_message = service_notify_message, .notify_handoff_timestamp = service_handoff_timestamp, + .notify_pidref = service_notify_pidref, .main_pid = service_main_pid, .control_pid = service_control_pid, diff --git a/src/core/unit.c b/src/core/unit.c index eec08a2fbf..71488a4555 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -4237,6 +4237,9 @@ static int unit_verify_contexts(const Unit *u) { exec_needs_mount_namespace(ec, /* params = */ NULL, /* runtime = */ NULL)) return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory= may not be below /proc/, /sys/ or /dev/ when using mount namespacing. Refusing."); + if (exec_needs_pid_namespace(ec) && !UNIT_VTABLE(u)->notify_pidref) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivatePIDs= setting is only supported for service units. 
Refusing."); + const KillContext *kc = unit_get_kill_context(u); if (ec->pam_name && kc && !IN_SET(kc->kill_mode, KILL_CONTROL_GROUP, KILL_MIXED)) @@ -5402,6 +5405,8 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) { p->user_lookup_fd = u->manager->user_lookup_fds[1]; p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1]; + if (UNIT_VTABLE(u)->notify_pidref) + p->pidref_transport_fd = u->manager->pidref_transport_fds[1]; p->cgroup_id = crt ? crt->cgroup_id : 0; p->invocation_id = u->invocation_id; diff --git a/src/core/unit.h b/src/core/unit.h index 01e1adf961..a8eb366337 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -640,6 +640,9 @@ typedef struct UnitVTable { /* Called whenever we learn a handoff timestamp */ void (*notify_handoff_timestamp)(Unit *u, const struct ucred *ucred, const dual_timestamp *ts); + /* Called whenever we learn about a child process */ + void (*notify_pidref)(Unit *u, PidRef *parent_pidref, PidRef *child_pidref); + /* Called whenever a name this Unit registered for comes or goes away. */ void (*bus_name_owner_change)(Unit *u, const char *new_owner); -- cgit v1.2.3