mountfsd: add new systemd-mountfsd component

author: Lennart Poettering <lennart@poettering.net> 2023-03-09 12:27:29 +0100
committer: Lennart Poettering <lennart@poettering.net> 2024-04-06 16:08:24 +0200
commit: 702a52f4b5d49cce11e2adbc740deb3b644e2de0 (patch)
tree: 2839e8881cd65cfd1ef03609f66e96c422ef3944 /src/mountfsd
parent: nsresourced: add client-side helpers around nsresourced APIs (diff)
download: systemd-702a52f4b5d49cce11e2adbc740deb3b644e2de0.tar.xz
systemd-702a52f4b5d49cce11e2adbc740deb3b644e2de0.zip
6 files changed, 1151 insertions, 0 deletions
diff --git a/src/mountfsd/io.systemd.mount-file-system.policy b/src/mountfsd/io.systemd.mount-file-system.policy
new file mode 100644
index 0000000000..6a151eb437
--- /dev/null
+++ b/src/mountfsd/io.systemd.mount-file-system.policy
@@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="UTF-8"?> <!--*-nxml-*-->
+<!DOCTYPE policyconfig PUBLIC "-//freedesktop//DTD PolicyKit Policy Configuration 1.0//EN"
+        "https://www.freedesktop.org/standards/PolicyKit/1/policyconfig.dtd">
+
+<!--
+  SPDX-License-Identifier: LGPL-2.1-or-later
+
+  This file is part of systemd.
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+-->
+
+<policyconfig>
+
+        <vendor>The systemd Project</vendor>
+        <vendor_url>https://systemd.io</vendor_url>
+
+        <!-- Allow mounting DDIs into the host user namespace -->
+        <action id="io.systemd.mount-file-system.mount-image">
+                <!-- This action is generally checked first: we'll first try to mount the image with
+                     signature checks on. If that fails, we'll retry with the untrusted action below. -->
+                <description gettext-domain="systemd">Allow mounting of file system image</description>
+                <message gettext-domain="systemd">Authentication is required for an application to mount a file system image.</message>
+                <defaults>
+                        <allow_any>auth_admin_keep</allow_any>
+                        <allow_inactive>auth_admin_keep</allow_inactive>
+                        <allow_active>yes</allow_active>
+                </defaults>
+        </action>
+
+        <action id="io.systemd.mount-file-system.mount-untrusted-image">
+                <!-- If the image cannot be mounted via the regular action because it is not signed by a
+                     recognized key, we'll try this action. -->
+                <description gettext-domain="systemd">Allow mounting of untrusted file system image</description>
+                <message gettext-domain="systemd">Authentication is required for an application to mount a cryptographically unsigned file system image or an image whose cryptographic signature is not recognized.</message>
+                <defaults>
+                        <allow_any>auth_admin</allow_any>
+                        <allow_inactive>auth_admin</allow_inactive>
+                        <allow_active>auth_admin</allow_active>
+                </defaults>
+
+                <annotate key="org.freedesktop.policykit.imply">io.systemd.mount-file-system.mount-image</annotate>
+        </action>
+
+        <!-- Allow mounting DDIs into a private user namespace -->
+        <action id="io.systemd.mount-file-system.mount-image-privately">
+                <description gettext-domain="systemd">Allow private mounting of trusted file system image</description>
+                <message gettext-domain="systemd">Authentication is required for an application to privately mount a file system image or an image whose cryptographic signature is recognized.</message>
+                <defaults>
+                        <allow_any>yes</allow_any>
+                        <allow_inactive>yes</allow_inactive>
+                        <allow_active>yes</allow_active>
+                </defaults>
+        </action>
+
+        <action id="io.systemd.mount-file-system.mount-untrusted-image-privately">
+                <description gettext-domain="systemd">Allow private mounting of untrusted file system image</description>
+                <message gettext-domain="systemd">Authentication is required for an application to privately mount a cryptographically unsigned file system image or an image whose cryptographic signature is not recognized.</message>
+                <defaults>
+                        <allow_any>auth_admin</allow_any>
+                        <allow_inactive>auth_admin</allow_inactive>
+                        <allow_active>auth_admin</allow_active>
+                </defaults>
+
+                <annotate key="org.freedesktop.policykit.imply">io.systemd.mount-file-system.mount-image-privately</annotate>
+        </action>
+</policyconfig>
diff --git a/src/mountfsd/meson.build b/src/mountfsd/meson.build
new file mode 100644
index 0000000000..3689d2af27
--- /dev/null
+++ b/src/mountfsd/meson.build
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+systemd_mountwork_sources = files(
+        'mountwork.c',
+)
+
+systemd_mountfsd_sources = files(
+        'mountfsd.c',
+        'mountfsd-manager.c',
+)
+
+executables += [
+        libexec_template + {
+                'name' : 'systemd-mountfsd',
+                'conditions' : ['ENABLE_MOUNTFSD'],
+                'sources' : systemd_mountfsd_sources,
+        },
+        libexec_template + {
+                'name' : 'systemd-mountwork',
+                'conditions' : ['ENABLE_MOUNTFSD'],
+                'sources' : systemd_mountwork_sources,
+                'link_with' : common_libs,
+                'dependencies' : common_deps,
+        },
+]
+
+install_data('io.systemd.mount-file-system.policy',
+             install_dir : polkitpolicydir)
diff --git a/src/mountfsd/mountfsd-manager.c b/src/mountfsd/mountfsd-manager.c
new file mode 100644
index 0000000000..b05c6e8493
--- /dev/null
+++ b/src/mountfsd/mountfsd-manager.c
@@ -0,0 +1,277 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#include <sys/wait.h>
+
+#include "sd-daemon.h"
+
+#include "build-path.h"
+#include "common-signal.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "mkdir.h"
+#include "mountfsd-manager.h"
+#include "process-util.h"
+#include "set.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "stdio-util.h"
+#include "umask-util.h"
+
+#define LISTEN_TIMEOUT_USEC (25 * USEC_PER_SEC)
+
+static int start_workers(Manager *m, bool explicit_request);
+
+static size_t manager_current_workers(Manager *m) {
+        assert(m);
+        /* Number of workers currently tracked, i.e. the fixed and the dynamic set combined */
+        return set_size(m->workers_fixed) + set_size(m->workers_dynamic);
+}
+
+static int on_worker_exit(sd_event_source *s, const siginfo_t *si, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        /* Child event callback: forgets the exited worker, logs how it ended and tops the pool up again */
+        assert(s);
+        /* Asserts that the event source was removed from exactly one of the two worker sets */
+        assert_se(!set_remove(m->workers_dynamic, s) != !set_remove(m->workers_fixed, s));
+        sd_event_source_disable_unref(s);
+        /* Report how the worker terminated; every outcome is only logged, none is treated as fatal */
+        if (si->si_code == CLD_EXITED) {
+                if (si->si_status == EXIT_SUCCESS)
+                        log_debug("Worker " PID_FMT " exited successfully.", si->si_pid);
+                else
+                        log_warning("Worker " PID_FMT " died with a failure exit status %i, ignoring.", si->si_pid, si->si_status);
+        } else if (si->si_code == CLD_KILLED)
+                log_warning("Worker " PID_FMT " was killed by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status));
+        else if (si->si_code == CLD_DUMPED)
+                log_warning("Worker " PID_FMT " dumped core by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status));
+        else
+                log_warning("Got unexpected exit code via SIGCHLD, ignoring.");
+
+        (void) start_workers(m, /* explicit_request= */ false); /* Fill up workers again if we fell below the low watermark */
+        return 0;
+}
+
+static int on_sigusr2(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        /* SIGUSR2 callback; a failure to start another worker is deliberately ignored here */
+        (void) start_workers(m, /* explicit_request= */ true); /* Workers told us there's more work, let's add one more worker as long as we are below the high watermark */
+        return 0;
+}
+
+DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR(
+                event_source_hash_ops, /* hash ops used for both worker sets, whose entries are the workers' child event sources */
+                sd_event_source,
+                (void (*)(const sd_event_source*, struct siphash*)) trivial_hash_func,
+                (int (*)(const sd_event_source*, const sd_event_source*)) trivial_compare_func,
+                sd_event_source_disable_unref); /* NOTE(review): presumably applied to entries still stored when a set is freed, not on set_remove(); confirm */
+
+int manager_new(Manager **ret) {
+        _cleanup_(manager_freep) Manager *m = NULL;
+        int r;
+        /* Allocates a Manager with its event loop and signal sources; on success it is handed over via *ret */
+        m = new(Manager, 1);
+        if (!m)
+                return -ENOMEM;
+
+        *m = (Manager) {
+                .listen_fd = -EBADF,
+                .worker_ratelimit = {
+                        .interval = 5 * USEC_PER_SEC,
+                        .burst = 50,
+                },
+        };
+
+        r = sd_event_new(&m->event);
+        if (r < 0)
+                return r;
+
+        r = sd_event_set_signal_exit(m->event, true);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_signal(m->event, NULL, (SIGRTMIN+18)|SD_EVENT_SIGNAL_PROCMASK, sigrtmin18_handler, NULL);
+        if (r < 0)
+                return r;
+        /* Memory pressure and watchdog support are optional, failures are only logged at debug level */
+        r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to allocate memory pressure event source, ignoring: %m");
+
+        r = sd_event_set_watchdog(m->event, true);
+        if (r < 0)
+                log_debug_errno(r, "Failed to enable watchdog handling, ignoring: %m");
+        /* SIGUSR2 is the request for additional workers, see on_sigusr2() */
+        r = sd_event_add_signal(m->event, NULL, SIGUSR2|SD_EVENT_SIGNAL_PROCMASK, on_sigusr2, m);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+Manager* manager_free(Manager *m) {
+        if (!m)
+                return NULL;
+        /* Releases everything the manager owns, including the listening socket; a NULL manager is a no-op */
+        set_free(m->workers_fixed);
+        set_free(m->workers_dynamic);
+        /* Note: we rely on PR_DEATHSIG to kill the workers for us */
+
+        safe_close(m->listen_fd);
+        sd_event_unref(m->event);
+
+        return mfree(m);
+}
+
+static int start_one_worker(Manager *m) {
+        _cleanup_(sd_event_source_disable_unrefp) sd_event_source *source = NULL;
+        bool fixed;
+        pid_t pid;
+        int r;
+        /* Forks off one worker that receives the listening socket as fd 3, and tracks it via a child event source */
+        assert(m);
+        /* Workers are "fixed" until MOUNTFS_WORKERS_MIN of those exist, any further one is "dynamic" */
+        fixed = set_size(m->workers_fixed) < MOUNTFS_WORKERS_MIN;
+
+        r = safe_fork_full(
+                        "(sd-worker)",
+                        /* stdio_fds= */ NULL,
+                        &m->listen_fd, 1,
+                        FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_REOPEN_LOG|FORK_LOG|FORK_CLOSE_ALL_FDS,
+                        &pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to fork new worker child: %m");
+        if (r == 0) {
+                char pids[DECIMAL_STR_MAX(pid_t)];
+                /* Child */
+
+                if (m->listen_fd == 3) {
+                        r = fd_cloexec(3, false);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to turn off O_CLOEXEC for fd 3: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+                } else {
+                        if (dup2(m->listen_fd, 3) < 0) { /* dup2() creates with O_CLOEXEC off */
+                                log_error_errno(errno, "Failed to move listen fd to 3: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+
+                        safe_close(m->listen_fd);
+                }
+                /* Announce the socket via $LISTEN_PID/$LISTEN_FDS — NOTE(review): pid is presumably the child's own PID here; confirm */
+                xsprintf(pids, PID_FMT, pid);
+                if (setenv("LISTEN_PID", pids, 1) < 0) {
+                        log_error_errno(errno, "Failed to set $LISTEN_PID: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (setenv("LISTEN_FDS", "1", 1) < 0) {
+                        log_error_errno(errno, "Failed to set $LISTEN_FDS: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (setenv("MOUNTFS_FIXED_WORKER", one_zero(fixed), 1) < 0) {
+                        log_error_errno(errno, "Failed to set $MOUNTFS_FIXED_WORKER: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = setenv_systemd_log_level();
+                if (r < 0) {
+                        log_error_errno(r, "Failed to set $SYSTEMD_LOG_LEVEL: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = invoke_callout_binary(SYSTEMD_MOUNTWORK_PATH, STRV_MAKE("systemd-mountwork", "xxxxxxxxxxxxxxxx")); /* With some extra space rename_process() can make use of */
+                log_error_errno(r, "Failed to start worker process: %m");
+                _exit(EXIT_FAILURE);
+        }
+        /* Parent: watch for the child's exit and remember the event source in the matching worker set */
+        r = sd_event_add_child(m->event, &source, pid, WEXITED, on_worker_exit, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to watch child " PID_FMT ": %m", pid);
+
+        r = set_ensure_put(
+                        fixed ? &m->workers_fixed : &m->workers_dynamic,
+                        &event_source_hash_ops,
+                        source);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add child process to set: %m");
+        /* The set holds the reference now, hence disarm the cleanup handler */
+        TAKE_PTR(source);
+
+        return 0;
+}
+
+static int start_workers(Manager *m, bool explicit_request) {
+        int r;
+        /* Starts workers until MOUNTFS_WORKERS_MIN are running; an explicit request permits one more, below MOUNTFS_WORKERS_MAX */
+        assert(m);
+
+        for (;;)  {
+                size_t n;
+
+                n = manager_current_workers(m);
+
+                log_debug("%zu workers running.", n);
+                /* Done once the minimum is reached, unless an extra worker was explicitly requested and there is still room */
+                if (n >= MOUNTFS_WORKERS_MIN && (!explicit_request || n >= MOUNTFS_WORKERS_MAX))
+                        break;
+
+                if (!ratelimit_below(&m->worker_ratelimit)) {
+                        /* If we keep starting workers too often, let's fail the whole daemon, something is wrong */
+                        sd_event_exit(m->event, EXIT_FAILURE);
+
+                        return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN), "Worker threads requested too frequently, something is wrong.");
+                }
+
+                r = start_one_worker(m);
+                if (r < 0)
+                        return r;
+                /* An explicit request is honoured for a single additional worker only */
+                explicit_request = false;
+        }
+
+        return 0;
+}
+
+int manager_startup(Manager *m) {
+        int n;
+        /* Takes over the listening socket passed in by the caller, or else binds one itself, then starts the initial workers */
+        assert(m);
+        assert(m->listen_fd < 0);
+
+        n = sd_listen_fds(false);
+        if (n < 0)
+                return log_error_errno(n, "Failed to determine number of passed file descriptors: %m");
+        if (n > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expected one listening fd, got %i.", n);
+        if (n == 1)
+                m->listen_fd = SD_LISTEN_FDS_START;
+        else {
+                static const union sockaddr_union sockaddr = {
+                        .un.sun_family = AF_UNIX,
+                        .un.sun_path = "/run/systemd/io.systemd.MountFileSystem",
+                };
+
+                m->listen_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0);
+                if (m->listen_fd < 0)
+                        return log_error_errno(errno, "Failed to allocate socket: %m");
+
+                (void) sockaddr_un_unlink(&sockaddr.un);
+                /* Bind with the umask cleared, i.e. without masking any access bits of the socket inode */
+                WITH_UMASK(0000)
+                        if (bind(m->listen_fd, &sockaddr.sa, SOCKADDR_UN_LEN(sockaddr.un)) < 0)
+                                return log_error_errno(errno, "Failed to bind socket: %m");
+
+                if (listen(m->listen_fd, SOMAXCONN) < 0)
+                        return log_error_errno(errno, "Failed to listen on socket: %m");
+        }
+
+        /* Let's make sure every accept() call on this socket times out after 25s. This allows workers to be
+         * GC'ed on idle */
+        if (setsockopt(m->listen_fd, SOL_SOCKET, SO_RCVTIMEO, TIMEVAL_STORE(LISTEN_TIMEOUT_USEC), sizeof(struct timeval)) < 0)
+                return log_error_errno(errno, "Failed to set SO_RCVTIMEO: %m");
+
+        return start_workers(m, /* explicit_request= */ false);
+}
diff --git a/src/mountfsd/mountfsd-manager.h b/src/mountfsd/mountfsd-manager.h
new file mode 100644
index 0000000000..6bfbddcd17
--- /dev/null
+++ b/src/mountfsd/mountfsd-manager.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-event.h"
+
+typedef struct Manager Manager;
+
+#include "hashmap.h"
+#include "ratelimit.h"
+
+#define MOUNTFS_WORKERS_MIN 3
+#define MOUNTFS_WORKERS_MAX 4096
+
+struct Manager {
+        sd_event *event;       /* Event loop carrying the signal and worker child sources */
+
+        Set *workers_fixed;    /* Workers 0…MOUNTFS_WORKERS_MIN */
+        Set *workers_dynamic;  /* Workers MOUNTFS_WORKERS_MIN+1…MOUNTFS_WORKERS_MAX */
+
+        int listen_fd;         /* Listening socket inherited by every worker; -EBADF until manager_startup() sets it */
+
+        RateLimit worker_ratelimit; /* Bounds how frequently new workers may be started */
+};
+
+int manager_new(Manager **ret);
+Manager* manager_free(Manager *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
+
+int manager_startup(Manager *m);
diff --git a/src/mountfsd/mountfsd.c b/src/mountfsd/mountfsd.c
new file mode 100644
index 0000000000..6073bd548e
--- /dev/null
+++ b/src/mountfsd/mountfsd.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "daemon-util.h"
+#include "log.h"
+#include "main-func.h"
+#include "mountfsd-manager.h"
+#include "signal-util.h"
+
+static int run(int argc, char *argv[]) {
+        _unused_ _cleanup_(notify_on_cleanup) const char *notify_stop = NULL;
+        _cleanup_(manager_freep) Manager *m = NULL;
+        int r;
+        /* Daemon main routine: creates the manager, starts it up and runs its event loop until that exits */
+        log_setup();
+
+        umask(0022);
+
+        if (argc != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments.");
+        /* NOTE(review): SIGCHLD is blocked before any worker is forked — presumably a prerequisite of the child event sources; confirm */
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD) >= 0);
+
+        r = manager_new(&m);
+        if (r < 0)
+                return log_error_errno(r, "Could not create manager: %m");
+
+        r = manager_startup(m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to start up daemon: %m");
+        /* Startup is complete; notify_on_cleanup handles notify_stop whenever this function returns */
+        notify_stop = notify_start(NOTIFY_READY, NOTIFY_STOPPING);
+
+        r = sd_event_loop(m->event);
+        if (r < 0)
+                return log_error_errno(r, "Event loop failed: %m");
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/mountfsd/mountwork.c b/src/mountfsd/mountwork.c
new file mode 100644
index 0000000000..f598596215
--- /dev/null
+++ b/src/mountfsd/mountwork.c
@@ -0,0 +1,703 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-daemon.h"
+
+#include "argv-util.h"
+#include "bus-polkit.h"
+#include "chase.h"
+#include "discover-image.h"
+#include "dissect-image.h"
+#include "env-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "io-util.h"
+#include "main-func.h"
+#include "missing_loop.h"
+#include "namespace-util.h"
+#include "nsresource.h"
+#include "nulstr-util.h"
+#include "os-util.h"
+#include "process-util.h"
+#include "stat-util.h"
+#include "user-util.h"
+#include "varlink.h"
+#include "varlink-io.systemd.MountFileSystem.h"
+
+#define ITERATIONS_MAX 64U
+#define RUNTIME_MAX_USEC (5 * USEC_PER_MINUTE)
+#define PRESSURE_SLEEP_TIME_USEC (50 * USEC_PER_MSEC)
+#define LISTEN_IDLE_USEC (90 * USEC_PER_SEC)
+
+static const ImagePolicy image_policy_untrusted = { /* Built-in default policy for images that are not trusted, see determine_image_policy() */
+        .n_policies = 2,
+        .policies = {
+                { PARTITION_ROOT,     PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT },
+                { PARTITION_USR,      PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT },
+        },
+        .default_flags = PARTITION_POLICY_IGNORE, /* NOTE(review): presumably covers every partition type not listed above; confirm */
+};
+
+static int json_dispatch_image_policy(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        _cleanup_(image_policy_freep) ImagePolicy *q = NULL;
+        ImagePolicy **p = ASSERT_PTR(userdata);
+        int r;
+        /* JSON dispatch callback: parses an image policy string into the ImagePolicy* that userdata points to */
+        assert(p);
+        /* A JSON null drops whatever policy was stored before */
+        if (json_variant_is_null(variant)) {
+                *p = image_policy_free(*p);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        r = image_policy_from_string(json_variant_string(variant), &q);
+        if (r < 0)
+                return json_log(variant, flags, r, "JSON field '%s' is not a valid image policy.", strna(name));
+        /* Replace any previously stored policy with the freshly parsed one */
+        image_policy_free(*p);
+        *p = TAKE_PTR(q);
+        return 0;
+}
+
+typedef struct MountImageParameters {
+        unsigned image_fd_idx;     /* index handed to varlink_peek_dup_fd() to obtain the image fd; UINT_MAX if unset */
+        unsigned userns_fd_idx;    /* same, for the optional user namespace fd */
+        int read_only;             /* tristate, negative if the client did not specify it */
+        int growfs;                /* tristate, negative if the client did not specify it */
+        char *password;            /* optional; erased before being freed */
+        ImagePolicy *image_policy; /* optional policy requested by the client */
+} MountImageParameters;
+
+static void mount_image_parameters_done(MountImageParameters *p) {
+        assert(p);
+        /* Releases the allocated fields and resets the pointers; the password goes through erase_and_free() */
+        p->password = erase_and_free(p->password);
+        p->image_policy = image_policy_free(p->image_policy);
+}
+
+static int validate_image_fd(int fd, MountImageParameters *p) {
+        int r, fl;
+        /* Checks that fd is a regular file with acceptable open flags; a read-only fd forces p->read_only on */
+        assert(fd >= 0);
+        assert(p);
+
+        r = fd_verify_regular(fd);
+        if (r < 0)
+                return r;
+
+        fl = fd_verify_safe_flags(fd);
+        if (fl < 0)
+                return log_debug_errno(fl, "Image file descriptor has unsafe flags set: %m");
+        /* Only O_RDONLY and O_RDWR access modes are accepted, any other is refused with -EBADF */
+        switch (fl & O_ACCMODE) {
+
+        case O_RDONLY:
+                p->read_only = true;
+                break;
+
+        case O_RDWR:
+                break;
+
+        default:
+                return -EBADF;
+        }
+
+        return 0;
+}
+
+static int verify_trusted_image_fd_by_path(int fd) {
+        _cleanup_free_ char *p = NULL;
+        struct stat sta;
+        int r;
+        /* Returns true if fd refers to a file located directly inside an image search path directory, false if not, negative errno on failure */
+        assert(fd >= 0);
+        /* $SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES takes precedence over the compile-time default; if it cannot be parsed nothing is trusted */
+        r = secure_getenv_bool("SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES");
+        if (r == -ENXIO)  {
+                if (!DEFAULT_MOUNTFSD_TRUSTED_DIRECTORIES) {
+                        log_debug("Trusted directory mechanism disabled at compile time.");
+                        return false;
+                }
+        } else if (r < 0) {
+                log_debug_errno(r, "Failed to parse $SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES environment variable, not trusting any image.");
+                return false;
+        } else if (!r) {
+                log_debug("Trusted directory mechanism disabled via $SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES environment variable.");
+                return false;
+        }
+
+        r = fd_get_path(fd, &p);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get path of passed image file descriptor: %m");
+        if (fstat(fd, &sta) < 0)
+                return log_debug_errno(errno, "Failed to stat() passed image file descriptor: %m");
+
+        log_debug("Checking if image '%s' is in trusted directories.", p);
+        /* Walk the search path of every image class */
+        for (ImageClass c = 0; c < _IMAGE_CLASS_MAX; c++)
+                NULSTR_FOREACH(s, image_search_path[c]) {
+                        _cleanup_close_ int dir_fd = -EBADF, inode_fd = -EBADF;
+                        _cleanup_free_ char *q = NULL;
+                        struct stat stb;
+                        const char *e;
+
+                        r = chase(s, NULL, CHASE_SAFE, &q, &dir_fd);
+                        if (r == -ENOENT)
+                                continue;
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to resolve search path '%s', ignoring: %m", s);
+                                continue;
+                        }
+
+                        /* Check that the inode refers to a file immediately inside the image directory,
+                         * i.e. not the image directory itself, and nothing further down the tree */
+                        e = path_startswith(p, q);
+                        if (isempty(e))
+                                continue;
+
+                        e += strspn(e, "/");
+                        if (!filename_is_valid(e))
+                                continue;
+                        /* Open the file name again, relative to the resolved search directory */
+                        r = chaseat(dir_fd, e, CHASE_SAFE, NULL, &inode_fd);
+                        if (r < 0)
+                                return log_error_errno(r, "Couldn't verify that specified image '%s' is in search path '%s': %m", p, s);
+
+                        if (fstat(inode_fd, &stb) < 0)
+                                return log_error_errno(errno, "Failed to stat image file '%s/%s': %m", q, e);
+                        /* A matching path alone is not enough: it must be the same inode as the fd we were passed */
+                        if (stat_inode_same(&sta, &stb)) {
+                                log_debug("Image '%s' is *in* trusted directories.", p);
+                                return true; /* Yay */
+                        }
+                }
+
+        log_debug("Image '%s' is *not* in trusted directories.", p);
+        return false;
+}
+
+static int determine_image_policy(
+                int image_fd,
+                bool trusted,
+                ImagePolicy *client_policy,
+                ImagePolicy **ret) {
+
+        _cleanup_(image_policy_freep) ImagePolicy *envvar_policy = NULL;
+        const ImagePolicy *default_policy;
+        const char *envvar, *e;
+        int r;
+        /* Picks the base policy for a trusted or untrusted image and returns its intersection with the client's policy in *ret */
+        assert(image_fd >= 0);
+        assert(ret);
+
+        if (trusted) {
+                envvar = "SYSTEMD_MOUNTFSD_IMAGE_POLICY_TRUSTED";
+                default_policy = &image_policy_allow;
+        } else {
+                envvar = "SYSTEMD_MOUNTFSD_IMAGE_POLICY_UNTRUSTED";
+                default_policy = &image_policy_untrusted;
+        }
+        /* If the environment variable is set, it replaces the built-in default */
+        e = secure_getenv(envvar);
+        if (e) {
+                r = image_policy_from_string(e, &envvar_policy);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse image policy supplied via $%s: %m", envvar);
+
+                default_policy = envvar_policy;
+        }
+        /* NOTE(review): image_fd is only asserted on above, it does not influence the result */
+        return image_policy_intersect(default_policy, client_policy, ret);
+}
+
+static int validate_userns(Varlink *link, int *userns_fd) {
+        int r;
+        /* Vets the user namespace fd supplied by the client, if any; it is closed and reset if it is our own namespace */
+        assert(link);
+        assert(userns_fd);
+
+        if (*userns_fd < 0)
+                return 0;
+
+        r = fd_verify_safe_flags(*userns_fd);
+        if (r < 0)
+                return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");
+        /* An fd that is not a user namespace is reported back to the client as an invalid parameter */
+        r = fd_is_ns(*userns_fd, CLONE_NEWUSER);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        /* Our own host user namespace? Then close the fd, and handle it as if none was specified. */
+        r = is_our_namespace(*userns_fd, NAMESPACE_USER);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine if user namespace provided by client is our own: %m");
+        if (r > 0) {
+                log_debug("User namespace provided by client is our own.");
+                *userns_fd = safe_close(*userns_fd);
+        }
+
+        return 0;
+}
+
+static int vl_method_mount_image(
+                Varlink *link,
+                JsonVariant *parameters,
+                VarlinkMethodFlags flags,
+                void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "imageFileDescriptor",         JSON_VARIANT_UNSIGNED, json_dispatch_uint,         offsetof(MountImageParameters, image_fd_idx),  JSON_MANDATORY },
+                { "userNamespaceFileDescriptor", JSON_VARIANT_UNSIGNED, json_dispatch_uint,         offsetof(MountImageParameters, userns_fd_idx), 0 },
+                { "readOnly",                    JSON_VARIANT_BOOLEAN,  json_dispatch_tristate,     offsetof(MountImageParameters, read_only),     0 },
+                { "growFileSystems",             JSON_VARIANT_BOOLEAN,  json_dispatch_tristate,     offsetof(MountImageParameters, growfs),        0 },
+                { "password",                    JSON_VARIANT_STRING,   json_dispatch_string,       offsetof(MountImageParameters, password),      0 },
+                { "imagePolicy",                 JSON_VARIANT_STRING,   json_dispatch_image_policy, offsetof(MountImageParameters, image_policy),  0 },
+                VARLINK_DISPATCH_POLKIT_FIELD,
+                {}
+        };
+
+        _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
+        _cleanup_(mount_image_parameters_done) MountImageParameters p = {
+                .image_fd_idx = UINT_MAX,
+                .userns_fd_idx = UINT_MAX,
+                .read_only = -1,
+                .growfs = -1,
+        };
+        _cleanup_(dissected_image_unrefp) DissectedImage *di = NULL;
+        _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *aj = NULL;
+        _cleanup_close_ int image_fd = -EBADF, userns_fd = -EBADF;
+        _cleanup_(image_policy_freep) ImagePolicy *use_policy = NULL;
+        Hashmap **polkit_registry = ASSERT_PTR(userdata);
+        _cleanup_free_ char *ps = NULL;
+        bool image_is_trusted = false;
+        uid_t peer_uid;
+        int r;
+
+        assert(link);
+        assert(parameters);
+
+        json_variant_sensitive(parameters); /* might contain passwords */
+
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get client UID: %m");
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        if (p.image_fd_idx != UINT_MAX) {
+                image_fd = varlink_peek_dup_fd(link, p.image_fd_idx);
+                if (image_fd < 0)
+                        return log_debug_errno(image_fd, "Failed to peek image fd from client: %m");
+        }
+
+        if (p.userns_fd_idx != UINT_MAX) {
+                userns_fd = varlink_peek_dup_fd(link, p.userns_fd_idx);
+                if (userns_fd < 0)
+                        return log_debug_errno(userns_fd, "Failed to peek user namespace fd from client: %m");
+        }
+
+        r = validate_image_fd(image_fd, &p);
+        if (r < 0)
+                return r;
+
+        r = validate_userns(link, &userns_fd);
+        if (r != 0)
+                return r;
+
+        r = verify_trusted_image_fd_by_path(image_fd);
+        if (r < 0)
+                return r;
+        image_is_trusted = r;
+
+        const char *polkit_details[] = {
+                "read_only", one_zero(p.read_only > 0),
+                NULL,
+        };
+
+        const char *polkit_action, *polkit_untrusted_action;
+        PolkitFlags polkit_flags;
+        if (userns_fd < 0) {
+                /* Mount into the host user namespace */
+                polkit_action = "io.systemd.mount-file-system.mount-image";
+                polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-image";
+                polkit_flags = 0;
+        } else {
+                /* Mount into a private user namespace */
+                polkit_action = "io.systemd.mount-file-system.mount-image-privately";
+                polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-image-privately";
+
+                /* If polkit is not around, let's allow mounting authenticated images by default */
+                polkit_flags = POLKIT_DEFAULT_ALLOW;
+        }
+
+        /* Let's definitely acquire the regular action privilege, for mounting properly signed images */
+        r = varlink_verify_polkit_async_full(
+                        link,
+                        /* bus= */ NULL,
+                        polkit_action,
+                        polkit_details,
+                        /* good_user= */ UID_INVALID,
+                        polkit_flags,
+                        polkit_registry);
+        if (r <= 0)
+                return r;
+
+        /* Generate the common dissection directory here. We are not going to use it, but the clients might,
+         * and they likely are unprivileged, hence cannot create it themselves. Hence let's just create it
+         * here, if it is missing. */
+        r = get_common_dissect_directory(NULL);
+        if (r < 0)
+                return r;
+
+        r = loop_device_make(
+                        image_fd,
+                        p.read_only == 0 ? O_RDONLY : O_RDWR,
+                        0,
+                        UINT64_MAX,
+                        UINT32_MAX,
+                        LO_FLAGS_PARTSCAN,
+                        LOCK_EX,
+                        &loop);
+        if (r < 0)
+                return r;
+
+        DissectImageFlags dissect_flags =
+                (p.read_only == 0 ? DISSECT_IMAGE_READ_ONLY : 0) |
+                (p.growfs != 0 ? DISSECT_IMAGE_GROWFS : 0) |
+                DISSECT_IMAGE_DISCARD_ANY |
+                DISSECT_IMAGE_FSCK |
+                DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+                DISSECT_IMAGE_PIN_PARTITION_DEVICES |
+                DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
+
+        /* Let's see if we have acquired the privilege to mount untrusted images already */
+        bool polkit_have_untrusted_action =
+                varlink_has_polkit_action(link, polkit_untrusted_action, polkit_details, polkit_registry);
+
+        for (;;) {
+                use_policy = image_policy_free(use_policy);
+                ps = mfree(ps);
+
+                /* We use the image policy for trusted images if either the path is below a trusted
+                 * directory, or if we have already acquired a PK authentication that tells us that untrusted
+                 * images are OK */
+                bool use_trusted_policy =
+                        image_is_trusted ||
+                        polkit_have_untrusted_action;
+
+                r = determine_image_policy(
+                                image_fd,
+                                use_trusted_policy,
+                                p.image_policy,
+                                &use_policy);
+                if (r < 0)
+                        return r;
+
+                r = image_policy_to_string(use_policy, /* simplify= */ true, &ps);
+                if (r < 0)
+                        return r;
+
+                log_debug("Using image policy: %s", ps);
+
+                r = dissect_loop_device(
+                                loop,
+                                &verity,
+                                /* mount_options= */ NULL,
+                                use_policy,
+                                dissect_flags,
+                                &di);
+                if (r == -ENOPKG)
+                        return varlink_error(link, "io.systemd.MountFileSystem.IncompatibleImage", NULL);
+                if (r == -ENOTUNIQ)
+                        return varlink_error(link, "io.systemd.MountFileSystem.MultipleRootPartitionsFound", NULL);
+                if (r == -ENXIO)
+                        return varlink_error(link, "io.systemd.MountFileSystem.RootPartitionNotFound", NULL);
+                if (r == -ERFKILL) {
+                        /* The image policy refused this, let's retry after trying to get PolicyKit */
+
+                        if (!polkit_have_untrusted_action) {
+                                log_debug("Denied by image policy. Trying a stronger polkit authentication before continuing.");
+                                r = varlink_verify_polkit_async_full(
+                                                link,
+                                                /* bus= */ NULL,
+                                                polkit_untrusted_action,
+                                                polkit_details,
+                                                /* good_user= */ UID_INVALID,
+                                                /* flags= */ 0,                   /* NB: the image cannot be authenticated, hence unless PK is around to allow this anyway, fail! */
+                                                polkit_registry);
+                                if (r <= 0 && !ERRNO_IS_NEG_PRIVILEGE(r))
+                                        return r;
+                                if (r > 0) {
+                                        /* Try again, now that we know the client has enough privileges. */
+                                        log_debug("Denied by image policy, retrying after polkit authentication.");
+                                        polkit_have_untrusted_action = true;
+                                        continue;
+                                }
+                        }
+
+                        return varlink_error(link, "io.systemd.MountFileSystem.DeniedByImagePolicy", NULL);
+                }
+                if (r < 0)
+                        return r;
+
+                /* Success */
+                break;
+        }
+
+        r = dissected_image_load_verity_sig_partition(
+                        di,
+                        loop->fd,
+                        &verity);
+        if (r < 0)
+                return r;
+
+        r = dissected_image_decrypt(
+                        di,
+                        p.password,
+                        &verity,
+                        dissect_flags);
+        if (r == -ENOKEY) /* new dm-verity userspace returns ENOKEY if the dm-verity signature key is not in
+                           * key chain. That's great. */
+                return varlink_error(link, "io.systemd.MountFileSystem.KeyNotFound", NULL);
+        if (r == -EBUSY) /* DM kernel subsystem is shit with returning useful errors hence we keep retrying
+                          * under the assumption that some errors are transitional. Which the errors might
+                          * not actually be. After all retries failed we return EBUSY. Let's turn that into a
+                          * generic Verity error. It's not very helpful, could mean anything, but at least it
+                          * gives client a clear idea that this has to do with Verity. */
+                return varlink_error(link, "io.systemd.MountFileSystem.VerityFailure", NULL);
+        if (r < 0)
+                return r;
+
+        r = dissected_image_mount(
+                        di,
+                        /* where= */ NULL,
+                        /* uid_shift= */ UID_INVALID,
+                        /* uid_range= */ UID_INVALID,
+                        userns_fd,
+                        dissect_flags);
+        if (r < 0)
+                return r;
+
+        for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *pj = NULL;
+                DissectedPartition *pp = di->partitions + d;
+                int fd_idx;
+
+                if (!pp->found)
+                        continue;
+
+                if (pp->fsmount_fd < 0)
+                        continue;
+
+                if (userns_fd >= 0) {
+                        r = nsresource_add_mount(userns_fd, pp->fsmount_fd);
+                        if (r < 0)
+                                return r;
+                }
+
+                fd_idx = varlink_push_fd(link, pp->fsmount_fd);
+                if (fd_idx < 0)
+                        return fd_idx;
+
+                TAKE_FD(pp->fsmount_fd);
+
+                r = json_build(&pj,
+                               JSON_BUILD_OBJECT(
+                                               JSON_BUILD_PAIR("designator", JSON_BUILD_STRING(partition_designator_to_string(d))),
+                                               JSON_BUILD_PAIR("writable", JSON_BUILD_BOOLEAN(pp->rw)),
+                                               JSON_BUILD_PAIR("growFileSystem", JSON_BUILD_BOOLEAN(pp->growfs)),
+                                               JSON_BUILD_PAIR_CONDITION(pp->partno > 0, "partitionNumber", JSON_BUILD_INTEGER(pp->partno)),
+                                               JSON_BUILD_PAIR_CONDITION(pp->architecture > 0, "architecture", JSON_BUILD_STRING(architecture_to_string(pp->architecture))),
+                                               JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(pp->uuid), "partitionUuid", JSON_BUILD_UUID(pp->uuid)),
+                                               JSON_BUILD_PAIR("fileSystemType", JSON_BUILD_STRING(dissected_partition_fstype(pp))),
+                                               JSON_BUILD_PAIR_CONDITION(pp->label, "partitionLabel", JSON_BUILD_STRING(pp->label)),
+                                               JSON_BUILD_PAIR("size", JSON_BUILD_INTEGER(pp->size)),
+                                               JSON_BUILD_PAIR("offset", JSON_BUILD_INTEGER(pp->offset)),
+                                               JSON_BUILD_PAIR("mountFileDescriptor", JSON_BUILD_INTEGER(fd_idx))));
+                if (r < 0)
+                        return r;
+
+                r = json_variant_append_array(&aj, pj);
+                if (r < 0)
+                        return r;
+        }
+
+        loop_device_relinquish(loop);
+
+        r = varlink_replyb(link, JSON_BUILD_OBJECT(
+                                           JSON_BUILD_PAIR("partitions", JSON_BUILD_VARIANT(aj)),
+                                           JSON_BUILD_PAIR("imagePolicy", JSON_BUILD_STRING(ps)),
+                                           JSON_BUILD_PAIR("imageSize", JSON_BUILD_INTEGER(di->image_size)),
+                                           JSON_BUILD_PAIR("sectorSize", JSON_BUILD_INTEGER(di->sector_size)),
+                                           JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(di->image_uuid), "imageUuid", JSON_BUILD_UUID(di->image_uuid))));
+        if (r < 0)
+                return r;
+
+        return r;
+}
+
+/* Serves one client connection on the given Varlink server, driving it with a private event loop that
+ * lives only for the duration of this call.
+ *
+ * Always takes possession of _fd, on success and on failure. Returns 0 once the event loop has finished
+ * without error, or a negative errno-style value otherwise (failures are logged here).
+ *
+ * Once the server has been attached to the event loop it is detached again on every exit path, so that a
+ * failure while handling one connection does not leave the long-lived server object bound to an event
+ * loop we are about to release.
+ * NOTE(review): a later varlink_server_attach_event() is presumably refused while a previous loop is
+ * still attached; confirm against the Varlink server implementation. */
+static int process_connection(VarlinkServer *server, int _fd) {
+        _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */
+        _cleanup_(varlink_close_unrefp) Varlink *vl = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        int r, q;
+
+        r = sd_event_new(&event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate event loop: %m");
+
+        r = varlink_server_attach_event(server, event, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to attach Varlink server to event loop: %m");
+
+        /* From here on the server is attached: leave via 'finish' only, so that it gets detached again. */
+
+        r = varlink_server_add_connection(server, fd, &vl);
+        if (r < 0) {
+                r = log_error_errno(r, "Failed to add connection: %m");
+                goto finish;
+        }
+
+        /* The connection object was created for the fd, hence disarm our own close-on-cleanup handler,
+         * and take a reference of our own that the cleanup handler of 'vl' can close and drop. */
+        TAKE_FD(fd);
+        vl = varlink_ref(vl);
+
+        r = varlink_set_allow_fd_passing_input(vl, true);
+        if (r < 0) {
+                r = log_error_errno(r, "Failed to enable fd passing for read: %m");
+                goto finish;
+        }
+
+        r = varlink_set_allow_fd_passing_output(vl, true);
+        if (r < 0) {
+                r = log_error_errno(r, "Failed to enable fd passing for write: %m");
+                goto finish;
+        }
+
+        r = sd_event_loop(event);
+        if (r < 0) {
+                r = log_error_errno(r, "Failed to run event loop: %m");
+                goto finish;
+        }
+
+        /* The (non-negative) exit code of the event loop is of no interest to our caller. */
+        r = 0;
+
+finish:
+        q = varlink_server_detach_event(server);
+        if (q < 0) {
+                log_error_errno(q, "Failed to detach Varlink server from event loop: %m");
+
+                /* Report the first error we ran into, not a follow-up one. */
+                if (r >= 0)
+                        r = q;
+        }
+
+        return r;
+}
+
+/* Main loop of a mount worker process.
+ *
+ * Expects exactly one listening socket to be passed in (as reported by sd_listen_fds()), sets up a Varlink
+ * server that implements io.systemd.MountFileSystem.MountImage, and then accept()s and processes
+ * connections one at a time via process_connection().
+ *
+ * The loop ends, and 0 is returned, when one of the following holds:
+ *   - the iteration counter exceeded ITERATIONS_MAX,
+ *   - the worker has been running for RUNTIME_MAX_USEC,
+ *   - the worker has been idle for LISTEN_IDLE_USEC (disabled if $MOUNTFS_FIXED_WORKER parses as true).
+ *
+ * Setup failures, accept() failures other than EAGAIN/EINTR, and failures to poll the listening socket or
+ * signal the parent are logged and returned as negative errno-style values. argc/argv are not used. */
+static int run(int argc, char *argv[]) {
+        usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY;
+        _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL;
+        _cleanup_(hashmap_freep) Hashmap *polkit_registry = NULL;
+        _cleanup_(pidref_done) PidRef parent = PIDREF_NULL;
+        unsigned n_iterations = 0;
+        int m, listen_fd, r;
+
+        log_setup();
+
+        m = sd_listen_fds(false);
+        if (m < 0)
+                return log_error_errno(m, "Failed to determine number of listening fds: %m");
+        if (m == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No socket to listen on received.");
+        if (m > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Worker can only listen on a single socket at a time.");
+
+        listen_fd = SD_LISTEN_FDS_START;
+
+        /* We want accept4() below to block, hence turn off non-blocking mode on the listening socket. */
+        r = fd_nonblock(listen_fd, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to turn off non-blocking mode for listening socket: %m");
+
+        r = varlink_server_new(&server, VARLINK_SERVER_INHERIT_USERDATA);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate server: %m");
+
+        r = varlink_server_add_interface(server, &vl_interface_io_systemd_MountFileSystem);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add MountFileSystem interface to varlink server: %m");
+
+        r = varlink_server_bind_method_many(
+                        server,
+                        "io.systemd.MountFileSystem.MountImage", vl_method_mount_image);
+        if (r < 0)
+                return log_error_errno(r, "Failed to bind methods: %m");
+
+        /* The polkit registry is handed to the method handlers through the server's userdata pointer. */
+        varlink_server_set_userdata(server, &polkit_registry);
+
+        r = varlink_server_set_exit_on_idle(server, true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enable exit-on-idle mode: %m");
+
+        /* Name the variable in the error message exactly as we read it, so that the log points at the
+         * right knob. */
+        r = getenv_bool("MOUNTFS_FIXED_WORKER");
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse MOUNTFS_FIXED_WORKER: %m");
+        listen_idle_usec = r ? USEC_INFINITY : LISTEN_IDLE_USEC;
+
+        r = pidref_set_parent(&parent);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire pidfd of parent process: %m");
+
+        start_time = now(CLOCK_MONOTONIC);
+
+        for (;;) {
+                _cleanup_close_ int fd = -EBADF;
+                usec_t n;
+
+                /* Exit the worker in regular intervals, to flush out all memory use */
+                if (n_iterations++ > ITERATIONS_MAX) {
+                        log_debug("Exiting worker, processed %u iterations, that's enough.", n_iterations);
+                        break;
+                }
+
+                n = now(CLOCK_MONOTONIC);
+                if (n >= usec_add(start_time, RUNTIME_MAX_USEC)) {
+                        log_debug("Exiting worker, ran for %s, that's enough.",
+                                  FORMAT_TIMESPAN(usec_sub_unsigned(n, start_time), 0));
+                        break;
+                }
+
+                /* USEC_INFINITY in last_busy_usec means "just was busy" (or just started): start measuring
+                 * idle time from now. Otherwise check whether we have been idle for too long. */
+                if (last_busy_usec == USEC_INFINITY)
+                        last_busy_usec = n;
+                else if (listen_idle_usec != USEC_INFINITY && n >= usec_add(last_busy_usec, listen_idle_usec)) {
+                        log_debug("Exiting worker, been idle for %s.",
+                                  FORMAT_TIMESPAN(usec_sub_unsigned(n, last_busy_usec), 0));
+                        break;
+                }
+
+                (void) rename_process("systemd-mountwork: waiting...");
+                fd = RET_NERRNO(accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC));
+                (void) rename_process("systemd-mountwork: processing...");
+
+                if (fd == -EAGAIN)
+                        continue; /* The listening socket has SO_RECVTIMEO set, hence a timeout is expected
+                                   * after a while, let's check if it's time to exit though. */
+                if (fd == -EINTR)
+                        continue; /* Might be that somebody attached via strace, let's just continue in that
+                                   * case */
+                if (fd < 0)
+                        return log_error_errno(fd, "Failed to accept() from listening socket: %m");
+
+                if (now(CLOCK_MONOTONIC) <= usec_add(n, PRESSURE_SLEEP_TIME_USEC)) {
+                        /* We only slept a very short time? If so, let's see if there are more sockets
+                         * pending, and if so, let's ask our parent for more workers */
+
+                        r = fd_wait_for_event(listen_fd, POLLIN, 0);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to test for POLLIN on listening socket: %m");
+
+                        if (FLAGS_SET(r, POLLIN)) {
+                                r = pidref_kill(&parent, SIGUSR2);
+                                if (r == -ESRCH)
+                                        return log_error_errno(r, "Parent already died?");
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to send SIGUSR2 signal to parent. %m");
+                        }
+                }
+
+                /* Failures while serving a single connection are logged by process_connection() and are
+                 * deliberately not fatal for the worker. */
+                (void) process_connection(server, TAKE_FD(fd));
+                last_busy_usec = USEC_INFINITY;
+        }
+
+        return 0;
+}
+
+/* NOTE(review): DEFINE_MAIN_FUNCTION() is defined outside this file; presumably it generates main() as a
+ * wrapper around run(). Confirm against its definition. */
+DEFINE_MAIN_FUNCTION(run);
author	Lennart Poettering <lennart@poettering.net>	2023-03-09 12:27:29 +0100
committer	Lennart Poettering <lennart@poettering.net>	2024-04-06 16:08:24 +0200
commit	702a52f4b5d49cce11e2adbc740deb3b644e2de0 (patch)
tree	2839e8881cd65cfd1ef03609f66e96c422ef3944 /src/mountfsd
parent	nsresourced: add client-side helpers around nsresourced APIs (diff)
download	systemd-702a52f4b5d49cce11e2adbc740deb3b644e2de0.tar.xz systemd-702a52f4b5d49cce11e2adbc740deb3b644e2de0.zip