summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYu Watanabe <watanabe.yu+github@gmail.com>2023-08-12 08:18:41 +0200
committerYu Watanabe <watanabe.yu+github@gmail.com>2023-08-22 04:50:16 +0200
commit94fe4cf2557d1f70f20ee02d32f4c2ae6bc1fb3f (patch)
tree18f374ba064ad7a3b53976ec2678564b520f7fe6
parentcore/credential: make setup_credentials() return path to credentials directory (diff)
downloadsystemd-94fe4cf2557d1f70f20ee02d32f4c2ae6bc1fb3f.tar.xz
systemd-94fe4cf2557d1f70f20ee02d32f4c2ae6bc1fb3f.zip
core: do not leak mount for credentials directory if mount namespace is enabled
Since kernel v5.2, open_tree() and move_mount() are added. If a service loads or sets credentials, then let's try to clone the mount that contains credentials with open_tree(), then mount it after a (private) mount namespace is initialized for the service. Then, we can setup a mount for credentials directory without leaking it to the main shared mount namespace. With this change, the credentials for services that request their own private mount namespace become much much safer. And, the number of mount events triggered by setting up credential directories can be decreased. Unfortunately, this does not 'fix' the original issue #25527, as the reported service does not requests private mount namespace, but the situation should be better now.
-rw-r--r--src/core/credential.c70
-rw-r--r--src/core/credential.h3
-rw-r--r--src/core/execute.c22
-rw-r--r--src/core/namespace.c34
-rw-r--r--src/core/namespace.h1
-rw-r--r--src/test/test-namespace.c1
-rw-r--r--src/test/test-ns.c1
7 files changed, 115 insertions, 17 deletions
diff --git a/src/core/credential.c b/src/core/credential.c
index 7c9d6f6a9b..d9a88004c5 100644
--- a/src/core/credential.c
+++ b/src/core/credential.c
@@ -10,6 +10,7 @@
#include "glob-util.h"
#include "io-util.h"
#include "label-util.h"
+#include "missing_syscall.h"
#include "mkdir-label.h"
#include "mount-util.h"
#include "mountpoint-util.h"
@@ -17,6 +18,7 @@
#include "random-util.h"
#include "recurse-dir.h"
#include "rm-rf.h"
+#include "socket-util.h"
#include "tmpfile-util.h"
ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
@@ -730,7 +732,8 @@ static int setup_credentials_internal(
bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
uid_t uid,
- gid_t gid) {
+ gid_t gid,
+ int *ret_mount_fd) {
int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
* if we mounted something; false if we definitely can't mount anything */
@@ -848,6 +851,27 @@ static int setup_credentials_internal(
if (r < 0)
return r;
+ if (ret_mount_fd) {
+ _cleanup_close_ int mount_fd = -EBADF;
+
+ r = mount_fd = RET_NERRNO(open_tree(AT_FDCWD, workspace, OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC));
+ if (r >= 0) {
+ /* The workspace is already cloned in the above, and not necessary
+ * anymore. Even though the workspace is unmounted when the short-lived
+ * child process exits, let's explicitly unmount it here for safety. */
+ r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
+ if (r < 0)
+ return r;
+
+ *ret_mount_fd = TAKE_FD(mount_fd);
+ return 0;
+ }
+
+ /* Old kernel? Unprivileged? */
+ if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
+ return r;
+ }
+
/* And mount it to the final place, read-only */
r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
} else
@@ -868,6 +892,8 @@ static int setup_credentials_internal(
return -errno;
}
+ if (ret_mount_fd)
+ *ret_mount_fd = -EBADF;
return 0;
}
@@ -877,17 +903,22 @@ int setup_credentials(
const char *unit,
uid_t uid,
gid_t gid,
- char **ret_path) {
+ char **ret_path,
+ int *ret_mount_fd) {
+ _cleanup_close_pair_ int socket_pair[2] = PIPE_EBADF;
_cleanup_free_ char *p = NULL, *q = NULL;
+ _cleanup_close_ int mount_fd = -EBADF;
int r;
assert(context);
assert(params);
assert(ret_path);
+ assert(ret_mount_fd);
if (!exec_context_has_credentials(context)) {
*ret_path = NULL;
+ *ret_mount_fd = -EBADF;
return 0;
}
@@ -912,7 +943,12 @@ int setup_credentials(
if (r < 0 && r != -EEXIST)
return r;
- r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
+ if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0, socket_pair) < 0)
+ return -errno;
+
+ r = safe_fork_full("(sd-mkdcreds)",
+ NULL, &socket_pair[1], 1,
+ FORK_DEATHSIG|FORK_CLOSE_ALL_FDS|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_REOPEN_LOG, NULL);
if (r < 0) {
_cleanup_free_ char *t = NULL, *u = NULL;
@@ -948,14 +984,15 @@ int setup_credentials(
true, /* reuse the workspace if it is already a mount */
false, /* it's OK to fall back to a plain directory if we can't mount anything */
uid,
- gid);
+ gid,
+ NULL);
(void) rmdir(u); /* remove the workspace again if we can. */
if (r < 0)
return r;
- } else if (r == 0) {
+ } else if (r == 0) { /* child */
/* We managed to set up a mount namespace, and are now in a child. That's great. In this case
* we can use the same directory for all cases, after turning off propagation. Question
@@ -974,6 +1011,8 @@ int setup_credentials(
* given that we do this in a privately namespaced short-lived single-threaded process that
* no one else sees this should be OK to do. */
+ _cleanup_close_ int fd = -EBADF;
+
/* Turn off propagation from our namespace to host */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
if (r < 0)
@@ -988,7 +1027,14 @@ int setup_credentials(
false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
true, /* insist that something is mounted, do not allow fallback to plain directory */
uid,
- gid);
+ gid,
+ &fd);
+ if (r < 0)
+ goto child_fail;
+
+ r = send_one_fd_iov(socket_pair[1], fd,
+ &IOVEC_MAKE((int[]) { fd >= 0 }, sizeof(int)), 1,
+ MSG_DONTWAIT);
if (r < 0)
goto child_fail;
@@ -996,6 +1042,17 @@ int setup_credentials(
child_fail:
_exit(EXIT_FAILURE);
+
+ } else { /* parent */
+
+ int ret;
+ struct iovec iov = IOVEC_MAKE(&ret, sizeof(int));
+
+ r = receive_one_fd_iov(socket_pair[0], &iov, 1, MSG_DONTWAIT, &mount_fd);
+ if (r < 0)
+ return r;
+ if (ret > 0 && mount_fd < 0)
+ return -EIO;
}
/* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
@@ -1005,5 +1062,6 @@ int setup_credentials(
(void) rmdir(p);
*ret_path = TAKE_PTR(p);
+ *ret_mount_fd = TAKE_FD(mount_fd);
return 0;
}
diff --git a/src/core/credential.h b/src/core/credential.h
index 2afd88dfc5..4b936b32d6 100644
--- a/src/core/credential.h
+++ b/src/core/credential.h
@@ -46,4 +46,5 @@ int setup_credentials(
const char *unit,
uid_t uid,
gid_t gid,
- char **ret_path);
+ char **ret_path,
+ int *ret_mount_fd);
diff --git a/src/core/execute.c b/src/core/execute.c
index 9c900095ce..eb452a45ec 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -7,6 +7,7 @@
#include <sys/file.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
+#include <sys/mount.h>
#include <sys/personality.h>
#include <sys/prctl.h>
#include <sys/shm.h>
@@ -72,6 +73,7 @@
#include "missing_fs.h"
#include "missing_ioprio.h"
#include "missing_prctl.h"
+#include "missing_syscall.h"
#include "mkdir-label.h"
#include "namespace.h"
#include "parse-util.h"
@@ -3112,6 +3114,7 @@ static int apply_mount_namespace(
ExecRuntime *runtime,
const char *memory_pressure_path,
const char *creds_path,
+ int creds_fd,
char **error_path) {
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
@@ -3287,6 +3290,7 @@ static int apply_mount_namespace(
tmp_dir,
var_tmp_dir,
creds_path,
+ creds_fd,
context->log_namespace,
context->mount_propagation_flag,
&verity,
@@ -3971,6 +3975,7 @@ static int exec_child(
int ngids_after_pam = 0;
_cleanup_free_ int *fds = NULL;
_cleanup_strv_free_ char **fdnames = NULL;
+ _cleanup_close_ int creds_fd = -EBADF;
assert(unit);
assert(command);
@@ -4421,7 +4426,7 @@ static int exec_child(
}
if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
- r = setup_credentials(context, params, unit->id, uid, gid, &creds_path);
+ r = setup_credentials(context, params, unit->id, uid, gid, &creds_path, &creds_fd);
if (r < 0) {
*exit_status = EXIT_CREDENTIALS;
return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
@@ -4635,7 +4640,7 @@ static int exec_child(
if (needs_mount_namespace) {
_cleanup_free_ char *error_path = NULL;
- r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, creds_path, &error_path);
+ r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, creds_path, creds_fd, &error_path);
if (r < 0) {
*exit_status = EXIT_NAMESPACE;
return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
@@ -4643,6 +4648,19 @@ static int exec_child(
}
}
+ if (creds_fd >= 0) {
+ assert(creds_path);
+
+ /* When a mount namespace is not requested, then the target directory may not exist yet.
+ * Here, we ignore the failure, as if it fails, the subsequent move_mount() will fail. */
+ (void) mkdir_p_label(creds_path, 0755);
+
+ if (move_mount(creds_fd, "", AT_FDCWD, creds_path, MOVE_MOUNT_F_EMPTY_PATH) < 0) {
+ *exit_status = EXIT_CREDENTIALS;
+ return log_unit_error_errno(unit, errno, "Failed to mount credentials directory on %s: %m", creds_path);
+ }
+ }
+
if (needs_sandboxing) {
r = apply_protect_hostname(unit, context, exit_status);
if (r < 0)
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 51b5aad9c9..86406007ad 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -74,7 +74,8 @@ typedef enum MountMode {
EXTENSION_DIRECTORIES, /* Bind-mounted outside the root directory, and used by subsequent mounts */
EXTENSION_IMAGES, /* Mounted outside the root directory, and used by subsequent mounts */
MQUEUEFS,
- READWRITE_IMPLICIT, /* Should have the lowest priority. */
+ READWRITE_IMPLICIT, /* Should have the 2nd lowest priority. */
+ MKDIR, /* Should have the lowest priority. */
_MOUNT_MODE_MAX,
} MountMode;
@@ -227,6 +228,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
[EXEC] = "exec",
[NOEXEC] = "noexec",
[MQUEUEFS] = "mqueuefs",
+ [MKDIR] = "mkdir",
};
/* Helper struct for naming simplicity and reusability */
@@ -1551,6 +1553,12 @@ static int apply_one_mount(
case OVERLAY_MOUNT:
return mount_overlay(m);
+ case MKDIR:
+ r = mkdir_p_label(mount_entry_path(m), 0755);
+ if (r < 0)
+ return r;
+ return 1;
+
default:
assert_not_reached();
}
@@ -2017,6 +2025,7 @@ int setup_namespace(
const char* tmp_dir,
const char* var_tmp_dir,
const char *creds_path,
+ int creds_fd,
const char *log_namespace,
unsigned long mount_propagation_flag,
VeritySettings *verity,
@@ -2339,13 +2348,22 @@ int setup_namespace(
.flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
};
- *(m++) = (MountEntry) {
- .path_const = creds_path,
- .mode = BIND_MOUNT,
- .read_only = true,
- .source_const = creds_path,
- .ignore = true,
- };
+ /* If we have mount fd for credentials directory, then it will be mounted after
+ * namespace is set up. So, here we only create the mount point. */
+
+ if (creds_fd < 0)
+ *(m++) = (MountEntry) {
+ .path_const = creds_path,
+ .mode = BIND_MOUNT,
+ .read_only = true,
+ .source_const = creds_path,
+ .ignore = true,
+ };
+ else
+ *(m++) = (MountEntry) {
+ .path_const = creds_path,
+ .mode = MKDIR,
+ };
} else {
/* If our service has no credentials store configured, then make the whole
* credentials tree inaccessible wholesale. */
diff --git a/src/core/namespace.h b/src/core/namespace.h
index b6132154c5..b8615cbd98 100644
--- a/src/core/namespace.h
+++ b/src/core/namespace.h
@@ -122,6 +122,7 @@ int setup_namespace(
const char *tmp_dir,
const char *var_tmp_dir,
const char *creds_path,
+ int creds_fd,
const char *log_namespace,
unsigned long mount_propagation_flag,
VeritySettings *verity,
diff --git a/src/test/test-namespace.c b/src/test/test-namespace.c
index 25aafc35ca..436a784e86 100644
--- a/src/test/test-namespace.c
+++ b/src/test/test-namespace.c
@@ -194,6 +194,7 @@ TEST(protect_kernel_logs) {
NULL,
NULL,
NULL,
+ -EBADF,
NULL,
0,
NULL,
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index 77afd2f6b9..56f3de83b6 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -96,6 +96,7 @@ int main(int argc, char *argv[]) {
tmp_dir,
var_tmp_dir,
NULL,
+ -EBADF,
NULL,
0,
NULL,