From 8e8009dc500247da26e5414571c60d20ad3b8483 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Mar 2019 16:39:17 +0100 Subject: execute: use structured initialization --- src/core/execute.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index 39d9f07518..0336083b0e 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -4864,14 +4864,20 @@ static void exec_runtime_freep(ExecRuntime **rt) { (void) exec_runtime_free(*rt, false); } -static int exec_runtime_allocate(ExecRuntime **rt) { - assert(rt); +static int exec_runtime_allocate(ExecRuntime **ret) { + ExecRuntime *n; - *rt = new0(ExecRuntime, 1); - if (!*rt) + assert(ret); + + n = new(ExecRuntime, 1); + if (!n) return -ENOMEM; - (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1; + *n = (ExecRuntime) { + .netns_storage_socket = { -1, -1 }, + }; + + *ret = n; return 0; } -- cgit v1.2.3 From 2fa3742d96395e44c952582fc8348a04c350f68e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Mar 2019 16:39:45 +0100 Subject: execute: make things a tiny bit shorter --- src/core/execute.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index 0336083b0e..8f0e80290c 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -4940,7 +4940,7 @@ static int exec_runtime_add( static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) { _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL; - _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1}; + _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }; int r; assert(m); @@ -4967,8 +4967,7 @@ static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, E return r; /* Avoid cleanup */ - netns_storage_socket[0] = -1; - netns_storage_socket[1] = -1; + netns_storage_socket[0] = netns_storage_socket[1] = -1; return 1; } -- cgit v1.2.3 
From 44ffcbaea4fe323ab94448f25fe8e923524b120e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Mar 2019 16:40:06 +0100 Subject: execute: (void)ify more --- src/core/namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/namespace.c b/src/core/namespace.c index d482c40c24..87e4a8a25f 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1661,14 +1661,14 @@ int setup_netns(int netns_storage_socket[static 2]) { netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT); if (netns == -EAGAIN) { - /* Nothing stored yet, so let's create a new namespace */ + /* Nothing stored yet, so let's create a new namespace. */ if (unshare(CLONE_NEWNET) < 0) { r = -errno; goto fail; } - loopback_setup(); + (void) loopback_setup(); netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY); if (netns < 0) { -- cgit v1.2.3 From da6bc6ed05f04561aff6f995fc457d42d892c049 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Mar 2019 16:55:19 +0100 Subject: execute: no need to check for NULL when function right after does anyway --- src/core/execute.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index 8f0e80290c..4a0ffe198a 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -4860,8 +4860,7 @@ static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) { } static void exec_runtime_freep(ExecRuntime **rt) { - if (*rt) - (void) exec_runtime_free(*rt, false); + (void) exec_runtime_free(*rt, false); } static int exec_runtime_allocate(ExecRuntime **ret) { -- cgit v1.2.3 From 51af7fb230e0d9eebf3810b46334d475e7536833 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Mar 2019 16:42:04 +0100 Subject: core: add open_netns_path() helper The new call allows us to open a netns from the file system, and store it in a "storage fd pair". 
It's supposed to work with setup_netns() and allows pre-population of the netns used with one opened from the file system. --- src/core/namespace.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/core/namespace.h | 1 + 2 files changed, 54 insertions(+) diff --git a/src/core/namespace.c b/src/core/namespace.c index 87e4a8a25f..02ac49d02c 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1703,6 +1703,59 @@ fail: return r; } +int open_netns_path(int netns_storage_socket[static 2], const char *path) { + _cleanup_close_ int netns = -1; + int q, r; + + assert(netns_storage_socket); + assert(netns_storage_socket[0] >= 0); + assert(netns_storage_socket[1] >= 0); + assert(path); + + /* If the storage socket doesn't contain a netns fd yet, open one via the file system and store it in + * it. This is supposed to be called ahead of time, i.e. before setup_netns() which will allocate a + * new anonymous netns if needed. */ + + if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0) + return -errno; + + netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT); + if (netns == -EAGAIN) { + /* Nothing stored yet. Open the file from the file system. */ + + netns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (netns < 0) { + r = -errno; + goto fail; + } + + r = fd_is_network_ns(netns); + if (r == 0) { /* Not a netns? Refuse early. 
*/ + r = -EINVAL; + goto fail; + } + if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */ + goto fail; + + r = 1; + + } else if (netns < 0) { + r = netns; + goto fail; + } else + r = 0; /* Already allocated */ + + q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT); + if (q < 0) { + r = q; + goto fail; + } + +fail: + (void) lockf(netns_storage_socket[0], F_ULOCK, 0); + return r; +} + bool ns_type_supported(NamespaceType type) { const char *t, *ns_proc; diff --git a/src/core/namespace.h b/src/core/namespace.h index ab3983f790..cd1e8b77bb 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -93,6 +93,7 @@ int setup_tmp_dirs( char **var_tmp_dir); int setup_netns(int netns_storage_socket[static 2]); +int open_netns_path(int netns_storage_socket[static 2], const char *path); const char* protect_home_to_string(ProtectHome p) _const_; ProtectHome protect_home_from_string(const char *s) _pure_; -- cgit v1.2.3 From a8d08f39d140afb1cb047c65d7d24388bda82e71 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Mar 2019 16:39:01 +0100 Subject: core: add new setting NetworkNamespacePath= for configuring a netns by path for a service Fixes: #2741 --- src/core/dbus-execute.c | 4 ++++ src/core/execute.c | 25 ++++++++++++++++++++++--- src/core/execute.h | 2 ++ src/core/load-fragment-gperf.gperf.m4 | 1 + 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index f22bf4a371..0b28643e79 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -778,6 +778,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), 
SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), /* Obsolete/redundant properties: */ SD_BUS_PROPERTY("Capabilities", "s", property_get_empty_string, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), @@ -1217,6 +1218,9 @@ int bus_exec_context_set_transient_property( if (streq(name, "MountFlags")) return bus_set_transient_mount_flags(u, name, &c->mount_flags, message, flags, error); + if (streq(name, "NetworkNamespacePath")) + return bus_set_transient_path(u, name, &c->network_namespace_path, message, flags, error); + if (streq(name, "SupplementaryGroups")) { _cleanup_strv_free_ char **l = NULL; char **p; diff --git a/src/core/execute.c b/src/core/execute.c index 4a0ffe198a..240ec5487b 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -3062,6 +3062,14 @@ static int exec_child( } } + if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) { + r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path); + if (r < 0) { + *exit_status = EXIT_NETWORK; + return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path); + } + } + r = setup_input(context, params, socket_fd, named_iofds); if (r < 0) { *exit_status = EXIT_STDIN; @@ -3272,13 +3280,17 @@ static int exec_child( } } - if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) { + if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) { + if (ns_type_supported(NAMESPACE_NET)) { r = setup_netns(runtime->netns_storage_socket); if (r < 0) { *exit_status = EXIT_NETWORK; return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m"); } + } else if (context->network_namespace_path) { + *exit_status = EXIT_NETWORK; + return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), 
"NetworkNamespacePath= is not supported, refusing."); } else log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring."); } @@ -3879,6 +3891,8 @@ void exec_context_done(ExecContext *c) { c->stdin_data = mfree(c->stdin_data); c->stdin_data_size = 0; + + c->network_namespace_path = mfree(c->network_namespace_path); } int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) { @@ -4556,6 +4570,11 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { prefix, s); } + if (c->network_namespace_path) + fprintf(f, + "%sNetworkNamespacePath: %s\n", + prefix, c->network_namespace_path); + if (c->syscall_errno > 0) { const char *errno_name; @@ -4947,7 +4966,7 @@ static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, E assert(id); /* It is not necessary to create ExecRuntime object. */ - if (!c->private_network && !c->private_tmp) + if (!c->private_network && !c->private_tmp && !c->network_namespace_path) return 0; if (c->private_tmp) { @@ -4956,7 +4975,7 @@ static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, E return r; } - if (c->private_network) { + if (c->private_network || c->network_namespace_path) { if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0) return -errno; } diff --git a/src/core/execute.h b/src/core/execute.h index 4b5b2d98ce..df6dd9f388 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -279,6 +279,8 @@ struct ExecContext { bool nice_set:1; bool ioprio_set:1; bool cpu_sched_set:1; + + char *network_namespace_path; }; static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) { diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 2ac822ef4b..c7c097d0a4 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -114,6 +114,7 @@ $1.PrivateDevices, 
config_parse_bool, 0, $1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables) $1.ProtectKernelModules, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_modules) $1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) +$1.NetworkNamespacePath, config_parse_unit_path_printf, 0, offsetof($1, exec_context.network_namespace_path) $1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) $1.PrivateMounts, config_parse_bool, 0, offsetof($1, exec_context.private_mounts) -- cgit v1.2.3 From 7619cb32f05f50011760b0063bc5c6e1041f926c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Mar 2019 16:43:58 +0100 Subject: core: support netns joining also for sockets created by .socket unit Similar to the cgroup magic we nowadays do when listening to sockets, to assign them the right bpf programs, let's also do the same and join the specified netns in the child process. This allows people to listen on sockets in specific namespaces, or join multiple services and socket units together to live in the same namespace. --- src/core/socket.c | 77 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/src/core/socket.c b/src/core/socket.c index af95e9027e..3b60914c86 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -1473,6 +1473,25 @@ static int socket_address_listen_do( log_unit_error_errno(u, error, fmt, strna(_t)); \ }) +static int fork_needed(const SocketAddress *address, const ExecContext *context) { + int r; + + assert(address); + assert(context); + + /* Check if we need to do the cgroup or netns stuff. If not we can do things much simpler.
*/ + + if (IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) { + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r != BPF_FIREWALL_UNSUPPORTED) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */ + return true; + } + + return context->private_network || context->network_namespace_path; +} + static int socket_address_listen_in_cgroup( Socket *s, const SocketAddress *address, @@ -1485,18 +1504,34 @@ static int socket_address_listen_in_cgroup( assert(s); assert(address); - /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the socket's cgroup - * in which the socket is actually created. This way we ensure the socket is actually properly attached to the - * unit's cgroup for the purpose of BPF filtering and such. */ - - if (!IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) - goto shortcut; /* BPF filtering only applies to IPv4 + IPv6, shortcut things for other protocols */ + /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the + * socket's cgroup and network namespace in which the socket is actually created. This way we ensure + * the socket is actually properly attached to the unit's cgroup for the purpose of BPF filtering and + * such. */ - r = bpf_firewall_supported(); + r = fork_needed(address, &s->exec_context); if (r < 0) return r; - if (r == BPF_FIREWALL_UNSUPPORTED) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */ - goto shortcut; + if (r == 0) { + /* Shortcut things... 
*/ + fd = socket_address_listen_do(s, address, label); + if (fd < 0) + return log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); + + return fd; + } + + r = unit_setup_exec_runtime(UNIT(s)); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed acquire runtime: %m"); + + if (s->exec_context.network_namespace_path && + s->exec_runtime && + s->exec_runtime->netns_storage_socket[0] >= 0) { + r = open_netns_path(s->exec_runtime->netns_storage_socket, s->exec_context.network_namespace_path); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to open network namespace path %s: %m", s->exec_context.network_namespace_path); + } if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m"); @@ -1509,6 +1544,23 @@ static int socket_address_listen_in_cgroup( pair[0] = safe_close(pair[0]); + if ((s->exec_context.private_network || s->exec_context.network_namespace_path) && + s->exec_runtime && + s->exec_runtime->netns_storage_socket[0] >= 0) { + + if (ns_type_supported(NAMESPACE_NET)) { + r = setup_netns(s->exec_runtime->netns_storage_socket); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to join network namespace: %m"); + _exit(EXIT_NETWORK); + } + } else if (s->exec_context.network_namespace_path) { + log_unit_error(UNIT(s), "Network namespace path configured but network namespaces not supported."); + _exit(EXIT_NETWORK); + } else + log_unit_warning(UNIT(s), "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring."); + } + fd = socket_address_listen_do(s, address, label); if (fd < 0) { log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); @@ -1538,13 +1590,6 @@ static int socket_address_listen_in_cgroup( return log_address_error_errno(UNIT(s), address, fd, "Failed to receive listening socket (%s): %m"); return fd; - -shortcut: - fd 
= socket_address_listen_do(s, address, label); - if (fd < 0) - return log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); - - return fd; } DEFINE_TRIVIAL_CLEANUP_FUNC(Socket *, socket_close_fds); -- cgit v1.2.3 From 4ad9fb38a9b20dee8bb8e576b2be9c278bf2df1c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Mar 2019 16:46:25 +0100 Subject: run: make sure NetworkNamespacePath= can be used on the systemd-run cmdline --- src/shared/bus-unit-util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index dff87f565e..3ea1bd29c9 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -744,7 +744,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con "UtmpIdentifier", "UtmpMode", "PAMName", "TTYPath", "WorkingDirectory", "RootDirectory", "SyslogIdentifier", "ProtectSystem", "ProtectHome", "SELinuxContext", "RootImage", - "RuntimeDirectoryPreserve", "Personality", "KeyringMode")) + "RuntimeDirectoryPreserve", "Personality", "KeyringMode", "NetworkNamespacePath")) return bus_append_string(m, field, eq); -- cgit v1.2.3 From 4107452e510d1a33ef4f3313c07912c098c7ae98 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Mar 2019 21:20:36 +0100 Subject: man: document NetworkNamespacePath= --- man/systemd.exec.xml | 24 +++++++++++++++++++++++- man/systemd.unit.xml | 27 +++++++++++---------------- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index b8843f1ea0..2ed8c38f37 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1100,7 +1100,29 @@ BindReadOnlyPaths=/var/lib/systemd Note that the implementation of this setting might be impossible (for example if network namespaces are not available), and the unit should be written in a way that does not solely rely on this setting for - security. + security. 
+ + When this option is used on a socket unit any sockets bound on behalf of this unit will be + bound within a private network namespace. This may be combined with + JoinsNamespaceOf= to listen on sockets inside of network namespaces of other + services. + + + + NetworkNamespacePath= + + Takes an absolute file system path referring to a Linux network namespace + pseudo-file (i.e. a file like /proc/$PID/ns/net or a bind mount or symlink to + one). When set the invoked processes are added to the network namespace referenced by that path. The + path has to point to a valid namespace file at the moment the processes are forked off. If this + option is used PrivateNetwork= has no effect. If this option is used together with + JoinsNamespaceOf= then it only has an effect if this unit is started before any of + the listed units that have PrivateNetwork= or + NetworkNamespacePath= configured, as otherwise the network namespace of those + units is reused. + + When this option is used on a socket unit any sockets bound on behalf of this unit will be + bound within the specified network namespace. diff --git a/man/systemd.unit.xml b/man/systemd.unit.xml index 82c63e1609..14418c359f 100644 --- a/man/systemd.unit.xml +++ b/man/systemd.unit.xml @@ -728,23 +728,18 @@ JoinsNamespaceOf= - For units that start processes (such as - service units), lists one or more other units whose network - and/or temporary file namespace to join. This only applies to - unit types which support the - PrivateNetwork= and + For units that start processes (such as service units), lists one or more other units + whose network and/or temporary file namespace to join. This only applies to unit types which support + the PrivateNetwork=, NetworkNamespacePath= and PrivateTmp= directives (see - systemd.exec5 - for details). If a unit that has this setting set is started, - its processes will see the same /tmp, - /var/tmp and network namespace as one - listed unit that is started.
If multiple listed units are - already started, it is not defined which namespace is joined. - Note that this setting only has an effect if - PrivateNetwork= and/or - PrivateTmp= is enabled for both the unit - that joins the namespace and the unit whose namespace is - joined. + systemd.exec5 for + details). If a unit that has this setting set is started, its processes will see the same + /tmp, /var/tmp and network namespace as one listed unit + that is started. If multiple listed units are already started, it is not defined which namespace is + joined. Note that this setting only has an effect if + PrivateNetwork=/NetworkNamespacePath= and/or + PrivateTmp= is enabled for both the unit that joins the namespace and the unit + whose namespace is joined. -- cgit v1.2.3