diff options
author | Lennart Poettering <lennart@poettering.net> | 2024-09-12 17:28:59 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-09-12 17:28:59 +0200 |
commit | 5892950ba47b69ca5447e4969a0b8159f3555397 (patch) | |
tree | 6c8c1b456d729f8d2e570a3b0a6c467e12d4bd33 /src | |
parent | update TODO (diff) | |
parent | test-network: add test for sysctl watch (diff) | |
download | systemd-5892950ba47b69ca5447e4969a0b8159f3555397.tar.xz systemd-5892950ba47b69ca5447e4969a0b8159f3555397.zip |
Merge pull request #32212 from teknoraver/networkd-sysctl
More visibility into systemd-networkd sysctls
Diffstat (limited to 'src')
-rw-r--r-- | src/basic/sysctl-util.c | 45 | ||||
-rw-r--r-- | src/basic/sysctl-util.h | 21 | ||||
-rw-r--r-- | src/network/bpf/sysctl_monitor/meson.build | 25 | ||||
-rw-r--r-- | src/network/bpf/sysctl_monitor/sysctl-monitor-skel.h | 16 | ||||
-rw-r--r-- | src/network/bpf/sysctl_monitor/sysctl-monitor.bpf.c | 134 | ||||
-rw-r--r-- | src/network/bpf/sysctl_monitor/sysctl-write-event.h | 46 | ||||
-rw-r--r-- | src/network/meson.build | 6 | ||||
-rw-r--r-- | src/network/networkd-ipv6ll.c | 7 | ||||
-rw-r--r-- | src/network/networkd-link.c | 2 | ||||
-rw-r--r-- | src/network/networkd-manager.c | 18 | ||||
-rw-r--r-- | src/network/networkd-manager.h | 6 | ||||
-rw-r--r-- | src/network/networkd-ndisc.c | 9 | ||||
-rw-r--r-- | src/network/networkd-sysctl.c | 239 | ||||
-rw-r--r-- | src/network/networkd-sysctl.h | 10 | ||||
-rw-r--r-- | src/network/networkd.c | 4 | ||||
-rw-r--r-- | src/systemd/sd-messages.h | 3 | ||||
-rw-r--r-- | src/test/test-sysctl-util.c | 4 |
17 files changed, 556 insertions, 39 deletions
diff --git a/src/basic/sysctl-util.c b/src/basic/sysctl-util.c index b284c9ccd2..dfb99e1896 100644 --- a/src/basic/sysctl-util.c +++ b/src/basic/sysctl-util.c @@ -44,8 +44,39 @@ char* sysctl_normalize(char *s) { return s; } -int sysctl_write(const char *property, const char *value) { +static int shadow_update(Hashmap **shadow, const char *property, const char *value) { + _cleanup_free_ char *k = NULL, *v = NULL, *cur_k = NULL, *cur_v = NULL; + int r; + + assert(property); + assert(value); + + if (!shadow) + return 0; + + k = strdup(property); + if (!k) + return -ENOMEM; + + v = strdup(value); + if (!v) + return -ENOMEM; + + cur_v = hashmap_remove2(*shadow, k, (void**)&cur_k); + + r = hashmap_ensure_put(shadow, &path_hash_ops_free_free, k, v); + if (r < 0) + return r; + + TAKE_PTR(k); + TAKE_PTR(v); + + return 0; +} + +int sysctl_write_full(const char *property, const char *value, Hashmap **shadow) { char *p; + int r; assert(property); assert(value); @@ -58,6 +89,10 @@ int sysctl_write(const char *property, const char *value) { log_debug("Setting '%s' to '%s'", p, value); + r = shadow_update(shadow, p, value); + if (r < 0) + return r; + return write_string_file(p, value, WRITE_STRING_FILE_VERIFY_ON_FAILURE | WRITE_STRING_FILE_DISABLE_BUFFER | WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL); } @@ -76,7 +111,7 @@ int sysctl_writef(const char *property, const char *format, ...) { return sysctl_write(property, v); } -int sysctl_write_ip_property(int af, const char *ifname, const char *property, const char *value) { +int sysctl_write_ip_property(int af, const char *ifname, const char *property, const char *value, Hashmap **shadow) { const char *p; assert(property); @@ -93,10 +128,10 @@ int sysctl_write_ip_property(int af, const char *ifname, const char *property, c } else p = strjoina("net/", af_to_ipv4_ipv6(af), "/", property); - return sysctl_write(p, value); + return sysctl_write_full(p, value, shadow); } -int sysctl_write_ip_neighbor_property(int af, const char *ifname, const char *property, const char *value) { +int sysctl_write_ip_neighbor_property(int af, const char *ifname, const char *property, const char *value, Hashmap **shadow) { const char *p; assert(property); @@ -113,7 +148,7 @@ int sysctl_write_ip_neighbor_property(int af, const char *ifname, const char *pr } else p = strjoina("net/", af_to_ipv4_ipv6(af), "/neigh/default/", property); - return sysctl_write(p, value); + return sysctl_write_full(p, value, shadow); } int sysctl_read(const char *property, char **ret) { diff --git a/src/basic/sysctl-util.h b/src/basic/sysctl-util.h index 2bf5491703..041292f693 100644 --- a/src/basic/sysctl-util.h +++ b/src/basic/sysctl-util.h @@ -10,27 +10,30 @@ char* sysctl_normalize(char *s); int sysctl_read(const char *property, char **value); -int sysctl_write(const char *property, const char *value); +int sysctl_write_full(const char *property, const char *value, Hashmap **shadow); int sysctl_writef(const char *property, const char *format, ...) _printf_(2, 3); +static inline int sysctl_write(const char *property, const char *value) { + return sysctl_write_full(property, value, NULL); +} int sysctl_read_ip_property(int af, const char *ifname, const char *property, char **ret); -int sysctl_write_ip_property(int af, const char *ifname, const char *property, const char *value); -static inline int sysctl_write_ip_property_boolean(int af, const char *ifname, const char *property, bool value) { - return sysctl_write_ip_property(af, ifname, property, one_zero(value)); +int sysctl_write_ip_property(int af, const char *ifname, const char *property, const char *value, Hashmap **shadow); +static inline int sysctl_write_ip_property_boolean(int af, const char *ifname, const char *property, bool value, Hashmap **shadow) { + return sysctl_write_ip_property(af, ifname, property, one_zero(value), shadow); } -int sysctl_write_ip_neighbor_property(int af, const char *ifname, const char *property, const char *value); -static inline int sysctl_write_ip_neighbor_property_uint32(int af, const char *ifname, const char *property, uint32_t value) { +int sysctl_write_ip_neighbor_property(int af, const char *ifname, const char *property, const char *value, Hashmap **shadow); +static inline int sysctl_write_ip_neighbor_property_uint32(int af, const char *ifname, const char *property, uint32_t value, Hashmap **shadow) { char buf[DECIMAL_STR_MAX(uint32_t)]; xsprintf(buf, "%u", value); - return sysctl_write_ip_neighbor_property(af, ifname, property, buf); + return sysctl_write_ip_neighbor_property(af, ifname, property, buf, shadow); } #define DEFINE_SYSCTL_WRITE_IP_PROPERTY(name, type, format) \ - static inline int sysctl_write_ip_property_##name(int af, const char *ifname, const char *property, type value) { \ + static inline int sysctl_write_ip_property_##name(int af, const char *ifname, const char *property, type value, Hashmap **shadow) { \ char buf[DECIMAL_STR_MAX(type)]; \ xsprintf(buf, format, value); \ - return sysctl_write_ip_property(af, ifname, property, buf); \ + return sysctl_write_ip_property(af, ifname, property, buf, shadow); \ } DEFINE_SYSCTL_WRITE_IP_PROPERTY(int, int, "%i"); diff --git a/src/network/bpf/sysctl_monitor/meson.build b/src/network/bpf/sysctl_monitor/meson.build new file mode 100644 index 0000000000..ac8e81e927 --- /dev/null +++ b/src/network/bpf/sysctl_monitor/meson.build @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +if conf.get('HAVE_VMLINUX_H') != 1 + subdir_done() +endif + +sysctl_monitor_bpf_o_unstripped = custom_target( + 'sysctl-monitor.bpf.unstripped.o', + input : 'sysctl-monitor.bpf.c', + output : 'sysctl-monitor.bpf.unstripped.o', + command : bpf_o_unstripped_cmd, + depends : vmlinux_h_dependency) + +sysctl_monitor_bpf_o = custom_target( + 'sysctl-monitor.bpf.o', + input : sysctl_monitor_bpf_o_unstripped, + output : 'sysctl-monitor.bpf.o', + command : bpf_o_cmd) + +sysctl_monitor_skel_h = custom_target( + 'sysctl-monitor.skel.h', + input : sysctl_monitor_bpf_o, + output : 'sysctl-monitor.skel.h', + command : skel_h_cmd, + capture : true) diff --git a/src/network/bpf/sysctl_monitor/sysctl-monitor-skel.h b/src/network/bpf/sysctl_monitor/sysctl-monitor-skel.h new file mode 100644 index 0000000000..d002414521 --- /dev/null +++ b/src/network/bpf/sysctl_monitor/sysctl-monitor-skel.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +#include "bpf-dlopen.h" + +/* libbpf is used via dlopen(), so rename symbols */ +#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton +#define bpf_object__load_skeleton sym_bpf_object__load_skeleton +#define bpf_object__open_skeleton sym_bpf_object__open_skeleton + +#include "bpf/sysctl_monitor/sysctl-monitor.skel.h" diff --git a/src/network/bpf/sysctl_monitor/sysctl-monitor.bpf.c b/src/network/bpf/sysctl_monitor/sysctl-monitor.bpf.c new file mode 100644 index 0000000000..ef154931ce --- /dev/null +++ b/src/network/bpf/sysctl_monitor/sysctl-monitor.bpf.c @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> + +#include "sysctl-write-event.h" + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} cgroup_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); +} written_sysctls SEC(".maps"); + +static bool my_streq(const char *s1, const char *s2, size_t l) { + for (size_t i = 0; i < l; i++) { + if (s1[i] != s2[i]) + return false; + if (s1[i] == 0) + return true; + } + return true; +} + +struct str { + char *s; + size_t l; +}; + +static long cut_last(u32 i, struct str *str) { + char *s; + + i = str->l - i - 1; + s = str->s + i; + + /* Sanity check for the preverifier */ + if (i >= str->l) + return 1; + + if (*s == 0) + return 0; + + if (*s == '\n' || *s == '\r' || *s == ' ' || *s == '\t') { + *s = 0; + + return 0; + } + + return 1; +} + +/* Cut off trailing whitespace and newlines */ +static void chop(char *s, size_t l) { + struct str str = { s, l }; + + bpf_loop(l, cut_last, &str, 0); +} + +SEC("cgroup/sysctl") +int sysctl_monitor(struct bpf_sysctl *ctx) { + int r; + + /* Ignore events generated by us */ + if (bpf_current_task_under_cgroup(&cgroup_map, 0)) + return 1; + + /* Allow reads */ + if (!ctx->write) + return 1; + + /* Declare the struct without contextually initializing it. + * This avoid zero-filling the struct, which would be a waste of + * resource and code size. Since we're sending an event even on failure, + * truncate the strings to zero size, in case we don't populate them. */ + struct sysctl_write_event we; + we.version = 1; + we.errorcode = 0; + we.path[0] = 0; + we.comm[0] = 0; + we.current[0] = 0; + we.newvalue[0] = 0; + + /* Set the simple values first */ + we.pid = bpf_get_current_pid_tgid() >> 32; + we.cgroup_id = bpf_get_current_cgroup_id(); + + /* Only monitor /proc/sys/net/ */ + r = bpf_sysctl_get_name(ctx, we.path, sizeof(we.path), 0); + if (r < 0) { + we.errorcode = r; + goto send_event; + } + + if (bpf_strncmp(we.path, 4, "net/") != 0) + return 1; + + r = bpf_get_current_comm(we.comm, sizeof(we.comm)); + if (r < 0) { + we.errorcode = r; + goto send_event; + } + + r = bpf_sysctl_get_current_value(ctx, we.current, sizeof(we.current)); + if (r < 0) { + we.errorcode = r; + goto send_event; + } + + r = bpf_sysctl_get_new_value(ctx, we.newvalue, sizeof(we.newvalue)); + if (r < 0) { + we.errorcode = r; + goto send_event; + } + + /* Both the kernel and userspace applications add a newline at the end, + * remove it from both strings */ + chop(we.current, sizeof(we.current)); + chop(we.newvalue, sizeof(we.newvalue)); + +send_event: + /* If new value differs or we encountered an error, send the event */ + if (r < 0 || !my_streq(we.current, we.newvalue, sizeof(we.current))) + bpf_ringbuf_output(&written_sysctls, &we, sizeof(we), 0); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/src/network/bpf/sysctl_monitor/sysctl-write-event.h b/src/network/bpf/sysctl_monitor/sysctl-write-event.h new file mode 100644 index 0000000000..77b71fb4f9 --- /dev/null +++ b/src/network/bpf/sysctl_monitor/sysctl-write-event.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#ifndef TASK_COMM_LEN +#define TASK_COMM_LEN 16 +#endif + +/* It would be nice to size these members to bigger values, but the stack + * in BPF programs is limited to 512 bytes, and allocating bigger structures + * leads to this compile time error: + * error: Looks like the BPF stack limit is exceeded. + * Please move large on stack variables into BPF per-cpu array map. + * For non-kernel uses, the stack can be increased using -mllvm -bpf-stack-size. */ +struct sysctl_write_event { + /* Used to track changes in the struct layout */ + int version; + + /* Error code returned to userspace to handle eventual failures. */ + int errorcode; + + /* The PID of the process which is writing the sysctl. */ + pid_t pid; + + /* The cgroup id of the process. */ + uint64_t cgroup_id; + + /* The name of the binary. */ + char comm[TASK_COMM_LEN]; + + /* The path of the sysctl, relative to /proc/sys/. + * The longest path observed is 64 bytes: + * net/ipv4/conf/123456789012345/igmpv3_unsolicited_report_interval + * so set it to 100 gives us lot of headroom */ + char path[100]; + + /* The value of the sysctl just before the write. + * The longest value observed is net.core.netdev_rss_key which + * contains 155 bytes, so set it to 160 to have some headroom + * even in this corner case. */ + char current[160]; + + /* The new value being written into the sysctl. + * same sizing as 'current' */ + char newvalue[160]; +}; diff --git a/src/network/meson.build b/src/network/meson.build index 54cf694aeb..73c48e06af 100644 --- a/src/network/meson.build +++ b/src/network/meson.build @@ -1,5 +1,7 @@ # SPDX-License-Identifier: LGPL-2.1-or-later +subdir('bpf/sysctl_monitor') + sources = files( 'netdev/bareudp.c', 'netdev/batadv.c', @@ -140,6 +142,10 @@ network_generator_sources = files( networkd_network_gperf_gperf = files('networkd-network-gperf.gperf') networkd_netdev_gperf_gperf = files('netdev/netdev-gperf.gperf') +if conf.get('HAVE_VMLINUX_H') == 1 + sources += sysctl_monitor_skel_h +endif + sources += custom_target( 'networkd-gperf.c', input : 'networkd-gperf.gperf', diff --git a/src/network/networkd-ipv6ll.c b/src/network/networkd-ipv6ll.c index cd23cc94aa..0daf3ad8ab 100644 --- a/src/network/networkd-ipv6ll.c +++ b/src/network/networkd-ipv6ll.c @@ -7,6 +7,7 @@ #include "networkd-address.h" #include "networkd-ipv6ll.h" #include "networkd-link.h" +#include "networkd-manager.h" #include "networkd-network.h" #include "networkd-util.h" #include "socket-util.h" @@ -189,6 +190,7 @@ int link_set_ipv6ll_stable_secret(Link *link) { int r; assert(link); + assert(link->manager); assert(link->network); if (link->network->ipv6ll_address_gen_mode != IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_STABLE_PRIVACY) @@ -219,17 +221,18 @@ int link_set_ipv6ll_stable_secret(Link *link) { } return sysctl_write_ip_property(AF_INET6, link->ifname, "stable_secret", - IN6_ADDR_TO_STRING(&a)); + IN6_ADDR_TO_STRING(&a), &link->manager->sysctl_shadow); } int link_set_ipv6ll_addrgen_mode(Link *link, IPv6LinkLocalAddressGenMode mode) { assert(link); + assert(link->manager); assert(mode >= 0 && mode < _IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_MAX); if (mode == link->ipv6ll_address_gen_mode) return 0; - return sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "addr_gen_mode", mode); + return sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "addr_gen_mode", mode, &link->manager->sysctl_shadow); } static const char* const ipv6_link_local_address_gen_mode_table[_IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_MAX] = { diff --git a/src/network/networkd-link.c b/src/network/networkd-link.c index 0eeab6e8b0..303007a9de 100644 --- a/src/network/networkd-link.c +++ b/src/network/networkd-link.c @@ -252,6 +252,8 @@ static void link_free_engines(Link *link) { static Link *link_free(Link *link) { assert(link); + (void) sysctl_clear_link_shadows(link); + link_ntp_settings_clear(link); link_dns_settings_clear(link); diff --git a/src/network/networkd-manager.c b/src/network/networkd-manager.c index 2c2956f465..6063834a20 100644 --- a/src/network/networkd-manager.c +++ b/src/network/networkd-manager.c @@ -16,6 +16,7 @@ #include "bus-log-control-api.h" #include "bus-polkit.h" #include "bus-util.h" +#include "capability-util.h" #include "common-signal.h" #include "conf-parser.h" #include "constants.h" @@ -603,6 +604,7 @@ int manager_new(Manager **ret, bool test_mode) { .duid_product_uuid.type = DUID_TYPE_UUID, .dhcp_server_persist_leases = true, .ip_forwarding = { -1, -1, }, + .cgroup_fd = -EBADF, }; *ret = TAKE_PTR(m); @@ -615,11 +617,15 @@ Manager* manager_free(Manager *m) { if (!m) return NULL; + sysctl_remove_monitor(m); + free(m->state_file); HASHMAP_FOREACH(link, m->links_by_index) (void) link_stop_engines(link, true); + hashmap_free(m->sysctl_shadow); + m->request_queue = ordered_set_free(m->request_queue); m->remove_request_queue = ordered_set_free(m->remove_request_queue); @@ -692,6 +698,18 @@ int manager_start(Manager *m) { assert(m); + (void) sysctl_add_monitor(m); + + /* Loading BPF programs requires CAP_SYS_ADMIN and CAP_BPF. + * Drop the capabilities here, regardless if the load succeeds or not. */ + r = drop_capability(CAP_SYS_ADMIN); + if (r < 0) + log_warning_errno(r, "Failed to drop CAP_SYS_ADMIN: %m, ignoring."); + + r = drop_capability(CAP_BPF); + if (r < 0) + log_warning_errno(r, "Failed to drop CAP_BPF: %m, ignoring."); + manager_set_sysctl(m); r = manager_request_static_address_labels(m); diff --git a/src/network/networkd-manager.h b/src/network/networkd-manager.h index a70b3e708f..5a0decced2 100644 --- a/src/network/networkd-manager.h +++ b/src/network/networkd-manager.h @@ -122,6 +122,12 @@ struct Manager { /* sysctl */ int ip_forwarding[2]; + Hashmap *sysctl_shadow; + sd_event_source *sysctl_event_source; + struct ring_buffer *sysctl_buffer; + struct sysctl_monitor_bpf *sysctl_skel; + struct bpf_link *sysctl_link; + int cgroup_fd; }; int manager_new(Manager **ret, bool test_mode); diff --git a/src/network/networkd-ndisc.c b/src/network/networkd-ndisc.c index f44f03365c..81835c06e5 100644 --- a/src/network/networkd-ndisc.c +++ b/src/network/networkd-ndisc.c @@ -965,6 +965,7 @@ static int ndisc_router_process_reachable_time(Link *link, sd_ndisc_router *rt) int r; assert(link); + assert(link->manager); assert(link->network); assert(rt); @@ -986,7 +987,7 @@ static int ndisc_router_process_reachable_time(Link *link, sd_ndisc_router *rt) } /* Set the reachable time for Neighbor Solicitations. */ - r = sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "base_reachable_time_ms", (uint32_t) msec); + r = sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "base_reachable_time_ms", (uint32_t) msec, &link->manager->sysctl_shadow); if (r < 0) log_link_warning_errno(link, r, "Failed to apply neighbor reachable time (%"PRIu64"), ignoring: %m", msec); @@ -998,6 +999,7 @@ static int ndisc_router_process_retransmission_time(Link *link, sd_ndisc_router int r; assert(link); + assert(link->manager); assert(link->network); assert(rt); @@ -1019,7 +1021,7 @@ static int ndisc_router_process_retransmission_time(Link *link, sd_ndisc_router } /* Set the retransmission time for Neighbor Solicitations. */ - r = sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "retrans_time_ms", (uint32_t) msec); + r = sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "retrans_time_ms", (uint32_t) msec, &link->manager->sysctl_shadow); if (r < 0) log_link_warning_errno(link, r, "Failed to apply neighbor retransmission time (%"PRIu64"), ignoring: %m", msec); @@ -1031,6 +1033,7 @@ static int ndisc_router_process_hop_limit(Link *link, sd_ndisc_router *rt) { int r; assert(link); + assert(link->manager); assert(link->network); assert(rt); @@ -1054,7 +1057,7 @@ static int ndisc_router_process_hop_limit(Link *link, sd_ndisc_router *rt) { if (hop_limit <= 0) return 0; - r = sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "hop_limit", (uint32_t) hop_limit); + r = sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "hop_limit", (uint32_t) hop_limit, &link->manager->sysctl_shadow); if (r < 0) log_link_warning_errno(link, r, "Failed to apply hop_limit (%u), ignoring: %m", hop_limit); diff --git a/src/network/networkd-sysctl.c b/src/network/networkd-sysctl.c index 2027a29f27..b85f0ca568 100644 --- a/src/network/networkd-sysctl.c +++ b/src/network/networkd-sysctl.c @@ -4,7 +4,11 @@ #include <linux/if.h> #include <linux/if_arp.h> +#include "sd-messages.h" + #include "af-list.h" +#include "cgroup-util.h" +#include "fd-util.h" #include "missing_network.h" #include "networkd-link.h" #include "networkd-lldp-tx.h" @@ -12,10 +16,197 @@ #include "networkd-ndisc.h" #include "networkd-network.h" #include "networkd-sysctl.h" +#include "path-util.h" #include "socket-util.h" #include "string-table.h" #include "sysctl-util.h" +#if HAVE_VMLINUX_H + +#include "bpf-link.h" + +#include "bpf/sysctl_monitor/sysctl-monitor-skel.h" +#include "bpf/sysctl_monitor/sysctl-write-event.h" + +static struct sysctl_monitor_bpf *sysctl_monitor_bpf_free(struct sysctl_monitor_bpf *obj) { + sysctl_monitor_bpf__destroy(obj); + return NULL; +} + +static struct ring_buffer *rb_free(struct ring_buffer *rb) { + sym_ring_buffer__free(rb); + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct sysctl_monitor_bpf *, sysctl_monitor_bpf_free); +DEFINE_TRIVIAL_CLEANUP_FUNC(struct ring_buffer *, rb_free); + +static int sysctl_event_handler(void *ctx, void *data, size_t data_sz) { + struct sysctl_write_event *we = ASSERT_PTR(data); + Hashmap **sysctl_shadow = ASSERT_PTR(ctx); + _cleanup_free_ char *path = NULL; + char *value; + + /* Returning a negative value interrupts the ring buffer polling, + * so do it only in case of a fatal error like a version mismatch. */ + if (we->version != 1) + return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), + "Unexpected sysctl event, disabling sysctl monitoring: %d", we->version); + + if (we->errorcode != 0) { + log_warning_errno(we->errorcode, "Sysctl monitor BPF returned error: %m"); + return 0; + } + + path = path_join("/proc/sys", we->path); + if (!path) { + log_oom(); + return 0; + } + + /* If we never managed this handle, ignore it. */ + value = hashmap_get(*sysctl_shadow, path); + if (!value) + return 0; + + if (!strneq(value, we->newvalue, sizeof(we->newvalue))) + log_struct(LOG_WARNING, + "MESSAGE_ID=" SD_MESSAGE_SYSCTL_CHANGED_STR, + "OBJECT_PID=%d", we->pid, + "OBJECT_COMM=%s", we->comm, + "SYSCTL=/proc/sys/%s", we->path, + "OLDVALUE=%s", we->current, + "NEWVALUE=%s", we->newvalue, + "OURVALUE=%s", value, + LOG_MESSAGE("Foreign process '%s[%d]' changed sysctl '/proc/sys/%s' from '%s' to '%s', conflicting with our setting to '%s'", + we->comm, we->pid, we->path, we->current, we->newvalue, value)); + + return 0; +} + +static int on_ringbuf_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) { + struct ring_buffer *rb = ASSERT_PTR(userdata); + int r; + + r = sym_ring_buffer__poll(rb, /* timeout_msec= */ 0); + if (r < 0 && errno != EINTR) + log_error_errno(errno, "Error polling ring buffer: %m"); + + return 0; +} + +int sysctl_add_monitor(Manager *manager) { + _cleanup_(sysctl_monitor_bpf_freep) struct sysctl_monitor_bpf *obj = NULL; + _cleanup_(bpf_link_freep) struct bpf_link *sysctl_link = NULL; + _cleanup_(rb_freep) struct ring_buffer *sysctl_buffer = NULL; + _cleanup_close_ int cgroup_fd = -EBADF, rootcg = -EBADF; + _cleanup_free_ char *cgroup = NULL; + int idx = 0, r; + + assert(manager); + + r = dlopen_bpf(); + if (r < 0) { + log_info_errno(r, "sysctl monitor disabled, as BPF support is not available."); + return 0; + } + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup); + if (r < 0) + return log_warning_errno(r, "Failed to get cgroup path, ignoring: %m."); + + rootcg = cg_path_open(SYSTEMD_CGROUP_CONTROLLER, "/"); + if (rootcg < 0) + return log_warning_errno(rootcg, "Failed to open cgroup, ignoring: %m."); + + obj = sysctl_monitor_bpf__open_and_load(); + if (!obj) { + log_info_errno(errno, "Unable to load sysctl monitor BPF program, ignoring: %m."); + return 0; + } + + cgroup_fd = cg_path_open(SYSTEMD_CGROUP_CONTROLLER, cgroup); + if (cgroup_fd < 0) + return log_warning_errno(cgroup_fd, "Failed to open cgroup: %m"); + + if (sym_bpf_map_update_elem(sym_bpf_map__fd(obj->maps.cgroup_map), &idx, &cgroup_fd, BPF_ANY)) + return log_warning_errno(errno, "Failed to update cgroup map: %m"); + + sysctl_link = sym_bpf_program__attach_cgroup(obj->progs.sysctl_monitor, rootcg); + r = bpf_get_error_translated(sysctl_link); + if (r < 0) { + log_info_errno(r, "Unable to attach sysctl monitor BPF program to cgroup, ignoring: %m."); + return 0; + } + + sysctl_buffer = sym_ring_buffer__new( + sym_bpf_map__fd(obj->maps.written_sysctls), + sysctl_event_handler, &manager->sysctl_shadow, NULL); + if (!sysctl_buffer) + return log_warning_errno(errno, "Failed to create ring buffer: %m"); + + r = sd_event_add_io(manager->event, &manager->sysctl_event_source, + sym_ring_buffer__epoll_fd(sysctl_buffer), EPOLLIN, on_ringbuf_io, sysctl_buffer); + if (r < 0) + return log_warning_errno(r, "Failed to watch sysctl event ringbuffer: %m"); + + manager->sysctl_link = TAKE_PTR(sysctl_link); + manager->sysctl_skel = TAKE_PTR(obj); + manager->sysctl_buffer = TAKE_PTR(sysctl_buffer); + manager->cgroup_fd = TAKE_FD(cgroup_fd); + + return 0; +} + +void sysctl_remove_monitor(Manager *manager) { + assert(manager); + + manager->sysctl_event_source = sd_event_source_disable_unref(manager->sysctl_event_source); + + if (manager->sysctl_buffer) { + sym_ring_buffer__free(manager->sysctl_buffer); + manager->sysctl_buffer = NULL; + } + + if (manager->sysctl_link) { + sym_bpf_link__destroy(manager->sysctl_link); + manager->sysctl_link = NULL; + } + + if (manager->sysctl_skel) { + sysctl_monitor_bpf__destroy(manager->sysctl_skel); + manager->sysctl_skel = NULL; + } + + manager->cgroup_fd = safe_close(manager->cgroup_fd); +} + +int sysctl_clear_link_shadows(Link *link) { + _cleanup_free_ char *ipv4 = NULL, *ipv6 = NULL; + char *key = NULL, *value = NULL; + + assert(link); + assert(link->manager); + + ipv4 = path_join("/proc/sys/net/ipv4/conf", link->ifname); + if (!ipv4) + return log_oom(); + + ipv6 = path_join("/proc/sys/net/ipv6/conf", link->ifname); + if (!ipv6) + return log_oom(); + + HASHMAP_FOREACH_KEY(value, key, link->manager->sysctl_shadow) + if (path_startswith(key, ipv4) || path_startswith(key, ipv6)) { + assert_se(hashmap_remove_value(link->manager->sysctl_shadow, key, value) == value); + free(key); + free(value); + } + + return 0; +} +#endif + static void manager_set_ip_forwarding(Manager *manager, int family) { int r, t; @@ -30,13 +221,13 @@ static void manager_set_ip_forwarding(Manager *manager, int family) { return; /* keep */ /* First, set the default value. */ - r = sysctl_write_ip_property_boolean(family, "default", "forwarding", t); + r = sysctl_write_ip_property_boolean(family, "default", "forwarding", t, &manager->sysctl_shadow); if (r < 0) log_warning_errno(r, "Failed to %s the default %s forwarding: %m", enable_disable(t), af_to_ipv4_ipv6(family)); /* Then, set the value to all interfaces. */ - r = sysctl_write_ip_property_boolean(family, "all", "forwarding", t); + r = sysctl_write_ip_property_boolean(family, "all", "forwarding", t, &manager->sysctl_shadow); if (r < 0) log_warning_errno(r, "Failed to %s %s forwarding for all interfaces: %m", enable_disable(t), af_to_ipv4_ipv6(family)); @@ -73,6 +264,7 @@ static bool link_is_configured_for_family(Link *link, int family) { static int link_update_ipv6_sysctl(Link *link) { assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET6)) return 0; @@ -80,11 +272,12 @@ static int link_update_ipv6_sysctl(Link *link) { if (!link_ipv6_enabled(link)) return 0; - return sysctl_write_ip_property_boolean(AF_INET6, link->ifname, "disable_ipv6", false); + return sysctl_write_ip_property_boolean(AF_INET6, link->ifname, "disable_ipv6", false, &link->manager->sysctl_shadow); } static int link_set_proxy_arp(Link *link) { assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET)) return 0; @@ -92,11 +285,12 @@ static int link_set_proxy_arp(Link *link) { if (link->network->proxy_arp < 0) return 0; - return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "proxy_arp", link->network->proxy_arp > 0); + return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "proxy_arp", link->network->proxy_arp > 0, &link->manager->sysctl_shadow); } static int link_set_proxy_arp_pvlan(Link *link) { assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET)) return 0; @@ -104,7 +298,7 @@ static int link_set_proxy_arp_pvlan(Link *link) { if (link->network->proxy_arp_pvlan < 0) return 0; - return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "proxy_arp_pvlan", link->network->proxy_arp_pvlan > 0); + return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "proxy_arp_pvlan", link->network->proxy_arp_pvlan > 0, &link->manager->sysctl_shadow); } int link_get_ip_forwarding(Link *link, int family) { @@ -136,6 +330,7 @@ static int link_set_ip_forwarding_impl(Link *link, int family) { int r, t; assert(link); + assert(link->manager); assert(IN_SET(family, AF_INET, AF_INET6)); if (!link_is_configured_for_family(link, family)) @@ -145,7 +340,7 @@ static int link_set_ip_forwarding_impl(Link *link, int family) { if (t < 0) return 0; /* keep */ - r = sysctl_write_ip_property_boolean(family, link->ifname, "forwarding", t); + r = sysctl_write_ip_property_boolean(family, link->ifname, "forwarding", t, &link->manager->sysctl_shadow); if (r < 0) return log_link_warning_errno(link, r, "Failed to %s %s forwarding, ignoring: %m", enable_disable(t), af_to_ipv4_ipv6(family)); @@ -214,6 +409,7 @@ static int link_set_ip_forwarding(Link *link, int family) { static int link_set_ipv4_rp_filter(Link *link) { assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET)) return 0; @@ -221,7 +417,7 @@ static int link_set_ipv4_rp_filter(Link *link) { if (link->network->ipv4_rp_filter < 0) return 0; - return sysctl_write_ip_property_int(AF_INET, link->ifname, "rp_filter", link->network->ipv4_rp_filter); + return sysctl_write_ip_property_int(AF_INET, link->ifname, "rp_filter", link->network->ipv4_rp_filter, &link->manager->sysctl_shadow); } static int link_set_ipv6_privacy_extensions(Link *link) { @@ -241,20 +437,22 @@ static int link_set_ipv6_privacy_extensions(Link *link) { if (val == IPV6_PRIVACY_EXTENSIONS_KERNEL) return 0; - return sysctl_write_ip_property_int(AF_INET6, link->ifname, "use_tempaddr", (int) val); + return sysctl_write_ip_property_int(AF_INET6, link->ifname, "use_tempaddr", (int) val, &link->manager->sysctl_shadow); } static int link_set_ipv6_accept_ra(Link *link) { assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET6)) return 0; - return sysctl_write_ip_property(AF_INET6, link->ifname, "accept_ra", "0"); + return sysctl_write_ip_property(AF_INET6, link->ifname, "accept_ra", "0", &link->manager->sysctl_shadow); } static int link_set_ipv6_dad_transmits(Link *link) { assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET6)) return 0; @@ -262,11 +460,12 @@ static int link_set_ipv6_dad_transmits(Link *link) { if (link->network->ipv6_dad_transmits < 0) return 0; - return sysctl_write_ip_property_int(AF_INET6, link->ifname, "dad_transmits", link->network->ipv6_dad_transmits); + return sysctl_write_ip_property_int(AF_INET6, link->ifname, "dad_transmits", link->network->ipv6_dad_transmits, &link->manager->sysctl_shadow); } static int link_set_ipv6_hop_limit(Link *link) { assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET6)) return 0; @@ -274,13 +473,14 @@ static int link_set_ipv6_hop_limit(Link *link) { if (link->network->ipv6_hop_limit <= 0) return 0; - return sysctl_write_ip_property_int(AF_INET6, link->ifname, "hop_limit", link->network->ipv6_hop_limit); + return sysctl_write_ip_property_int(AF_INET6, link->ifname, "hop_limit", link->network->ipv6_hop_limit, &link->manager->sysctl_shadow); } static int link_set_ipv6_retransmission_time(Link *link) { usec_t retrans_time_ms; assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET6)) return 0; @@ -292,13 +492,14 @@ static int link_set_ipv6_retransmission_time(Link *link) { if (retrans_time_ms <= 0 || retrans_time_ms > UINT32_MAX) return 0; - return sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "retrans_time_ms", retrans_time_ms); + return sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "retrans_time_ms", retrans_time_ms, &link->manager->sysctl_shadow); } static int link_set_ipv6_proxy_ndp(Link *link) { bool v; assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET6)) return 0; @@ -308,13 +509,14 @@ static int link_set_ipv6_proxy_ndp(Link *link) { else v = !set_isempty(link->network->ipv6_proxy_ndp_addresses); - return sysctl_write_ip_property_boolean(AF_INET6, link->ifname, "proxy_ndp", v); + return sysctl_write_ip_property_boolean(AF_INET6, link->ifname, "proxy_ndp", v, &link->manager->sysctl_shadow); } int link_set_ipv6_mtu(Link *link, int log_level) { uint32_t mtu = 0; assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET6)) return 0; @@ -335,11 +537,12 @@ int link_set_ipv6_mtu(Link *link, int log_level) { mtu = link->mtu; } - return sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "mtu", mtu); + return sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "mtu", mtu, &link->manager->sysctl_shadow); } static int link_set_ipv4_accept_local(Link *link) { assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET)) return 0; @@ -347,11 +550,12 @@ static int link_set_ipv4_accept_local(Link *link) { if (link->network->ipv4_accept_local < 0) return 0; - return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "accept_local", link->network->ipv4_accept_local > 0); + return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "accept_local", link->network->ipv4_accept_local > 0, &link->manager->sysctl_shadow); } static int link_set_ipv4_route_localnet(Link *link) { assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET)) return 0; @@ -359,11 +563,12 @@ static int link_set_ipv4_route_localnet(Link *link) { if (link->network->ipv4_route_localnet < 0) return 0; - return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "route_localnet", link->network->ipv4_route_localnet > 0); + return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "route_localnet", link->network->ipv4_route_localnet > 0, &link->manager->sysctl_shadow); } static int link_set_ipv4_promote_secondaries(Link *link) { assert(link); + assert(link->manager); if (!link_is_configured_for_family(link, AF_INET)) return 0; @@ -373,7 +578,7 @@ static int link_set_ipv4_promote_secondaries(Link *link) { * otherwise. The way systemd-networkd works is that the new IP of a lease is added as a * secondary IP and when the primary one expires it relies on the kernel to promote the * secondary IP. See also https://github.com/systemd/systemd/issues/7163 */ - return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "promote_secondaries", true); + return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "promote_secondaries", true, &link->manager->sysctl_shadow); } int link_set_sysctl(Link *link) { diff --git a/src/network/networkd-sysctl.h b/src/network/networkd-sysctl.h index d7a9b1f320..446b835555 100644 --- a/src/network/networkd-sysctl.h +++ b/src/network/networkd-sysctl.h @@ -27,6 +27,16 @@ typedef enum IPReversePathFilter { _IP_REVERSE_PATH_FILTER_INVALID = -EINVAL, } IPReversePathFilter; +#if HAVE_VMLINUX_H +int sysctl_add_monitor(Manager *manager); +void sysctl_remove_monitor(Manager *manager); +int sysctl_clear_link_shadows(Link *link); +#else +static inline int sysctl_add_monitor(Manager *manager) { return 0; } +static inline void sysctl_remove_monitor(Manager *manager) { } +static inline int sysctl_clear_link_shadows(Link *link) { return 0; } +#endif + void manager_set_sysctl(Manager *manager); int link_get_ip_forwarding(Link *link, int family); diff --git a/src/network/networkd.c b/src/network/networkd.c index 69a28647c8..2798cd8cf8 100644 --- a/src/network/networkd.c +++ b/src/network/networkd.c @@ -62,7 +62,9 @@ static int run(int argc, char *argv[]) { (1ULL << CAP_NET_ADMIN) | (1ULL << CAP_NET_BIND_SERVICE) | (1ULL << CAP_NET_BROADCAST) | - (1ULL << CAP_NET_RAW)); + (1ULL << CAP_NET_RAW) | + (1ULL << CAP_SYS_ADMIN) | + (1ULL << CAP_BPF)); if (r < 0) return log_error_errno(r, "Failed to drop privileges: %m"); } diff --git a/src/systemd/sd-messages.h b/src/systemd/sd-messages.h index f4f4e95b7f..441f4e6888 100644 --- a/src/systemd/sd-messages.h +++ b/src/systemd/sd-messages.h @@ -277,6 +277,9 @@ _SD_BEGIN_DECLARATIONS; #define SD_MESSAGE_SRK_ENROLLMENT_NEEDS_AUTHORIZATION SD_ID128_MAKE(ad,70,89,f9,28,ac,4f,7e,a0,0c,07,45,7d,47,ba,8a) #define SD_MESSAGE_SRK_ENROLLMENT_NEEDS_AUTHORIZATION_STR SD_ID128_MAKE_STR(ad,70,89,f9,28,ac,4f,7e,a0,0c,07,45,7d,47,ba,8a) +#define SD_MESSAGE_SYSCTL_CHANGED SD_ID128_MAKE(9c,f5,6b,8b,af,95,46,cf,94,78,78,3a,8d,e4,21,13) +#define SD_MESSAGE_SYSCTL_CHANGED_STR SD_ID128_MAKE_STR(9c,f5,6b,8b,af,95,46,cf,94,78,78,3a,8d,e4,21,13) + _SD_END_DECLARATIONS; #endif diff --git a/src/test/test-sysctl-util.c b/src/test/test-sysctl-util.c index e94099605c..83d6c9036c 100644 --- a/src/test/test-sysctl-util.c +++ b/src/test/test-sysctl-util.c @@ -53,14 +53,14 @@ TEST(sysctl_read) { assert_se(sysctl_read_ip_property(AF_INET, "lo", "forwarding", &s)); assert_se(STR_IN_SET(s, "0", "1")); - r = sysctl_write_ip_property(AF_INET, "lo", "forwarding", s); + r = sysctl_write_ip_property(AF_INET, "lo", "forwarding", s, NULL); assert_se(r >= 0 || ERRNO_IS_PRIVILEGE(r) || r == -EROFS); s = mfree(s); assert_se(sysctl_read_ip_property(AF_INET, NULL, "ip_forward", &s)); assert_se(STR_IN_SET(s, "0", "1")); - r = sysctl_write_ip_property(AF_INET, NULL, "ip_forward", s); + r = sysctl_write_ip_property(AF_INET, NULL, "ip_forward", s, NULL); assert_se(r >= 0 || ERRNO_IS_PRIVILEGE(r) || r == -EROFS); s = mfree(s); |