summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2024-09-12 17:28:59 +0200
committerGitHub <noreply@github.com>2024-09-12 17:28:59 +0200
commit5892950ba47b69ca5447e4969a0b8159f3555397 (patch)
tree6c8c1b456d729f8d2e570a3b0a6c467e12d4bd33 /src
parentupdate TODO (diff)
parenttest-network: add test for sysctl watch (diff)
downloadsystemd-5892950ba47b69ca5447e4969a0b8159f3555397.tar.xz
systemd-5892950ba47b69ca5447e4969a0b8159f3555397.zip
Merge pull request #32212 from teknoraver/networkd-sysctl
More visibility into systemd-networkd sysctls
Diffstat (limited to 'src')
-rw-r--r--src/basic/sysctl-util.c45
-rw-r--r--src/basic/sysctl-util.h21
-rw-r--r--src/network/bpf/sysctl_monitor/meson.build25
-rw-r--r--src/network/bpf/sysctl_monitor/sysctl-monitor-skel.h16
-rw-r--r--src/network/bpf/sysctl_monitor/sysctl-monitor.bpf.c134
-rw-r--r--src/network/bpf/sysctl_monitor/sysctl-write-event.h46
-rw-r--r--src/network/meson.build6
-rw-r--r--src/network/networkd-ipv6ll.c7
-rw-r--r--src/network/networkd-link.c2
-rw-r--r--src/network/networkd-manager.c18
-rw-r--r--src/network/networkd-manager.h6
-rw-r--r--src/network/networkd-ndisc.c9
-rw-r--r--src/network/networkd-sysctl.c239
-rw-r--r--src/network/networkd-sysctl.h10
-rw-r--r--src/network/networkd.c4
-rw-r--r--src/systemd/sd-messages.h3
-rw-r--r--src/test/test-sysctl-util.c4
17 files changed, 556 insertions, 39 deletions
diff --git a/src/basic/sysctl-util.c b/src/basic/sysctl-util.c
index b284c9ccd2..dfb99e1896 100644
--- a/src/basic/sysctl-util.c
+++ b/src/basic/sysctl-util.c
@@ -44,8 +44,39 @@ char* sysctl_normalize(char *s) {
return s;
}
-int sysctl_write(const char *property, const char *value) {
+/* Remembers 'value' as the value recorded for 'property' in the shadow map, dropping whatever entry
+ * was stored under the same key before. A NULL 'shadow' means the caller keeps no shadow map, and
+ * nothing is done. Returns 0 on success, -ENOMEM if a copy cannot be allocated, or the negative
+ * value returned by hashmap_ensure_put(). */
+static int shadow_update(Hashmap **shadow, const char *property, const char *value) {
+        _cleanup_free_ char *new_key = NULL, *new_value = NULL, *old_key = NULL, *old_value = NULL;
+        int r;
+
+        assert(property);
+        assert(value);
+
+        if (!shadow)
+                return 0;
+
+        new_key = strdup(property);
+        if (!new_key)
+                return -ENOMEM;
+
+        new_value = strdup(value);
+        if (!new_value)
+                return -ENOMEM;
+
+        /* Pull out any previous entry first; its key and value are released by the cleanup
+         * handlers when this function returns. */
+        old_value = hashmap_remove2(*shadow, new_key, (void**)&old_key);
+
+        r = hashmap_ensure_put(shadow, &path_hash_ops_free_free, new_key, new_value);
+        if (r < 0)
+                return r;
+
+        /* The strings are referenced by the map from here on, so keep the cleanup handlers
+         * away from them. */
+        TAKE_PTR(new_key);
+        TAKE_PTR(new_value);
+
+        return 0;
+}
+
+int sysctl_write_full(const char *property, const char *value, Hashmap **shadow) {
char *p;
+ int r;
assert(property);
assert(value);
@@ -58,6 +89,10 @@ int sysctl_write(const char *property, const char *value) {
log_debug("Setting '%s' to '%s'", p, value);
+ r = shadow_update(shadow, p, value);
+ if (r < 0)
+ return r;
+
return write_string_file(p, value, WRITE_STRING_FILE_VERIFY_ON_FAILURE | WRITE_STRING_FILE_DISABLE_BUFFER | WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL);
}
@@ -76,7 +111,7 @@ int sysctl_writef(const char *property, const char *format, ...) {
return sysctl_write(property, v);
}
-int sysctl_write_ip_property(int af, const char *ifname, const char *property, const char *value) {
+int sysctl_write_ip_property(int af, const char *ifname, const char *property, const char *value, Hashmap **shadow) {
const char *p;
assert(property);
@@ -93,10 +128,10 @@ int sysctl_write_ip_property(int af, const char *ifname, const char *property, c
} else
p = strjoina("net/", af_to_ipv4_ipv6(af), "/", property);
- return sysctl_write(p, value);
+ return sysctl_write_full(p, value, shadow);
}
-int sysctl_write_ip_neighbor_property(int af, const char *ifname, const char *property, const char *value) {
+int sysctl_write_ip_neighbor_property(int af, const char *ifname, const char *property, const char *value, Hashmap **shadow) {
const char *p;
assert(property);
@@ -113,7 +148,7 @@ int sysctl_write_ip_neighbor_property(int af, const char *ifname, const char *pr
} else
p = strjoina("net/", af_to_ipv4_ipv6(af), "/neigh/default/", property);
- return sysctl_write(p, value);
+ return sysctl_write_full(p, value, shadow);
}
int sysctl_read(const char *property, char **ret) {
diff --git a/src/basic/sysctl-util.h b/src/basic/sysctl-util.h
index 2bf5491703..041292f693 100644
--- a/src/basic/sysctl-util.h
+++ b/src/basic/sysctl-util.h
@@ -10,27 +10,30 @@
char* sysctl_normalize(char *s);
int sysctl_read(const char *property, char **value);
-int sysctl_write(const char *property, const char *value);
+int sysctl_write_full(const char *property, const char *value, Hashmap **shadow);
int sysctl_writef(const char *property, const char *format, ...) _printf_(2, 3);
+/* Convenience wrapper around sysctl_write_full() that passes NULL as the shadow map. */
+static inline int sysctl_write(const char *property, const char *value) {
+ return sysctl_write_full(property, value, NULL);
+}
int sysctl_read_ip_property(int af, const char *ifname, const char *property, char **ret);
-int sysctl_write_ip_property(int af, const char *ifname, const char *property, const char *value);
-static inline int sysctl_write_ip_property_boolean(int af, const char *ifname, const char *property, bool value) {
- return sysctl_write_ip_property(af, ifname, property, one_zero(value));
+int sysctl_write_ip_property(int af, const char *ifname, const char *property, const char *value, Hashmap **shadow);
+static inline int sysctl_write_ip_property_boolean(int af, const char *ifname, const char *property, bool value, Hashmap **shadow) {
+ return sysctl_write_ip_property(af, ifname, property, one_zero(value), shadow);
}
-int sysctl_write_ip_neighbor_property(int af, const char *ifname, const char *property, const char *value);
-static inline int sysctl_write_ip_neighbor_property_uint32(int af, const char *ifname, const char *property, uint32_t value) {
+int sysctl_write_ip_neighbor_property(int af, const char *ifname, const char *property, const char *value, Hashmap **shadow);
+static inline int sysctl_write_ip_neighbor_property_uint32(int af, const char *ifname, const char *property, uint32_t value, Hashmap **shadow) {
char buf[DECIMAL_STR_MAX(uint32_t)];
xsprintf(buf, "%u", value);
- return sysctl_write_ip_neighbor_property(af, ifname, property, buf);
+ return sysctl_write_ip_neighbor_property(af, ifname, property, buf, shadow);
}
#define DEFINE_SYSCTL_WRITE_IP_PROPERTY(name, type, format) \
- static inline int sysctl_write_ip_property_##name(int af, const char *ifname, const char *property, type value) { \
+ static inline int sysctl_write_ip_property_##name(int af, const char *ifname, const char *property, type value, Hashmap **shadow) { \
char buf[DECIMAL_STR_MAX(type)]; \
xsprintf(buf, format, value); \
- return sysctl_write_ip_property(af, ifname, property, buf); \
+ return sysctl_write_ip_property(af, ifname, property, buf, shadow); \
}
DEFINE_SYSCTL_WRITE_IP_PROPERTY(int, int, "%i");
diff --git a/src/network/bpf/sysctl_monitor/meson.build b/src/network/bpf/sysctl_monitor/meson.build
new file mode 100644
index 0000000000..ac8e81e927
--- /dev/null
+++ b/src/network/bpf/sysctl_monitor/meson.build
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+# Nothing in this directory is built unless HAVE_VMLINUX_H is set to 1.
+if conf.get('HAVE_VMLINUX_H') != 1
+ subdir_done()
+endif
+
+# Step 1: build sysctl-monitor.bpf.c into an unstripped object file. The
+# target depends on vmlinux_h_dependency.
+sysctl_monitor_bpf_o_unstripped = custom_target(
+ 'sysctl-monitor.bpf.unstripped.o',
+ input : 'sysctl-monitor.bpf.c',
+ output : 'sysctl-monitor.bpf.unstripped.o',
+ command : bpf_o_unstripped_cmd,
+ depends : vmlinux_h_dependency)
+
+# Step 2: run bpf_o_cmd on the unstripped object to produce
+# sysctl-monitor.bpf.o (presumably the stripped variant, going by the names).
+sysctl_monitor_bpf_o = custom_target(
+ 'sysctl-monitor.bpf.o',
+ input : sysctl_monitor_bpf_o_unstripped,
+ output : 'sysctl-monitor.bpf.o',
+ command : bpf_o_cmd)
+
+# Step 3: generate sysctl-monitor.skel.h from the object. With capture : true
+# the command's standard output is what gets written to the output file.
+sysctl_monitor_skel_h = custom_target(
+ 'sysctl-monitor.skel.h',
+ input : sysctl_monitor_bpf_o,
+ output : 'sysctl-monitor.skel.h',
+ command : skel_h_cmd,
+ capture : true)
diff --git a/src/network/bpf/sysctl_monitor/sysctl-monitor-skel.h b/src/network/bpf/sysctl_monitor/sysctl-monitor-skel.h
new file mode 100644
index 0000000000..d002414521
--- /dev/null
+++ b/src/network/bpf/sysctl_monitor/sysctl-monitor-skel.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include "bpf-dlopen.h"
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+
+#include "bpf/sysctl_monitor/sysctl-monitor.skel.h"
diff --git a/src/network/bpf/sysctl_monitor/sysctl-monitor.bpf.c b/src/network/bpf/sysctl_monitor/sysctl-monitor.bpf.c
new file mode 100644
index 0000000000..ef154931ce
--- /dev/null
+++ b/src/network/bpf/sysctl_monitor/sysctl-monitor.bpf.c
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+
+#include "sysctl-write-event.h"
+
+/* Single-slot cgroup array. sysctl_monitor() passes it to bpf_current_task_under_cgroup() with
+ * index 0 to skip accesses made by tasks under the cgroup stored there. The BPF program itself
+ * never writes to this map. */
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
+ __type(key, u32);
+ __type(value, u32);
+ __uint(max_entries, 1);
+} cgroup_map SEC(".maps");
+
+/* Ring buffer map (max_entries set to 256 * 1024) into which sysctl_monitor() outputs one
+ * struct sysctl_write_event per reported write. */
+struct {
+ __uint(type, BPF_MAP_TYPE_RINGBUF);
+ __uint(max_entries, 256 * 1024);
+} written_sysctls SEC(".maps");
+
+/* Bounded string equality: compares at most l bytes of s1 and s2. Returns false at the first
+ * position where the bytes differ; returns true when a NUL is reached at the same position in
+ * both strings, or when all l bytes are equal. */
+static bool my_streq(const char *s1, const char *s2, size_t l) {
+ for (size_t i = 0; i < l; i++) {
+ if (s1[i] != s2[i])
+ return false;
+ /* Bytes are equal here, so a NUL in s1 is also the end of s2. */
+ if (s1[i] == 0)
+ return true;
+ }
+ return true;
+}
+
+/* A buffer pointer (s) together with its size in bytes (l); chop() hands this to cut_last() as
+ * the bpf_loop() callback context. */
+struct str {
+ char *s;
+ size_t l;
+};
+
+/* bpf_loop() callback used by chop(). 'i' is the iteration index; the byte inspected is the one at
+ * offset l - i - 1, so successive iterations walk the buffer from its last byte towards the first.
+ * Returns 0 (keep looping) when the byte is NUL or is a newline, carriage return, space or tab
+ * (which is overwritten with NUL), and 1 (stop) for any other byte or an out-of-range offset. */
+static long cut_last(u32 i, struct str *str) {
+ char *s;
+
+ /* Turn the iteration index into an offset counted from the end of the buffer. */
+ i = str->l - i - 1;
+ s = str->s + i;
+
+ /* Sanity check for the preverifier */
+ if (i >= str->l)
+ return 1;
+
+ /* Unused tail of the buffer: nothing to cut yet, keep going backwards. */
+ if (*s == 0)
+ return 0;
+
+ if (*s == '\n' || *s == '\r' || *s == ' ' || *s == '\t') {
+ *s = 0;
+
+ return 0;
+ }
+
+ /* First byte that is neither NUL nor whitespace: the string is trimmed, stop the loop. */
+ return 1;
+}
+
+/* Cut off trailing whitespace and newlines from the buffer s of size l, in place. The work is done
+ * by running cut_last() through bpf_loop() for up to l iterations, starting at the last byte of
+ * the buffer. */
+static void chop(char *s, size_t l) {
+ struct str str = { s, l };
+
+ bpf_loop(l, cut_last, &str, 0);
+}
+
+/* cgroup/sysctl program that reports writes to sysctls below "net/" through the written_sysctls
+ * ring buffer. Accesses by tasks under the cgroup stored in cgroup_map, reads, and sysctls outside
+ * "net/" are not reported. An event is sent when the trimmed new value differs from the trimmed
+ * current value, or when one of the helpers failed (errorcode is then set in the event).
+ * Every path returns 1, so the program only observes and never rejects an access. */
+SEC("cgroup/sysctl")
+int sysctl_monitor(struct bpf_sysctl *ctx) {
+ int r;
+
+ /* Ignore events generated by us */
+ if (bpf_current_task_under_cgroup(&cgroup_map, 0))
+ return 1;
+
+ /* Allow reads */
+ if (!ctx->write)
+ return 1;
+
+ /* Declare the struct without initializing it at declaration time.
+ * This avoids zero-filling the struct, which would be a waste of
+ * resources and code size. Since we're sending an event even on failure,
+ * truncate the strings to zero size, in case we don't populate them. */
+ struct sysctl_write_event we;
+ we.version = 1;
+ we.errorcode = 0;
+ we.path[0] = 0;
+ we.comm[0] = 0;
+ we.current[0] = 0;
+ we.newvalue[0] = 0;
+
+ /* Set the simple values first */
+ we.pid = bpf_get_current_pid_tgid() >> 32;
+ we.cgroup_id = bpf_get_current_cgroup_id();
+
+ /* Only monitor /proc/sys/net/ */
+ r = bpf_sysctl_get_name(ctx, we.path, sizeof(we.path), 0);
+ if (r < 0) {
+ we.errorcode = r;
+ goto send_event;
+ }
+
+ /* Anything not starting with "net/" is dropped silently, without sending an event. */
+ if (bpf_strncmp(we.path, 4, "net/") != 0)
+ return 1;
+
+ r = bpf_get_current_comm(we.comm, sizeof(we.comm));
+ if (r < 0) {
+ we.errorcode = r;
+ goto send_event;
+ }
+
+ r = bpf_sysctl_get_current_value(ctx, we.current, sizeof(we.current));
+ if (r < 0) {
+ we.errorcode = r;
+ goto send_event;
+ }
+
+ r = bpf_sysctl_get_new_value(ctx, we.newvalue, sizeof(we.newvalue));
+ if (r < 0) {
+ we.errorcode = r;
+ goto send_event;
+ }
+
+ /* Both the kernel and userspace applications add a newline at the end,
+ * remove it from both strings */
+ chop(we.current, sizeof(we.current));
+ chop(we.newvalue, sizeof(we.newvalue));
+
+send_event:
+ /* If new value differs or we encountered an error, send the event */
+ if (r < 0 || !my_streq(we.current, we.newvalue, sizeof(we.current)))
+ bpf_ringbuf_output(&written_sysctls, &we, sizeof(we), 0);
+
+ return 1;
+}
+
+/* License string placed in the "license" section of the BPF object. */
+char _license[] SEC("license") = "GPL";
diff --git a/src/network/bpf/sysctl_monitor/sysctl-write-event.h b/src/network/bpf/sysctl_monitor/sysctl-write-event.h
new file mode 100644
index 0000000000..77b71fb4f9
--- /dev/null
+++ b/src/network/bpf/sysctl_monitor/sysctl-write-event.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#ifndef TASK_COMM_LEN
+#define TASK_COMM_LEN 16
+#endif
+
+/* It would be nice to size these members to bigger values, but the stack
+ * in BPF programs is limited to 512 bytes, and allocating bigger structures
+ * leads to this compile time error:
+ * error: Looks like the BPF stack limit is exceeded.
+ * Please move large on stack variables into BPF per-cpu array map.
+ * For non-kernel uses, the stack can be increased using -mllvm -bpf-stack-size. */
+/* Record describing one sysctl write. All strings are fixed-size character arrays. */
+struct sysctl_write_event {
+ /* Used to track changes in the struct layout */
+ int version;
+
+ /* Error code returned to userspace to handle eventual failures. */
+ int errorcode;
+
+ /* The PID of the process which is writing the sysctl. */
+ pid_t pid;
+
+ /* The cgroup id of the process. */
+ uint64_t cgroup_id;
+
+ /* The name of the binary. */
+ char comm[TASK_COMM_LEN];
+
+ /* The path of the sysctl, relative to /proc/sys/.
+ * The longest path observed is 64 bytes:
+ * net/ipv4/conf/123456789012345/igmpv3_unsolicited_report_interval
+ * so setting it to 100 gives us a lot of headroom. */
+ char path[100];
+
+ /* The value of the sysctl just before the write.
+ * The longest value observed is net.core.netdev_rss_key which
+ * contains 155 bytes, so set it to 160 to have some headroom
+ * even in this corner case. */
+ char current[160];
+
+ /* The new value being written into the sysctl.
+ * Same sizing as 'current'. */
+ char newvalue[160];
+};
diff --git a/src/network/meson.build b/src/network/meson.build
index 54cf694aeb..73c48e06af 100644
--- a/src/network/meson.build
+++ b/src/network/meson.build
@@ -1,5 +1,7 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
+subdir('bpf/sysctl_monitor')
+
sources = files(
'netdev/bareudp.c',
'netdev/batadv.c',
@@ -140,6 +142,10 @@ network_generator_sources = files(
networkd_network_gperf_gperf = files('networkd-network-gperf.gperf')
networkd_netdev_gperf_gperf = files('netdev/netdev-gperf.gperf')
+if conf.get('HAVE_VMLINUX_H') == 1
+ sources += sysctl_monitor_skel_h
+endif
+
sources += custom_target(
'networkd-gperf.c',
input : 'networkd-gperf.gperf',
diff --git a/src/network/networkd-ipv6ll.c b/src/network/networkd-ipv6ll.c
index cd23cc94aa..0daf3ad8ab 100644
--- a/src/network/networkd-ipv6ll.c
+++ b/src/network/networkd-ipv6ll.c
@@ -7,6 +7,7 @@
#include "networkd-address.h"
#include "networkd-ipv6ll.h"
#include "networkd-link.h"
+#include "networkd-manager.h"
#include "networkd-network.h"
#include "networkd-util.h"
#include "socket-util.h"
@@ -189,6 +190,7 @@ int link_set_ipv6ll_stable_secret(Link *link) {
int r;
assert(link);
+ assert(link->manager);
assert(link->network);
if (link->network->ipv6ll_address_gen_mode != IPV6_LINK_LOCAL_ADDRESSS_GEN_MODE_STABLE_PRIVACY)
@@ -219,17 +221,18 @@ int link_set_ipv6ll_stable_secret(Link *link) {
}
return sysctl_write_ip_property(AF_INET6, link->ifname, "stable_secret",
- IN6_ADDR_TO_STRING(&a));
+ IN6_ADDR_TO_STRING(&a), &link->manager->sysctl_shadow);
}
int link_set_ipv6ll_addrgen_mode(Link *link, IPv6LinkLocalAddressGenMode mode) {
assert(link);
+ assert(link->manager);
assert(mode >= 0 && mode < _IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_MAX);
if (mode == link->ipv6ll_address_gen_mode)
return 0;
- return sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "addr_gen_mode", mode);
+ return sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "addr_gen_mode", mode, &link->manager->sysctl_shadow);
}
static const char* const ipv6_link_local_address_gen_mode_table[_IPV6_LINK_LOCAL_ADDRESS_GEN_MODE_MAX] = {
diff --git a/src/network/networkd-link.c b/src/network/networkd-link.c
index 0eeab6e8b0..303007a9de 100644
--- a/src/network/networkd-link.c
+++ b/src/network/networkd-link.c
@@ -252,6 +252,8 @@ static void link_free_engines(Link *link) {
static Link *link_free(Link *link) {
assert(link);
+ (void) sysctl_clear_link_shadows(link);
+
link_ntp_settings_clear(link);
link_dns_settings_clear(link);
diff --git a/src/network/networkd-manager.c b/src/network/networkd-manager.c
index 2c2956f465..6063834a20 100644
--- a/src/network/networkd-manager.c
+++ b/src/network/networkd-manager.c
@@ -16,6 +16,7 @@
#include "bus-log-control-api.h"
#include "bus-polkit.h"
#include "bus-util.h"
+#include "capability-util.h"
#include "common-signal.h"
#include "conf-parser.h"
#include "constants.h"
@@ -603,6 +604,7 @@ int manager_new(Manager **ret, bool test_mode) {
.duid_product_uuid.type = DUID_TYPE_UUID,
.dhcp_server_persist_leases = true,
.ip_forwarding = { -1, -1, },
+ .cgroup_fd = -EBADF,
};
*ret = TAKE_PTR(m);
@@ -615,11 +617,15 @@ Manager* manager_free(Manager *m) {
if (!m)
return NULL;
+ sysctl_remove_monitor(m);
+
free(m->state_file);
HASHMAP_FOREACH(link, m->links_by_index)
(void) link_stop_engines(link, true);
+ hashmap_free(m->sysctl_shadow);
+
m->request_queue = ordered_set_free(m->request_queue);
m->remove_request_queue = ordered_set_free(m->remove_request_queue);
@@ -692,6 +698,18 @@ int manager_start(Manager *m) {
assert(m);
+ (void) sysctl_add_monitor(m);
+
+ /* Loading BPF programs requires CAP_SYS_ADMIN and CAP_BPF.
+ * Drop the capabilities here, regardless if the load succeeds or not. */
+ r = drop_capability(CAP_SYS_ADMIN);
+ if (r < 0)
+ log_warning_errno(r, "Failed to drop CAP_SYS_ADMIN: %m, ignoring.");
+
+ r = drop_capability(CAP_BPF);
+ if (r < 0)
+ log_warning_errno(r, "Failed to drop CAP_BPF: %m, ignoring.");
+
manager_set_sysctl(m);
r = manager_request_static_address_labels(m);
diff --git a/src/network/networkd-manager.h b/src/network/networkd-manager.h
index a70b3e708f..5a0decced2 100644
--- a/src/network/networkd-manager.h
+++ b/src/network/networkd-manager.h
@@ -122,6 +122,12 @@ struct Manager {
/* sysctl */
int ip_forwarding[2];
+ Hashmap *sysctl_shadow;
+ sd_event_source *sysctl_event_source;
+ struct ring_buffer *sysctl_buffer;
+ struct sysctl_monitor_bpf *sysctl_skel;
+ struct bpf_link *sysctl_link;
+ int cgroup_fd;
};
int manager_new(Manager **ret, bool test_mode);
diff --git a/src/network/networkd-ndisc.c b/src/network/networkd-ndisc.c
index f44f03365c..81835c06e5 100644
--- a/src/network/networkd-ndisc.c
+++ b/src/network/networkd-ndisc.c
@@ -965,6 +965,7 @@ static int ndisc_router_process_reachable_time(Link *link, sd_ndisc_router *rt)
int r;
assert(link);
+ assert(link->manager);
assert(link->network);
assert(rt);
@@ -986,7 +987,7 @@ static int ndisc_router_process_reachable_time(Link *link, sd_ndisc_router *rt)
}
/* Set the reachable time for Neighbor Solicitations. */
- r = sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "base_reachable_time_ms", (uint32_t) msec);
+ r = sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "base_reachable_time_ms", (uint32_t) msec, &link->manager->sysctl_shadow);
if (r < 0)
log_link_warning_errno(link, r, "Failed to apply neighbor reachable time (%"PRIu64"), ignoring: %m", msec);
@@ -998,6 +999,7 @@ static int ndisc_router_process_retransmission_time(Link *link, sd_ndisc_router
int r;
assert(link);
+ assert(link->manager);
assert(link->network);
assert(rt);
@@ -1019,7 +1021,7 @@ static int ndisc_router_process_retransmission_time(Link *link, sd_ndisc_router
}
/* Set the retransmission time for Neighbor Solicitations. */
- r = sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "retrans_time_ms", (uint32_t) msec);
+ r = sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "retrans_time_ms", (uint32_t) msec, &link->manager->sysctl_shadow);
if (r < 0)
log_link_warning_errno(link, r, "Failed to apply neighbor retransmission time (%"PRIu64"), ignoring: %m", msec);
@@ -1031,6 +1033,7 @@ static int ndisc_router_process_hop_limit(Link *link, sd_ndisc_router *rt) {
int r;
assert(link);
+ assert(link->manager);
assert(link->network);
assert(rt);
@@ -1054,7 +1057,7 @@ static int ndisc_router_process_hop_limit(Link *link, sd_ndisc_router *rt) {
if (hop_limit <= 0)
return 0;
- r = sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "hop_limit", (uint32_t) hop_limit);
+ r = sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "hop_limit", (uint32_t) hop_limit, &link->manager->sysctl_shadow);
if (r < 0)
log_link_warning_errno(link, r, "Failed to apply hop_limit (%u), ignoring: %m", hop_limit);
diff --git a/src/network/networkd-sysctl.c b/src/network/networkd-sysctl.c
index 2027a29f27..b85f0ca568 100644
--- a/src/network/networkd-sysctl.c
+++ b/src/network/networkd-sysctl.c
@@ -4,7 +4,11 @@
#include <linux/if.h>
#include <linux/if_arp.h>
+#include "sd-messages.h"
+
#include "af-list.h"
+#include "cgroup-util.h"
+#include "fd-util.h"
#include "missing_network.h"
#include "networkd-link.h"
#include "networkd-lldp-tx.h"
@@ -12,10 +16,197 @@
#include "networkd-ndisc.h"
#include "networkd-network.h"
#include "networkd-sysctl.h"
+#include "path-util.h"
#include "socket-util.h"
#include "string-table.h"
#include "sysctl-util.h"
+#if HAVE_VMLINUX_H
+
+#include "bpf-link.h"
+
+#include "bpf/sysctl_monitor/sysctl-monitor-skel.h"
+#include "bpf/sysctl_monitor/sysctl-write-event.h"
+
+/* Destroys the skeleton via sysctl_monitor_bpf__destroy() and returns NULL, so that the result can
+ * be assigned back to the pointer being released. */
+static struct sysctl_monitor_bpf *sysctl_monitor_bpf_free(struct sysctl_monitor_bpf *obj) {
+ sysctl_monitor_bpf__destroy(obj);
+ return NULL;
+}
+
+/* Frees the ring buffer via sym_ring_buffer__free() and returns NULL, so that the result can be
+ * assigned back to the pointer being released. */
+static struct ring_buffer *rb_free(struct ring_buffer *rb) {
+ sym_ring_buffer__free(rb);
+ return NULL;
+}
+
+/* Provide sysctl_monitor_bpf_freep() and rb_freep(), the handlers named in the _cleanup_()
+ * annotations of sysctl_add_monitor(). */
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct sysctl_monitor_bpf *, sysctl_monitor_bpf_free);
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct ring_buffer *, rb_free);
+
+/* Ring buffer callback: 'data' is one struct sysctl_write_event, 'ctx' points at the shadow map
+ * pointer. Logs a structured warning when a sysctl recorded in the shadow map was written with a
+ * value other than the recorded one. Returns a negative value only for an unknown event version,
+ * 0 in every other case (including errors that are merely logged). */
+static int sysctl_event_handler(void *ctx, void *data, size_t data_sz) {
+        struct sysctl_write_event *event = ASSERT_PTR(data);
+        Hashmap **shadow = ASSERT_PTR(ctx);
+        _cleanup_free_ char *full_path = NULL;
+        char *ours;
+
+        /* Returning a negative value interrupts the ring buffer polling,
+         * so do it only in case of a fatal error like a version mismatch. */
+        if (event->version != 1)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                         "Unexpected sysctl event, disabling sysctl monitoring: %d", event->version);
+
+        /* The BPF side could not collect all the data; report that and move on. */
+        if (event->errorcode != 0) {
+                log_warning_errno(event->errorcode, "Sysctl monitor BPF returned error: %m");
+                return 0;
+        }
+
+        /* The event carries the path relative to /proc/sys, the shadow map is keyed by full path. */
+        full_path = path_join("/proc/sys", event->path);
+        if (!full_path) {
+                log_oom();
+                return 0;
+        }
+
+        /* If we never managed this handle, ignore it. */
+        ours = hashmap_get(*shadow, full_path);
+        if (!ours)
+                return 0;
+
+        /* Somebody wrote the very value we recorded: nothing to complain about. */
+        if (strneq(ours, event->newvalue, sizeof(event->newvalue)))
+                return 0;
+
+        log_struct(LOG_WARNING,
+                   "MESSAGE_ID=" SD_MESSAGE_SYSCTL_CHANGED_STR,
+                   "OBJECT_PID=%d", event->pid,
+                   "OBJECT_COMM=%s", event->comm,
+                   "SYSCTL=/proc/sys/%s", event->path,
+                   "OLDVALUE=%s", event->current,
+                   "NEWVALUE=%s", event->newvalue,
+                   "OURVALUE=%s", ours,
+                   LOG_MESSAGE("Foreign process '%s[%d]' changed sysctl '/proc/sys/%s' from '%s' to '%s', conflicting with our setting to '%s'",
+                               event->comm, event->pid, event->path, event->current, event->newvalue, ours));
+
+        return 0;
+}
+
+/* I/O event callback for the ring buffer's epoll fd; 'userdata' is the struct ring_buffer. Polls
+ * the buffer without blocking. A poll failure is logged unless errno is EINTR. Always returns 0. */
+static int on_ringbuf_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        struct ring_buffer *ringbuf = ASSERT_PTR(userdata);
+
+        if (sym_ring_buffer__poll(ringbuf, /* timeout_msec= */ 0) >= 0)
+                return 0;
+
+        if (errno != EINTR)
+                log_error_errno(errno, "Error polling ring buffer: %m");
+
+        return 0;
+}
+
+/* Loads the sysctl monitor BPF program, attaches it to the root cgroup and hooks its ring buffer
+ * into the manager's event loop, so that sysctl_event_handler() is invoked for reported writes.
+ *
+ * Returns 0 on success. Also returns 0, after logging at info level, when BPF support cannot be
+ * loaded, or when the program cannot be loaded or attached. Every other failure returns the result
+ * of log_warning_errno() (a negative errno-style value, by the log_*_errno() convention).
+ *
+ * All intermediate resources are held in _cleanup_ variables, so any early return releases them;
+ * the manager fields are only assigned once every step has succeeded. */
+int sysctl_add_monitor(Manager *manager) {
+ _cleanup_(sysctl_monitor_bpf_freep) struct sysctl_monitor_bpf *obj = NULL;
+ _cleanup_(bpf_link_freep) struct bpf_link *sysctl_link = NULL;
+ _cleanup_(rb_freep) struct ring_buffer *sysctl_buffer = NULL;
+ _cleanup_close_ int cgroup_fd = -EBADF, rootcg = -EBADF;
+ _cleanup_free_ char *cgroup = NULL;
+ int idx = 0, r;
+
+ assert(manager);
+
+ r = dlopen_bpf();
+ if (r < 0) {
+ log_info_errno(r, "sysctl monitor disabled, as BPF support is not available.");
+ return 0;
+ }
+
+ /* PID 0 is passed here; presumably that selects the calling process — TODO confirm against
+ * cg_pid_get_path(). */
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to get cgroup path, ignoring: %m.");
+
+ /* The program is attached to the root cgroup below. */
+ rootcg = cg_path_open(SYSTEMD_CGROUP_CONTROLLER, "/");
+ if (rootcg < 0)
+ return log_warning_errno(rootcg, "Failed to open cgroup, ignoring: %m.");
+
+ obj = sysctl_monitor_bpf__open_and_load();
+ if (!obj) {
+ log_info_errno(errno, "Unable to load sysctl monitor BPF program, ignoring: %m.");
+ return 0;
+ }
+
+ cgroup_fd = cg_path_open(SYSTEMD_CGROUP_CONTROLLER, cgroup);
+ if (cgroup_fd < 0)
+ return log_warning_errno(cgroup_fd, "Failed to open cgroup: %m");
+
+ /* Store the cgroup opened above in slot 0 of cgroup_map; the BPF program checks that slot to
+ * skip writes made from within that cgroup. */
+ if (sym_bpf_map_update_elem(sym_bpf_map__fd(obj->maps.cgroup_map), &idx, &cgroup_fd, BPF_ANY))
+ return log_warning_errno(errno, "Failed to update cgroup map: %m");
+
+ sysctl_link = sym_bpf_program__attach_cgroup(obj->progs.sysctl_monitor, rootcg);
+ r = bpf_get_error_translated(sysctl_link);
+ if (r < 0) {
+ log_info_errno(r, "Unable to attach sysctl monitor BPF program to cgroup, ignoring: %m.");
+ return 0;
+ }
+
+ /* The handler receives the address of the manager's shadow map pointer as its context. */
+ sysctl_buffer = sym_ring_buffer__new(
+ sym_bpf_map__fd(obj->maps.written_sysctls),
+ sysctl_event_handler, &manager->sysctl_shadow, NULL);
+ if (!sysctl_buffer)
+ return log_warning_errno(errno, "Failed to create ring buffer: %m");
+
+ r = sd_event_add_io(manager->event, &manager->sysctl_event_source,
+ sym_ring_buffer__epoll_fd(sysctl_buffer), EPOLLIN, on_ringbuf_io, sysctl_buffer);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to watch sysctl event ringbuffer: %m");
+
+ /* Everything succeeded: hand the resources over to the manager. rootcg is not kept and is
+ * closed by its _cleanup_close_ handler on return. */
+ manager->sysctl_link = TAKE_PTR(sysctl_link);
+ manager->sysctl_skel = TAKE_PTR(obj);
+ manager->sysctl_buffer = TAKE_PTR(sysctl_buffer);
+ manager->cgroup_fd = TAKE_FD(cgroup_fd);
+
+ return 0;
+}
+
+/* Releases what sysctl_add_monitor() stored in the manager: the event source first, then the ring
+ * buffer, the BPF link, the skeleton, and finally the cgroup fd. Each field is reset after being
+ * released, so a repeated call finds nothing left to free.
+ *
+ * NOTE(review): the sym_*() and skeleton calls are only made for non-NULL fields; presumably the
+ * sym_* pointers are resolved only after dlopen_bpf() succeeded — confirm before dropping the
+ * guards. */
+void sysctl_remove_monitor(Manager *manager) {
+ assert(manager);
+
+ /* Stop event delivery before freeing the ring buffer the event source refers to. */
+ manager->sysctl_event_source = sd_event_source_disable_unref(manager->sysctl_event_source);
+
+ if (manager->sysctl_buffer) {
+ sym_ring_buffer__free(manager->sysctl_buffer);
+ manager->sysctl_buffer = NULL;
+ }
+
+ if (manager->sysctl_link) {
+ sym_bpf_link__destroy(manager->sysctl_link);
+ manager->sysctl_link = NULL;
+ }
+
+ if (manager->sysctl_skel) {
+ sysctl_monitor_bpf__destroy(manager->sysctl_skel);
+ manager->sysctl_skel = NULL;
+ }
+
+ manager->cgroup_fd = safe_close(manager->cgroup_fd);
+}
+
+/* Drops from the manager's shadow map every entry whose key lies below
+ * /proc/sys/net/ipv4/conf/<ifname> or /proc/sys/net/ipv6/conf/<ifname>, freeing both key and
+ * value. Returns 0 on success, or the result of log_oom() if a prefix cannot be allocated.
+ *
+ * NOTE(review): only the conf/ subtrees are matched. sysctl_write_ip_neighbor_property() writes
+ * below net/<family>/neigh/; confirm whether per-interface neigh entries should be dropped here
+ * as well. */
+int sysctl_clear_link_shadows(Link *link) {
+        _cleanup_free_ char *prefix4 = NULL, *prefix6 = NULL;
+        char *k = NULL, *v = NULL;
+        Hashmap *shadow;
+
+        assert(link);
+        assert(link->manager);
+
+        prefix4 = path_join("/proc/sys/net/ipv4/conf", link->ifname);
+        if (!prefix4)
+                return log_oom();
+
+        prefix6 = path_join("/proc/sys/net/ipv6/conf", link->ifname);
+        if (!prefix6)
+                return log_oom();
+
+        shadow = link->manager->sysctl_shadow;
+
+        HASHMAP_FOREACH_KEY(v, k, shadow) {
+                /* Entries of other interfaces, and global ones, stay in place. */
+                if (!path_startswith(k, prefix4) && !path_startswith(k, prefix6))
+                        continue;
+
+                assert_se(hashmap_remove_value(shadow, k, v) == v);
+                free(k);
+                free(v);
+        }
+
+        return 0;
+}
+#endif
+
static void manager_set_ip_forwarding(Manager *manager, int family) {
int r, t;
@@ -30,13 +221,13 @@ static void manager_set_ip_forwarding(Manager *manager, int family) {
return; /* keep */
/* First, set the default value. */
- r = sysctl_write_ip_property_boolean(family, "default", "forwarding", t);
+ r = sysctl_write_ip_property_boolean(family, "default", "forwarding", t, &manager->sysctl_shadow);
if (r < 0)
log_warning_errno(r, "Failed to %s the default %s forwarding: %m",
enable_disable(t), af_to_ipv4_ipv6(family));
/* Then, set the value to all interfaces. */
- r = sysctl_write_ip_property_boolean(family, "all", "forwarding", t);
+ r = sysctl_write_ip_property_boolean(family, "all", "forwarding", t, &manager->sysctl_shadow);
if (r < 0)
log_warning_errno(r, "Failed to %s %s forwarding for all interfaces: %m",
enable_disable(t), af_to_ipv4_ipv6(family));
@@ -73,6 +264,7 @@ static bool link_is_configured_for_family(Link *link, int family) {
static int link_update_ipv6_sysctl(Link *link) {
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET6))
return 0;
@@ -80,11 +272,12 @@ static int link_update_ipv6_sysctl(Link *link) {
if (!link_ipv6_enabled(link))
return 0;
- return sysctl_write_ip_property_boolean(AF_INET6, link->ifname, "disable_ipv6", false);
+ return sysctl_write_ip_property_boolean(AF_INET6, link->ifname, "disable_ipv6", false, &link->manager->sysctl_shadow);
}
static int link_set_proxy_arp(Link *link) {
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET))
return 0;
@@ -92,11 +285,12 @@ static int link_set_proxy_arp(Link *link) {
if (link->network->proxy_arp < 0)
return 0;
- return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "proxy_arp", link->network->proxy_arp > 0);
+ return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "proxy_arp", link->network->proxy_arp > 0, &link->manager->sysctl_shadow);
}
static int link_set_proxy_arp_pvlan(Link *link) {
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET))
return 0;
@@ -104,7 +298,7 @@ static int link_set_proxy_arp_pvlan(Link *link) {
if (link->network->proxy_arp_pvlan < 0)
return 0;
- return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "proxy_arp_pvlan", link->network->proxy_arp_pvlan > 0);
+ return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "proxy_arp_pvlan", link->network->proxy_arp_pvlan > 0, &link->manager->sysctl_shadow);
}
int link_get_ip_forwarding(Link *link, int family) {
@@ -136,6 +330,7 @@ static int link_set_ip_forwarding_impl(Link *link, int family) {
int r, t;
assert(link);
+ assert(link->manager);
assert(IN_SET(family, AF_INET, AF_INET6));
if (!link_is_configured_for_family(link, family))
@@ -145,7 +340,7 @@ static int link_set_ip_forwarding_impl(Link *link, int family) {
if (t < 0)
return 0; /* keep */
- r = sysctl_write_ip_property_boolean(family, link->ifname, "forwarding", t);
+ r = sysctl_write_ip_property_boolean(family, link->ifname, "forwarding", t, &link->manager->sysctl_shadow);
if (r < 0)
return log_link_warning_errno(link, r, "Failed to %s %s forwarding, ignoring: %m",
enable_disable(t), af_to_ipv4_ipv6(family));
@@ -214,6 +409,7 @@ static int link_set_ip_forwarding(Link *link, int family) {
static int link_set_ipv4_rp_filter(Link *link) {
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET))
return 0;
@@ -221,7 +417,7 @@ static int link_set_ipv4_rp_filter(Link *link) {
if (link->network->ipv4_rp_filter < 0)
return 0;
- return sysctl_write_ip_property_int(AF_INET, link->ifname, "rp_filter", link->network->ipv4_rp_filter);
+ return sysctl_write_ip_property_int(AF_INET, link->ifname, "rp_filter", link->network->ipv4_rp_filter, &link->manager->sysctl_shadow);
}
static int link_set_ipv6_privacy_extensions(Link *link) {
@@ -241,20 +437,22 @@ static int link_set_ipv6_privacy_extensions(Link *link) {
if (val == IPV6_PRIVACY_EXTENSIONS_KERNEL)
return 0;
- return sysctl_write_ip_property_int(AF_INET6, link->ifname, "use_tempaddr", (int) val);
+ return sysctl_write_ip_property_int(AF_INET6, link->ifname, "use_tempaddr", (int) val, &link->manager->sysctl_shadow);
}
static int link_set_ipv6_accept_ra(Link *link) {
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET6))
return 0;
- return sysctl_write_ip_property(AF_INET6, link->ifname, "accept_ra", "0");
+ return sysctl_write_ip_property(AF_INET6, link->ifname, "accept_ra", "0", &link->manager->sysctl_shadow);
}
static int link_set_ipv6_dad_transmits(Link *link) {
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET6))
return 0;
@@ -262,11 +460,12 @@ static int link_set_ipv6_dad_transmits(Link *link) {
if (link->network->ipv6_dad_transmits < 0)
return 0;
- return sysctl_write_ip_property_int(AF_INET6, link->ifname, "dad_transmits", link->network->ipv6_dad_transmits);
+ return sysctl_write_ip_property_int(AF_INET6, link->ifname, "dad_transmits", link->network->ipv6_dad_transmits, &link->manager->sysctl_shadow);
}
static int link_set_ipv6_hop_limit(Link *link) {
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET6))
return 0;
@@ -274,13 +473,14 @@ static int link_set_ipv6_hop_limit(Link *link) {
if (link->network->ipv6_hop_limit <= 0)
return 0;
- return sysctl_write_ip_property_int(AF_INET6, link->ifname, "hop_limit", link->network->ipv6_hop_limit);
+ return sysctl_write_ip_property_int(AF_INET6, link->ifname, "hop_limit", link->network->ipv6_hop_limit, &link->manager->sysctl_shadow);
}
static int link_set_ipv6_retransmission_time(Link *link) {
usec_t retrans_time_ms;
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET6))
return 0;
@@ -292,13 +492,14 @@ static int link_set_ipv6_retransmission_time(Link *link) {
if (retrans_time_ms <= 0 || retrans_time_ms > UINT32_MAX)
return 0;
- return sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "retrans_time_ms", retrans_time_ms);
+ return sysctl_write_ip_neighbor_property_uint32(AF_INET6, link->ifname, "retrans_time_ms", retrans_time_ms, &link->manager->sysctl_shadow);
}
static int link_set_ipv6_proxy_ndp(Link *link) {
bool v;
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET6))
return 0;
@@ -308,13 +509,14 @@ static int link_set_ipv6_proxy_ndp(Link *link) {
else
v = !set_isempty(link->network->ipv6_proxy_ndp_addresses);
- return sysctl_write_ip_property_boolean(AF_INET6, link->ifname, "proxy_ndp", v);
+ return sysctl_write_ip_property_boolean(AF_INET6, link->ifname, "proxy_ndp", v, &link->manager->sysctl_shadow);
}
int link_set_ipv6_mtu(Link *link, int log_level) {
uint32_t mtu = 0;
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET6))
return 0;
@@ -335,11 +537,12 @@ int link_set_ipv6_mtu(Link *link, int log_level) {
mtu = link->mtu;
}
- return sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "mtu", mtu);
+ return sysctl_write_ip_property_uint32(AF_INET6, link->ifname, "mtu", mtu, &link->manager->sysctl_shadow);
}
static int link_set_ipv4_accept_local(Link *link) {
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET))
return 0;
@@ -347,11 +550,12 @@ static int link_set_ipv4_accept_local(Link *link) {
if (link->network->ipv4_accept_local < 0)
return 0;
- return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "accept_local", link->network->ipv4_accept_local > 0);
+ return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "accept_local", link->network->ipv4_accept_local > 0, &link->manager->sysctl_shadow);
}
static int link_set_ipv4_route_localnet(Link *link) {
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET))
return 0;
@@ -359,11 +563,12 @@ static int link_set_ipv4_route_localnet(Link *link) {
if (link->network->ipv4_route_localnet < 0)
return 0;
- return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "route_localnet", link->network->ipv4_route_localnet > 0);
+ return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "route_localnet", link->network->ipv4_route_localnet > 0, &link->manager->sysctl_shadow);
}
static int link_set_ipv4_promote_secondaries(Link *link) {
assert(link);
+ assert(link->manager);
if (!link_is_configured_for_family(link, AF_INET))
return 0;
@@ -373,7 +578,7 @@ static int link_set_ipv4_promote_secondaries(Link *link) {
* otherwise. The way systemd-networkd works is that the new IP of a lease is added as a
* secondary IP and when the primary one expires it relies on the kernel to promote the
* secondary IP. See also https://github.com/systemd/systemd/issues/7163 */
- return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "promote_secondaries", true);
+ return sysctl_write_ip_property_boolean(AF_INET, link->ifname, "promote_secondaries", true, &link->manager->sysctl_shadow);
}
int link_set_sysctl(Link *link) {
diff --git a/src/network/networkd-sysctl.h b/src/network/networkd-sysctl.h
index d7a9b1f320..446b835555 100644
--- a/src/network/networkd-sysctl.h
+++ b/src/network/networkd-sysctl.h
@@ -27,6 +27,16 @@ typedef enum IPReversePathFilter {
_IP_REVERSE_PATH_FILTER_INVALID = -EINVAL,
} IPReversePathFilter;
+#if HAVE_VMLINUX_H
+int sysctl_add_monitor(Manager *manager);
+void sysctl_remove_monitor(Manager *manager);
+int sysctl_clear_link_shadows(Link *link);
+#else
+static inline int sysctl_add_monitor(Manager *manager) { return 0; }
+static inline void sysctl_remove_monitor(Manager *manager) { }
+static inline int sysctl_clear_link_shadows(Link *link) { return 0; }
+#endif
+
void manager_set_sysctl(Manager *manager);
int link_get_ip_forwarding(Link *link, int family);
diff --git a/src/network/networkd.c b/src/network/networkd.c
index 69a28647c8..2798cd8cf8 100644
--- a/src/network/networkd.c
+++ b/src/network/networkd.c
@@ -62,7 +62,9 @@ static int run(int argc, char *argv[]) {
(1ULL << CAP_NET_ADMIN) |
(1ULL << CAP_NET_BIND_SERVICE) |
(1ULL << CAP_NET_BROADCAST) |
- (1ULL << CAP_NET_RAW));
+ (1ULL << CAP_NET_RAW) |
+ (1ULL << CAP_SYS_ADMIN) |
+ (1ULL << CAP_BPF));
if (r < 0)
return log_error_errno(r, "Failed to drop privileges: %m");
}
diff --git a/src/systemd/sd-messages.h b/src/systemd/sd-messages.h
index f4f4e95b7f..441f4e6888 100644
--- a/src/systemd/sd-messages.h
+++ b/src/systemd/sd-messages.h
@@ -277,6 +277,9 @@ _SD_BEGIN_DECLARATIONS;
#define SD_MESSAGE_SRK_ENROLLMENT_NEEDS_AUTHORIZATION SD_ID128_MAKE(ad,70,89,f9,28,ac,4f,7e,a0,0c,07,45,7d,47,ba,8a)
#define SD_MESSAGE_SRK_ENROLLMENT_NEEDS_AUTHORIZATION_STR SD_ID128_MAKE_STR(ad,70,89,f9,28,ac,4f,7e,a0,0c,07,45,7d,47,ba,8a)
+#define SD_MESSAGE_SYSCTL_CHANGED SD_ID128_MAKE(9c,f5,6b,8b,af,95,46,cf,94,78,78,3a,8d,e4,21,13)
+#define SD_MESSAGE_SYSCTL_CHANGED_STR SD_ID128_MAKE_STR(9c,f5,6b,8b,af,95,46,cf,94,78,78,3a,8d,e4,21,13)
+
_SD_END_DECLARATIONS;
#endif
diff --git a/src/test/test-sysctl-util.c b/src/test/test-sysctl-util.c
index e94099605c..83d6c9036c 100644
--- a/src/test/test-sysctl-util.c
+++ b/src/test/test-sysctl-util.c
@@ -53,14 +53,14 @@ TEST(sysctl_read) {
assert_se(sysctl_read_ip_property(AF_INET, "lo", "forwarding", &s));
assert_se(STR_IN_SET(s, "0", "1"));
- r = sysctl_write_ip_property(AF_INET, "lo", "forwarding", s);
+ r = sysctl_write_ip_property(AF_INET, "lo", "forwarding", s, NULL);
assert_se(r >= 0 || ERRNO_IS_PRIVILEGE(r) || r == -EROFS);
s = mfree(s);
assert_se(sysctl_read_ip_property(AF_INET, NULL, "ip_forward", &s));
assert_se(STR_IN_SET(s, "0", "1"));
- r = sysctl_write_ip_property(AF_INET, NULL, "ip_forward", s);
+ r = sysctl_write_ip_property(AF_INET, NULL, "ip_forward", s, NULL);
assert_se(r >= 0 || ERRNO_IS_PRIVILEGE(r) || r == -EROFS);
s = mfree(s);