From 1a4a0cb7985f921548f1a7ac17686afbefe67f87 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 18 Mar 2024 14:25:26 +0100 Subject: bpf/lpm_trie: Inline longest_prefix_match for fastpath The BPF map type LPM (Longest Prefix Match) is used heavily in production by multiple products that have BPF components. Perf data shows trie_lookup_elem() and longest_prefix_match() being part of kernels perf top. For every level in the LPM tree trie_lookup_elem() calls out to longest_prefix_match(). The compiler is free to inline this call, but chooses not to inline, because other slowpath callers (that can be invoked via syscall) exists like trie_update_elem(), trie_delete_elem() or trie_get_next_key(). bcc/tools/funccount -Ti 1 'trie_lookup_elem|longest_prefix_match.isra.0' FUNC COUNT trie_lookup_elem 664945 longest_prefix_match.isra.0 8101507 Observation on a single random machine shows a factor 12 between the two functions. Given an average of 12 levels in the trie being searched. This patch force inlining longest_prefix_match(), but only for the lookup fastpath to balance object instruction size. In production with AMD CPUs, measuring the function latency of 'trie_lookup_elem' (bcc/tools/funclatency) we are seeing an improvement function latency reduction 7-8% with this patch applied (to production kernels 6.6 and 6.1). Analyzing perf data, we can explain this rather large improvement due to reducing the overhead for AMD side-channel mitigation SRSO (Speculative Return Stack Overflow). Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/171076828575.2141737.18370644069389889027.stgit@firesoul --- kernel/bpf/lpm_trie.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel/bpf/lpm_trie.c') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 050fe1ebf0f7..939620b91c0e 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -155,16 +155,17 @@ static inline int extract_bit(const u8 *data, size_t index) } /** - * longest_prefix_match() - determine the longest prefix + * __longest_prefix_match() - determine the longest prefix * @trie: The trie to get internal sizes from * @node: The node to operate on * @key: The key to compare to @node * * Determine the longest prefix of @node that matches the bits in @key. */ -static size_t longest_prefix_match(const struct lpm_trie *trie, - const struct lpm_trie_node *node, - const struct bpf_lpm_trie_key_u8 *key) +static __always_inline +size_t __longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key_u8 *key) { u32 limit = min(node->prefixlen, key->prefixlen); u32 prefixlen = 0, i = 0; @@ -224,6 +225,13 @@ static size_t longest_prefix_match(const struct lpm_trie *trie, return prefixlen; } +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key_u8 *key) +{ + return __longest_prefix_match(trie, node, key); +} + /* Called from syscall or from eBPF program */ static void *trie_lookup_elem(struct bpf_map *map, void *_key) { @@ -245,7 +253,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) * If it's the maximum possible prefix for this trie, we have * an exact match and can return it directly. */ - matchlen = longest_prefix_match(trie, node, key); + matchlen = __longest_prefix_match(trie, node, key); if (matchlen == trie->max_prefixlen) { found = node; break; -- cgit v1.2.3 From 59f2f841179aa6a0899cb9cf53659149a35749b7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 29 Mar 2024 10:14:39 -0700 Subject: bpf: Avoid kfree_rcu() under lock in bpf_lpm_trie. syzbot reported the following lock sequence: cpu 2: grabs timer_base lock spins on bpf_lpm lock cpu 1: grab rcu krcp lock spins on timer_base lock cpu 0: grab bpf_lpm lock spins on rcu krcp lock bpf_lpm lock can be the same. timer_base lock can also be the same due to timer migration. but rcu krcp lock is always per-cpu, so it cannot be the same lock. Hence it's a false positive. To avoid lockdep complaining move kfree_rcu() after spin_unlock. Reported-by: syzbot+1fa663a2100308ab6eab@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240329171439.37813-1-alexei.starovoitov@gmail.com --- kernel/bpf/lpm_trie.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/bpf/lpm_trie.c') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 939620b91c0e..0218a5132ab5 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -316,6 +316,7 @@ static long trie_update_elem(struct bpf_map *map, { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node *free_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key_u8 *key = _key; unsigned long irq_flags; @@ -390,7 +391,7 @@ static long trie_update_elem(struct bpf_map *map, trie->n_entries--; rcu_assign_pointer(*slot, new_node); - kfree_rcu(node, rcu); + free_node = node; goto out; } @@ -437,6 +438,7 @@ out: } spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_node, rcu); return ret; } @@ -445,6 +447,7 @@ out: static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *free_node = NULL, *free_parent = NULL; struct bpf_lpm_trie_key_u8 *key = _key; struct lpm_trie_node __rcu **trim, **trim2; struct lpm_trie_node *node, *parent; @@ -514,8 +517,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) else rcu_assign_pointer( *trim2, rcu_access_pointer(parent->child[0])); - kfree_rcu(parent, rcu); - kfree_rcu(node, rcu); + free_parent = parent; + free_node = node; goto out; } @@ -529,10 +532,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); else RCU_INIT_POINTER(*trim, NULL); - kfree_rcu(node, rcu); + free_node = node; out: spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_parent, rcu); + kfree_rcu(free_node, rcu); return ret; } -- cgit v1.2.3