From d3e42bb0a329fadff98fcb927714d0a486840e3b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 27 Jan 2020 09:51:45 -0800 Subject: bpf: Reuse log from btf_prase_vmlinux() in btf_struct_ops_init() Instead of using a locally defined "struct bpf_verifier_log log = {}", btf_struct_ops_init() should reuse the "log" from its calling function "btf_parse_vmlinux()". It should also resolve the frame-size too large compiler warning in some ARCH. Fixes: 27ae7997a661 ("bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200127175145.1154438-1-kafai@fb.com --- include/linux/bpf.h | 7 +++++-- kernel/bpf/bpf_struct_ops.c | 5 ++--- kernel/bpf/btf.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8e9ad3943cd9..49b1a70e12c8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -728,7 +728,7 @@ struct bpf_struct_ops { #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id); -void bpf_struct_ops_init(struct btf *btf); +void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log); bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, @@ -752,7 +752,10 @@ static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { return NULL; } -static inline void bpf_struct_ops_init(struct btf *btf) { } +static inline void bpf_struct_ops_init(struct btf *btf, + struct bpf_verifier_log *log) +{ +} static inline bool bpf_try_module_get(const void *data, struct module *owner) { return try_module_get(owner); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 8ad1c9ea26b2..042f95534f86 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -96,12 +96,11 @@ const struct bpf_prog_ops bpf_struct_ops_prog_ops = { static const struct btf_type *module_type; -void bpf_struct_ops_init(struct btf *btf) +void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) { s32 type_id, value_id, module_id; const struct btf_member *member; struct bpf_struct_ops *st_ops; - struct bpf_verifier_log log = {}; const struct btf_type *t; char value_name[128]; const char *mname; @@ -172,7 +171,7 @@ void bpf_struct_ops_init(struct btf *btf) member->type, NULL); if (func_proto && - btf_distill_func_proto(&log, btf, + btf_distill_func_proto(log, btf, func_proto, mname, &st_ops->func_models[j])) { pr_warn("Error in parsing func ptr %s in struct %s\n", diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b7c1660fb594..8c9d8f266bef 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3643,7 +3643,7 @@ struct btf *btf_parse_vmlinux(void) goto errout; } - bpf_struct_ops_init(btf); + bpf_struct_ops_init(btf, log); btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); -- cgit v1.2.3 From 2577e373bbc07f18680287b3958d3b2fdebe4b20 Mon Sep 17 00:00:00 2001 From: Yulia Kartseva Date: Thu, 30 Jan 2020 15:13:10 -0800 Subject: runqslower: Fix Makefile Fix undefined reference linker errors when building runqslower with gcc 7.4.0 on Ubuntu 18.04. The issue is with misplaced -lelf, -lz options in Makefile: $(Q)$(CC) $(CFLAGS) -lelf -lz $^ -o $@ -lelf, -lz options should follow the list of target dependencies: $(Q)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ or after substitution cc -g -Wall runqslower.o libbpf.a -lelf -lz -o runqslower The current order of gcc params causes failure in libelf symbols resolution, e.g. undefined reference to `elf_memory' Fixes: 9c01546d26d2 ("tools/bpf: Add runqslower tool to tools/bpf") Signed-off-by: Julia Kartseva Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/908498f794661c44dca54da9e09dc0c382df6fcb.1580425879.git.hex@fb.com --- tools/bpf/runqslower/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 0c021352beed..87eae5be9bcd 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -41,7 +41,7 @@ clean: $(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) $(call msg,BINARY,$@) - $(Q)$(CC) $(CFLAGS) -lelf -lz $^ -o $@ + $(Q)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ $(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ $(OUTPUT)/runqslower.bpf.o -- cgit v1.2.3 From a525b0881de7742617343f02df4073ddc1571237 Mon Sep 17 00:00:00 2001 From: Michal Rostecki Date: Sun, 2 Feb 2020 12:02:00 +0100 Subject: bpftool: Remove redundant "HAVE" prefix from the large INSN limit check "HAVE" prefix is already applied by default to feature macros and before this change, the large INSN limit macro had the incorrect name with double "HAVE". Fixes: 2faef64aa6b3 ("bpftool: Add misc section and probe for large INSN limit") Signed-off-by: Michal Rostecki Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200202110200.31024-1-mrostecki@opensuse.org --- tools/bpf/bpftool/feature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 446ba891f1e2..941873d778d8 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -580,7 +580,7 @@ probe_large_insn_limit(const char *define_prefix, __u32 ifindex) res = bpf_probe_large_insn_limit(ifindex); print_bool_feature("have_large_insn_limit", "Large program size limit", - "HAVE_LARGE_INSN_LIMIT", + "LARGE_INSN_LIMIT", res, define_prefix); } -- cgit v1.2.3 From 257af63d7f84f0672aa6a24b5511871f00741f19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 31 Jan 2020 16:03:14 -0800 Subject: bpf: Fix modifier skipping logic Fix the way modifiers are skipped while walking pointers. Otherwise second level dereferences of 'const struct foo *' will be rejected by the verifier. Fixes: 9e15db66136a ("bpf: Implement accurate raw_tp context access via BTF") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200201000314.261392-1-ast@kernel.org --- kernel/bpf/btf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8c9d8f266bef..805c43b083e9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3931,6 +3931,7 @@ again: if (btf_type_is_ptr(mtype)) { const struct btf_type *stype; + u32 id; if (msize != size || off != moff) { bpf_log(log, @@ -3939,12 +3940,9 @@ again: return -EACCES; } - stype = btf_type_by_id(btf_vmlinux, mtype->type); - /* skip modifiers */ - while (btf_type_is_modifier(stype)) - stype = btf_type_by_id(btf_vmlinux, stype->type); + stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); if (btf_type_is_struct(stype)) { - *next_btf_id = mtype->type; + *next_btf_id = id; return PTR_TO_BTF_ID; } } -- cgit v1.2.3 From 8fc91b972b734a62a3f3ef45db06d06617f739ea Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 1 Feb 2020 22:51:52 -0800 Subject: selftests/bpf: Fix trampoline_count.c selftest compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix missing braces compilation warning in trampoline_count test: .../prog_tests/trampoline_count.c: In function ‘test_trampoline_count’: .../prog_tests/trampoline_count.c:49:9: warning: missing braces around initializer [-Wmissing-braces] struct inst inst[MAX_TRAMP_PROGS] = { 0 }; ^ .../prog_tests/trampoline_count.c:49:9: warning: (near initialization for ‘inst[0]’) [-Wmissing-braces] Fixes: d633d57902a5 ("selftest/bpf: Add test for allowed trampolines count") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200202065152.2718142-1-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/trampoline_count.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c index 1235f3d1cc50..1f6ccdaed1ac 100644 --- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c +++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c @@ -46,7 +46,7 @@ void test_trampoline_count(void) const char *fentry_name = "fentry/__set_task_comm"; const char *fexit_name = "fexit/__set_task_comm"; const char *object = "test_trampoline_count.o"; - struct inst inst[MAX_TRAMP_PROGS] = { 0 }; + struct inst inst[MAX_TRAMP_PROGS] = {}; int err, i = 0, duration = 0; struct bpf_object *obj; struct bpf_link *link; -- cgit v1.2.3 From fc9e34f8de001c92e6cfc6481bb30a34dbda4f5c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 4 Feb 2020 13:50:37 -0800 Subject: tools/bpf/runqslower: Rebuild libbpf.a on libbpf source change Add missing dependency of $(BPFOBJ) to $(LIBBPF_SRC), so that running make in runqslower/ will rebuild libbpf.a when there is change in libbpf/. Fixes: 9c01546d26d2 ("tools/bpf: Add runqslower tool to tools/bpf") Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200204215037.2258698-1-songliubraving@fb.com --- tools/bpf/runqslower/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 87eae5be9bcd..39edd68afa8e 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -75,7 +75,7 @@ $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) fi $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ -$(BPFOBJ): | $(OUTPUT) +$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \ OUTPUT=$(abspath $(dir $@))/ $(abspath $@) -- cgit v1.2.3 From c77e9f09143822623dd71a0fdc84331129e97c3a Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 5 Feb 2020 05:58:32 +0100 Subject: i40e: Relax i40e_xsk_wakeup's return value when PF is busy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return -EAGAIN instead of -ENETDOWN to provide a slightly milder information to user space so that an application will know to retry the syscall when __I40E_CONFIG_BUSY bit is set on pf->state. Fixes: b3873a5be757 ("net/i40e: Fix concurrency issues between config flow and XSK") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20200205045834.56795-2-maciej.fijalkowski@intel.com --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 42058fad6a3c..0b7d29192b2c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -791,7 +791,7 @@ int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) struct i40e_ring *ring; if (test_bit(__I40E_CONFIG_BUSY, pf->state)) - return -ENETDOWN; + return -EAGAIN; if (test_bit(__I40E_VSI_DOWN, vsi->state)) return -ENETDOWN; -- cgit v1.2.3 From 32c92c15ad3dca6f7cca8643766ad79fa3ab3d17 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 5 Feb 2020 05:58:33 +0100 Subject: samples: bpf: Drop doubled variable declaration in xdpsock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seems that by accident there is a doubled declaration of global variable opt_xdp_bind_flags in xdpsock_user.c. The second one is uninitialized so compiler was simply ignoring it. To keep things clean, drop the doubled variable. Fixes: c543f5469822 ("samples/bpf: add unaligned chunks mode support to xdpsock") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20200205045834.56795-3-maciej.fijalkowski@intel.com --- samples/bpf/xdpsock_user.c | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 0b5acd722306..bab7a850e864 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -83,7 +83,6 @@ static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; static u32 opt_umem_flags; static int opt_unaligned_chunks; static int opt_mmap_flags; -static u32 opt_xdp_bind_flags; static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; static int opt_timeout = 1000; static bool opt_need_wakeup = true; -- cgit v1.2.3 From 8ed47e140867a6e7d56170f325c8d4fdee6d6b66 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 5 Feb 2020 05:58:34 +0100 Subject: samples: bpf: Allow for -ENETDOWN in xdpsock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ndo_xsk_wakeup() can return -ENETDOWN and there's no particular reason to bail the whole application out on that case. Let's check in kick_tx() whether errno was set to mentioned value and basically allow application to further process frames. Fixes: 248c7f9c0e21 ("samples/bpf: convert xdpsock to use libbpf for AF_XDP access") Reported-by: Cameron Elliott Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20200205045834.56795-4-maciej.fijalkowski@intel.com --- samples/bpf/xdpsock_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index bab7a850e864..c91e91362a0c 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -788,7 +788,8 @@ static void kick_tx(struct xsk_socket_info *xsk) int ret; ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY) + if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || + errno == EBUSY || errno == ENETDOWN) return; exit_with_error(errno); } -- cgit v1.2.3 From 85b8ac01a421791d66c3a458a7f83cfd173fe3fa Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 7 Feb 2020 10:37:12 +0000 Subject: bpf, sockmap: Check update requirements after locking It's currently possible to insert sockets in unexpected states into a sockmap, due to a TOCTTOU when updating the map from a syscall. sock_map_update_elem checks that sk->sk_state == TCP_ESTABLISHED, locks the socket and then calls sock_map_update_common. At this point, the socket may have transitioned into another state, and the earlier assumptions don't hold anymore. Crucially, it's conceivable (though very unlikely) that a socket has become unhashed. This breaks the sockmap's assumption that it will get a callback via sk->sk_prot->unhash. Fix this by checking the (fixed) sk_type and sk_protocol without the lock, followed by a locked check of sk_state. Unfortunately it's not possible to push the check down into sock_(map|hash)_update_common, since BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB run before the socket has transitioned from TCP_SYN_RECV into TCP_ESTABLISHED. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20200207103713.28175-1-lmb@cloudflare.com --- net/core/sock_map.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 8998e356f423..36a2433e183f 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -416,14 +416,16 @@ static int sock_map_update_elem(struct bpf_map *map, void *key, ret = -EINVAL; goto out; } - if (!sock_map_sk_is_suitable(sk) || - sk->sk_state != TCP_ESTABLISHED) { + if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); - ret = sock_map_update_common(map, idx, sk, flags); + if (sk->sk_state != TCP_ESTABLISHED) + ret = -EOPNOTSUPP; + else + ret = sock_map_update_common(map, idx, sk, flags); sock_map_sk_release(sk); out: fput(sock->file); @@ -739,14 +741,16 @@ static int sock_hash_update_elem(struct bpf_map *map, void *key, ret = -EINVAL; goto out; } - if (!sock_map_sk_is_suitable(sk) || - sk->sk_state != TCP_ESTABLISHED) { + if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); - ret = sock_hash_update_common(map, key, sk, flags); + if (sk->sk_state != TCP_ESTABLISHED) + ret = -EOPNOTSUPP; + else + ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: fput(sock->file); -- cgit v1.2.3 From d95f1e8b462c4372ac409886070bb8719d8a4d3a Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 6 Feb 2020 11:29:06 +0100 Subject: bpftool: Don't crash on missing xlated program instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turns out the xlated program instructions can also be missing if kptr_restrict sysctl is set. This means that the previous fix to check the jited_prog_insns pointer was insufficient; add another check of the xlated_prog_insns pointer as well. Fixes: 5b79bcdf0362 ("bpftool: Don't crash on missing jited insns or ksyms") Fixes: cae73f233923 ("bpftool: use bpf_program__get_prog_info_linear() in prog.c:do_dump()") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200206102906.112551-1-toke@redhat.com --- tools/bpf/bpftool/prog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index a3521deca869..b352ab041160 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -536,7 +536,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, buf = (unsigned char *)(info->jited_prog_insns); member_len = info->jited_prog_len; } else { /* DUMP_XLATED */ - if (info->xlated_prog_len == 0) { + if (info->xlated_prog_len == 0 || !info->xlated_prog_insns) { p_err("error retrieving insn dump: kernel.kptr_restrict set?"); return -1; } -- cgit v1.2.3 From db6a5018b6e008c1d69c6628cdaa9541b8e70940 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 6 Feb 2020 12:16:50 +0100 Subject: bpf, sockmap: Don't sleep while holding RCU lock on tear-down rcu_read_lock is needed to protect access to psock inside sock_map_unref when tearing down the map. However, we can't afford to sleep in lock_sock while in RCU read-side critical section. Grab the RCU lock only after we have locked the socket. This fixes RCU warnings triggerable on a VM with 1 vCPU when free'ing a sockmap/sockhash that contains at least one socket: | ============================= | WARNING: suspicious RCU usage | 5.5.0-04005-g8fc91b972b73 #450 Not tainted | ----------------------------- | include/linux/rcupdate.h:272 Illegal context switch in RCU read-side critical section! | | other info that might help us debug this: | | | rcu_scheduler_active = 2, debug_locks = 1 | 4 locks held by kworker/0:1/62: | #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0 | #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0 | #2: ffffffff82065d20 (rcu_read_lock){....}, at: sock_map_free+0x5/0x170 | #3: ffff8881368c5df8 (&stab->lock){+...}, at: sock_map_free+0x64/0x170 | | stack backtrace: | CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04005-g8fc91b972b73 #450 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: events bpf_map_free_deferred | Call Trace: | dump_stack+0x71/0xa0 | ___might_sleep+0x105/0x190 | lock_sock_nested+0x28/0x90 | sock_map_free+0x95/0x170 | bpf_map_free_deferred+0x58/0x80 | process_one_work+0x260/0x5e0 | worker_thread+0x4d/0x3e0 | kthread+0x108/0x140 | ? process_one_work+0x5e0/0x5e0 | ? kthread_park+0x90/0x90 | ret_from_fork+0x3a/0x50 | ============================= | WARNING: suspicious RCU usage | 5.5.0-04005-g8fc91b972b73-dirty #452 Not tainted | ----------------------------- | include/linux/rcupdate.h:272 Illegal context switch in RCU read-side critical section! | | other info that might help us debug this: | | | rcu_scheduler_active = 2, debug_locks = 1 | 4 locks held by kworker/0:1/62: | #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0 | #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0 | #2: ffffffff82065d20 (rcu_read_lock){....}, at: sock_hash_free+0x5/0x1d0 | #3: ffff888139966e00 (&htab->buckets[i].lock){+...}, at: sock_hash_free+0x92/0x1d0 | | stack backtrace: | CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04005-g8fc91b972b73-dirty #452 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: events bpf_map_free_deferred | Call Trace: | dump_stack+0x71/0xa0 | ___might_sleep+0x105/0x190 | lock_sock_nested+0x28/0x90 | sock_hash_free+0xec/0x1d0 | bpf_map_free_deferred+0x58/0x80 | process_one_work+0x260/0x5e0 | worker_thread+0x4d/0x3e0 | kthread+0x108/0x140 | ? process_one_work+0x5e0/0x5e0 | ? kthread_park+0x90/0x90 | ret_from_fork+0x3a/0x50 Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200206111652.694507-2-jakub@cloudflare.com --- net/core/sock_map.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 36a2433e183f..992504478e68 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -234,7 +234,6 @@ static void sock_map_free(struct bpf_map *map) int i; synchronize_rcu(); - rcu_read_lock(); raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; @@ -243,12 +242,13 @@ static void sock_map_free(struct bpf_map *map) sk = xchg(psk, NULL); if (sk) { lock_sock(sk); + rcu_read_lock(); sock_map_unref(sk, psk); + rcu_read_unlock(); release_sock(sk); } } raw_spin_unlock_bh(&stab->lock); - rcu_read_unlock(); synchronize_rcu(); @@ -863,19 +863,19 @@ static void sock_hash_free(struct bpf_map *map) int i; synchronize_rcu(); - rcu_read_lock(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); raw_spin_lock_bh(&bucket->lock); hlist_for_each_entry_safe(elem, node, &bucket->head, node) { hlist_del_rcu(&elem->node); lock_sock(elem->sk); + rcu_read_lock(); sock_map_unref(elem->sk, elem); + rcu_read_unlock(); release_sock(elem->sk); } raw_spin_unlock_bh(&bucket->lock); } - rcu_read_unlock(); bpf_map_area_free(htab->buckets); kfree(htab); -- cgit v1.2.3 From 0b2dc83906cf1e694e48003eae5df8fa63f76fd9 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 6 Feb 2020 12:16:51 +0100 Subject: bpf, sockhash: Synchronize_rcu before free'ing map We need to have a synchronize_rcu before free'ing the sockhash because any outstanding psock references will have a pointer to the map and when they use it, this could trigger a use after free. This is a sister fix for sockhash, following commit 2bb90e5cc90e ("bpf: sockmap, synchronize_rcu before free'ing map") which addressed sockmap, which comes from a manual audit. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200206111652.694507-3-jakub@cloudflare.com --- net/core/sock_map.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 992504478e68..085cef5857bb 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -250,6 +250,7 @@ static void sock_map_free(struct bpf_map *map) } raw_spin_unlock_bh(&stab->lock); + /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); @@ -877,6 +878,9 @@ static void sock_hash_free(struct bpf_map *map) raw_spin_unlock_bh(&bucket->lock); } + /* wait for psock readers accessing its map link */ + synchronize_rcu(); + bpf_map_area_free(htab->buckets); kfree(htab); } -- cgit v1.2.3 From 5d3919a953c3c96c02fc7a337f8376cde43ae31f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 6 Feb 2020 12:16:52 +0100 Subject: selftests/bpf: Test freeing sockmap/sockhash with a socket in it Commit 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down") introduced sleeping issues inside RCU critical sections and while holding a spinlock on sockmap/sockhash tear-down. There has to be at least one socket in the map for the problem to surface. This adds a test that triggers the warnings for broken locking rules. Not a fix per se, but rather tooling to verify the accompanying fixes. Run on a VM with 1 vCPU to reproduce the warnings. Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200206111652.694507-4-jakub@cloudflare.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 74 ++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/sockmap_basic.c diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c new file mode 100644 index 000000000000..07f5b462c2ef --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Cloudflare + +#include "test_progs.h" + +static int connected_socket_v4(void) +{ + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_port = htons(80), + .sin_addr = { inet_addr("127.0.0.1") }, + }; + socklen_t len = sizeof(addr); + int s, repair, err; + + s = socket(AF_INET, SOCK_STREAM, 0); + if (CHECK_FAIL(s == -1)) + goto error; + + repair = TCP_REPAIR_ON; + err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair)); + if (CHECK_FAIL(err)) + goto error; + + err = connect(s, (struct sockaddr *)&addr, len); + if (CHECK_FAIL(err)) + goto error; + + repair = TCP_REPAIR_OFF_NO_WP; + err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair)); + if (CHECK_FAIL(err)) + goto error; + + return s; +error: + perror(__func__); + close(s); + return -1; +} + +/* Create a map, populate it with one socket, and free the map. */ +static void test_sockmap_create_update_free(enum bpf_map_type map_type) +{ + const int zero = 0; + int s, map, err; + + s = connected_socket_v4(); + if (CHECK_FAIL(s == -1)) + return; + + map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0); + if (CHECK_FAIL(map == -1)) { + perror("bpf_create_map"); + goto out; + } + + err = bpf_map_update_elem(map, &zero, &s, BPF_NOEXIST); + if (CHECK_FAIL(err)) { + perror("bpf_map_update"); + goto out; + } + +out: + close(map); + close(s); +} + +void test_sockmap_basic(void) +{ + if (test__start_subtest("sockmap create_update_free")) + test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash create_update_free")) + test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKHASH); +} -- cgit v1.2.3 From 88d6f130e5632bbf419a2e184ec7adcbe241260b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 7 Feb 2020 00:18:10 -0800 Subject: bpf: Improve bucket_log calculation logic It was reported that the max_t, ilog2, and roundup_pow_of_two macros have exponential effects on the number of states in the sparse checker. This patch breaks them up by calculating the "nbuckets" first so that the "bucket_log" only needs to take ilog2(). In addition, Linus mentioned: Patch looks good, but I'd like to point out that it's not just sparse. You can see it with a simple make net/core/bpf_sk_storage.i grep 'smap->bucket_log = ' net/core/bpf_sk_storage.i | wc and see the end result: 1 365071 2686974 That's one line (the assignment line) that is 2,686,974 characters in length. Now, sparse does happen to react particularly badly to that (I didn't look to why, but I suspect it's just that evaluating all the types that don't actually ever end up getting used ends up being much more expensive than it should be), but I bet it's not good for gcc either. Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage") Reported-by: Randy Dunlap Reported-by: Luc Van Oostenryck Suggested-by: Linus Torvalds Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Luc Van Oostenryck Link: https://lore.kernel.org/bpf/20200207081810.3918919-1-kafai@fb.com --- net/core/bpf_sk_storage.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 458be6b3eda9..3ab23f698221 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -643,9 +643,10 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); + nbuckets = roundup_pow_of_two(num_possible_cpus()); /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ - smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus()))); - nbuckets = 1U << smap->bucket_log; + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); ret = bpf_map_charge_init(&smap->map.memory, cost); -- cgit v1.2.3