diff options
author | Jakub Kicinski <kuba@kernel.org> | 2023-01-28 08:59:45 +0100 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2023-01-28 09:00:14 +0100 |
commit | 2d104c390f0d3901c4915dcb141cb96da96cffe7 (patch) | |
tree | 6c74e81297ef3ae095ff69ab90fea64816babbae /kernel/bpf | |
parent | netpoll: Remove 4s sleep during carrier detection (diff) | |
parent | selftest/bpf: Make crashes more debuggable in test_progs (diff) | |
download | linux-2d104c390f0d3901c4915dcb141cb96da96cffe7.tar.xz linux-2d104c390f0d3901c4915dcb141cb96da96cffe7.zip |
Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:
====================
bpf-next 2023-01-28
We've added 124 non-merge commits during the last 22 day(s) which contain
a total of 124 files changed, 6386 insertions(+), 1827 deletions(-).
The main changes are:
1) Implement XDP hints via kfuncs with initial support for RX hash and
timestamp metadata kfuncs, from Stanislav Fomichev and
Toke Høiland-Jørgensen.
Measurements on overhead: https://lore.kernel.org/bpf/875yellcx6.fsf@toke.dk
2) Extend libbpf's bpf_tracing.h support for tracing arguments of
kprobes/uprobes and syscall as a special case, from Andrii Nakryiko.
3) Significantly reduce the search time for module symbols by livepatch
and BPF, from Jiri Olsa and Zhen Lei.
4) Enable cpumasks to be used as kptrs, which is useful for tracing
programs tracking which tasks end up running on which CPUs
in different time intervals, from David Vernet.
5) Fix several issues in the dynptr processing such as stack slot liveness
propagation, missing checks for PTR_TO_STACK variable offset, etc,
from Kumar Kartikeya Dwivedi.
6) Various performance improvements, fixes, and introduction of more
than just one XDP program to XSK selftests, from Magnus Karlsson.
7) Big batch to BPF samples to reduce deprecated functionality,
from Daniel T. Lee.
8) Enable struct_ops programs to be sleepable in verifier,
from David Vernet.
9) Reduce pr_warn() noise on BTF mismatches when they are expected under
the CONFIG_MODULE_ALLOW_BTF_MISMATCH config anyway, from Connor O'Brien.
10) Describe modulo and division by zero behavior of the BPF runtime
in BPF's instruction specification document, from Dave Thaler.
11) Several improvements to libbpf API documentation in libbpf.h,
from Grant Seltzer.
12) Improve resolve_btfids header dependencies related to subcmd and add
proper support for HOSTCC, from Ian Rogers.
13) Add ipip6 and ip6ip decapsulation support for bpf_skb_adjust_room()
helper along with BPF selftests, from Ziyang Xuan.
14) Simplify the parsing logic of structure parameters for BPF trampoline
in the x86-64 JIT compiler, from Pu Lehui.
15) Get BTF working for kernels with CONFIG_RUST enabled by excluding
Rust compilation units with pahole, from Martin Rodriguez Reboredo.
16) Get bpf_setsockopt() working for kTLS on top of TCP sockets,
from Kui-Feng Lee.
17) Disable stack protection for BPF objects in bpftool given BPF backends
don't support it, from Holger Hoffstätte.
* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (124 commits)
selftest/bpf: Make crashes more debuggable in test_progs
libbpf: Add documentation to map pinning API functions
libbpf: Fix malformed documentation formatting
selftests/bpf: Properly enable hwtstamp in xdp_hw_metadata
selftests/bpf: Calls bpf_setsockopt() on a ktls enabled socket.
bpf: Check the protocol of a sock to agree the calls to bpf_setsockopt().
bpf/selftests: Verify struct_ops prog sleepable behavior
bpf: Pass const struct bpf_prog * to .check_member
libbpf: Support sleepable struct_ops.s section
bpf: Allow BPF_PROG_TYPE_STRUCT_OPS programs to be sleepable
selftests/bpf: Fix vmtest static compilation error
tools/resolve_btfids: Alter how HOSTCC is forced
tools/resolve_btfids: Install subcmd headers
bpf/docs: Document the nocast aliasing behavior of ___init
bpf/docs: Document how nested trusted fields may be defined
bpf/docs: Document cpumask kfuncs in a new file
selftests/bpf: Add selftest suite for cpumask kfuncs
selftests/bpf: Add nested trust selftests suite
bpf: Enable cpumasks to be queried and used as kptrs
bpf: Disallow NULLable pointers for trusted kfuncs
...
====================
Link: https://lore.kernel.org/r/20230128004827.21371-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 1 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 160 | ||||
-rw-r--r-- | kernel/bpf/core.c | 12 | ||||
-rw-r--r-- | kernel/bpf/cpumask.c | 476 | ||||
-rw-r--r-- | kernel/bpf/offload.c | 419 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 34 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 544 |
7 files changed, 1399 insertions, 247 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3a12e6b400a2..02242614dcc7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +obj-$(CONFIG_BPF_SYSCALL) += cpumask.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 578cee398550..47b8cb96f2c2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -337,6 +337,12 @@ const char *btf_type_str(const struct btf_type *t) #define BTF_SHOW_NAME_SIZE 80 /* + * The suffix of a type that indicates it cannot alias another type when + * comparing BTF IDs for kfunc invocations. + */ +#define NOCAST_ALIAS_SUFFIX "___init" + +/* * Common data to all BTF show operations. Private show functions can add * their own data to a structure containing a struct btf_show and consult it * in the show callback. See btf_type_show() below. @@ -1397,12 +1403,18 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; - /* btf verifier prints all types it is processing via - * btf_verifier_log_type(..., fmt = NULL). - * Skip those prints for in-kernel BTF verification. - */ - if (log->level == BPF_LOG_KERNEL && !fmt) - return; + if (log->level == BPF_LOG_KERNEL) { + /* btf verifier prints all types it is processing via + * btf_verifier_log_type(..., fmt = NULL). + * Skip those prints for in-kernel BTF verification. + */ + if (!fmt) + return; + + /* Skip logging when loading module BTF with mismatches permitted */ + if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + return; + } __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, @@ -1441,8 +1453,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; - if (log->level == BPF_LOG_KERNEL && !fmt) - return; + if (log->level == BPF_LOG_KERNEL) { + if (!fmt) + return; + + /* Skip logging when loading module BTF with mismatches permitted */ + if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + return; + } + /* The CHECK_META phase already did a btf dump. * * If member is logged again, it must hit an error in @@ -7261,11 +7280,14 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, } btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size); if (IS_ERR(btf)) { - pr_warn("failed to validate module [%s] BTF: %ld\n", - mod->name, PTR_ERR(btf)); kfree(btf_mod); - if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) { + pr_warn("failed to validate module [%s] BTF: %ld\n", + mod->name, PTR_ERR(btf)); err = PTR_ERR(btf); + } else { + pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n"); + } goto out; } err = btf_alloc_id(btf); @@ -8211,3 +8233,119 @@ out: } return err; } + +bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off) +{ + struct btf *btf = reg->btf; + const struct btf_type *walk_type, *safe_type; + const char *tname; + char safe_tname[64]; + long ret, safe_id; + const struct btf_member *member, *m_walk = NULL; + u32 i; + const char *walk_name; + + walk_type = btf_type_by_id(btf, reg->btf_id); + if (!walk_type) + return false; + + tname = btf_name_by_offset(btf, walk_type->name_off); + + ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname); + if (ret < 0) + return false; + + safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info)); + if (safe_id < 0) + return false; + + safe_type = btf_type_by_id(btf, safe_id); + if (!safe_type) + return false; + + for_each_member(i, walk_type, member) { + u32 moff; + + /* We're looking for the PTR_TO_BTF_ID member in the struct + * type we're walking which matches the specified offset. + * Below, we'll iterate over the fields in the safe variant of + * the struct and see if any of them has a matching type / + * name. + */ + moff = __btf_member_bit_offset(walk_type, member) / 8; + if (off == moff) { + m_walk = member; + break; + } + } + if (m_walk == NULL) + return false; + + walk_name = __btf_name_by_offset(btf, m_walk->name_off); + for_each_member(i, safe_type, member) { + const char *m_name = __btf_name_by_offset(btf, member->name_off); + + /* If we match on both type and name, the field is considered trusted. */ + if (m_walk->type == member->type && !strcmp(walk_name, m_name)) + return true; + } + + return false; +} + +bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, + const struct btf *reg_btf, u32 reg_id, + const struct btf *arg_btf, u32 arg_id) +{ + const char *reg_name, *arg_name, *search_needle; + const struct btf_type *reg_type, *arg_type; + int reg_len, arg_len, cmp_len; + size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char); + + reg_type = btf_type_by_id(reg_btf, reg_id); + if (!reg_type) + return false; + + arg_type = btf_type_by_id(arg_btf, arg_id); + if (!arg_type) + return false; + + reg_name = btf_name_by_offset(reg_btf, reg_type->name_off); + arg_name = btf_name_by_offset(arg_btf, arg_type->name_off); + + reg_len = strlen(reg_name); + arg_len = strlen(arg_name); + + /* Exactly one of the two type names may be suffixed with ___init, so + * if the strings are the same size, they can't possibly be no-cast + * aliases of one another. If you have two of the same type names, e.g. + * they're both nf_conn___init, it would be improper to return true + * because they are _not_ no-cast aliases, they are the same type. + */ + if (reg_len == arg_len) + return false; + + /* Either of the two names must be the other name, suffixed with ___init. */ + if ((reg_len != arg_len + pattern_len) && + (arg_len != reg_len + pattern_len)) + return false; + + if (reg_len < arg_len) { + search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX); + cmp_len = reg_len; + } else { + search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX); + cmp_len = arg_len; + } + + if (!search_needle) + return false; + + /* ___init suffix must come at the end of the name */ + if (*(search_needle + pattern_len) != '\0') + return false; + + return !strncmp(reg_name, arg_name, cmp_len); +} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba3fff17e2f9..16da51093aff 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2096,6 +2096,14 @@ bool bpf_prog_map_compatible(struct bpf_map *map, if (fp->kprobe_override) return false; + /* XDP programs inserted into maps are not guaranteed to run on + * a particular netdev (and can run outside driver context entirely + * in the case of devmap and cpumap). Until device checks + * are implemented, prohibit adding dev-bound programs to program maps. + */ + if (bpf_prog_is_dev_bound(fp->aux)) + return false; + spin_lock(&map->owner.lock); if (!map->owner.type) { /* There's no owner yet where we could check for @@ -2182,7 +2190,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. */ - if (!bpf_prog_is_dev_bound(fp->aux)) { + if (!bpf_prog_is_offloaded(fp->aux)) { *err = bpf_prog_alloc_jited_linfo(fp); if (*err) return fp; @@ -2554,7 +2562,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) bpf_free_used_maps(aux); bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) - bpf_prog_offload_destroy(aux->prog); + bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS if (aux->prog->has_callchain_buf) put_callchain_buffers(); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c new file mode 100644 index 000000000000..25355a0a367a --- /dev/null +++ b/kernel/bpf/cpumask.c @@ -0,0 +1,476 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2023 Meta, Inc */ +#include <linux/bpf.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> +#include <linux/cpumask.h> + +/** + * struct bpf_cpumask - refcounted BPF cpumask wrapper structure + * @cpumask: The actual cpumask embedded in the struct. + * @usage: Object reference counter. When the refcount goes to 0, the + * memory is released back to the BPF allocator, which provides + * RCU safety. + * + * Note that we explicitly embed a cpumask_t rather than a cpumask_var_t. This + * is done to avoid confusing the verifier due to the typedef of cpumask_var_t + * changing depending on whether CONFIG_CPUMASK_OFFSTACK is defined or not. See + * the details in <linux/cpumask.h>. The consequence is that this structure is + * likely a bit larger than it needs to be when CONFIG_CPUMASK_OFFSTACK is + * defined due to embedding the whole NR_CPUS-size bitmap, but the extra memory + * overhead is minimal. For the more typical case of CONFIG_CPUMASK_OFFSTACK + * not being defined, the structure is the same size regardless. + */ +struct bpf_cpumask { + cpumask_t cpumask; + refcount_t usage; +}; + +static struct bpf_mem_alloc bpf_cpumask_ma; + +static bool cpu_valid(u32 cpu) +{ + return cpu < nr_cpu_ids; +} + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global kfuncs as their definitions will be in BTF"); + +/** + * bpf_cpumask_create() - Create a mutable BPF cpumask. + * + * Allocates a cpumask that can be queried, mutated, acquired, and released by + * a BPF program. The cpumask returned by this function must either be embedded + * in a map as a kptr, or freed with bpf_cpumask_release(). + * + * bpf_cpumask_create() allocates memory using the BPF memory allocator, and + * will not block. It may return NULL if no memory is available. + */ +struct bpf_cpumask *bpf_cpumask_create(void) +{ + struct bpf_cpumask *cpumask; + + cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask)); + if (!cpumask) + return NULL; + + memset(cpumask, 0, sizeof(*cpumask)); + refcount_set(&cpumask->usage, 1); + + return cpumask; +} + +/** + * bpf_cpumask_acquire() - Acquire a reference to a BPF cpumask. + * @cpumask: The BPF cpumask being acquired. The cpumask must be a trusted + * pointer. + * + * Acquires a reference to a BPF cpumask. The cpumask returned by this function + * must either be embedded in a map as a kptr, or freed with + * bpf_cpumask_release(). + */ +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) +{ + refcount_inc(&cpumask->usage); + return cpumask; +} + +/** + * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask + * stored in a map. + * @cpumaskp: A pointer to a BPF cpumask map value. + * + * Attempts to acquire a reference to a BPF cpumask stored in a map value. The + * cpumask returned by this function must either be embedded in a map as a + * kptr, or freed with bpf_cpumask_release(). This function may return NULL if + * no BPF cpumask was found in the specified map value. + */ +struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp) +{ + struct bpf_cpumask *cpumask; + + /* The BPF memory allocator frees memory backing its caches in an RCU + * callback. Thus, we can safely use RCU to ensure that the cpumask is + * safe to read. + */ + rcu_read_lock(); + + cpumask = READ_ONCE(*cpumaskp); + if (cpumask && !refcount_inc_not_zero(&cpumask->usage)) + cpumask = NULL; + + rcu_read_unlock(); + return cpumask; +} + +/** + * bpf_cpumask_release() - Release a previously acquired BPF cpumask. + * @cpumask: The cpumask being released. + * + * Releases a previously acquired reference to a BPF cpumask. When the final + * reference of the BPF cpumask has been released, it is subsequently freed in + * an RCU callback in the BPF memory allocator. + */ +void bpf_cpumask_release(struct bpf_cpumask *cpumask) +{ + if (!cpumask) + return; + + if (refcount_dec_and_test(&cpumask->usage)) { + migrate_disable(); + bpf_mem_free(&bpf_cpumask_ma, cpumask); + migrate_enable(); + } +} + +/** + * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask. + * @cpumask: The cpumask being queried. + * + * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask + * pointer may be safely passed to this function. + */ +u32 bpf_cpumask_first(const struct cpumask *cpumask) +{ + return cpumask_first(cpumask); +} + +/** + * bpf_cpumask_first_zero() - Get the index of the first unset bit in the + * cpumask. + * @cpumask: The cpumask being queried. + * + * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask + * pointer may be safely passed to this function. + */ +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) +{ + return cpumask_first_zero(cpumask); +} + +/** + * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask. + * @cpu: The CPU to be set in the cpumask. + * @cpumask: The BPF cpumask in which a bit is being set. + */ +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ + if (!cpu_valid(cpu)) + return; + + cpumask_set_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_clear_cpu() - Clear a bit for a CPU in a BPF cpumask. + * @cpu: The CPU to be cleared from the cpumask. + * @cpumask: The BPF cpumask in which a bit is being cleared. + */ +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ + if (!cpu_valid(cpu)) + return; + + cpumask_clear_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_test_cpu() - Test whether a CPU is set in a cpumask. + * @cpu: The CPU being queried for. + * @cpumask: The cpumask being queried for containing a CPU. + * + * Return: + * * true - @cpu is set in the cpumask + * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu. + */ +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) +{ + if (!cpu_valid(cpu)) + return false; + + return cpumask_test_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_test_and_set_cpu() - Atomically test and set a CPU in a BPF cpumask. + * @cpu: The CPU being set and queried for. + * @cpumask: The BPF cpumask being set and queried for containing a CPU. + * + * Return: + * * true - @cpu is set in the cpumask + * * false - @cpu was not set in the cpumask, or @cpu is invalid. + */ +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ + if (!cpu_valid(cpu)) + return false; + + return cpumask_test_and_set_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_test_and_clear_cpu() - Atomically test and clear a CPU in a BPF + * cpumask. + * @cpu: The CPU being cleared and queried for. + * @cpumask: The BPF cpumask being cleared and queried for containing a CPU. + * + * Return: + * * true - @cpu is set in the cpumask + * * false - @cpu was not set in the cpumask, or @cpu is invalid. + */ +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ + if (!cpu_valid(cpu)) + return false; + + return cpumask_test_and_clear_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask. + * @cpumask: The BPF cpumask having all of its bits set. + */ +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) +{ + cpumask_setall((struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_clear() - Clear all of the bits in a BPF cpumask. + * @cpumask: The BPF cpumask being cleared. + */ +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) +{ + cpumask_clear((struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_and() - AND two cpumasks and store the result. + * @dst: The BPF cpumask where the result is being stored. + * @src1: The first input. + * @src2: The second input. + * + * Return: + * * true - @dst has at least one bit set following the operation + * * false - @dst is empty following the operation + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +bool bpf_cpumask_and(struct bpf_cpumask *dst, + const struct cpumask *src1, + const struct cpumask *src2) +{ + return cpumask_and((struct cpumask *)dst, src1, src2); +} + +/** + * bpf_cpumask_or() - OR two cpumasks and store the result. + * @dst: The BPF cpumask where the result is being stored. + * @src1: The first input. + * @src2: The second input. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +void bpf_cpumask_or(struct bpf_cpumask *dst, + const struct cpumask *src1, + const struct cpumask *src2) +{ + cpumask_or((struct cpumask *)dst, src1, src2); +} + +/** + * bpf_cpumask_xor() - XOR two cpumasks and store the result. + * @dst: The BPF cpumask where the result is being stored. + * @src1: The first input. + * @src2: The second input. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +void bpf_cpumask_xor(struct bpf_cpumask *dst, + const struct cpumask *src1, + const struct cpumask *src2) +{ + cpumask_xor((struct cpumask *)dst, src1, src2); +} + +/** + * bpf_cpumask_equal() - Check two cpumasks for equality. + * @src1: The first input. + * @src2: The second input. + * + * Return: + * * true - @src1 and @src2 have the same bits set. + * * false - @src1 and @src2 differ in at least one bit. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) +{ + return cpumask_equal(src1, src2); +} + +/** + * bpf_cpumask_intersects() - Check two cpumasks for overlap. + * @src1: The first input. + * @src2: The second input. + * + * Return: + * * true - @src1 and @src2 have at least one of the same bits set. + * * false - @src1 and @src2 don't have any of the same bits set. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) +{ + return cpumask_intersects(src1, src2); +} + +/** + * bpf_cpumask_subset() - Check if a cpumask is a subset of another. + * @src1: The first cpumask being checked as a subset. + * @src2: The second cpumask being checked as a superset. + * + * Return: + * * true - All of the bits of @src1 are set in @src2. + * * false - At least one bit in @src1 is not set in @src2. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) +{ + return cpumask_subset(src1, src2); +} + +/** + * bpf_cpumask_empty() - Check if a cpumask is empty. + * @cpumask: The cpumask being checked. + * + * Return: + * * true - None of the bits in @cpumask are set. + * * false - At least one bit in @cpumask is set. + * + * A struct bpf_cpumask pointer may be safely passed to @cpumask. + */ +bool bpf_cpumask_empty(const struct cpumask *cpumask) +{ + return cpumask_empty(cpumask); +} + +/** + * bpf_cpumask_full() - Check if a cpumask has all bits set. + * @cpumask: The cpumask being checked. + * + * Return: + * * true - All of the bits in @cpumask are set. + * * false - At least one bit in @cpumask is cleared. + * + * A struct bpf_cpumask pointer may be safely passed to @cpumask. + */ +bool bpf_cpumask_full(const struct cpumask *cpumask) +{ + return cpumask_full(cpumask); +} + +/** + * bpf_cpumask_copy() - Copy the contents of a cpumask into a BPF cpumask. + * @dst: The BPF cpumask being copied into. + * @src: The cpumask being copied. + * + * A struct bpf_cpumask pointer may be safely passed to @src. + */ +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) +{ + cpumask_copy((struct cpumask *)dst, src); +} + +/** + * bpf_cpumask_any() - Return a random set CPU from a cpumask. + * @cpumask: The cpumask being queried. + * + * Return: + * * A random set bit within [0, num_cpus) if at least one bit is set. + * * >= num_cpus if no bit is set. + * + * A struct bpf_cpumask pointer may be safely passed to @src. + */ +u32 bpf_cpumask_any(const struct cpumask *cpumask) +{ + return cpumask_any(cpumask); +} + +/** + * bpf_cpumask_any_and() - Return a random set CPU from the AND of two + * cpumasks. + * @src1: The first cpumask. + * @src2: The second cpumask. + * + * Return: + * * A random set bit within [0, num_cpus) if at least one bit is set. + * * >= num_cpus if no bit is set. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) +{ + return cpumask_any_and(src1, src2); +} + +__diag_pop(); + +BTF_SET8_START(cpumask_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS) +BTF_SET8_END(cpumask_kfunc_btf_ids) + +static const struct btf_kfunc_id_set cpumask_kfunc_set = { + .owner = THIS_MODULE, + .set = &cpumask_kfunc_btf_ids, +}; + +BTF_ID_LIST(cpumask_dtor_ids) +BTF_ID(struct, bpf_cpumask) +BTF_ID(func, bpf_cpumask_release) + +static int __init cpumask_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc cpumask_dtors[] = { + { + .btf_id = cpumask_dtor_ids[0], + .kfunc_btf_id = cpumask_dtor_ids[1] + }, + }; + + ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set); + return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, + ARRAY_SIZE(cpumask_dtors), + THIS_MODULE); +} + +late_initcall(cpumask_kfunc_init); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 190d9f9dc987..88aae38fde66 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -41,7 +41,7 @@ struct bpf_offload_dev { struct bpf_offload_netdev { struct rhash_head l; struct net_device *netdev; - struct bpf_offload_dev *offdev; + struct bpf_offload_dev *offdev; /* NULL when bound-only */ struct list_head progs; struct list_head maps; struct list_head offdev_netdevs; @@ -56,7 +56,6 @@ static const struct rhashtable_params offdevs_params = { }; static struct rhashtable offdevs; -static bool offdevs_inited; static int bpf_dev_offload_check(struct net_device *netdev) { @@ -72,58 +71,218 @@ bpf_offload_find_netdev(struct net_device *netdev) { lockdep_assert_held(&bpf_devs_lock); - if (!offdevs_inited) - return NULL; return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); } -int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev) { struct bpf_offload_netdev *ondev; - struct bpf_prog_offload *offload; int err; - if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && - attr->prog_type != BPF_PROG_TYPE_XDP) - return -EINVAL; + ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); + if (!ondev) + return -ENOMEM; - if (attr->prog_flags) - return -EINVAL; + ondev->netdev = netdev; + ondev->offdev = offdev; + INIT_LIST_HEAD(&ondev->progs); + INIT_LIST_HEAD(&ondev->maps); + + err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); + if (err) { + netdev_warn(netdev, "failed to register for BPF offload\n"); + goto err_free; + } + + if (offdev) + list_add(&ondev->offdev_netdevs, &offdev->netdevs); + return 0; + +err_free: + kfree(ondev); + return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_prog_offload *offload = prog->aux->offload; + + if (offload->dev_state) + offload->offdev->ops->destroy(prog); + + list_del_init(&offload->offloads); + kfree(offload); + prog->aux->offload = NULL; +} + +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, + enum bpf_netdev_command cmd) +{ + struct netdev_bpf data = {}; + struct net_device *netdev; + + ASSERT_RTNL(); + + data.command = cmd; + data.offmap = offmap; + /* Caller must make sure netdev is valid */ + netdev = offmap->netdev; + + return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ + WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); + /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ + bpf_map_free_id(&offmap->map, true); + list_del_init(&offmap->offloads); + offmap->netdev = NULL; +} + +static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev) +{ + struct bpf_offload_netdev *ondev, *altdev = NULL; + struct bpf_offloaded_map *offmap, *mtmp; + struct bpf_prog_offload *offload, *ptmp; + + ASSERT_RTNL(); + + ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); + if (WARN_ON(!ondev)) + return; + + WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); + + /* Try to move the objects to another netdev of the device */ + if (offdev) { + list_del(&ondev->offdev_netdevs); + altdev = list_first_entry_or_null(&offdev->netdevs, + struct bpf_offload_netdev, + offdev_netdevs); + } + + if (altdev) { + list_for_each_entry(offload, &ondev->progs, offloads) + offload->netdev = altdev->netdev; + list_splice_init(&ondev->progs, &altdev->progs); + + list_for_each_entry(offmap, &ondev->maps, offloads) + offmap->netdev = altdev->netdev; + list_splice_init(&ondev->maps, &altdev->maps); + } else { + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); + } + + WARN_ON(!list_empty(&ondev->progs)); + WARN_ON(!list_empty(&ondev->maps)); + kfree(ondev); +} + +static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev) +{ + struct bpf_offload_netdev *ondev; + struct bpf_prog_offload *offload; + int err; offload = kzalloc(sizeof(*offload), GFP_USER); if (!offload) return -ENOMEM; offload->prog = prog; + offload->netdev = netdev; - offload->netdev = dev_get_by_index(current->nsproxy->net_ns, - attr->prog_ifindex); - err = bpf_dev_offload_check(offload->netdev); - if (err) - goto err_maybe_put; - - down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(offload->netdev); if (!ondev) { - err = -EINVAL; - goto err_unlock; + if (bpf_prog_is_offloaded(prog->aux)) { + err = -EINVAL; + goto err_free; + } + + /* When only binding to the device, explicitly + * create an entry in the hashtable. + */ + err = __bpf_offload_dev_netdev_register(NULL, offload->netdev); + if (err) + goto err_free; + ondev = bpf_offload_find_netdev(offload->netdev); } offload->offdev = ondev->offdev; prog->aux->offload = offload; list_add_tail(&offload->offloads, &ondev->progs); - dev_put(offload->netdev); - up_write(&bpf_devs_lock); return 0; -err_unlock: - up_write(&bpf_devs_lock); -err_maybe_put: - if (offload->netdev) - dev_put(offload->netdev); +err_free: kfree(offload); return err; } +int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) +{ + struct net_device *netdev; + int err; + + if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && + attr->prog_type != BPF_PROG_TYPE_XDP) + return -EINVAL; + + if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY) + return -EINVAL; + + if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS && + attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY) + return -EINVAL; + + netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); + if (!netdev) + return -EINVAL; + + err = bpf_dev_offload_check(netdev); + if (err) + goto out; + + prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY); + + down_write(&bpf_devs_lock); + err = __bpf_prog_dev_bound_init(prog, netdev); + up_write(&bpf_devs_lock); + +out: + dev_put(netdev); + return err; +} + +int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog) +{ + int err; + + if (!bpf_prog_is_dev_bound(old_prog->aux)) + return 0; + + if (bpf_prog_is_offloaded(old_prog->aux)) + return -EINVAL; + + new_prog->aux->dev_bound = old_prog->aux->dev_bound; + new_prog->aux->offload_requested = old_prog->aux->offload_requested; + + down_write(&bpf_devs_lock); + if (!old_prog->aux->offload) { + err = -EINVAL; + goto out; + } + + err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev); + +out: + up_write(&bpf_devs_lock); + return err; +} + int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) { struct bpf_prog_offload *offload; @@ -209,24 +368,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) up_read(&bpf_devs_lock); } -static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +void bpf_prog_dev_bound_destroy(struct bpf_prog *prog) { - struct bpf_prog_offload *offload = prog->aux->offload; - - if (offload->dev_state) - offload->offdev->ops->destroy(prog); - - list_del_init(&offload->offloads); - kfree(offload); - prog->aux->offload = NULL; -} + struct bpf_offload_netdev *ondev; + struct net_device *netdev; -void bpf_prog_offload_destroy(struct bpf_prog *prog) -{ + rtnl_lock(); down_write(&bpf_devs_lock); - if (prog->aux->offload) + if (prog->aux->offload) { + list_del_init(&prog->aux->offload->offloads); + + netdev = prog->aux->offload->netdev; __bpf_prog_offload_destroy(prog); + + ondev = bpf_offload_find_netdev(netdev); + if (!ondev->offdev && list_empty(&ondev->progs)) + __bpf_offload_dev_netdev_unregister(NULL, netdev); + } up_write(&bpf_devs_lock); + rtnl_unlock(); } static int bpf_prog_offload_translate(struct bpf_prog *prog) @@ -340,22 +500,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, const struct bpf_prog_ops bpf_offload_prog_ops = { }; -static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, - enum bpf_netdev_command cmd) -{ - struct netdev_bpf data = {}; - struct net_device *netdev; - - ASSERT_RTNL(); - - data.command = cmd; - data.offmap = offmap; - /* Caller must make sure netdev is valid */ - netdev = offmap->netdev; - - return netdev->netdev_ops->ndo_bpf(netdev, &data); -} - struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; @@ -405,15 +549,6 @@ err_unlock: return ERR_PTR(err); } -static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) -{ - WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); - /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ - bpf_map_free_id(&offmap->map, true); - list_del_init(&offmap->offloads); - offmap->netdev = NULL; -} - void bpf_map_offload_map_free(struct bpf_map *map) { struct bpf_offloaded_map *offmap = map_to_offmap(map); @@ -573,12 +708,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) } EXPORT_SYMBOL_GPL(bpf_offload_dev_match); +bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) +{ + bool ret; + + if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux)) + return false; + + down_read(&bpf_devs_lock); + ret = lhs->aux->offload && rhs->aux->offload && + lhs->aux->offload->netdev && + lhs->aux->offload->netdev == rhs->aux->offload->netdev; + up_read(&bpf_devs_lock); + + return ret; +} + bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; bool ret; - if (!bpf_map_is_dev_bound(map)) + if (!bpf_map_is_offloaded(map)) return bpf_map_offload_neutral(map); offmap = map_to_offmap(map); @@ -592,32 +743,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev) { - struct bpf_offload_netdev *ondev; int err; - ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); - if (!ondev) - return -ENOMEM; - - ondev->netdev = netdev; - ondev->offdev = offdev; - INIT_LIST_HEAD(&ondev->progs); - INIT_LIST_HEAD(&ondev->maps); - down_write(&bpf_devs_lock); - err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); - if (err) { - netdev_warn(netdev, "failed to register for BPF offload\n"); - goto err_unlock_free; - } - - list_add(&ondev->offdev_netdevs, &offdev->netdevs); + err = __bpf_offload_dev_netdev_register(offdev, netdev); up_write(&bpf_devs_lock); - return 0; - -err_unlock_free: - up_write(&bpf_devs_lock); - kfree(ondev); return err; } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); @@ -625,43 +755,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev) { - struct bpf_offload_netdev *ondev, *altdev; - struct bpf_offloaded_map *offmap, *mtmp; - struct bpf_prog_offload *offload, *ptmp; - - ASSERT_RTNL(); - down_write(&bpf_devs_lock); - ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); - if (WARN_ON(!ondev)) - goto unlock; - - WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); - list_del(&ondev->offdev_netdevs); - - /* Try to move the objects to another netdev of the device */ - altdev = list_first_entry_or_null(&offdev->netdevs, - struct bpf_offload_netdev, - offdev_netdevs); - if (altdev) { - list_for_each_entry(offload, &ondev->progs, offloads) - offload->netdev = altdev->netdev; - list_splice_init(&ondev->progs, &altdev->progs); - - list_for_each_entry(offmap, &ondev->maps, offloads) - offmap->netdev = altdev->netdev; - list_splice_init(&ondev->maps, &altdev->maps); - } else { - list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) - __bpf_prog_offload_destroy(offload->prog); - list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) - __bpf_map_offload_destroy(offmap); - } - - WARN_ON(!list_empty(&ondev->progs)); - WARN_ON(!list_empty(&ondev->maps)); - kfree(ondev); -unlock: + __bpf_offload_dev_netdev_unregister(offdev, netdev); up_write(&bpf_devs_lock); } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); @@ -670,18 +765,6 @@ struct bpf_offload_dev * bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; - int err; - - down_write(&bpf_devs_lock); - if (!offdevs_inited) { - err = rhashtable_init(&offdevs, &offdevs_params); - if (err) { - up_write(&bpf_devs_lock); - return ERR_PTR(err); - } - offdevs_inited = true; - } - up_write(&bpf_devs_lock); offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); if (!offdev) @@ -707,3 +790,67 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) return offdev->priv; } EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); + +void bpf_dev_bound_netdev_unregister(struct net_device *dev) +{ + struct bpf_offload_netdev *ondev; + + ASSERT_RTNL(); + + down_write(&bpf_devs_lock); + ondev = bpf_offload_find_netdev(dev); + if (ondev && !ondev->offdev) + __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev); + up_write(&bpf_devs_lock); +} + +int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, + struct bpf_prog_aux *prog_aux) +{ + if (!bpf_prog_is_dev_bound(prog_aux)) { + bpf_log(log, "metadata kfuncs require device-bound program\n"); + return -EINVAL; + } + + if (bpf_prog_is_offloaded(prog_aux)) { + bpf_log(log, "metadata kfuncs can't be offloaded\n"); + return -EINVAL; + } + + return 0; +} + +void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) +{ + const struct xdp_metadata_ops *ops; + void *p = NULL; + + /* We don't hold bpf_devs_lock while resolving several + * kfuncs and can race with the unregister_netdevice(). + * We rely on bpf_dev_bound_match() check at attach + * to render this program unusable. + */ + down_read(&bpf_devs_lock); + if (!prog->aux->offload) + goto out; + + ops = prog->aux->offload->netdev->xdp_metadata_ops; + if (!ops) + goto out; + + if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) + p = ops->xmo_rx_timestamp; + else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH)) + p = ops->xmo_rx_hash; +out: + up_read(&bpf_devs_lock); + + return p; +} + +static int __init bpf_offload_init(void) +{ + return rhashtable_init(&offdevs, &offdevs_params); +} + +late_initcall(bpf_offload_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bf384b3346d6..99417b387547 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -181,7 +181,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, int err; /* Need to create a kthread, thus must support schedule */ - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { @@ -238,7 +238,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, void *ptr; int err; - if (bpf_map_is_dev_bound(map)) + if (bpf_map_is_offloaded(map)) return bpf_map_offload_lookup_elem(map, key, value); bpf_disable_instrumentation(); @@ -1483,7 +1483,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; } else if (IS_FD_PROG_ARRAY(map) || @@ -1547,7 +1547,7 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_get_next_key(map, key, next_key); goto out; } @@ -1605,7 +1605,7 @@ int generic_map_delete_batch(struct bpf_map *map, map->key_size)) break; - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); break; } @@ -1851,7 +1851,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - if (!bpf_map_is_dev_bound(map)) { + if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); @@ -1944,7 +1944,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) if (!ops) return -EINVAL; - if (!bpf_prog_is_dev_bound(prog->aux)) + if (!bpf_prog_is_offloaded(prog->aux)) prog->aux->ops = ops; else prog->aux->ops = &bpf_offload_prog_ops; @@ -2245,7 +2245,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog, if (prog->type != *attach_type) return false; - if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) + if (bpf_prog_is_offloaded(prog->aux) && !attach_drv) return false; return true; @@ -2481,7 +2481,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32 | - BPF_F_XDP_HAS_FRAGS)) + BPF_F_XDP_HAS_FRAGS | + BPF_F_XDP_DEV_BOUND_ONLY)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2565,7 +2566,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; prog->aux->dst_prog = dst_prog; - prog->aux->offload_requested = !!attr->prog_ifindex; + prog->aux->dev_bound = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; @@ -2589,7 +2590,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->gpl_compatible = is_gpl ? 1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { - err = bpf_prog_offload_init(prog, attr); + err = bpf_prog_dev_bound_init(prog, attr); + if (err) + goto free_prog_sec; + } + + if (type == BPF_PROG_TYPE_EXT && dst_prog && + bpf_prog_is_dev_bound(dst_prog->aux)) { + err = bpf_prog_dev_bound_inherit(prog, dst_prog); if (err) goto free_prog_sec; } @@ -3987,7 +3995,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, return -EFAULT; } - if (bpf_prog_is_dev_bound(prog->aux)) { + if (bpf_prog_is_offloaded(prog->aux)) { err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; @@ -4215,7 +4223,7 @@ static int bpf_map_get_info_by_fd(struct file *file, } info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_info_fill(&info, map); if (err) return err; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ba62f98d3c59..4cc0e70ee71e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -255,6 +255,7 @@ struct bpf_call_arg_meta { int mem_size; u64 msize_max_value; int ref_obj_id; + int dynptr_id; int map_uid; int func_id; struct btf *btf; @@ -638,31 +639,57 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "D"); } -static int get_spi(s32 off) +static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; } +static struct bpf_func_state *func(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg) +{ + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[reg->frameno]; +} + static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { - int allocated_slots = state->allocated_stack / BPF_REG_SIZE; + int allocated_slots = state->allocated_stack / BPF_REG_SIZE; - /* We need to check that slots between [spi - nr_slots + 1, spi] are - * within [0, allocated_stack). - * - * Please note that the spi grows downwards. For example, a dynptr - * takes the size of two stack slots; the first slot will be at - * spi and the second slot will be at spi - 1. - */ - return spi - nr_slots + 1 >= 0 && spi < allocated_slots; + /* We need to check that slots between [spi - nr_slots + 1, spi] are + * within [0, allocated_stack). + * + * Please note that the spi grows downwards. For example, a dynptr + * takes the size of two stack slots; the first slot will be at + * spi and the second slot will be at spi - 1. + */ + return spi - nr_slots + 1 >= 0 && spi < allocated_slots; } -static struct bpf_func_state *func(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg) +static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_verifier_state *cur = env->cur_state; + int off, spi; - return cur->frame[reg->frameno]; + if (!tnum_is_const(reg->var_off)) { + verbose(env, "dynptr has to be at a constant offset\n"); + return -EINVAL; + } + + off = reg->off + reg->var_off.value; + if (off % BPF_REG_SIZE) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + return -EINVAL; + } + + spi = __get_spi(off); + if (spi < 1) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", off); + return -EINVAL; + } + + if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS)) + return -ERANGE; + return spi; } static const char *kernel_type_name(const struct btf* btf, u32 id) @@ -727,37 +754,58 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type) static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot); + bool first_slot, int dynptr_id); static void __mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, +static void mark_dynptr_stack_regs(struct bpf_verifier_env *env, + struct bpf_reg_state *sreg1, struct bpf_reg_state *sreg2, enum bpf_dynptr_type type) { - __mark_dynptr_reg(sreg1, type, true); - __mark_dynptr_reg(sreg2, type, false); + int id = ++env->id_gen; + + __mark_dynptr_reg(sreg1, type, true, id); + __mark_dynptr_reg(sreg2, type, false, id); } -static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, +static void mark_dynptr_cb_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, enum bpf_dynptr_type type) { - __mark_dynptr_reg(reg, type, true); + __mark_dynptr_reg(reg, type, true, ++env->id_gen); } +static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, + struct bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type type; - int spi, i, id; - - spi = get_spi(reg->off); - - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) - return -EINVAL; + int spi, i, id, err; + + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + + /* We cannot assume both spi and spi - 1 belong to the same dynptr, + * hence we need to call destroy_if_dynptr_stack_slot twice for both, + * to ensure that for the following example: + * [d1][d1][d2][d2] + * spi 3 2 1 0 + * So marking spi = 2 should lead to destruction of both d1 and d2. In + * case they do belong to same dynptr, second call won't see slot_type + * as STACK_DYNPTR and will simply skip destruction. + */ + err = destroy_if_dynptr_stack_slot(env, state, spi); + if (err) + return err; + err = destroy_if_dynptr_stack_slot(env, state, spi - 1); + if (err) + return err; for (i = 0; i < BPF_REG_SIZE; i++) { state->stack[spi].slot_type[i] = STACK_DYNPTR; @@ -768,7 +816,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, + mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { @@ -781,6 +829,9 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + return 0; } @@ -789,10 +840,9 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re struct bpf_func_state *state = func(env, reg); int spi, i; - spi = get_spi(reg->off); - - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) - return -EINVAL; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; for (i = 0; i < BPF_REG_SIZE; i++) { state->stack[spi].slot_type[i] = STACK_INVALID; @@ -805,43 +855,133 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + + /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot? + * + * While we don't allow reading STACK_INVALID, it is still possible to + * do <8 byte writes marking some but not all slots as STACK_MISC. Then, + * helpers or insns can do partial read of that part without failing, + * but check_stack_range_initialized, check_stack_read_var_off, and + * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of + * the slot conservatively. Hence we need to prevent those liveness + * marking walks. + * + * This was not a problem before because STACK_INVALID is only set by + * default (where the default reg state has its reg->parent as NULL), or + * in clean_live_states after REG_LIVE_DONE (at which point + * mark_reg_read won't walk reg->parent chain), but not randomly during + * verifier state exploration (like we did above). Hence, for our case + * parentage chain will still be live (i.e. reg->parent may be + * non-NULL), while earlier reg->parent was NULL, so we need + * REG_LIVE_WRITTEN to screen off read marker propagation when it is + * done later on reads or by mark_dynptr_read as well to unnecessary + * mark registers in verifier state. + */ + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + return 0; } -static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static void __mark_reg_unknown(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, + struct bpf_func_state *state, int spi) { - struct bpf_func_state *state = func(env, reg); - int spi, i; + struct bpf_func_state *fstate; + struct bpf_reg_state *dreg; + int i, dynptr_id; - if (reg->type == CONST_PTR_TO_DYNPTR) - return false; + /* We always ensure that STACK_DYNPTR is never set partially, + * hence just checking for slot_type[0] is enough. This is + * different for STACK_SPILL, where it may be only set for + * 1 byte, so code has to use is_spilled_reg. + */ + if (state->stack[spi].slot_type[0] != STACK_DYNPTR) + return 0; - spi = get_spi(reg->off); - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) - return true; + /* Reposition spi to first slot */ + if (!state->stack[spi].spilled_ptr.dynptr.first_slot) + spi = spi + 1; + + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { + verbose(env, "cannot overwrite referenced dynptr\n"); + return -EINVAL; + } + + mark_stack_slot_scratched(env, spi); + mark_stack_slot_scratched(env, spi - 1); + /* Writing partially to one dynptr stack slot destroys both. */ for (i = 0; i < BPF_REG_SIZE; i++) { - if (state->stack[spi].slot_type[i] == STACK_DYNPTR || - state->stack[spi - 1].slot_type[i] == STACK_DYNPTR) - return false; + state->stack[spi].slot_type[i] = STACK_INVALID; + state->stack[spi - 1].slot_type[i] = STACK_INVALID; } + dynptr_id = state->stack[spi].spilled_ptr.id; + /* Invalidate any slices associated with this dynptr */ + bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({ + /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ + if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) + continue; + if (dreg->dynptr_id == dynptr_id) { + if (!env->allow_ptr_leaks) + __mark_reg_not_init(env, dreg); + else + __mark_reg_unknown(env, dreg); + } + })); + + /* Do not release reference state, we are destroying dynptr on stack, + * not using some helper to release it. Just reset register. + */ + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + + /* Same reason as unmark_stack_slots_dynptr above */ + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + + return 0; +} + +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int spi) +{ + if (reg->type == CONST_PTR_TO_DYNPTR) + return false; + + /* For -ERANGE (i.e. spi not falling into allocated stack slots), we + * will do check_mem_access to check and update stack bounds later, so + * return true for that case. + */ + if (spi < 0) + return spi == -ERANGE; + /* We allow overwriting existing unreferenced STACK_DYNPTR slots, see + * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to + * ensure dynptr objects at the slots we are touching are completely + * destructed before we reinitialize them for a new one. For referenced + * ones, destroy_if_dynptr_stack_slot returns an error early instead of + * delaying it until the end where the user will get "Unreleased + * reference" error. + */ return true; } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int spi) { struct bpf_func_state *state = func(env, reg); - int spi; int i; /* This already represents first slot of initialized bpf_dynptr */ if (reg->type == CONST_PTR_TO_DYNPTR) return true; - spi = get_spi(reg->off); - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.dynptr.first_slot) + if (spi < 0) + return false; + if (!state->stack[spi].spilled_ptr.dynptr.first_slot) return false; for (i = 0; i < BPF_REG_SIZE; i++) { @@ -868,7 +1008,9 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg if (reg->type == CONST_PTR_TO_DYNPTR) { return reg->dynptr.type == dynptr_type; } else { - spi = get_spi(reg->off); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return false; return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; } } @@ -1449,7 +1591,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot) + bool first_slot, int dynptr_id) { /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply @@ -1457,6 +1599,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty */ __mark_reg_known_zero(reg); reg->type = CONST_PTR_TO_DYNPTR; + /* Give each dynptr a unique id to uniquely associate slices to it. */ + reg->id = dynptr_id; reg->dynptr.type = type; reg->dynptr.first_slot = first_slot; } @@ -2189,6 +2333,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } + if (bpf_dev_bound_kfunc_id(func_id)) { + err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); + if (err) + return err; + } + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; desc->imm = call_imm; @@ -2390,6 +2540,32 @@ static int mark_reg_read(struct bpf_verifier_env *env, return 0; } +static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, ret; + + /* For CONST_PTR_TO_DYNPTR, it must have already been done by + * check_reg_arg in check_helper_call and mark_btf_func_reg_size in + * check_kfunc_call. + */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return 0; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + /* Caller ensures dynptr is valid and initialized, which means spi is in + * bounds and spi is the first dynptr slot. Simply mark stack slot as + * read. + */ + ret = mark_reg_read(env, &state->stack[spi].spilled_ptr, + state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); + if (ret) + return ret; + return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr, + state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64); +} + /* This function is supposed to be used by the following 32-bit optimization * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. @@ -3311,6 +3487,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, env->insn_aux_data[insn_idx].sanitize_stack_spill = true; } + err = destroy_if_dynptr_stack_slot(env, state, spi); + if (err) + return err; + mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { @@ -3424,6 +3604,14 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, if (err) return err; + for (i = min_off; i < max_off; i++) { + int spi; + + spi = __get_spi(i); + err = destroy_if_dynptr_stack_slot(env, state, spi); + if (err) + return err; + } /* Variable offset writes destroy any spilled pointers in range. */ for (i = min_off; i < max_off; i++) { @@ -4763,6 +4951,25 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) return 0; } +#define BTF_TYPE_SAFE_NESTED(__type) __PASTE(__type, __safe_fields) + +BTF_TYPE_SAFE_NESTED(struct task_struct) { + const cpumask_t *cpus_ptr; +}; + +static bool nested_ptr_is_trusted(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + int off) +{ + /* If its parent is not trusted, it can't regain its trusted status. */ + if (!is_trusted_reg(reg)) + return false; + + BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct)); + + return btf_nested_type_is_trusted(&env->log, reg, off); +} + static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int regno, int off, int size, @@ -4851,10 +5058,17 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_flag(reg->type) & PTR_UNTRUSTED) flag |= PTR_UNTRUSTED; - /* By default any pointer obtained from walking a trusted pointer is - * no longer trusted except the rcu case below. + /* By default any pointer obtained from walking a trusted pointer is no + * longer trusted, unless the field being accessed has explicitly been + * marked as inheriting its parent's state of trust. + * + * An RCU-protected pointer can also be deemed trusted if we are in an + * RCU read region. This case is handled below. */ - flag &= ~PTR_TRUSTED; + if (nested_ptr_is_trusted(env, reg, off)) + flag |= PTR_TRUSTED; + else + flag &= ~PTR_TRUSTED; if (flag & MEM_RCU) { /* Mark value register as MEM_RCU only if it is protected by @@ -5451,6 +5665,31 @@ static int check_stack_range_initialized( } if (meta && meta->raw_mode) { + /* Ensure we won't be overwriting dynptrs when simulating byte + * by byte access in check_helper_call using meta.access_size. + * This would be a problem if we have a helper in the future + * which takes: + * + * helper(uninit_mem, len, dynptr) + * + * Now, uninint_mem may overlap with dynptr pointer. Hence, it + * may end up writing to dynptr itself when touching memory from + * arg 1. This can be relaxed on a case by case basis for known + * safe cases, but reject due to the possibilitiy of aliasing by + * default. + */ + for (i = min_off; i < max_off + access_size; i++) { + int stack_off = -i - 1; + + spi = __get_spi(i); + /* raw_mode may write past allocated_stack */ + if (state->allocated_stack <= stack_off) + continue; + if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) { + verbose(env, "potential write to dynptr at off=%d disallowed\n", i); + return -EACCES; + } + } meta->access_size = access_size; meta->regno = regno; return 0; @@ -5938,6 +6177,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + int spi = 0; /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): @@ -5948,12 +6188,14 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, } /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to * check_func_arg_reg_off's logic. We only need to check offset - * alignment for PTR_TO_STACK. + * and its alignment for PTR_TO_STACK. */ - if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) { - verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off); - return -EINVAL; + if (reg->type == PTR_TO_STACK) { + spi = dynptr_get_spi(env, reg); + if (spi < 0 && spi != -ERANGE) + return spi; } + /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. * @@ -5970,7 +6212,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, * to. */ if (arg_type & MEM_UNINIT) { - if (!is_dynptr_reg_valid_uninit(env, reg)) { + if (!is_dynptr_reg_valid_uninit(env, reg, spi)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); return -EINVAL; } @@ -5985,13 +6227,15 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, meta->uninit_dynptr_regno = regno; } else /* MEM_RDONLY and None case from above */ { + int err; + /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); return -EINVAL; } - if (!is_dynptr_reg_valid_init(env, reg)) { + if (!is_dynptr_reg_valid_init(env, reg, spi)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", regno); @@ -6018,6 +6262,10 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, err_extra, regno); return -EINVAL; } + + err = mark_dynptr_read(env, reg); + if (err) + return err; } return 0; } @@ -6355,15 +6603,29 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, } } -static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->ref_obj_id; + return reg->id; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + return state->stack[spi].spilled_ptr.id; +} + +static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi; - spi = get_spi(reg->off); + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->ref_obj_id; + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; return state->stack[spi].spilled_ptr.ref_obj_id; } @@ -6437,9 +6699,8 @@ skip_type_check: * PTR_TO_STACK. */ if (reg->type == PTR_TO_STACK) { - spi = get_spi(reg->off); - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.ref_obj_id) { + spi = dynptr_get_spi(env, reg); + if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) { verbose(env, "arg %d is an unacquired reference\n", regno); return -EINVAL; } @@ -7421,7 +7682,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); - mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); + mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ @@ -7927,13 +8188,32 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { if (arg_type_is_dynptr(fn->arg_type[i])) { struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; + int id, ref_obj_id; + + if (meta.dynptr_id) { + verbose(env, "verifier internal error: meta.dynptr_id already set\n"); + return -EFAULT; + } if (meta.ref_obj_id) { verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); return -EFAULT; } - meta.ref_obj_id = dynptr_ref_obj_id(env, reg); + id = dynptr_id(env, reg); + if (id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + return id; + } + + ref_obj_id = dynptr_ref_obj_id(env, reg); + if (ref_obj_id < 0) { + verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); + return ref_obj_id; + } + + meta.dynptr_id = id; + meta.ref_obj_id = ref_obj_id; break; } } @@ -8089,6 +8369,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } + if (is_dynptr_ref_function(func_id)) + regs[BPF_REG_0].dynptr_id = meta.dynptr_id; + if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; @@ -8545,9 +8828,37 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, reg_ref_id = *reg2btf_ids[base_type(reg->type)]; } - if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id)) + /* Enforce strict type matching for calls to kfuncs that are acquiring + * or releasing a reference, or are no-cast aliases. We do _not_ + * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, + * as we want to enable BPF programs to pass types that are bitwise + * equivalent without forcing them to explicitly cast with something + * like bpf_cast_to_kern_ctx(). + * + * For example, say we had a type like the following: + * + * struct bpf_cpumask { + * cpumask_t cpumask; + * refcount_t usage; + * }; + * + * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed + * to a struct cpumask, so it would be safe to pass a struct + * bpf_cpumask * to a kfunc expecting a struct cpumask *. + * + * The philosophy here is similar to how we allow scalars of different + * types to be passed to kfuncs as long as the size is the same. The + * only difference here is that we're simply allowing + * btf_struct_ids_match() to walk the struct at the 0th offset, and + * resolve types. + */ + if (is_kfunc_acquire(meta) || + (is_kfunc_release(meta) && reg->ref_obj_id) || + btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; + WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off); + reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) { @@ -8891,6 +9202,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } + if (is_kfunc_trusted_args(meta) && + (register_is_null(reg) || type_may_be_null(reg->type))) { + verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); + return -EACCES; + } + if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -13223,10 +13540,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, return false; if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1) continue; - if (!is_spilled_reg(&old->stack[spi])) - continue; - if (!regsafe(env, &old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, idmap)) + /* Both old and cur are having same slot_type */ + switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) { + case STACK_SPILL: /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. @@ -13237,7 +13553,30 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, * such verifier states are not equivalent. * return false to continue verification of this path */ + if (!regsafe(env, &old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, idmap)) + return false; + break; + case STACK_DYNPTR: + { + const struct bpf_reg_state *old_reg, *cur_reg; + + old_reg = &old->stack[spi].spilled_ptr; + cur_reg = &cur->stack[spi].spilled_ptr; + if (old_reg->dynptr.type != cur_reg->dynptr.type || + old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || + !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + return false; + break; + } + case STACK_MISC: + case STACK_ZERO: + case STACK_INVALID: + continue; + /* Ensure that new unhandled slot types return false by default */ + default: return false; + } } return true; } @@ -13834,7 +14173,7 @@ static int do_check(struct bpf_verifier_env *env) env->prev_log_len = env->log.len_used; } - if (bpf_prog_is_dev_bound(env->prog->aux)) { + if (bpf_prog_is_offloaded(env->prog->aux)) { err = bpf_prog_offload_verify_insn(env, env->insn_idx, env->prev_insn_idx); if (err) @@ -14314,7 +14653,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && + if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); return -EINVAL; @@ -14795,7 +15134,7 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) unsigned int orig_prog_len = env->prog->len; int err; - if (bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_remove_insns(env, off, cnt); err = bpf_remove_insns(env->prog, off, cnt); @@ -14876,7 +15215,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) else continue; - if (bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_replace_insn(env, i, &ja); memcpy(insn, &ja, sizeof(ja)); @@ -15063,7 +15402,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_offloaded(env->prog->aux)) return 0; insn = env->prog->insnsi + delta; @@ -15463,7 +15802,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) int err = 0; if (env->prog->jit_requested && - !bpf_prog_is_dev_bound(env->prog->aux)) { + !bpf_prog_is_offloaded(env->prog->aux)) { err = jit_subprogs(env); if (err == 0) return 0; @@ -15507,12 +15846,25 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt) { const struct bpf_kfunc_desc *desc; + void *xdp_kfunc; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); return -EINVAL; } + *cnt = 0; + + if (bpf_dev_bound_kfunc_id(insn->imm)) { + xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm); + if (xdp_kfunc) { + insn->imm = BPF_CALL_IMM(xdp_kfunc); + return 0; + } + + /* fallback to default kfunc when not supported by netdev */ + } + /* insn->imm has the btf func_id. Replace it with * an address (relative to __bpf_call_base). */ @@ -15523,7 +15875,6 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } - *cnt = 0; insn->imm = desc->imm; if (insn->off) return 0; @@ -16449,7 +16800,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } if (st_ops->check_member) { - int err = st_ops->check_member(t, member); + int err = st_ops->check_member(t, member, prog); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", @@ -16530,6 +16881,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; + if (bpf_prog_is_dev_bound(prog->aux) && + !bpf_prog_dev_bound_match(prog, tgt_prog)) { + bpf_log(log, "Target program bound device mismatch"); + return -EINVAL; + } + for (i = 0; i < aux->func_info_cnt; i++) if (aux->func_info[i].type_id == btf_id) { subprog = i; @@ -16751,6 +17108,24 @@ BTF_ID(func, rcu_read_unlock_strict) #endif BTF_SET_END(btf_id_deny) +static bool can_be_sleepable(struct bpf_prog *prog) +{ + if (prog->type == BPF_PROG_TYPE_TRACING) { + switch (prog->expected_attach_type) { + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + case BPF_TRACE_ITER: + return true; + default: + return false; + } + } + return prog->type == BPF_PROG_TYPE_LSM || + prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || + prog->type == BPF_PROG_TYPE_STRUCT_OPS; +} + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -16769,9 +17144,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } - if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) { - verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n"); + if (prog->aux->sleepable && !can_be_sleepable(prog)) { + verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); return -EINVAL; } @@ -16950,7 +17324,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) if (ret < 0) goto skip_full_check; - if (bpf_prog_is_dev_bound(env->prog->aux)) { + if (bpf_prog_is_offloaded(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) goto skip_full_check; @@ -16963,7 +17337,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) ret = do_check_subprogs(env); ret = ret ?: do_check_main(env); - if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) + if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux)) ret = bpf_prog_offload_finalize(env); skip_full_check: @@ -16998,7 +17372,7 @@ skip_full_check: /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. */ - if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { + if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) { ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret : false; |