From 1b2b234b1318afb3775d4c6624fd5a96558f19df Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 10 Dec 2018 15:43:00 -0800 Subject: bpf: pass struct btf pointer to the map_check_btf() callback If key_type or value_type are of non-trivial data types (e.g. structure or typedef), it's not possible to check them without the additional information, which can't be obtained without a pointer to the btf structure. So, let's pass btf pointer to the map_check_btf() callbacks. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0c992b86eb2c..e734f163bd0b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,6 +23,7 @@ struct bpf_prog; struct bpf_map; struct sock; struct seq_file; +struct btf; struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ @@ -52,6 +53,7 @@ struct bpf_map_ops { void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); int (*map_check_btf)(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); }; @@ -126,6 +128,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map) } int map_check_no_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); -- cgit v1.2.3 From 9a1126b63190e2541dd5d643f4bfeb5a7f493729 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 10 Dec 2018 15:43:01 -0800 Subject: bpf: add bpffs pretty print for cgroup local storage maps Implement bpffs pretty printing for cgroup local storage maps (both shared and per-cpu). Output example (captured for tools/testing/selftests/bpf/netcnt_prog.c): Shared: $ cat /sys/fs/bpf/map_2 # WARNING!! The output is for debug purpose only # WARNING!! The output format will change {4294968594,1}: {9999,1039896} Per-cpu: $ cat /sys/fs/bpf/map_1 # WARNING!! The output is for debug purpose only # WARNING!! The output format will change {4294968594,1}: { cpu0: {0,0,0,0,0} cpu1: {0,0,0,0,0} cpu2: {1,104,0,0,0} cpu3: {0,0,0,0,0} } Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + kernel/bpf/btf.c | 22 +++++++++++ kernel/bpf/local_storage.c | 93 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 115 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index b98405a56383..a4cf075b89eb 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -47,6 +47,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); bool btf_name_offset_valid(const struct btf *btf, u32 offset); +bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bf34933cc413..1545ddfb6fa5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -514,6 +514,28 @@ static bool btf_type_int_is_regular(const struct btf_type *t) return true; } +/* + * Check that given type is a regular int and has the expected size. + */ +bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size) +{ + u8 nr_bits, nr_bytes; + u32 int_data; + + if (!btf_type_is_int(t)) + return false; + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data); + nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_data) || + nr_bytes != expected_size) + return false; + + return true; +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index b65017dead44..5eca03da0989 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,11 +1,13 @@ //SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include #include #include +#include DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); @@ -308,6 +310,94 @@ static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } +static int cgroup_storage_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + const struct btf_type *t; + struct btf_member *m; + u32 id, size; + + /* Key is expected to be of struct bpf_cgroup_storage_key type, + * which is: + * struct bpf_cgroup_storage_key { + * __u64 cgroup_inode_id; + * __u32 attach_type; + * }; + */ + + /* + * Key_type must be a structure with two fields. + */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || + BTF_INFO_VLEN(key_type->info) != 2) + return -EINVAL; + + /* + * The first field must be a 64 bit integer at 0 offset. + */ + m = (struct btf_member *)(key_type + 1); + if (m->offset) + return -EINVAL; + id = m->type; + t = btf_type_id_size(btf, &id, NULL); + size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id); + if (!t || !btf_type_is_reg_int(t, size)) + return -EINVAL; + + /* + * The second field must be a 32 bit integer at 64 bit offset. + */ + m++; + if (m->offset != offsetof(struct bpf_cgroup_storage_key, attach_type) * + BITS_PER_BYTE) + return -EINVAL; + id = m->type; + t = btf_type_id_size(btf, &id, NULL); + size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type); + if (!t || !btf_type_is_reg_int(t, size)) + return -EINVAL; + + return 0; +} + +static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, + struct seq_file *m) +{ + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map_to_storage(map), key, false); + if (!storage) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + stype = cgroup_storage_type(map); + if (stype == BPF_CGROUP_STORAGE_SHARED) { + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, + &READ_ONCE(storage->buf)->data[0], m); + seq_puts(m, "\n"); + } else { + seq_puts(m, ": {\n"); + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(storage->percpu_buf, cpu), + m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + } + rcu_read_unlock(); +} + const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -315,7 +405,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, - .map_check_btf = map_check_no_btf, + .map_check_btf = cgroup_storage_check_btf, + .map_seq_show_elem = cgroup_storage_seq_show_elem, }; int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) -- cgit v1.2.3 From c872bdb38febb4c31ece3599c52cf1f833b89f4e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 12 Dec 2018 09:37:46 -0800 Subject: bpf: include sub program tags in bpf_prog_info Changes v2 -> v3: 1. remove check for bpf_dump_raw_ok(). Changes v1 -> v2: 1. Fix error path as Martin suggested. This patch adds nr_prog_tags and prog_tags to bpf_prog_info. This is a reliable way for user space to get tags of all sub programs. Before this patch, user space need to find sub program tags via kallsyms. This feature will be used in BPF introspection, where user space queries information about BPF programs via sys_bpf. Signed-off-by: Song Liu Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa582cd5bfcf..e7d57e89f25f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2717,6 +2717,8 @@ struct bpf_prog_info { __u32 nr_jited_line_info; __u32 line_info_rec_size; __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b7c585838c72..7f1410d6fbe9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2315,6 +2315,28 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } + ulen = info.nr_prog_tags; + info.nr_prog_tags = prog->aux->func_cnt ? : 1; + if (ulen) { + __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; + u32 i; + + user_prog_tags = u64_to_user_ptr(info.prog_tags); + ulen = min_t(u32, info.nr_prog_tags, ulen); + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + if (copy_to_user(user_prog_tags[i], + prog->aux->func[i]->tag, + BPF_TAG_SIZE)) + return -EFAULT; + } + } else { + if (copy_to_user(user_prog_tags[0], + prog->tag, BPF_TAG_SIZE)) + return -EFAULT; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.3 From 23127b33ec80e656921362d7dc82a0064bac20a2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 13 Dec 2018 10:41:46 -0800 Subject: bpf: Create a new btf_name_by_offset() for non type name use case The current btf_name_by_offset() is returning "(anon)" type name for the offset == 0 case and "(invalid-name-offset)" for the out-of-bound offset case. It fits well for the internal BTF verbose log purpose which is focusing on type. For example, offset == 0 => "(anon)" => anonymous type/name. Returning non-NULL for the bad offset case is needed during the BTF verification process because the BTF verifier may complain about another field first before discovering the name_off is invalid. However, it may not be ideal for the newer use case which does not necessary mean type name. For example, when logging line_info in the BPF verifier in the next patch, it is better to log an empty src line instead of logging "(anon)". The existing bpf_name_by_offset() is renamed to __bpf_name_by_offset() and static to btf.c. A new bpf_name_by_offset() is added for generic context usage. It returns "\0" for name_off == 0 (note that btf->strings[0] is "\0") and NULL for invalid offset. It allows the caller to decide what is the best output in its context. The new btf_name_by_offset() is overlapped with btf_name_offset_valid(). Hence, btf_name_offset_valid() is removed from btf.h to keep the btf.h API minimal. The existing btf_name_offset_valid() usage in btf.c could also be replaced later. Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 - kernel/bpf/btf.c | 31 ++++++++++++++++++++----------- kernel/bpf/verifier.c | 4 ++-- 3 files changed, 22 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index a4cf075b89eb..58000d7e06e3 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -46,7 +46,6 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); -bool btf_name_offset_valid(const struct btf *btf, u32 offset); bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1545ddfb6fa5..8fa0bf1c33fd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -474,7 +474,7 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } -const char *btf_name_by_offset(const struct btf *btf, u32 offset) +static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) return "(anon)"; @@ -484,6 +484,14 @@ const char *btf_name_by_offset(const struct btf *btf, u32 offset) return "(invalid-name-offset)"; } +const char *btf_name_by_offset(const struct btf *btf, u32 offset) +{ + if (offset < btf->hdr.str_len) + return &btf->strings[offset]; + + return NULL; +} + const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { if (type_id > btf->nr_types) @@ -576,7 +584,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_kind_str[kind], - btf_name_by_offset(btf, t->name_off), + __btf_name_by_offset(btf, t->name_off), log_details ? " " : ""); if (log_details) @@ -620,7 +628,7 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, btf_verifier_log_type(env, struct_type, NULL); __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", - btf_name_by_offset(btf, member->name_off), + __btf_name_by_offset(btf, member->name_off), member->type, member->offset); if (fmt && *fmt) { @@ -1872,7 +1880,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, btf_verifier_log(env, "\t%s val=%d\n", - btf_name_by_offset(btf, enums[i].name_off), + __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -1896,7 +1904,8 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, for (i = 0; i < nr_enums; i++) { if (v == enums[i].val) { seq_printf(m, "%s", - btf_name_by_offset(btf, enums[i].name_off)); + __btf_name_by_offset(btf, + enums[i].name_off)); return; } } @@ -1954,20 +1963,20 @@ static void btf_func_proto_log(struct btf_verifier_env *env, } btf_verifier_log(env, "%u %s", args[0].type, - btf_name_by_offset(env->btf, - args[0].name_off)); + __btf_name_by_offset(env->btf, + args[0].name_off)); for (i = 1; i < nr_args - 1; i++) btf_verifier_log(env, ", %u %s", args[i].type, - btf_name_by_offset(env->btf, - args[i].name_off)); + __btf_name_by_offset(env->btf, + args[i].name_off)); if (nr_args > 1) { const struct btf_param *last_arg = &args[nr_args - 1]; if (last_arg->type) btf_verifier_log(env, ", %u %s", last_arg->type, - btf_name_by_offset(env->btf, - last_arg->name_off)); + __btf_name_by_offset(env->btf, + last_arg->name_off)); else btf_verifier_log(env, ", vararg"); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8b511a4fe84a..89ce2613fdb0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4910,8 +4910,8 @@ static int check_btf_line(struct bpf_verifier_env *env, goto err_free; } - if (!btf_name_offset_valid(btf, linfo[i].line_off) || - !btf_name_offset_valid(btf, linfo[i].file_name_off)) { + if (!btf_name_by_offset(btf, linfo[i].line_off) || + !btf_name_by_offset(btf, linfo[i].file_name_off)) { verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); err = -EINVAL; goto err_free; -- cgit v1.2.3 From d9762e84ede3eae9636f5dbbe0c8f0390d37e114 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 13 Dec 2018 10:41:48 -0800 Subject: bpf: verbose log bpf_line_info in verifier This patch adds bpf_line_info during the verifier's verbose. It can give error context for debug purpose. ~~~~~~~~~~ Here is the verbose log for backedge: while (a) { a += bpf_get_smp_processor_id(); bpf_trace_printk(fmt, sizeof(fmt), a); } ~> bpftool prog load ./test_loop.o /sys/fs/bpf/test_loop type tracepoint 13: while (a) { 3: a += bpf_get_smp_processor_id(); back-edge from insn 13 to 3 ~~~~~~~~~~ Here is the verbose log for invalid pkt access: Modification to test_xdp_noinline.c: data = (void *)(long)xdp->data; data_end = (void *)(long)xdp->data_end; /* if (data + 4 > data_end) return XDP_DROP; */ *(u32 *)data = dst->dst; ~> bpftool prog load ./test_xdp_noinline.o /sys/fs/bpf/test_xdp_noinline type xdp ; data = (void *)(long)xdp->data; 224: (79) r2 = *(u64 *)(r10 -112) 225: (61) r2 = *(u32 *)(r2 +0) ; *(u32 *)data = dst->dst; 226: (63) *(u32 *)(r2 +0) = r1 invalid access to packet, off=0 size=4, R2(id=0,off=0,r=0) R2 offset is outside of the packet Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 74 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c736945be7c5..548dcbdb7111 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -224,6 +224,7 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89ce2613fdb0..ba8e3134bbc2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "disasm.h" @@ -216,6 +217,27 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); +static const struct bpf_line_info * +find_linfo(const struct bpf_verifier_env *env, u32 insn_off) +{ + const struct bpf_line_info *linfo; + const struct bpf_prog *prog; + u32 i, nr_linfo; + + prog = env->prog; + nr_linfo = prog->aux->nr_linfo; + + if (!nr_linfo || insn_off >= prog->len) + return NULL; + + linfo = prog->aux->linfo; + for (i = 1; i < nr_linfo; i++) + if (insn_off < linfo[i].insn_off) + break; + + return &linfo[i - 1]; +} + void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args) { @@ -266,6 +288,42 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } +static const char *ltrim(const char *s) +{ + while (isspace(*s)) + s++; + + return s; +} + +__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...) +{ + const struct bpf_line_info *linfo; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + linfo = find_linfo(env, insn_off); + if (!linfo || linfo == env->prev_linfo) + return; + + if (prefix_fmt) { + va_list args; + + va_start(args, prefix_fmt); + bpf_verifier_vlog(&env->log, prefix_fmt, args); + va_end(args); + } + + verbose(env, "%s\n", + ltrim(btf_name_by_offset(env->prog->aux->btf, + linfo->line_off))); + + env->prev_linfo = linfo; +} + static bool type_is_pkt_pointer(enum bpf_reg_type type) { return type == PTR_TO_PACKET || @@ -4561,6 +4619,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { + verbose_linfo(env, t, "%d: ", t); verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -4578,6 +4637,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + verbose_linfo(env, t, "%d: ", t); + verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { @@ -4600,10 +4661,6 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; - ret = check_subprogs(env); - if (ret < 0) - return ret; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -5448,6 +5505,8 @@ static int do_check(struct bpf_verifier_env *env) int insn_processed = 0; bool do_print_state = false; + env->prev_linfo = NULL; + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; @@ -5521,6 +5580,7 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; + verbose_linfo(env, insn_idx, "; "); verbose(env, "%d: ", insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } @@ -6755,7 +6815,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = check_cfg(env); + ret = check_subprogs(env); if (ret < 0) goto skip_full_check; @@ -6763,6 +6823,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); -- cgit v1.2.3 From 9242b5f5615c823bfc1e9aea284617ff25a55f10 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Dec 2018 11:42:34 -0800 Subject: bpf: add self-check logic to liveness analysis Introduce REG_LIVE_DONE to check the liveness propagation and prepare the states for merging. See algorithm description in clean_live_states(). Signed-off-by: Alexei Starovoitov Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 108 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 548dcbdb7111..c233efc106c6 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -38,6 +38,7 @@ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ + REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */ }; struct bpf_reg_state { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e4724fe8120f..0125731e2512 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -397,12 +397,14 @@ static char slot_type_char[] = { static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) verbose(env, "_"); if (live & REG_LIVE_READ) verbose(env, "r"); if (live & REG_LIVE_WRITTEN) verbose(env, "w"); + if (live & REG_LIVE_DONE) + verbose(env, "D"); } static struct bpf_func_state *func(struct bpf_verifier_env *env, @@ -1132,6 +1134,12 @@ static int mark_reg_read(struct bpf_verifier_env *env, /* if read wasn't screened by an earlier write ... */ if (writes && state->live & REG_LIVE_WRITTEN) break; + if (parent->live & REG_LIVE_DONE) { + verbose(env, "verifier BUG type %s var_off %lld off %d\n", + reg_type_str[parent->type], + parent->var_off.value, parent->off); + return -EFAULT; + } /* ... then we depend on parent's value */ parent->live |= REG_LIVE_READ; state = parent; @@ -5078,6 +5086,102 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) return false; } +static void clean_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *st) +{ + enum bpf_reg_liveness live; + int i, j; + + for (i = 0; i < BPF_REG_FP; i++) { + live = st->regs[i].live; + /* liveness must not touch this register anymore */ + st->regs[i].live |= REG_LIVE_DONE; + if (!(live & REG_LIVE_READ)) + /* since the register is unused, clear its state + * to make further comparison simpler + */ + __mark_reg_not_init(&st->regs[i]); + } + + for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { + live = st->stack[i].spilled_ptr.live; + /* liveness must not touch this stack slot anymore */ + st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; + if (!(live & REG_LIVE_READ)) { + __mark_reg_not_init(&st->stack[i].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + st->stack[i].slot_type[j] = STACK_INVALID; + } + } +} + +static void clean_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + int i; + + if (st->frame[0]->regs[0].live & REG_LIVE_DONE) + /* all regs in this state in all frames were already marked */ + return; + + for (i = 0; i <= st->curframe; i++) + clean_func_state(env, st->frame[i]); +} + +/* the parentage chains form a tree. + * the verifier states are added to state lists at given insn and + * pushed into state stack for future exploration. + * when the verifier reaches bpf_exit insn some of the verifer states + * stored in the state lists have their final liveness state already, + * but a lot of states will get revised from liveness point of view when + * the verifier explores other branches. + * Example: + * 1: r0 = 1 + * 2: if r1 == 100 goto pc+1 + * 3: r0 = 2 + * 4: exit + * when the verifier reaches exit insn the register r0 in the state list of + * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch + * of insn 2 and goes exploring further. At the insn 4 it will walk the + * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. + * + * Since the verifier pushes the branch states as it sees them while exploring + * the program the condition of walking the branch instruction for the second + * time means that all states below this branch were already explored and + * their final liveness markes are already propagated. + * Hence when the verifier completes the search of state list in is_state_visited() + * we can call this clean_live_states() function to mark all liveness states + * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' + * will not be used. + * This function also clears the registers and stack for states that !READ + * to simplify state merging. + * + * Important note here that walking the same branch instruction in the callee + * doesn't meant that the states are DONE. The verifier has to compare + * the callsites + */ +static void clean_live_states(struct bpf_verifier_env *env, int insn, + struct bpf_verifier_state *cur) +{ + struct bpf_verifier_state_list *sl; + int i; + + sl = env->explored_states[insn]; + if (!sl) + return; + + while (sl != STATE_LIST_MARK) { + if (sl->state.curframe != cur->curframe) + goto next; + for (i = 0; i <= cur->curframe; i++) + if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) + goto next; + clean_verifier_state(env, &sl->state); +next: + sl = sl->next; + } +} + /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct idpair *idmap) @@ -5396,6 +5500,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ return 0; + clean_live_states(env, insn_idx, cur); + while (sl != STATE_LIST_MARK) { if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, -- cgit v1.2.3 From 6c4fc209fcf9d27efbaa48368773e4d2bfbd59aa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 16 Dec 2018 00:49:47 +0100 Subject: bpf: remove useless version check for prog load Existing libraries and tracing frameworks work around this kernel version check by automatically deriving the kernel version from uname(3) or similar such that the user does not need to do it manually; these workarounds also make the version check useless at the same time. Moreover, most other BPF tracing types enabling bpf_probe_read()-like functionality have /not/ adapted this check, and in general these days it is well understood anyway that all the tracing programs are not stable with regards to future kernels as kernel internal data structures are subject to change from release to release. Back at last netconf we discussed [0] and agreed to remove this check from bpf_prog_load() and instead document it here in the uapi header that there is no such guarantee for stable API for these programs. [0] http://vger.kernel.org/netconf2018_files/DanielBorkmann_netconf2018.pdf Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 10 +++++++++- kernel/bpf/syscall.c | 5 ----- tools/include/uapi/linux/bpf.h | 10 +++++++++- 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e7d57e89f25f..1d324c2cbca2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,14 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, }; +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, @@ -343,7 +351,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ - __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 kern_version; /* not used */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6ae062f1cf20..5db31067d85e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1473,11 +1473,6 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) return -E2BIG; - - if (type == BPF_PROG_TYPE_KPROBE && - attr->kern_version != LINUX_VERSION_CODE) - return -EINVAL; - if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && !capable(CAP_SYS_ADMIN)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e7d57e89f25f..1d324c2cbca2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -133,6 +133,14 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, }; +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, @@ -343,7 +351,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ - __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 kern_version; /* not used */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ -- cgit v1.2.3 From 9d5f9f701b1891466fb3dbb1806ad97716f95cc3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Dec 2018 22:13:51 -0800 Subject: bpf: btf: fix struct/union/fwd types with kind_flag This patch fixed two issues with BTF. One is related to struct/union bitfield encoding and the other is related to forward type. Issue #1 and solution: ====================== Current btf encoding of bitfield follows what pahole generates. For each bitfield, pahole will duplicate the type chain and put the bitfield size at the final int or enum type. Since the BTF enum type cannot encode bit size, pahole workarounds the issue by generating an int type whenever the enum bit size is not 32. For example, -bash-4.4$ cat t.c typedef int ___int; enum A { A1, A2, A3 }; struct t { int a[5]; ___int b:4; volatile enum A c:4; } g; -bash-4.4$ gcc -c -O2 -g t.c The current kernel supports the following BTF encoding: $ pahole -JV t.o [1] TYPEDEF ___int type_id=2 [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [3] ENUM A size=4 vlen=3 A1 val=0 A2 val=1 A3 val=2 [4] STRUCT t size=24 vlen=3 a type_id=5 bits_offset=0 b type_id=9 bits_offset=160 c type_id=11 bits_offset=164 [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5 [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none) [7] VOLATILE (anon) type_id=3 [8] INT int size=1 bit_offset=0 nr_bits=4 encoding=(none) [9] TYPEDEF ___int type_id=8 [10] INT (anon) size=1 bit_offset=0 nr_bits=4 encoding=SIGNED [11] VOLATILE (anon) type_id=10 Two issues are in the above: . by changing enum type to int, we lost the original type information and this will not be ideal later when we try to convert BTF to a header file. . the type duplication for bitfields will cause BTF bloat. Duplicated types cannot be deduplicated later if the bitfield size is different. To fix this issue, this patch implemented a compatible change for BTF struct type encoding: . the bit 31 of struct_type->info, previously reserved, now is used to indicate whether bitfield_size is encoded in btf_member or not. . if bit 31 of struct_type->info is set, btf_member->offset will encode like: bit 0 - 23: bit offset bit 24 - 31: bitfield size if bit 31 is not set, the old behavior is preserved: bit 0 - 31: bit offset So if the struct contains a bit field, the maximum bit offset will be reduced to (2^24 - 1) instead of MAX_UINT. The maximum bitfield size will be 256 which is enough for today as maximum bitfield in compiler can be 128 where int128 type is supported. This kernel patch intends to support the new BTF encoding: $ pahole -JV t.o [1] TYPEDEF ___int type_id=2 [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [3] ENUM A size=4 vlen=3 A1 val=0 A2 val=1 A3 val=2 [4] STRUCT t kind_flag=1 size=24 vlen=3 a type_id=5 bitfield_size=0 bits_offset=0 b type_id=1 bitfield_size=4 bits_offset=160 c type_id=7 bitfield_size=4 bits_offset=164 [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5 [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none) [7] VOLATILE (anon) type_id=3 Issue #2 and solution: ====================== Current forward type in BTF does not specify whether the original type is struct or union. This will not work for type pretty print and BTF-to-header-file conversion as struct/union must be specified. $ cat tt.c struct t; union u; int foo(struct t *t, union u *u) { return 0; } $ gcc -c -g -O2 tt.c $ pahole -JV tt.o [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [2] FWD t type_id=0 [3] PTR (anon) type_id=2 [4] FWD u type_id=0 [5] PTR (anon) type_id=4 To fix this issue, similar to issue #1, type->info bit 31 is used. If the bit is set, it is union type. Otherwise, it is a struct type. $ pahole -JV tt.o [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [2] FWD t kind_flag=0 type_id=0 [3] PTR (anon) kind_flag=0 type_id=2 [4] FWD u kind_flag=1 type_id=0 [5] PTR (anon) kind_flag=0 type_id=4 Pahole/LLVM change: =================== The new kind_flag functionality has been implemented in pahole and llvm: https://github.com/yonghong-song/pahole/tree/bitfield https://github.com/yonghong-song/llvm/tree/bitfield Note that pahole hasn't implemented func/func_proto kind and .BTF.ext. So to print function signature with bpftool, the llvm compiler should be used. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Acked-by: Martin KaFai Lau Signed-off-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 20 +++- kernel/bpf/btf.c | 280 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 278 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 14f66948fc95..7b7475ef2f17 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -34,7 +34,9 @@ struct btf_type { * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-31: unused + * bits 28-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd */ __u32 info; /* "size" is used by INT, ENUM, STRUCT and UNION. @@ -52,6 +54,7 @@ struct btf_type { #define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KFLAG(info) ((info) >> 31) #define BTF_KIND_UNKN 0 /* Unknown */ #define BTF_KIND_INT 1 /* Integer */ @@ -110,9 +113,22 @@ struct btf_array { struct btf_member { __u32 name_off; __u32 type; - __u32 offset; /* offset in bits */ + /* If the type info kind_flag is set, the btf_member offset + * contains both member bitfield size and bit offset. The + * bitfield size is set for bitfield members. If the type + * info kind_flag is not set, the offset contains only bit + * offset. + */ + __u32 offset; }; +/* If the struct/union type info kind_flag is set, the + * following two macros are used to access bitfield_size + * and bit_offset from btf_member.offset. + */ +#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + /* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". * The exact number of btf_param is stored in the vlen (of the * info in "struct btf_type"). diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 72caa799e82f..93b6905e3a9b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -164,7 +164,7 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x0f00ffff +#define BTF_INFO_MASK 0x8f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) @@ -274,6 +274,10 @@ struct btf_kind_operations { const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type); + int (*check_kflag_member)(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); void (*seq_show)(const struct btf *btf, const struct btf_type *t, @@ -419,6 +423,25 @@ static u16 btf_type_vlen(const struct btf_type *t) return BTF_INFO_VLEN(t->info); } +static bool btf_type_kflag(const struct btf_type *t) +{ + return BTF_INFO_KFLAG(t->info); +} + +static u32 btf_member_bit_offset(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) + : member->offset; +} + +static u32 btf_member_bitfield_size(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) + : 0; +} + static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -627,9 +650,17 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (env->phase != CHECK_META) btf_verifier_log_type(env, struct_type, NULL); - __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", - __btf_name_by_offset(btf, member->name_off), - member->type, member->offset); + if (btf_type_kflag(struct_type)) + __btf_verifier_log(log, + "\t%s type_id=%u bitfield_size=%u bits_offset=%u", + __btf_name_by_offset(btf, member->name_off), + member->type, + BTF_MEMBER_BITFIELD_SIZE(member->offset), + BTF_MEMBER_BIT_OFFSET(member->offset)); + else + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", + __btf_name_by_offset(btf, member->name_off), + member->type, member->offset); if (fmt && *fmt) { __btf_verifier_log(log, " "); @@ -945,6 +976,38 @@ static int btf_df_check_member(struct btf_verifier_env *env, return -EINVAL; } +static int btf_df_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + btf_verifier_log_basic(env, struct_type, + "Unsupported check_kflag_member"); + return -EINVAL; +} + +/* Used for ptr, array and struct/union type members. + * int, enum and modifier types have their specific callback functions. + */ +static int btf_generic_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + /* bitfield size is 0, so member->offset represents bit offset only. + * It is safe to call non kflag check_member variants. + */ + return btf_type_ops(member_type)->check_member(env, struct_type, + member, + member_type); +} + static int btf_df_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -997,6 +1060,62 @@ static int btf_int_check_member(struct btf_verifier_env *env, return 0; } +static int btf_int_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset; + u32 int_data = btf_type_int(member_type); + u32 struct_size = struct_type->size; + u32 nr_copy_bits; + + /* a regular int type is required for the kflag int member */ + if (!btf_type_int_is_regular(member_type)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member base type"); + return -EINVAL; + } + + /* check sanity of bitfield size */ + nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); + struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); + nr_int_data_bits = BTF_INT_BITS(int_data); + if (!nr_bits) { + /* Not a bitfield member, member offset must be at byte + * boundary. + */ + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member offset"); + return -EINVAL; + } + + nr_bits = nr_int_data_bits; + } else if (nr_bits > nr_int_data_bits) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); + if (nr_copy_bits > BITS_PER_U64) { + btf_verifier_log_member(env, struct_type, member, + "nr_copy_bits exceeds 64"); + return -EINVAL; + } + + if (struct_size < bytes_offset || + struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_int_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -1016,6 +1135,11 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + int_data = btf_type_int(t); if (int_data & ~BTF_INT_MASK) { btf_verifier_log_basic(env, t, "Invalid int_data:%x", @@ -1097,6 +1221,7 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, seq_printf(m, "0x%llx", print_num); } + static void btf_int_bits_seq_show(const struct btf *btf, const struct btf_type *t, void *data, u8 bits_offset, @@ -1163,6 +1288,7 @@ static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, .check_member = btf_int_check_member, + .check_kflag_member = btf_int_check_kflag_member, .log_details = btf_int_log, .seq_show = btf_int_seq_show, }; @@ -1192,6 +1318,31 @@ static int btf_modifier_check_member(struct btf_verifier_env *env, resolved_type); } +static int btf_modifier_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + const struct btf_type *resolved_type; + u32 resolved_type_id = member->type; + struct btf_member resolved_member; + struct btf *btf = env->btf; + + resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); + if (!resolved_type) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member"); + return -EINVAL; + } + + resolved_member = *member; + resolved_member.type = resolved_type_id; + + return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type, + &resolved_member, + resolved_type); +} + static int btf_ptr_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, @@ -1227,6 +1378,11 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (!BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; @@ -1380,6 +1536,7 @@ static struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, + .check_kflag_member = btf_modifier_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_modifier_seq_show, }; @@ -1388,6 +1545,7 @@ static struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_ptr_seq_show, }; @@ -1422,6 +1580,7 @@ static struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_df_seq_show, }; @@ -1480,6 +1639,11 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (t->size) { btf_verifier_log_type(env, t, "size != 0"); return -EINVAL; @@ -1603,6 +1767,7 @@ static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_array_log, .seq_show = btf_array_seq_show, }; @@ -1641,6 +1806,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; + u32 offset; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); @@ -1682,7 +1848,8 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (is_union && member->offset) { + offset = btf_member_bit_offset(t, member); + if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; @@ -1692,20 +1859,20 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, * ">" instead of ">=" because the last member could be * "char a[0];" */ - if (last_offset > member->offset) { + if (last_offset > offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; } - if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { + if (BITS_ROUNDUP_BYTES(offset) > struct_size) { btf_verifier_log_member(env, t, member, "Member bits_offset exceeds its struct size"); return -EINVAL; } btf_verifier_log_member(env, t, member, NULL); - last_offset = member->offset; + last_offset = offset; } return meta_needed; @@ -1735,9 +1902,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env, last_member_type = btf_type_by_id(env->btf, last_member_type_id); - err = btf_type_ops(last_member_type)->check_member(env, v->t, - last_member, - last_member_type); + if (btf_type_kflag(v->t)) + err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t, + last_member, + last_member_type); + else + err = btf_type_ops(last_member_type)->check_member(env, v->t, + last_member, + last_member_type); if (err) return err; } @@ -1759,9 +1931,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env, return env_stack_push(env, member_type, member_type_id); } - err = btf_type_ops(member_type)->check_member(env, v->t, - member, - member_type); + if (btf_type_kflag(v->t)) + err = btf_type_ops(member_type)->check_kflag_member(env, v->t, + member, + member_type); + else + err = btf_type_ops(member_type)->check_member(env, v->t, + member, + member_type); if (err) return err; } @@ -1789,17 +1966,26 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - u32 member_offset = member->offset; - u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); - u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset); const struct btf_kind_operations *ops; + u32 member_offset, bitfield_size; + u32 bytes_offset; + u8 bits8_offset; if (i) seq_puts(m, seq); - ops = btf_type_ops(member_type); - ops->seq_show(btf, member_type, member->type, - data + bytes_offset, bits8_offset, m); + member_offset = btf_member_bit_offset(t, member); + bitfield_size = btf_member_bitfield_size(t, member); + if (bitfield_size) { + btf_bitfield_seq_show(data, member_offset, + bitfield_size, m); + } else { + bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + bits8_offset = BITS_PER_BYTE_MASKED(member_offset); + ops = btf_type_ops(member_type); + ops->seq_show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, m); + } } seq_puts(m, "}"); } @@ -1808,6 +1994,7 @@ static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_struct_log, .seq_show = btf_struct_seq_show, }; @@ -1837,6 +2024,41 @@ static int btf_enum_check_member(struct btf_verifier_env *env, return 0; } +static int btf_enum_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off, nr_bits, bytes_end, struct_size; + u32 int_bitsize = sizeof(int) * BITS_PER_BYTE; + + struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); + nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); + if (!nr_bits) { + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + nr_bits = int_bitsize; + } else if (nr_bits > int_bitsize) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + struct_size = struct_type->size; + bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits); + if (struct_size < bytes_end) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -1856,6 +2078,11 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (t->size != sizeof(int)) { btf_verifier_log_type(env, t, "Expected size:%zu", sizeof(int)); @@ -1924,6 +2151,7 @@ static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, + .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, .seq_show = btf_enum_seq_show, }; @@ -1946,6 +2174,11 @@ static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return meta_needed; @@ -2005,6 +2238,7 @@ static struct btf_kind_operations func_proto_ops = { * Hence, there is no btf_func_check_member(). */ .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_func_proto_log, .seq_show = btf_df_seq_show, }; @@ -2024,6 +2258,11 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -2033,6 +2272,7 @@ static struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_df_seq_show, }; -- cgit v1.2.3 From ffa0c1cf59596fba54546ea828305acfcc2cf55e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Dec 2018 22:13:52 -0800 Subject: bpf: enable cgroup local storage map pretty print with kind_flag Commit 970289fc0a83 ("bpf: add bpffs pretty print for cgroup local storage maps") added bpffs pretty print for cgroup local storage maps. The commit worked for struct without kind_flag set. This patch refactored and made pretty print also work with kind_flag set for the struct. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 5 ++++- kernel/bpf/btf.c | 37 ++++++++++++++++++++++++++++--------- kernel/bpf/local_storage.c | 17 ++++------------- 3 files changed, 36 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 58000d7e06e3..12502e25e767 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -7,6 +7,7 @@ #include struct btf; +struct btf_member; struct btf_type; union bpf_attr; @@ -46,7 +47,9 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); -bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); +bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, + const struct btf_member *m, + u32 expected_offset, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 93b6905e3a9b..e804b26a0506 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -546,22 +546,41 @@ static bool btf_type_int_is_regular(const struct btf_type *t) } /* - * Check that given type is a regular int and has the expected size. + * Check that given struct member is a regular int with expected + * offset and size. */ -bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size) +bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, + const struct btf_member *m, + u32 expected_offset, u32 expected_size) { - u8 nr_bits, nr_bytes; - u32 int_data; + const struct btf_type *t; + u32 id, int_data; + u8 nr_bits; - if (!btf_type_is_int(t)) + id = m->type; + t = btf_type_id_size(btf, &id, NULL); + if (!t || !btf_type_is_int(t)) return false; int_data = btf_type_int(t); nr_bits = BTF_INT_BITS(int_data); - nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_data) || - nr_bytes != expected_size) + if (btf_type_kflag(s)) { + u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset); + u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset); + + /* if kflag set, int should be a regular int and + * bit offset should be at byte boundary. + */ + return !bitfield_size && + BITS_ROUNDUP_BYTES(bit_offset) == expected_offset && + BITS_ROUNDUP_BYTES(nr_bits) == expected_size; + } + + if (BTF_INT_OFFSET(int_data) || + BITS_PER_BYTE_MASKED(m->offset) || + BITS_ROUNDUP_BYTES(m->offset) != expected_offset || + BITS_PER_BYTE_MASKED(nr_bits) || + BITS_ROUNDUP_BYTES(nr_bits) != expected_size) return false; return true; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 5eca03da0989..07a34ef562a0 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -315,9 +315,8 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - const struct btf_type *t; struct btf_member *m; - u32 id, size; + u32 offset, size; /* Key is expected to be of struct bpf_cgroup_storage_key type, * which is: @@ -338,25 +337,17 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, * The first field must be a 64 bit integer at 0 offset. */ m = (struct btf_member *)(key_type + 1); - if (m->offset) - return -EINVAL; - id = m->type; - t = btf_type_id_size(btf, &id, NULL); size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id); - if (!t || !btf_type_is_reg_int(t, size)) + if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) return -EINVAL; /* * The second field must be a 32 bit integer at 64 bit offset. */ m++; - if (m->offset != offsetof(struct bpf_cgroup_storage_key, attach_type) * - BITS_PER_BYTE) - return -EINVAL; - id = m->type; - t = btf_type_id_size(btf, &id, NULL); + offset = offsetof(struct bpf_cgroup_storage_key, attach_type); size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type); - if (!t || !btf_type_is_reg_int(t, size)) + if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) return -EINVAL; return 0; -- cgit v1.2.3 From a38d1107f937ca95dcf820161ef44ea683d6a0b1 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Wed, 12 Dec 2018 16:42:37 -0800 Subject: bpf: support raw tracepoints in modules Distributions build drivers as modules, including network and filesystem drivers which export numerous tracepoints. This enables bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints. Signed-off-by: Matt Mullins Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 4 ++ include/linux/trace_events.h | 8 +++- kernel/bpf/syscall.c | 11 +++-- kernel/module.c | 5 +++ kernel/trace/bpf_trace.c | 99 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 120 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index fce6b4335e36..5f147dd5e709 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -432,6 +432,10 @@ struct module { unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif +#ifdef CONFIG_BPF_EVENTS + unsigned int num_bpf_raw_events; + struct bpf_raw_event_map *bpf_raw_events; +#endif #ifdef HAVE_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 4130a5497d40..8a62731673f7 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -471,7 +471,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); @@ -502,10 +503,13 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf { return -EOPNOTSUPP; } -static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { return NULL; } +static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ +} static inline int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5db31067d85e..0607db304def 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1604,6 +1604,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) bpf_probe_unregister(raw_tp->btp, raw_tp->prog); bpf_prog_put(raw_tp->prog); } + bpf_put_raw_tracepoint(raw_tp->btp); kfree(raw_tp); return 0; } @@ -1629,13 +1630,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return -EFAULT; tp_name[sizeof(tp_name) - 1] = 0; - btp = bpf_find_raw_tracepoint(tp_name); + btp = bpf_get_raw_tracepoint(tp_name); if (!btp) return -ENOENT; raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) - return -ENOMEM; + if (!raw_tp) { + err = -ENOMEM; + goto out_put_btp; + } raw_tp->btp = btp; prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, @@ -1663,6 +1666,8 @@ out_put_prog: bpf_prog_put(prog); out_free_tp: kfree(raw_tp); +out_put_btp: + bpf_put_raw_tracepoint(btp); return err; } diff --git a/kernel/module.c b/kernel/module.c index 49a405891587..06ec68f08387 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3093,6 +3093,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints_ptrs), &mod->num_tracepoints); #endif +#ifdef CONFIG_BPF_EVENTS + mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", + sizeof(*mod->bpf_raw_events), + &mod->num_bpf_raw_events); +#endif #ifdef HAVE_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9864a35c8bb5..9ddb6fddb4e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,6 +17,43 @@ #include "trace_probe.h" #include "trace.h" +#ifdef CONFIG_MODULES +struct bpf_trace_module { + struct module *module; + struct list_head list; +}; + +static LIST_HEAD(bpf_trace_modules); +static DEFINE_MUTEX(bpf_module_mutex); + +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + struct bpf_raw_event_map *btp, *ret = NULL; + struct bpf_trace_module *btm; + unsigned int i; + + mutex_lock(&bpf_module_mutex); + list_for_each_entry(btm, &bpf_trace_modules, list) { + for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { + btp = &btm->module->bpf_raw_events[i]; + if (!strcmp(btp->tp->name, name)) { + if (try_module_get(btm->module)) + ret = btp; + goto out; + } + } + } +out: + mutex_unlock(&bpf_module_mutex); + return ret; +} +#else +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + return NULL; +} +#endif /* CONFIG_MODULES */ + u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -1076,7 +1113,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) extern struct bpf_raw_event_map __start__bpf_raw_tp[]; extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { struct bpf_raw_event_map *btp = __start__bpf_raw_tp; @@ -1084,7 +1121,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) if (!strcmp(btp->tp->name, name)) return btp; } - return NULL; + + return bpf_get_raw_tracepoint_module(name); +} + +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ + struct module *mod = __module_address((unsigned long)btp); + + if (mod) + module_put(mod); } static __always_inline @@ -1222,3 +1268,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, return err; } + +#ifdef CONFIG_MODULES +int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) +{ + struct bpf_trace_module *btm, *tmp; + struct module *mod = module; + + if (mod->num_bpf_raw_events == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + return 0; + + mutex_lock(&bpf_module_mutex); + + switch (op) { + case MODULE_STATE_COMING: + btm = kzalloc(sizeof(*btm), GFP_KERNEL); + if (btm) { + btm->module = module; + list_add(&btm->list, &bpf_trace_modules); + } + break; + case MODULE_STATE_GOING: + list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { + if (btm->module == module) { + list_del(&btm->list); + kfree(btm); + break; + } + } + break; + } + + mutex_unlock(&bpf_module_mutex); + + return 0; +} + +static struct notifier_block bpf_module_nb = { + .notifier_call = bpf_event_notify, +}; + +int __init bpf_event_init(void) +{ + register_module_notifier(&bpf_module_nb); + return 0; +} + +fs_initcall(bpf_event_init); +#endif /* CONFIG_MODULES */ -- cgit v1.2.3 From 3bdbd0228e7555ec745e08469b98e5a0966409d6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 16 Dec 2018 15:47:04 -0800 Subject: bpf: sockmap, metadata support for reporting size of msg This adds metadata to sk_msg_md for BPF programs to read the sk_msg size. When the SK_MSG program is running under an application that is using sendfile the data is not copied into sk_msg buffers by default. Rather the BPF program uses sk_msg_pull_data to read the bytes in. This avoids doing the costly memcopy instructions when they are not in fact needed. However, if we don't know the size of the sk_msg we have to guess if needed bytes are available by doing a pull request which may fail. By including the size of the sk_msg BPF programs can check the size before issuing sk_msg_pull_data requests. Additionally, the same applies for sendmsg calls when the application provides multiple iovs. Here the BPF program needs to pull in data to update data pointers but its not clear where the data ends without a size parameter. In many cases "guessing" is not easy to do and results in multiple calls to pull and without bounded loops everything gets fairly tricky. Clean this up by including a u32 size field. Note, all writes into sk_msg_md are rejected already from sk_msg_is_valid_access so nothing additional is needed there. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 3 +++ include/uapi/linux/bpf.h | 1 + net/core/filter.c | 6 ++++++ 3 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 2a11e9d91dfa..eb8f6cb84c10 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -36,6 +36,9 @@ struct sk_msg_sg { struct scatterlist data[MAX_MSG_FRAGS + 1]; }; +/* UAPI in filter.c depends on struct sk_msg_sg being first element. If + * this is moved filter.c also must be updated. + */ struct sk_msg { struct sk_msg_sg sg; void *data; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d324c2cbca2..91c43884f295 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2665,6 +2665,7 @@ struct sk_msg_md { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ }; struct sk_reuseport_md { diff --git a/net/core/filter.c b/net/core/filter.c index f9348806e843..3a3b21726fb5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7530,6 +7530,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct sk_msg_md, size): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_sg, size)); + break; } return insn - insn_buf; -- cgit v1.2.3 From 7a69c0f250568e6ab72f401b2c69aa0e666c94f2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:31 -0800 Subject: bpf: skmsg, replace comments with BUILD bug Enforce comment on structure layout dependency with a BUILD_BUG_ON to ensure the condition is maintained. Suggested-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 4 +--- net/core/filter.c | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index eb8f6cb84c10..dd57e6f408b1 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -36,9 +36,7 @@ struct sk_msg_sg { struct scatterlist data[MAX_MSG_FRAGS + 1]; }; -/* UAPI in filter.c depends on struct sk_msg_sg being first element. If - * this is moved filter.c also must be updated. - */ +/* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; diff --git a/net/core/filter.c b/net/core/filter.c index 6bd9f08f6162..447dd1bad31f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7425,6 +7425,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, int off; #endif + /* convert ctx uses the fact sg element is first in struct */ + BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); + switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), -- cgit v1.2.3 From 552de91068828daef50a227a665068cf8dde835e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:33 -0800 Subject: bpf: sk_msg, fix socket data_ready events When a skb verdict program is in-use and either another BPF program redirects to that socket or the new SK_PASS support is used the data_ready callback does not wake up application. Instead because the stream parser/verdict is using the sk data_ready callback we wake up the stream parser/verdict block. Fix this by adding a helper to check if the stream parser block is enabled on the sk and if so call the saved pointer which is the upper layers wake up function. This fixes application stalls observed when an application is waiting for data in a blocking read(). Fixes: d829e9c4112b ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 8 ++++++++ net/core/skmsg.c | 6 +++--- net/ipv4/tcp_bpf.c | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index dd57e6f408b1..178a3933a71b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -417,6 +417,14 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) sk_psock_drop(sk, psock); } +static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) +{ + if (psock->parser.enabled) + psock->parser.saved_data_ready(sk); + else + sk->sk_data_ready(sk); +} + static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8a91a460de8f..3df7627db4bb 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -403,7 +403,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) msg->skb = skb; sk_psock_queue_msg(psock, msg); - sk->sk_data_ready(sk); + sk_psock_data_ready(sk, psock); return copied; } @@ -751,7 +751,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) } /* Called with socket lock held. */ -static void sk_psock_data_ready(struct sock *sk) +static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; @@ -799,7 +799,7 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) return; parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; parser->enabled = true; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index a47c1cdf90fc..87503343743d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -198,7 +198,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, msg->sg.start = i; msg->sg.size -= apply_bytes; sk_psock_queue_msg(psock, tmp); - sk->sk_data_ready(sk); + sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); kfree(tmp); -- cgit v1.2.3 From 0608c69c9a805c6264689d7eab4203eab88cf1da Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:35 -0800 Subject: bpf: sk_msg, sock{map|hash} redirect through ULP A sockmap program that redirects through a kTLS ULP enabled socket will not work correctly because the ULP layer is skipped. This fixes the behavior to call through the ULP layer on redirect to ensure any operations required on the data stream at the ULP layer continue to be applied. To do this we add an internal flag MSG_SENDPAGE_NOPOLICY to avoid calling the BPF layer on a redirected message. This is required to avoid calling the BPF layer multiple times (possibly recursively) which is not the current/expected behavior without ULPs. In the future we may add a redirect flag if users _do_ want the policy applied again but this would need to work for both ULP and non-ULP sockets and be opt-in to avoid breaking existing programs. Also to avoid polluting the flag space with an internal flag we reuse the flag space overlapping MSG_SENDPAGE_NOPOLICY with MSG_WAITFORONE. Here WAITFORONE is specific to recv path and SENDPAGE_NOPOLICY is only used for sendpage hooks. The last thing to verify is user space API is masked correctly to ensure the flag can not be set by user. (Note this needs to be true regardless because we have internal flags already in-use that user space should not be able to set). But for completeness we have two UAPI paths into sendpage, sendfile and splice. In the sendfile case the function do_sendfile() zero's flags, ./fs/read_write.c: static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { ... fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif file_start_write(out.file); retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); } In the splice case the pipe_to_sendpage "actor" is used which masks flags with SPLICE_F_MORE. ./fs/splice.c: static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { ... more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; ... } Confirming what we expect that internal flags are in fact internal to socket side. Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/socket.h | 1 + include/net/tls.h | 9 +++++++++ net/ipv4/tcp_bpf.c | 13 ++++++++++++- net/tls/tls_sw.c | 43 ++++++++++++++++++++++++++++++------------- 4 files changed, 52 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 8b571e9b9f76..84c48a3c0227 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -286,6 +286,7 @@ struct ucred { #define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ #define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ +#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN diff --git a/include/net/tls.h b/include/net/tls.h index bab5627ff5e3..23601f3b02ee 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -454,6 +454,15 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } +static inline bool tls_sw_has_ctx_tx(const struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (!ctx) + return false; + return !!tls_sw_ctx_tx(ctx); +} + static inline struct tls_offload_context_rx * tls_offload_ctx_rx(const struct tls_context *tls_ctx) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 87503343743d..1bb7321a256d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -8,6 +8,7 @@ #include #include +#include static bool tcp_bpf_stream_read(const struct sock *sk) { @@ -218,6 +219,8 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, u32 off; while (1) { + bool has_tx_ulp; + sge = sk_msg_elem(msg, msg->sg.start); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; @@ -226,7 +229,15 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, tcp_rate_check_app_limited(sk); retry: - ret = do_tcp_sendpages(sk, page, off, size, flags); + has_tx_ulp = tls_sw_has_ctx_tx(sk); + if (has_tx_ulp) { + flags |= MSG_SENDPAGE_NOPOLICY; + ret = kernel_sendpage_locked(sk, + page, off, size, flags); + } else { + ret = do_tcp_sendpages(sk, page, off, size, flags); + } + if (ret <= 0) return ret; if (apply) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d4ecc66464e6..5aee9ae5ca53 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -686,12 +686,13 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sk_psock *psock; struct sock *sk_redir; struct tls_rec *rec; + bool enospc, policy; int err = 0, send; u32 delta = 0; - bool enospc; + policy = !(flags & MSG_SENDPAGE_NOPOLICY); psock = sk_psock_get(sk); - if (!psock) + if (!psock || !policy) return tls_push_record(sk, flags, record_type); more_data: enospc = sk_msg_full(msg); @@ -1017,8 +1018,8 @@ send_end: return copied ? copied : ret; } -int tls_sw_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) +int tls_sw_do_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) { long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -1033,15 +1034,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, int ret = 0; bool eor; - if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_SENDPAGE_NOTLAST)) - return -ENOTSUPP; - - /* No MSG_EOR from splice, only look at MSG_MORE */ eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST)); - - lock_sock(sk); - sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Wait till there is any pending write on socket */ @@ -1145,10 +1138,34 @@ wait_for_memory: } sendpage_end: ret = sk_stream_error(sk, flags, ret); - release_sock(sk); return copied ? copied : ret; } +int tls_sw_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) + return -ENOTSUPP; + + return tls_sw_do_sendpage(sk, page, offset, size, flags); +} + +int tls_sw_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + int ret; + + if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) + return -ENOTSUPP; + + lock_sock(sk); + ret = tls_sw_do_sendpage(sk, page, offset, size, flags); + release_sock(sk); + return ret; +} + static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock, int flags, long timeo, int *err) { -- cgit v1.2.3