Diffstat (limited to 'kernel/bpf')
-rw-r--r--  kernel/bpf/bpf_struct_ops.c |  10
-rw-r--r--  kernel/bpf/btf.c            |  27
-rw-r--r--  kernel/bpf/core.c           | 121
-rw-r--r--  kernel/bpf/dispatcher.c     |   5
-rw-r--r--  kernel/bpf/helpers.c        |  45
-rw-r--r--  kernel/bpf/inode.c          |  42
-rw-r--r--  kernel/bpf/syscall.c        | 306
-rw-r--r--  kernel/bpf/trampoline.c     | 152
-rw-r--r--  kernel/bpf/verifier.c       |  29
9 files changed, 525 insertions, 212 deletions
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c498f0fffb40..ca5cc8cdb6eb 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -320,6 +320,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; + struct bpf_tramp_progs *tprogs = NULL; void *udata, *kdata; int prog_fd, err = 0; void *image; @@ -343,6 +344,10 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL; + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) + return -ENOMEM; + uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; @@ -425,10 +430,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } + tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; + tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; err = arch_prepare_bpf_trampoline(image, st_map->image + PAGE_SIZE, &st_ops->func_models[i], 0, - &prog, 1, NULL, 0, NULL); + tprogs, NULL); if (err < 0) goto reset_unlock; @@ -469,6 +476,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: + kfree(tprogs); mutex_unlock(&st_map->lock); return err; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 787140095e58..50080add2ab9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3710,13 +3710,26 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, nr_args--; } - if (prog->expected_attach_type == BPF_TRACE_FEXIT && - arg == nr_args) { - if (!t) - /* Default prog with 5 args. 6th arg is retval. */ - return true; - /* function return type */ - t = btf_type_by_id(btf, t->type); + if (arg == nr_args) { + if (prog->expected_attach_type == BPF_TRACE_FEXIT) { + if (!t) + return true; + t = btf_type_by_id(btf, t->type); + } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + /* For now the BPF_MODIFY_RETURN can only be attached to + * functions that return an int. 
+ */ + if (!t) + return false; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + if (!btf_type_is_int(t)) { + bpf_log(log, + "ret type %s not allowed for fmod_ret\n", + btf_kind_str[BTF_INFO_KIND(t->info)]); + return false; + } + } } else if (arg >= nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 973a20d49749..914f3463aa41 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -97,7 +97,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); - INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); return fp; } @@ -523,22 +523,22 @@ int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; -static __always_inline void -bpf_get_prog_addr_region(const struct bpf_prog *prog, - unsigned long *symbol_start, - unsigned long *symbol_end) +static void +bpf_prog_ksym_set_addr(struct bpf_prog *prog) { const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); unsigned long addr = (unsigned long)hdr; WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); - *symbol_start = addr; - *symbol_end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.start = (unsigned long) prog->bpf_func; + prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; } -void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +static void +bpf_prog_ksym_set_name(struct bpf_prog *prog) { + char *sym = prog->aux->ksym.name; const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; const char *func_name; @@ -572,36 +572,27 @@ void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) *sym = 0; } -static __always_inline unsigned long -bpf_get_prog_addr_start(struct latch_tree_node *n) +static unsigned long bpf_get_ksym_start(struct latch_tree_node *n) { - unsigned long symbol_start, symbol_end; - const struct bpf_prog_aux *aux; - - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); - - return symbol_start; + return container_of(n, struct bpf_ksym, tnode)->start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) { - return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); + return bpf_get_ksym_start(a) < bpf_get_ksym_start(b); } static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; - unsigned long symbol_start, symbol_end; - const struct bpf_prog_aux *aux; + const struct bpf_ksym *ksym; - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + ksym = container_of(n, struct bpf_ksym, tnode); - if (val < symbol_start) + if (val < ksym->start) return -1; - if (val >= symbol_end) + if (val >= ksym->end) return 1; return 0; @@ -616,20 +607,29 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +void bpf_ksym_add(struct bpf_ksym *ksym) { - WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); - list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); - latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + spin_lock_bh(&bpf_lock); + WARN_ON_ONCE(!list_empty(&ksym->lnode)); + list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms); + 
latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops); + spin_unlock_bh(&bpf_lock); } -static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +static void __bpf_ksym_del(struct bpf_ksym *ksym) { - if (list_empty(&aux->ksym_lnode)) + if (list_empty(&ksym->lnode)) return; - latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); - list_del_rcu(&aux->ksym_lnode); + latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&ksym->lnode); +} + +void bpf_ksym_del(struct bpf_ksym *ksym) +{ + spin_lock_bh(&bpf_lock); + __bpf_ksym_del(ksym); + spin_unlock_bh(&bpf_lock); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) @@ -639,8 +639,8 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { - return list_empty(&fp->aux->ksym_lnode) || - fp->aux->ksym_lnode.prev == LIST_POISON2; + return list_empty(&fp->aux->ksym.lnode) || + fp->aux->ksym.lnode.prev == LIST_POISON2; } void bpf_prog_kallsyms_add(struct bpf_prog *fp) @@ -649,9 +649,11 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) !capable(CAP_SYS_ADMIN)) return; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_add(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_prog_ksym_set_addr(fp); + bpf_prog_ksym_set_name(fp); + fp->aux->ksym.prog = true; + + bpf_ksym_add(&fp->aux->ksym); } void bpf_prog_kallsyms_del(struct bpf_prog *fp) @@ -659,33 +661,30 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp) if (!bpf_prog_kallsyms_candidate(fp)) return; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_del(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_ksym_del(&fp->aux->ksym); } -static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) +static struct bpf_ksym *bpf_ksym_find(unsigned long addr) { struct latch_tree_node *n; n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); - return n ? - container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : - NULL; + return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { - unsigned long symbol_start, symbol_end; - struct bpf_prog *prog; + struct bpf_ksym *ksym; char *ret = NULL; rcu_read_lock(); - prog = bpf_prog_kallsyms_find(addr); - if (prog) { - bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); - bpf_get_prog_name(prog, sym); + ksym = bpf_ksym_find(addr); + if (ksym) { + unsigned long symbol_start = ksym->start; + unsigned long symbol_end = ksym->end; + + strncpy(sym, ksym->name, KSYM_NAME_LEN); ret = sym; if (size) @@ -703,19 +702,28 @@ bool is_bpf_text_address(unsigned long addr) bool ret; rcu_read_lock(); - ret = bpf_prog_kallsyms_find(addr) != NULL; + ret = bpf_ksym_find(addr) != NULL; rcu_read_unlock(); return ret; } +static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) +{ + struct bpf_ksym *ksym = bpf_ksym_find(addr); + + return ksym && ksym->prog ? 
+ container_of(ksym, struct bpf_prog_aux, ksym)->prog : + NULL; +} + const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; struct bpf_prog *prog; rcu_read_lock(); - prog = bpf_prog_kallsyms_find(addr); + prog = bpf_prog_ksym_find(addr); if (!prog) goto out; if (!prog->aux->num_exentries) @@ -730,7 +738,7 @@ out: int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { - struct bpf_prog_aux *aux; + struct bpf_ksym *ksym; unsigned int it = 0; int ret = -ERANGE; @@ -738,13 +746,13 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; rcu_read_lock(); - list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) { if (it++ != symnum) continue; - bpf_get_prog_name(aux->prog, sym); + strncpy(sym, ksym->name, KSYM_NAME_LEN); - *value = (unsigned long)aux->prog->bpf_func; + *value = ksym->start; *type = BPF_SYM_ELF_TYPE; ret = 0; @@ -2149,6 +2157,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index b3e5b214fed8..2444bd15cc2d 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) noff = 0; } else { old = d->image + d->image_off; - noff = d->image_off ^ (BPF_IMAGE_SIZE / 2); + noff = d->image_off ^ (PAGE_SIZE / 2); } new = d->num_progs ? 
d->image + noff : NULL; @@ -140,9 +140,10 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_image_alloc(); + d->image = bpf_jit_alloc_exec_page(); if (!d->image) goto out; + bpf_image_ksym_add(d->image, &d->ksym); } prev_num_progs = d->num_progs; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d8b7b110a1c5..01878db15eaf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,8 @@ #include <linux/filter.h> #include <linux/ctype.h> #include <linux/jiffies.h> +#include <linux/pid_namespace.h> +#include <linux/proc_ns.h> #include "../../lib/kstrtox.h" @@ -499,3 +501,46 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg4_type = ARG_PTR_TO_LONG, }; #endif + +BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, + struct bpf_pidns_info *, nsdata, u32, size) +{ + struct task_struct *task = current; + struct pid_namespace *pidns; + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_pidns_info))) + goto clear; + + if (unlikely((u64)(dev_t)dev != dev)) + goto clear; + + if (unlikely(!task)) + goto clear; + + pidns = task_active_pid_ns(task); + if (unlikely(!pidns)) { + err = -ENOENT; + goto clear; + } + + if (!ns_match(&pidns->ns, (dev_t)dev, ino)) + goto clear; + + nsdata->pid = task_pid_nr_ns(task, pidns); + nsdata->tgid = task_tgid_nr_ns(task, pidns); + return 0; +clear: + memset((void *)nsdata, 0, (size_t) size); + return err; +} + +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { + .func = bpf_get_ns_current_pid_tgid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5e40e7fccc21..95087d9f4ed3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -25,6 +25,7 @@ enum bpf_type { BPF_TYPE_UNSPEC = 0, BPF_TYPE_PROG, BPF_TYPE_MAP, + BPF_TYPE_LINK, }; static void *bpf_any_get(void *raw, enum bpf_type type) @@ -36,6 +37,9 @@ static void *bpf_any_get(void *raw, enum bpf_type type) case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); break; + case BPF_TYPE_LINK: + bpf_link_inc(raw); + break; default: WARN_ON_ONCE(1); break; @@ -53,6 +57,9 @@ static void bpf_any_put(void *raw, enum bpf_type type) case BPF_TYPE_MAP: bpf_map_put_with_uref(raw); break; + case BPF_TYPE_LINK: + bpf_link_put(raw); + break; default: WARN_ON_ONCE(1); break; @@ -63,20 +70,32 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) { void *raw; - *type = BPF_TYPE_MAP; raw = bpf_map_get_with_uref(ufd); - if (IS_ERR(raw)) { + if (!IS_ERR(raw)) { + *type = BPF_TYPE_MAP; + return raw; + } + + raw = bpf_prog_get(ufd); + if (!IS_ERR(raw)) { *type = BPF_TYPE_PROG; - raw = bpf_prog_get(ufd); + return raw; } - return raw; + raw = bpf_link_get_from_fd(ufd); + if (!IS_ERR(raw)) { + *type = BPF_TYPE_LINK; + return raw; + } + + return ERR_PTR(-EINVAL); } static const struct inode_operations bpf_dir_iops; static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; +static const struct inode_operations bpf_link_iops = { }; static struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, @@ -114,6 +133,8 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) *type = BPF_TYPE_PROG; else if (inode->i_op == &bpf_map_iops) *type = BPF_TYPE_MAP; + else if (inode->i_op == &bpf_link_iops) + *type = 
BPF_TYPE_LINK; else return -EACCES; @@ -335,6 +356,12 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) &bpffs_map_fops : &bpffs_obj_fops); } +static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) +{ + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, + &bpffs_obj_fops); +} + static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { @@ -411,6 +438,9 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw, case BPF_TYPE_MAP: ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); break; + case BPF_TYPE_LINK: + ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); + break; default: ret = -EPERM; } @@ -487,6 +517,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); + else if (type == BPF_TYPE_LINK) + ret = bpf_link_new_fd(raw); else return -ENOENT; @@ -504,6 +536,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type if (inode->i_op == &bpf_map_iops) return ERR_PTR(-EINVAL); + if (inode->i_op == &bpf_link_iops) + return ERR_PTR(-EINVAL); if (inode->i_op != &bpf_prog_iops) return ERR_PTR(-EACCES); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c536c65256ad..85567a6ea5f9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2173,84 +2173,274 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -static int bpf_tracing_prog_release(struct inode *inode, struct file *filp) +struct bpf_link { + atomic64_t refcnt; + const struct bpf_link_ops *ops; + struct bpf_prog *prog; + struct work_struct work; +}; + +void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, + struct bpf_prog *prog) { - struct bpf_prog *prog = filp->private_data; + atomic64_set(&link->refcnt, 1); + link->ops = ops; + link->prog = prog; +} - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); - bpf_prog_put(prog); +/* Clean up bpf_link and corresponding anon_inode file and FD. After + * anon_inode is created, bpf_link can't be just kfree()'d due to deferred + * anon_inode's release() call. This helper manages marking bpf_link as + * defunct, releases anon_inode file and puts reserved FD. 
+ */ +static void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd) +{ + link->prog = NULL; + fput(link_file); + put_unused_fd(link_fd); +} + +void bpf_link_inc(struct bpf_link *link) +{ + atomic64_inc(&link->refcnt); +} + +/* bpf_link_free is guaranteed to be called from process context */ +static void bpf_link_free(struct bpf_link *link) +{ + if (link->prog) { + /* detach BPF program, clean up used resources */ + link->ops->release(link); + bpf_prog_put(link->prog); + } + /* free bpf_link and its containing memory */ + link->ops->dealloc(link); +} + +static void bpf_link_put_deferred(struct work_struct *work) +{ + struct bpf_link *link = container_of(work, struct bpf_link, work); + + bpf_link_free(link); +} + +/* bpf_link_put can be called from atomic context, but ensures that resources + * are freed from process context + */ +void bpf_link_put(struct bpf_link *link) +{ + if (!atomic64_dec_and_test(&link->refcnt)) + return; + + if (in_atomic()) { + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); + } else { + bpf_link_free(link); + } +} + +static int bpf_link_release(struct inode *inode, struct file *filp) +{ + struct bpf_link *link = filp->private_data; + + bpf_link_put(link); return 0; } -static const struct file_operations bpf_tracing_prog_fops = { - .release = bpf_tracing_prog_release, +#ifdef CONFIG_PROC_FS +static const struct bpf_link_ops bpf_raw_tp_lops; +static const struct bpf_link_ops bpf_tracing_link_lops; +static const struct bpf_link_ops bpf_xdp_link_lops; + +static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_link *link = filp->private_data; + const struct bpf_prog *prog = link->prog; + char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + const char *link_type; + + if (link->ops == &bpf_raw_tp_lops) + link_type = "raw_tracepoint"; + else if (link->ops == &bpf_tracing_link_lops) + link_type = "tracing"; + else + link_type = "unknown"; + + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); + seq_printf(m, + "link_type:\t%s\n" + "prog_tag:\t%s\n" + "prog_id:\t%u\n", + link_type, + prog_tag, + prog->aux->id); +} +#endif + +const struct file_operations bpf_link_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_link_show_fdinfo, +#endif + .release = bpf_link_release, .read = bpf_dummy_read, .write = bpf_dummy_write, }; +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); +} + +/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but + * instead of immediately installing fd in fdtable, just reserve it and + * return. Caller then need to either install it with fd_install(fd, file) or + * release with put_unused_fd(fd). + * This is useful for cases when bpf_link attachment/detachment are + * complicated and expensive operations and should be delayed until all the fd + * reservation and anon_inode creation succeeds. 
+ */ +struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +{ + struct file *file; + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return ERR_PTR(fd); + + file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); + if (IS_ERR(file)) { + put_unused_fd(fd); + return file; + } + + *reserved_fd = fd; + return file; +} + +struct bpf_link *bpf_link_get_from_fd(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_link *link; + + if (!f.file) + return ERR_PTR(-EBADF); + if (f.file->f_op != &bpf_link_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + link = f.file->private_data; + bpf_link_inc(link); + fdput(f); + + return link; +} + +struct bpf_tracing_link { + struct bpf_link link; +}; + +static void bpf_tracing_link_release(struct bpf_link *link) +{ + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); +} + +static void bpf_tracing_link_dealloc(struct bpf_link *link) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + kfree(tr_link); +} + +static const struct bpf_link_ops bpf_tracing_link_lops = { + .release = bpf_tracing_link_release, + .dealloc = bpf_tracing_link_dealloc, +}; + static int bpf_tracing_prog_attach(struct bpf_prog *prog) { - int tr_fd, err; + struct bpf_tracing_link *link; + struct file *link_file; + int link_fd, err; if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_MODIFY_RETURN && prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; } - err = bpf_trampoline_link_prog(prog); - if (err) + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_prog; + } + bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); goto out_put_prog; + } - tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops, - prog, O_CLOEXEC); - if (tr_fd < 0) { - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); - err = tr_fd; + err = bpf_trampoline_link_prog(prog); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); goto out_put_prog; } - return tr_fd; + + fd_install(link_fd, link_file); + return link_fd; out_put_prog: bpf_prog_put(prog); return err; } -struct bpf_raw_tracepoint { +struct bpf_raw_tp_link { + struct bpf_link link; struct bpf_raw_event_map *btp; - struct bpf_prog *prog; }; -static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +static void bpf_raw_tp_link_release(struct bpf_link *link) { - struct bpf_raw_tracepoint *raw_tp = filp->private_data; + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); - if (raw_tp->prog) { - bpf_probe_unregister(raw_tp->btp, raw_tp->prog); - bpf_prog_put(raw_tp->prog); - } + bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); bpf_put_raw_tracepoint(raw_tp->btp); +} + +static void bpf_raw_tp_link_dealloc(struct bpf_link *link) +{ + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + kfree(raw_tp); - return 0; } -static const struct file_operations bpf_raw_tp_fops = { - .release = bpf_raw_tracepoint_release, - .read = bpf_dummy_read, - .write = bpf_dummy_write, +static const struct bpf_link_ops bpf_raw_tp_lops = { + .release = bpf_raw_tp_link_release, + .dealloc = bpf_raw_tp_link_dealloc, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int 
bpf_raw_tracepoint_open(const union bpf_attr *attr) { - struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; + struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int tp_fd, err; + int link_fd, err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2297,29 +2487,30 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) goto out_put_prog; } - raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) { + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { err = -ENOMEM; goto out_put_btp; } - raw_tp->btp = btp; - raw_tp->prog = prog; + bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + link->btp = btp; - err = bpf_probe_register(raw_tp->btp, prog); - if (err) - goto out_free_tp; + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_btp; + } - tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, - O_CLOEXEC); - if (tp_fd < 0) { - bpf_probe_unregister(raw_tp->btp, prog); - err = tp_fd; - goto out_free_tp; + err = bpf_probe_register(link->btp, prog); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); + goto out_put_btp; } - return tp_fd; -out_free_tp: - kfree(raw_tp); + fd_install(link_fd, link_file); + return link_fd; + out_put_btp: bpf_put_raw_tracepoint(btp); out_put_prog: @@ -3266,15 +3457,21 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (err) goto out; - if (file->f_op == &bpf_raw_tp_fops) { - struct bpf_raw_tracepoint *raw_tp = file->private_data; - struct bpf_raw_event_map *btp = raw_tp->btp; + if (file->f_op == &bpf_link_fops) { + struct bpf_link *link = file->private_data; - err = bpf_task_fd_query_copy(attr, uattr, - raw_tp->prog->aux->id, - BPF_FD_TYPE_RAW_TRACEPOINT, - btp->tp->name, 0, 0); - goto put_file; + if (link->ops == &bpf_raw_tp_lops) { + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + struct bpf_raw_event_map *btp = raw_tp->btp; + + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->link.prog->aux->id, + BPF_FD_TYPE_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + goto out_not_supp; } event = perf_get_event(file); @@ -3294,6 +3491,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, goto put_file; } +out_not_supp: err = -ENOTSUPP; put_file: fput(file); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 704fa787fec0..f30bca2a4d01 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -5,6 +5,7 @@ #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/rbtree_latch.h> +#include <linux/perf_event.h> /* dummy _ops. The verifier will operate on target program's ops. 
*/ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -17,12 +18,11 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; -static struct latch_tree_root image_tree __cacheline_aligned; -/* serializes access to trampoline_table and image_tree */ +/* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); -static void *bpf_jit_alloc_exec_page(void) +void *bpf_jit_alloc_exec_page(void) { void *image; @@ -38,62 +38,28 @@ static void *bpf_jit_alloc_exec_page(void) return image; } -static __always_inline bool image_tree_less(struct latch_tree_node *a, - struct latch_tree_node *b) +void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) { - struct bpf_image *ia = container_of(a, struct bpf_image, tnode); - struct bpf_image *ib = container_of(b, struct bpf_image, tnode); - - return ia < ib; -} - -static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n) -{ - void *image = container_of(n, struct bpf_image, tnode); - - if (addr < image) - return -1; - if (addr >= image + PAGE_SIZE) - return 1; - - return 0; -} - -static const struct latch_tree_ops image_tree_ops = { - .less = image_tree_less, - .comp = image_tree_comp, -}; - -static void *__bpf_image_alloc(bool lock) -{ - struct bpf_image *image; - - image = bpf_jit_alloc_exec_page(); - if (!image) - return NULL; - - if (lock) - mutex_lock(&trampoline_mutex); - latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops); - if (lock) - mutex_unlock(&trampoline_mutex); - return image->data; + ksym->start = (unsigned long) data; + ksym->end = ksym->start + PAGE_SIZE; + bpf_ksym_add(ksym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, + PAGE_SIZE, false, ksym->name); } -void *bpf_image_alloc(void) +void bpf_image_ksym_del(struct bpf_ksym *ksym) { - return __bpf_image_alloc(true); + bpf_ksym_del(ksym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, + PAGE_SIZE, true, ksym->name); } -bool is_bpf_image_address(unsigned long addr) +static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) { - bool ret; - - rcu_read_lock(); - ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL; - rcu_read_unlock(); + struct bpf_ksym *ksym = &tr->ksym; - return ret; + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key); + bpf_image_ksym_add(tr->image, ksym); } struct bpf_trampoline *bpf_trampoline_lookup(u64 key) @@ -116,7 +82,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = __bpf_image_alloc(false); + image = bpf_jit_alloc_exec_page(); if (!image) { kfree(tr); tr = NULL; @@ -131,6 +97,8 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); tr->image = image; + INIT_LIST_HEAD_RCU(&tr->ksym.lnode); + bpf_trampoline_ksym_add(tr); out: mutex_unlock(&trampoline_mutex); return tr; @@ -190,40 +158,50 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } -/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 - * bytes on x86. 
Pick a number to fit into BPF_IMAGE_SIZE / 2 - */ -#define BPF_MAX_TRAMP_PROGS 40 +static struct bpf_tramp_progs * +bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) +{ + const struct bpf_prog_aux *aux; + struct bpf_tramp_progs *tprogs; + struct bpf_prog **progs; + int kind; + + *total = 0; + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) + return ERR_PTR(-ENOMEM); + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + tprogs[kind].nr_progs = tr->progs_cnt[kind]; + *total += tr->progs_cnt[kind]; + progs = tprogs[kind].progs; + + hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) + *progs++ = aux->prog; + } + return tprogs; +} static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2; - struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; - int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; - int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; - struct bpf_prog **progs, **fentry, **fexit; + void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; - struct bpf_prog_aux *aux; - int err; + int err, total; + + tprogs = bpf_trampoline_get_progs(tr, &total); + if (IS_ERR(tprogs)) + return PTR_ERR(tprogs); - if (fentry_cnt + fexit_cnt == 0) { + if (total == 0) { err = unregister_fentry(tr, old_image); tr->selector = 0; goto out; } - /* populate fentry progs */ - fentry = progs = progs_to_run; - hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist) - *progs++ = aux->prog; - - /* populate fexit progs */ - fexit = progs; - hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist) - *progs++ = aux->prog; - - if (fexit_cnt) + if (tprogs[BPF_TRAMP_FEXIT].nr_progs || + tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; /* Though the second half of trampoline page is unused a task could be @@ -232,12 +210,11 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. 
*/ + synchronize_rcu_tasks(); - err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2, - &tr->func.model, flags, - fentry, fentry_cnt, - fexit, fexit_cnt, + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + &tr->func.model, flags, tprogs, tr->func.addr); if (err < 0) goto out; @@ -252,6 +229,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; tr->selector++; out: + kfree(tprogs); return err; } @@ -260,6 +238,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) switch (t) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_MODIFY_RETURN: + return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; default: @@ -344,8 +324,6 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { - struct bpf_image *image; - if (!tr) return; mutex_lock(&trampoline_mutex); @@ -356,11 +334,10 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; - image = container_of(tr->image, struct bpf_image, data); - latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops); + bpf_image_ksym_del(&tr->ksym); /* wait for tasks to get out of trampoline before freeing it */ synchronize_rcu_tasks(); - bpf_jit_free_exec(image); + bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); out: @@ -375,6 +352,7 @@ out: * call __bpf_prog_exit */ u64 notrace __bpf_prog_enter(void) + __acquires(RCU) { u64 start = 0; @@ -386,6 +364,7 @@ u64 notrace __bpf_prog_enter(void) } void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) + __releases(RCU) { struct bpf_prog_stats *stats; @@ -409,8 +388,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_prog **fentry_progs, int fentry_cnt, - struct bpf_prog **fexit_progs, int fexit_cnt, + struct bpf_tramp_progs *tprogs, void *orig_call) { return -ENOTSUPP; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 289383edfc8c..745f3cfdf3b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19,6 +19,7 @@ #include <linux/sort.h> #include <linux/perf_event.h> #include <linux/ctype.h> +#include <linux/error-injection.h> #include "disasm.h" @@ -3649,7 +3650,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && func_id != BPF_FUNC_skb_output && - func_id != BPF_FUNC_perf_event_read_value) + func_id != BPF_FUNC_perf_event_read_value && + func_id != BPF_FUNC_xdp_output) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -3739,6 +3741,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: case BPF_FUNC_skb_output: + case BPF_FUNC_xdp_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; @@ -9800,6 +9803,26 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return 0; } +#define SECURITY_PREFIX "security_" + +static int check_attach_modify_return(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + unsigned long addr = (unsigned long) prog->aux->trampoline->func.addr; + + /* This is expected to be cleaned up in the future with the KRSI effort + * introducing the LSM_HOOK macro for cleaning up lsm_hooks.h. 
+ */ + if (within_error_injection_list(addr) || + !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, + sizeof(SECURITY_PREFIX) - 1)) + return 0; + + verbose(env, "fmod_ret attach_btf_id %u (%s) is not modifiable\n", + prog->aux->attach_btf_id, prog->aux->attach_func_name); + + return -EINVAL; +} static int check_attach_btf_id(struct bpf_verifier_env *env) { @@ -9950,6 +9973,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!prog_extension) return -EINVAL; /* fallthrough */ + case BPF_MODIFY_RETURN: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { @@ -9999,6 +10023,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } tr->func.addr = (void *)addr; prog->aux->trampoline = tr; + + if (prog->expected_attach_type == BPF_MODIFY_RETURN) + ret = check_attach_modify_return(env); out: mutex_unlock(&tr->mutex); if (ret) |
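For illustration only (not part of the diff above): the new bpf_get_ns_current_pid_tgid() helper added in kernel/bpf/helpers.c reports the current task's pid/tgid as seen from a given pid namespace, identified by the dev/ino pair of its /proc/<pid>/ns/pid file. Below is a minimal BPF-program-side sketch, assuming libbpf-style headers that already carry the helper declaration and struct bpf_pidns_info, and a loader that fills the two rodata globals (pidns_dev and pidns_ino are illustrative names) from stat() on /proc/self/ns/pid.

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch, not part of this patch. Assumes uapi/libbpf
 * headers matching this kernel and a loader that sets the globals.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

const volatile __u64 pidns_dev;	/* assumed: set by the loader from stat() */
const volatile __u64 pidns_ino;	/* assumed: set by the loader from stat() */

SEC("tracepoint/syscalls/sys_enter_nanosleep")
int trace_nanosleep(void *ctx)
{
	struct bpf_pidns_info ns = {};

	/* Fails (and zeroes ns) if dev/ino don't name the task's active pidns */
	if (bpf_get_ns_current_pid_tgid(pidns_dev, pidns_ino, &ns, sizeof(ns)))
		return 0;

	bpf_printk("ns-local pid %u tgid %u\n", ns.pid, ns.tgid);
	return 0;
}

char _license[] SEC("license") = "GPL";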
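Also for illustration (again not part of the diff): with the bpf_link changes in kernel/bpf/syscall.c and kernel/bpf/inode.c, the FD returned by BPF_RAW_TRACEPOINT_OPEN is now a bpf_link FD and can be pinned in bpffs via BPF_OBJ_PIN just like programs and maps. A user-space sketch using raw bpf(2) syscalls follows; the tracepoint name, pin path, and the origin of prog_fd are assumptions.

/* Illustrative sketch, not part of this patch. */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

/* prog_fd: an already-loaded BPF_PROG_TYPE_RAW_TRACEPOINT program (assumed) */
int attach_and_pin_raw_tp(int prog_fd)
{
	union bpf_attr attr;
	long link_fd;

	memset(&attr, 0, sizeof(attr));
	attr.raw_tracepoint.name = (__u64)(unsigned long)"sched_switch";
	attr.raw_tracepoint.prog_fd = prog_fd;
	link_fd = sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
	if (link_fd < 0)
		return -1;

	/* The returned FD is a bpf_link; pin it so the attachment outlives us. */
	memset(&attr, 0, sizeof(attr));
	attr.pathname = (__u64)(unsigned long)"/sys/fs/bpf/my_raw_tp_link";
	attr.bpf_fd = link_fd;
	return sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr)) ? -1 : 0;
}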