diff options
Diffstat (limited to 'kernel')
116 files changed, 5099 insertions, 1937 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f3218bc5ec69..9a20016d4900 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,13 +5,14 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ + sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o ucount.o + async.o range.o smpboot.o ucount.o regset.o +obj-$(CONFIG_BPFILTER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o @@ -35,7 +36,7 @@ KCOV_INSTRUMENT_stacktrace.o := n KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n -CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -48,6 +49,7 @@ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ +obj-y += entry/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -125,6 +127,7 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o +CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN) obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCSAN_SANITIZE_stackleak.o := n diff --git a/kernel/async.c b/kernel/async.c index 4f9c1d614016..33258e6e20f8 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -111,7 +111,7 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; - ktime_t uninitialized_var(calltime), delta, rettime; + ktime_t calltime, delta, rettime; /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { @@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); */ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { - ktime_t uninitialized_var(starttime), delta, endtime; + ktime_t starttime, delta, endtime; if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("async_waiting @ %i\n", task_pid_nr(current)); diff --git a/kernel/audit.c b/kernel/audit.c index b2301bdc9773..7efaece534a9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -136,6 +136,11 @@ u32 audit_sig_sid = 0; */ static atomic_t audit_lost = ATOMIC_INIT(0); +/* Monotonically increasing sum of time the kernel has spent + * waiting while the backlog limit is exceeded. + */ +static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); + /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -1201,17 +1206,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); - s.enabled = audit_enabled; - s.failure = audit_failure; + s.enabled = audit_enabled; + s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ - s.pid = auditd_pid_vnr(); - s.rate_limit = audit_rate_limit; - s.backlog_limit = audit_backlog_limit; - s.lost = atomic_read(&audit_lost); - s.backlog = skb_queue_len(&audit_queue); - s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time; + s.pid = auditd_pid_vnr(); + s.rate_limit = audit_rate_limit; + s.backlog_limit = audit_backlog_limit; + s.lost = atomic_read(&audit_lost); + s.backlog = skb_queue_len(&audit_queue); + s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; + s.backlog_wait_time = audit_backlog_wait_time; + s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -1315,6 +1321,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_config_change("lost", 0, lost, 1); return lost; } + if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { + u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); + + audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); + return actual; + } break; } case AUDIT_GET_FEATURE: @@ -1800,7 +1812,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, { struct audit_buffer *ab; struct timespec64 t; - unsigned int uninitialized_var(serial); + unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1826,12 +1838,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { + long rtime = stime; + DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - stime = schedule_timeout(stime); + stime = schedule_timeout(rtime); + atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) @@ -2079,13 +2094,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { - audit_log_string(ab, "<no_memory>"); + audit_log_format(ab, "\"<no_memory>\""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ - audit_log_string(ab, "<too_long>"); + audit_log_format(ab, "\"<too_long>\""); } else audit_log_untrustedstring(ab, p); kfree(pathname); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 3596448bfdab..bfcfcd61adb6 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -36,7 +36,7 @@ static struct fsnotify_group *audit_fsnotify_group; /* fsnotify events we care about. */ #define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD) + FS_MOVE_SELF) static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark) { @@ -152,35 +152,31 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) } /* Update mark data in audit rules based on fsnotify events. */ -static int audit_mark_handle_event(struct fsnotify_group *group, - struct inode *to_tell, - u32 mask, const void *data, int data_type, - const struct qstr *dname, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *dname) { - struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct audit_fsnotify_mark *audit_mark; - const struct inode *inode = fsnotify_data_inode(data, data_type); audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); - BUG_ON(group != audit_fsnotify_group); - - if (WARN_ON(!inode)) + if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) || + WARN_ON_ONCE(!inode)) return 0; if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) return 0; audit_update_mark(audit_mark, inode); - } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) + } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) { audit_autoremove_mark_rule(audit_mark); + } return 0; } static const struct fsnotify_ops audit_mark_fsnotify_ops = { - .handle_event = audit_mark_handle_event, + .handle_inode_event = audit_mark_handle_event, .free_mark = audit_fsnotify_free_mark, }; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e49c912f862d..83e1c07fc99e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -188,11 +188,9 @@ static struct fsnotify_mark *alloc_mark(void) static struct audit_chunk *alloc_chunk(int count) { struct audit_chunk *chunk; - size_t size; int i; - size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); - chunk = kzalloc(size, GFP_KERNEL); + chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL); if (!chunk) return NULL; @@ -1037,11 +1035,9 @@ static void evict_chunk(struct audit_chunk *chunk) audit_schedule_prune(); } -static int audit_tree_handle_event(struct fsnotify_group *group, - struct inode *to_tell, - u32 mask, const void *data, int data_type, - const struct qstr *file_name, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *file_name) { return 0; } @@ -1070,7 +1066,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark, } static const struct fsnotify_ops audit_tree_ops = { - .handle_event = audit_tree_handle_event, + .handle_inode_event = audit_tree_handle_event, .freeing_mark = audit_tree_freeing_mark, .free_mark = audit_tree_destroy_watch, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e09c551ae52d..246e5ba704c0 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -53,7 +53,7 @@ static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT) + FS_MOVE_SELF | FS_UNMOUNT) static void audit_free_parent(struct audit_parent *parent) { @@ -464,20 +464,17 @@ void audit_remove_watch_rule(struct audit_krule *krule) } /* Update watch data in audit rules based on fsnotify events. */ -static int audit_watch_handle_event(struct fsnotify_group *group, - struct inode *to_tell, - u32 mask, const void *data, int data_type, - const struct qstr *dname, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *dname) { - struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); - const struct inode *inode = fsnotify_data_inode(data, data_type); struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); - BUG_ON(group != audit_watch_group); - WARN_ON(!inode); + if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) || + WARN_ON_ONCE(!inode)) + return 0; if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); @@ -490,7 +487,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .handle_event = audit_watch_handle_event, + .handle_inode_event = audit_watch_handle_event, .free_mark = audit_watch_free_mark, }; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fd840c40abf7..8dba8f0983b5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -75,6 +75,7 @@ #include <linux/uaccess.h> #include <linux/fsnotify_backend.h> #include <uapi/linux/limits.h> +#include <uapi/linux/netfilter/nf_tables.h> #include "audit.h" @@ -136,9 +137,26 @@ struct audit_nfcfgop_tab { }; static const struct audit_nfcfgop_tab audit_nfcfgs[] = { - { AUDIT_XT_OP_REGISTER, "register" }, - { AUDIT_XT_OP_REPLACE, "replace" }, - { AUDIT_XT_OP_UNREGISTER, "unregister" }, + { AUDIT_XT_OP_REGISTER, "xt_register" }, + { AUDIT_XT_OP_REPLACE, "xt_replace" }, + { AUDIT_XT_OP_UNREGISTER, "xt_unregister" }, + { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" }, + { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" }, + { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" }, + { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" }, + { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" }, + { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" }, + { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" }, + { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" }, + { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" }, + { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" }, + { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" }, + { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" }, + { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" }, + { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" }, + { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, + { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, + { AUDIT_NFT_OP_INVALID, "nft_invalid" }, }; static int audit_match_perm(struct audit_context *ctx, int mask) @@ -1876,6 +1894,20 @@ __audit_reusename(const __user char *uptr) return NULL; } +inline void _audit_getcwd(struct audit_context *context) +{ + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); +} + +void __audit_getcwd(void) +{ + struct audit_context *context = audit_context(); + + if (context->in_syscall) + _audit_getcwd(context); +} + /** * __audit_getname - add a name to the list * @name: name to add @@ -1900,8 +1932,7 @@ void __audit_getname(struct filename *name) name->aname = n; name->refcnt++; - if (!context->pwd.dentry) - get_fs_pwd(current->fs, &context->pwd); + _audit_getcwd(context); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2557,12 +2588,12 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) } void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, - enum audit_nfcfgop op) + enum audit_nfcfgop op, gfp_t gfp) { struct audit_buffer *ab; char comm[sizeof(current->comm)]; - ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG); + ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG); if (!ab) return; audit_log_format(ab, "table=%s family=%u entries=%u op=%s", diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a2a97fa3071b..370217dd7e39 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -29,7 +29,7 @@ static void backtrace_test_irq_callback(unsigned long data) complete(&backtrace_work); } -static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); +static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback); static void backtrace_test_irq(void) { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1131a921e1a6..e6eb9c0402da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 11584618e861..8ff419b632a6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -386,13 +386,6 @@ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding programs to complete - * and free the array - */ - synchronize_rcu(); - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); @@ -494,6 +487,143 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + pgoff); } +struct bpf_iter_seq_array_map_info { + struct bpf_map *map; + void *percpu_value_buf; + u32 index; +}; + +static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_map *map = info->map; + struct bpf_array *array; + u32 index; + + if (info->index >= map->max_entries) + return NULL; + + if (*pos == 0) + ++*pos; + array = container_of(map, struct bpf_array, map); + index = info->index & array->index_mask; + if (info->percpu_value_buf) + return array->pptrs[index]; + return array->value + array->elem_size * index; +} + +static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_map *map = info->map; + struct bpf_array *array; + u32 index; + + ++*pos; + ++info->index; + if (info->index >= map->max_entries) + return NULL; + + array = container_of(map, struct bpf_array, map); + index = info->index & array->index_mask; + if (info->percpu_value_buf) + return array->pptrs[index]; + return array->value + array->elem_size * index; +} + +static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_map *map = info->map; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int off = 0, cpu = 0; + void __percpu **pptr; + u32 size; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, v == NULL); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (v) { + ctx.key = &info->index; + + if (!info->percpu_value_buf) { + ctx.value = v; + } else { + pptr = v; + size = round_up(map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu), + size); + off += size; + } + ctx.value = info->percpu_value_buf; + } + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_array_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_array_map_seq_show(seq, v); +} + +static void bpf_array_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_array_map_seq_show(seq, NULL); +} + +static int bpf_iter_init_array_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_array_map_info *seq_info = priv_data; + struct bpf_map *map = aux->map; + void *value_buf; + u32 buf_size; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); + if (!value_buf) + return -ENOMEM; + + seq_info->percpu_value_buf = value_buf; + } + + seq_info->map = map; + return 0; +} + +static void bpf_iter_fini_array_map(void *priv_data) +{ + struct bpf_iter_seq_array_map_info *seq_info = priv_data; + + kfree(seq_info->percpu_value_buf); +} + +static const struct seq_operations bpf_array_map_seq_ops = { + .start = bpf_array_map_seq_start, + .next = bpf_array_map_seq_next, + .stop = bpf_array_map_seq_stop, + .show = bpf_array_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_array_map_seq_ops, + .init_seq_private = bpf_iter_init_array_map, + .fini_seq_private = bpf_iter_fini_array_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), +}; + +static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -510,8 +640,12 @@ const struct bpf_map_ops array_map_ops = { .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, + .map_btf_name = "bpf_array", + .map_btf_id = &array_map_btf_id, + .iter_seq_info = &iter_seq_info, }; +static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -522,6 +656,9 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &percpu_array_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -540,8 +677,6 @@ static void fd_array_map_free(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - synchronize_rcu(); - /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); @@ -868,6 +1003,7 @@ static void prog_array_map_free(struct bpf_map *map) fd_array_map_free(map); } +static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, @@ -883,6 +1019,8 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, + .map_btf_name = "bpf_array", + .map_btf_id = &prog_array_map_btf_id, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -961,6 +1099,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } +static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -972,6 +1111,8 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &perf_event_array_map_btf_id, }; #ifdef CONFIG_CGROUPS @@ -994,6 +1135,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } +static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -1004,6 +1146,8 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &cgroup_array_map_btf_id, }; #endif @@ -1077,6 +1221,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } +static int array_of_maps_map_btf_id; const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, @@ -1089,4 +1234,6 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &array_of_maps_map_btf_id, }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dd612b80b9fe..b6715964b685 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -14,11 +14,13 @@ struct bpf_iter_target_info { struct bpf_iter_link { struct bpf_link link; + struct bpf_iter_aux_info aux; struct bpf_iter_target_info *tinfo; }; struct bpf_iter_priv_data { struct bpf_iter_target_info *tinfo; + const struct bpf_iter_seq_info *seq_info; struct bpf_prog *prog; u64 session_id; u64 seq_num; @@ -35,7 +37,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info); static void bpf_iter_inc_seq_num(struct seq_file *seq) { @@ -199,11 +202,25 @@ done: return copied; } +static const struct bpf_iter_seq_info * +__get_seq_info(struct bpf_iter_link *link) +{ + const struct bpf_iter_seq_info *seq_info; + + if (link->aux.map) { + seq_info = link->aux.map->ops->iter_seq_info; + if (seq_info) + return seq_info; + } + + return link->tinfo->reg_info->seq_info; +} + static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; - return prepare_seq_file(file, link); + return prepare_seq_file(file, link, __get_seq_info(link)); } static int iter_release(struct inode *inode, struct file *file) @@ -218,8 +235,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->reg_info->fini_seq_private) - iter_priv->tinfo->reg_info->fini_seq_private(seq->private); + if (iter_priv->seq_info->fini_seq_private) + iter_priv->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -318,6 +335,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) static void bpf_iter_link_release(struct bpf_link *link) { + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + if (iter_link->tinfo->reg_info->detach_target) + iter_link->tinfo->reg_info->detach_target(&iter_link->aux); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -368,16 +390,35 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + union bpf_iter_link_info __user *ulinfo; struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; + union bpf_iter_link_info linfo; struct bpf_iter_link *link; + u32 prog_btf_id, linfo_len; bool existed = false; - u32 prog_btf_id; int err; if (attr->link_create.target_fd || attr->link_create.flags) return -EINVAL; + memset(&linfo, 0, sizeof(union bpf_iter_link_info)); + + ulinfo = u64_to_user_ptr(attr->link_create.iter_info); + linfo_len = attr->link_create.iter_info_len; + if (!ulinfo ^ !linfo_len) + return -EINVAL; + + if (ulinfo) { + err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), + linfo_len); + if (err) + return err; + linfo_len = min_t(u32, linfo_len, sizeof(linfo)); + if (copy_from_user(&linfo, ulinfo, linfo_len)) + return -EFAULT; + } + prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -403,21 +444,32 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + if (tinfo->reg_info->attach_target) { + err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + } + return bpf_link_settle(&link_primer); } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, struct bpf_iter_target_info *tinfo, + const struct bpf_iter_seq_info *seq_info, struct bpf_prog *prog) { priv_data->tinfo = tinfo; + priv_data->seq_info = seq_info; priv_data->prog = prog; priv_data->session_id = atomic64_inc_return(&session_id); priv_data->seq_num = 0; priv_data->done_stop = false; } -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info) { struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; @@ -433,21 +485,21 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->reg_info->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + seq_info->seq_priv_size; + priv_data = __seq_open_private(file, seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->reg_info->init_seq_private) { - err = tinfo->reg_info->init_seq_private(priv_data->target_private); + if (seq_info->init_seq_private) { + err = seq_info->init_seq_private(priv_data->target_private, &link->aux); if (err) goto release_seq_file; } - init_seq_meta(priv_data, tinfo, prog); + init_seq_meta(priv_data, tinfo, seq_info, prog); seq = file->private_data; seq->private = priv_data->target_private; @@ -463,6 +515,7 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { + struct bpf_iter_link *iter_link; struct file *file; unsigned int flags; int err, fd; @@ -481,8 +534,8 @@ int bpf_iter_new_fd(struct bpf_link *link) goto free_fd; } - err = prepare_seq_file(file, - container_of(link, struct bpf_iter_link, link)); + iter_link = container_of(link, struct bpf_iter_link, link); + err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); if (err) goto free_file; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c6b0decaa46a..969c5d47f81f 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -611,6 +611,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return map; } +static int bpf_struct_ops_map_btf_id; const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, .map_alloc = bpf_struct_ops_map_alloc, @@ -620,6 +621,8 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, + .map_btf_name = "bpf_struct_ops_map", + .map_btf_id = &bpf_struct_ops_map_btf_id, }; /* "const void *" because some subsystem is diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0443600146dc..91afdd4c82e3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -18,6 +18,7 @@ #include <linux/sort.h> #include <linux/bpf_verifier.h> #include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/skmsg.h> #include <linux/perf_event.h> #include <net/sock.h> @@ -3571,6 +3572,41 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, return ctx_type; } +static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_LINK_TYPE(_id, _name) +#define BPF_MAP_TYPE(_id, _ops) \ + [_id] = &_ops, +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_LINK_TYPE +#undef BPF_MAP_TYPE +}; + +static int btf_vmlinux_map_ids_init(const struct btf *btf, + struct bpf_verifier_log *log) +{ + const struct bpf_map_ops *ops; + int i, btf_id; + + for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) { + ops = btf_vmlinux_map_ops[i]; + if (!ops || (!ops->map_btf_name && !ops->map_btf_id)) + continue; + if (!ops->map_btf_name || !ops->map_btf_id) { + bpf_log(log, "map type %d is misconfigured\n", i); + return -EINVAL; + } + btf_id = btf_find_by_name_kind(btf, ops->map_btf_name, + BTF_KIND_STRUCT); + if (btf_id < 0) + return btf_id; + *ops->map_btf_id = btf_id; + } + + return 0; +} + static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -3586,12 +3622,15 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, return kern_ctx_type->type; } +BTF_ID_LIST(bpf_ctx_convert_btf_id) +BTF_ID(struct, bpf_ctx_convert) + struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; - int err, i; + int err; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) @@ -3624,25 +3663,13 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; - /* find struct bpf_ctx_convert for type checking later */ - for (i = 1; i <= btf->nr_types; i++) { - const struct btf_type *t; - const char *tname; + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ + bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); - t = btf_type_by_id(btf, i); - if (!__btf_type_is_struct(t)) - continue; - tname = __btf_name_by_offset(btf, t->name_off); - if (!strcmp(tname, "bpf_ctx_convert")) { - /* btf_parse_vmlinux() runs under bpf_verifier_lock */ - bpf_ctx_convert.t = t; - break; - } - } - if (i > btf->nr_types) { - err = -ENOENT; + /* find bpf map structs for map_ptr access checking */ + err = btf_vmlinux_map_ids_init(btf, log); + if (err < 0) goto errout; - } bpf_struct_ops_init(btf, log); @@ -3779,6 +3806,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } + + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off && + (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || + ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + info->reg_type = ctx_arg_info->reg_type; + return true; + } + } + if (t->type == 0) /* This is a pointer to void. * It is the same as scalar from the verifier safety pov. @@ -3790,16 +3830,17 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - info->reg_type = PTR_TO_BTF_ID; for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; if (ctx_arg_info->offset == off) { info->reg_type = ctx_arg_info->reg_type; - break; + info->btf_id = ctx_arg_info->btf_id; + return true; } } + info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); if (ret > 0) { @@ -4049,101 +4090,17 @@ error: return -EINVAL; } -static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, - int arg) -{ - char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; - const struct btf_param *args; - const struct btf_type *t; - const char *tname, *sym; - u32 btf_id, i; - - if (!btf_vmlinux) { - bpf_log(log, "btf_vmlinux doesn't exist\n"); - return -EINVAL; - } - - if (IS_ERR(btf_vmlinux)) { - bpf_log(log, "btf_vmlinux is malformed\n"); - return -EINVAL; - } - - sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4); - if (!sym) { - bpf_log(log, "kernel doesn't have kallsyms\n"); - return -EFAULT; - } - - for (i = 1; i <= btf_vmlinux->nr_types; i++) { - t = btf_type_by_id(btf_vmlinux, i); - if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) - continue; - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); - if (!strcmp(tname, fnname)) - break; - } - if (i > btf_vmlinux->nr_types) { - bpf_log(log, "helper %s type is not found\n", fnname); - return -ENOENT; - } - - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_ptr(t)) - return -EFAULT; - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_func_proto(t)) - return -EFAULT; - - args = (const struct btf_param *)(t + 1); - if (arg >= btf_type_vlen(t)) { - bpf_log(log, "bpf helper %s doesn't have %d-th argument\n", - fnname, arg); - return -EINVAL; - } - - t = btf_type_by_id(btf_vmlinux, args[arg].type); - if (!btf_type_is_ptr(t) || !t->type) { - /* anything but the pointer to struct is a helper config bug */ - bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n"); - return -EFAULT; - } - btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); - /* skip modifiers */ - while (btf_type_is_modifier(t)) { - btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); - } - if (!btf_type_is_struct(t)) { - bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n"); - return -EFAULT; - } - bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4, - arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off)); - return btf_id; -} - int btf_resolve_helper_id(struct bpf_verifier_log *log, const struct bpf_func_proto *fn, int arg) { - int *btf_id = &fn->btf_id[arg]; - int ret; + int id; - if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID) + if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID || !btf_vmlinux) return -EINVAL; - - ret = READ_ONCE(*btf_id); - if (ret) - return ret; - /* ok to race the search. The result is the same */ - ret = __btf_resolve_helper_id(log, fn->func, arg); - if (!ret) { - /* Function argument cannot be type 'void' */ - bpf_log(log, "BTF resolution bug\n"); - return -EFAULT; - } - WRITE_ONCE(*btf_id, ret); - return ret; + id = fn->btf_id[arg]; + if (!id || id > btf_vmlinux->nr_types) + return -EINVAL; + return id; } static int __get_type_size(struct btf *btf, u32 btf_id, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ac53102e244a..83ff127ef7ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -37,17 +37,34 @@ static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[]) } static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[], - struct bpf_prog *prog) + struct bpf_cgroup_storage *new_storages[], + enum bpf_attach_type type, + struct bpf_prog *prog, + struct cgroup *cgrp) { enum bpf_cgroup_storage_type stype; + struct bpf_cgroup_storage_key key; + struct bpf_map *map; + + key.cgroup_inode_id = cgroup_id(cgrp); + key.attach_type = type; for_each_cgroup_storage_type(stype) { + map = prog->aux->cgroup_storage[stype]; + if (!map) + continue; + + storages[stype] = cgroup_storage_lookup((void *)map, &key, false); + if (storages[stype]) + continue; + storages[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storages[stype])) { - storages[stype] = NULL; - bpf_cgroup_storages_free(storages); + bpf_cgroup_storages_free(new_storages); return -ENOMEM; } + + new_storages[stype] = storages[stype]; } return 0; @@ -63,7 +80,7 @@ static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[], } static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], - struct cgroup* cgrp, + struct cgroup *cgrp, enum bpf_attach_type attach_type) { enum bpf_cgroup_storage_type stype; @@ -72,14 +89,6 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], bpf_cgroup_storage_link(storages[stype], cgrp, attach_type); } -static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) -{ - enum bpf_cgroup_storage_type stype; - - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_unlink(storages[stype]); -} - /* Called when bpf_cgroup_link is auto-detached from dying cgroup. * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It * doesn't free link memory, which will eventually be done by bpf_link's @@ -101,22 +110,23 @@ static void cgroup_bpf_release(struct work_struct *work) struct cgroup *p, *cgrp = container_of(work, struct cgroup, bpf.release_work); struct bpf_prog_array *old_array; + struct list_head *storages = &cgrp->bpf.storages; + struct bpf_cgroup_storage *storage, *stmp; + unsigned int type; mutex_lock(&cgroup_mutex); for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; - struct bpf_prog_list *pl, *tmp; + struct bpf_prog_list *pl, *pltmp; - list_for_each_entry_safe(pl, tmp, progs, node) { + list_for_each_entry_safe(pl, pltmp, progs, node) { list_del(&pl->node); if (pl->prog) bpf_prog_put(pl->prog); if (pl->link) bpf_cgroup_link_auto_detach(pl->link); - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -126,6 +136,11 @@ static void cgroup_bpf_release(struct work_struct *work) bpf_prog_array_free(old_array); } + list_for_each_entry_safe(storage, stmp, storages, list_cg) { + bpf_cgroup_storage_unlink(storage); + bpf_cgroup_storage_free(storage); + } + mutex_unlock(&cgroup_mutex); for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) @@ -290,6 +305,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); + INIT_LIST_HEAD(&cgrp->bpf.storages); + for (i = 0; i < NR; i++) if (compute_effective_progs(cgrp, i, &arrays[i])) goto cleanup; @@ -422,7 +439,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; - struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl; int err; @@ -455,17 +472,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (IS_ERR(pl)) return PTR_ERR(pl); - if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)) + if (bpf_cgroup_storages_alloc(storage, new_storage, type, + prog ? : link->link.prog, cgrp)) return -ENOMEM; if (pl) { old_prog = pl->prog; - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_assign(old_storage, pl->storage); } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storages_free(storage); + bpf_cgroup_storages_free(new_storage); return -ENOMEM; } list_add_tail(&pl->node, progs); @@ -480,12 +496,11 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (err) goto cleanup; - bpf_cgroup_storages_free(old_storage); if (old_prog) bpf_prog_put(old_prog); else static_branch_inc(&cgroup_bpf_enabled_key); - bpf_cgroup_storages_link(pl->storage, cgrp, type); + bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; cleanup: @@ -493,9 +508,7 @@ cleanup: pl->prog = old_prog; pl->link = NULL; } - bpf_cgroup_storages_free(pl->storage); - bpf_cgroup_storages_assign(pl->storage, old_storage); - bpf_cgroup_storages_link(pl->storage, cgrp, type); + bpf_cgroup_storages_free(new_storage); if (!old_prog) { list_del(&pl->node); kfree(pl); @@ -679,8 +692,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ @@ -803,6 +814,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); + struct cgroup *cg; /* link might have been auto-detached by dying cgroup already, * in that case our work is done here @@ -821,8 +833,12 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + cg = cg_link->cgroup; + cg_link->cgroup = NULL; + mutex_unlock(&cgroup_mutex); - cgroup_put(cg_link->cgroup); + + cgroup_put(cg); } static void bpf_cgroup_link_dealloc(struct bpf_link *link) @@ -833,6 +849,13 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } +static int bpf_cgroup_link_detach(struct bpf_link *link) +{ + bpf_cgroup_link_release(link); + + return 0; +} + static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { @@ -872,6 +895,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .detach = bpf_cgroup_link_detach, .update_prog = cgroup_bpf_replace, .show_fdinfo = bpf_cgroup_link_show_fdinfo, .fill_link_info = bpf_cgroup_link_fill_link_info, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9df4cc9a2907..ed0b3578867c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1958,6 +1958,61 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array, } } +/** + * bpf_prog_array_delete_safe_at() - Replaces the program at the given + * index into the program array with + * a dummy no-op program. + * @array: a bpf_prog_array + * @index: the index of the program to replace + * + * Skips over dummy programs, by not counting them, when calculating + * the position of the program to replace. + * + * Return: + * * 0 - Success + * * -EINVAL - Invalid index value. Must be a non-negative integer. + * * -ENOENT - Index out of range + */ +int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index) +{ + return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog); +} + +/** + * bpf_prog_array_update_at() - Updates the program at the given index + * into the program array. + * @array: a bpf_prog_array + * @index: the index of the program to update + * @prog: the program to insert into the array + * + * Skips over dummy programs, by not counting them, when calculating + * the position of the program to update. + * + * Return: + * * 0 - Success + * * -EINVAL - Invalid index value. Must be a non-negative integer. + * * -ENOENT - Index out of range + */ +int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, + struct bpf_prog *prog) +{ + struct bpf_prog_array_item *item; + + if (unlikely(index < 0)) + return -EINVAL; + + for (item = array->items; item->prog; item++) { + if (item->prog == &dummy_bpf_prog.prog) + continue; + if (!index) { + WRITE_ONCE(item->prog, prog); + return 0; + } + index--; + } + return -ENOENT; +} + int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, @@ -2042,24 +2097,12 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, : 0; } -static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) -{ - enum bpf_cgroup_storage_type stype; - - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]); - } -} - void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; u32 i; - bpf_free_cgroup_storage(aux); for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 27595fc6da56..f1c46529929b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -52,7 +52,6 @@ struct xdp_bulk_queue { struct bpf_cpu_map_entry { u32 cpu; /* kthread CPU and map index */ int map_id; /* Back reference to map */ - u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; @@ -62,10 +61,14 @@ struct bpf_cpu_map_entry { /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; - struct work_struct kthread_stop_wq; + + struct bpf_cpumap_val value; + struct bpf_prog *prog; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; + + struct work_struct kthread_stop_wq; }; struct bpf_cpu_map { @@ -80,6 +83,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { + u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; u64 cost; @@ -90,7 +94,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + (value_size != offsetofend(struct bpf_cpumap_val, qsize) && + value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || + attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); cmap = kzalloc(sizeof(*cmap), GFP_USER); @@ -212,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { if (atomic_dec_and_test(&rcpu->refcnt)) { + if (rcpu->prog) + bpf_prog_put(rcpu->prog); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); @@ -220,6 +228,75 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, + void **frames, int n, + struct xdp_cpumap_stats *stats) +{ + struct xdp_rxq_info rxq; + struct xdp_buff xdp; + int i, nframes = 0; + + if (!rcpu->prog) + return n; + + rcu_read_lock_bh(); + + xdp_set_return_frame_no_direct(); + xdp.rxq = &rxq; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + rxq.dev = xdpf->dev_rx; + rxq.mem = xdpf->mem; + /* TODO: report queue_index to xdp_rxq_info */ + + xdp_convert_frame_to_buff(xdpf, &xdp); + + act = bpf_prog_run_xdp(rcpu->prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (err < 0) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + frames[nframes++] = xdpf; + stats->pass++; + } + break; + case XDP_REDIRECT: + err = xdp_do_redirect(xdpf->dev_rx, &xdp, + rcpu->prog); + if (unlikely(err)) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + stats->redirect++; + } + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough */ + case XDP_DROP: + xdp_return_frame(xdpf); + stats->drop++; + break; + } + } + + if (stats->redirect) + xdp_do_flush_map(); + + xdp_clear_return_frame_no_direct(); + + rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ + + return nframes; +} + #define CPUMAP_BATCH 8 static int cpu_map_kthread_run(void *data) @@ -234,11 +311,12 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_cpumap_stats stats = {}; /* zero stats */ + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m; + int i, n, m, nframes; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -259,8 +337,8 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - + n = __ptr_ring_consume_batched(rcpu->queue, frames, + CPUMAP_BATCH); for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page = virt_to_page(f); @@ -272,15 +350,19 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } - m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < n; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - drops = n; + /* Support running another XDP prog on this CPU */ + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); + if (nframes) { + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < nframes; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops += nframes; + } } local_bh_disable(); - for (i = 0; i < n; i++) { + for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; int ret; @@ -297,7 +379,7 @@ static int cpu_map_kthread_run(void *data) drops++; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats); local_bh_enable(); /* resched point, may call do_softirq() */ } @@ -307,13 +389,38 @@ static int cpu_map_kthread_run(void *data) return 0; } -static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, - int map_id) +bool cpu_map_prog_allowed(struct bpf_map *map) { + return map->map_type == BPF_MAP_TYPE_CPUMAP && + map->value_size != offsetofend(struct bpf_cpumap_val, qsize); +} + +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + bpf_prog_put(prog); + return -EINVAL; + } + + rcpu->value.bpf_prog.id = prog->aux->id; + rcpu->prog = prog; + + return 0; +} + +static struct bpf_cpu_map_entry * +__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) +{ + int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; - int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -338,19 +445,22 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->queue) goto free_bulkq; - err = ptr_ring_init(rcpu->queue, qsize, gfp); + err = ptr_ring_init(rcpu->queue, value->qsize, gfp); if (err) goto free_queue; rcpu->cpu = cpu; rcpu->map_id = map_id; - rcpu->qsize = qsize; + rcpu->value.qsize = value->qsize; + + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + goto free_ptr_ring; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map_id); if (IS_ERR(rcpu->kthread)) - goto free_ptr_ring; + goto free_prog; get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ @@ -361,6 +471,9 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, return rcpu; +free_prog: + if (rcpu->prog) + bpf_prog_put(rcpu->prog); free_ptr_ring: ptr_ring_cleanup(rcpu->queue, NULL); free_queue: @@ -437,12 +550,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpumap_val cpumap_value = {}; struct bpf_cpu_map_entry *rcpu; - /* Array index key correspond to CPU number */ u32 key_cpu = *(u32 *)key; - /* Value is the queue size */ - u32 qsize = *(u32 *)value; + + memcpy(&cpumap_value, value, map->value_size); if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -450,18 +563,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */ return -EOVERFLOW; /* Make sure CPU is a valid possible cpu */ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) return -ENODEV; - if (qsize == 0) { + if (cpumap_value.qsize == 0) { rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); if (!rcpu) return -ENOMEM; rcpu->cmap = cmap; @@ -523,7 +636,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_cpu_map_entry *rcpu = __cpu_map_lookup_elem(map, *(u32 *)key); - return rcpu ? &rcpu->qsize : NULL; + return rcpu ? &rcpu->value : NULL; } static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -543,6 +656,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { .map_alloc = cpu_map_alloc, .map_free = cpu_map_free, @@ -551,6 +665,8 @@ const struct bpf_map_ops cpu_map_ops = { .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_cpu_map", + .map_btf_id = &cpu_map_btf_id, }; static int bq_flush_to_queue(struct xdp_bulk_queue *bq) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 5fdbc776a760..10abb06065bb 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -749,6 +749,7 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -757,8 +758,11 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_dtab", + .map_btf_id = &dev_map_btf_id, }; +static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -767,6 +771,8 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_dtab", + .map_btf_id = &dev_map_hash_map_btf_id, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b32cc8ce8ff6..78dfff6a501b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1296,12 +1296,10 @@ static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete + /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. + * bpf_free_used_maps() is called after bpf prog is no longer executing. + * There is no need to synchronize_rcu() here to protect map elements. */ - synchronize_rcu(); /* some of free_htab_elem() callbacks for elements of this map may * not have executed. Wait for them. @@ -1620,6 +1618,197 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, true, false); } +struct bpf_iter_seq_hash_map_info { + struct bpf_map *map; + struct bpf_htab *htab; + void *percpu_value_buf; // non-zero means percpu hash + unsigned long flags; + u32 bucket_id; + u32 skip_elems; +}; + +static struct htab_elem * +bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, + struct htab_elem *prev_elem) +{ + const struct bpf_htab *htab = info->htab; + unsigned long flags = info->flags; + u32 skip_elems = info->skip_elems; + u32 bucket_id = info->bucket_id; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + struct htab_elem *elem; + struct bucket *b; + u32 i, count; + + if (bucket_id >= htab->n_buckets) + return NULL; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + /* no update/deletion on this bucket, prev_elem should be still valid + * and we won't skip elements. + */ + n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); + elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); + if (elem) + return elem; + + /* not found, unlock and go to the next bucket */ + b = &htab->buckets[bucket_id++]; + htab_unlock_bucket(htab, b, flags); + skip_elems = 0; + } + + for (i = bucket_id; i < htab->n_buckets; i++) { + b = &htab->buckets[i]; + flags = htab_lock_bucket(htab, b); + + count = 0; + head = &b->head; + hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + if (count >= skip_elems) { + info->flags = flags; + info->bucket_id = i; + info->skip_elems = count; + return elem; + } + count++; + } + + htab_unlock_bucket(htab, b, flags); + skip_elems = 0; + } + + info->bucket_id = i; + info->skip_elems = 0; + return NULL; +} + +static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + struct htab_elem *elem; + + elem = bpf_hash_map_seq_find_next(info, NULL); + if (!elem) + return NULL; + + if (*pos == 0) + ++*pos; + return elem; +} + +static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + + ++*pos; + ++info->skip_elems; + return bpf_hash_map_seq_find_next(info, v); +} + +static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + u32 roundup_key_size, roundup_value_size; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_map *map = info->map; + struct bpf_iter_meta meta; + int ret = 0, off = 0, cpu; + struct bpf_prog *prog; + void __percpu *pptr; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, elem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + roundup_key_size = round_up(map->key_size, 8); + ctx.key = elem->key; + if (!info->percpu_value_buf) { + ctx.value = elem->key + roundup_key_size; + } else { + roundup_value_size = round_up(map->value_size, 8); + pptr = htab_elem_get_ptr(elem, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu), + roundup_value_size); + off += roundup_value_size; + } + ctx.value = info->percpu_value_buf; + } + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_hash_map_seq_show(seq, v); +} + +static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + + if (!v) + (void)__bpf_hash_map_seq_show(seq, NULL); + else + htab_unlock_bucket(info->htab, + &info->htab->buckets[info->bucket_id], + info->flags); +} + +static int bpf_iter_init_hash_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + struct bpf_map *map = aux->map; + void *value_buf; + u32 buf_size; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); + if (!value_buf) + return -ENOMEM; + + seq_info->percpu_value_buf = value_buf; + } + + seq_info->map = map; + seq_info->htab = container_of(map, struct bpf_htab, map); + return 0; +} + +static void bpf_iter_fini_hash_map(void *priv_data) +{ + struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + + kfree(seq_info->percpu_value_buf); +} + +static const struct seq_operations bpf_hash_map_seq_ops = { + .start = bpf_hash_map_seq_start, + .next = bpf_hash_map_seq_next, + .stop = bpf_hash_map_seq_stop, + .show = bpf_hash_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_hash_map_seq_ops, + .init_seq_private = bpf_iter_init_hash_map, + .fini_seq_private = bpf_iter_fini_hash_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), +}; + +static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1631,8 +1820,12 @@ const struct bpf_map_ops htab_map_ops = { .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, BATCH_OPS(htab), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_map_btf_id, + .iter_seq_info = &iter_seq_info, }; +static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1645,6 +1838,9 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, BATCH_OPS(htab_lru), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_lru_map_btf_id, + .iter_seq_info = &iter_seq_info, }; /* Called from eBPF program */ @@ -1749,6 +1945,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1759,8 +1956,12 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, BATCH_OPS(htab_percpu), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_percpu_map_btf_id, + .iter_seq_info = &iter_seq_info, }; +static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1771,6 +1972,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, BATCH_OPS(htab_lru_percpu), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_lru_percpu_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) @@ -1893,6 +2097,7 @@ static void htab_of_map_free(struct bpf_map *map) fd_htab_map_free(map); } +static int htab_of_maps_map_btf_id; const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, @@ -1905,4 +2110,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_of_maps_map_btf_id, }; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 33d01866bcc2..571bb351ed3b 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -13,6 +13,8 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO #ifdef CONFIG_CGROUP_BPF +#include "../cgroup/cgroup-internal.h" + #define LOCAL_STORAGE_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) @@ -20,7 +22,6 @@ struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; - struct bpf_prog_aux *aux; struct rb_root root; struct list_head list; }; @@ -30,24 +31,41 @@ static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) return container_of(map, struct bpf_cgroup_storage_map, map); } -static int bpf_cgroup_storage_key_cmp( - const struct bpf_cgroup_storage_key *key1, - const struct bpf_cgroup_storage_key *key2) +static bool attach_type_isolated(const struct bpf_map *map) { - if (key1->cgroup_inode_id < key2->cgroup_inode_id) - return -1; - else if (key1->cgroup_inode_id > key2->cgroup_inode_id) - return 1; - else if (key1->attach_type < key2->attach_type) - return -1; - else if (key1->attach_type > key2->attach_type) - return 1; + return map->key_size == sizeof(struct bpf_cgroup_storage_key); +} + +static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, + const void *_key1, const void *_key2) +{ + if (attach_type_isolated(&map->map)) { + const struct bpf_cgroup_storage_key *key1 = _key1; + const struct bpf_cgroup_storage_key *key2 = _key2; + + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + } else { + const __u64 *cgroup_inode_id1 = _key1; + const __u64 *cgroup_inode_id2 = _key2; + + if (*cgroup_inode_id1 < *cgroup_inode_id2) + return -1; + else if (*cgroup_inode_id1 > *cgroup_inode_id2) + return 1; + } return 0; } -static struct bpf_cgroup_storage *cgroup_storage_lookup( - struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, - bool locked) +struct bpf_cgroup_storage * +cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, + void *key, bool locked) { struct rb_root *root = &map->root; struct rb_node *node; @@ -61,7 +79,7 @@ static struct bpf_cgroup_storage *cgroup_storage_lookup( storage = container_of(node, struct bpf_cgroup_storage, node); - switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { case -1: node = node->rb_left; break; @@ -93,7 +111,7 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, this = container_of(*new, struct bpf_cgroup_storage, node); parent = *new; - switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { case -1: new = &((*new)->rb_left); break; @@ -111,10 +129,9 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, return 0; } -static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; storage = cgroup_storage_lookup(map, key, false); @@ -124,17 +141,13 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) return &READ_ONCE(storage->buf)->data[0]; } -static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, +static int cgroup_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) - return -EINVAL; - - if (unlikely(flags & BPF_NOEXIST)) + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && @@ -167,11 +180,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, void *value) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; @@ -197,11 +209,10 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; @@ -232,12 +243,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, return 0; } -static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, void *_next_key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage_key *next = _next_key; struct bpf_cgroup_storage *storage; spin_lock_bh(&map->lock); @@ -250,17 +259,23 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, if (!storage) goto enoent; - storage = list_next_entry(storage, list); + storage = list_next_entry(storage, list_map); if (!storage) goto enoent; } else { storage = list_first_entry(&map->list, - struct bpf_cgroup_storage, list); + struct bpf_cgroup_storage, list_map); } spin_unlock_bh(&map->lock); - next->attach_type = storage->key.attach_type; - next->cgroup_inode_id = storage->key.cgroup_inode_id; + + if (attach_type_isolated(&map->map)) { + struct bpf_cgroup_storage_key *next = _next_key; + *next = storage->key; + } else { + __u64 *next = _next_key; + *next = storage->key.cgroup_inode_id; + } return 0; enoent: @@ -275,7 +290,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) struct bpf_map_memory mem; int ret; - if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && + attr->key_size != sizeof(__u64)) return ERR_PTR(-EINVAL); if (attr->value_size == 0) @@ -318,6 +334,17 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *_map) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct list_head *storages = &map->list; + struct bpf_cgroup_storage *storage, *stmp; + + mutex_lock(&cgroup_mutex); + + list_for_each_entry_safe(storage, stmp, storages, list_map) { + bpf_cgroup_storage_unlink(storage); + bpf_cgroup_storage_free(storage); + } + + mutex_unlock(&cgroup_mutex); WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); @@ -335,49 +362,63 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - struct btf_member *m; - u32 offset, size; - - /* Key is expected to be of struct bpf_cgroup_storage_key type, - * which is: - * struct bpf_cgroup_storage_key { - * __u64 cgroup_inode_id; - * __u32 attach_type; - * }; - */ + if (attach_type_isolated(map)) { + struct btf_member *m; + u32 offset, size; + + /* Key is expected to be of struct bpf_cgroup_storage_key type, + * which is: + * struct bpf_cgroup_storage_key { + * __u64 cgroup_inode_id; + * __u32 attach_type; + * }; + */ + + /* + * Key_type must be a structure with two fields. + */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || + BTF_INFO_VLEN(key_type->info) != 2) + return -EINVAL; + + /* + * The first field must be a 64 bit integer at 0 offset. + */ + m = (struct btf_member *)(key_type + 1); + size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); + if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) + return -EINVAL; + + /* + * The second field must be a 32 bit integer at 64 bit offset. + */ + m++; + offset = offsetof(struct bpf_cgroup_storage_key, attach_type); + size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); + if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) + return -EINVAL; + } else { + u32 int_data; - /* - * Key_type must be a structure with two fields. - */ - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || - BTF_INFO_VLEN(key_type->info) != 2) - return -EINVAL; + /* + * Key is expected to be u64, which stores the cgroup_inode_id + */ - /* - * The first field must be a 64 bit integer at 0 offset. - */ - m = (struct btf_member *)(key_type + 1); - size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); - if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) - return -EINVAL; + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; - /* - * The second field must be a 32 bit integer at 64 bit offset. - */ - m++; - offset = offsetof(struct bpf_cgroup_storage_key, attach_type); - size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); - if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) - return -EINVAL; + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + } return 0; } -static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, +static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu; @@ -409,6 +450,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, rcu_read_unlock(); } +static int cgroup_storage_map_btf_id; const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -418,43 +460,20 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, + .map_btf_name = "bpf_cgroup_storage_map", + .map_btf_id = &cgroup_storage_map_btf_id, }; int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - int ret = -EBUSY; - - spin_lock_bh(&map->lock); - if (map->aux && map->aux != aux) - goto unlock; if (aux->cgroup_storage[stype] && aux->cgroup_storage[stype] != _map) - goto unlock; + return -EBUSY; - map->aux = aux; aux->cgroup_storage[stype] = _map; - ret = 0; -unlock: - spin_unlock_bh(&map->lock); - - return ret; -} - -void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map) -{ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - - spin_lock_bh(&map->lock); - if (map->aux == aux) { - WARN_ON(aux->cgroup_storage[stype] != _map); - map->aux = NULL; - aux->cgroup_storage[stype] = NULL; - } - spin_unlock_bh(&map->lock); + return 0; } static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) @@ -575,7 +594,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, spin_lock_bh(&map->lock); WARN_ON(cgroup_storage_insert(map, storage)); - list_add(&storage->list, &map->list); + list_add(&storage->list_map, &map->list); + list_add(&storage->list_cg, &cgroup->bpf.storages); spin_unlock_bh(&map->lock); } @@ -593,7 +613,8 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) root = &map->root; rb_erase(&storage->node, root); - list_del(&storage->list); + list_del(&storage->list_map); + list_del(&storage->list_cg); spin_unlock_bh(&map->lock); } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index c8cc4e4cf98d..44474bf3ab7a 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -589,11 +589,6 @@ static void trie_free(struct bpf_map *map) struct lpm_trie_node __rcu **slot; struct lpm_trie_node *node; - /* Wait for outstanding programs to complete - * update/lookup/delete/get_next_key and free the trie. - */ - synchronize_rcu(); - /* Always start at the root and walk down to a node that has no * children. Then free that node, nullify its reference in the parent * and start over. @@ -735,6 +730,7 @@ static int trie_check_btf(const struct bpf_map *map, -EINVAL : 0; } +static int trie_map_btf_id; const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -743,4 +739,6 @@ const struct bpf_map_ops trie_map_ops = { .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, .map_check_btf = trie_check_btf, + .map_btf_name = "lpm_trie", + .map_btf_id = &trie_map_btf_id, }; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c69071e334bf..af86048e5afd 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -4,9 +4,10 @@ #include <linux/fs.h> #include <linux/filter.h> #include <linux/kernel.h> +#include <linux/btf_ids.h> struct bpf_iter_seq_map_info { - u32 mid; + u32 map_id; }; static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) @@ -14,27 +15,23 @@ static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_map_info *info = seq->private; struct bpf_map *map; - map = bpf_map_get_curr_or_next(&info->mid); + map = bpf_map_get_curr_or_next(&info->map_id); if (!map) return NULL; - ++*pos; + if (*pos == 0) + ++*pos; return map; } static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_map_info *info = seq->private; - struct bpf_map *map; ++*pos; - ++info->mid; + ++info->map_id; bpf_map_put((struct bpf_map *)v); - map = bpf_map_get_curr_or_next(&info->mid); - if (!map) - return NULL; - - return map; + return bpf_map_get_curr_or_next(&info->map_id); } struct bpf_iter__bpf_map { @@ -81,22 +78,103 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; -static const struct bpf_iter_reg bpf_map_reg_info = { - .target = "bpf_map", +BTF_ID_LIST(btf_bpf_map_id) +BTF_ID(struct, bpf_map) + +static const struct bpf_iter_seq_info bpf_map_seq_info = { .seq_ops = &bpf_map_seq_ops, .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + +static struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map, map), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &bpf_map_seq_info, +}; + +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + u32 key_acc_size, value_acc_size, key_size, value_size; + struct bpf_map *map; + bool is_percpu = false; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + is_percpu = true; + else if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) + goto put_map; + + key_acc_size = prog->aux->max_rdonly_access; + value_acc_size = prog->aux->max_rdwr_access; + key_size = map->key_size; + if (!is_percpu) + value_size = map->value_size; + else + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + + if (key_acc_size > key_size || value_acc_size > value_size) { + err = -EACCES; + goto put_map; + } + + aux->map = map; + return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); +} + +DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, void *value) + +static const struct bpf_iter_reg bpf_map_elem_reg_info = { + .target = "bpf_map_elem", + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map_elem, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__bpf_map_elem, value), + PTR_TO_RDWR_BUF_OR_NULL }, + }, }; static int __init bpf_map_iter_init(void) { - return bpf_iter_reg_target(&bpf_map_reg_info); + int ret; + + bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id; + ret = bpf_iter_reg_target(&bpf_map_reg_info); + if (ret) + return ret; + + return bpf_iter_reg_target(&bpf_map_elem_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 310241ca7991..542f275bf252 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -25,6 +25,32 @@ struct bpf_netns_link { /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); +static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) +{ + switch (type) { +#ifdef CONFIG_INET + case NETNS_BPF_SK_LOOKUP: + static_branch_dec(&bpf_sk_lookup_enabled); + break; +#endif + default: + break; + } +} + +static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type) +{ + switch (type) { +#ifdef CONFIG_INET + case NETNS_BPF_SK_LOOKUP: + static_branch_inc(&bpf_sk_lookup_enabled); + break; +#endif + default: + break; + } +} + /* Must be called with netns_bpf_mutex held. */ static void netns_bpf_run_array_detach(struct net *net, enum netns_bpf_attach_type type) @@ -36,12 +62,50 @@ static void netns_bpf_run_array_detach(struct net *net, bpf_prog_array_free(run_array); } +static int link_index(struct net *net, enum netns_bpf_attach_type type, + struct bpf_netns_link *link) +{ + struct bpf_netns_link *pos; + int i = 0; + + list_for_each_entry(pos, &net->bpf.links[type], node) { + if (pos == link) + return i; + i++; + } + return -ENOENT; +} + +static int link_count(struct net *net, enum netns_bpf_attach_type type) +{ + struct list_head *pos; + int i = 0; + + list_for_each(pos, &net->bpf.links[type]) + i++; + return i; +} + +static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type, + struct bpf_prog_array *prog_array) +{ + struct bpf_netns_link *pos; + unsigned int i = 0; + + list_for_each_entry(pos, &net->bpf.links[type], node) { + prog_array->items[i].prog = pos->link.prog; + i++; + } +} + static void bpf_netns_link_release(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; + struct bpf_prog_array *old_array, *new_array; struct net *net; + int cnt, idx; mutex_lock(&netns_bpf_mutex); @@ -53,13 +117,41 @@ static void bpf_netns_link_release(struct bpf_link *link) if (!net) goto out_unlock; - netns_bpf_run_array_detach(net, type); + /* Mark attach point as unused */ + netns_bpf_attach_type_unneed(type); + + /* Remember link position in case of safe delete */ + idx = link_index(net, type, net_link); list_del(&net_link->node); + cnt = link_count(net, type); + if (!cnt) { + netns_bpf_run_array_detach(net, type); + goto out_unlock; + } + + old_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!new_array) { + WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx)); + goto out_unlock; + } + fill_prog_array(net, type, new_array); + rcu_assign_pointer(net->bpf.run_array[type], new_array); + bpf_prog_array_free(old_array); + out_unlock: + net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } +static int bpf_netns_link_detach(struct bpf_link *link) +{ + bpf_netns_link_release(link); + return 0; +} + static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = @@ -77,7 +169,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *run_array; struct net *net; - int ret = 0; + int idx, ret; if (old_prog && old_prog != link->prog) return -EPERM; @@ -95,7 +187,10 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); - WRITE_ONCE(run_array->items[0].prog, new_prog); + idx = link_index(net, type, net_link); + ret = bpf_prog_array_update_at(run_array, idx, new_prog); + if (ret) + goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); @@ -140,6 +235,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, + .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, @@ -309,18 +405,30 @@ int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) return ret; } +static int netns_bpf_max_progs(enum netns_bpf_attach_type type) +{ + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + return 1; + case NETNS_BPF_SK_LOOKUP: + return 64; + default: + return 0; + } +} + static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); struct bpf_prog_array *run_array; - int err; + int cnt, err; mutex_lock(&netns_bpf_mutex); - /* Allow attaching only one prog or link for now */ - if (!list_empty(&net->bpf.links[type])) { + cnt = link_count(net, type); + if (cnt >= netns_bpf_max_progs(type)) { err = -E2BIG; goto out_unlock; } @@ -334,6 +442,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; + case NETNS_BPF_SK_LOOKUP: + err = 0; /* nothing to check */ + break; default: err = -EINVAL; break; @@ -341,16 +452,22 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; - run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL); if (!run_array) { err = -ENOMEM; goto out_unlock; } - run_array->items[0].prog = link->prog; - rcu_assign_pointer(net->bpf.run_array[type], run_array); list_add_tail(&net_link->node, &net->bpf.links[type]); + fill_prog_array(net, type, run_array); + run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array, + lockdep_is_held(&netns_bpf_mutex)); + bpf_prog_array_free(run_array); + + /* Mark attach point as used */ + netns_bpf_attach_type_need(type); + out_unlock: mutex_unlock(&netns_bpf_mutex); return err; @@ -426,8 +543,10 @@ static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { netns_bpf_run_array_detach(net, type); - list_for_each_entry(net_link, &net->bpf.links[type], node) + list_for_each_entry(net_link, &net->bpf.links[type], node) { net_link->net = NULL; /* auto-detach link */ + netns_bpf_attach_type_unneed(type); + } if (net->bpf.progs[type]) bpf_prog_put(net->bpf.progs[type]); } diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c new file mode 100644 index 000000000000..53a73c841c13 --- /dev/null +++ b/kernel/bpf/prog_iter.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include <linux/bpf.h> +#include <linux/fs.h> +#include <linux/filter.h> +#include <linux/kernel.h> +#include <linux/btf_ids.h> + +struct bpf_iter_seq_prog_info { + u32 prog_id; +}; + +static void *bpf_prog_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_prog_info *info = seq->private; + struct bpf_prog *prog; + + prog = bpf_prog_get_curr_or_next(&info->prog_id); + if (!prog) + return NULL; + + if (*pos == 0) + ++*pos; + return prog; +} + +static void *bpf_prog_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_prog_info *info = seq->private; + + ++*pos; + ++info->prog_id; + bpf_prog_put((struct bpf_prog *)v); + return bpf_prog_get_curr_or_next(&info->prog_id); +} + +struct bpf_iter__bpf_prog { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_prog *, prog); +}; + +DEFINE_BPF_ITER_FUNC(bpf_prog, struct bpf_iter_meta *meta, struct bpf_prog *prog) + +static int __bpf_prog_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_prog ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.prog = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_prog_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_prog_seq_show(seq, v, false); +} + +static void bpf_prog_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_prog_seq_show(seq, v, true); + else + bpf_prog_put((struct bpf_prog *)v); +} + +static const struct seq_operations bpf_prog_seq_ops = { + .start = bpf_prog_seq_start, + .next = bpf_prog_seq_next, + .stop = bpf_prog_seq_stop, + .show = bpf_prog_seq_show, +}; + +BTF_ID_LIST(btf_bpf_prog_id) +BTF_ID(struct, bpf_prog) + +static const struct bpf_iter_seq_info bpf_prog_seq_info = { + .seq_ops = &bpf_prog_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info), +}; + +static struct bpf_iter_reg bpf_prog_reg_info = { + .target = "bpf_prog", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_prog, prog), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &bpf_prog_seq_info, +}; + +static int __init bpf_prog_iter_init(void) +{ + bpf_prog_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_prog_id; + return bpf_iter_reg_target(&bpf_prog_reg_info); +} + +late_initcall(bpf_prog_iter_init); diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 05c8e043b9d2..44184f82916a 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -101,13 +101,6 @@ static void queue_stack_map_free(struct bpf_map *map) { struct bpf_queue_stack *qs = bpf_queue_stack(map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete - */ - synchronize_rcu(); - bpf_map_area_free(qs); } @@ -262,6 +255,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, return -EINVAL; } +static int queue_map_btf_id; const struct bpf_map_ops queue_map_ops = { .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, @@ -273,8 +267,11 @@ const struct bpf_map_ops queue_map_ops = { .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, + .map_btf_name = "bpf_queue_stack", + .map_btf_id = &queue_map_btf_id, }; +static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, @@ -286,4 +283,6 @@ const struct bpf_map_ops stack_map_ops = { .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, + .map_btf_name = "bpf_queue_stack", + .map_btf_id = &stack_map_btf_id, }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index cae9d505e04a..90b29c5b1da7 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -99,8 +99,6 @@ static void reuseport_array_free(struct bpf_map *map) struct sock *sk; u32 i; - synchronize_rcu(); - /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with @@ -351,6 +349,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, return 0; } +static int reuseport_array_map_btf_id; const struct bpf_map_ops reuseport_array_ops = { .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, @@ -358,4 +357,6 @@ const struct bpf_map_ops reuseport_array_ops = { .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, + .map_btf_name = "reuseport_array", + .map_btf_id = &reuseport_array_map_btf_id, }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 0af88bbc1c15..002f8a5c9e51 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -213,13 +213,6 @@ static void ringbuf_map_free(struct bpf_map *map) { struct bpf_ringbuf_map *rb_map; - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete - */ - synchronize_rcu(); - rb_map = container_of(map, struct bpf_ringbuf_map, map); bpf_ringbuf_free(rb_map->rb); kfree(rb_map); @@ -292,6 +285,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } +static int ringbuf_map_btf_id; const struct bpf_map_ops ringbuf_map_ops = { .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, @@ -301,6 +295,8 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_name = "bpf_ringbuf_map", + .map_btf_id = &ringbuf_map_btf_id, }; /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 599488f25e40..4fd830a62be2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -4,11 +4,13 @@ #include <linux/bpf.h> #include <linux/jhash.h> #include <linux/filter.h> +#include <linux/kernel.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> #include <linux/elf.h> #include <linux/pagemap.h> #include <linux/irq_work.h> +#include <linux/btf_ids.h> #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -348,11 +350,48 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } } -BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, - u64, flags) +static struct perf_callchain_entry * +get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +{ +#ifdef CONFIG_STACKTRACE + struct perf_callchain_entry *entry; + int rctx; + + entry = get_callchain_entry(&rctx); + + if (!entry) + return NULL; + + entry->nr = init_nr + + stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), + sysctl_perf_event_max_stack - init_nr, 0); + + /* stack_trace_save_tsk() works on unsigned long array, while + * perf_callchain_entry uses u64 array. For 32-bit systems, it is + * necessary to fix this mismatch. + */ + if (__BITS_PER_LONG != 64) { + unsigned long *from = (unsigned long *) entry->ip; + u64 *to = entry->ip; + int i; + + /* copy data from the end to avoid using extra buffer */ + for (i = entry->nr - 1; i >= (int)init_nr; i--) + to[i] = (u64)(from[i]); + } + + put_callchain_entry(rctx); + + return entry; +#else /* CONFIG_STACKTRACE */ + return NULL; +#endif +} + +static long __bpf_get_stackid(struct bpf_map *map, + struct perf_callchain_entry *trace, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ @@ -360,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; - bool kernel = !user; u64 *ips; bool hash_matches; - if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | - BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) - return -EINVAL; - - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); - - if (unlikely(!trace)) - /* couldn't fetch the stack trace */ - return -EFAULT; - /* get_perf_callchain() guarantees that trace->nr >= init_nr * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ @@ -439,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, return id; } +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) +{ + u32 max_depth = map->value_size / stack_map_data_size(map); + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + return __bpf_get_stackid(map, trace, flags); +} + const struct bpf_func_proto bpf_get_stackid_proto = { .func = bpf_get_stackid, .gpl_only = true, @@ -448,8 +499,78 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, - u64, flags) +static __u64 count_kernel_ip(struct perf_callchain_entry *trace) +{ + __u64 nr_kernel = 0; + + while (nr_kernel < trace->nr) { + if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) + break; + nr_kernel++; + } + return nr_kernel; +} + +BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, + struct bpf_map *, map, u64, flags) +{ + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + __u64 nr_kernel; + int ret; + + /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return bpf_get_stackid((unsigned long)(ctx->regs), + (unsigned long) map, flags, 0, 0); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + trace = ctx->data->callchain; + if (unlikely(!trace)) + return -EFAULT; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + ret = __bpf_get_stackid(map, trace, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + return -EFAULT; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + ret = __bpf_get_stackid(map, trace, flags); + } + return ret; +} + +const struct bpf_func_proto bpf_get_stackid_proto_pe = { + .func = bpf_get_stackid_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + struct perf_callchain_entry *trace_in, + void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; bool user_build_id = flags & BPF_F_USER_BUILD_ID; @@ -471,13 +592,24 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, if (unlikely(size % elem_size)) goto clear; + /* cannot get valid user stack for task without user_mode regs */ + if (task && user && !user_mode(regs)) + goto err_fault; + num_elem = size / elem_size; if (sysctl_perf_event_max_stack < num_elem) init_nr = 0; else init_nr = sysctl_perf_event_max_stack - num_elem; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + + if (trace_in) + trace = trace_in; + else if (kernel && task) + trace = get_callchain_entry_for_task(task, init_nr); + else + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, + false, false); if (unlikely(!trace)) goto err_fault; @@ -505,6 +637,12 @@ clear: return err; } +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); +} + const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, @@ -515,6 +653,91 @@ const struct bpf_func_proto bpf_get_stack_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + struct pt_regs *regs = task_pt_regs(task); + + return __bpf_get_stack(regs, task, NULL, buf, size, flags); +} + +BTF_ID_LIST(bpf_get_task_stack_btf_ids) +BTF_ID(struct, task_struct) + +const struct bpf_func_proto bpf_get_task_stack_proto = { + .func = bpf_get_task_stack, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_get_task_stack_btf_ids, +}; + +BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, + void *, buf, u32, size, u64, flags) +{ + struct pt_regs *regs = (struct pt_regs *)(ctx->regs); + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + int err = -EINVAL; + __u64 nr_kernel; + + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + err = -EFAULT; + trace = ctx->data->callchain; + if (unlikely(!trace)) + goto clear; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + goto clear; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); + } + return err; + +clear: + memset(buf, 0, size); + return err; + +} + +const struct bpf_func_proto bpf_get_stack_proto_pe = { + .func = bpf_get_stack_pe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { @@ -604,15 +827,13 @@ static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - /* wait for bpf programs to complete before freeing stack map */ - synchronize_rcu(); - bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); bpf_map_area_free(smap); put_callchain_buffers(); } +static int stack_trace_map_btf_id; const struct bpf_map_ops stack_trace_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, @@ -621,6 +842,8 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_stack_map", + .map_btf_id = &stack_trace_map_btf_id, }; static int __init stack_map_init(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0fd80ac81f70..86299a292214 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1981,6 +1981,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return 0; @@ -2021,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_SK_LOOKUP: + if (expected_attach_type == BPF_SK_LOOKUP) + return 0; + return -EINVAL; case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; @@ -2755,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: if (!capable(CAP_NET_ADMIN)) @@ -2779,6 +2785,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return BPF_PROG_TYPE_CGROUP_SOCK; @@ -2815,6 +2822,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: return BPF_PROG_TYPE_TRACING; + case BPF_SK_LOOKUP: + return BPF_PROG_TYPE_SK_LOOKUP; + case BPF_XDP: + return BPF_PROG_TYPE_XDP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -2927,6 +2938,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_POST_BIND: @@ -2950,6 +2962,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: + case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; @@ -3033,6 +3046,25 @@ again: return map; } +struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) +{ + struct bpf_prog *prog; + + spin_lock_bh(&prog_idr_lock); +again: + prog = idr_get_next(&prog_idr, id); + if (prog) { + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&prog_idr_lock); + + return prog; +} + #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) @@ -3851,7 +3883,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog * return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.flags +#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len static int link_create(union bpf_attr *attr) { enum bpf_prog_type ptype; @@ -3888,8 +3920,14 @@ static int link_create(union bpf_attr *attr) ret = tracing_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; +#ifdef CONFIG_NET + case BPF_PROG_TYPE_XDP: + ret = bpf_xdp_link_attach(attr, prog); + break; +#endif default: ret = -EINVAL; } @@ -3953,6 +3991,29 @@ out_put_link: return ret; } +#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd + +static int link_detach(union bpf_attr *attr) +{ + struct bpf_link *link; + int ret; + + if (CHECK_ATTR(BPF_LINK_DETACH)) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_detach.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + if (link->ops->detach) + ret = link->ops->detach(link); + else + ret = -EOPNOTSUPP; + + bpf_link_put(link); + return ret; +} + static int bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; @@ -4202,6 +4263,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ITER_CREATE: err = bpf_iter_create(&attr); break; + case BPF_LINK_DETACH: + err = link_detach(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 4dbf2b6035f8..232df29793e9 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/fdtable.h> #include <linux/filter.h> +#include <linux/btf_ids.h> struct bpf_iter_seq_task_common { struct pid_namespace *ns; @@ -50,7 +51,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) if (!task) return NULL; - ++*pos; + if (*pos == 0) + ++*pos; return task; } @@ -209,7 +211,8 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) return NULL; } - ++*pos; + if (*pos == 0) + ++*pos; info->task = task; info->files = files; @@ -290,7 +293,7 @@ static void task_file_seq_stop(struct seq_file *seq, void *v) } } -static int init_seq_pidns(void *priv_data) +static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_task_common *common = priv_data; @@ -312,25 +315,36 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; -static const struct bpf_iter_reg task_reg_info = { - .target = "task", +BTF_ID_LIST(btf_task_file_ids) +BTF_ID(struct, task_struct) +BTF_ID(struct, file) + +static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static struct bpf_iter_reg task_reg_info = { + .target = "task", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &task_seq_info, }; -static const struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", +static const struct bpf_iter_seq_info task_file_seq_info = { .seq_ops = &task_file_seq_ops, .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), +}; + +static struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__task_file, task), @@ -338,16 +352,20 @@ static const struct bpf_iter_reg task_file_reg_info = { { offsetof(struct bpf_iter__task_file, file), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &task_file_seq_info, }; static int __init task_iter_init(void) { int ret; + task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; ret = bpf_iter_reg_target(&task_reg_info); if (ret) return ret; + task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; return bpf_iter_reg_target(&task_file_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 94cead5a43e5..ef938f17b944 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -409,7 +409,9 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL; + type == PTR_TO_MEM_OR_NULL || + type == PTR_TO_RDONLY_BUF_OR_NULL || + type == PTR_TO_RDWR_BUF_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -503,6 +505,10 @@ static const char * const reg_type_str[] = { [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", + [PTR_TO_RDONLY_BUF] = "rdonly_buf", + [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", + [PTR_TO_RDWR_BUF] = "rdwr_buf", + [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", }; static char slot_type_char[] = { @@ -1350,6 +1356,19 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(env, regs + regno); } +static void mark_btf_ld_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno, + enum bpf_reg_type reg_type, u32 btf_id) +{ + if (reg_type == SCALAR_VALUE) { + mark_reg_unknown(env, regs, regno); + return; + } + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_BTF_ID; + regs[regno].btf_id = btf_id; +} + #define DEF_NOT_SUBREG (0) static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) @@ -2160,6 +2179,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_BTF_ID_OR_NULL: + case PTR_TO_RDONLY_BUF: + case PTR_TO_RDONLY_BUF_OR_NULL: + case PTR_TO_RDWR_BUF: + case PTR_TO_RDWR_BUF_OR_NULL: return true; default: return false; @@ -3039,14 +3062,15 @@ int check_ctx_reg(struct bpf_verifier_env *env, return 0; } -static int check_tp_buffer_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int regno, int off, int size) +static int __check_buffer_access(struct bpf_verifier_env *env, + const char *buf_info, + const struct bpf_reg_state *reg, + int regno, int off, int size) { if (off < 0) { verbose(env, - "R%d invalid tracepoint buffer access: off=%d, size=%d", - regno, off, size); + "R%d invalid %s buffer access: off=%d, size=%d\n", + regno, buf_info, off, size); return -EACCES; } if (!tnum_is_const(reg->var_off) || reg->var_off.value) { @@ -3054,16 +3078,49 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d invalid variable buffer offset: off=%d, var_off=%s", + "R%d invalid variable buffer offset: off=%d, var_off=%s\n", regno, off, tn_buf); return -EACCES; } + + return 0; +} + +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + int err; + + err = __check_buffer_access(env, "tracepoint", reg, regno, off, size); + if (err) + return err; + if (off + size > env->prog->aux->max_tp_access) env->prog->aux->max_tp_access = off + size; return 0; } +static int check_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size, + bool zero_size_allowed, + const char *buf_info, + u32 *max_access) +{ + int err; + + err = __check_buffer_access(env, buf_info, reg, regno, off, size); + if (err) + return err; + + if (off + size > *max_access) + *max_access = off + size; + + return 0; +} + /* BPF architecture zero extends alu32 ops into 64-bit registesr */ static void zext_32_to_64(struct bpf_reg_state *reg) { @@ -3181,19 +3238,68 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - if (atype == BPF_READ && value_regno >= 0) { - if (ret == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); - return 0; - } - mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].type = PTR_TO_BTF_ID; - regs[value_regno].btf_id = btf_id; + if (atype == BPF_READ && value_regno >= 0) + mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + + return 0; +} + +static int check_ptr_to_map_access(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, + int regno, int off, int size, + enum bpf_access_type atype, + int value_regno) +{ + struct bpf_reg_state *reg = regs + regno; + struct bpf_map *map = reg->map_ptr; + const struct btf_type *t; + const char *tname; + u32 btf_id; + int ret; + + if (!btf_vmlinux) { + verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n"); + return -ENOTSUPP; + } + + if (!map->ops->map_btf_id || !*map->ops->map_btf_id) { + verbose(env, "map_ptr access not supported for map type %d\n", + map->map_type); + return -ENOTSUPP; + } + + t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + + if (!env->allow_ptr_to_map_access) { + verbose(env, + "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + tname); + return -EPERM; + } + + if (off < 0) { + verbose(env, "R%d is %s invalid negative access: off=%d\n", + regno, tname, off); + return -EACCES; + } + + if (atype != BPF_READ) { + verbose(env, "only read from %s is supported\n", tname); + return -EACCES; } + ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + if (ret < 0) + return ret; + + if (value_regno >= 0) + mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + return 0; } + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -3362,6 +3468,26 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_BTF_ID) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); + } else if (reg->type == CONST_PTR_TO_MAP) { + err = check_ptr_to_map_access(env, regs, regno, off, size, t, + value_regno); + } else if (reg->type == PTR_TO_RDONLY_BUF) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); + return -EACCES; + } + err = check_buffer_access(env, reg, regno, off, size, false, + "rdonly", + &env->prog->aux->max_rdonly_access); + if (!err && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_RDWR_BUF) { + err = check_buffer_access(env, reg, regno, off, size, false, + "rdwr", + &env->prog->aux->max_rdwr_access); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -3603,6 +3729,18 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); + case PTR_TO_RDONLY_BUF: + if (meta && meta->raw_mode) + return -EACCES; + return check_buffer_access(env, reg, regno, reg->off, + access_size, zero_size_allowed, + "rdonly", + &env->prog->aux->max_rdonly_access); + case PTR_TO_RDWR_BUF: + return check_buffer_access(env, reg, regno, reg->off, + access_size, zero_size_allowed, + "rdwr", + &env->prog->aux->max_rdwr_access); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3734,12 +3872,14 @@ static int int_ptr_type_to_size(enum bpf_arg_type type) return -EINVAL; } -static int check_func_arg(struct bpf_verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) +static int check_func_arg(struct bpf_verifier_env *env, u32 arg, + struct bpf_call_arg_meta *meta, + const struct bpf_func_proto *fn) { + u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; + enum bpf_arg_type arg_type = fn->arg_type[arg]; int err = 0; if (arg_type == ARG_DONTCARE) @@ -3811,17 +3951,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } meta->ref_obj_id = reg->ref_obj_id; } - } else if (arg_type == ARG_PTR_TO_SOCKET) { + } else if (arg_type == ARG_PTR_TO_SOCKET || + arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { + if (type != expected_type) + goto err_type; + } } else if (arg_type == ARG_PTR_TO_BTF_ID) { expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - if (reg->btf_id != meta->btf_id) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), + if (!fn->check_btf_id) { + if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + } else if (!fn->check_btf_id(reg->btf_id, arg)) { + verbose(env, "Helper does not support %s in R%d\n", kernel_type_name(reg->btf_id), regno); return -EACCES; @@ -3855,6 +4006,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && type != PTR_TO_MEM && + type != PTR_TO_RDONLY_BUF && + type != PTR_TO_RDWR_BUF && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; @@ -4643,10 +4796,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (!fn->check_btf_id) { + err = btf_resolve_helper_id(&env->log, fn, i); + if (err > 0) + meta.btf_id = err; + } + err = check_func_arg(env, i, &meta, fn); if (err) return err; } @@ -4749,6 +4904,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + int ret_btf_id; + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + ret_btf_id = *fn->ret_btf_id; + if (ret_btf_id == 0) { + verbose(env, "invalid return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); + return -EINVAL; + } + regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -4775,7 +4942,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; - if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + if ((func_id == BPF_FUNC_get_stack || + func_id == BPF_FUNC_get_task_stack) && + !env->prog->has_callchain_buf) { const char *err_str; #ifdef CONFIG_PERF_EVENTS @@ -4793,6 +4962,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn env->prog->has_callchain_buf = true; } + if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack) + env->prog->call_get_stack = true; + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -5030,6 +5202,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ + if (opcode == BPF_SUB && env->allow_ptr_leaks) { + __mark_reg_unknown(env, dst_reg); + return 0; + } + verbose(env, "R%d 32-bit pointer arithmetic prohibited\n", dst); @@ -6707,6 +6884,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_BTF_ID; } else if (reg->type == PTR_TO_MEM_OR_NULL) { reg->type = PTR_TO_MEM; + } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { + reg->type = PTR_TO_RDONLY_BUF; + } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { + reg->type = PTR_TO_RDWR_BUF; } if (is_null) { /* We don't need id and ref_obj_id from this point @@ -7259,6 +7440,9 @@ static int check_return_code(struct bpf_verifier_env *env) return -ENOTSUPP; } break; + case BPF_PROG_TYPE_SK_LOOKUP: + range = tnum_range(SK_DROP, SK_PASS); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. @@ -8110,7 +8294,7 @@ static bool stacksafe(struct bpf_func_state *old, if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> + * this stack slot, but current has STACK_MISC -> * this verifier states are not equivalent, * return false to continue verification of this path */ @@ -10953,6 +11137,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->strict_alignment = false; env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 18175687133a..106e4500fd53 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,6 +11,8 @@ #include <asm/page.h> #include <asm/sections.h> +#include <crypto/sha.h> + /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; @@ -376,6 +378,53 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) } EXPORT_SYMBOL(paddr_vmcoreinfo_note); +#define NOTES_SIZE (&__stop_notes - &__start_notes) +#define BUILD_ID_MAX SHA1_DIGEST_SIZE +#define NT_GNU_BUILD_ID 3 + +struct elf_note_section { + struct elf_note n_hdr; + u8 n_data[]; +}; + +/* + * Add build ID from .notes section as generated by the GNU ld(1) + * or LLVM lld(1) --build-id option. + */ +static void add_build_id_vmcoreinfo(void) +{ + char build_id[BUILD_ID_MAX * 2 + 1]; + int n_remain = NOTES_SIZE; + + while (n_remain >= sizeof(struct elf_note)) { + const struct elf_note_section *note_sec = + &__start_notes + NOTES_SIZE - n_remain; + const u32 n_namesz = note_sec->n_hdr.n_namesz; + + if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID && + n_namesz != 0 && + !strcmp((char *)¬e_sec->n_data[0], "GNU")) { + if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) { + const u32 n_descsz = note_sec->n_hdr.n_descsz; + const u8 *s = ¬e_sec->n_data[n_namesz]; + + s = PTR_ALIGN(s, 4); + bin2hex(build_id, s, n_descsz); + build_id[2 * n_descsz] = '\0'; + VMCOREINFO_BUILD_ID(build_id); + return; + } + pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n", + note_sec->n_hdr.n_descsz, + BUILD_ID_MAX); + return; + } + n_remain -= sizeof(struct elf_note) + + ALIGN(note_sec->n_hdr.n_namesz, 4) + + ALIGN(note_sec->n_hdr.n_descsz, 4); + } +} + static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); @@ -394,6 +443,7 @@ static int __init crash_save_vmcoreinfo_init(void) } VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + add_build_id_vmcoreinfo(); VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(init_uts_ns); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9e5934780f41..b16dbc1bf056 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1068,7 +1068,7 @@ static void kgdb_tasklet_bpt(unsigned long ing) atomic_set(&kgdb_break_tasklet_var, 0); } -static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); +static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt); void kgdb_schedule_breakpoint(void) { diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 683a799618ad..9d847ab851db 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -591,7 +591,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; - unsigned long uninitialized_var(flags); + unsigned long flags; /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 1da3f44f2565..847a9d1fa634 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -1,10 +1,24 @@ # SPDX-License-Identifier: GPL-2.0-only +config NO_DMA + bool + config HAS_DMA bool depends on !NO_DMA default y +config DMA_OPS + bool + +# +# IOMMU drivers that can bypass the IOMMU code and optionally use the direct +# mapping fast path should select this option and set the dma_ops_bypass +# flag in struct device where applicable +# +config DMA_OPS_BYPASS + bool + config NEED_SG_DMA_LENGTH bool @@ -60,6 +74,7 @@ config DMA_NONCOHERENT_CACHE_SYNC config DMA_VIRT_OPS bool depends on HAS_DMA + select DMA_OPS config SWIOTLB bool @@ -174,11 +189,6 @@ config DMA_API_DEBUG drivers like double-freeing of DMA mappings or freeing mappings that were never allocated. - This also attempts to catch cases where a page owned by DMA is - accessed by the cpu in a way that could cause data corruption. For - example, this enables cow_user_page() to check that the source page is - not undergoing DMA. - This option causes a performance degradation. Use only if you want to debug device drivers and dma interactions. diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 370f63344e9c..32c7c1942bbd 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o +obj-$(CONFIG_HAS_DMA) += mapping.o direct.o +obj-$(CONFIG_DMA_OPS) += dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 15bc5026c485..cff7e60968b9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -215,6 +215,13 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, return cma_release(dev_get_cma_area(dev), pages, count); } +static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) +{ + unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT); + + return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN); +} + /** * dma_alloc_contiguous() - allocate contiguous pages * @dev: Pointer to device for which the allocation is performed. @@ -231,24 +238,14 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { - size_t count = size >> PAGE_SHIFT; - struct page *page = NULL; - struct cma *cma = NULL; - - if (dev && dev->cma_area) - cma = dev->cma_area; - else if (count > 1) - cma = dma_contiguous_default_area; - /* CMA can be used only in the context which permits sleeping */ - if (cma && gfpflags_allow_blocking(gfp)) { - size_t align = get_order(size); - size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); - - page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); - } - - return page; + if (!gfpflags_allow_blocking(gfp)) + return NULL; + if (dev->cma_area) + return cma_alloc_aligned(dev->cma_area, size, gfp); + if (size <= PAGE_SIZE || !dma_contiguous_default_area) + return NULL; + return cma_alloc_aligned(dma_contiguous_default_area, size, gfp); } /** diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 36c962a86bf2..8e9f7b301c6d 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -144,8 +144,12 @@ static const char *type2name[] = { [dma_debug_resource] = "resource", }; -static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", - "DMA_FROM_DEVICE", "DMA_NONE" }; +static const char *dir2name[] = { + [DMA_BIDIRECTIONAL] = "DMA_BIDIRECTIONAL", + [DMA_TO_DEVICE] = "DMA_TO_DEVICE", + [DMA_FROM_DEVICE] = "DMA_FROM_DEVICE", + [DMA_NONE] = "DMA_NONE", +}; /* * The access to some variables in this macro is racy. We can't use atomic_t @@ -444,9 +448,6 @@ void debug_dma_dump_mappings(struct device *dev) * dma_active_cacheline entry to track per event. dma_map_sg(), on the * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. - * - * At any time debug_dma_assert_idle() can be called to trigger a - * warning if any cachelines in the given page are in the active set. */ static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); static DEFINE_SPINLOCK(radix_lock); @@ -493,10 +494,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln) overlap = active_cacheline_set_overlap(cln, ++overlap); /* If we overflowed the overlap counter then we're potentially - * leaking dma-mappings. Otherwise, if maps and unmaps are - * balanced then this overflow may cause false negatives in - * debug_dma_assert_idle() as the cacheline may be marked idle - * prematurely. + * leaking dma-mappings. */ WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"), @@ -551,53 +549,6 @@ static void active_cacheline_remove(struct dma_debug_entry *entry) spin_unlock_irqrestore(&radix_lock, flags); } -/** - * debug_dma_assert_idle() - assert that a page is not undergoing dma - * @page: page to lookup in the dma_active_cacheline tree - * - * Place a call to this routine in cases where the cpu touching the page - * before the dma completes (page is dma_unmapped) will lead to data - * corruption. - */ -void debug_dma_assert_idle(struct page *page) -{ - static struct dma_debug_entry *ents[CACHELINES_PER_PAGE]; - struct dma_debug_entry *entry = NULL; - void **results = (void **) &ents; - unsigned int nents, i; - unsigned long flags; - phys_addr_t cln; - - if (dma_debug_disabled()) - return; - - if (!page) - return; - - cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT; - spin_lock_irqsave(&radix_lock, flags); - nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln, - CACHELINES_PER_PAGE); - for (i = 0; i < nents; i++) { - phys_addr_t ent_cln = to_cacheline_number(ents[i]); - - if (ent_cln == cln) { - entry = ents[i]; - break; - } else if (ent_cln >= cln + CACHELINES_PER_PAGE) - break; - } - spin_unlock_irqrestore(&radix_lock, flags); - - if (!entry) - return; - - cln = to_cacheline_number(entry); - err_printk(entry->dev, entry, - "cpu touching an active dma mapped cacheline [cln=%pa]\n", - &cln); -} - /* * Wrapper function for adding an entry to the hash. * This function takes care of locking itself. @@ -882,7 +833,7 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct dma_debug_entry *uninitialized_var(entry); + struct dma_debug_entry *entry; int count; if (dma_debug_disabled()) @@ -1071,7 +1022,7 @@ static void check_unmap(struct dma_debug_entry *ref) /* * Drivers should use dma_mapping_error() to check the returned * addresses of dma_map_single() and dma_map_page(). - * If not, print this warning message. See Documentation/DMA-API.txt. + * If not, print this warning message. See Documentation/core-api/dma-api.rst. */ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { err_printk(ref->dev, entry, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 67f060b86a73..bb0041e99659 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -10,11 +10,9 @@ #include <linux/dma-direct.h> #include <linux/scatterlist.h> #include <linux/dma-contiguous.h> -#include <linux/dma-noncoherent.h> #include <linux/pfn.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> -#include <linux/swiotlb.h> /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it @@ -304,19 +302,6 @@ void dma_direct_free(struct device *dev, size_t size, #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); - - if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(paddr, size, dir); -} -EXPORT_SYMBOL(dma_direct_sync_single_for_device); - void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -335,27 +320,11 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir); } } -EXPORT_SYMBOL(dma_direct_sync_sg_for_device); #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (!dev_is_dma_coherent(dev)) { - arch_sync_dma_for_cpu(paddr, size, dir); - arch_sync_dma_for_cpu_all(); - } - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); -} -EXPORT_SYMBOL(dma_direct_sync_single_for_cpu); - void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -376,20 +345,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu_all(); } -EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); - -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - phys_addr_t phys = dma_to_phys(dev, addr); - - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); - - if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); -} -EXPORT_SYMBOL(dma_direct_unmap_page); void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -401,35 +356,8 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } -EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); - - if (unlikely(swiotlb_force == SWIOTLB_FORCE)) - return swiotlb_map(dev, phys, size, dir, attrs); - - if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - if (swiotlb_force != SWIOTLB_NO_FORCE) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; - } - - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(phys, size, dir); - return dma_addr; -} -EXPORT_SYMBOL(dma_direct_map_page); - int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { @@ -450,7 +378,6 @@ out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return 0; } -EXPORT_SYMBOL(dma_direct_map_sg); dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -467,7 +394,6 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, return dma_addr; } -EXPORT_SYMBOL(dma_direct_map_resource); int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a8c18c9a796f..0d129421e75f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -105,6 +105,196 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); +static bool dma_go_direct(struct device *dev, dma_addr_t mask, + const struct dma_map_ops *ops) +{ + if (likely(!ops)) + return true; +#ifdef CONFIG_DMA_OPS_BYPASS + if (dev->dma_ops_bypass) + return min_not_zero(mask, dev->bus_dma_limit) >= + dma_direct_get_required_mask(dev); +#endif + return false; +} + + +/* + * Check if the devices uses a direct mapping for streaming DMA operations. + * This allows IOMMU drivers to set a bypass mode if the DMA mask is large + * enough. + */ +static inline bool dma_alloc_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, dev->coherent_dma_mask, ops); +} + +static inline bool dma_map_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, *dev->dma_mask, ops); +} + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + else + addr = ops->map_page(dev, page, offset, size, dir, attrs); + debug_dma_map_page(dev, page, offset, size, dir, addr); + + return addr; +} +EXPORT_SYMBOL(dma_map_page_attrs); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_unmap_page(dev, addr, size, dir, attrs); + else if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_page_attrs); + +/* + * dma_maps_sg_attrs returns 0 on error and > 0 on success. + * It should never return a value < 0. + */ +int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + int ents; + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); + else + ents = ops->map_sg(dev, sg, nents, dir, attrs); + BUG_ON(ents < 0); + debug_dma_map_sg(dev, sg, nents, ents, dir); + + return ents; +} +EXPORT_SYMBOL(dma_map_sg_attrs); + +void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); + if (dma_map_direct(dev, ops)) + dma_direct_unmap_sg(dev, sg, nents, dir, attrs); + else if (ops->unmap_sg) + ops->unmap_sg(dev, sg, nents, dir, attrs); +} +EXPORT_SYMBOL(dma_unmap_sg_attrs); + +dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr = DMA_MAPPING_ERROR; + + BUG_ON(!valid_dma_direction(dir)); + + /* Don't allow RAM to be mapped */ + if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) + return DMA_MAPPING_ERROR; + + if (dma_map_direct(dev, ops)) + addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); + else if (ops->map_resource) + addr = ops->map_resource(dev, phys_addr, size, dir, attrs); + + debug_dma_map_resource(dev, phys_addr, size, dir, addr); + return addr; +} +EXPORT_SYMBOL(dma_map_resource); + +void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (!dma_map_direct(dev, ops) && ops->unmap_resource) + ops->unmap_resource(dev, addr, size, dir, attrs); + debug_dma_unmap_resource(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_resource); + +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + else if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr, size, dir); + debug_dma_sync_single_for_cpu(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_cpu); + +void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_single_for_device(dev, addr, size, dir); + else if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr, size, dir); + debug_dma_sync_single_for_device(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_device); + +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); + else if (ops->sync_sg_for_cpu) + ops->sync_sg_for_cpu(dev, sg, nelems, dir); + debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops)) + dma_direct_sync_sg_for_device(dev, sg, nelems, dir); + else if (ops->sync_sg_for_device) + ops->sync_sg_for_device(dev, sg, nelems, dir); + debug_dma_sync_sg_for_device(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_device); + /* * Create scatter-list for the already allocated DMA buffer. */ @@ -138,7 +328,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) @@ -208,7 +398,7 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_can_mmap(dev); return ops->mmap != NULL; } @@ -233,7 +423,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) @@ -246,7 +436,7 @@ u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_required_mask(dev); if (ops->get_required_mask) return ops->get_required_mask(dev); @@ -277,7 +467,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); @@ -309,7 +499,7 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); else if (ops->free) ops->free(dev, size, cpu_addr, dma_handle, attrs); @@ -320,7 +510,11 @@ int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + /* + * ->dma_supported sets the bypass flag, so we must always call + * into the method here unless the device is truly direct mapped. + */ + if (!ops) return dma_direct_supported(dev, mask); if (!ops->dma_supported) return 1; @@ -376,7 +570,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) arch_dma_cache_sync(dev, vaddr, size, dir); else if (ops->cache_sync) ops->cache_sync(dev, vaddr, size, dir); @@ -388,7 +582,7 @@ size_t dma_max_mapping_size(struct device *dev) const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) size = dma_direct_max_mapping_size(dev); else if (ops && ops->max_mapping_size) size = ops->max_mapping_size(dev); @@ -401,7 +595,7 @@ bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) return dma_direct_need_sync(dev, dma_addr); return ops->sync_single_for_cpu || ops->sync_single_for_device; } diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile new file mode 100644 index 000000000000..34c8a3f1c735 --- /dev/null +++ b/kernel/entry/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 + +# Prevent the noinstr section from being pestered by sanitizer and other goodies +# as long as these things cannot be disabled per function. +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong +CFLAGS_common.o += -fno-stack-protector + +obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/common.c b/kernel/entry/common.c new file mode 100644 index 000000000000..9852e0d62d95 --- /dev/null +++ b/kernel/entry/common.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/context_tracking.h> +#include <linux/entry-common.h> +#include <linux/livepatch.h> +#include <linux/audit.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +/** + * enter_from_user_mode - Establish state when coming from user mode + * + * Syscall/interrupt entry disables interrupts, but user mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + */ +static __always_inline void enter_from_user_mode(struct pt_regs *regs) +{ + arch_check_user_regs(regs); + lockdep_hardirqs_off(CALLER_ADDR0); + + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + +static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) +{ + if (unlikely(audit_context())) { + unsigned long args[6]; + + syscall_get_arguments(current, regs, args); + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); + } +} + +static long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long ti_work) +{ + long ret = 0; + + /* Handle ptrace */ + if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + ret = arch_syscall_enter_tracehook(regs); + if (ret || (ti_work & _TIF_SYSCALL_EMU)) + return -1L; + } + + /* Do seccomp after ptrace, to catch any tracer changes. */ + if (ti_work & _TIF_SECCOMP) { + ret = __secure_computing(NULL); + if (ret == -1L) + return ret; + } + + if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) + trace_sys_enter(regs, syscall); + + syscall_enter_audit(regs, syscall); + + return ret ? : syscall; +} + +noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + unsigned long ti_work; + + enter_from_user_mode(regs); + instrumentation_begin(); + + local_irq_enable(); + ti_work = READ_ONCE(current_thread_info()->flags); + if (ti_work & SYSCALL_ENTER_WORK) + syscall = syscall_trace_enter(regs, syscall, ti_work); + instrumentation_end(); + + return syscall; +} + +/** + * exit_to_user_mode - Fixup state when exiting to user mode + * + * Syscall/interupt exit enables interrupts, but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Invoke architecture specific last minute exit code, e.g. speculation + * mitigations, etc. + * 4) Tell lockdep that interrupts are enabled + */ +static __always_inline void exit_to_user_mode(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + user_enter_irqoff(); + arch_exit_to_user_mode(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +/* Workaround to allow gradual conversion of architecture code */ +void __weak arch_do_signal(struct pt_regs *regs) { } + +static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) +{ + /* + * Before returning to user space ensure that all pending work + * items have been completed. + */ + while (ti_work & EXIT_TO_USER_MODE_WORK) { + + local_irq_enable_exit_to_user(ti_work); + + if (ti_work & _TIF_NEED_RESCHED) + schedule(); + + if (ti_work & _TIF_UPROBE) + uprobe_notify_resume(regs); + + if (ti_work & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + + if (ti_work & _TIF_SIGPENDING) + arch_do_signal(regs); + + if (ti_work & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + rseq_handle_notify_resume(NULL, regs); + } + + /* Architecture specific TIF work */ + arch_exit_to_user_mode_work(regs, ti_work); + + /* + * Disable interrupts and reevaluate the work flags as they + * might have changed while interrupts and preemption was + * enabled above. + */ + local_irq_disable_exit_to_user(); + ti_work = READ_ONCE(current_thread_info()->flags); + } + + /* Return the latest work state for arch_exit_to_user_mode() */ + return ti_work; +} + +static void exit_to_user_mode_prepare(struct pt_regs *regs) +{ + unsigned long ti_work = READ_ONCE(current_thread_info()->flags); + + lockdep_assert_irqs_disabled(); + + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + + arch_exit_to_user_mode_prepare(regs, ti_work); + + /* Ensure that the address limit is intact and no locks are held */ + addr_limit_user_check(); + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); +} + +#ifndef _TIF_SINGLESTEP +static inline bool report_single_step(unsigned long ti_work) +{ + return false; +} +#else +/* + * If TIF_SYSCALL_EMU is set, then the only reason to report is when + * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall + * instruction has been already reported in syscall_enter_from_usermode(). + */ +#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) + +static inline bool report_single_step(unsigned long ti_work) +{ + return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; +} +#endif + +static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) +{ + bool step; + + audit_syscall_exit(regs); + + if (ti_work & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); + + step = report_single_step(ti_work); + if (step || ti_work & _TIF_SYSCALL_TRACE) + arch_syscall_exit_tracehook(regs, step); +} + +/* + * Syscall specific exit to user mode preparation. Runs with interrupts + * enabled. + */ +static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + u32 cached_flags = READ_ONCE(current_thread_info()->flags); + unsigned long nr = syscall_get_nr(current, regs); + + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) + local_irq_enable(); + } + + rseq_syscall(regs); + + /* + * Do one-time syscall specific work. If these work items are + * enabled, we want to run them exactly once per syscall exit with + * interrupts enabled. + */ + if (unlikely(cached_flags & SYSCALL_EXIT_WORK)) + syscall_exit_work(regs, cached_flags); +} + +__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + syscall_exit_to_user_mode_prepare(regs); + local_irq_disable_exit_to_user(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} + +noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); +} + +noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} + +noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) +{ + irqentry_state_t ret = { + .exit_rcu = false, + }; + + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + return ret; + } + + /* + * If this entry hit the idle task invoke rcu_irq_enter() whether + * RCU is watching or not. + * + * Interupts can nest when the first interrupt invokes softirq + * processing on return which enables interrupts. + * + * Scheduler ticks in the idle task can mark quiescent state and + * terminate a grace period, if and only if the timer interrupt is + * not nested into another interrupt. + * + * Checking for __rcu_is_watching() here would prevent the nesting + * interrupt to invoke rcu_irq_enter(). If that nested interrupt is + * the tick then rcu_flavor_sched_clock_irq() would wrongfully + * assume that it is the first interupt and eventually claim + * quiescient state and end grace periods prematurely. + * + * Unconditionally invoke rcu_irq_enter() so RCU state stays + * consistent. + * + * TINY_RCU does not support EQS, so let the compiler eliminate + * this part when enabled. + */ + if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + /* + * If RCU is not watching then the same careful + * sequence vs. lockdep and tracing is required + * as in irq_enter_from_user_mode(). + */ + lockdep_hardirqs_off(CALLER_ADDR0); + rcu_irq_enter(); + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); + + ret.exit_rcu = true; + return ret; + } + + /* + * If RCU is watching then RCU only wants to check whether it needs + * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() + * already contains a warning when RCU is not watching, so no point + * in having another one here. + */ + instrumentation_begin(); + rcu_irq_enter_check_tick(); + /* Use the combo lockdep/tracing function */ + trace_hardirqs_off(); + instrumentation_end(); + + return ret; +} + +void irqentry_exit_cond_resched(void) +{ + if (!preempt_count()) { + /* Sanity check RCU and thread stack */ + rcu_irq_exit_check_preempt(); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + WARN_ON_ONCE(!on_thread_stack()); + if (need_resched()) + preempt_schedule_irq(); + } +} + +noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) +{ + lockdep_assert_irqs_disabled(); + + /* Check whether this returns to user mode */ + if (user_mode(regs)) { + irqentry_exit_to_user_mode(regs); + } else if (!regs_irqs_disabled(regs)) { + /* + * If RCU was not watching on entry this needs to be done + * carefully and needs the same ordering of lockdep/tracing + * and RCU as the return to user mode path. + */ + if (state.exit_rcu) { + instrumentation_begin(); + /* Tell the tracer that IRET will enable interrupts */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + rcu_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + instrumentation_begin(); + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); + /* Covers both tracing and lockdep */ + trace_hardirqs_on(); + instrumentation_end(); + } else { + /* + * IRQ flags state is correct already. Just tell RCU if it + * was not watching on entry. + */ + if (state.exit_rcu) + rcu_irq_exit(); + } +} diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c new file mode 100644 index 000000000000..eb1a8a4c867c --- /dev/null +++ b/kernel/entry/kvm.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/entry-kvm.h> +#include <linux/kvm_host.h> + +static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) +{ + do { + int ret; + + if (ti_work & _TIF_SIGPENDING) { + kvm_handle_signal_exit(vcpu); + return -EINTR; + } + + if (ti_work & _TIF_NEED_RESCHED) + schedule(); + + if (ti_work & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(NULL); + } + + ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); + if (ret) + return ret; + + ti_work = READ_ONCE(current_thread_info()->flags); + } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); + return 0; +} + +int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +{ + unsigned long ti_work; + + /* + * This is invoked from the outer guest loop with interrupts and + * preemption enabled. + * + * KVM invokes xfer_to_guest_mode_work_pending() with interrupts + * disabled in the inner loop before going into guest mode. No need + * to disable interrupts here. + */ + ti_work = READ_ONCE(current_thread_info()->flags); + if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) + return 0; + + return xfer_to_guest_mode_work(vcpu, ti_work); +} +EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 334d48b16c36..58cbe357fb2b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -149,7 +149,7 @@ void put_callchain_buffers(void) } } -static struct perf_callchain_entry *get_callchain_entry(int *rctx) +struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; @@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) return NULL; entries = rcu_dereference(callchain_cpus_entries); - if (!entries) + if (!entries) { + put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; + } cpu = smp_processor_id(); @@ -168,7 +170,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) (*rctx * perf_callchain_entry__sizeof())); } -static void +void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); @@ -183,11 +185,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, int rctx; entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - if (!entry) - goto exit_put; + return NULL; ctx.entry = entry; ctx.max_stack = max_stack; @@ -218,10 +217,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); - set_fs(fs); + force_uaccess_end(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index 7c436d705fbd..5bfe8e3c6e44 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6453,10 +6453,9 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - set_fs(fs); + force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); @@ -9644,6 +9643,24 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) if (IS_ERR(prog)) return PTR_ERR(prog); + if (event->attr.precise_ip && + prog->call_get_stack && + (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + event->attr.exclude_callchain_kernel || + event->attr.exclude_callchain_user)) { + /* + * On perf_event with precise_ip, calling bpf_get_stack() + * may trigger unwinder warnings and occasional crashes. + * bpf_get_[stack|stackid] works around this issue by using + * callchain attached to perf_sample_data. If the + * perf_event does not full (kernel and user) callchain + * attached to perf_sample_data, do not allow attaching BPF + * program that calls bpf_get_[stack|stackid]. + */ + bpf_prog_put(prog); + return -EPROTO; + } + event->prog = prog; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); @@ -11585,7 +11602,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx, *uninitialized_var(gctx); + struct perf_event_context *ctx, *gctx; struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; @@ -11689,7 +11706,7 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; /* - * Reuse ptrace permission checks for now. + * Preserve ptrace permission check for backwards compatibility. * * We must hold exec_update_mutex across this and any potential * perf_install_in_context() call for this new event to @@ -11697,7 +11714,7 @@ SYSCALL_DEFINE5(perf_event_open, * perf_event_exit_task() that could imply). */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto err_cred; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5f8b0c52fd2e..649fd53dc9ad 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); - lru_cache_add_active_or_unevictable(new_page, vma); + lru_cache_add_inactive_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); @@ -376,7 +376,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) if (!vaddr || !d) return -EINVAL; - ret = get_user_pages_remote(NULL, mm, vaddr, 1, + ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, &vma, NULL); if (unlikely(ret <= 0)) { /* @@ -477,7 +477,7 @@ retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -2029,7 +2029,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL, NULL); if (result < 0) return result; @@ -2189,7 +2189,7 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int uninitialized_var(is_swbp); + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == get_trampoline_vaddr()) diff --git a/kernel/exit.c b/kernel/exit.c index 727150f28103..733e80f334e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -93,7 +93,7 @@ static void __exit_signal(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; - struct tty_struct *uninitialized_var(tty); + struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, @@ -217,6 +217,7 @@ repeat: } write_unlock_irq(&tasklist_lock); + seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); @@ -731,7 +732,7 @@ void __noreturn do_exit(long code) * mm_release()->clear_child_tid() from writing to a user-controlled * kernel address. */ - set_fs(USER_DS); + force_uaccess_begin(); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", @@ -804,7 +805,6 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); - exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent @@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, return ret; } +int kernel_wait(pid_t pid, int *stat) +{ + struct wait_opts wo = { + .wo_type = PIDTYPE_PID, + .wo_pid = find_get_pid(pid), + .wo_flags = WEXITED, + }; + int ret; + + ret = do_wait(&wo); + if (ret > 0 && wo.wo_stat) + *stat = wo.wo_stat; + put_pid(wo.wo_pid); + return ret; +} + SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { @@ -1711,6 +1727,30 @@ Efault: } #endif +/** + * thread_group_exited - check that a thread group has exited + * @pid: tgid of thread group to be checked. + * + * Test if the thread group represented by tgid has exited (all + * threads are zombies, dead or completely gone). + * + * Return: true if the thread group has exited. false otherwise. + */ +bool thread_group_exited(struct pid *pid) +{ + struct task_struct *task; + bool exited; + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + exited = !task || + (READ_ONCE(task->exit_state) && thread_group_empty(task)); + rcu_read_unlock(); + + return exited; +} +EXPORT_SYMBOL(thread_group_exited); + __weak void abort(void) { BUG(); diff --git a/kernel/fork.c b/kernel/fork.c index 2a8e7287a558..4d32190861bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -261,7 +261,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) THREAD_SIZE_ORDER); if (likely(page)) { - tsk->stack = page_address(page); + tsk->stack = kasan_reset_tag(page_address(page)); return tsk->stack; } return NULL; @@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk) if (vm) { int i; - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - mod_memcg_page_state(vm->pages[i], - MEMCG_KERNEL_STACK_KB, - -(int)(PAGE_SIZE / 1024)); - + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); - } for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], @@ -307,6 +302,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); + stack = kasan_reset_tag(stack); tsk->stack = stack; return stack; } @@ -382,31 +378,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account) void *stack = task_stack_page(tsk); struct vm_struct *vm = task_stack_vm_area(tsk); - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); - - if (vm) { - int i; - - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); - - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - mod_zone_page_state(page_zone(vm->pages[i]), - NR_KERNEL_STACK_KB, - PAGE_SIZE / 1024 * account); - } - } else { - /* - * All stack pages are in the same zone and belong to the - * same memcg. - */ - struct page *first_page = virt_to_page(stack); - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, - THREAD_SIZE / 1024 * account); - - mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); - } + /* All stack pages are in the same node. */ + if (vm) + mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + else + mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } static int memcg_charge_kernel_stack(struct task_struct *tsk) @@ -415,24 +394,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) struct vm_struct *vm = task_stack_vm_area(tsk); int ret; + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + if (vm) { int i; + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { /* * If memcg_kmem_charge_page() fails, page->mem_cgroup - * pointer is NULL, and both memcg_kmem_uncharge_page() - * and mod_memcg_page_state() in free_thread_stack() - * will ignore this page. So it's safe. + * pointer is NULL, and memcg_kmem_uncharge_page() in + * free_thread_stack() will ignore this page. */ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); if (ret) return ret; - - mod_memcg_page_state(vm->pages[i], - MEMCG_KERNEL_STACK_KB, - PAGE_SIZE / 1024); } } #endif @@ -479,7 +457,6 @@ void free_task(struct task_struct *tsk) #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); - put_seccomp_filter(tsk); arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); @@ -1480,7 +1457,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) goto out; } - newf = dup_fd(oldf, &error); + newf = dup_fd(oldf, NR_OPEN_MAX, &error); if (!newf) goto out; @@ -1793,22 +1770,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { - struct task_struct *task; struct pid *pid = file->private_data; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); - rcu_read_lock(); - task = pid_task(pid, PIDTYPE_PID); /* * Inform pollers only when the whole thread group exits. * If the thread group leader exits before all other threads in the * group, then poll(2) should block, similar to the wait(2) family. */ - if (!task || (task->exit_state && thread_group_empty(task))) + if (thread_group_exited(pid)) poll_flags = EPOLLIN | EPOLLRDNORM; - rcu_read_unlock(); return poll_flags; } @@ -2038,7 +2011,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; - seqcount_init(&p->mems_allowed_seq); + seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS memset(&p->irqtrace, 0, sizeof(p->irqtrace)); @@ -2102,8 +2075,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p, - args->tls); + retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); if (retval) goto bad_fork_cleanup_io; @@ -2422,6 +2394,20 @@ long _do_fork(struct kernel_clone_args *args) long nr; /* + * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument + * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are + * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate + * field in struct clone_args and it still doesn't make sense to have + * them both point at the same memory location. Performing this check + * here has the advantage that we don't need to have a separate helper + * to check for legacy clone(). + */ + if ((args->flags & CLONE_PIDFD) && + (args->flags & CLONE_PARENT_SETTID) && + (args->pidfd == args->parent_tid)) + return -EINVAL; + + /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event @@ -2478,42 +2464,6 @@ long _do_fork(struct kernel_clone_args *args) return nr; } -bool legacy_clone_args_valid(const struct kernel_clone_args *kargs) -{ - /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ - if ((kargs->flags & CLONE_PIDFD) && - (kargs->flags & CLONE_PARENT_SETTID)) - return false; - - return true; -} - -#ifndef CONFIG_HAVE_COPY_THREAD_TLS -/* For compatibility with architectures that call do_fork directly rather than - * using the syscall entry points below. */ -long do_fork(unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr) -{ - struct kernel_clone_args args = { - .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), - .pidfd = parent_tidptr, - .child_tid = child_tidptr, - .parent_tid = parent_tidptr, - .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), - .stack = stack_start, - .stack_size = stack_size, - }; - - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - - return _do_fork(&args); -} -#endif - /* * Create a kernel thread. */ @@ -2592,24 +2542,12 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - return _do_fork(&args); } #endif #ifdef __ARCH_WANT_SYS_CLONE3 -/* - * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from - * the registers containing the syscall arguments for clone. This doesn't work - * with clone3 since the TLS value is passed in clone_args instead. - */ -#ifndef CONFIG_HAVE_COPY_THREAD_TLS -#error clone3 requires copy_thread_tls support in arch -#endif - noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) @@ -2906,14 +2844,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, + struct files_struct **new_fdp) { struct files_struct *fd = current->files; int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, &error); + *new_fdp = dup_fd(fd, max_fds, &error); if (!*new_fdp) return error; } @@ -2924,7 +2863,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* - * functions used by do_fork() cannot be used here directly + * functions used by _do_fork() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. @@ -2973,7 +2912,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, &new_fd); + err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3062,7 +3001,7 @@ int unshare_files(struct files_struct **displaced) struct files_struct *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, ©); + error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); if (error || !copy) { *displaced = NULL; return error; diff --git a/kernel/futex.c b/kernel/futex.c index 4616d4ad609d..a5876694a60e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -678,7 +678,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) int ret; mmap_read_lock(mm); - ret = fixup_user_fault(current, mm, (unsigned long)uaddr, + ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); @@ -1305,7 +1305,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { int err; - u32 uninitialized_var(curval); + u32 curval; if (unlikely(should_fail_futex(true))) return -EFAULT; @@ -1475,7 +1475,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) */ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - u32 uninitialized_var(curval), newval; + u32 curval, newval; struct task_struct *new_owner; bool postunlock = false; DEFINE_WAKE_Q(wake_q); @@ -2325,7 +2325,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, struct task_struct *argowner) { struct futex_pi_state *pi_state = q->pi_state; - u32 uval, uninitialized_var(curval), newval; + u32 uval, curval, newval; struct task_struct *oldowner, *newowner; u32 newtid; int ret, err = 0; @@ -2942,7 +2942,7 @@ uaddr_faulted: */ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { - u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); + u32 curval, uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; struct futex_hash_bucket *hb; struct futex_q *top_waiter; @@ -3417,7 +3417,7 @@ err_unlock: static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { - u32 uval, uninitialized_var(nval), mval; + u32 uval, nval, mval; int err; /* Futex address must be 32bit aligned */ @@ -3547,7 +3547,7 @@ static void exit_robust_list(struct task_struct *curr) struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); + unsigned int next_pi; unsigned long futex_offset; int rc; @@ -3744,12 +3744,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; - /* fall through */ + fallthrough; case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; - /* fall through */ + fallthrough; case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: @@ -3847,7 +3847,7 @@ static void compat_exit_robust_list(struct task_struct *curr) struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); + unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 20512252ecc9..10a5aff4eecc 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -51,10 +51,6 @@ config GENERIC_IRQ_INJECTION config HARDIRQS_SW_RESEND bool -# Preflow handler support for fasteoi (sparc64) -config IRQ_PREFLOW_FASTEOI - bool - # Edge style eoi based handler (cell) config IRQ_EDGE_EOI_HANDLER bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 41e7e37a0928..857f5f4c8098 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -656,16 +656,6 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_level_irq); -#ifdef CONFIG_IRQ_PREFLOW_FASTEOI -static inline void preflow_handler(struct irq_desc *desc) -{ - if (desc->preflow_handler) - desc->preflow_handler(&desc->irq_data); -} -#else -static inline void preflow_handler(struct irq_desc *desc) { } -#endif - static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) { if (!(desc->istate & IRQS_ONESHOT)) { @@ -721,7 +711,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); @@ -1231,7 +1220,6 @@ void handle_fasteoi_ack_irq(struct irq_desc *desc) /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); @@ -1281,7 +1269,6 @@ void handle_fasteoi_mask_irq(struct irq_desc *desc) if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); @@ -1478,6 +1465,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) return 0; } +EXPORT_SYMBOL_GPL(irq_chip_retrigger_hierarchy); /** * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt @@ -1492,7 +1480,7 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) return -ENOSYS; } - +EXPORT_SYMBOL_GPL(irq_chip_set_vcpu_affinity_parent); /** * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt * @data: Pointer to interrupt specific data diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index a4c2c915511d..76cd7ebd1178 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -142,7 +142,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, if (!domain) return NULL; - if (fwnode && is_fwnode_irqchip(fwnode)) { + if (is_fwnode_irqchip(fwnode)) { fwid = container_of(fwnode, struct irqchip_fwid, fwnode); switch (fwid->type) { @@ -281,6 +281,7 @@ void irq_domain_update_bus_token(struct irq_domain *domain, mutex_unlock(&irq_domain_mutex); } +EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); /** * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 48c38e09c673..52ac5391dcc6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1308,9 +1308,6 @@ static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; - struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, @@ -1318,13 +1315,12 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); - param.sched_priority -= 1; } if (IS_ERR(t)) return PTR_ERR(t); - sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); + sched_set_fifo(t); /* * We keep the reference to the task struct even if @@ -2735,8 +2731,10 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); - if (WARN_ON_ONCE(!chip)) - return -ENODEV; + if (WARN_ON_ONCE(!chip)) { + err = -ENODEV; + goto out_unlock; + } if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -2749,6 +2747,7 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, if (data) err = chip->irq_set_irqchip_state(data, which, val); +out_unlock: irq_put_desc_busunlock(desc, flags); return err; } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 8f557fa1f4fe..c6c7e187ae74 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -185,14 +185,18 @@ void rearm_wake_irq(unsigned int irq) unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - if (!desc || !(desc->istate & IRQS_SUSPENDED) || - !irqd_is_wakeup_set(&desc->irq_data)) + if (!desc) return; + if (!(desc->istate & IRQS_SUSPENDED) || + !irqd_is_wakeup_set(&desc->irq_data)) + goto unlock; + desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); +unlock: irq_put_desc_busunlock(desc, flags); } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 27634f4022d0..c48ce19a257f 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg) } /* Tasklet to handle resend: */ -static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); +static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { diff --git a/kernel/kcov.c b/kernel/kcov.c index 6afae0bcbac4..6b8368be89c8 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -96,7 +96,7 @@ struct kcov_percpu_data { int saved_sequence; }; -DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); +static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) @@ -775,7 +775,7 @@ static inline bool kcov_mode_enabled(unsigned int mode) return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; } -void kcov_remote_softirq_start(struct task_struct *t) +static void kcov_remote_softirq_start(struct task_struct *t) { struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); unsigned int mode; @@ -792,7 +792,7 @@ void kcov_remote_softirq_start(struct task_struct *t) } } -void kcov_remote_softirq_stop(struct task_struct *t) +static void kcov_remote_softirq_stop(struct task_struct *t) { struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 09cc78df53c6..ca40bef75a61 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -265,7 +265,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, goto out; } - ima_kexec_cmdline(image->cmdline_buf, + ima_kexec_cmdline(kernel_fd, image->cmdline_buf, image->cmdline_buf_len - 1); } @@ -636,6 +636,19 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) } /** + * arch_kexec_locate_mem_hole - Find free memory to place the segments. + * @kbuf: Parameters for the memory search. + * + * On success, kbuf->mem will have the start address of the memory region found. + * + * Return: 0 on success, negative errno on error. + */ +int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) +{ + return kexec_locate_mem_hole(kbuf); +} + +/** * kexec_add_buffer - place a buffer in a kexec segment * @kbuf: Buffer contents and memory parameters. * @@ -647,7 +660,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) */ int kexec_add_buffer(struct kexec_buf *kbuf) { - struct kexec_segment *ksegment; int ret; @@ -675,7 +687,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); /* Walk the RAM ranges and allocate a suitable range for the buffer */ - ret = kexec_locate_mem_hole(kbuf); + ret = arch_kexec_locate_mem_hole(kbuf); if (ret) return ret; @@ -1157,24 +1169,26 @@ int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { int i, j; - unsigned long long start, end; + unsigned long long start, end, p_start, p_end; struct crash_mem_range temp_range = {0, 0}; for (i = 0; i < mem->nr_ranges; i++) { start = mem->ranges[i].start; end = mem->ranges[i].end; + p_start = mstart; + p_end = mend; if (mstart > end || mend < start) continue; /* Truncate any area outside of range */ if (mstart < start) - mstart = start; + p_start = start; if (mend > end) - mend = end; + p_end = end; /* Found completely overlapping range */ - if (mstart == start && mend == end) { + if (p_start == start && p_end == end) { mem->ranges[i].start = 0; mem->ranges[i].end = 0; if (i < mem->nr_ranges - 1) { @@ -1185,20 +1199,29 @@ int crash_exclude_mem_range(struct crash_mem *mem, mem->ranges[j].end = mem->ranges[j+1].end; } + + /* + * Continue to check if there are another overlapping ranges + * from the current position because of shifting the above + * mem ranges. + */ + i--; + mem->nr_ranges--; + continue; } mem->nr_ranges--; return 0; } - if (mstart > start && mend < end) { + if (p_start > start && p_end < end) { /* Split original range */ - mem->ranges[i].end = mstart - 1; - temp_range.start = mend + 1; + mem->ranges[i].end = p_start - 1; + temp_range.start = p_end + 1; temp_range.end = end; - } else if (mstart != start) - mem->ranges[i].end = mstart - 1; + } else if (p_start != start) + mem->ranges[i].end = p_start - 1; else - mem->ranges[i].start = mend + 1; + mem->ranges[i].start = p_end + 1; break; } @@ -1235,7 +1258,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, unsigned long long notes_addr; unsigned long mstart, mend; - /* extra phdr for vmcoreinfo elf note */ + /* extra phdr for vmcoreinfo ELF note */ nr_phdr = nr_cpus + 1; nr_phdr += mem->nr_ranges; @@ -1243,7 +1266,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). * I think this is required by tools like gdb. So same physical - * memory will be mapped in two elf headers. One will contain kernel + * memory will be mapped in two ELF headers. One will contain kernel * text virtual addresses and other will have __va(physical) addresses. */ @@ -1270,7 +1293,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, ehdr->e_ehsize = sizeof(Elf64_Ehdr); ehdr->e_phentsize = sizeof(Elf64_Phdr); - /* Prepare one phdr of type PT_NOTE for each present cpu */ + /* Prepare one phdr of type PT_NOTE for each present CPU */ for_each_present_cpu(cpu) { phdr->p_type = PT_NOTE; notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); @@ -1312,10 +1335,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; - phdr++; - pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, ehdr->e_phnum, phdr->p_offset); + phdr++; } *addr = buf; diff --git a/kernel/kmod.c b/kernel/kmod.c index 37c3c4b97b8e..3cd075ce2a1e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -36,9 +36,8 @@ * * If you need less than 50 threads would mean we're dealing with systems * smaller than 3200 pages. This assumes you are capable of having ~13M memory, - * and this would only be an be an upper limit, after which the OOM killer - * would take effect. Systems like these are very unlikely if modules are - * enabled. + * and this would only be an upper limit, after which the OOM killer would take + * effect. Systems like these are very unlikely if modules are enabled. */ #define MAX_KMOD_CONCURRENT 50 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e87679a48ba2..287b263c9cb9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1111,9 +1111,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } #else /* !CONFIG_KPROBES_ON_FTRACE */ -#define prepare_kprobe(p) arch_prepare_kprobe(p) -#define arm_kprobe_ftrace(p) (-ENODEV) -#define disarm_kprobe_ftrace(p) (-ENODEV) +static inline int prepare_kprobe(struct kprobe *p) +{ + return arch_prepare_kprobe(p); +} + +static inline int arm_kprobe_ftrace(struct kprobe *p) +{ + return -ENODEV; +} + +static inline int disarm_kprobe_ftrace(struct kprobe *p) +{ + return -ENODEV; +} #endif /* Arm a kprobe with text_mutex */ @@ -2145,6 +2156,13 @@ static void kill_kprobe(struct kprobe *p) * the original probed function (which will be freed soon) any more. */ arch_remove_kprobe(p); + + /* + * The module is going away. We should disarm the kprobe which + * is using ftrace. + */ + if (kprobe_ftrace(p)) + disarm_kprobe_ftrace(p); } /* Disable one kprobe */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 1d9e2fdfd67a..3edaa380dc7b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -480,7 +480,6 @@ EXPORT_SYMBOL(kthread_bind); * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread - * The thread will be woken and put into park mode. */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, @@ -1241,13 +1240,16 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(tsk->mm); task_lock(tsk); + /* Hold off tlb flush IPIs while switching mm's */ + local_irq_disable(); active_mm = tsk->active_mm; if (active_mm != mm) { mmgrab(mm); tsk->active_mm = mm; } tsk->mm = mm; - switch_mm(active_mm, mm, tsk); + switch_mm_irqs_off(active_mm, mm, tsk); + local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); @@ -1256,8 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); - to_kthread(tsk)->oldfs = get_fs(); - set_fs(USER_DS); + to_kthread(tsk)->oldfs = force_uaccess_begin(); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1272,13 +1273,15 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); - set_fs(to_kthread(tsk)->oldfs); + force_uaccess_end(to_kthread(tsk)->oldfs); task_lock(tsk); sync_mm_rss(mm); + local_irq_disable(); tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); + local_irq_enable(); task_unlock(tsk); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f361d75a75ac..2fad21d345b0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1723,7 +1723,7 @@ static int noop_count(struct lock_list *entry, void *data) static unsigned long __lockdep_count_forward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; __bfs_forwards(this, (void *)&count, noop_count, &target_entry); @@ -1749,7 +1749,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) static unsigned long __lockdep_count_backward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; __bfs_backwards(this, (void *)&count, noop_count, &target_entry); @@ -1804,7 +1804,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, struct lock_trace **const trace) { int ret; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), .parent = NULL, @@ -1842,7 +1842,7 @@ static noinline int check_redundant(struct held_lock *src, struct held_lock *target) { int ret; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), .parent = NULL, @@ -2244,8 +2244,8 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, { unsigned long usage_mask = 0, forward_mask, backward_mask; enum lock_usage_bit forward_bit = 0, backward_bit = 0; - struct lock_list *uninitialized_var(target_entry1); - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry1; + struct lock_list *target_entry; struct lock_list this, that; int ret; @@ -3438,7 +3438,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, { int ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); @@ -3465,7 +3465,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, { int ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 5525cd3ba0c8..02ef87f50df2 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -423,7 +423,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) seq_time(m, lt->min); seq_time(m, lt->max); seq_time(m, lt->total); - seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0); + seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0); } static void seq_stats(struct seq_file *m, struct lock_stat_data *data) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ff6f50e06a0..9cfa5e89cff7 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -436,8 +436,6 @@ static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) static void torture_rtmutex_boost(struct torture_random_state *trsp) { - int policy; - struct sched_param param; const unsigned int factor = 50000; /* yes, quite arbitrary */ if (!rt_task(current)) { @@ -448,8 +446,7 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) */ if (trsp && !(torture_random(trsp) % (cxt.nrealwriters_stress * factor))) { - policy = SCHED_FIFO; - param.sched_priority = MAX_RT_PRIO - 1; + sched_set_fifo(current); } else /* common case, do nothing */ return; } else { @@ -462,13 +459,10 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) */ if (!trsp || !(torture_random(trsp) % (cxt.nrealwriters_stress * factor * 2))) { - policy = SCHED_NORMAL; - param.sched_priority = 0; + sched_set_normal(current, 0); } else /* common case, do nothing */ return; } - - sched_setscheduler_nocheck(current, policy, ¶m); } static void torture_rtmutex_delay(struct torture_random_state *trsp) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b9515fcc9b29..cbff6ba53d56 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -581,4 +581,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #include "qspinlock_paravirt.h" #include "qspinlock.c" +bool nopvspin __initdata; +static __init int parse_nopvspin(char *arg) +{ + nopvspin = true; + return 0; +} +early_param("nopvspin", parse_nopvspin); #endif diff --git a/kernel/module.c b/kernel/module.c index aa183c9ac0a2..1c5cff34d9f2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -422,7 +422,7 @@ static bool each_symbol_in_section(const struct symsearch *arr, } /* Returns true as soon as fn returns true, otherwise false. */ -bool each_symbol_section(bool (*fn)(const struct symsearch *arr, +static bool each_symbol_section(bool (*fn)(const struct symsearch *arr, struct module *owner, void *data), void *data) @@ -484,7 +484,6 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, } return false; } -EXPORT_SYMBOL_GPL(each_symbol_section); struct find_symbol_arg { /* Input */ @@ -496,6 +495,7 @@ struct find_symbol_arg { struct module *owner; const s32 *crc; const struct kernel_symbol *sym; + enum mod_license license; }; static bool check_exported_symbol(const struct symsearch *syms, @@ -505,9 +505,9 @@ static bool check_exported_symbol(const struct symsearch *syms, struct find_symbol_arg *fsa = data; if (!fsa->gplok) { - if (syms->licence == GPL_ONLY) + if (syms->license == GPL_ONLY) return false; - if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { + if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) { pr_warn("Symbol %s is being used by a non-GPL module, " "which will not be allowed in the future\n", fsa->name); @@ -529,6 +529,7 @@ static bool check_exported_symbol(const struct symsearch *syms, fsa->owner = owner; fsa->crc = symversion(syms->crcs, symnum); fsa->sym = &syms->start[symnum]; + fsa->license = syms->license; return true; } @@ -585,9 +586,10 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, /* Find an exported symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ -const struct kernel_symbol *find_symbol(const char *name, +static const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const s32 **crc, + enum mod_license *license, bool gplok, bool warn) { @@ -602,13 +604,14 @@ const struct kernel_symbol *find_symbol(const char *name, *owner = fsa.owner; if (crc) *crc = fsa.crc; + if (license) + *license = fsa.license; return fsa.sym; } pr_debug("Failed to find symbol %s\n", name); return NULL; } -EXPORT_SYMBOL_GPL(find_symbol); /* * Search for module by name: must hold module_mutex (or preempt disabled @@ -869,7 +872,7 @@ static int add_module_usage(struct module *a, struct module *b) } /* Module a uses b: caller needs module_mutex() */ -int ref_module(struct module *a, struct module *b) +static int ref_module(struct module *a, struct module *b) { int err; @@ -888,7 +891,6 @@ int ref_module(struct module *a, struct module *b) } return 0; } -EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) @@ -1077,7 +1079,7 @@ void __symbol_put(const char *symbol) struct module *owner; preempt_disable(); - if (!find_symbol(symbol, &owner, NULL, true, false)) + if (!find_symbol(symbol, &owner, NULL, NULL, true, false)) BUG(); module_put(owner); preempt_enable(); @@ -1169,11 +1171,10 @@ static inline void module_unload_free(struct module *mod) { } -int ref_module(struct module *a, struct module *b) +static int ref_module(struct module *a, struct module *b) { return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(ref_module); static inline int module_unload_init(struct module *mod) { @@ -1356,7 +1357,7 @@ static inline int check_modstruct_version(const struct load_info *info, * locking is necessary -- use preempt_disable() to placate lockdep. */ preempt_disable(); - if (!find_symbol("module_layout", NULL, &crc, true, false)) { + if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) { preempt_enable(); BUG(); } @@ -1430,6 +1431,24 @@ static int verify_namespace_is_imported(const struct load_info *info, return 0; } +static bool inherit_taint(struct module *mod, struct module *owner) +{ + if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints)) + return true; + + if (mod->using_gplonly_symbols) { + pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n", + mod->name, owner->name); + return false; + } + + if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) { + pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n", + mod->name, owner->name); + set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints); + } + return true; +} /* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(struct module *mod, @@ -1440,6 +1459,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, struct module *owner; const struct kernel_symbol *sym; const s32 *crc; + enum mod_license license; int err; /* @@ -1449,11 +1469,19 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, */ sched_annotate_sleep(); mutex_lock(&module_mutex); - sym = find_symbol(name, &owner, &crc, + sym = find_symbol(name, &owner, &crc, &license, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!sym) goto unlock; + if (license == GPL_ONLY) + mod->using_gplonly_symbols = true; + + if (!inherit_taint(mod, owner)) { + sym = NULL; + goto getname; + } + if (!check_version(info, name, mod, crc)) { sym = ERR_PTR(-EINVAL); goto getname; @@ -1520,18 +1548,34 @@ struct module_sect_attrs { struct module_sect_attr attrs[]; }; +#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) static ssize_t module_sect_read(struct file *file, struct kobject *kobj, struct bin_attribute *battr, char *buf, loff_t pos, size_t count) { struct module_sect_attr *sattr = container_of(battr, struct module_sect_attr, battr); + char bounce[MODULE_SECT_READ_SIZE + 1]; + size_t wrote; if (pos != 0) return -EINVAL; - return sprintf(buf, "0x%px\n", - kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL); + /* + * Since we're a binary read handler, we must account for the + * trailing NUL byte that sprintf will write: if "buf" is + * too small to hold the NUL, or the NUL is exactly the last + * byte, the read will look like it got truncated by one byte. + * Since there is no way to ask sprintf nicely to not write + * the NUL, we have to use a bounce buffer. + */ + wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", + kallsyms_show_value(file->f_cred) + ? (void *)sattr->address : NULL); + count = min(count, wrote); + memcpy(buf, bounce, count); + + return count; } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -1580,7 +1624,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) goto out; sect_attrs->nsections++; sattr->battr.read = module_sect_read; - sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4); + sattr->battr.size = MODULE_SECT_READ_SIZE; sattr->battr.attr.mode = 0400; *(gattr++) = &(sattr++)->battr; } @@ -2220,7 +2264,7 @@ void *__symbol_get(const char *symbol) const struct kernel_symbol *sym; preempt_disable(); - sym = find_symbol(symbol, &owner, NULL, true, true); + sym = find_symbol(symbol, &owner, NULL, NULL, true, true); if (sym && strong_try_module_get(owner)) sym = NULL; preempt_enable(); @@ -2256,7 +2300,7 @@ static int verify_exported_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { if (find_symbol(kernel_symbol_name(s), &owner, NULL, - true, false)) { + NULL, true, false)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", mod->name, kernel_symbol_name(s), @@ -3237,7 +3281,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) if (section_addr(info, "__obsparm")) pr_warn("%s: Ignoring obsolete parameters\n", mod->name); - info->debug = section_objs(info, "__verbose", + info->debug = section_objs(info, "__dyndbg", sizeof(*info->debug), &info->num_debug); return 0; @@ -4473,7 +4517,6 @@ struct module *__module_address(unsigned long addr) } return mod; } -EXPORT_SYMBOL_GPL(__module_address); /* * is_module_text_address - is this address inside module code? @@ -4512,7 +4555,6 @@ struct module *__module_text_address(unsigned long addr) } return mod; } -EXPORT_SYMBOL_GPL(__module_text_address); /* Don't grab lock, we're oopsing. */ void print_modules(void) diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index cd356630a311..12dd41b39a7f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -262,8 +262,8 @@ void exit_task_namespaces(struct task_struct *p) static int check_setns_flags(unsigned long flags) { if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID | - CLONE_NEWCGROUP))) + CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER | + CLONE_NEWPID | CLONE_NEWCGROUP))) return -EINVAL; #ifndef CONFIG_USER_NS @@ -290,6 +290,10 @@ static int check_setns_flags(unsigned long flags) if (flags & CLONE_NEWNET) return -EINVAL; #endif +#ifndef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) + return -EINVAL; +#endif return 0; } @@ -464,6 +468,14 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid) } #endif +#ifdef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) { + ret = validate_ns(nsset, &nsp->time_ns->ns); + if (ret) + goto out; + } +#endif + out: if (pid_ns) put_pid_ns(pid_ns); @@ -507,6 +519,11 @@ static void commit_nsset(struct nsset *nsset) exit_sem(me); #endif +#ifdef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) + timens_commit(me, nsset->nsproxy->time_ns); +#endif + /* transfer ownership */ switch_task_namespaces(me, nsset->nsproxy); nsset->nsproxy = NULL; diff --git a/kernel/panic.c b/kernel/panic.c index e2157ca387c8..aef8872ba843 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -505,7 +505,7 @@ static void do_oops_enter_exit(void) * Return true if the calling CPU is allowed to print oops-related info. * This is a bit racy.. */ -int oops_may_print(void) +bool oops_may_print(void) { return pause_on_oops_flag == 0; } @@ -551,7 +551,7 @@ static int init_oops_id(void) } late_initcall(init_oops_id); -void print_oops_end_marker(void) +static void print_oops_end_marker(void) { init_oops_id(); pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); diff --git a/kernel/pid.c b/kernel/pid.c index f1496b757162..b2562a7ce525 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> +#include <net/sock.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -198,7 +199,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, if (tid != 1 && !tmp->child_reaper) goto out_free; retval = -EPERM; - if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN)) + if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_free; set_tid_size--; } @@ -635,17 +636,8 @@ static int pidfd_getfd(struct pid *pid, int fd) if (IS_ERR(file)) return PTR_ERR(file); - ret = security_file_receive(file); - if (ret) { - fput(file); - return ret; - } - - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) - fput(file); - else - fd_install(ret, file); + ret = receive_fd(file, O_CLOEXEC); + fput(file); return ret; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0e5ac162c3a8..ac135bd600eb 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, struct ctl_table tmp = *table; int ret, next; - if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; /* diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0a9326f5f421..c1ff7fa030ab 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -1,9 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Energy Model of CPUs + * Energy Model of devices * - * Copyright (c) 2018, Arm ltd. + * Copyright (c) 2018-2020, Arm ltd. * Written by: Quentin Perret, Arm ltd. + * Improvements provided by: Lukasz Luba, Arm ltd. */ #define pr_fmt(fmt) "energy_model: " fmt @@ -15,30 +16,32 @@ #include <linux/sched/topology.h> #include <linux/slab.h> -/* Mapping of each CPU to the performance domain to which it belongs. */ -static DEFINE_PER_CPU(struct em_perf_domain *, em_data); - /* * Mutex serializing the registrations of performance domains and letting * callbacks defined by drivers sleep. */ static DEFINE_MUTEX(em_pd_mutex); +static bool _is_cpu_device(struct device *dev) +{ + return (dev->bus == &cpu_subsys); +} + #ifdef CONFIG_DEBUG_FS static struct dentry *rootdir; -static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd) +static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) { struct dentry *d; char name[24]; - snprintf(name, sizeof(name), "cs:%lu", cs->frequency); + snprintf(name, sizeof(name), "ps:%lu", ps->frequency); - /* Create per-cs directory */ + /* Create per-ps directory */ d = debugfs_create_dir(name, pd); - debugfs_create_ulong("frequency", 0444, d, &cs->frequency); - debugfs_create_ulong("power", 0444, d, &cs->power); - debugfs_create_ulong("cost", 0444, d, &cs->cost); + debugfs_create_ulong("frequency", 0444, d, &ps->frequency); + debugfs_create_ulong("power", 0444, d, &ps->power); + debugfs_create_ulong("cost", 0444, d, &ps->cost); } static int em_debug_cpus_show(struct seq_file *s, void *unused) @@ -49,22 +52,30 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) +static void em_debug_create_pd(struct device *dev) { struct dentry *d; - char name[8]; int i; - snprintf(name, sizeof(name), "pd%d", cpu); - /* Create the directory of the performance domain */ - d = debugfs_create_dir(name, rootdir); + d = debugfs_create_dir(dev_name(dev), rootdir); - debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); + if (_is_cpu_device(dev)) + debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, + &em_debug_cpus_fops); + + /* Create a sub-directory for each performance state */ + for (i = 0; i < dev->em_pd->nr_perf_states; i++) + em_debug_create_ps(&dev->em_pd->table[i], d); - /* Create a sub-directory for each capacity state */ - for (i = 0; i < pd->nr_cap_states; i++) - em_debug_create_cs(&pd->table[i], d); +} + +static void em_debug_remove_pd(struct device *dev) +{ + struct dentry *debug_dir; + + debug_dir = debugfs_lookup(dev_name(dev), rootdir); + debugfs_remove_recursive(debug_dir); } static int __init em_debug_init(void) @@ -76,58 +87,55 @@ static int __init em_debug_init(void) } core_initcall(em_debug_init); #else /* CONFIG_DEBUG_FS */ -static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} +static void em_debug_create_pd(struct device *dev) {} +static void em_debug_remove_pd(struct device *dev) {} #endif -static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, - struct em_data_callback *cb) + +static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, + int nr_states, struct em_data_callback *cb) { unsigned long opp_eff, prev_opp_eff = ULONG_MAX; unsigned long power, freq, prev_freq = 0; - int i, ret, cpu = cpumask_first(span); - struct em_cap_state *table; - struct em_perf_domain *pd; + struct em_perf_state *table; + int i, ret; u64 fmax; - if (!cb->active_power) - return NULL; - - pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); - if (!pd) - return NULL; - table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); if (!table) - goto free_pd; + return -ENOMEM; - /* Build the list of capacity states for this performance domain */ + /* Build the list of performance states for this performance domain */ for (i = 0, freq = 0; i < nr_states; i++, freq++) { /* * active_power() is a driver callback which ceils 'freq' to - * lowest capacity state of 'cpu' above 'freq' and updates + * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, cpu); + ret = cb->active_power(&power, &freq, dev); if (ret) { - pr_err("pd%d: invalid cap. state: %d\n", cpu, ret); - goto free_cs_table; + dev_err(dev, "EM: invalid perf. state: %d\n", + ret); + goto free_ps_table; } /* * We expect the driver callback to increase the frequency for - * higher capacity states. + * higher performance states. */ if (freq <= prev_freq) { - pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); - goto free_cs_table; + dev_err(dev, "EM: non-increasing freq: %lu\n", + freq); + goto free_ps_table; } /* * The power returned by active_state() is expected to be * positive, in milli-watts and to fit into 16 bits. */ - if (!power || power > EM_CPU_MAX_POWER) { - pr_err("pd%d: invalid power: %lu\n", cpu, power); - goto free_cs_table; + if (!power || power > EM_MAX_POWER) { + dev_err(dev, "EM: invalid power: %lu\n", + power); + goto free_ps_table; } table[i].power = power; @@ -141,12 +149,12 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, */ opp_eff = freq / power; if (opp_eff >= prev_opp_eff) - pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n", - cpu, i, i - 1); + dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", + i, i - 1); prev_opp_eff = opp_eff; } - /* Compute the cost of each capacity_state. */ + /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = 0; i < nr_states; i++) { table[i].cost = div64_u64(fmax * table[i].power, @@ -154,39 +162,94 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, } pd->table = table; - pd->nr_cap_states = nr_states; - cpumask_copy(to_cpumask(pd->cpus), span); - - em_debug_create_pd(pd, cpu); + pd->nr_perf_states = nr_states; - return pd; + return 0; -free_cs_table: +free_ps_table: kfree(table); -free_pd: - kfree(pd); + return -EINVAL; +} + +static int em_create_pd(struct device *dev, int nr_states, + struct em_data_callback *cb, cpumask_t *cpus) +{ + struct em_perf_domain *pd; + struct device *cpu_dev; + int cpu, ret; + + if (_is_cpu_device(dev)) { + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); + if (!pd) + return -ENOMEM; + + cpumask_copy(em_span_cpus(pd), cpus); + } else { + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return -ENOMEM; + } + + ret = em_create_perf_table(dev, pd, nr_states, cb); + if (ret) { + kfree(pd); + return ret; + } + + if (_is_cpu_device(dev)) + for_each_cpu(cpu, cpus) { + cpu_dev = get_cpu_device(cpu); + cpu_dev->em_pd = pd; + } + + dev->em_pd = pd; + + return 0; +} + +/** + * em_pd_get() - Return the performance domain for a device + * @dev : Device to find the performance domain for + * + * Returns the performance domain to which @dev belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_pd_get(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev)) + return NULL; - return NULL; + return dev->em_pd; } +EXPORT_SYMBOL_GPL(em_pd_get); /** * em_cpu_get() - Return the performance domain for a CPU * @cpu : CPU to find the performance domain for * - * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't + * Returns the performance domain to which @cpu belongs, or NULL if it doesn't * exist. */ struct em_perf_domain *em_cpu_get(int cpu) { - return READ_ONCE(per_cpu(em_data, cpu)); + struct device *cpu_dev; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return NULL; + + return em_pd_get(cpu_dev); } EXPORT_SYMBOL_GPL(em_cpu_get); /** - * em_register_perf_domain() - Register the Energy Model of a performance domain - * @span : Mask of CPUs in the performance domain - * @nr_states : Number of capacity states to register + * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device + * @dev : Device for which the EM is to register + * @nr_states : Number of performance states to register * @cb : Callback functions providing the data of the Energy Model + * @cpus : Pointer to cpumask_t, which in case of a CPU device is + * obligatory. It can be taken from i.e. 'policy->cpus'. For other + * type of devices this should be set to NULL. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. @@ -196,14 +259,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * * Return 0 on success */ -int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb) +int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, + struct em_data_callback *cb, cpumask_t *cpus) { unsigned long cap, prev_cap = 0; - struct em_perf_domain *pd; - int cpu, ret = 0; + int cpu, ret; - if (!span || !nr_states || !cb) + if (!dev || !nr_states || !cb) return -EINVAL; /* @@ -212,47 +274,79 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, */ mutex_lock(&em_pd_mutex); - for_each_cpu(cpu, span) { - /* Make sure we don't register again an existing domain. */ - if (READ_ONCE(per_cpu(em_data, cpu))) { - ret = -EEXIST; - goto unlock; - } + if (dev->em_pd) { + ret = -EEXIST; + goto unlock; + } - /* - * All CPUs of a domain must have the same micro-architecture - * since they all share the same table. - */ - cap = arch_scale_cpu_capacity(cpu); - if (prev_cap && prev_cap != cap) { - pr_err("CPUs of %*pbl must have the same capacity\n", - cpumask_pr_args(span)); + if (_is_cpu_device(dev)) { + if (!cpus) { + dev_err(dev, "EM: invalid CPU mask\n"); ret = -EINVAL; goto unlock; } - prev_cap = cap; + + for_each_cpu(cpu, cpus) { + if (em_cpu_get(cpu)) { + dev_err(dev, "EM: exists for CPU%d\n", cpu); + ret = -EEXIST; + goto unlock; + } + /* + * All CPUs of a domain must have the same + * micro-architecture since they all share the same + * table. + */ + cap = arch_scale_cpu_capacity(cpu); + if (prev_cap && prev_cap != cap) { + dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n", + cpumask_pr_args(cpus)); + + ret = -EINVAL; + goto unlock; + } + prev_cap = cap; + } } - /* Create the performance domain and add it to the Energy Model. */ - pd = em_create_pd(span, nr_states, cb); - if (!pd) { - ret = -EINVAL; + ret = em_create_pd(dev, nr_states, cb, cpus); + if (ret) goto unlock; - } - for_each_cpu(cpu, span) { - /* - * The per-cpu array can be read concurrently from em_cpu_get(). - * The barrier enforces the ordering needed to make sure readers - * can only access well formed em_perf_domain structs. - */ - smp_store_release(per_cpu_ptr(&em_data, cpu), pd); - } + em_debug_create_pd(dev); + dev_info(dev, "EM: created perf domain\n"); - pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span)); unlock: mutex_unlock(&em_pd_mutex); - return ret; } -EXPORT_SYMBOL_GPL(em_register_perf_domain); +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device + * @dev : Device for which the EM is registered + * + * Unregister the EM for the specified @dev (but not a CPU device). + */ +void em_dev_unregister_perf_domain(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev) || !dev->em_pd) + return; + + if (_is_cpu_device(dev)) + return; + + /* + * The mutex separates all register/unregister requests and protects + * from potential clean-up/setup issues in the debugfs directories. + * The debugfs directory name is the same as device's name. + */ + mutex_lock(&em_pd_mutex); + em_debug_remove_pd(dev); + + kfree(dev->em_pd->table); + kfree(dev->em_pd); + dev->em_pd = NULL; + mutex_unlock(&em_pd_mutex); +} +EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 02ec716a4927..f33769f97aca 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -795,6 +795,103 @@ int hibernate(void) return error; } +/** + * hibernate_quiet_exec - Execute a function with all devices frozen. + * @func: Function to execute. + * @data: Data pointer to pass to @func. + * + * Return the @func return value or an error code if it cannot be executed. + */ +int hibernate_quiet_exec(int (*func)(void *data), void *data) +{ + int error, nr_calls = 0; + + lock_system_sleep(); + + if (!hibernate_acquire()) { + error = -EBUSY; + goto unlock; + } + + pm_prepare_console(); + + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; + goto exit; + } + + error = freeze_processes(); + if (error) + goto exit; + + lock_device_hotplug(); + + pm_suspend_clear_flags(); + + error = platform_begin(true); + if (error) + goto thaw; + + error = freeze_kernel_threads(); + if (error) + goto thaw; + + error = dpm_prepare(PMSG_FREEZE); + if (error) + goto dpm_complete; + + suspend_console(); + + error = dpm_suspend(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = dpm_suspend_end(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = platform_pre_snapshot(true); + if (error) + goto skip; + + error = func(data); + +skip: + platform_finish(true); + + dpm_resume_start(PMSG_THAW); + +dpm_resume: + dpm_resume(PMSG_THAW); + + resume_console(); + +dpm_complete: + dpm_complete(PMSG_THAW); + + thaw_kernel_threads(); + +thaw: + platform_end(true); + + unlock_device_hotplug(); + + thaw_processes(); + +exit: + __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); + + pm_restore_console(); + + hibernate_release(); + +unlock: + unlock_system_sleep(); + + return error; +} +EXPORT_SYMBOL_GPL(hibernate_quiet_exec); /** * software_resume - Resume from a saved hibernation image. @@ -1062,7 +1159,7 @@ power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), + return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } @@ -1162,7 +1259,7 @@ static ssize_t reserved_size_store(struct kobject *kobj, power_attr(reserved_size); -static struct attribute * g[] = { +static struct attribute *g[] = { &disk_attr.attr, &resume_offset_attr.attr, &resume_attr.attr, @@ -1190,7 +1287,7 @@ static int __init resume_setup(char *str) if (noresume) return 1; - strncpy( resume_file, str, 255 ); + strncpy(resume_file, str, 255); return 1; } diff --git a/kernel/power/power.h b/kernel/power/power.h index ba2094db6294..32fc89ac96c3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -32,7 +32,7 @@ static inline int init_header_complete(struct swsusp_info *info) return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); } -static inline char *check_image_kernel(struct swsusp_info *info) +static inline const char *check_image_kernel(struct swsusp_info *info) { return arch_hibernation_header_restore(info) ? "architecture specific data" : NULL; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 881128b9351e..d25749bce7cf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1663,7 +1663,7 @@ static unsigned long minimum_image_size(unsigned long saveable) { unsigned long size; - size = global_node_page_state(NR_SLAB_RECLAIMABLE) + size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) @@ -2023,7 +2023,7 @@ static int init_header_complete(struct swsusp_info *info) return 0; } -static char *check_image_kernel(struct swsusp_info *info) +static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; @@ -2176,7 +2176,7 @@ static void mark_unsafe_pages(struct memory_bitmap *bm) static int check_header(struct swsusp_info *info) { - char *reason; + const char *reason; reason = check_image_kernel(info); if (!reason && info->num_physpages != get_num_physpages()) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b71eaf5f5a86..9b75f6bfc333 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -943,6 +943,14 @@ out: return ret; } +/* + * Be careful when modifying this function!!! + * + * Only few operations are supported because the device works only with the + * entire variable length messages (records). Non-standard values are + * returned in the other cases and has been this way for quite some time. + * User space applications might depend on this behavior. + */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; @@ -2658,7 +2666,7 @@ early_param("keep_bootcon", keep_bootcon_setup); static int try_enable_new_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; - int i; + int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; @@ -2681,8 +2689,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) return 0; if (newcon->setup && - newcon->setup(newcon, c->options) != 0) - return -EIO; + (err = newcon->setup(newcon, c->options)) != 0) + return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) { @@ -2695,7 +2703,7 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() - * and setup() had a change to be called. + * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index ec903d781778..21448d3374e2 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -361,7 +361,6 @@ rcu_perf_writer(void *arg) int i_max; long me = (long)arg; struct rcu_head *rhp = NULL; - struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; u64 *wdp; @@ -370,8 +369,7 @@ rcu_perf_writer(void *arg) VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - sp.sched_priority = 1; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + sched_set_fifo_low(current); if (holdoff) schedule_timeout_uninterruptible(holdoff * HZ); @@ -427,9 +425,7 @@ retry: started = true; if (!done && i >= MIN_MEAS) { done = true; - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, - SCHED_NORMAL, &sp); + sched_set_normal(current, 0); pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", perf_type, PERF_FLAG, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d0d265304d14..f453bf8d2f1e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -895,16 +895,11 @@ static int rcu_torture_boost(void *arg) unsigned long endtime; unsigned long oldstarttime; struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } + sched_set_fifo_low(current); init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ac7198ed3197..8ce77d9ac716 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -59,6 +59,7 @@ #include <linux/sched/clock.h> #include <linux/vmalloc.h> #include <linux/mm.h> +#include <linux/kasan.h> #include "../time/tick-internal.h" #include "tree.h" @@ -2890,6 +2891,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) head->func = func; head->next = NULL; local_irq_save(flags); + kasan_record_aux_stack(head); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ diff --git a/kernel/regset.c b/kernel/regset.c new file mode 100644 index 000000000000..586823786f39 --- /dev/null +++ b/kernel/regset.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/regset.h> + +static int __regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data) +{ + void *p = *data, *to_free = NULL; + int res; + + if (!regset->regset_get) + return -EOPNOTSUPP; + if (size > regset->n * regset->size) + size = regset->n * regset->size; + if (!p) { + to_free = p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + } + res = regset->regset_get(target, regset, + (struct membuf){.p = p, .left = size}); + if (res < 0) { + kfree(to_free); + return res; + } + *data = p; + return size - res; +} + +int regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void *data) +{ + return __regset_get(target, regset, size, &data); +} +EXPORT_SYMBOL(regset_get); + +int regset_get_alloc(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data) +{ + *data = NULL; + return __regset_get(target, regset, size, data); +} +EXPORT_SYMBOL(regset_get_alloc); + +/** + * copy_regset_to_user - fetch a thread's user_regset data into user memory + * @target: thread to be examined + * @view: &struct user_regset_view describing user thread machine state + * @setno: index in @view->regsets + * @offset: offset into the regset data, in bytes + * @size: amount of data to copy, in bytes + * @data: user-mode pointer to copy into + */ +int copy_regset_to_user(struct task_struct *target, + const struct user_regset_view *view, + unsigned int setno, + unsigned int offset, unsigned int size, + void __user *data) +{ + const struct user_regset *regset = &view->regsets[setno]; + void *buf; + int ret; + + ret = regset_get_alloc(target, regset, size, &buf); + if (ret > 0) + ret = copy_to_user(data, buf, ret) ? -EFAULT : 0; + kfree(buf); + return ret; +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4a0e7b449b88..8471a0f7eb32 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5496,6 +5496,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, * @policy: new policy. * @param: structure containing the new RT priority. * + * Use sched_set_fifo(), read its comment. + * * Return: 0 on success. An error code otherwise. * * NOTE that the task may be already dead. @@ -5505,13 +5507,11 @@ int sched_setscheduler(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, true); } -EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, true, true); } -EXPORT_SYMBOL_GPL(sched_setattr); int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { @@ -5536,7 +5536,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); + +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. + * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +void sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +void sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +void sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_normal); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) @@ -6387,10 +6431,10 @@ void sched_show_task(struct task_struct *p) if (!try_get_task_stack(p)) return; - printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); if (p->state == TASK_RUNNING) - printk(KERN_CONT " running task "); + pr_cont(" running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -6399,8 +6443,8 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, + pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", + free, task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); @@ -6435,13 +6479,6 @@ void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif rcu_read_lock(); for_each_process_thread(g, p) { /* diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index dc6835bc6490..e39008242cf4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -909,11 +909,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif -static int __init sugov_register(void) -{ - return cpufreq_register_governor(&schedutil_gov); -} -core_initcall(sugov_register); +cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL extern bool sched_energy_update; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2ba8f230feb9..1a68a0536add 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6509,7 +6509,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) max_util = max(max_util, cpu_util); } - return em_pd_energy(pd->em_pd, max_util, sum_util); + return em_cpu_energy(pd->em_pd, max_util, sum_util); } /* diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e53b711bd643..967732c0766c 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -616,11 +616,8 @@ out: static int psi_poll_worker(void *data) { struct psi_group *group = (struct psi_group *)data; - struct sched_param param = { - .sched_priority = 1, - }; - sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); + sched_set_fifo_low(current); while (true) { wait_event_interruptible(group->poll_wait, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3fd283892761..28709f6b0975 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1999,7 +1999,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; if (trace_sched_update_nr_running_tp_enabled()) { - call_trace_sched_update_nr_running(rq, count); + call_trace_sched_update_nr_running(rq, -count); } /* Check if we still need preemption */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9079d865a935..007b0a6b0152 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -272,10 +272,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map, printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); while (pd) { - printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", cpumask_first(perf_domain_span(pd)), cpumask_pr_args(perf_domain_span(pd)), - em_pd_nr_cap_states(pd->em_pd)); + em_pd_nr_perf_states(pd->em_pd)); pd = pd->next; } @@ -313,26 +313,26 @@ static void sched_energy_set(bool has_eas) * * The complexity of the Energy Model is defined as: * - * C = nr_pd * (nr_cpus + nr_cs) + * C = nr_pd * (nr_cpus + nr_ps) * * with parameters defined as: * - nr_pd: the number of performance domains * - nr_cpus: the number of CPUs - * - nr_cs: the sum of the number of capacity states of all performance + * - nr_ps: the sum of the number of performance states of all performance * domains (for example, on a system with 2 performance domains, - * with 10 capacity states each, nr_cs = 2 * 10 = 20). + * with 10 performance states each, nr_ps = 2 * 10 = 20). * * It is generally not a good idea to use such a model in the wake-up path on * very complex platforms because of the associated scheduling overheads. The * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 capacity states each, for example. + * with per-CPU DVFS and less than 8 performance states each, for example. */ #define EM_MAX_COMPLEXITY 2048 extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); + int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; @@ -384,15 +384,15 @@ static bool build_perf_domains(const struct cpumask *cpu_map) pd = tmp; /* - * Count performance domains and capacity states for the + * Count performance domains and performance states for the * complexity check. */ nr_pd++; - nr_cs += em_pd_nr_cap_states(pd->em_pd); + nr_ps += em_pd_nr_perf_states(pd->em_pd); } /* Bail out if the Energy Model complexity is too high. */ - if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { + if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", cpumask_pr_args(cpu_map)); goto free; diff --git a/kernel/scs.c b/kernel/scs.c index 5d4d9bbdec36..4ff4a7ba0094 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -17,7 +17,7 @@ static void __scs_account(void *s, int account) { struct page *scs_page = virt_to_page(s); - mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB, + mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB, account * (SCS_SIZE / SZ_1K)); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d653d8426de9..3ee59ce0a323 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -13,6 +13,7 @@ * Mode 2 allows user-defined system call filters in the form * of Berkeley Packet Filters/Linux Socket Filters. */ +#define pr_fmt(fmt) "seccomp: " fmt #include <linux/refcount.h> #include <linux/audit.h> @@ -41,6 +42,15 @@ #include <linux/tracehook.h> #include <linux/uaccess.h> #include <linux/anon_inodes.h> +#include <linux/lockdep.h> + +/* + * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the + * wrong direction flag in the ioctl number. This is the broken one, + * which the kernel needs to keep supporting until all userspaces stop + * using the wrong command number. + */ +#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) enum notify_state { SECCOMP_NOTIFY_INIT, @@ -77,10 +87,42 @@ struct seccomp_knotif { long val; u32 flags; - /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ + /* + * Signals when this has changed states, such as the listener + * dying, a new seccomp addfd message, or changing to REPLIED + */ struct completion ready; struct list_head list; + + /* outstanding addfd requests */ + struct list_head addfd; +}; + +/** + * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages + * + * @file: A reference to the file to install in the other task + * @fd: The fd number to install it at. If the fd number is -1, it means the + * installing process should allocate the fd as normal. + * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC + * is allowed. + * @ret: The return value of the installing process. It is set to the fd num + * upon success (>= 0). + * @completion: Indicates that the installing process has completed fd + * installation, or gone away (either due to successful + * reply, or signal) + * + */ +struct seccomp_kaddfd { + struct file *file; + int fd; + unsigned int flags; + + /* To only be set on reply */ + int ret; + struct completion completion; + struct list_head list; }; /** @@ -94,27 +136,35 @@ struct seccomp_knotif { * filter->notify_lock. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. - * @wqh: A wait queue for poll. */ struct notification { struct semaphore request; u64 next_id; struct list_head notifications; - wait_queue_head_t wqh; }; /** * struct seccomp_filter - container for seccomp BPF programs * - * @usage: reference count to manage the object lifetime. - * get/put helpers should be used when accessing an instance - * outside of a lifetime-guarded section. In general, this - * is only needed for handling filters shared across tasks. + * @refs: Reference count to manage the object lifetime. + * A filter's reference count is incremented for each directly + * attached task, once for the dependent filter, and if + * requested for the user notifier. When @refs reaches zero, + * the filter can be freed. + * @users: A filter's @users count is incremented for each directly + * attached task (filter installation, fork(), thread_sync), + * and once for the dependent filter (tracked in filter->prev). + * When it reaches zero it indicates that no direct or indirect + * users of that filter exist. No new tasks can get associated with + * this filter after reaching 0. The @users count is always smaller + * or equal to @refs. Hence, reaching 0 for @users does not mean + * the filter can be freed. * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information * @notify_lock: A lock for all notification-related accesses. + * @wqh: A wait queue for poll if a notifier is in use. * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -124,15 +174,17 @@ struct notification { * how namespaces work. * * seccomp_filter objects should never be modified after being attached - * to a task_struct (other than @usage). + * to a task_struct (other than @refs). */ struct seccomp_filter { - refcount_t usage; + refcount_t refs; + refcount_t users; bool log; struct seccomp_filter *prev; struct bpf_prog *prog; struct notification *notif; struct mutex notify_lock; + wait_queue_head_t wqh; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -366,6 +418,59 @@ static inline pid_t seccomp_can_sync_threads(void) return 0; } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + bpf_prog_destroy(filter->prog); + kfree(filter); + } +} + +static void __seccomp_filter_orphan(struct seccomp_filter *orig) +{ + while (orig && refcount_dec_and_test(&orig->users)) { + if (waitqueue_active(&orig->wqh)) + wake_up_poll(&orig->wqh, EPOLLHUP); + orig = orig->prev; + } +} + +static void __put_seccomp_filter(struct seccomp_filter *orig) +{ + /* Clean up single-reference branches iteratively. */ + while (orig && refcount_dec_and_test(&orig->refs)) { + struct seccomp_filter *freeme = orig; + orig = orig->prev; + seccomp_filter_free(freeme); + } +} + +static void __seccomp_filter_release(struct seccomp_filter *orig) +{ + /* Notify about any unused filters in the task's former filter tree. */ + __seccomp_filter_orphan(orig); + /* Finally drop all references to the task's former tree. */ + __put_seccomp_filter(orig); +} + +/** + * seccomp_filter_release - Detach the task from its filter tree, + * drop its reference count, and notify + * about unused filters + * + * This function should only be called when the task is exiting as + * it detaches it from its filter tree. As such, READ_ONCE() and + * barriers are not needed here, as would normally be needed. + */ +void seccomp_filter_release(struct task_struct *tsk) +{ + struct seccomp_filter *orig = tsk->seccomp.filter; + + /* Detach task from its filter tree. */ + tsk->seccomp.filter = NULL; + __seccomp_filter_release(orig); +} + /** * seccomp_sync_threads: sets all threads to use current's filter * @@ -390,14 +495,19 @@ static inline void seccomp_sync_threads(unsigned long flags) /* Get a task reference for the new leaf node. */ get_seccomp_filter(caller); + /* * Drop the task reference to the shared ancestor since * current's path will hold a reference. (This also * allows a put before the assignment.) */ - put_seccomp_filter(thread); + __seccomp_filter_release(thread->seccomp.filter); + + /* Make our new filter tree visible. */ smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + atomic_set(&thread->seccomp.filter_count, + atomic_read(&thread->seccomp.filter_count)); /* * Don't let an unprivileged task work around @@ -461,7 +571,9 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(ret); } - refcount_set(&sfilter->usage, 1); + refcount_set(&sfilter->refs, 1); + refcount_set(&sfilter->users, 1); + init_waitqueue_head(&sfilter->wqh); return sfilter; } @@ -544,6 +656,7 @@ static long seccomp_attach_filter(unsigned int flags, */ filter->prev = current->seccomp.filter; current->seccomp.filter = filter; + atomic_inc(¤t->seccomp.filter_count); /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) @@ -554,7 +667,7 @@ static long seccomp_attach_filter(unsigned int flags, static void __get_seccomp_filter(struct seccomp_filter *filter) { - refcount_inc(&filter->usage); + refcount_inc(&filter->refs); } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -564,30 +677,7 @@ void get_seccomp_filter(struct task_struct *tsk) if (!orig) return; __get_seccomp_filter(orig); -} - -static inline void seccomp_filter_free(struct seccomp_filter *filter) -{ - if (filter) { - bpf_prog_destroy(filter->prog); - kfree(filter); - } -} - -static void __put_seccomp_filter(struct seccomp_filter *orig) -{ - /* Clean up single-reference branches iteratively. */ - while (orig && refcount_dec_and_test(&orig->usage)) { - struct seccomp_filter *freeme = orig; - orig = orig->prev; - seccomp_filter_free(freeme); - } -} - -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) -{ - __put_seccomp_filter(tsk->seccomp.filter); + refcount_inc(&orig->users); } static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) @@ -684,20 +774,20 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, - 0, /* null terminated */ + -1, /* negative terminated */ }; static void __secure_computing_strict(int this_syscall) { - const int *syscall_whitelist = mode1_syscalls; + const int *allowed_syscalls = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) - syscall_whitelist = get_compat_mode1_syscalls(); + allowed_syscalls = get_compat_mode1_syscalls(); #endif do { - if (*syscall_whitelist == this_syscall) + if (*allowed_syscalls == this_syscall) return; - } while (*++syscall_whitelist); + } while (*++allowed_syscalls != -1); #ifdef SECCOMP_DEBUG dump_stack(); @@ -735,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) +{ + /* + * Remove the notification, and reset the list pointers, indicating + * that it has been handled. + */ + list_del_init(&addfd->list); + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + complete(&addfd->completion); +} + static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) @@ -743,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall, u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; + struct seccomp_kaddfd *addfd, *tmp; mutex_lock(&match->notify_lock); err = -ENOSYS; @@ -755,25 +857,43 @@ static int seccomp_do_user_notification(int this_syscall, n.id = seccomp_next_notify_id(match); init_completion(&n.ready); list_add(&n.list, &match->notif->notifications); + INIT_LIST_HEAD(&n.addfd); up(&match->notif->request); - wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM); + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); mutex_unlock(&match->notify_lock); /* * This is where we wait for a reply from userspace. */ +wait: err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); if (err == 0) { + /* Check if we were woken up by a addfd message */ + addfd = list_first_entry_or_null(&n.addfd, + struct seccomp_kaddfd, list); + if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) { + seccomp_handle_addfd(addfd); + mutex_unlock(&match->notify_lock); + goto wait; + } ret = n.val; err = n.error; flags = n.flags; } + /* If there were any pending addfd calls, clear them out */ + list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { + /* The process went away before we got a chance to handle it */ + addfd->ret = -ESRCH; + list_del_init(&addfd->list); + complete(&addfd->completion); + } + /* * Note that it's possible the listener died in between the time when - * we were notified of a respons (or a signal) and when we were able to + * we were notified of a response (or a signal) and when we were able to * re-acquire the lock, so only delete from the list if the * notification actually exists. * @@ -1011,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) knotif->error = -ENOSYS; knotif->val = 0; + /* + * We do not need to wake up any pending addfd messages, as + * the notifier will do that for us, as this just looks + * like a standard reply. + */ complete(&knotif->ready); } @@ -1021,6 +1146,23 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) return 0; } +/* must be called with notif_lock held */ +static inline struct seccomp_knotif * +find_notification(struct seccomp_filter *filter, u64 id) +{ + struct seccomp_knotif *cur; + + lockdep_assert_held(&filter->notify_lock); + + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->id == id) + return cur; + } + + return NULL; +} + + static long seccomp_notify_recv(struct seccomp_filter *filter, void __user *buf) { @@ -1064,7 +1206,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, unotif.data = *(knotif->data); knotif->state = SECCOMP_NOTIFY_SENT; - wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM); + wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM); ret = 0; out: mutex_unlock(&filter->notify_lock); @@ -1078,15 +1220,8 @@ out: * may have died when we released the lock, so we need to make * sure it's still around. */ - knotif = NULL; mutex_lock(&filter->notify_lock); - list_for_each_entry(cur, &filter->notif->notifications, list) { - if (cur->id == unotif.id) { - knotif = cur; - break; - } - } - + knotif = find_notification(filter, unotif.id); if (knotif) { knotif->state = SECCOMP_NOTIFY_INIT; up(&filter->notif->request); @@ -1101,7 +1236,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, void __user *buf) { struct seccomp_notif_resp resp = {}; - struct seccomp_knotif *knotif = NULL, *cur; + struct seccomp_knotif *knotif; long ret; if (copy_from_user(&resp, buf, sizeof(resp))) @@ -1118,13 +1253,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, if (ret < 0) return ret; - list_for_each_entry(cur, &filter->notif->notifications, list) { - if (cur->id == resp.id) { - knotif = cur; - break; - } - } - + knotif = find_notification(filter, resp.id); if (!knotif) { ret = -ENOENT; goto out; @@ -1150,7 +1279,7 @@ out: static long seccomp_notify_id_valid(struct seccomp_filter *filter, void __user *buf) { - struct seccomp_knotif *knotif = NULL; + struct seccomp_knotif *knotif; u64 id; long ret; @@ -1161,17 +1290,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, if (ret < 0) return ret; - ret = -ENOENT; - list_for_each_entry(knotif, &filter->notif->notifications, list) { - if (knotif->id == id) { - if (knotif->state == SECCOMP_NOTIFY_SENT) - ret = 0; - goto out; - } + knotif = find_notification(filter, id); + if (knotif && knotif->state == SECCOMP_NOTIFY_SENT) + ret = 0; + else + ret = -ENOENT; + + mutex_unlock(&filter->notify_lock); + return ret; +} + +static long seccomp_notify_addfd(struct seccomp_filter *filter, + struct seccomp_notif_addfd __user *uaddfd, + unsigned int size) +{ + struct seccomp_notif_addfd addfd; + struct seccomp_knotif *knotif; + struct seccomp_kaddfd kaddfd; + int ret; + + BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0); + BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST); + + if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE) + return -EINVAL; + + ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size); + if (ret) + return ret; + + if (addfd.newfd_flags & ~O_CLOEXEC) + return -EINVAL; + + if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) + return -EINVAL; + + if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) + return -EINVAL; + + kaddfd.file = fget(addfd.srcfd); + if (!kaddfd.file) + return -EBADF; + + kaddfd.flags = addfd.newfd_flags; + kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? + addfd.newfd : -1; + init_completion(&kaddfd.completion); + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + goto out; + + knotif = find_notification(filter, addfd.id); + if (!knotif) { + ret = -ENOENT; + goto out_unlock; } -out: + /* + * We do not want to allow for FD injection to occur before the + * notification has been picked up by a userspace handler, or after + * the notification has been replied to. + */ + if (knotif->state != SECCOMP_NOTIFY_SENT) { + ret = -EINPROGRESS; + goto out_unlock; + } + + list_add(&kaddfd.list, &knotif->addfd); + complete(&knotif->ready); + mutex_unlock(&filter->notify_lock); + + /* Now we wait for it to be processed or be interrupted */ + ret = wait_for_completion_interruptible(&kaddfd.completion); + if (ret == 0) { + /* + * We had a successful completion. The other side has already + * removed us from the addfd queue, and + * wait_for_completion_interruptible has a memory barrier upon + * success that lets us read this value directly without + * locking. + */ + ret = kaddfd.ret; + goto out; + } + + mutex_lock(&filter->notify_lock); + /* + * Even though we were woken up by a signal and not a successful + * completion, a completion may have happened in the mean time. + * + * We need to check again if the addfd request has been handled, + * and if not, we will remove it from the queue. + */ + if (list_empty(&kaddfd.list)) + ret = kaddfd.ret; + else + list_del(&kaddfd.list); + +out_unlock: mutex_unlock(&filter->notify_lock); +out: + fput(kaddfd.file); + return ret; } @@ -1181,13 +1402,22 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, struct seccomp_filter *filter = file->private_data; void __user *buf = (void __user *)arg; + /* Fixed-size ioctls */ switch (cmd) { case SECCOMP_IOCTL_NOTIF_RECV: return seccomp_notify_recv(filter, buf); case SECCOMP_IOCTL_NOTIF_SEND: return seccomp_notify_send(filter, buf); + case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); + } + + /* Extensible Argument ioctls */ +#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) + switch (EA_IOCTL(cmd)) { + case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD): + return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd)); default: return -EINVAL; } @@ -1200,7 +1430,7 @@ static __poll_t seccomp_notify_poll(struct file *file, __poll_t ret = 0; struct seccomp_knotif *cur; - poll_wait(file, &filter->notif->wqh, poll_tab); + poll_wait(file, &filter->wqh, poll_tab); if (mutex_lock_interruptible(&filter->notify_lock) < 0) return EPOLLERR; @@ -1216,6 +1446,9 @@ static __poll_t seccomp_notify_poll(struct file *file, mutex_unlock(&filter->notify_lock); + if (refcount_read(&filter->users) == 0) + ret |= EPOLLHUP; + return ret; } @@ -1244,7 +1477,6 @@ static struct file *init_listener(struct seccomp_filter *filter) sema_init(&filter->notif->request, 0); filter->notif->next_id = get_random_u64(); INIT_LIST_HEAD(&filter->notif->notifications); - init_waitqueue_head(&filter->notif->wqh); ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops, filter, O_RDWR); @@ -1822,7 +2054,7 @@ static int __init seccomp_sysctl_init(void) hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); if (!hdr) - pr_warn("seccomp: sysctl registration failed\n"); + pr_warn("sysctl registration failed\n"); else kmemleak_not_leak(hdr); diff --git a/kernel/softirq.c b/kernel/softirq.c index 5e9aaa648a74..bf88d7f62433 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -553,7 +553,10 @@ static void tasklet_action_common(struct softirq_action *a, if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) BUG(); - t->func(t->data); + if (t->use_callback) + t->callback(t); + else + t->func(t->data); tasklet_unlock(t); continue; } @@ -579,6 +582,18 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a) tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } +void tasklet_setup(struct tasklet_struct *t, + void (*callback)(struct tasklet_struct *)) +{ + t->next = NULL; + t->state = 0; + atomic_set(&t->count, 0); + t->callback = callback; + t->use_callback = true; + t->data = 0; +} +EXPORT_SYMBOL(tasklet_setup); + void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data) { @@ -586,6 +601,7 @@ void tasklet_init(struct tasklet_struct *t, t->state = 0; atomic_set(&t->count, 0); t->func = func; + t->use_callback = false; t->data = data; } EXPORT_SYMBOL(tasklet_init); diff --git a/kernel/stackleak.c b/kernel/stackleak.c index b193a59fc05b..a8fc9ae1d03d 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -104,19 +104,9 @@ asmlinkage void notrace stackleak_erase(void) } NOKPROBE_SYMBOL(stackleak_erase); -void __used notrace stackleak_track_stack(void) +void __used __no_caller_saved_registers notrace stackleak_track_stack(void) { - /* - * N.B. stackleak_erase() fills the kernel stack with the poison value, - * which has the register width. That code assumes that the value - * of 'lowest_stack' is aligned on the register width boundary. - * - * That is true for x86 and x86_64 because of the kernel stack - * alignment on these platforms (for details, see 'cc_stack_align' in - * arch/x86/Makefile). Take care of that when you port STACKLEAK to - * new platforms. - */ - unsigned long sp = (unsigned long)&sp; + unsigned long sp = current_stack_pointer; /* * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than @@ -125,6 +115,8 @@ void __used notrace stackleak_track_stack(void) */ BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH); + /* 'lowest_stack' should be aligned on the register width boundary */ + sp = ALIGN(sp, sizeof(unsigned long)); if (sp < current->lowest_stack && sp >= (unsigned long)task_stack_page(current) + sizeof(unsigned long)) { diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 2af66e449aa6..946f44a9e86a 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -233,10 +233,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) if (current->flags & PF_KTHREAD) return 0; - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); - set_fs(fs); + force_uaccess_end(fs); return c.len; } diff --git a/kernel/sys.c b/kernel/sys.c index 00a96746e28a..ca11af9d815d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2007,12 +2007,15 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.exe_fd != (u32)-1) { /* - * Make sure the caller has the rights to - * change /proc/pid/exe link: only local sys admin should - * be allowed to. + * Check if the current user is checkpoint/restore capable. + * At the time of this writing, it checks for CAP_SYS_ADMIN + * or CAP_CHECKPOINT_RESTORE. + * Note that a user with access to ptrace can masquerade an + * arbitrary program as any executable, even setuid ones. + * This may have implications in the tomoyo subsystem. */ - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) - return -EINVAL; + if (!checkpoint_restore_ns_capable(current_user_ns())) + return -EPERM; error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 3b69a560a7ac..4d59775ea79c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -364,7 +364,6 @@ COND_SYSCALL(socketcall); COND_SYSCALL_COMPAT(socketcall); /* compat syscalls for arm64, x86, ... */ -COND_SYSCALL_COMPAT(sysctl); COND_SYSCALL_COMPAT(fanotify_mark); /* x86 */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1b4d2dc270a5..287862f91717 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2671,7 +2671,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, @@ -2852,6 +2852,15 @@ static struct ctl_table vm_table[] = { .proc_handler = sysctl_compaction_handler, }, { + .procname = "compaction_proactiveness", + .data = &sysctl_compaction_proactiveness, + .maxlen = sizeof(sysctl_compaction_proactiveness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { .procname = "extfrag_threshold", .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c deleted file mode 100644 index 7d550cc76a3b..000000000000 --- a/kernel/sysctl_binary.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/stat.h> -#include <linux/sysctl.h> -#include "../fs/xfs/xfs_sysctl.h" -#include <linux/sunrpc/debug.h> -#include <linux/string.h> -#include <linux/syscalls.h> -#include <linux/namei.h> -#include <linux/mount.h> -#include <linux/fs.h> -#include <linux/nsproxy.h> -#include <linux/pid_namespace.h> -#include <linux/file.h> -#include <linux/ctype.h> -#include <linux/netdevice.h> -#include <linux/kernel.h> -#include <linux/uuid.h> -#include <linux/slab.h> -#include <linux/compat.h> - -static ssize_t binary_sysctl(const int *name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -static void deprecated_sysctl_warning(const int *name, int nlen) -{ - int i; - - /* - * CTL_KERN/KERN_VERSION is used by older glibc and cannot - * ever go away. - */ - if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) - return; - - if (printk_ratelimit()) { - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < nlen; i++) - printk(KERN_CONT "%d.", name[i]); - printk(KERN_CONT "\n"); - } - return; -} - -#define WARN_ONCE_HASH_BITS 8 -#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS) - -static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE); - -#define FNV32_OFFSET 2166136261U -#define FNV32_PRIME 0x01000193 - -/* - * Print each legacy sysctl (approximately) only once. - * To avoid making the tables non-const use a external - * hash-table instead. - * Worst case hash collision: 6, but very rarely. - * NOTE! We don't use the SMP-safe bit tests. We simply - * don't care enough. - */ -static void warn_on_bintable(const int *name, int nlen) -{ - int i; - u32 hash = FNV32_OFFSET; - - for (i = 0; i < nlen; i++) - hash = (hash ^ name[i]) * FNV32_PRIME; - hash %= WARN_ONCE_HASH_SIZE; - if (__test_and_set_bit(hash, warn_once_bitmap)) - return; - deprecated_sysctl_warning(name, nlen); -} - -static ssize_t do_sysctl(int __user *args_name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - int name[CTL_MAXNAME]; - int i; - - /* Check args->nlen. */ - if (nlen < 0 || nlen > CTL_MAXNAME) - return -ENOTDIR; - /* Read in the sysctl name for simplicity */ - for (i = 0; i < nlen; i++) - if (get_user(name[i], args_name + i)) - return -EFAULT; - - warn_on_bintable(name, nlen); - - return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); -} - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, - tmp.newval, tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - return result; -} - - -#ifdef CONFIG_COMPAT - -struct compat_sysctl_args { - compat_uptr_t name; - int nlen; - compat_uptr_t oldval; - compat_uptr_t oldlenp; - compat_uptr_t newval; - compat_size_t newlen; - compat_ulong_t __unused[4]; -}; - -COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args) -{ - struct compat_sysctl_args tmp; - compat_size_t __user *compat_oldlenp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - compat_oldlenp = compat_ptr(tmp.oldlenp); - if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) - return -EFAULT; - - result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, - compat_ptr(tmp.oldval), oldlen, - compat_ptr(tmp.newval), tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) - return -EFAULT; - - return result; -} - -#endif /* CONFIG_COMPAT */ diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fcc42353f125..a09b1d61df6a 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST config GENERIC_CMOS_UPDATE bool +# Select to handle posix CPU timers from task_work +# and not from the timer interrupt context +config HAVE_POSIX_CPU_TIMERS_TASK_WORK + bool + +config POSIX_CPU_TIMERS_TASK_WORK + bool + default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK + if GENERIC_CLOCKEVENTS menu "Timers subsystem" diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2ffb466af77e..ca223a89530a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -192,7 +192,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) * When a alarm timer fires, this runs through the timerqueue to * see which alarms expired, and runs those. If there are more alarm * timers queued for the future, we set the hrtimer to fire when - * when the next future alarm timer expires. + * the next future alarm timer expires. */ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d89da1c7e005..c4038511d5c9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .clock_base = { { .cpu_base = &migration_cpu_base, }, }, + .clock_base = { { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu) int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - cpu_base->clock_base[i].cpu_base = cpu_base; - timerqueue_init_head(&cpu_base->clock_base[i].active); + struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; + + clock_b->cpu_base = cpu_base; + seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); + timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 5d9fc22d836a..afc65e6be33e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,11 +280,16 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } +void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +{ + timens_set_vvar_page(tsk, ns); + vdso_join_timens(tsk, ns); +} + static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); - int err; if (!current_is_single_threaded()) return -EUSERS; @@ -293,12 +298,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; - timens_set_vvar_page(current, ns); - - err = vdso_join_timens(current, ns); - if (err) - return err; - get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -313,22 +312,17 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); - int err; /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) return 0; - timens_set_vvar_page(tsk, ns); - - err = vdso_join_timens(tsk, ns); - if (err) - return err; - get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; + timens_commit(tsk, ns); + return 0; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 165117996ea0..a71758e34e45 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { + static struct lock_class_key posix_cpu_timers_key; struct pid *pid; rcu_read_lock(); @@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; } + /* + * If posix timer expiry is handled in task work context then + * timer::it_lock can be taken without disabling interrupts as all + * other locking happens in task context. This requires a seperate + * lock class key otherwise regular posix timer expiry would record + * the lock class being taken in interrupt context and generate a + * false positive warning. + */ + if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK)) + lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key); + new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.pid = get_pid(pid); @@ -1080,43 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) return false; } +static void handle_posix_cpu_timers(struct task_struct *tsk); + +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK +static void posix_cpu_timers_work(struct callback_head *work) +{ + handle_posix_cpu_timers(current); +} + /* - * This is called from the timer interrupt handler. The irq handler has - * already updated our counts. We need to check if any timers fire now. - * Interrupts are disabled. + * Initialize posix CPU timers task work in init task. Out of line to + * keep the callback static and to avoid header recursion hell. */ -void run_posix_cpu_timers(void) +void __init posix_cputimers_init_work(void) { - struct task_struct *tsk = current; - struct k_itimer *timer, *next; - unsigned long flags; - LIST_HEAD(firing); + init_task_work(¤t->posix_cputimers_work.work, + posix_cpu_timers_work); +} - lockdep_assert_irqs_disabled(); +/* + * Note: All operations on tsk->posix_cputimer_work.scheduled happen either + * in hard interrupt context or in task context with interrupts + * disabled. Aside of that the writer/reader interaction is always in the + * context of the current task, which means they are strict per CPU. + */ +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return tsk->posix_cputimers_work.scheduled; +} - /* - * The fast path checks that there are no expired thread or thread - * group timers. If that's so, just return. - */ - if (!fastpath_timer_check(tsk)) +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled)) return; - lockdep_posixtimer_enter(); - if (!lock_task_sighand(tsk, &flags)) { - lockdep_posixtimer_exit(); - return; + /* Schedule task work to actually expire the timers */ + tsk->posix_cputimers_work.scheduled = true; + task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME); +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + bool ret = true; + + /* + * On !RT kernels interrupts are disabled while collecting expired + * timers, so no tick can happen and the fast path check can be + * reenabled without further checks. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + tsk->posix_cputimers_work.scheduled = false; + return true; } + /* - * Here we take off tsk->signal->cpu_timers[N] and - * tsk->cpu_timers[N] all the timers that are firing, and - * put them on the firing list. + * On RT enabled kernels ticks can happen while the expired timers + * are collected under sighand lock. But any tick which observes + * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath + * checks. So reenabling the tick work has do be done carefully: + * + * Disable interrupts and run the fast path check if jiffies have + * advanced since the collecting of expired timers started. If + * jiffies have not advanced or the fast path check did not find + * newly expired timers, reenable the fast path check in the timer + * interrupt. If there are newly expired timers, return false and + * let the collection loop repeat. */ - check_thread_timers(tsk, &firing); + local_irq_disable(); + if (start != jiffies && fastpath_timer_check(tsk)) + ret = false; + else + tsk->posix_cputimers_work.scheduled = false; + local_irq_enable(); + + return ret; +} +#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + lockdep_posixtimer_enter(); + handle_posix_cpu_timers(tsk); + lockdep_posixtimer_exit(); +} + +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return false; +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + return true; +} +#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ + +static void handle_posix_cpu_timers(struct task_struct *tsk) +{ + struct k_itimer *timer, *next; + unsigned long flags, start; + LIST_HEAD(firing); + + if (!lock_task_sighand(tsk, &flags)) + return; - check_process_timers(tsk, &firing); + do { + /* + * On RT locking sighand lock does not disable interrupts, + * so this needs to be careful vs. ticks. Store the current + * jiffies value. + */ + start = READ_ONCE(jiffies); + barrier(); + + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + + check_process_timers(tsk, &firing); + + /* + * The above timer checks have updated the exipry cache and + * because nothing can have queued or modified timers after + * sighand lock was taken above it is guaranteed to be + * consistent. So the next timer interrupt fastpath check + * will find valid data. + * + * If timer expiry runs in the timer interrupt context then + * the loop is not relevant as timers will be directly + * expired in interrupt context. The stub function below + * returns always true which allows the compiler to + * optimize the loop out. + * + * If timer expiry is deferred to task work context then + * the following rules apply: + * + * - On !RT kernels no tick can have happened on this CPU + * after sighand lock was acquired because interrupts are + * disabled. So reenabling task work before dropping + * sighand lock and reenabling interrupts is race free. + * + * - On RT kernels ticks might have happened but the tick + * work ignored posix CPU timer handling because the + * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work + * must be done very carefully including a check whether + * ticks have happened since the start of the timer + * expiry checks. posix_cpu_timers_enable_work() takes + * care of that and eventually lets the expiry checks + * run again. + */ + } while (!posix_cpu_timers_enable_work(tsk, start)); /* - * We must release these locks before taking any timer's lock. + * We must release sighand lock before taking any timer's lock. * There is a potential race with timer deletion here, as the * siglock now protects our private firing list. We have set * the firing flag in each timer, so that a deletion attempt @@ -1134,6 +1266,13 @@ void run_posix_cpu_timers(void) list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { int cpu_firing; + /* + * spin_lock() is sufficient here even independent of the + * expiry context. If expiry happens in hard interrupt + * context it's obvious. For task work context it's safe + * because all other operations on timer::it_lock happen in + * task context (syscall or exit). + */ spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; @@ -1147,7 +1286,34 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } - lockdep_posixtimer_exit(); +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(void) +{ + struct task_struct *tsk = current; + + lockdep_assert_irqs_disabled(); + + /* + * If the actual expiry is deferred to task work context and the + * work is already scheduled there is no point to do anything here. + */ + if (posix_cpu_timers_work_scheduled(tsk)) + return; + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + __run_posix_cpu_timers(tsk); } /* diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0deaf4b79fb4..1c03eec6ca9b 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -229,7 +229,7 @@ void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, - * make it the final one one. + * make it the final one. */ if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 63a632f9896c..4c47f388a83f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -39,18 +39,19 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; +DEFINE_RAW_SPINLOCK(timekeeper_lock); + /* * The most important data for readout fits into a single 64 byte * cache line. */ static struct { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; } tk_core ____cacheline_aligned = { - .seq = SEQCNT_ZERO(tk_core.seq), + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), }; -static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; /** @@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. */ struct tk_fast { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct tk_read_base base[2]; }; @@ -80,11 +81,13 @@ static struct clocksource dummy_clock = { }; static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; @@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the - * seqlock ensures we don't return a bad value while structures are updated, + * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if @@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) unsigned int seq; /* - * Since we're called holding a seqlock, the data may shift + * Since we're called holding a seqcount, the data may shift * under us while we're doing the calculation. This can cause * false positives, since we'd note a problem but throw the - * results away. So nest another seqlock here to atomically + * results away. So nest another seqcount here to atomically * grab the points we are checking with. */ do { @@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset - * protected with seqlocks. This has the following minor side effects: + * protected with seqcounts. This has the following minor side effects: * * (1) Its possible that a timestamp be taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset @@ -2001,7 +2004,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into - * into a shifted interval nanoseconds. Allows for O(log) accumulation + * a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index bcbb52db2256..4ca2787d1642 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TIMEKEEPING_INTERNAL_H #define _TIMEKEEPING_INTERNAL_H -/* - * timekeeping debug functions - */ + #include <linux/clocksource.h> +#include <linux/spinlock.h> #include <linux/time.h> +/* + * timekeeping debug functions + */ #ifdef CONFIG_DEBUG_FS extern void tk_debug_account_sleep_time(const struct timespec64 *t); #else @@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) } #endif +/* Semi public for serialization of non timekeeper VDSO updates. */ +extern raw_spinlock_t timekeeper_lock; + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 026ac01af9da..a16764b0116e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -157,7 +157,8 @@ EXPORT_SYMBOL(jiffies_64); /* * The time start value for each level to select the bucket at enqueue - * time. + * time. We start from the last possible delta of the previous level + * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) @@ -204,8 +205,8 @@ struct timer_base { unsigned long clk; unsigned long next_expiry; unsigned int cpu; + bool next_expiry_recalc; bool is_idle; - bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -488,35 +489,48 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) * Helper function to calculate the array index for a given expiry * time. */ -static inline unsigned calc_index(unsigned expires, unsigned lvl) +static inline unsigned calc_index(unsigned long expires, unsigned lvl, + unsigned long *bucket_expiry) { + + /* + * The timer wheel has to guarantee that a timer does not fire + * early. Early expiry can happen due to: + * - Timer is armed at the edge of a tick + * - Truncation of the expiry time in the outer wheel levels + * + * Round up with level granularity to prevent this. + */ expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } -static int calc_wheel_index(unsigned long expires, unsigned long clk) +static int calc_wheel_index(unsigned long expires, unsigned long clk, + unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { - idx = calc_index(expires, 0); + idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { - idx = calc_index(expires, 1); + idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { - idx = calc_index(expires, 2); + idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { - idx = calc_index(expires, 3); + idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { - idx = calc_index(expires, 4); + idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { - idx = calc_index(expires, 5); + idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { - idx = calc_index(expires, 6); + idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { - idx = calc_index(expires, 7); + idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; + *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the @@ -525,34 +539,11 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk) if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX; - idx = calc_index(expires, LVL_DEPTH - 1); + idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } -/* - * Enqueue the timer into the hash bucket, mark it pending in - * the bitmap and store the index in the timer flags. - */ -static void enqueue_timer(struct timer_base *base, struct timer_list *timer, - unsigned int idx) -{ - hlist_add_head(&timer->entry, base->vectors + idx); - __set_bit(idx, base->pending_map); - timer_set_idx(timer, idx); - - trace_timer_start(timer, timer->expires, timer->flags); -} - -static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer) -{ - unsigned int idx; - - idx = calc_wheel_index(timer->expires, base->clk); - enqueue_timer(base, timer, idx); -} - static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { @@ -574,34 +565,48 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) * timer is not deferrable. If the other CPU is on the way to idle * then it can't set base->is_idle as we hold the base lock: */ - if (!base->is_idle) - return; + if (base->is_idle) + wake_up_nohz_cpu(base->cpu); +} - /* Check whether this is the new first expiring timer: */ - if (time_after_eq(timer->expires, base->next_expiry)) - return; +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap, store the index in the timer flags then wake up + * the target CPU if needed. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx, unsigned long bucket_expiry) +{ + + hlist_add_head(&timer->entry, base->vectors + idx); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); + + trace_timer_start(timer, timer->expires, timer->flags); /* - * Set the next expiry time and kick the CPU so it can reevaluate the - * wheel: + * Check whether this is the new first expiring timer. The + * effective expiry time of the timer is required here + * (bucket_expiry) instead of timer->expires. */ - if (time_before(timer->expires, base->clk)) { + if (time_before(bucket_expiry, base->next_expiry)) { /* - * Prevent from forward_timer_base() moving the base->clk - * backward + * Set the next expiry time and kick the CPU so it + * can reevaluate the wheel: */ - base->next_expiry = base->clk; - } else { - base->next_expiry = timer->expires; + base->next_expiry = bucket_expiry; + base->next_expiry_recalc = false; + trigger_dyntick_cpu(base, timer); } - wake_up_nohz_cpu(base->cpu); } -static void -internal_add_timer(struct timer_base *base, struct timer_list *timer) +static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { - __internal_add_timer(base, timer); - trigger_dyntick_cpu(base, timer); + unsigned long bucket_expiry; + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); + enqueue_timer(base, timer, idx, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS @@ -834,8 +839,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, if (!timer_pending(timer)) return 0; - if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); + base->next_expiry_recalc = true; + } detach_timer(timer, clear_pending); return 1; @@ -885,20 +892,14 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { -#ifdef CONFIG_NO_HZ_COMMON - unsigned long jnow; + unsigned long jnow = READ_ONCE(jiffies); /* - * We only forward the base when we are idle or have just come out of - * idle (must_forward_clk logic), and have a delta between base clock - * and jiffies. In the common case, run_timers will take care of it. + * No need to forward if we are close enough below jiffies. + * Also while executing timers, base->clk is 1 offset ahead + * of jiffies to avoid endless requeuing to current jffies. */ - if (likely(!base->must_forward_clk)) - return; - - jnow = READ_ONCE(jiffies); - base->must_forward_clk = base->is_idle; - if ((long)(jnow - base->clk) < 2) + if ((long)(jnow - base->clk) < 1) return; /* @@ -912,7 +913,6 @@ static inline void forward_timer_base(struct timer_base *base) return; base->clk = base->next_expiry; } -#endif } @@ -960,9 +960,9 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { + unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; - unsigned long clk = 0, flags; int ret = 0; BUG_ON(!timer->function); @@ -1001,7 +1001,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } clk = base->clk; - idx = calc_wheel_index(expires, clk); + idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending @@ -1054,16 +1054,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only - * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise - * we need to (re)calculate the wheel index via - * internal_add_timer(). + * enqueue_timer() is required. Otherwise we need to (re)calculate + * the wheel index via internal_add_timer(). */ - if (idx != UINT_MAX && clk == base->clk) { - enqueue_timer(base, timer, idx); - trigger_dyntick_cpu(base, timer); - } else { + if (idx != UINT_MAX && clk == base->clk) + enqueue_timer(base, timer, idx, bucket_expiry); + else internal_add_timer(base, timer); - } out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); @@ -1466,10 +1463,10 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) } } -static int __collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) { - unsigned long clk = base->clk; + unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; @@ -1491,7 +1488,6 @@ static int __collect_expired_timers(struct timer_base *base, return levels; } -#ifdef CONFIG_NO_HZ_COMMON /* * Find the next pending bucket of a level. Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level @@ -1524,6 +1520,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; @@ -1531,6 +1528,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; + + /* + * If the next expiration happens before we reach + * the next level, no need to check further. + */ + if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) + break; } /* * Clock for the next level. If the current level clock lower @@ -1568,13 +1572,17 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ - adj = clk & LVL_CLK_MASK ? 1 : 0; + adj = lvl_clk ? 1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } + + base->next_expiry_recalc = false; + return next; } +#ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: @@ -1631,9 +1639,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) return expires; raw_spin_lock(&base->lock); - nextevt = __next_timer_interrupt(base); + if (base->next_expiry_recalc) + base->next_expiry = __next_timer_interrupt(base); + nextevt = base->next_expiry; is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); - base->next_expiry = nextevt; + /* * We have a fresh next event. Check whether we can forward the * base. We can only do that when @basej is past base->clk @@ -1659,10 +1669,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) * logic is only maintained for the BASE_STD base, deferrable * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) { - base->must_forward_clk = true; + if ((expires - basem) > TICK_NSEC) base->is_idle = true; - } } raw_spin_unlock(&base->lock); @@ -1686,42 +1694,6 @@ void timer_clear_idle(void) */ base->is_idle = false; } - -static int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - unsigned long now = READ_ONCE(jiffies); - - /* - * NOHZ optimization. After a long idle sleep we need to forward the - * base to current jiffies. Avoid a loop by searching the bitfield for - * the next expiring timer. - */ - if ((long)(now - base->clk) > 2) { - unsigned long next = __next_timer_interrupt(base); - - /* - * If the next timer is ahead of time forward to current - * jiffies, otherwise forward to the next expiry time: - */ - if (time_after(next, now)) { - /* - * The call site will increment base->clk and then - * terminate the expiry loop immediately. - */ - base->clk = now; - return 0; - } - base->clk = next; - } - return __collect_expired_timers(base, heads); -} -#else -static inline int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - return __collect_expired_timers(base, heads); -} #endif /* @@ -1761,32 +1733,23 @@ static inline void __run_timers(struct timer_base *base) struct hlist_head heads[LVL_DEPTH]; int levels; - if (!time_after_eq(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); - /* - * timer_base::must_forward_clk must be cleared before running - * timers so that any timer functions that call mod_timer() will - * not try to forward the base. Idle tracking / clock forwarding - * logic is only used with BASE_STD timers. - * - * The must_forward_clk flag is cleared unconditionally also for - * the deferrable base. The deferrable base is not affected by idle - * tracking and never forwarded, so clearing the flag is a NOOP. - * - * The fact that the deferrable base is never forwarded can cause - * large variations in granularity for deferrable timers, but they - * can be deferred for long periods due to idle anyway. - */ - base->must_forward_clk = false; - - while (time_after_eq(jiffies, base->clk)) { - + while (time_after_eq(jiffies, base->clk) && + time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); + /* + * The only possible reason for not finding any expired + * timer at this clk is that all matching timers have been + * dequeued. + */ + WARN_ON_ONCE(!levels && !base->next_expiry_recalc); base->clk++; + base->next_expiry = __next_timer_interrupt(base); while (levels--) expire_timers(base, heads + levels); @@ -1816,12 +1779,12 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ - if (time_before(jiffies, base->clk)) { + if (time_before(jiffies, base->next_expiry)) { if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; - if (time_before(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; } raise_softirq(TIMER_SOFTIRQ); @@ -1986,7 +1949,6 @@ int timers_prepare_cpu(unsigned int cpu) base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; base->is_idle = false; - base->must_forward_clk = true; } return 0; } @@ -2039,6 +2001,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; timer_base_init_expiry_lock(base); } } @@ -2054,6 +2017,7 @@ static void __init init_timer_cpus(void) void __init init_timers(void) { init_timer_cpus(); + posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 54ce6eb2ca36..88e6b8ed6ca5 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -13,6 +13,8 @@ #include <vdso/helpers.h> #include <vdso/vsyscall.h> +#include "timekeeping_internal.h" + static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { @@ -127,3 +129,42 @@ void update_vsyscall_tz(void) __arch_sync_vdso_data(vdata); } + +/** + * vdso_update_begin - Start of a VDSO update section + * + * Allows architecture code to safely update the architecture specific VDSO + * data. Disables interrupts, acquires timekeeper lock to serialize against + * concurrent updates from timekeeping and invalidates the VDSO data + * sequence counter to prevent concurrent readers from accessing + * inconsistent data. + * + * Returns: Saved interrupt flags which need to be handed in to + * vdso_update_end(). + */ +unsigned long vdso_update_begin(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + vdso_write_begin(vdata); + return flags; +} + +/** + * vdso_update_end - End of a VDSO update section + * @flags: Interrupt flags as returned from vdso_update_begin() + * + * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data + * synchronization if the architecture requires it, drops timekeeper lock + * and restores interrupt flags. + */ +void vdso_update_end(unsigned long flags) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + vdso_write_end(vdata); + __arch_sync_vdso_data(vdata); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6575bb0a0434..e153be351548 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -2,9 +2,9 @@ # Do not instrument the tracer itself: +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + ifdef CONFIG_FUNCTION_TRACER -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) # Avoid recursion due to instrumentation. KCSAN_SANITIZE := n @@ -31,6 +31,8 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE GCOV_PROFILE := y endif +CFLAGS_bpf_trace.o := -I$(src) + CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7bc3d6175868..a8d4f253ed77 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -11,14 +11,19 @@ #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/kprobes.h> +#include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/error-injection.h> +#include <linux/btf_ids.h> #include <asm/tlb.h> #include "trace_probe.h" #include "trace.h" +#define CREATE_TRACE_POINTS +#include "bpf_trace.h" + #define bpf_event_rcu_dereference(p) \ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) @@ -374,9 +379,33 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, } } +static DEFINE_RAW_SPINLOCK(trace_printk_lock); + +#define BPF_TRACE_PRINTK_SIZE 1024 + +static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) +{ + static char buf[BPF_TRACE_PRINTK_SIZE]; + unsigned long flags; + va_list ap; + int ret; + + raw_spin_lock_irqsave(&trace_printk_lock, flags); + va_start(ap, fmt); + ret = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + /* vsnprintf() will not append null for zero-length strings */ + if (ret == 0) + buf[0] = '\0'; + trace_bpf_trace_printk(buf); + raw_spin_unlock_irqrestore(&trace_printk_lock, flags); + + return ret; +} + /* * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s */ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) @@ -420,6 +449,11 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, goto fmt_str; } + if (fmt[i + 1] == 'B') { + i++; + goto fmt_next; + } + /* disallow any further format extensions */ if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && @@ -478,8 +512,7 @@ fmt_next: */ #define __BPF_TP_EMIT() __BPF_ARG3_TP() #define __BPF_TP(...) \ - __trace_printk(0 /* Fake ip */, \ - fmt, ##__VA_ARGS__) + bpf_do_trace_printk(fmt, ##__VA_ARGS__) #define __BPF_ARG1_TP(...) \ ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ @@ -516,10 +549,15 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { /* - * this program might be calling bpf_trace_printk, - * so allocate per-cpu printk buffers + * This program might be calling bpf_trace_printk, + * so enable the associated bpf_trace/bpf_trace_printk event. + * Repeat this each time as it is possible a user has + * disabled bpf_trace_printk events. By loading a program + * calling bpf_trace_printk() however the user has expressed + * the intent to see such events. */ - trace_printk_init_buffers(); + if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) + pr_warn_ratelimited("could not enable bpf_trace_printk events"); return &bpf_trace_printk_proto; } @@ -636,7 +674,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, if (fmt[i] == 'p') { if (fmt[i + 1] == 0 || fmt[i + 1] == 'K' || - fmt[i + 1] == 'x') { + fmt[i + 1] == 'x' || + fmt[i + 1] == 'B') { /* just kernel pointers */ params[fmt_cnt] = args[fmt_cnt]; fmt_cnt++; @@ -681,7 +720,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, } if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x') { + fmt[i] != 'u' && fmt[i] != 'x' && + fmt[i] != 'X') { err = -EINVAL; goto out; } @@ -703,7 +743,9 @@ out: return err; } -static int bpf_seq_printf_btf_ids[5]; +BTF_ID_LIST(bpf_seq_printf_btf_ids) +BTF_ID(struct, seq_file) + static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, @@ -721,7 +763,9 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) return seq_write(m, data, len) ? -EOVERFLOW : 0; } -static int bpf_seq_write_btf_ids[5]; +BTF_ID_LIST(bpf_seq_write_btf_ids) +BTF_ID(struct, seq_file) + static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, @@ -1134,6 +1178,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + case BPF_FUNC_get_task_stack: + return &bpf_get_task_stack_proto; default: return NULL; } @@ -1363,9 +1411,9 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: - return &bpf_get_stackid_proto_tp; + return &bpf_get_stackid_proto_pe; case BPF_FUNC_get_stack: - return &bpf_get_stack_proto_tp; + return &bpf_get_stack_proto_pe; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: @@ -1512,6 +1560,16 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_output_proto; case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; + case BPF_FUNC_skc_to_tcp6_sock: + return &bpf_skc_to_tcp6_sock_proto; + case BPF_FUNC_skc_to_tcp_sock: + return &bpf_skc_to_tcp_sock_proto; + case BPF_FUNC_skc_to_tcp_timewait_sock: + return &bpf_skc_to_tcp_timewait_sock_proto; + case BPF_FUNC_skc_to_tcp_request_sock: + return &bpf_skc_to_tcp_request_sock_proto; + case BPF_FUNC_skc_to_udp6_sock: + return &bpf_skc_to_udp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h new file mode 100644 index 000000000000..9acbc11ac7bb --- /dev/null +++ b/kernel/trace/bpf_trace.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_trace + +#if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_BPF_TRACE_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(bpf_trace_printk, + + TP_PROTO(const char *bpf_string), + + TP_ARGS(bpf_string), + + TP_STRUCT__entry( + __string(bpf_string, bpf_string) + ), + + TP_fast_assign( + __assign_str(bpf_string, bpf_string); + ), + + TP_printk("%s", __get_str(bpf_string)) +); + +#endif /* _TRACE_BPF_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE bpf_trace + +#include <trace/define_trace.h> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 72064541bef2..275441254bb5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -139,9 +139,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) #endif } -#define FTRACE_PID_IGNORE -1 -#define FTRACE_PID_TRACE -2 - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { @@ -2388,6 +2385,14 @@ struct ftrace_ops direct_ops = { .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, + /* + * By declaring the main trampoline as this trampoline + * it will never have one allocated for it. Allocated + * trampolines should not call direct functions. + * The direct_ops should only be called by the builtin + * ftrace_regs_caller trampoline. + */ + .trampoline = FTRACE_REGS_ADDR, }; #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ @@ -6255,8 +6260,19 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) - cnt++; + if (ops_references_rec(ops, rec)) { + if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + continue; + cnt++; + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; + if (cnt == 1 && ops->trampoline) + rec->flags |= FTRACE_FL_TRAMP; + else + rec->flags &= ~FTRACE_FL_TRAMP; + } } return cnt; @@ -6435,8 +6451,8 @@ void ftrace_module_enable(struct module *mod) if (ftrace_start_up) cnt += referenced_filters(rec); - /* This clears FTRACE_FL_DISABLED */ - rec->flags = cnt; + rec->flags &= ~FTRACE_FL_DISABLED; + rec->flags += cnt; if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(rec, 1); @@ -7066,12 +7082,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) if (enable) { register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, tr); - register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit, tr); } else { unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, tr); - unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit, tr); } } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 00867ff82412..93ef0ab6ea20 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -270,6 +270,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ for_each_cpu(cpu, buffer->cpumask) +#define for_each_online_buffer_cpu(buffer, cpu) \ + for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) + #define TS_SHIFT 27 #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) @@ -413,12 +416,27 @@ struct rb_irq_work { struct rb_event_info { u64 ts; u64 delta; + u64 before; + u64 after; unsigned long length; struct buffer_page *tail_page; int add_timestamp; }; /* + * Used for the add_timestamp + * NONE + * EXTEND - wants a time extend + * ABSOLUTE - the buffer requests all events to have absolute time stamps + * FORCE - force a full time stamp. + */ +enum { + RB_ADD_STAMP_NONE = 0, + RB_ADD_STAMP_EXTEND = BIT(1), + RB_ADD_STAMP_ABSOLUTE = BIT(2), + RB_ADD_STAMP_FORCE = BIT(3) +}; +/* * Used for which event context the event is in. * NMI = 0 * IRQ = 1 @@ -435,6 +453,28 @@ enum { RB_CTX_MAX }; +#if BITS_PER_LONG == 32 +#define RB_TIME_32 +#endif + +/* To test on 64 bit machines */ +//#define RB_TIME_32 + +#ifdef RB_TIME_32 + +struct rb_time_struct { + local_t cnt; + local_t top; + local_t bottom; +}; +#else +#include <asm/local64.h> +struct rb_time_struct { + local64_t time; +}; +#endif +typedef struct rb_time_struct rb_time_t; + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -470,7 +510,8 @@ struct ring_buffer_per_cpu { size_t shortest_full; unsigned long read; unsigned long read_bytes; - u64 write_stamp; + rb_time_t write_stamp; + rb_time_t before_stamp; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -513,6 +554,189 @@ struct ring_buffer_iter { int missed_events; }; +#ifdef RB_TIME_32 + +/* + * On 32 bit machines, local64_t is very expensive. As the ring + * buffer doesn't need all the features of a true 64 bit atomic, + * on 32 bit, it uses these functions (64 still uses local64_t). + * + * For the ring buffer, 64 bit required operations for the time is + * the following: + * + * - Only need 59 bits (uses 60 to make it even). + * - Reads may fail if it interrupted a modification of the time stamp. + * It will succeed if it did not interrupt another write even if + * the read itself is interrupted by a write. + * It returns whether it was successful or not. + * + * - Writes always succeed and will overwrite other writes and writes + * that were done by events interrupting the current write. + * + * - A write followed by a read of the same time stamp will always succeed, + * but may not contain the same value. + * + * - A cmpxchg will fail if it interrupted another write or cmpxchg. + * Other than that, it acts like a normal cmpxchg. + * + * The 60 bit time stamp is broken up by 30 bits in a top and bottom half + * (bottom being the least significant 30 bits of the 60 bit time stamp). + * + * The two most significant bits of each half holds a 2 bit counter (0-3). + * Each update will increment this counter by one. + * When reading the top and bottom, if the two counter bits match then the + * top and bottom together make a valid 60 bit number. + */ +#define RB_TIME_SHIFT 30 +#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) + +static inline int rb_time_cnt(unsigned long val) +{ + return (val >> RB_TIME_SHIFT) & 3; +} + +static inline u64 rb_time_val(unsigned long top, unsigned long bottom) +{ + u64 val; + + val = top & RB_TIME_VAL_MASK; + val <<= RB_TIME_SHIFT; + val |= bottom & RB_TIME_VAL_MASK; + + return val; +} + +static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) +{ + unsigned long top, bottom; + unsigned long c; + + /* + * If the read is interrupted by a write, then the cnt will + * be different. Loop until both top and bottom have been read + * without interruption. + */ + do { + c = local_read(&t->cnt); + top = local_read(&t->top); + bottom = local_read(&t->bottom); + } while (c != local_read(&t->cnt)); + + *cnt = rb_time_cnt(top); + + /* If top and bottom counts don't match, this interrupted a write */ + if (*cnt != rb_time_cnt(bottom)) + return false; + + *ret = rb_time_val(top, bottom); + return true; +} + +static bool rb_time_read(rb_time_t *t, u64 *ret) +{ + unsigned long cnt; + + return __rb_time_read(t, ret, &cnt); +} + +static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) +{ + return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); +} + +static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom) +{ + *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); + *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); +} + +static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) +{ + val = rb_time_val_cnt(val, cnt); + local_set(t, val); +} + +static void rb_time_set(rb_time_t *t, u64 val) +{ + unsigned long cnt, top, bottom; + + rb_time_split(val, &top, &bottom); + + /* Writes always succeed with a valid number even if it gets interrupted. */ + do { + cnt = local_inc_return(&t->cnt); + rb_time_val_set(&t->top, top, cnt); + rb_time_val_set(&t->bottom, bottom, cnt); + } while (cnt != local_read(&t->cnt)); +} + +static inline bool +rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) +{ + unsigned long ret; + + ret = local_cmpxchg(l, expect, set); + return ret == expect; +} + +static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +{ + unsigned long cnt, top, bottom; + unsigned long cnt2, top2, bottom2; + u64 val; + + /* The cmpxchg always fails if it interrupted an update */ + if (!__rb_time_read(t, &val, &cnt2)) + return false; + + if (val != expect) + return false; + + cnt = local_read(&t->cnt); + if ((cnt & 3) != cnt2) + return false; + + cnt2 = cnt + 1; + + rb_time_split(val, &top, &bottom); + top = rb_time_val_cnt(top, cnt); + bottom = rb_time_val_cnt(bottom, cnt); + + rb_time_split(set, &top2, &bottom2); + top2 = rb_time_val_cnt(top2, cnt2); + bottom2 = rb_time_val_cnt(bottom2, cnt2); + + if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) + return false; + if (!rb_time_read_cmpxchg(&t->top, top, top2)) + return false; + if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) + return false; + return true; +} + +#else /* 64 bits */ + +/* local64_t always succeeds */ + +static inline bool rb_time_read(rb_time_t *t, u64 *ret) +{ + *ret = local64_read(&t->time); + return true; +} +static void rb_time_set(rb_time_t *t, u64 val) +{ + local64_set(&t->time, val); +} + +static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +{ + u64 val; + val = local64_cmpxchg(&t->time, expect, set); + return val == expect; +} +#endif + /** * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from @@ -577,7 +801,7 @@ static void rb_wake_up_waiters(struct irq_work *work) */ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) { - struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); + struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); struct rb_irq_work *work; int ret = 0; @@ -746,8 +970,16 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, static inline u64 rb_time_stamp(struct trace_buffer *buffer) { + u64 ts; + + /* Skip retpolines :-( */ + if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local)) + ts = trace_clock_local(); + else + ts = buffer->clock(); + /* shift to debug/test normalization and TIME_EXTENTS */ - return buffer->clock() << DEBUG_SHIFT; + return ts << DEBUG_SHIFT; } u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu) @@ -2372,8 +2604,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * +/* Slow path */ +static struct ring_buffer_event * rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) @@ -2397,6 +2629,66 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event); +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static void +rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + u64 write_stamp; + + WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)info->before, + (unsigned long long)info->after, + (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n" + "or add trace_clock=global to the kernel command line\n"); +} + +static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event **event, + struct rb_event_info *info, + u64 *delta, + unsigned int *length) +{ + bool abs = info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); + + if (unlikely(info->delta > (1ULL << 59))) { + /* did the clock go backwards */ + if (info->before == info->after && info->before > info->ts) { + /* not interrupted */ + static int once; + + /* + * This is possible with a recalibrating of the TSC. + * Do not produce a call stack, but just report it. + */ + if (!once) { + once++; + pr_warn("Ring buffer clock went backwards: %llu -> %llu\n", + info->before, info->ts); + } + } else + rb_check_timestamp(cpu_buffer, info); + if (!abs) + info->delta = 0; + } + *event = rb_add_time_stamp(*event, info->delta, abs); + *length -= RB_LEN_TIME_EXTEND; + *delta = 0; +} + /** * rb_update_event - update event type and data * @cpu_buffer: The per cpu buffer of the @event @@ -2416,21 +2708,12 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned length = info->length; u64 delta = info->delta; - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - /* * If we need to add a timestamp, then we * add it to the start of the reserved space. */ - if (unlikely(info->add_timestamp)) { - bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); - - event = rb_add_time_stamp(event, abs ? info->delta : delta, abs); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } + if (unlikely(info->add_timestamp)) + rb_add_timestamp(cpu_buffer, &event, info, &delta, &length); event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; @@ -2473,12 +2756,38 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) +static __always_inline bool +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) { - return true; + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static u64 rb_time_delta(struct ring_buffer_event *event) +{ + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + return 0; + + case RINGBUF_TYPE_TIME_EXTEND: + return ring_buffer_event_time_stamp(event); + + case RINGBUF_TYPE_TIME_STAMP: + return 0; + + case RINGBUF_TYPE_DATA: + return event->time_delta; + default: + return 0; + } } -#endif static inline int rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, @@ -2488,6 +2797,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage; unsigned long index; unsigned long addr; + u64 write_stamp; + u64 delta; new_index = rb_event_index(event); old_index = new_index + rb_event_ts_length(event); @@ -2496,10 +2807,32 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, bpage = READ_ONCE(cpu_buffer->tail_page); + delta = rb_time_delta(event); + + if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp)) + return 0; + + /* Make sure the write stamp is read before testing the location */ + barrier(); + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); + + /* Something came in, can't discard */ + if (!rb_time_cmpxchg(&cpu_buffer->write_stamp, + write_stamp, write_stamp - delta)) + return 0; + + /* + * If an event were to come in now, it would see that the + * write_stamp and the before_stamp are different, and assume + * that this event just added itself before updating + * the write stamp. The interrupting event will fix the + * write stamp for us, and use the before stamp as its delta. + */ + /* * This is on the tail page. It is possible that * a write could come in and move the tail page @@ -2551,10 +2884,6 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - /* Only update the write stamp if the page has an event */ - if (rb_page_write(cpu_buffer->commit_page)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -2626,54 +2955,10 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static __always_inline bool -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static __always_inline void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. - */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = ring_buffer_event_time_stamp(event); - cpu_buffer->write_stamp += delta; - } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { - delta = ring_buffer_event_time_stamp(event); - cpu_buffer->write_stamp = delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); rb_end_commit(cpu_buffer); } @@ -2864,58 +3149,138 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); -static noinline void -rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, - struct rb_event_info *info) -{ - WARN_ONCE(info->delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)info->delta, - (unsigned long long)info->ts, - (unsigned long long)cpu_buffer->write_stamp, - sched_clock_stable() ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n" - "or add trace_clock=global to the kernel command line\n"); - info->add_timestamp = 1; -} - static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { struct ring_buffer_event *event; struct buffer_page *tail_page; - unsigned long tail, write; - - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(info->add_timestamp)) - info->length += RB_LEN_TIME_EXTEND; + unsigned long tail, write, w; + bool a_ok; + bool b_ok; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); - write = local_add_return(info->length, &tail_page->write); + + /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; + barrier(); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + barrier(); + info->ts = rb_time_stamp(cpu_buffer->buffer); + + if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) { + info->delta = info->ts; + } else { + /* + * If interrupting an event time update, we may need an + * absolute timestamp. + * Don't bother if this is the start of a new page (w == 0). + */ + if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) { + info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; + info->length += RB_LEN_TIME_EXTEND; + } else { + info->delta = info->ts - info->after; + if (unlikely(test_time_stamp(info->delta))) { + info->add_timestamp |= RB_ADD_STAMP_EXTEND; + info->length += RB_LEN_TIME_EXTEND; + } + } + } + + /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); + + /*C*/ write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; + tail = write - info->length; + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) { + if (tail != w) { + /* before and after may now different, fix it up*/ + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + if (a_ok && b_ok && info->before != info->after) + (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, + info->before, info->after); + } + return rb_move_tail(cpu_buffer, tail, info); + } + + if (likely(tail == w)) { + u64 save_before; + bool s_ok; + + /* Nothing interrupted us between A and C */ + /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); + barrier(); + /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before); + RB_WARN_ON(cpu_buffer, !s_ok); + if (likely(!(info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) + /* This did not interrupt any time update */ + info->delta = info->ts - info->after; + else + /* Just use full timestamp for inerrupting event */ + info->delta = info->ts; + barrier(); + if (unlikely(info->ts != save_before)) { + /* SLOW PATH - Interrupted between C and E */ + + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + RB_WARN_ON(cpu_buffer, !a_ok); + + /* Write stamp must only go forward */ + if (save_before > info->after) { + /* + * We do not care about the result, only that + * it gets updated atomically. + */ + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, + info->after, save_before); + } + } + } else { + u64 ts; + /* SLOW PATH - Interrupted between A and C */ + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + /* Was interrupted before here, write_stamp must be valid */ + RB_WARN_ON(cpu_buffer, !a_ok); + ts = rb_time_stamp(cpu_buffer->buffer); + barrier(); + /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && + info->after < ts) { + /* Nothing came after this event between C and E */ + info->delta = ts - info->after; + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, + info->after, info->ts); + info->ts = ts; + } else { + /* + * Interrupted beween C and E: + * Lost the previous events time stamp. Just set the + * delta to zero, and this will be the same time as + * the event this event interrupted. And the events that + * came after this will still be correct (as they would + * have built their delta on the previous event. + */ + info->delta = 0; + } + info->add_timestamp &= ~RB_ADD_STAMP_FORCE; + } + /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ - if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer)) + if (unlikely(!tail && !(info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) info->delta = 0; - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, tail, info); - /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); @@ -2927,7 +3292,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * If this is the first commit on the page, then update * its timestamp. */ - if (!tail) + if (unlikely(!tail)) tail_page->page->time_stamp = info->ts; /* account for these added bytes */ @@ -2944,9 +3309,10 @@ rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; - u64 diff; + int add_ts_default; rb_start_commit(cpu_buffer); + /* The commit page can not change after this */ #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /* @@ -2964,8 +3330,16 @@ rb_reserve_next_event(struct trace_buffer *buffer, #endif info.length = rb_calculate_event_length(length); + + if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { + add_ts_default = RB_ADD_STAMP_ABSOLUTE; + info.length += RB_LEN_TIME_EXTEND; + } else { + add_ts_default = RB_ADD_STAMP_NONE; + } + again: - info.add_timestamp = 0; + info.add_timestamp = add_ts_default; info.delta = 0; /* @@ -2980,35 +3354,16 @@ rb_reserve_next_event(struct trace_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - info.ts = rb_time_stamp(cpu_buffer->buffer); - diff = info.ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - if (ring_buffer_time_stamp_abs(buffer)) { - info.delta = info.ts; - rb_handle_timestamp(cpu_buffer, &info); - } else /* Did the write stamp get updated already? */ - if (likely(info.ts >= cpu_buffer->write_stamp)) { - info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) - rb_handle_timestamp(cpu_buffer, &info); - } - event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) { - if (info.add_timestamp) + if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND)) info.length -= RB_LEN_TIME_EXTEND; goto again; } - if (!event) - goto out_fail; - - return event; - + if (likely(event)) + return event; out_fail: rb_end_commit(cpu_buffer); return NULL; @@ -3154,11 +3509,6 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, if (rb_try_to_discard(cpu_buffer, event)) goto out; - /* - * The commit is still visible by the reader, so we - * must still update the timestamp. - */ - rb_update_write_stamp(cpu_buffer, event); out: rb_end_commit(cpu_buffer); @@ -4475,8 +4825,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; - cpu_buffer->write_stamp = 0; - cpu_buffer->read_stamp = 0; + rb_time_set(&cpu_buffer->write_stamp, 0); + rb_time_set(&cpu_buffer->before_stamp, 0); cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; @@ -4484,6 +4834,26 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) rb_head_page_activate(cpu_buffer); } +/* Must have disabled the cpu buffer then done a synchronize_rcu */ +static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) + goto out; + + arch_spin_lock(&cpu_buffer->lock); + + rb_reset_cpu(cpu_buffer); + + arch_spin_unlock(&cpu_buffer->lock); + + out: + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +} + /** * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of @@ -4492,7 +4862,6 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -4503,24 +4872,42 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) /* Make sure all commits have finished */ synchronize_rcu(); - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + reset_disabled_cpu_buffer(cpu_buffer); - if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); - arch_spin_lock(&cpu_buffer->lock); +/** + * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * @buffer: The ring buffer to reset a per cpu buffer of + * @cpu: The CPU buffer to be reset + */ +void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; - rb_reset_cpu(cpu_buffer); + for_each_online_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; - arch_spin_unlock(&cpu_buffer->lock); + atomic_inc(&cpu_buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + } - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + /* Make sure all commits have finished */ + synchronize_rcu(); - atomic_dec(&cpu_buffer->record_disabled); - atomic_dec(&cpu_buffer->resize_disabled); + for_each_online_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + reset_disabled_cpu_buffer(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); + } } -EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** * ring_buffer_reset - reset a ring buffer @@ -4528,10 +4915,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); */ void ring_buffer_reset(struct trace_buffer *buffer) { + struct ring_buffer_per_cpu *cpu_buffer; int cpu; - for_each_buffer_cpu(buffer, cpu) - ring_buffer_reset_cpu(buffer, cpu); + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + atomic_inc(&cpu_buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + } + + /* Make sure all commits have finished */ + synchronize_rcu(); + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + reset_disabled_cpu_buffer(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); + } } EXPORT_SYMBOL_GPL(ring_buffer_reset); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 8df0aa810950..78e576575b79 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -45,8 +45,8 @@ MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); static int producer_nice = MAX_NICE; static int consumer_nice = MAX_NICE; -static int producer_fifo = -1; -static int consumer_fifo = -1; +static int producer_fifo; +static int consumer_fifo; module_param(producer_nice, int, 0644); MODULE_PARM_DESC(producer_nice, "nice prio for producer"); @@ -55,10 +55,10 @@ module_param(consumer_nice, int, 0644); MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); module_param(producer_fifo, int, 0644); -MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); +MODULE_PARM_DESC(producer_fifo, "use fifo for producer: 0 - disabled, 1 - low prio, 2 - fifo"); module_param(consumer_fifo, int, 0644); -MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); +MODULE_PARM_DESC(consumer_fifo, "use fifo for consumer: 0 - disabled, 1 - low prio, 2 - fifo"); static int read_events; @@ -303,22 +303,22 @@ static void ring_buffer_producer(void) trace_printk("ERROR!\n"); if (!disable_reader) { - if (consumer_fifo < 0) + if (consumer_fifo) + trace_printk("Running Consumer at SCHED_FIFO %s\n", + consumer_fifo == 1 ? "low" : "high"); + else trace_printk("Running Consumer at nice: %d\n", consumer_nice); - else - trace_printk("Running Consumer at SCHED_FIFO %d\n", - consumer_fifo); } - if (producer_fifo < 0) + if (producer_fifo) + trace_printk("Running Producer at SCHED_FIFO %s\n", + producer_fifo == 1 ? "low" : "high"); + else trace_printk("Running Producer at nice: %d\n", producer_nice); - else - trace_printk("Running Producer at SCHED_FIFO %d\n", - producer_fifo); /* Let the user know that the test is running at low priority */ - if (producer_fifo < 0 && consumer_fifo < 0 && + if (!producer_fifo && !consumer_fifo && producer_nice == MAX_NICE && consumer_nice == MAX_NICE) trace_printk("WARNING!!! This test is running at lowest priority.\n"); @@ -455,21 +455,19 @@ static int __init ring_buffer_benchmark_init(void) * Run them as low-prio background tasks by default: */ if (!disable_reader) { - if (consumer_fifo >= 0) { - struct sched_param param = { - .sched_priority = consumer_fifo - }; - sched_setscheduler(consumer, SCHED_FIFO, ¶m); - } else + if (consumer_fifo >= 2) + sched_set_fifo(consumer); + else if (consumer_fifo == 1) + sched_set_fifo_low(consumer); + else set_user_nice(consumer, consumer_nice); } - if (producer_fifo >= 0) { - struct sched_param param = { - .sched_priority = producer_fifo - }; - sched_setscheduler(producer, SCHED_FIFO, ¶m); - } else + if (producer_fifo >= 2) + sched_set_fifo(producer); + else if (producer_fifo == 1) + sched_set_fifo_low(producer); + else set_user_nice(producer, producer_nice); return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62269724d5..f40d850ebabc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1543,8 +1543,7 @@ static void latency_fsnotify_workfn(struct work_struct *work) { struct trace_array *tr = container_of(work, struct trace_array, fsnotify_work); - fsnotify(tr->d_max_latency->d_inode, FS_MODIFY, - tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); } static void latency_fsnotify_workfn_irq(struct irq_work *iwork) @@ -2003,7 +2002,6 @@ static void tracing_reset_cpu(struct array_buffer *buf, int cpu) void tracing_reset_online_cpus(struct array_buffer *buf) { struct trace_buffer *buffer = buf->buffer; - int cpu; if (!buffer) return; @@ -2015,8 +2013,7 @@ void tracing_reset_online_cpus(struct array_buffer *buf) buf->time_start = buffer_ftrace_now(buf, buf->cpu); - for_each_online_cpu(cpu) - ring_buffer_reset_cpu(buffer, cpu); + ring_buffer_reset_online_cpus(buffer); ring_buffer_record_enable(buffer); } @@ -2932,12 +2929,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, skip++; #endif - /* - * Since events can happen in NMIs there's no safe way to - * use the per cpu ftrace_stacks. We reserve it and if an interrupt - * or NMI comes in, it will just have to use the default - * FTRACE_STACK_SIZE. - */ preempt_disable_notrace(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; @@ -3137,6 +3128,9 @@ static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct *buffers; + if (trace_percpu_buffer) + return 0; + buffers = alloc_percpu(struct trace_buffer_struct); if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) return -ENOMEM; @@ -3339,6 +3333,26 @@ int trace_array_vprintk(struct trace_array *tr, return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } +/** + * trace_array_printk - Print a message to a specific instance + * @tr: The instance trace_array descriptor + * @ip: The instruction pointer that this is called from. + * @fmt: The format to print (printf format) + * + * If a subsystem sets up its own instance, they have the right to + * printk strings into their tracing instance buffer using this + * function. Note, this function will not write into the top level + * buffer (use trace_printk() for that), as writing into the top level + * buffer should only have events that can be individually disabled. + * trace_printk() is only used for debugging a kernel, and should not + * be ever encorporated in normal use. + * + * trace_array_printk() can be used, as it will not add noise to the + * top level tracing buffer. + * + * Note, trace_array_init_printk() must be called on @tr before this + * can be used. + */ __printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) @@ -3346,12 +3360,16 @@ int trace_array_printk(struct trace_array *tr, int ret; va_list ap; - if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) - return 0; - if (!tr) return -ENOENT; + /* This is only allowed for created instances */ + if (tr == &global_trace) + return 0; + + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + return 0; + va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); @@ -3359,6 +3377,27 @@ int trace_array_printk(struct trace_array *tr, } EXPORT_SYMBOL_GPL(trace_array_printk); +/** + * trace_array_init_printk - Initialize buffers for trace_array_printk() + * @tr: The trace array to initialize the buffers for + * + * As trace_array_printk() only writes into instances, they are OK to + * have in the kernel (unlike trace_printk()). This needs to be called + * before trace_array_printk() can be used on a trace_array. + */ +int trace_array_init_printk(struct trace_array *tr) +{ + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr == &global_trace) + return -EINVAL; + + return alloc_percpu_trace_buffer(); +} +EXPORT_SYMBOL_GPL(trace_array_init_printk); + __printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) @@ -5887,7 +5926,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) } /* If trace pipe files are being read, we can't change the tracer */ - if (tr->current_trace->ref) { + if (tr->trace_ref) { ret = -EBUSY; goto out; } @@ -6103,7 +6142,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) nonseekable_open(inode, filp); - tr->current_trace->ref++; + tr->trace_ref++; out: mutex_unlock(&trace_types_lock); return ret; @@ -6122,7 +6161,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - tr->current_trace->ref--; + tr->trace_ref--; if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -7406,7 +7445,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) if (ret) return ret; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = kvzalloc(sizeof(*info), GFP_KERNEL); if (!info) { trace_array_put(tr); return -ENOMEM; @@ -7424,7 +7463,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) filp->private_data = info; - tr->current_trace->ref++; + tr->trace_ref++; mutex_unlock(&trace_types_lock); @@ -7525,14 +7564,14 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - iter->tr->current_trace->ref--; + iter->tr->trace_ref--; __trace_array_put(iter->tr); if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); - kfree(info); + kvfree(info); mutex_unlock(&trace_types_lock); @@ -8733,7 +8772,7 @@ static int __remove_instance(struct trace_array *tr) int i; /* Reference counter for a newly created trace array = 1. */ - if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref)) + if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) return -EBUSY; list_del(&tr->list); @@ -8945,9 +8984,7 @@ struct dentry *tracing_init_dentry(void) if (tr->dir) return NULL; - if (WARN_ON(!tracefs_initialized()) || - (IS_ENABLED(CONFIG_DEBUG_FS) && - WARN_ON(!debugfs_initialized()))) + if (WARN_ON(!tracefs_initialized())) return ERR_PTR(-ENODEV); /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 13db4000af3f..610d21355526 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -356,6 +356,7 @@ struct trace_array { struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; + int trace_ref; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; @@ -547,7 +548,6 @@ struct tracer { struct tracer *next; struct tracer_flags *flags; int enabled; - int ref; bool print_max; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE @@ -1103,6 +1103,10 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER + +#define FTRACE_PID_IGNORE -1 +#define FTRACE_PID_TRACE -2 + struct ftrace_func_command { struct list_head list; char *name; @@ -1114,7 +1118,8 @@ struct ftrace_func_command { extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { - return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid); + return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) != + FTRACE_PID_IGNORE; } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f6f55682d3e2..a85effb2373b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -538,12 +538,12 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) if (enable) { register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, tr, INT_MIN); - register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit, + register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit, tr, INT_MAX); } else { unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, tr); - unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit, + unregister_trace_sched_process_free(event_filter_pid_sched_process_exit, tr); } } diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index e2be7bb7ef7e..17873e5d0353 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -283,6 +283,7 @@ static bool disable_migrate; static void move_to_next_cpu(void) { struct cpumask *current_mask = &save_cpumask; + struct trace_array *tr = hwlat_trace; int next_cpu; if (disable_migrate) @@ -296,7 +297,7 @@ static void move_to_next_cpu(void) goto disable; get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); next_cpu = cpumask_next(smp_processor_id(), current_mask); put_online_cpus(); @@ -371,9 +372,8 @@ static int start_kthread(struct trace_array *tr) return 0; /* Just pick the first CPU on first iteration */ - current_mask = &save_cpumask; get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); put_online_cpus(); next_cpu = cpumask_first(current_mask); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 73976de7f8cc..4d1893564912 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,7 +20,7 @@ DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; -static int next_event_type = __TRACE_LAST_TYPE + 1; +static int next_event_type = __TRACE_LAST_TYPE; enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { @@ -675,11 +675,11 @@ static LIST_HEAD(ftrace_event_list); static int trace_search_list(struct list_head **list) { struct trace_event *e; - int last = __TRACE_LAST_TYPE; + int next = __TRACE_LAST_TYPE; if (list_empty(&ftrace_event_list)) { *list = &ftrace_event_list; - return last + 1; + return next; } /* @@ -687,17 +687,17 @@ static int trace_search_list(struct list_head **list) * lets see if somebody freed one. */ list_for_each_entry(e, &ftrace_event_list, list) { - if (e->type != last + 1) + if (e->type != next) break; - last++; + next++; } /* Did we used up all 65 thousand events??? */ - if ((last + 1) > TRACE_EVENT_TYPE_MAX) + if (next > TRACE_EVENT_TYPE_MAX) return 0; *list = &e->list; - return last + 1; + return next; } void trace_event_read_lock(void) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fdd47f99b18f..f4286c9bdeb4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1456,7 +1456,6 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, default: return 0; } - return 0; } static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) diff --git a/kernel/umh.c b/kernel/umh.c index 79f139a7ca03..fcf3ee803630 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -26,8 +26,6 @@ #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> -#include <linux/shmem_fs.h> -#include <linux/pipe_fs_i.h> #include <trace/events/module.h> @@ -38,8 +36,6 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); -static LIST_HEAD(umh_list); -static DEFINE_MUTEX(umh_list_lock); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { @@ -102,16 +98,9 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - sub_info->pid = task_pid_nr(current); - if (sub_info->file) { - retval = do_execve_file(sub_info->file, - sub_info->argv, sub_info->envp); - if (!retval) - current->flags |= PF_UMH; - } else - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + retval = kernel_execve(sub_info->path, + (const char *const *)sub_info->argv, + (const char *const *)sub_info->envp); out: sub_info->retval = retval; /* @@ -130,37 +119,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; - /* If SIGCLD is ignored kernel_wait4 won't populate the status. */ + /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); - if (pid < 0) { + if (pid < 0) sub_info->retval = pid; - } else { - int ret = -ECHILD; - /* - * Normally it is bogus to call wait4() from in-kernel because - * wait4() wants to write the exit code to a userspace address. - * But call_usermodehelper_exec_sync() always runs as kernel - * thread (workqueue) and put_user() to a kernel address works - * OK for kernel threads, due to their having an mm_segment_t - * which spans the entire address space. - * - * Thus the __user pointer cast is valid here. - */ - kernel_wait4(pid, (int __user *)&ret, 0, NULL); - - /* - * If ret is 0, either call_usermodehelper_exec_async failed and - * the real error code is already in sub_info->retval or - * sub_info->retval is 0 anyway, so don't mess with it then. - */ - if (ret) - sub_info->retval = ret; - } + else + kernel_wait(pid, &sub_info->retval); /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); - umh_complete(sub_info); } @@ -405,140 +373,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, } EXPORT_SYMBOL(call_usermodehelper_setup); -struct subprocess_info *call_usermodehelper_setup_file(struct file *file, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), void *data) -{ - struct subprocess_info *sub_info; - struct umh_info *info = data; - const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; - - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); - if (!sub_info) - return NULL; - - sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL); - if (!sub_info->argv) { - kfree(sub_info); - return NULL; - } - - INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); - sub_info->path = "none"; - sub_info->file = file; - sub_info->init = init; - sub_info->cleanup = cleanup; - sub_info->data = data; - return sub_info; -} - -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) -{ - struct umh_info *umh_info = info->data; - struct file *from_umh[2]; - struct file *to_umh[2]; - int err; - - /* create pipe to send data to umh */ - err = create_pipe_files(to_umh, 0); - if (err) - return err; - err = replace_fd(0, to_umh[0], 0); - fput(to_umh[0]); - if (err < 0) { - fput(to_umh[1]); - return err; - } - - /* create pipe to receive data from umh */ - err = create_pipe_files(from_umh, 0); - if (err) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - return err; - } - err = replace_fd(1, from_umh[1], 0); - fput(from_umh[1]); - if (err < 0) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - fput(from_umh[0]); - return err; - } - - umh_info->pipe_to_umh = to_umh[1]; - umh_info->pipe_from_umh = from_umh[0]; - return 0; -} - -static void umh_clean_and_save_pid(struct subprocess_info *info) -{ - struct umh_info *umh_info = info->data; - - /* cleanup if umh_pipe_setup() was successful but exec failed */ - if (info->pid && info->retval) { - fput(umh_info->pipe_to_umh); - fput(umh_info->pipe_from_umh); - } - - argv_free(info->argv); - umh_info->pid = info->pid; -} - -/** - * fork_usermode_blob - fork a blob of bytes as a usermode process - * @data: a blob of bytes that can be do_execv-ed as a file - * @len: length of the blob - * @info: information about usermode process (shouldn't be NULL) - * - * If info->cmdline is set it will be used as command line for the - * user process, else "usermodehelper" is used. - * - * Returns either negative error or zero which indicates success - * in executing a blob of bytes as a usermode process. In such - * case 'struct umh_info *info' is populated with two pipes - * and a pid of the process. The caller is responsible for health - * check of the user process, killing it via pid, and closing the - * pipes when user process is no longer needed. - */ -int fork_usermode_blob(void *data, size_t len, struct umh_info *info) -{ - struct subprocess_info *sub_info; - struct file *file; - ssize_t written; - loff_t pos = 0; - int err; - - file = shmem_kernel_file_setup("", len, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - written = kernel_write(file, data, len, &pos); - if (written != len) { - err = written; - if (err >= 0) - err = -ENOMEM; - goto out; - } - - err = -ENOMEM; - sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, - umh_clean_and_save_pid, info); - if (!sub_info) - goto out; - - err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - if (!err) { - mutex_lock(&umh_list_lock); - list_add(&info->list, &umh_list); - mutex_unlock(&umh_list_lock); - } -out: - fput(file); - return err; -} -EXPORT_SYMBOL_GPL(fork_usermode_blob); - /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocessa @@ -700,26 +534,6 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } -void __exit_umh(struct task_struct *tsk) -{ - struct umh_info *info; - pid_t pid = tsk->pid; - - mutex_lock(&umh_list_lock); - list_for_each_entry(info, &umh_list, list) { - if (info->pid == pid) { - list_del(&info->list); - mutex_unlock(&umh_list_lock); - goto out; - } - } - mutex_unlock(&umh_list_lock); - return; -out: - if (info->cleanup) - info->cleanup(info); -} - struct ctl_table usermodehelper_table[] = { { .procname = "bset", diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c new file mode 100644 index 000000000000..0b35212ffc3d --- /dev/null +++ b/kernel/usermode_driver.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * umd - User mode driver support + */ +#include <linux/shmem_fs.h> +#include <linux/pipe_fs_i.h> +#include <linux/mount.h> +#include <linux/fs_struct.h> +#include <linux/task_work.h> +#include <linux/usermode_driver.h> + +static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) +{ + struct file_system_type *type; + struct vfsmount *mnt; + struct file *file; + ssize_t written; + loff_t pos = 0; + + type = get_fs_type("tmpfs"); + if (!type) + return ERR_PTR(-ENODEV); + + mnt = kern_mount(type); + put_filesystem(type); + if (IS_ERR(mnt)) + return mnt; + + file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700); + if (IS_ERR(file)) { + mntput(mnt); + return ERR_CAST(file); + } + + written = kernel_write(file, data, len, &pos); + if (written != len) { + int err = written; + if (err >= 0) + err = -ENOMEM; + filp_close(file, NULL); + mntput(mnt); + return ERR_PTR(err); + } + + fput(file); + + /* Flush delayed fput so exec can open the file read-only */ + flush_delayed_fput(); + task_work_run(); + return mnt; +} + +/** + * umd_load_blob - Remember a blob of bytes for fork_usermode_driver + * @info: information about usermode driver + * @data: a blob of bytes that can be executed as a file + * @len: The lentgh of the blob + * + */ +int umd_load_blob(struct umd_info *info, const void *data, size_t len) +{ + struct vfsmount *mnt; + + if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt)) + return -EBUSY; + + mnt = blob_to_mnt(data, len, info->driver_name); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + info->wd.mnt = mnt; + info->wd.dentry = mnt->mnt_root; + return 0; +} +EXPORT_SYMBOL_GPL(umd_load_blob); + +/** + * umd_unload_blob - Disassociate @info from a previously loaded blob + * @info: information about usermode driver + * + */ +int umd_unload_blob(struct umd_info *info) +{ + if (WARN_ON_ONCE(!info->wd.mnt || + !info->wd.dentry || + info->wd.mnt->mnt_root != info->wd.dentry)) + return -EINVAL; + + kern_unmount(info->wd.mnt); + info->wd.mnt = NULL; + info->wd.dentry = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(umd_unload_blob); + +static int umd_setup(struct subprocess_info *info, struct cred *new) +{ + struct umd_info *umd_info = info->data; + struct file *from_umh[2]; + struct file *to_umh[2]; + int err; + + /* create pipe to send data to umh */ + err = create_pipe_files(to_umh, 0); + if (err) + return err; + err = replace_fd(0, to_umh[0], 0); + fput(to_umh[0]); + if (err < 0) { + fput(to_umh[1]); + return err; + } + + /* create pipe to receive data from umh */ + err = create_pipe_files(from_umh, 0); + if (err) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + return err; + } + err = replace_fd(1, from_umh[1], 0); + fput(from_umh[1]); + if (err < 0) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + fput(from_umh[0]); + return err; + } + + set_fs_pwd(current->fs, &umd_info->wd); + umd_info->pipe_to_umh = to_umh[1]; + umd_info->pipe_from_umh = from_umh[0]; + umd_info->tgid = get_pid(task_tgid(current)); + return 0; +} + +static void umd_cleanup(struct subprocess_info *info) +{ + struct umd_info *umd_info = info->data; + + /* cleanup if umh_setup() was successful but exec failed */ + if (info->retval) { + fput(umd_info->pipe_to_umh); + fput(umd_info->pipe_from_umh); + put_pid(umd_info->tgid); + umd_info->tgid = NULL; + } +} + +/** + * fork_usermode_driver - fork a usermode driver + * @info: information about usermode driver (shouldn't be NULL) + * + * Returns either negative error or zero which indicates success in + * executing a usermode driver. In such case 'struct umd_info *info' + * is populated with two pipes and a tgid of the process. The caller is + * responsible for health check of the user process, killing it via + * tgid, and closing the pipes when user process is no longer needed. + */ +int fork_usermode_driver(struct umd_info *info) +{ + struct subprocess_info *sub_info; + const char *argv[] = { info->driver_name, NULL }; + int err; + + if (WARN_ON_ONCE(info->tgid)) + return -EBUSY; + + err = -ENOMEM; + sub_info = call_usermodehelper_setup(info->driver_name, + (char **)argv, NULL, GFP_KERNEL, + umd_setup, umd_cleanup, info); + if (!sub_info) + goto out; + + err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); +out: + return err; +} +EXPORT_SYMBOL_GPL(fork_usermode_driver); + + |