diff options
Diffstat (limited to 'kernel')
91 files changed, 4492 insertions, 1229 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f3218bc5ec69..155b5380500a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -125,6 +125,7 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o +CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN) obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCSAN_SANITIZE_stackleak.o := n diff --git a/kernel/audit.c b/kernel/audit.c index ec38479f9228..4e8082eb7516 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1851,7 +1851,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_clear_dummy(ab->ctx); audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); diff --git a/kernel/audit.h b/kernel/audit.h index f0233dc40b17..ddc22878433d 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -290,13 +290,6 @@ extern int audit_signal_info_syscall(struct task_struct *t); extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); - -static inline void audit_clear_dummy(struct audit_context *ctx) -{ - if (ctx) - ctx->dummy = 0; -} - #else /* CONFIG_AUDITSYSCALL */ #define auditsc_get_stamp(c, t, s) 0 #define audit_put_watch(w) {} @@ -330,7 +323,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t) } #define audit_filter_inodes(t, c) AUDIT_DISABLED -#define audit_clear_dummy(c) {} #endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 468a23390457..fd840c40abf7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1417,6 +1417,9 @@ static void audit_log_proctitle(void) struct audit_context *context = audit_context(); struct audit_buffer *ab; + if (!context || context->dummy) + return; + ab = audit_log_start(context, GFP_KERNEL, 
AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a2a97fa3071b..370217dd7e39 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -29,7 +29,7 @@ static void backtrace_test_irq_callback(unsigned long data) complete(&backtrace_work); } -static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); +static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback); static void backtrace_test_irq(void) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 58c9af1d4808..0443600146dc 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3746,7 +3746,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; t = btf_type_skip_modifiers(btf, t->type, NULL); - if (!btf_type_is_int(t)) { + if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -3768,7 +3768,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -4058,6 +4058,11 @@ static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, const char *tname, *sym; u32 btf_id, i; + if (!btf_vmlinux) { + bpf_log(log, "btf_vmlinux doesn't exist\n"); + return -EINVAL; + } + if (IS_ERR(btf_vmlinux)) { bpf_log(log, "btf_vmlinux is malformed\n"); return -EINVAL; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b4b288a3c3c9..b32cc8ce8ff6 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -779,15 +779,20 @@ static void htab_elem_free_rcu(struct rcu_head *head) htab_elem_free(htab, l); } -static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +static void 
htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) { struct bpf_map *map = &htab->map; + void *ptr; if (map->ops->map_fd_put_ptr) { - void *ptr = fd_htab_map_get_ptr(map, l); - + ptr = fd_htab_map_get_ptr(map, l); map->ops->map_fd_put_ptr(ptr); } +} + +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { __pcpu_freelist_push(&htab->freelist, &l->fnode); @@ -839,6 +844,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, */ pl_new = this_cpu_ptr(htab->extra_elems); l_new = *pl_new; + htab_put_fd_value(htab, old_elem); *pl_new = old_elem; } else { struct pcpu_freelist_node *l; diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 78cf061f8179..310241ca7991 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -19,18 +19,21 @@ struct bpf_netns_link { * with netns_bpf_mutex held. */ struct net *net; + struct list_head node; /* node in list of links attached to net */ }; /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); /* Must be called with netns_bpf_mutex held. */ -static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) +static void netns_bpf_run_array_detach(struct net *net, + enum netns_bpf_attach_type type) { - struct bpf_netns_link *net_link = - container_of(link, struct bpf_netns_link, link); + struct bpf_prog_array *run_array; - net_link->net = NULL; + run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, + lockdep_is_held(&netns_bpf_mutex)); + bpf_prog_array_free(run_array); } static void bpf_netns_link_release(struct bpf_link *link) @@ -40,22 +43,18 @@ static void bpf_netns_link_release(struct bpf_link *link) enum netns_bpf_attach_type type = net_link->netns_type; struct net *net; - /* Link auto-detached by dying netns. */ - if (!net_link->net) - return; - mutex_lock(&netns_bpf_mutex); - /* Recheck after potential sleep. 
We can race with cleanup_net - * here, but if we see a non-NULL struct net pointer pre_exit - * has not happened yet and will block on netns_bpf_mutex. + /* We can race with cleanup_net, but if we see a non-NULL + * struct net pointer, pre_exit has not run yet and wait for + * netns_bpf_mutex. */ net = net_link->net; if (!net) goto out_unlock; - net->bpf.links[type] = NULL; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); + netns_bpf_run_array_detach(net, type); + list_del(&net_link->node); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -76,6 +75,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; + struct bpf_prog_array *run_array; struct net *net; int ret = 0; @@ -93,8 +93,11 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, goto out_unlock; } + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + WRITE_ONCE(run_array->items[0].prog, new_prog); + old_prog = xchg(&link->prog, new_prog); - rcu_assign_pointer(net->bpf.progs[type], new_prog); bpf_prog_put(old_prog); out_unlock: @@ -142,14 +145,38 @@ static const struct bpf_link_ops bpf_netns_link_ops = { .show_fdinfo = bpf_netns_link_show_fdinfo, }; +/* Must be called with netns_bpf_mutex held. 
*/ +static int __netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct net *net, + enum netns_bpf_attach_type type) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + struct bpf_prog_array *run_array; + u32 prog_cnt = 0, flags = 0; + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) + prog_cnt = bpf_prog_array_length(run_array); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + return bpf_prog_array_copy_to_user(run_array, prog_ids, + attr->query.prog_cnt); +} + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; enum netns_bpf_attach_type type; - struct bpf_prog *attached; struct net *net; + int ret; if (attr->query.query_flags) return -EINVAL; @@ -162,36 +189,25 @@ int netns_bpf_prog_query(const union bpf_attr *attr, if (IS_ERR(net)) return PTR_ERR(net); - rcu_read_lock(); - attached = rcu_dereference(net->bpf.progs[type]); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_query(attr, uattr, net, type); + mutex_unlock(&netns_bpf_mutex); put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; + return ret; } int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_prog_array 
*run_array; enum netns_bpf_attach_type type; + struct bpf_prog *attached; struct net *net; int ret; + if (attr->target_fd || attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; @@ -200,19 +216,47 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) mutex_lock(&netns_bpf_mutex); /* Attaching prog directly is not compatible with links */ - if (net->bpf.links[type]) { + if (!list_empty(&net->bpf.links[type])) { ret = -EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: - ret = flow_dissector_bpf_prog_attach(net, prog); + ret = flow_dissector_bpf_prog_attach_check(net, prog); break; default: ret = -EINVAL; break; } + if (ret) + goto out_unlock; + + attached = net->bpf.progs[type]; + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; + goto out_unlock; + } + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) { + WRITE_ONCE(run_array->items[0].prog, prog); + } else { + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + ret = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + } + + net->bpf.progs[type] = prog; + if (attached) + bpf_prog_put(attached); + out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -221,63 +265,74 @@ out_unlock: /* Must be called with netns_bpf_mutex held. 
*/ static int __netns_bpf_prog_detach(struct net *net, - enum netns_bpf_attach_type type) + enum netns_bpf_attach_type type, + struct bpf_prog *old) { struct bpf_prog *attached; /* Progs attached via links cannot be detached */ - if (net->bpf.links[type]) + if (!list_empty(&net->bpf.links[type])) return -EINVAL; - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (!attached) + attached = net->bpf.progs[type]; + if (!attached || attached != old) return -ENOENT; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); + netns_bpf_run_array_detach(net, type); + net->bpf.progs[type] = NULL; bpf_prog_put(attached); return 0; } -int netns_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { enum netns_bpf_attach_type type; + struct bpf_prog *prog; int ret; + if (attr->target_fd) + return -EINVAL; + type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + mutex_lock(&netns_bpf_mutex); - ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); mutex_unlock(&netns_bpf_mutex); + bpf_prog_put(prog); + return ret; } static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { - struct bpf_prog *prog; + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + struct bpf_prog_array *run_array; int err; mutex_lock(&netns_bpf_mutex); /* Allow attaching only one prog or link for now */ - if (net->bpf.links[type]) { + if (!list_empty(&net->bpf.links[type])) { err = -E2BIG; goto out_unlock; } /* Links are not compatible with attaching prog directly */ - prog = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (prog) { + if (net->bpf.progs[type]) { err = 
-EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: - err = flow_dissector_bpf_prog_attach(net, link->prog); + err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; default: err = -EINVAL; @@ -286,7 +341,15 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; - net->bpf.links[type] = link; + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + err = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = link->prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + + list_add_tail(&net_link->node, &net->bpf.links[type]); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -345,23 +408,34 @@ out_put_net: return err; } +static int __net_init netns_bpf_pernet_init(struct net *net) +{ + int type; + + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + INIT_LIST_HEAD(&net->bpf.links[type]); + + return 0; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; - struct bpf_link *link; + struct bpf_netns_link *net_link; mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { - link = net->bpf.links[type]; - if (link) - bpf_netns_link_auto_detach(link); - else - __netns_bpf_prog_detach(net, type); + netns_bpf_run_array_detach(net, type); + list_for_each_entry(net_link, &net->bpf.links[type], node) + net_link->net = NULL; /* auto-detach link */ + if (net->bpf.progs[type]) + bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .init = netns_bpf_pernet_init, .pre_exit = netns_bpf_pernet_pre_exit, }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 21cde24386db..cae9d505e04a 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -20,11 +20,14 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map) /* The caller must 
hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { - struct sock __rcu **socks; + uintptr_t sk_user_data; write_lock_bh(&sk->sk_callback_lock); - socks = sk->sk_user_data; - if (socks) { + sk_user_data = (uintptr_t)sk->sk_user_data; + if (sk_user_data & SK_USER_DATA_BPF) { + struct sock __rcu **socks; + + socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of @@ -252,6 +255,7 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; + uintptr_t sk_user_data; struct socket *socket; int err, fd; @@ -305,7 +309,9 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, if (err) goto put_file_unlock; - WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | + SK_USER_DATA_BPF; + WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 180414bb0d3e..0af88bbc1c15 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -132,15 +132,6 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) { struct bpf_ringbuf *rb; - if (!data_sz || !PAGE_ALIGNED(data_sz)) - return ERR_PTR(-EINVAL); - -#ifdef CONFIG_64BIT - /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ - if (data_sz > RINGBUF_MAX_DATA_SZ) - return ERR_PTR(-E2BIG); -#endif - rb = bpf_ringbuf_area_alloc(data_sz, numa_node); if (!rb) return ERR_PTR(-ENOMEM); @@ -166,9 +157,16 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-EINVAL); if (attr->key_size || attr->value_size || - attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + !is_power_of_2(attr->max_entries) || + !PAGE_ALIGNED(attr->max_entries)) 
return ERR_PTR(-EINVAL); +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (attr->max_entries > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); if (!rb_map) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..0fd80ac81f70 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2121,7 +2121,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) !bpf_capable()) return -EPERM; - if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; @@ -2893,13 +2893,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: - return sock_map_get_from_fd(attr, NULL); + return sock_map_prog_detach(attr, ptype); case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - return netns_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr, ptype); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -3139,7 +3137,8 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, return NULL; } -static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, + const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; @@ -3165,7 +3164,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) code == (BPF_JMP | BPF_CALL_ARGS)) { if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; - if (!bpf_dump_raw_ok()) + if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; 
continue; } @@ -3221,7 +3220,8 @@ static int set_info_rec_size(struct bpf_prog_info *info) return 0; } -static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, +static int bpf_prog_get_info_by_fd(struct file *file, + struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3290,11 +3290,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok()) { + if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { info.xlated_prog_insns = 0; goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog); + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); @@ -3328,7 +3328,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); @@ -3363,7 +3363,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; @@ -3394,7 +3394,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? 
: 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; @@ -3451,7 +3451,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, else info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { __u64 __user *user_linfo; u32 i; @@ -3497,7 +3497,8 @@ done: return 0; } -static int bpf_map_get_info_by_fd(struct bpf_map *map, +static int bpf_map_get_info_by_fd(struct file *file, + struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3540,7 +3541,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } -static int bpf_btf_get_info_by_fd(struct btf *btf, +static int bpf_btf_get_info_by_fd(struct file *file, + struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3555,7 +3557,8 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } -static int bpf_link_get_info_by_fd(struct bpf_link *link, +static int bpf_link_get_info_by_fd(struct file *file, + struct bpf_link *link, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3608,15 +3611,15 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return -EBADFD; if (f.file->f_op == &bpf_prog_fops) - err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(f.file->private_data, attr, + err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_link_fops) - err = bpf_link_get_info_by_fd(f.file->private_data, + err = bpf_link_get_info_by_fd(f.file, f.file->private_data, attr, uattr); 
else err = -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 34cde841ab68..94cead5a43e5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -399,8 +399,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || - type == PTR_TO_SOCK_COMMON || - type == PTR_TO_BTF_ID; + type == PTR_TO_SOCK_COMMON; } static bool reg_type_may_be_null(enum bpf_reg_type type) @@ -9801,7 +9800,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) int i, j, subprog_start, subprog_end = 0, len, subprog; struct bpf_insn *insn; void *old_bpf_func; - int err; + int err, num_exentries; if (env->subprog_cnt <= 1) return 0; @@ -9876,6 +9875,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; + num_exentries = 0; + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + BPF_MODE(insn->code) == BPF_PROBE_MEM) + num_exentries++; + } + func[i]->aux->num_exentries = num_exentries; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1ea181a58465..dd247747ec14 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6439,18 +6439,8 @@ void cgroup_sk_alloc_disable(void) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) - return; - - /* Socket clone path */ - if (skcd->val) { - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). 
- */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); + if (cgroup_sk_alloc_disabled) { + skcd->no_refcnt = 1; return; } @@ -6475,10 +6465,27 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) rcu_read_unlock(); } +void cgroup_sk_clone(struct sock_cgroup_data *skcd) +{ + if (skcd->val) { + if (skcd->no_refcnt) + return; + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); + } +} + void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); + if (skcd->no_refcnt) + return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b6397a186ce9..d51175cedfca 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -64,7 +64,6 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) raw_spin_unlock_irqrestore(cpu_lock, flags); } -EXPORT_SYMBOL_GPL(cgroup_rstat_updated); /** * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 9f1557b98468..18175687133a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -413,6 +413,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); VMCOREINFO_STRUCT_SIZE(pglist_data); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9e5934780f41..b16dbc1bf056 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1068,7 +1068,7 @@ static void kgdb_tasklet_bpt(unsigned long ing) atomic_set(&kgdb_break_tasklet_var, 0); } -static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); 
+static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt); void kgdb_schedule_breakpoint(void) { diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 61774aec46b4..a790026e42d0 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -792,6 +792,19 @@ static void gdb_cmd_query(struct kgdb_state *ks) } break; #endif +#ifdef CONFIG_HAVE_ARCH_KGDB_QXFER_PKT + case 'S': + if (!strncmp(remcom_in_buffer, "qSupported:", 11)) + strcpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature); + break; + case 'X': + if (!strncmp(remcom_in_buffer, "qXfer:", 6)) + kgdb_arch_handle_qxfer_pkt(remcom_in_buffer, + remcom_out_buffer); + break; +#endif + default: + break; } } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 93f578a8e613..67f060b86a73 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -70,7 +70,7 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, return 0; } -static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); @@ -539,3 +539,9 @@ size_t dma_direct_max_mapping_size(struct device *dev) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } + +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return !dev_is_dma_coherent(dev) || + is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 98e3d873792e..a8c18c9a796f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -397,6 +397,16 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_is_direct(ops)) + return dma_direct_need_sync(dev, dma_addr); + return ops->sync_single_for_cpu || 
ops->sync_single_for_device; +} +EXPORT_SYMBOL_GPL(dma_need_sync); + unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 8cfa01243ed2..6bc74a2d5127 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -6,7 +6,6 @@ #include <linux/debugfs.h> #include <linux/dma-direct.h> #include <linux/dma-noncoherent.h> -#include <linux/dma-contiguous.h> #include <linux/init.h> #include <linux/genalloc.h> #include <linux/set_memory.h> @@ -69,12 +68,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, do { pool_size = 1 << (PAGE_SHIFT + order); - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, 1 << order, - order, false); - else - page = alloc_pages(gfp, order); + page = alloc_pages(gfp, order); } while (!page && order-- > 0); if (!page) goto out; @@ -118,8 +112,7 @@ remove_mapping: dma_common_free_remap(addr, pool_size); #endif free_page: __maybe_unused - if (!dma_release_from_contiguous(NULL, page, 1 << order)) - __free_pages(page, order); + __free_pages(page, order); out: return ret; } @@ -203,7 +196,7 @@ static int __init dma_atomic_pool_init(void) } postcore_initcall(dma_atomic_pool_init); -static inline struct gen_pool *dev_to_pool(struct device *dev) +static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev) { u64 phys_mask; gfp_t gfp; @@ -217,47 +210,79 @@ static inline struct gen_pool *dev_to_pool(struct device *dev) return atomic_pool_kernel; } -static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) +static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool) +{ + if (bad_pool == atomic_pool_kernel) + return atomic_pool_dma32 ? 
: atomic_pool_dma; + + if (bad_pool == atomic_pool_dma32) + return atomic_pool_dma; + + return NULL; +} + +static inline struct gen_pool *dma_guess_pool(struct device *dev, + struct gen_pool *bad_pool) { - struct gen_pool *pool = dev_to_pool(dev); + if (bad_pool) + return dma_get_safer_pool(bad_pool); - if (unlikely(!pool)) - return false; - return gen_pool_has_addr(pool, (unsigned long)start, size); + return dma_guess_pool_from_device(dev); } void *dma_alloc_from_pool(struct device *dev, size_t size, struct page **ret_page, gfp_t flags) { - struct gen_pool *pool = dev_to_pool(dev); - unsigned long val; + struct gen_pool *pool = NULL; + unsigned long val = 0; void *ptr = NULL; - - if (!pool) { - WARN(1, "%pGg atomic pool not initialised!\n", &flags); - return NULL; + phys_addr_t phys; + + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) { + WARN(1, "Failed to get suitable pool for %s\n", + dev_name(dev)); + break; + } + + val = gen_pool_alloc(pool, size); + if (!val) + continue; + + phys = gen_pool_virt_to_phys(pool, val); + if (dma_coherent_ok(dev, phys, size)) + break; + + gen_pool_free(pool, val, size); + val = 0; } - val = gen_pool_alloc(pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(pool, val); + if (val) { *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; memset(ptr, 0, size); + + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); } - if (gen_pool_avail(pool) < atomic_pool_size) - schedule_work(&atomic_pool_work); return ptr; } bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - struct gen_pool *pool = dev_to_pool(dev); + struct gen_pool *pool = NULL; + + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) + return false; - if (!dma_in_atomic_pool(dev, start, size)) - return false; - gen_pool_free(pool, (unsigned long)start, size); - return true; + if (gen_pool_has_addr(pool, (unsigned long)start, size)) { + gen_pool_free(pool, (unsigned long)start, 
size); + return true; + } + } } diff --git a/kernel/events/core.c b/kernel/events/core.c index 851fc5e0e24b..78e69e10482a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; +static atomic_t nr_text_poke_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -1237,12 +1238,26 @@ static void get_ctx(struct perf_event_context *ctx) refcount_inc(&ctx->refcount); } +static void *alloc_task_ctx_data(struct pmu *pmu) +{ + if (pmu->task_ctx_cache) + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); + + return NULL; +} + +static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +{ + if (pmu->task_ctx_cache && task_ctx_data) + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); +} + static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx->task_ctx_data); + free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -4470,7 +4485,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, goto errout; if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { err = -ENOMEM; goto errout; @@ -4528,11 +4543,11 @@ retry: } } - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } @@ -4575,7 +4590,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || - attr->context_switch || + attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; 
@@ -4651,6 +4666,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); + if (event->attr.text_poke) + atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -8628,6 +8645,89 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_text_poke_event { + const void *old_bytes; + const void *new_bytes; + size_t pad; + u16 old_len; + u16 new_len; + + struct { + struct perf_event_header header; + + u64 addr; + } event_id; +}; + +static int perf_event_text_poke_match(struct perf_event *event) +{ + return event->attr.text_poke; +} + +static void perf_event_text_poke_output(struct perf_event *event, void *data) +{ + struct perf_text_poke_event *text_poke_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u64 padding = 0; + int ret; + + if (!perf_event_text_poke_match(event)) + return; + + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, text_poke_event->event_id); + perf_output_put(&handle, text_poke_event->old_len); + perf_output_put(&handle, text_poke_event->new_len); + + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); + + if (text_poke_event->pad) + __output_copy(&handle, &padding, text_poke_event->pad); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_text_poke(const void *addr, const void *old_bytes, + size_t old_len, const void *new_bytes, size_t new_len) +{ + struct perf_text_poke_event text_poke_event; + size_t tot, pad; + + if (!atomic_read(&nr_text_poke_events)) + return; + + tot = 
sizeof(text_poke_event.old_len) + old_len; + tot += sizeof(text_poke_event.new_len) + new_len; + pad = ALIGN(tot, sizeof(u64)) - tot; + + text_poke_event = (struct perf_text_poke_event){ + .old_bytes = old_bytes, + .new_bytes = new_bytes, + .pad = pad, + .old_len = old_len, + .new_len = new_len, + .event_id = { + .header = { + .type = PERF_RECORD_TEXT_POKE, + .misc = PERF_RECORD_MISC_KERNEL, + .size = sizeof(text_poke_event.event_id) + tot + pad, + }, + .addr = (unsigned long)addr, + }, + }; + + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10945,6 +11045,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); + if (event->attr.text_poke) + atomic_inc(&nr_text_poke_events); if (inc) { /* @@ -12409,8 +12511,7 @@ inherit_event(struct perf_event *parent_event, !child_ctx->task_ctx_data) { struct pmu *pmu = child_event->pmu; - child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, - GFP_KERNEL); + child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); if (!child_ctx->task_ctx_data) { free_event(child_event); return ERR_PTR(-ENOMEM); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e84eb52b646b..25de10c904e6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2199,7 +2199,7 @@ static void handle_swbp(struct pt_regs *regs) if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. 
*/ - send_sig(SIGTRAP, current, 0); + force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..2a8e7287a558 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,7 +359,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new) { - *new = *orig; + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + /* + * orig->shared.rb may be modified concurrently, but the clone + * will be reinitialized. + */ + *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; } @@ -1954,8 +1960,8 @@ static __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; @@ -1977,7 +1983,7 @@ static __latent_entropy struct task_struct *copy_process( * to stop root fork bombs. 
*/ retval = -EAGAIN; - if (nr_threads >= max_threads) + if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ @@ -2035,19 +2041,11 @@ static __latent_entropy struct task_struct *copy_process( seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS - p->irq_events = 0; - p->hardirqs_enabled = 0; - p->hardirq_enable_ip = 0; - p->hardirq_enable_event = 0; - p->hardirq_disable_ip = _THIS_IP_; - p->hardirq_disable_event = 0; - p->softirqs_enabled = 1; - p->softirq_enable_ip = _THIS_IP_; - p->softirq_enable_event = 0; - p->softirq_disable_ip = 0; - p->softirq_disable_event = 0; - p->hardirq_context = 0; - p->softirq_context = 0; + memset(&p->irqtrace, 0, sizeof(p->irqtrace)); + p->irqtrace.hardirq_disable_ip = _THIS_IP_; + p->irqtrace.softirq_enable_ip = _THIS_IP_; + p->softirqs_enabled = 1; + p->softirq_context = 0; #endif p->pagefault_disabled = 0; @@ -2304,6 +2302,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/futex.c b/kernel/futex.c index 05e88562de68..83404124b77b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -32,30 +32,13 @@ * "But they come in a choice of three flavours!" 
*/ #include <linux/compat.h> -#include <linux/slab.h> -#include <linux/poll.h> -#include <linux/fs.h> -#include <linux/file.h> #include <linux/jhash.h> -#include <linux/init.h> -#include <linux/futex.h> -#include <linux/mount.h> #include <linux/pagemap.h> #include <linux/syscalls.h> -#include <linux/signal.h> -#include <linux/export.h> -#include <linux/magic.h> -#include <linux/pid.h> -#include <linux/nsproxy.h> -#include <linux/ptrace.h> -#include <linux/sched/rt.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> -#include <linux/refcount.h> #include <asm/futex.h> @@ -476,7 +459,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -500,8 +483,8 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. 
*/ -static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw) +static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -538,7 +521,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a again: /* Ignore any VERIFY_READ mapping (futex common case) */ - if (unlikely(should_fail_futex(fshared))) + if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); @@ -626,7 +609,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (unlikely(should_fail_futex(fshared)) || ro) { + if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } @@ -677,10 +660,6 @@ out: return err; } -static inline void put_futex_key(union futex_key *key) -{ -} - /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address @@ -1611,13 +1590,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; hb = hash_futex(&key); /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out_put_key; + return ret; spin_lock(&hb->lock); @@ -1640,9 +1619,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out_put_key: - put_futex_key(&key); -out: return ret; } @@ -1709,10 +1685,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto 
out_put_key1; + return ret; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1730,13 +1706,13 @@ retry_private: * an MMU, but we might get them from range checking */ ret = op_ret; - goto out_put_keys; + return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out_put_keys; + return ret; } if (!(flags & FLAGS_SHARED)) { @@ -1744,8 +1720,6 @@ retry_private: goto retry_private; } - put_futex_key(&key2); - put_futex_key(&key1); cond_resched(); goto retry; } @@ -1781,11 +1755,6 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: return ret; } @@ -1992,20 +1961,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out_put_key1; + return ret; /* * The check above which compares uaddrs is not sufficient for * shared futexes. 
We need to compare the keys: */ - if (requeue_pi && match_futex(&key1, &key2)) { - ret = -EINVAL; - goto out_put_keys; - } + if (requeue_pi && match_futex(&key1, &key2)) + return -EINVAL; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -2025,13 +1992,11 @@ retry_private: ret = get_user(curval, uaddr1); if (ret) - goto out_put_keys; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&key2); - put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -2090,12 +2055,10 @@ retry_private: case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; - goto out; + return ret; case -EBUSY: case -EAGAIN: /* @@ -2106,8 +2069,6 @@ retry_private: */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2216,12 +2177,6 @@ out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); - -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: return ret ? ret : task_count; } @@ -2567,7 +2522,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); - goto out; + return ret ? ret : locked; } /* @@ -2580,7 +2535,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner == current) { ret = fixup_pi_state_owner(uaddr, q, NULL); - goto out; + return ret; } /* @@ -2594,8 +2549,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) q->pi_state->owner); } -out: - return ret ? 
ret : locked; + return ret; } /** @@ -2692,12 +2646,11 @@ retry_private: ret = get_user(uval, uaddr); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q->key); goto retry; } @@ -2706,9 +2659,6 @@ retry_private: ret = -EWOULDBLOCK; } -out: - if (ret) - put_futex_key(&q->key); return ret; } @@ -2853,7 +2803,6 @@ retry_private: * - EAGAIN: The user space value changed. */ queue_unlock(hb); - put_futex_key(&q.key); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2961,13 +2910,11 @@ no_block: put_pi_state(pi_state); } - goto out_put_key; + goto out; out_unlock_put_key: queue_unlock(hb); -out_put_key: - put_futex_key(&q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2980,12 +2927,11 @@ uaddr_faulted: ret = fault_in_user_writeable(uaddr); if (ret) - goto out_put_key; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q.key); goto retry; } @@ -3114,16 +3060,13 @@ retry: out_unlock: spin_unlock(&hb->lock); out_putkey: - put_futex_key(&key); return ret; pi_retry: - put_futex_key(&key); cond_resched(); goto retry; pi_faulted: - put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -3265,7 +3208,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out_key2; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -3274,7 +3217,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (match_futex(&q.key, &key2)) { queue_unlock(hb); ret = -EINVAL; - goto out_put_keys; + goto out; } /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ @@ -3284,7 +3227,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); spin_unlock(&hb->lock); if (ret) - goto out_put_keys; + goto out; /* * In order for us to be here, we know our q.key == key2, and since @@ -3374,11 +3317,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = -EWOULDBLOCK; } -out_put_keys: - put_futex_key(&q.key); -out_key2: - put_futex_key(&key2); - out: if (to) { hrtimer_cancel(&to->timer); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 4f9f844074db..b95ff5d5f4bd 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -112,6 +112,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_AFFINITY_SET), BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), + BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), BIT_MASK_DESCR(IRQD_CAN_RESERVE), BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), @@ -120,6 +121,10 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_WAKEUP_STATE), BIT_MASK_DESCR(IRQD_WAKEUP_ARMED), + + BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET), + + BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), }; static const struct irq_bit_descr irqdesc_states[] = { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 761911168438..48c38e09c673 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -195,9 +195,9 @@ void irq_set_thread_affinity(struct irq_desc *desc) set_bit(IRQTF_AFFINITY, &action->thread_flags); } +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -205,9 +205,19 @@ static void irq_validate_effective_affinity(struct irq_data *data) return; pr_warn_once("irq_chip %s 
did not update eff. affinity mask of irq %u\n", chip->name, data->irq); -#endif } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) +{ + cpumask_copy(irq_data_get_effective_affinity_mask(data), mask); +} +#else +static inline void irq_validate_effective_affinity(struct irq_data *data) { } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) { } +#endif + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -304,6 +314,30 @@ static int irq_try_set_affinity(struct irq_data *data, return ret; } +static bool irq_set_affinity_deactivated(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + /* + * Handle irq chips which can handle affinity only in activated + * state correctly + * + * If the interrupt is not yet activated, just store the affinity + * mask and do not call the chip driver at all. On activation the + * driver has to make sure anyway that the interrupt is in a + * useable state so startup works. 
+ */ + if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || + irqd_is_activated(data) || !irqd_affinity_on_activate(data)) + return false; + + cpumask_copy(desc->irq_common_data.affinity, mask); + irq_init_effective_affinity(data, mask); + irqd_set(data, IRQD_AFFINITY_SET); + return true; +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -314,6 +348,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; + if (irq_set_affinity_deactivated(data, mask, force)) + return 0; + if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 27634f4022d0..c48ce19a257f 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg) } /* Tasklet to handle resend: */ -static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); +static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..95cb74f73292 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include <linux/compiler.h> /* @@ -437,6 +438,7 @@ struct kallsym_iter { loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; + loff_t pos_bpf_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -480,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) return 1; } +/* + * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace + * purposes. In that case "__builtin__ftrace" is used as a module name, even + * though "__builtin__ftrace" is not a module. 
+ */ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, @@ -496,11 +503,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { + int ret; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, - &iter->value, &iter->type, - iter->name) < 0 ? 0 : 1; + ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, + &iter->value, &iter->type, + iter->name); + if (ret < 0) { + iter->pos_bpf_end = iter->pos; + return 0; + } + + return 1; +} + +/* + * This uses "__builtin__kprobes" as a module name for symbols for pages + * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a + * module. + */ +static int get_ksymbol_kprobe(struct kallsym_iter *iter) +{ + strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); + iter->exported = 0; + return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; } /* Returns space to next name. */ @@ -527,6 +556,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; + iter->pos_bpf_end = 0; } } @@ -551,7 +581,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) get_ksymbol_ftrace_mod(iter)) return 1; - return get_ksymbol_bpf(iter); + if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && + get_ksymbol_bpf(iter)) + return 1; + + return get_ksymbol_kprobe(iter); } /* Returns false if pos at or past end of file. */ @@ -644,19 +678,20 @@ static inline int kallsyms_for_perf(void) * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). 
*/ -int kallsyms_show_value(void) +bool kallsyms_show_value(const struct cred *cred) { switch (kptr_restrict) { case 0: if (kallsyms_for_perf()) - return 1; + return true; /* fallthrough */ case 1: - if (has_capability_noaudit(current, CAP_SYSLOG)) - return 1; + if (security_capable(cred, &init_user_ns, CAP_SYSLOG, + CAP_OPT_NOAUDIT) == 0) + return true; /* fallthrough */ default: - return 0; + return false; } } @@ -673,7 +708,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return -ENOMEM; reset_iter(iter, 0); - iter->show_value = kallsyms_show_value(); + /* + * Instead of checking this on every s_show() call, cache + * the result here at open time. + */ + iter->show_value = kallsyms_show_value(file->f_cred); return 0; } diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index d4999b38d1be..65ca5539c470 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -7,8 +7,11 @@ CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) -CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ - $(call cc-option,-fno-stack-protector,) +CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ + -fno-stack-protector -DDISABLE_BRANCH_PROFILING obj-y := core.o debugfs.o report.o -obj-$(CONFIG_KCSAN_SELFTEST) += test.o +obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o + +CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index be9e625227f3..75fe701f4127 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -3,8 +3,7 @@ #ifndef _KERNEL_KCSAN_ATOMIC_H #define _KERNEL_KCSAN_ATOMIC_H -#include <linux/jiffies.h> -#include <linux/sched.h> +#include <linux/types.h> /* * Special rules for certain memory where concurrent conflicting accesses are @@ -13,8 +12,7 @@ */ static bool kcsan_is_atomic_special(const volatile void *ptr) { - /* volatile globals 
that have been observed in data races. */ - return ptr == &jiffies || ptr == &current->state; + return false; } #endif /* _KERNEL_KCSAN_ATOMIC_H */ diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 15f67949d11e..9147ff6a12e5 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -291,6 +291,20 @@ static inline unsigned int get_delay(void) 0); } +void kcsan_save_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->kcsan_save_irqtrace = task->irqtrace; +#endif +} + +void kcsan_restore_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->irqtrace = task->kcsan_save_irqtrace; +#endif +} + /* * Pull everything together: check_access() below contains the performance * critical operations; the fast-path (including check_access) functions should @@ -336,9 +350,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, flags = user_access_save(); if (consumed) { + kcsan_save_irqtrace(current); kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE, KCSAN_REPORT_CONSUMED_WATCHPOINT, watchpoint - watchpoints); + kcsan_restore_irqtrace(current); } else { /* * The other thread may not print any diagnostics, as it has @@ -396,9 +412,14 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; } + /* + * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's + * runtime is entered for every memory access, and potentially useful + * information is lost if dirtied by KCSAN. + */ + kcsan_save_irqtrace(current); if (!kcsan_interrupt_watcher) - /* Use raw to avoid lockdep recursion via IRQ flags tracing.
*/ - raw_local_irq_save(irq_flags); + local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { @@ -539,7 +560,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: if (!kcsan_interrupt_watcher) - raw_local_irq_restore(irq_flags); + local_irq_restore(irq_flags); + kcsan_restore_irqtrace(current); out: user_access_restore(ua_flags); } @@ -754,6 +776,7 @@ EXPORT_SYMBOL(__kcsan_check_access); */ #define DEFINE_TSAN_READ_WRITE(size) \ + void __tsan_read##size(void *ptr); \ void __tsan_read##size(void *ptr) \ { \ check_access(ptr, size, 0); \ @@ -762,6 +785,7 @@ EXPORT_SYMBOL(__kcsan_check_access); void __tsan_unaligned_read##size(void *ptr) \ __alias(__tsan_read##size); \ EXPORT_SYMBOL(__tsan_unaligned_read##size); \ + void __tsan_write##size(void *ptr); \ void __tsan_write##size(void *ptr) \ { \ check_access(ptr, size, KCSAN_ACCESS_WRITE); \ @@ -777,12 +801,14 @@ DEFINE_TSAN_READ_WRITE(4); DEFINE_TSAN_READ_WRITE(8); DEFINE_TSAN_READ_WRITE(16); +void __tsan_read_range(void *ptr, size_t size); void __tsan_read_range(void *ptr, size_t size) { check_access(ptr, size, 0); } EXPORT_SYMBOL(__tsan_read_range); +void __tsan_write_range(void *ptr, size_t size); void __tsan_write_range(void *ptr, size_t size) { check_access(ptr, size, KCSAN_ACCESS_WRITE); @@ -799,6 +825,7 @@ EXPORT_SYMBOL(__tsan_write_range); * the size-check of compiletime_assert_rwonce_type(). 
*/ #define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \ + void __tsan_volatile_read##size(void *ptr); \ void __tsan_volatile_read##size(void *ptr) \ { \ const bool is_atomic = size <= sizeof(long long) && \ @@ -811,6 +838,7 @@ EXPORT_SYMBOL(__tsan_write_range); void __tsan_unaligned_volatile_read##size(void *ptr) \ __alias(__tsan_volatile_read##size); \ EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \ + void __tsan_volatile_write##size(void *ptr); \ void __tsan_volatile_write##size(void *ptr) \ { \ const bool is_atomic = size <= sizeof(long long) && \ @@ -836,14 +864,17 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(16); * The below are not required by KCSAN, but can still be emitted by the * compiler. */ +void __tsan_func_entry(void *call_pc); void __tsan_func_entry(void *call_pc) { } EXPORT_SYMBOL(__tsan_func_entry); +void __tsan_func_exit(void); void __tsan_func_exit(void) { } EXPORT_SYMBOL(__tsan_func_exit); +void __tsan_init(void); void __tsan_init(void) { } diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c new file mode 100644 index 000000000000..fed6fcb5768c --- /dev/null +++ b/kernel/kcsan/kcsan-test.c @@ -0,0 +1,1107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KCSAN test with various race scenarious to test runtime behaviour. Since the + * interface with which KCSAN's reports are obtained is via the console, this is + * the output we should verify. For each test case checks the presence (or + * absence) of generated reports. Relies on 'console' tracepoint to capture + * reports as they appear in the kernel log. + * + * Makes use of KUnit for test organization, and the Torture framework for test + * thread control. + * + * Copyright (C) 2020, Google LLC. 
+ * Author: Marco Elver <elver@google.com> + */ + +#include <kunit/test.h> +#include <linux/jiffies.h> +#include <linux/kcsan-checks.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/seqlock.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/timer.h> +#include <linux/torture.h> +#include <linux/tracepoint.h> +#include <linux/types.h> +#include <trace/events/printk.h> + +/* Points to current test-case memory access "kernels". */ +static void (*access_kernels[2])(void); + +static struct task_struct **threads; /* Lists of threads. */ +static unsigned long end_time; /* End time of test. */ + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + int nlines; + char lines[3][512]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Setup test checking loop. */ +static __no_kcsan inline void +begin_test_checks(void (*func1)(void), void (*func2)(void)) +{ + kcsan_disable_current(); + + /* + * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at + * least one race is reported. + */ + end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500); + + /* Signal start; release potential initialization of shared data. */ + smp_store_release(&access_kernels[0], func1); + smp_store_release(&access_kernels[1], func2); +} + +/* End test checking loop. */ +static __no_kcsan inline bool +end_test_checks(bool stop) +{ + if (!stop && time_before(jiffies, end_time)) { + /* Continue checking */ + might_sleep(); + return false; + } + + kcsan_enable_current(); + return true; +} + +/* + * Probe for console output: checks if a race was reported, and obtains observed + * lines of interest. + */ +__no_kcsan +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + int nlines; + + /* + * Note that KCSAN reports under a global lock, so we do not risk the + * possibility of having multiple reports interleaved. 
If that were the + * case, we'd expect tests to fail. + */ + + spin_lock_irqsave(&observed.lock, flags); + nlines = observed.nlines; + + if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) { + /* + * KCSAN report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); + nlines = 1; + } else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) { + strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); + + if (strnstr(buf, "race at unknown origin", len)) { + if (WARN_ON(nlines != 2)) + goto out; + + /* No second line of interest. */ + strcpy(observed.lines[nlines++], "<none>"); + } + } + +out: + WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +__no_kcsan +static bool report_available(void) +{ + return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines); +} + +/* Report information we expect in a report. */ +struct expect_report { + /* Access information of both accesses. */ + struct { + void *fn; /* Function pointer to expected function of top frame. */ + void *addr; /* Address of access; unchecked if NULL. */ + size_t size; /* Size of access; unchecked if @addr is NULL. */ + int type; /* Access type, see KCSAN_ACCESS definitions. */ + } access[2]; +}; + +/* Check observed report matches information in @r. */ +__no_kcsan +static bool report_matches(const struct expect_report *r) +{ + const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; + bool ret = false; + unsigned long flags; + typeof(observed.lines) expect; + const char *end; + char *cur; + int i; + + /* Doubled-checked locking. */ + if (!report_available()) + return false; + + /* Generate expected report contents. 
*/ + + /* Title */ + cur = expect[0]; + end = &expect[0][sizeof(expect[0]) - 1]; + cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ", + is_assert ? "assert: race" : "data-race"); + if (r->access[1].fn) { + char tmp[2][64]; + int cmp; + + /* Expect lexographically sorted function names in title. */ + scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn); + scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn); + cmp = strcmp(tmp[0], tmp[1]); + cur += scnprintf(cur, end - cur, "%ps / %ps", + cmp < 0 ? r->access[0].fn : r->access[1].fn, + cmp < 0 ? r->access[1].fn : r->access[0].fn); + } else { + scnprintf(cur, end - cur, "%pS", r->access[0].fn); + /* The exact offset won't match, remove it. */ + cur = strchr(expect[0], '+'); + if (cur) + *cur = '\0'; + } + + /* Access 1 */ + cur = expect[1]; + end = &expect[1][sizeof(expect[1]) - 1]; + if (!r->access[1].fn) + cur += scnprintf(cur, end - cur, "race at unknown origin, with "); + + /* Access 1 & 2 */ + for (i = 0; i < 2; ++i) { + const char *const access_type = + (r->access[i].type & KCSAN_ACCESS_ASSERT) ? + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "assert no accesses" : + "assert no writes") : + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "write" : + "read"); + const char *const access_type_aux = + (r->access[i].type & KCSAN_ACCESS_ATOMIC) ? + " (marked)" : + ((r->access[i].type & KCSAN_ACCESS_SCOPED) ? + " (scoped)" : + ""); + + if (i == 1) { + /* Access 2 */ + cur = expect[2]; + end = &expect[2][sizeof(expect[2]) - 1]; + + if (!r->access[1].fn) { + /* Dummy string if no second access is available. */ + strcpy(cur, "<none>"); + break; + } + } + + cur += scnprintf(cur, end - cur, "%s%s to ", access_type, + access_type_aux); + + if (r->access[i].addr) /* Address is optional. */ + cur += scnprintf(cur, end - cur, "0x%px of %zu bytes", + r->access[i].addr, r->access[i].size); + } + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. 
*/ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.lines[0], expect[0]) && + /* Access info may appear in any order. */ + ((strstr(observed.lines[1], expect[1]) && + strstr(observed.lines[2], expect[2])) || + (strstr(observed.lines[1], expect[2]) && + strstr(observed.lines[2], expect[1]))); +out: + spin_unlock_irqrestore(&observed.lock, flags); + return ret; +} + +/* ===== Test kernels ===== */ + +static long test_sink; +static long test_var; +/* @test_array should be large enough to fall into multiple watchpoint slots. */ +static long test_array[3 * PAGE_SIZE / sizeof(long)]; +static struct { + long val[8]; +} test_struct; +static DEFINE_SEQLOCK(test_seqlock); + +/* + * Helper to avoid compiler optimizing out reads, and to generate source values + * for writes. + */ +__no_kcsan +static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); } + +static noinline void test_kernel_read(void) { sink_value(test_var); } + +static noinline void test_kernel_write(void) +{ + test_var = READ_ONCE_NOCHECK(test_sink) + 1; +} + +static noinline void test_kernel_write_nochange(void) { test_var = 42; } + +/* Suffixed by value-change exception filter. 
*/ +static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; } + +static noinline void test_kernel_read_atomic(void) +{ + sink_value(READ_ONCE(test_var)); +} + +static noinline void test_kernel_write_atomic(void) +{ + WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1); +} + +__no_kcsan +static noinline void test_kernel_write_uninstrumented(void) { test_var++; } + +static noinline void test_kernel_data_race(void) { data_race(test_var++); } + +static noinline void test_kernel_assert_writer(void) +{ + ASSERT_EXCLUSIVE_WRITER(test_var); +} + +static noinline void test_kernel_assert_access(void) +{ + ASSERT_EXCLUSIVE_ACCESS(test_var); +} + +#define TEST_CHANGE_BITS 0xff00ff00 + +static noinline void test_kernel_change_bits(void) +{ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { + /* + * Avoid race of unknown origin for this test, just pretend they + * are atomic. + */ + kcsan_nestable_atomic_begin(); + test_var ^= TEST_CHANGE_BITS; + kcsan_nestable_atomic_end(); + } else + WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_change(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_nochange(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS); +} + +/* To check that scoped assertions do trigger anywhere in scope. */ +static noinline void test_enter_scope(void) +{ + int x = 0; + + /* Unrelated accesses to scoped assert. 
*/ + READ_ONCE(test_sink); + kcsan_check_read(&x, sizeof(x)); +} + +static noinline void test_kernel_assert_writer_scoped(void) +{ + ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_assert_access_scoped(void) +{ + ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_rmw_array(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(test_array); ++i) + test_array[i]++; +} + +static noinline void test_kernel_write_struct(void) +{ + kcsan_check_write(&test_struct, sizeof(test_struct)); + kcsan_disable_current(); + test_struct.val[3]++; /* induce value change */ + kcsan_enable_current(); +} + +static noinline void test_kernel_write_struct_part(void) +{ + test_struct.val[3] = 42; +} + +static noinline void test_kernel_read_struct_zero_size(void) +{ + kcsan_check_read(&test_struct.val[3], 0); +} + +static noinline void test_kernel_jiffies_reader(void) +{ + sink_value((long)jiffies); +} + +static noinline void test_kernel_seqlock_reader(void) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&test_seqlock); + sink_value(test_var); + } while (read_seqretry(&test_seqlock, seq)); +} + +static noinline void test_kernel_seqlock_writer(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&test_seqlock, flags); + test_var++; + write_sequnlock_irqrestore(&test_seqlock, flags); +} + +/* ===== Test cases ===== */ + +/* Simple test with normal data race. 
*/ +__no_kcsan +static void test_basic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write, test_kernel_read); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Stress KCSAN with lots of concurrent races on different addresses until + * timeout. + */ +__no_kcsan +static void test_concurrent_races(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + /* NULL will match any address. */ + { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_rmw_array, NULL, 0, 0 }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array); + do { + match_expect |= report_matches(&expect); + match_never |= report_matches(&never); + } while (!end_test_checks(false)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. 
*/ +__no_kcsan +static void test_novalue_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should + * never apply work. + */ +__no_kcsan +static void test_novalue_change_exception(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that data races of unknown origin are reported. */ +__no_kcsan +static void test_unknown_origin(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { NULL }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + KUNIT_EXPECT_TRUE(test, match_expect); + else + KUNIT_EXPECT_FALSE(test, match_expect); +} + +/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. 
*/ +__no_kcsan +static void test_write_write_assume_atomic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write, test_kernel_write); + do { + sink_value(READ_ONCE(test_var)); /* induce value-change */ + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races with writes larger than word-size are always reported, + * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. + */ +__no_kcsan +static void test_write_write_struct(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races where only one write is larger than word-size are always + * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. 
+ */ +__no_kcsan +static void test_write_write_struct_part(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that races with atomic accesses never result in reports. */ +__no_kcsan +static void test_read_atomic_write_atomic(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test that a race with an atomic and plain access result in reports. */ +__no_kcsan +static void test_read_plain_atomic_write(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, + }, + }; + bool match_expect = false; + + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) + return; + + begin_test_checks(test_kernel_read, test_kernel_write_atomic); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Zero-sized accesses should never cause data race reports. 
*/ +__no_kcsan +static void test_zero_size_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the data_race() macro. */ +__no_kcsan +static void test_data_race(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_data_race, test_kernel_data_race); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), 
KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access_writer(struct kunit *test) +{ + const struct expect_report expect_access_writer = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + const struct expect_report expect_access_access = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + bool match_expect_access_writer = false; + bool match_expect_access_access = false; + bool match_never = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer); + do { + match_expect_access_writer |= report_matches(&expect_access_writer); + match_expect_access_access |= report_matches(&expect_access_access); + match_never |= report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect_access_writer); + KUNIT_EXPECT_TRUE(test, match_expect_access_access); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_bits_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_bits_change, &test_var, sizeof(test_var), 
KCSAN_ACCESS_ASSERT }, + { test_kernel_change_bits, &test_var, sizeof(test_var), + KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 0 : KCSAN_ACCESS_ATOMIC) }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_bits_nochange(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer_scoped(struct kunit *test) +{ + const struct expect_report expect_start = { + .access = { + { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report expect_anywhere = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect_start = false; + bool match_expect_anywhere = false; + + begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange); + do { + match_expect_start |= report_matches(&expect_start); + match_expect_anywhere |= report_matches(&expect_anywhere); + } while (!end_test_checks(match_expect_start && match_expect_anywhere)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_anywhere); +} + +__no_kcsan +static void test_assert_exclusive_access_scoped(struct kunit *test) +{ + const struct expect_report expect_start1 = { + .access = { + { 
test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + const struct expect_report expect_start2 = { + .access = { expect_start1.access[0], expect_start1.access[0] }, + }; + const struct expect_report expect_inscope = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect_start = false; + bool match_expect_inscope = false; + + begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read); + end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */ + do { + match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2); + match_expect_inscope |= report_matches(&expect_inscope); + } while (!end_test_checks(match_expect_start && match_expect_inscope)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_inscope); +} + +/* + * jiffies is special (declared to be volatile) and its accesses are typically + * not marked; this test ensures that neither the compiler nor KCSAN gets confused about + * jiffies's declaration on different architectures. + */ +__no_kcsan +static void test_jiffies_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test that racing accesses in seqlock critical sections are not reported. 
*/ +__no_kcsan +static void test_seqlock_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Each test case is run with different numbers of threads. Until KUnit supports + * passing arguments for each test case, we encode #threads in the test case + * name (read by get_num_threads()). [The '-' was chosen as a stylistic + * preference to separate test name and #threads.] + * + * The thread counts are chosen to cover potentially interesting boundaries and + * corner cases (range 2-5), and then stress the system with larger counts. + */ +#define KCSAN_KUNIT_CASE(test_name) \ + { .run_case = test_name, .name = #test_name "-02" }, \ + { .run_case = test_name, .name = #test_name "-03" }, \ + { .run_case = test_name, .name = #test_name "-04" }, \ + { .run_case = test_name, .name = #test_name "-05" }, \ + { .run_case = test_name, .name = #test_name "-08" }, \ + { .run_case = test_name, .name = #test_name "-16" } + +static struct kunit_case kcsan_test_cases[] = { + KCSAN_KUNIT_CASE(test_basic), + KCSAN_KUNIT_CASE(test_concurrent_races), + KCSAN_KUNIT_CASE(test_novalue_change), + KCSAN_KUNIT_CASE(test_novalue_change_exception), + KCSAN_KUNIT_CASE(test_unknown_origin), + KCSAN_KUNIT_CASE(test_write_write_assume_atomic), + KCSAN_KUNIT_CASE(test_write_write_struct), + KCSAN_KUNIT_CASE(test_write_write_struct_part), + KCSAN_KUNIT_CASE(test_read_atomic_write_atomic), + KCSAN_KUNIT_CASE(test_read_plain_atomic_write), + KCSAN_KUNIT_CASE(test_zero_size_access), + KCSAN_KUNIT_CASE(test_data_race), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_access), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change), + 
KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), + KCSAN_KUNIT_CASE(test_jiffies_noreport), + KCSAN_KUNIT_CASE(test_seqlock_noreport), + {}, +}; + +/* ===== End test cases ===== */ + +/* Get number of threads encoded in test name. */ +static bool __no_kcsan +get_num_threads(const char *test, int *nthreads) +{ + int len = strlen(test); + + if (WARN_ON(len < 3)) + return false; + + *nthreads = test[len - 1] - '0'; + *nthreads += (test[len - 2] - '0') * 10; + + if (WARN_ON(*nthreads < 0)) + return false; + + return true; +} + +/* Concurrent accesses from interrupts. */ +__no_kcsan +static void access_thread_timer(struct timer_list *timer) +{ + static atomic_t cnt = ATOMIC_INIT(0); + unsigned int idx; + void (*func)(void); + + idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. */ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); +} + +/* The main loop for each thread. */ +__no_kcsan +static int access_thread(void *arg) +{ + struct timer_list timer; + unsigned int cnt = 0; + unsigned int idx; + void (*func)(void); + + timer_setup_on_stack(&timer, access_thread_timer, 0); + do { + might_sleep(); + + if (!timer_pending(&timer)) + mod_timer(&timer, jiffies + 1); + else { + /* Iterate through all kernels. */ + idx = cnt++ % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. 
*/ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); + } + } while (!torture_must_stop()); + del_timer_sync(&timer); + destroy_timer_on_stack(&timer); + + torture_kthread_stopping("access_thread"); + return 0; +} + +__no_kcsan +static int test_init(struct kunit *test) +{ + unsigned long flags; + int nthreads; + int i; + + spin_lock_irqsave(&observed.lock, flags); + for (i = 0; i < ARRAY_SIZE(observed.lines); ++i) + observed.lines[i][0] = '\0'; + observed.nlines = 0; + spin_unlock_irqrestore(&observed.lock, flags); + + if (!torture_init_begin((char *)test->name, 1)) + return -EBUSY; + + if (!get_num_threads(test->name, &nthreads)) + goto err; + + if (WARN_ON(threads)) + goto err; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) { + if (WARN_ON(access_kernels[i])) + goto err; + } + + if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + /* + * Without any preemption, keep 2 CPUs free for other tasks, one + * of which is the main test case function checking for + * completion or failure. + */ + const int min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 
2 : 0; + const int min_required_cpus = 2 + min_unused_cpus; + + if (num_online_cpus() < min_required_cpus) { + pr_err("%s: too few online CPUs (%u < %d) for test", + test->name, num_online_cpus(), min_required_cpus); + goto err; + } else if (nthreads > num_online_cpus() - min_unused_cpus) { + nthreads = num_online_cpus() - min_unused_cpus; + pr_warn("%s: limiting number of threads to %d\n", + test->name, nthreads); + } + } + + if (nthreads) { + threads = kcalloc(nthreads + 1, sizeof(struct task_struct *), + GFP_KERNEL); + if (WARN_ON(!threads)) + goto err; + + threads[nthreads] = NULL; + for (i = 0; i < nthreads; ++i) { + if (torture_create_kthread(access_thread, NULL, + threads[i])) + goto err; + } + } + + torture_init_end(); + + return 0; + +err: + kfree(threads); + threads = NULL; + torture_init_end(); + return -EINVAL; +} + +__no_kcsan +static void test_exit(struct kunit *test) +{ + struct task_struct **stop_thread; + int i; + + if (torture_cleanup_begin()) + return; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) + WRITE_ONCE(access_kernels[i], NULL); + + if (threads) { + for (stop_thread = threads; *stop_thread; stop_thread++) + torture_stop_kthread(reader_thread, *stop_thread); + + kfree(threads); + threads = NULL; + } + + torture_cleanup_end(); +} + +static struct kunit_suite kcsan_test_suite = { + .name = "kcsan-test", + .test_cases = kcsan_test_cases, + .init = test_init, + .exit = test_exit, +}; +static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL }; + +__no_kcsan +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +__no_kcsan +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +/* + * We only want to do tracepoints setup and 
teardown once, therefore we have to + * customize the init and exit functions and cannot rely on kunit_test_suite(). + */ +static int __init kcsan_test_init(void) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return __kunit_test_suites_init(kcsan_test_suites); +} + +static void kcsan_test_exit(void) +{ + __kunit_test_suites_exit(kcsan_test_suites); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +late_initcall(kcsan_test_init); +module_exit(kcsan_test_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 763d6d08d94b..29480010dc30 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -9,6 +9,7 @@ #define _KERNEL_KCSAN_KCSAN_H #include <linux/kcsan.h> +#include <linux/sched.h> /* The number of adjacent watchpoints to check. */ #define KCSAN_CHECK_ADJACENT 1 @@ -23,6 +24,12 @@ extern unsigned int kcsan_udelay_interrupt; extern bool kcsan_enabled; /* + * Save/restore IRQ flags state trace dirtied by KCSAN. + */ +void kcsan_save_irqtrace(struct task_struct *task); +void kcsan_restore_irqtrace(struct task_struct *task); + +/* * Initialize debugfs file. */ void kcsan_debugfs_init(void); diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ac5f8345bae9..9d07e175de0f 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -308,6 +308,9 @@ static void print_verbose_info(struct task_struct *task) if (!task) return; + /* Restore IRQ state trace for printing. 
*/ + kcsan_restore_irqtrace(task); + pr_err("\n"); debug_show_held_locks(task); print_irqtrace_events(task); @@ -606,10 +609,11 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, goto out; /* - * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if - * we do not turn off lockdep here; this could happen due to recursion - * into lockdep via KCSAN if we detect a race in utilities used by - * lockdep. + * Because we may generate reports when we're in scheduler code, the use + * of printk() could deadlock. Until such time that all printing code + * called in print_report() is scheduler-safe, accept the risk, and just + * get our message out. As such, also disable lockdep to hide the + * warning, and avoid disabling lockdep for the rest of the kernel. */ lockdep_off(); diff --git a/kernel/kcsan/test.c b/kernel/kcsan/selftest.c index d26a052d3383..d26a052d3383 100644 --- a/kernel/kcsan/test.c +++ b/kernel/kcsan/selftest.c diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a904cc56d68..e87679a48ba2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/jump_label.h> +#include <linux/perf_event.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -123,6 +124,7 @@ struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_INSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -188,6 +190,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->cache = c; list_add_rcu(&kip->list, &c->pages); slot = kip->insns; + + /* Record the perf ksymbol register event after adding the page */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, + PAGE_SIZE, false, c->sym); out: mutex_unlock(&c->mutex); return slot; @@ -206,6 +212,13 @@ static int 
collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { + /* + * Record perf ksymbol unregister event before removing + * the page. + */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + (unsigned long)kip->insns, PAGE_SIZE, true, + kip->cache->sym); list_del_rcu(&kip->list); synchronize_rcu(); kip->cache->free(kip->insns); @@ -295,12 +308,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) return ret; } +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym) +{ + struct kprobe_insn_page *kip; + int ret = -ERANGE; + + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if ((*symnum)--) + continue; + strlcpy(sym, c->sym, KSYM_NAME_LEN); + *type = 't'; + *value = (unsigned long)kip->insns; + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_OPTINSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, @@ -563,8 +598,6 @@ static void kprobe_optimizer(struct work_struct *work) mutex_lock(&kprobe_mutex); cpus_read_lock(); mutex_lock(&text_mutex); - /* Lock modules while optimizing kprobes */ - mutex_lock(&module_mutex); /* * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) @@ -589,7 +622,6 @@ static void kprobe_optimizer(struct work_struct *work) /* Step 4: Free cleaned kprobes after quiesence period */ do_free_cleaned_kprobes(); - mutex_unlock(&module_mutex); mutex_unlock(&text_mutex); cpus_read_unlock(); @@ -2232,6 +2264,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry) kprobe_remove_area_blacklist(entry, entry + 1); } +int __weak 
arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} + +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT + if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym)) + return 0; +#ifdef CONFIG_OPTPROBES + if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym)) + return 0; +#endif +#endif + if (!arch_kprobe_get_kallsym(&symnum, value, type, sym)) + return 0; + return -ERANGE; +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2448,7 +2502,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; - if (!kallsyms_show_value()) + if (!kallsyms_show_value(pi->file->f_cred)) addr = NULL; if (sym) @@ -2540,7 +2594,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) * If /proc/kallsyms is not showing kernel address, we won't * show them here either. */ - if (!kallsyms_show_value()) + if (!kallsyms_show_value(m->file->f_cred)) seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, (void *)ent->start_addr); else diff --git a/kernel/kthread.c b/kernel/kthread.c index 132f84a5fde3..1d9e2fdfd67a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> +#include <linux/sched/isolation.h> #include <trace/events/sched.h> @@ -383,7 +384,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(task, SCHED_NORMAL, &param); - set_cpus_allowed_ptr(task, cpu_all_mask); + set_cpus_allowed_ptr(task, + housekeeping_cpumask(HK_FLAG_KTHREAD)); } kfree(create); return task; @@ -608,7 +610,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. 
*/ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, cpu_all_mask); + set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 84ed1d1d5013..2fad21d345b0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -395,7 +395,7 @@ void lockdep_init_task(struct task_struct *task) static __always_inline void lockdep_recursion_finish(void) { - if (WARN_ON_ONCE(--current->lockdep_recursion)) + if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK)) current->lockdep_recursion = 0; } @@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, + lockdep_hardirqs_enabled(), curr->softirqs_enabled); print_lock(next); @@ -3331,9 +3331,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), lockdep_softirqs_enabled(curr)); print_lock(this); @@ -3484,19 +3484,21 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { - printk("irq event stamp: %u\n", curr->irq_events); + const struct irqtrace_events *trace = &curr->irqtrace; + + printk("irq event stamp: 
%u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", - curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, - (void *)curr->hardirq_enable_ip); + trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, + (void *)trace->hardirq_enable_ip); printk("hardirqs last disabled at (%u): [<%px>] %pS\n", - curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, - (void *)curr->hardirq_disable_ip); + trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip, + (void *)trace->hardirq_disable_ip); printk("softirqs last enabled at (%u): [<%px>] %pS\n", - curr->softirq_enable_event, (void *)curr->softirq_enable_ip, - (void *)curr->softirq_enable_ip); + trace->softirq_enable_event, (void *)trace->softirq_enable_ip, + (void *)trace->softirq_enable_ip); printk("softirqs last disabled at (%u): [<%px>] %pS\n", - curr->softirq_disable_event, (void *)curr->softirq_disable_ip, - (void *)curr->softirq_disable_ip); + trace->softirq_disable_event, (void *)trace->softirq_disable_ip, + (void *)trace->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3646,10 +3648,19 @@ static void __trace_hardirqs_on_caller(void) */ void lockdep_hardirqs_on_prepare(unsigned long ip) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * NMIs do not (and cannot) track lock dependencies, nothing to do. + */ + if (unlikely(in_nmi())) + return; + + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (unlikely(current->hardirqs_enabled)) { + if (unlikely(lockdep_hardirqs_enabled())) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3677,7 +3688,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. 
*/ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())) return; current->hardirq_chain_key = current->curr_chain_key; @@ -3690,12 +3701,35 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); void noinstr lockdep_hardirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = &current->irqtrace; + + if (unlikely(!debug_locks)) + return; + + /* + * NMIs can happen in the middle of local_irq_{en,dis}able() where the + * tracking state and hardware state are out of sync. + * + * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from, + * and not rely on hardware state like normal interrupts. + */ + if (unlikely(in_nmi())) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + + /* + * Skip: + * - recursion check, because NMI can hit lockdep; + * - hardware state check, because above; + * - chain_key check, see lockdep_hardirqs_on_prepare(). + */ + goto skip_checks; + } - if (unlikely(!debug_locks || curr->lockdep_recursion)) + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled()) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3720,10 +3754,11 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != current->curr_chain_key); +skip_checks: /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; + this_cpu_write(hardirqs_enabled, 1); + trace->hardirq_enable_ip = ip; + trace->hardirq_enable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_on_events); } EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); @@ -3733,9 +3768,18 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); */ void noinstr lockdep_hardirqs_off(unsigned long ip) { - struct task_struct *curr = current; + if (unlikely(!debug_locks)) + 
return; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + /* + * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep; + * they will restore the software state. This ensures the software + * state is consistent inside NMIs as well. + */ + if (in_nmi()) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) return; /* @@ -3745,13 +3789,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled()) { + struct irqtrace_events *trace = &current->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; + this_cpu_write(hardirqs_enabled, 0); + trace->hardirq_disable_ip = ip; + trace->hardirq_disable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_off_events); } else { debug_atomic_inc(redundant_hardirqs_off); @@ -3764,7 +3810,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); */ void lockdep_softirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = &current->irqtrace; if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3776,7 +3822,7 @@ void lockdep_softirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { debug_atomic_inc(redundant_softirqs_on); return; } @@ -3785,17 +3831,17 @@ void lockdep_softirqs_on(unsigned long ip) /* * We'll do an OFF -> ON transition: */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; + current->softirqs_enabled = 1; + trace->softirq_enable_ip = ip; + trace->softirq_enable_event = ++trace->irq_events; debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the * usage bit for all held locks, if 
hardirqs are * enabled too: */ - if (curr->hardirqs_enabled) - mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); + if (lockdep_hardirqs_enabled()) + mark_held_locks(current, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3804,8 +3850,6 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3815,13 +3859,15 @@ void lockdep_softirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { + struct irqtrace_events *trace = &current->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; + current->softirqs_enabled = 0; + trace->softirq_disable_ip = ip; + trace->softirq_disable_event = ++trace->irq_events; debug_atomic_inc(softirqs_off_events); /* * Whoops, we wanted softirqs off, so why aren't they? 
@@ -3843,7 +3889,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3852,7 +3898,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (curr->hardirq_context) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3890,7 +3936,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -3983,7 +4029,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (curr->hardirq_context) { + if (lockdep_hardirq_context()) { /* * Check if force_irqthreads will run us threaded. */ @@ -4826,11 +4872,11 @@ static void check_flags(unsigned long flags) return; if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-on.\n"); } } @@ -5851,9 +5897,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : !rcu_is_watching() - ? 
"RCU used illegally from idle CPU!\n" - : "", + : "", rcu_scheduler_active, debug_locks); /* diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5efbfc68ce99..8ff6f50e06a0 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -631,13 +631,13 @@ static int lock_torture_writer(void *arg) cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; - lock_is_write_held = 1; + lock_is_write_held = true; if (WARN_ON_ONCE(lock_is_read_held)) lwsp->n_lock_fail++; /* rare, but... */ lwsp->n_lock_acquired++; cxt.cur_ops->write_delay(&rand); - lock_is_write_held = 0; + lock_is_write_held = false; cxt.cur_ops->writeunlock(); stutter_wait("lock_torture_writer"); @@ -665,13 +665,13 @@ static int lock_torture_reader(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->readlock(); - lock_is_read_held = 1; + lock_is_read_held = true; if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... */ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = 0; + lock_is_read_held = false; cxt.cur_ops->readunlock(); stutter_wait("lock_torture_reader"); @@ -686,7 +686,7 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { - bool fail = 0; + bool fail = false; int i, n_stress; long max = 0, min = statp ? statp[0].n_lock_acquired : 0; long long sum = 0; @@ -904,7 +904,7 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. 
*/ if (nwriters_stress) { - lock_is_write_held = 0; + lock_is_write_held = false; cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, sizeof(*cxt.lwsa), GFP_KERNEL); @@ -935,7 +935,7 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = 0; + lock_is_read_held = false; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 1f7734949ac8..1de006ed3aa8 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -154,7 +154,11 @@ bool osq_lock(struct optimistic_spin_queue *lock) */ for (;;) { - if (prev->next == node && + /* + * cpu_relax() below implies a compiler barrier which would + * prevent this comparison being optimized away. + */ + if (data_race(prev->next) == node && cmpxchg(&prev->next, node, NULL) == node) break; diff --git a/kernel/module.c b/kernel/module.c index 0c6573b98c36..aa183c9ac0a2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1510,8 +1510,7 @@ static inline bool sect_empty(const Elf_Shdr *sect) } struct module_sect_attr { - struct module_attribute mattr; - char *name; + struct bin_attribute battr; unsigned long address; }; @@ -1521,13 +1520,18 @@ struct module_sect_attrs { struct module_sect_attr attrs[]; }; -static ssize_t module_sect_show(struct module_attribute *mattr, - struct module_kobject *mk, char *buf) +static ssize_t module_sect_read(struct file *file, struct kobject *kobj, + struct bin_attribute *battr, + char *buf, loff_t pos, size_t count) { struct module_sect_attr *sattr = - container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%px\n", kptr_restrict < 2 ? - (void *)sattr->address : NULL); + container_of(battr, struct module_sect_attr, battr); + + if (pos != 0) + return -EINVAL; + + return sprintf(buf, "0x%px\n", + kallsyms_show_value(file->f_cred) ? 
(void *)sattr->address : NULL); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -1535,7 +1539,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) unsigned int section; for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].name); + kfree(sect_attrs->attrs[section].battr.attr.name); kfree(sect_attrs); } @@ -1544,42 +1548,41 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; struct module_sect_attr *sattr; - struct attribute **gattr; + struct bin_attribute **gattr; /* Count loaded sections and allocate structures */ for (i = 0; i < info->hdr->e_shnum; i++) if (!sect_empty(&info->sechdrs[i])) nloaded++; size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), - sizeof(sect_attrs->grp.attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); + sizeof(sect_attrs->grp.bin_attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); if (sect_attrs == NULL) return; /* Setup section attributes. 
*/ sect_attrs->grp.name = "sections"; - sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; + sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; sect_attrs->nsections = 0; sattr = &sect_attrs->attrs[0]; - gattr = &sect_attrs->grp.attrs[0]; + gattr = &sect_attrs->grp.bin_attrs[0]; for (i = 0; i < info->hdr->e_shnum; i++) { Elf_Shdr *sec = &info->sechdrs[i]; if (sect_empty(sec)) continue; + sysfs_bin_attr_init(&sattr->battr); sattr->address = sec->sh_addr; - sattr->name = kstrdup(info->secstrings + sec->sh_name, - GFP_KERNEL); - if (sattr->name == NULL) + sattr->battr.attr.name = + kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); + if (sattr->battr.attr.name == NULL) goto out; sect_attrs->nsections++; - sysfs_attr_init(&sattr->mattr.attr); - sattr->mattr.show = module_sect_show; - sattr->mattr.store = NULL; - sattr->mattr.attr.name = sattr->name; - sattr->mattr.attr.mode = S_IRUSR; - *(gattr++) = &(sattr++)->mattr.attr; + sattr->battr.read = module_sect_read; + sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4); + sattr->battr.attr.mode = 0400; + *(gattr++) = &(sattr++)->battr; } *gattr = NULL; @@ -1669,7 +1672,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) continue; if (info->sechdrs[i].sh_type == SHT_NOTE) { sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].name; + nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; nattr->attr.mode = S_IRUGO; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *) info->sechdrs[i].sh_addr; @@ -2785,7 +2788,7 @@ void * __weak module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __func__); + NUMA_NO_NODE, __builtin_return_address(0)); } bool __weak module_init_section(const char *name) @@ -4379,7 +4382,7 @@ static int modules_open(struct inode *inode, struct file *file) if (!err) { struct seq_file *m = 
file->private_data; - m->private = kallsyms_show_value() ? NULL : (void *)8ul; + m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; } return err; diff --git a/kernel/padata.c b/kernel/padata.c index 29fc5d87a4cd..16cb894dc272 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -250,13 +250,11 @@ EXPORT_SYMBOL(padata_do_parallel); static struct padata_priv *padata_find_next(struct parallel_data *pd, bool remove_object) { - struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; int cpu = pd->cpu; - next_queue = per_cpu_ptr(pd->pqueue, cpu); - reorder = &next_queue->reorder; + reorder = per_cpu_ptr(pd->reorder_list, cpu); spin_lock(&reorder->lock); if (list_empty(&reorder->list)) { @@ -291,7 +289,7 @@ static void padata_reorder(struct parallel_data *pd) int cb_cpu; struct padata_priv *padata; struct padata_serial_queue *squeue; - struct padata_parallel_queue *next_queue; + struct padata_list *reorder; /* * We need to ensure that only one cpu can work on dequeueing of @@ -335,13 +333,12 @@ static void padata_reorder(struct parallel_data *pd) * * Ensure reorder queue is read after pd->lock is dropped so we see * new objects from another task in padata_do_serial. Pairs with - * smp_mb__after_atomic in padata_do_serial. + * smp_mb in padata_do_serial. 
*/ smp_mb(); - next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); - if (!list_empty(&next_queue->reorder.list) && - padata_find_next(pd, false)) + reorder = per_cpu_ptr(pd->reorder_list, pd->cpu); + if (!list_empty(&reorder->list) && padata_find_next(pd, false)) queue_work(pinst->serial_wq, &pd->reorder_work); } @@ -401,24 +398,23 @@ void padata_do_serial(struct padata_priv *padata) { struct parallel_data *pd = padata->pd; int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); - struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, - hashed_cpu); + struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu); struct padata_priv *cur; - spin_lock(&pqueue->reorder.lock); + spin_lock(&reorder->lock); /* Sort in ascending order of sequence number. */ - list_for_each_entry_reverse(cur, &pqueue->reorder.list, list) + list_for_each_entry_reverse(cur, &reorder->list, list) if (cur->seq_nr < padata->seq_nr) break; list_add(&padata->list, &cur->list); - spin_unlock(&pqueue->reorder.lock); + spin_unlock(&reorder->lock); /* * Ensure the addition to the reorder list is ordered correctly * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb * in padata_reorder. 
*/ - smp_mb__after_atomic(); + smp_mb(); padata_reorder(pd); } @@ -441,28 +437,6 @@ static int padata_setup_cpumasks(struct padata_instance *pinst) return err; } -static int pd_setup_cpumasks(struct parallel_data *pd, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) -{ - int err = -ENOMEM; - - if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) - goto free_pcpu_mask; - - cpumask_copy(pd->cpumask.pcpu, pcpumask); - cpumask_copy(pd->cpumask.cbcpu, cbcpumask); - - return 0; - -free_pcpu_mask: - free_cpumask_var(pd->cpumask.pcpu); -out: - return err; -} - static void __init padata_mt_helper(struct work_struct *w) { struct padata_work *pw = container_of(w, struct padata_work, pw_work); @@ -575,17 +549,15 @@ static void padata_init_squeues(struct parallel_data *pd) } } -/* Initialize all percpu queues used by parallel workers */ -static void padata_init_pqueues(struct parallel_data *pd) +/* Initialize per-CPU reorder lists */ +static void padata_init_reorder_list(struct parallel_data *pd) { int cpu; - struct padata_parallel_queue *pqueue; + struct padata_list *list; for_each_cpu(cpu, pd->cpumask.pcpu) { - pqueue = per_cpu_ptr(pd->pqueue, cpu); - - __padata_list_init(&pqueue->reorder); - atomic_set(&pqueue->num_obj, 0); + list = per_cpu_ptr(pd->reorder_list, cpu); + __padata_list_init(list); } } @@ -593,30 +565,31 @@ static void padata_init_pqueues(struct parallel_data *pd) static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) { struct padata_instance *pinst = ps->pinst; - const struct cpumask *cbcpumask; - const struct cpumask *pcpumask; struct parallel_data *pd; - cbcpumask = pinst->rcpumask.cbcpu; - pcpumask = pinst->rcpumask.pcpu; - pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); if (!pd) goto err; - pd->pqueue = alloc_percpu(struct padata_parallel_queue); - if (!pd->pqueue) + pd->reorder_list = alloc_percpu(struct padata_list); + if (!pd->reorder_list) 
goto err_free_pd; pd->squeue = alloc_percpu(struct padata_serial_queue); if (!pd->squeue) - goto err_free_pqueue; + goto err_free_reorder_list; pd->ps = ps; - if (pd_setup_cpumasks(pd, pcpumask, cbcpumask)) + + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) goto err_free_squeue; + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) + goto err_free_pcpu; - padata_init_pqueues(pd); + cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); + cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); + + padata_init_reorder_list(pd); padata_init_squeues(pd); pd->seq_nr = -1; atomic_set(&pd->refcnt, 1); @@ -626,10 +599,12 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) return pd; +err_free_pcpu: + free_cpumask_var(pd->cpumask.pcpu); err_free_squeue: free_percpu(pd->squeue); -err_free_pqueue: - free_percpu(pd->pqueue); +err_free_reorder_list: + free_percpu(pd->reorder_list); err_free_pd: kfree(pd); err: @@ -640,7 +615,7 @@ static void padata_free_pd(struct parallel_data *pd) { free_cpumask_var(pd->cpumask.pcpu); free_cpumask_var(pd->cpumask.cbcpu); - free_percpu(pd->pqueue); + free_percpu(pd->reorder_list); free_percpu(pd->squeue); kfree(pd); } @@ -682,12 +657,6 @@ static int padata_replace(struct padata_instance *pinst) pinst->flags |= PADATA_RESET; - cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu, - cpu_online_mask); - - cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu, - cpu_online_mask); - list_for_each_entry(ps, &pinst->pslist, list) { err = padata_replace_one(ps); if (err) @@ -789,43 +758,6 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); -/** - * padata_start - start the parallel processing - * - * @pinst: padata instance to start - * - * Return: 0 on success or negative error code - */ -int padata_start(struct padata_instance *pinst) -{ - int err = 0; - - mutex_lock(&pinst->lock); - - if (pinst->flags & PADATA_INVALID) - err = -EINVAL; - - __padata_start(pinst); - - 
mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_start); - -/** - * padata_stop - stop the parallel processing - * - * @pinst: padata instance to stop - */ -void padata_stop(struct padata_instance *pinst) -{ - mutex_lock(&pinst->lock); - __padata_stop(pinst); - mutex_unlock(&pinst->lock); -} -EXPORT_SYMBOL(padata_stop); - #ifdef CONFIG_HOTPLUG_CPU static int __padata_add_cpu(struct padata_instance *pinst, int cpu) @@ -907,9 +839,6 @@ static void __padata_free(struct padata_instance *pinst) WARN_ON(!list_empty(&pinst->pslist)); - padata_stop(pinst); - free_cpumask_var(pinst->rcpumask.cbcpu); - free_cpumask_var(pinst->rcpumask.pcpu); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); destroy_workqueue(pinst->serial_wq); @@ -1044,18 +973,12 @@ static struct kobj_type padata_attr_type = { }; /** - * padata_alloc - allocate and initialize a padata instance and specify - * cpumasks for serial and parallel workers. - * + * padata_alloc - allocate and initialize a padata instance * @name: used to identify the instance - * @pcpumask: cpumask that will be used for padata parallelization - * @cbcpumask: cpumask that will be used for padata serialization * * Return: new instance on success, NULL on error */ -static struct padata_instance *padata_alloc(const char *name, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +struct padata_instance *padata_alloc(const char *name) { struct padata_instance *pinst; @@ -1081,26 +1004,16 @@ static struct padata_instance *padata_alloc(const char *name, free_cpumask_var(pinst->cpumask.pcpu); goto err_free_serial_wq; } - if (!padata_validate_cpumask(pinst, pcpumask) || - !padata_validate_cpumask(pinst, cbcpumask)) - goto err_free_masks; - - if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL)) - goto err_free_masks; - if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL)) - goto err_free_rcpumask_pcpu; INIT_LIST_HEAD(&pinst->pslist); - 
cpumask_copy(pinst->cpumask.pcpu, pcpumask); - cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask); - cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask); + cpumask_copy(pinst->cpumask.pcpu, cpu_possible_mask); + cpumask_copy(pinst->cpumask.cbcpu, cpu_possible_mask); if (padata_setup_cpumasks(pinst)) - goto err_free_rcpumask_cbcpu; + goto err_free_masks; - pinst->flags = 0; + __padata_start(pinst); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); @@ -1116,10 +1029,6 @@ static struct padata_instance *padata_alloc(const char *name, return pinst; -err_free_rcpumask_cbcpu: - free_cpumask_var(pinst->rcpumask.cbcpu); -err_free_rcpumask_pcpu: - free_cpumask_var(pinst->rcpumask.pcpu); err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); @@ -1133,21 +1042,7 @@ err_free_inst: err: return NULL; } - -/** - * padata_alloc_possible - Allocate and initialize padata instance. - * Use the cpu_possible_mask for serial and - * parallel workers. - * - * @name: used to identify the instance - * - * Return: new instance on success, NULL on error - */ -struct padata_instance *padata_alloc_possible(const char *name) -{ - return padata_alloc(name, cpu_possible_mask, cpu_possible_mask); -} -EXPORT_SYMBOL(padata_alloc_possible); +EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0a9326f5f421..c1ff7fa030ab 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -1,9 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Energy Model of CPUs + * Energy Model of devices * - * Copyright (c) 2018, Arm ltd. + * Copyright (c) 2018-2020, Arm ltd. * Written by: Quentin Perret, Arm ltd. + * Improvements provided by: Lukasz Luba, Arm ltd. 
*/ #define pr_fmt(fmt) "energy_model: " fmt @@ -15,30 +16,32 @@ #include <linux/sched/topology.h> #include <linux/slab.h> -/* Mapping of each CPU to the performance domain to which it belongs. */ -static DEFINE_PER_CPU(struct em_perf_domain *, em_data); - /* * Mutex serializing the registrations of performance domains and letting * callbacks defined by drivers sleep. */ static DEFINE_MUTEX(em_pd_mutex); +static bool _is_cpu_device(struct device *dev) +{ + return (dev->bus == &cpu_subsys); +} + #ifdef CONFIG_DEBUG_FS static struct dentry *rootdir; -static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd) +static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) { struct dentry *d; char name[24]; - snprintf(name, sizeof(name), "cs:%lu", cs->frequency); + snprintf(name, sizeof(name), "ps:%lu", ps->frequency); - /* Create per-cs directory */ + /* Create per-ps directory */ d = debugfs_create_dir(name, pd); - debugfs_create_ulong("frequency", 0444, d, &cs->frequency); - debugfs_create_ulong("power", 0444, d, &cs->power); - debugfs_create_ulong("cost", 0444, d, &cs->cost); + debugfs_create_ulong("frequency", 0444, d, &ps->frequency); + debugfs_create_ulong("power", 0444, d, &ps->power); + debugfs_create_ulong("cost", 0444, d, &ps->cost); } static int em_debug_cpus_show(struct seq_file *s, void *unused) @@ -49,22 +52,30 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) +static void em_debug_create_pd(struct device *dev) { struct dentry *d; - char name[8]; int i; - snprintf(name, sizeof(name), "pd%d", cpu); - /* Create the directory of the performance domain */ - d = debugfs_create_dir(name, rootdir); + d = debugfs_create_dir(dev_name(dev), rootdir); - debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); + if (_is_cpu_device(dev)) + debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, + 
&em_debug_cpus_fops); + + /* Create a sub-directory for each performance state */ + for (i = 0; i < dev->em_pd->nr_perf_states; i++) + em_debug_create_ps(&dev->em_pd->table[i], d); - /* Create a sub-directory for each capacity state */ - for (i = 0; i < pd->nr_cap_states; i++) - em_debug_create_cs(&pd->table[i], d); +} + +static void em_debug_remove_pd(struct device *dev) +{ + struct dentry *debug_dir; + + debug_dir = debugfs_lookup(dev_name(dev), rootdir); + debugfs_remove_recursive(debug_dir); } static int __init em_debug_init(void) @@ -76,58 +87,55 @@ static int __init em_debug_init(void) } core_initcall(em_debug_init); #else /* CONFIG_DEBUG_FS */ -static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} +static void em_debug_create_pd(struct device *dev) {} +static void em_debug_remove_pd(struct device *dev) {} #endif -static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, - struct em_data_callback *cb) + +static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, + int nr_states, struct em_data_callback *cb) { unsigned long opp_eff, prev_opp_eff = ULONG_MAX; unsigned long power, freq, prev_freq = 0; - int i, ret, cpu = cpumask_first(span); - struct em_cap_state *table; - struct em_perf_domain *pd; + struct em_perf_state *table; + int i, ret; u64 fmax; - if (!cb->active_power) - return NULL; - - pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); - if (!pd) - return NULL; - table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); if (!table) - goto free_pd; + return -ENOMEM; - /* Build the list of capacity states for this performance domain */ + /* Build the list of performance states for this performance domain */ for (i = 0, freq = 0; i < nr_states; i++, freq++) { /* * active_power() is a driver callback which ceils 'freq' to - * lowest capacity state of 'cpu' above 'freq' and updates + * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. 
*/ - ret = cb->active_power(&power, &freq, cpu); + ret = cb->active_power(&power, &freq, dev); if (ret) { - pr_err("pd%d: invalid cap. state: %d\n", cpu, ret); - goto free_cs_table; + dev_err(dev, "EM: invalid perf. state: %d\n", + ret); + goto free_ps_table; } /* * We expect the driver callback to increase the frequency for - * higher capacity states. + * higher performance states. */ if (freq <= prev_freq) { - pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); - goto free_cs_table; + dev_err(dev, "EM: non-increasing freq: %lu\n", + freq); + goto free_ps_table; } /* * The power returned by active_state() is expected to be * positive, in milli-watts and to fit into 16 bits. */ - if (!power || power > EM_CPU_MAX_POWER) { - pr_err("pd%d: invalid power: %lu\n", cpu, power); - goto free_cs_table; + if (!power || power > EM_MAX_POWER) { + dev_err(dev, "EM: invalid power: %lu\n", + power); + goto free_ps_table; } table[i].power = power; @@ -141,12 +149,12 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, */ opp_eff = freq / power; if (opp_eff >= prev_opp_eff) - pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n", - cpu, i, i - 1); + dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", + i, i - 1); prev_opp_eff = opp_eff; } - /* Compute the cost of each capacity_state. */ + /* Compute the cost of each performance state. 
*/ fmax = (u64) table[nr_states - 1].frequency; for (i = 0; i < nr_states; i++) { table[i].cost = div64_u64(fmax * table[i].power, @@ -154,39 +162,94 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, } pd->table = table; - pd->nr_cap_states = nr_states; - cpumask_copy(to_cpumask(pd->cpus), span); - - em_debug_create_pd(pd, cpu); + pd->nr_perf_states = nr_states; - return pd; + return 0; -free_cs_table: +free_ps_table: kfree(table); -free_pd: - kfree(pd); + return -EINVAL; +} + +static int em_create_pd(struct device *dev, int nr_states, + struct em_data_callback *cb, cpumask_t *cpus) +{ + struct em_perf_domain *pd; + struct device *cpu_dev; + int cpu, ret; + + if (_is_cpu_device(dev)) { + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); + if (!pd) + return -ENOMEM; + + cpumask_copy(em_span_cpus(pd), cpus); + } else { + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return -ENOMEM; + } + + ret = em_create_perf_table(dev, pd, nr_states, cb); + if (ret) { + kfree(pd); + return ret; + } + + if (_is_cpu_device(dev)) + for_each_cpu(cpu, cpus) { + cpu_dev = get_cpu_device(cpu); + cpu_dev->em_pd = pd; + } + + dev->em_pd = pd; + + return 0; +} + +/** + * em_pd_get() - Return the performance domain for a device + * @dev : Device to find the performance domain for + * + * Returns the performance domain to which @dev belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_pd_get(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev)) + return NULL; - return NULL; + return dev->em_pd; } +EXPORT_SYMBOL_GPL(em_pd_get); /** * em_cpu_get() - Return the performance domain for a CPU * @cpu : CPU to find the performance domain for * - * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't + * Returns the performance domain to which @cpu belongs, or NULL if it doesn't * exist. 
*/ struct em_perf_domain *em_cpu_get(int cpu) { - return READ_ONCE(per_cpu(em_data, cpu)); + struct device *cpu_dev; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return NULL; + + return em_pd_get(cpu_dev); } EXPORT_SYMBOL_GPL(em_cpu_get); /** - * em_register_perf_domain() - Register the Energy Model of a performance domain - * @span : Mask of CPUs in the performance domain - * @nr_states : Number of capacity states to register + * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device + * @dev : Device for which the EM is to register + * @nr_states : Number of performance states to register * @cb : Callback functions providing the data of the Energy Model + * @cpus : Pointer to cpumask_t, which in case of a CPU device is + * obligatory. It can be taken from i.e. 'policy->cpus'. For other + * type of devices this should be set to NULL. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. @@ -196,14 +259,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * * Return 0 on success */ -int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb) +int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, + struct em_data_callback *cb, cpumask_t *cpus) { unsigned long cap, prev_cap = 0; - struct em_perf_domain *pd; - int cpu, ret = 0; + int cpu, ret; - if (!span || !nr_states || !cb) + if (!dev || !nr_states || !cb) return -EINVAL; /* @@ -212,47 +274,79 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, */ mutex_lock(&em_pd_mutex); - for_each_cpu(cpu, span) { - /* Make sure we don't register again an existing domain. */ - if (READ_ONCE(per_cpu(em_data, cpu))) { - ret = -EEXIST; - goto unlock; - } + if (dev->em_pd) { + ret = -EEXIST; + goto unlock; + } - /* - * All CPUs of a domain must have the same micro-architecture - * since they all share the same table. 
- */ - cap = arch_scale_cpu_capacity(cpu); - if (prev_cap && prev_cap != cap) { - pr_err("CPUs of %*pbl must have the same capacity\n", - cpumask_pr_args(span)); + if (_is_cpu_device(dev)) { + if (!cpus) { + dev_err(dev, "EM: invalid CPU mask\n"); ret = -EINVAL; goto unlock; } - prev_cap = cap; + + for_each_cpu(cpu, cpus) { + if (em_cpu_get(cpu)) { + dev_err(dev, "EM: exists for CPU%d\n", cpu); + ret = -EEXIST; + goto unlock; + } + /* + * All CPUs of a domain must have the same + * micro-architecture since they all share the same + * table. + */ + cap = arch_scale_cpu_capacity(cpu); + if (prev_cap && prev_cap != cap) { + dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n", + cpumask_pr_args(cpus)); + + ret = -EINVAL; + goto unlock; + } + prev_cap = cap; + } } - /* Create the performance domain and add it to the Energy Model. */ - pd = em_create_pd(span, nr_states, cb); - if (!pd) { - ret = -EINVAL; + ret = em_create_pd(dev, nr_states, cb, cpus); + if (ret) goto unlock; - } - for_each_cpu(cpu, span) { - /* - * The per-cpu array can be read concurrently from em_cpu_get(). - * The barrier enforces the ordering needed to make sure readers - * can only access well formed em_perf_domain structs. - */ - smp_store_release(per_cpu_ptr(&em_data, cpu), pd); - } + em_debug_create_pd(dev); + dev_info(dev, "EM: created perf domain\n"); - pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span)); unlock: mutex_unlock(&em_pd_mutex); - return ret; } -EXPORT_SYMBOL_GPL(em_register_perf_domain); +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device + * @dev : Device for which the EM is registered + * + * Unregister the EM for the specified @dev (but not a CPU device). 
+ */ +void em_dev_unregister_perf_domain(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev) || !dev->em_pd) + return; + + if (_is_cpu_device(dev)) + return; + + /* + * The mutex separates all register/unregister requests and protects + * from potential clean-up/setup issues in the debugfs directories. + * The debugfs directory name is the same as device's name. + */ + mutex_lock(&em_pd_mutex); + em_debug_remove_pd(dev); + + kfree(dev->em_pd->table); + kfree(dev->em_pd); + dev->em_pd = NULL; + mutex_unlock(&em_pd_mutex); +} +EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 02ec716a4927..5714f51ba9f8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1062,7 +1062,7 @@ power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), + return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } @@ -1162,7 +1162,7 @@ static ssize_t reserved_size_store(struct kobject *kobj, power_attr(reserved_size); -static struct attribute * g[] = { +static struct attribute *g[] = { &disk_attr.attr, &resume_offset_attr.attr, &resume_attr.attr, @@ -1190,7 +1190,7 @@ static int __init resume_setup(char *str) if (noresume) return 1; - strncpy( resume_file, str, 255 ); + strncpy(resume_file, str, 255); return 1; } diff --git a/kernel/power/power.h b/kernel/power/power.h index ba2094db6294..32fc89ac96c3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -32,7 +32,7 @@ static inline int init_header_complete(struct swsusp_info *info) return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); } -static inline char *check_image_kernel(struct swsusp_info *info) +static inline const char *check_image_kernel(struct swsusp_info *info) { return arch_hibernation_header_restore(info) ? 
"architecture specific data" : NULL; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 881128b9351e..cef154261fe2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2023,7 +2023,7 @@ static int init_header_complete(struct swsusp_info *info) return 0; } -static char *check_image_kernel(struct swsusp_info *info) +static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; @@ -2176,7 +2176,7 @@ static void mark_unsafe_pages(struct memory_bitmap *bm) static int check_header(struct swsusp_info *info) { - char *reason; + const char *reason; reason = check_image_kernel(info); if (!reason && info->num_physpages != get_num_physpages()) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 452feae8de20..3cf6132a4bb9 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -61,6 +61,25 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_REF_SCALE_TEST + tristate "Scalability tests for read-side synchronization (RCU and others)" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU + default n + help + This option provides a kernel module that runs performance tests + useful comparing RCU with various read-side synchronization mechanisms. + The kernel module may be built after the fact on the running kernel to be + tested, if desired. + + Say Y here if you want these performance tests built into the kernel. + Say M if you want to build it as a module instead. + Say N if you are unsure. 
+ config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index f91f2c2cf138..95f5117ef8da 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 16dd1e6b7c09..ec903d781778 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); * value specified by nr_cpus for a read-only test. * * Various other use cases may of course be specified. + * + * Note that this test's readers are intended only as a test load for + * the writers. The reader performance statistics will be overly + * pessimistic due to the per-critical-section interrupt disabling, + * test-end checks, and the pair of calls through pointers. */ #ifdef MODULE @@ -309,8 +314,10 @@ static void rcu_perf_wait_shutdown(void) } /* - * RCU perf reader kthread. Repeatedly does empty RCU read-side - * critical section, minimizing update-side interference. + * RCU perf reader kthread. Repeatedly does empty RCU read-side critical + * section, minimizing update-side interference. However, the point of + * this test is not to evaluate reader performance, but instead to serve + * as a test load for update-side performance testing. 
*/ static int rcu_perf_reader(void *arg) @@ -576,11 +583,8 @@ static int compute_real(int n) static int rcu_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= - nrealwriters); - } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ rcu_perf_cleanup(); kernel_power_off(); @@ -693,11 +697,8 @@ kfree_perf_cleanup(void) static int kfree_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= - kfree_nrealthreads); - } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads); + wait_event(shutdown_wq, + atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ @@ -723,7 +724,7 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } - pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj)); + pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj)); kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efb792e13fca..d0d265304d14 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -7,7 +7,7 @@ * Authors: Paul E. 
McKenney <paulmck@linux.ibm.com> * Josh Triplett <josh@joshtriplett.org> * - * See also: Documentation/RCU/torture.txt + * See also: Documentation/RCU/torture.rst */ #define pr_fmt(fmt) fmt @@ -109,6 +109,10 @@ torture_param(int, object_debug, 0, torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, read_exit_delay, 13, + "Delay between read-then-exit episodes (s)"); +torture_param(int, read_exit_burst, 16, + "# of read-then-exit bursts per episode, zero to disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -146,6 +150,7 @@ static struct task_struct *stall_task; static struct task_struct *fwd_prog_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; +static struct task_struct *read_exit_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -177,6 +182,7 @@ static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed? */ +static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; @@ -1166,6 +1172,7 @@ rcu_torture_writer(void *arg) WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } } while (!torture_must_stop()); + rcu_torture_current = NULL; // Let stats task know that we are done. /* Reset expediting back to unexpedited. 
*/ if (expediting > 0) expediting = -expediting; @@ -1370,6 +1377,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) struct rt_read_seg *rtrsp1; unsigned long long ts; + WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); started = cur_ops->get_gp_seq(); @@ -1539,10 +1547,11 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld\n", + pr_cont("barrier: %ld/%ld:%ld ", data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); + pr_cont("read-exits: %ld\n", data_race(n_read_exits)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1634,7 +1643,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d " "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", + "onoff_interval=%d onoff_holdoff=%d " + "read_exit_delay=%d read_exit_burst=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -1643,7 +1653,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, n_barrier_cbs, - onoff_interval, onoff_holdoff); + onoff_interval, onoff_holdoff, + read_exit_delay, read_exit_burst); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -2175,7 +2186,7 @@ static void rcu_torture_barrier1cb(void *rcu_void) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; - bool lastphase = 0; + bool lastphase = false; bool newphase; struct rcu_head rcu; @@ -2338,6 +2349,99 @@ static bool rcu_torture_can_boost(void) return true; } +static bool 
read_exit_child_stop; +static bool read_exit_child_stopped; +static wait_queue_head_t read_exit_wq; + +// Child kthread which just does an rcutorture reader and exits. +static int rcu_torture_read_exit_child(void *trsp_in) +{ + struct torture_random_state *trsp = trsp_in; + + set_user_nice(current, MAX_NICE); + // Minimize time between reading and exiting. + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + (void)rcu_torture_one_read(trsp); + return 0; +} + +// Parent kthread which creates and destroys read-exit child kthreads. +static int rcu_torture_read_exit(void *unused) +{ + int count = 0; + bool errexit = false; + int i; + struct task_struct *tsp; + DEFINE_TORTURE_RANDOM(trs); + + // Allocate and initialize. + set_user_nice(current, MAX_NICE); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test"); + + // Each pass through this loop does one read-exit episode. + do { + if (++count > read_exit_burst) { + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + for (i = 0; i < read_exit_delay; i++) { + schedule_timeout_uninterruptible(HZ); + if (READ_ONCE(read_exit_child_stop)) + break; + } + if (!READ_ONCE(read_exit_child_stop)) + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + count = 0; + } + if (READ_ONCE(read_exit_child_stop)) + break; + // Spawn child. + tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", + "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + errexit = true; + tsp = NULL; + break; + } + cond_resched(); + kthread_stop(tsp); + n_read_exits ++; + stutter_wait("rcu_torture_read_exit"); + } while (!errexit && !READ_ONCE(read_exit_child_stop)); + + // Clean up and exit. + smp_store_release(&read_exit_child_stopped, true); // After reaping. + smp_mb(); // Store before wakeup. 
+ wake_up(&read_exit_wq); + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_read_exit"); + return 0; +} + +static int rcu_torture_read_exit_init(void) +{ + if (read_exit_burst <= 0) + return -EINVAL; + init_waitqueue_head(&read_exit_wq); + read_exit_child_stop = false; + read_exit_child_stopped = false; + return torture_create_kthread(rcu_torture_read_exit, NULL, + read_exit_task); +} + +static void rcu_torture_read_exit_cleanup(void) +{ + if (!read_exit_task) + return; + WRITE_ONCE(read_exit_child_stop, true); + smp_mb(); // Above write before wait. + wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped)); + torture_stop_kthread(rcutorture_read_exit, read_exit_task); +} + static enum cpuhp_state rcutor_hp; static void @@ -2359,6 +2463,7 @@ rcu_torture_cleanup(void) } show_rcu_gp_kthreads(); + rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); torture_stop_kthread(rcu_torture_stall, stall_task); @@ -2370,7 +2475,6 @@ rcu_torture_cleanup(void) reader_tasks[i]); kfree(reader_tasks); } - rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { @@ -2682,6 +2786,9 @@ rcu_torture_init(void) firsterr = rcu_torture_barrier_init(); if (firsterr) goto unwind; + firsterr = rcu_torture_read_exit_init(); + if (firsterr) + goto unwind; if (object_debug) rcu_test_debug_objects(); torture_init_end(); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c new file mode 100644 index 000000000000..d9291f883b54 --- /dev/null +++ b/kernel/rcu/refscale.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Scalability test comparing RCU vs other mechanisms +// for acquiring references on objects. +// +// Copyright (C) Google, 2020. 
+// +// Author: Joel Fernandes <joel@joelfernandes.org> + +#define pr_fmt(fmt) fmt + +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/notifier.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/rcupdate_trace.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <linux/torture.h> +#include <linux/types.h> + +#include "rcu.h" + +#define SCALE_FLAG "-ref-scale: " + +#define SCALEOUT(s, x...) \ + pr_alert("%s" SCALE_FLAG s, scale_type, ## x) + +#define VERBOSE_SCALEOUT(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) + +#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>"); + +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); + +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); + +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +// Number of loops per experiment, all readers execute operations concurrently. +torture_param(long, loops, 10000, "Number of loops per experiment."); +// Number of readers, with -1 defaulting to about 75% of the CPUs. 
+torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); +// Number of runs. +torture_param(int, nruns, 30, "Number of experiments to run."); +// Reader delay in nanoseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); + +#ifdef MODULE +# define REFSCALE_SHUTDOWN 0 +#else +# define REFSCALE_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, REFSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); + +struct reader_task { + struct task_struct *task; + int start_reader; + wait_queue_head_t wq; + u64 last_duration_ns; +}; + +static struct task_struct *shutdown_task; +static wait_queue_head_t shutdown_wq; + +static struct task_struct *main_task; +static wait_queue_head_t main_wq; +static int shutdown_start; + +static struct reader_task *reader_tasks; + +// Number of readers that are part of the current experiment. +static atomic_t nreaders_exp; + +// Use to wait for all threads to start. +static atomic_t n_init; +static atomic_t n_started; +static atomic_t n_warmedup; +static atomic_t n_cooleddown; + +// Track which experiment is currently running. +static int exp_idx; + +// Operations vector for selecting different types of tests. 
+struct ref_scale_ops { + void (*init)(void); + void (*cleanup)(void); + void (*readsection)(const int nloops); + void (*delaysection)(const int nloops, const int udl, const int ndl); + const char *name; +}; + +static struct ref_scale_ops *cur_ops; + +static void un_delay(const int udl, const int ndl) +{ + if (udl) + udelay(udl); + if (ndl) + ndelay(ndl); +} + +static void ref_rcu_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + un_delay(udl, ndl); + rcu_read_unlock(); + } +} + +static void rcu_sync_scale_init(void) +{ +} + +static struct ref_scale_ops rcu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_rcu_read_section, + .delaysection = ref_rcu_delay_section, + .name = "rcu" +}; + +// Definitions for SRCU ref scale testing. +DEFINE_STATIC_SRCU(srcu_refctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; + +static void srcu_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static struct ref_scale_ops srcu_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_ref_scale_read_section, + .delaysection = srcu_ref_scale_delay_section, + .name = "srcu" +}; + +// Definitions for RCU Tasks ref scale testing: Empty read markers. +// These definitions also work for RCU Rude readers. 
+static void rcu_tasks_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) + continue; +} + +static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) + un_delay(udl, ndl); +} + +static struct ref_scale_ops rcu_tasks_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_tasks_ref_scale_read_section, + .delaysection = rcu_tasks_ref_scale_delay_section, + .name = "rcu-tasks" +}; + +// Definitions for RCU Tasks Trace ref scale testing. +static void rcu_trace_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + rcu_read_unlock_trace(); + } +} + +static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + un_delay(udl, ndl); + rcu_read_unlock_trace(); + } +} + +static struct ref_scale_ops rcu_trace_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_trace_ref_scale_read_section, + .delaysection = rcu_trace_ref_scale_delay_section, + .name = "rcu-trace" +}; + +// Definitions for reference count +static atomic_t refcnt; + +static void ref_refcnt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + atomic_dec(&refcnt); + } +} + +static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + un_delay(udl, ndl); + atomic_dec(&refcnt); + } +} + +static struct ref_scale_ops refcnt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_refcnt_section, + .delaysection = ref_refcnt_delay_section, + .name = "refcnt" +}; + +// Definitions for rwlock +static rwlock_t test_rwlock; + +static void ref_rwlock_init(void) +{ + rwlock_init(&test_rwlock); +} + +static void ref_rwlock_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; 
i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } +} + +static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + un_delay(udl, ndl); + read_unlock(&test_rwlock); + } +} + +static struct ref_scale_ops rwlock_ops = { + .init = ref_rwlock_init, + .readsection = ref_rwlock_section, + .delaysection = ref_rwlock_delay_section, + .name = "rwlock" +}; + +// Definitions for rwsem +static struct rw_semaphore test_rwsem; + +static void ref_rwsem_init(void) +{ + init_rwsem(&test_rwsem); +} + +static void ref_rwsem_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + up_read(&test_rwsem); + } +} + +static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + un_delay(udl, ndl); + up_read(&test_rwsem); + } +} + +static struct ref_scale_ops rwsem_ops = { + .init = ref_rwsem_init, + .readsection = ref_rwsem_section, + .delaysection = ref_rwsem_delay_section, + .name = "rwsem" +}; + +static void rcu_scale_one_reader(void) +{ + if (readdelay <= 0) + cur_ops->readsection(loops); + else + cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); +} + +// Reader kthread. Repeatedly does empty RCU read-side +// critical section, minimizing update-side interference. 
+static int +ref_scale_reader(void *arg) +{ + unsigned long flags; + long me = (long)arg; + struct reader_task *rt = &(reader_tasks[me]); + u64 start; + s64 duration; + + VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_init); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); +repeat: + VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + + // Wait for signal that this reader can start. + wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || + torture_must_stop()); + + if (torture_must_stop()) + goto end; + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != me); + + WRITE_ONCE(rt->start_reader, 0); + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) + cpu_relax(); + + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx); + + + // To reduce noise, do an initial cache-warming invocation, check + // in, and then keep warming until everyone has checked in. + rcu_scale_one_reader(); + if (!atomic_dec_return(&n_warmedup)) + while (atomic_read_acquire(&n_warmedup)) + rcu_scale_one_reader(); + // Also keep interrupts disabled. This also has the effect + // of preventing entries into slow path for rcu_read_unlock(). + local_irq_save(flags); + start = ktime_get_mono_fast_ns(); + + rcu_scale_one_reader(); + + duration = ktime_get_mono_fast_ns() - start; + local_irq_restore(flags); + + rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + // To reduce runtime-skew noise, do maintain-load invocations until + // everyone is done. 
+ if (!atomic_dec_return(&n_cooleddown)) + while (atomic_read_acquire(&n_cooleddown)) + rcu_scale_one_reader(); + + if (atomic_dec_and_test(&nreaders_exp)) + wake_up(&main_wq); + + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); + + if (!torture_must_stop()) + goto repeat; +end: + torture_kthread_stopping("ref_scale_reader"); + return 0; +} + +static void reset_readers(void) +{ + int i; + struct reader_task *rt; + + for (i = 0; i < nreaders; i++) { + rt = &(reader_tasks[i]); + + rt->last_duration_ns = 0; + } +} + +// Print the results of each reader and return the sum of all their durations. +static u64 process_durations(int n) +{ + int i; + struct reader_task *rt; + char buf1[64]; + char *buf; + u64 sum = 0; + + buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + if (!buf) + return 0; + buf[0] = 0; + sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)", + exp_idx); + + for (i = 0; i < n && !torture_must_stop(); i++) { + rt = &(reader_tasks[i]); + sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); + + if (i % 5 == 0) + strcat(buf, "\n"); + strcat(buf, buf1); + + sum += rt->last_duration_ns; + } + strcat(buf, "\n"); + + SCALEOUT("%s\n", buf); + + kfree(buf); + return sum; +} + +// The main_func is the main orchestrator, it performs a bunch of +// experiments. For every experiment, it orders all the readers +// involved to start and waits for them to finish the experiment. It +// then reads their timestamps and starts the next experiment. Each +// experiment progresses from 1 concurrent reader to N of them at which +// point all the timestamps are printed. 
+static int main_func(void *arg) +{ + bool errexit = false; + int exp, r; + char buf1[64]; + char *buf; + u64 *result_avg; + + set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + VERBOSE_SCALEOUT("main_func task started"); + result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + if (!result_avg || !buf) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + errexit = true; + } + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + // Wait for all threads to start. + atomic_inc(&n_init); + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + + // Start exp readers up per experiment + for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { + if (errexit) + break; + if (torture_must_stop()) + goto end; + + reset_readers(); + atomic_set(&nreaders_exp, nreaders); + atomic_set(&n_started, nreaders); + atomic_set(&n_warmedup, nreaders); + atomic_set(&n_cooleddown, nreaders); + + exp_idx = exp; + + for (r = 0; r < nreaders; r++) { + smp_store_release(&reader_tasks[r].start_reader, 1); + wake_up(&reader_tasks[r].wq); + } + + VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers", + nreaders); + + wait_event(main_wq, + !atomic_read(&nreaders_exp) || torture_must_stop()); + + VERBOSE_SCALEOUT("main_func: experiment ended"); + + if (torture_must_stop()) + goto end; + + result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); + } + + // Print the average of all experiments + SCALEOUT("END OF TEST. 
Calculating average duration per loop (nanoseconds)...\n"); + + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + + for (exp = 0; exp < nruns; exp++) { + u64 avg; + u32 rem; + + if (errexit) + break; + avg = div_u64_rem(result_avg[exp], 1000, &rem); + sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); + strcat(buf, buf1); + } + + if (!errexit) + SCALEOUT("%s", buf); + + // This will shutdown everything including us. + if (shutdown) { + shutdown_start = 1; + wake_up(&shutdown_wq); + } + + // Wait for torture to stop us + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + +end: + torture_kthread_stopping("main_func"); + kfree(result_avg); + kfree(buf); + return 0; +} + +static void +ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) +{ + pr_alert("%s" SCALE_FLAG + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); +} + +static void +ref_scale_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nreaders; i++) + torture_stop_kthread("ref_scale_reader", + reader_tasks[i].task); + } + kfree(reader_tasks); + + torture_stop_kthread("main_task", main_task); + kfree(main_task); + + // Do scale-type-specific cleanup operations. + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +// Shutdown kthread. Just waits to be awakened, then shuts down system. +static int +ref_scale_shutdown(void *arg) +{ + wait_event(shutdown_wq, shutdown_start); + + smp_mb(); // Wake before output. 
+ ref_scale_cleanup(); + kernel_power_off(); + + return -EINVAL; +} + +static int __init +ref_scale_init(void) +{ + long i; + int firsterr = 0; + static struct ref_scale_ops *scale_ops[] = { + &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, + &refcnt_ops, &rwlock_ops, &rwsem_ops, + }; + + if (!torture_init_begin(scale_type, verbose)) + return -EBUSY; + + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); + pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + ref_scale_print_module_parms(cur_ops, "Start of test"); + + // Shutdown task + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(ref_scale_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + // Reader tasks (default to ~75% of online CPUs). 
+ if (nreaders < 0) + nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (!reader_tasks) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders); + + for (i = 0; i < nreaders; i++) { + firsterr = torture_create_kthread(ref_scale_reader, (void *)i, + reader_tasks[i].task); + if (firsterr) + goto unwind; + + init_waitqueue_head(&(reader_tasks[i].wq)); + } + + // Main Task + init_waitqueue_head(&main_wq); + firsterr = torture_create_kthread(main_func, NULL, main_task); + if (firsterr) + goto unwind; + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + ref_scale_cleanup(); + return firsterr; +} + +module_init(ref_scale_init); +module_exit(ref_scale_cleanup); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d3ef700fb0e..c100acf332ed 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -766,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp) * it, if this function was preempted for enough time for the counters * to wrap, it really doesn't matter whether or not we expedite the grace * period. The extra overhead of a needlessly expedited grace period is - * negligible when amoritized over that time period, and the extra latency + * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ static bool srcu_might_be_idle(struct srcu_struct *ssp) @@ -777,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long t; unsigned long tlast; + check_init_srcu_struct(ssp); /* If the local srcu_data structure has callbacks, not idle. 
*/ - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); return false; /* Callbacks already present, so not idle. */ } - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); /* * No local callbacks, so probabalistically probe global state. @@ -864,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, } rhp->func = func; idx = srcu_read_lock(ssp); - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); - spin_lock_rcu_node(sdp); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ce23f6cc5043..835e2df8590a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -103,6 +103,7 @@ module_param(rcu_task_stall_timeout, int, 0644); #define RTGS_WAIT_READERS 9 #define RTGS_INVOKE_CBS 10 #define RTGS_WAIT_CBS 11 +#ifndef CONFIG_TINY_RCU static const char * const rcu_tasks_gp_state_names[] = { "RTGS_INIT", "RTGS_WAIT_WAIT_CBS", @@ -117,6 +118,7 @@ static const char * const rcu_tasks_gp_state_names[] = { "RTGS_INVOKE_CBS", "RTGS_WAIT_CBS", }; +#endif /* #ifndef CONFIG_TINY_RCU */ //////////////////////////////////////////////////////////////////////// // @@ -129,6 +131,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) rtp->gp_jiffies = jiffies; } +#ifndef CONFIG_TINY_RCU /* Return state name. */ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) { @@ -139,6 +142,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) return "???"; return rcu_tasks_gp_state_names[j]; } +#endif /* #ifndef CONFIG_TINY_RCU */ // Enqueue a callback for the specified flavor of Tasks RCU. 
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, @@ -205,7 +209,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) if (!rtp->cbs_head) { WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); - schedule_timeout_interruptible(HZ/10); + schedule_timeout_idle(HZ/10); } continue; } @@ -227,7 +231,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) cond_resched(); } /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); + schedule_timeout_idle(HZ/10); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } @@ -268,6 +272,7 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifndef CONFIG_TINY_RCU */ +#ifndef CONFIG_TINY_RCU /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -281,6 +286,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) ".C"[!!data_race(rtp->cbs_head)], s); } +#endif /* #ifndef CONFIG_TINY_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -336,7 +342,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) /* Slowly back off waiting for holdouts */ set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_interruptible(HZ/fract); + schedule_timeout_idle(HZ/fract); if (fract > 1) fract--; @@ -402,7 +408,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } /* Processing between scanning taskslist and draining the holdout list. */ -void rcu_tasks_postscan(struct list_head *hop) +static void rcu_tasks_postscan(struct list_head *hop) { /* * Wait for tasks that are in the process of exiting. 
This @@ -557,10 +563,12 @@ static int __init rcu_spawn_tasks_kthread(void) } core_initcall(rcu_spawn_tasks_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_classic_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } +#endif /* #ifndef CONFIG_TINY_RCU */ /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -682,10 +690,12 @@ static int __init rcu_spawn_tasks_rude_kthread(void) } core_initcall(rcu_spawn_tasks_rude_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_rude_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_RUDE_RCU */ static void show_rcu_tasks_rude_gp_kthread(void) {} @@ -727,8 +737,8 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map); #ifdef CONFIG_TASKS_TRACE_RCU -atomic_t trc_n_readers_need_end; // Number of waited-for readers. -DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. +static atomic_t trc_n_readers_need_end; // Number of waited-for readers. +static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); @@ -835,7 +845,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) bool ofl = cpu_is_offline(cpu); if (task_curr(t)) { - WARN_ON_ONCE(ofl & !is_idle_task(t)); + WARN_ON_ONCE(ofl && !is_idle_task(t)); // If no chance of heavyweight readers, do it the hard way. if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) @@ -1118,11 +1128,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period * * Control will return to the caller some time after a trace rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. 
These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, - * anyway) cond_resched(). + * grace period has elapsed, in other words after all currently executing + * rcu-tasks read-side critical sections have elapsed. These read-side + * critical sections are delimited by calls to rcu_read_lock_trace() + * and rcu_read_unlock_trace(). * * This is a very specialized primitive, intended only for a few uses in * tracing and other situations requiring manipulation of function preambles @@ -1164,6 +1173,7 @@ static int __init rcu_spawn_tasks_trace_kthread(void) } core_initcall(rcu_spawn_tasks_trace_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; @@ -1174,18 +1184,21 @@ static void show_rcu_tasks_trace_gp_kthread(void) data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ +#ifndef CONFIG_TINY_RCU void show_rcu_tasks_gp_kthreads(void) { show_rcu_tasks_classic_gp_kthread(); show_rcu_tasks_rude_gp_kthread(); show_rcu_tasks_trace_gp_kthread(); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index dd572ce7c747..aa897c3f2e92 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -23,6 +23,7 @@ #include <linux/cpu.h> #include <linux/prefetch.h> #include <linux/slab.h> +#include <linux/mm.h> #include "rcu.h" @@ -84,9 +85,9 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { 
- trace_rcu_invoke_kfree_callback("", head, offset); - kfree((void *)head - offset); + if (__is_kvfree_rcu_offset(offset)) { + trace_rcu_invoke_kvfree_callback("", head, offset); + kvfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..ac7198ed3197 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -57,6 +57,8 @@ #include <linux/slab.h> #include <linux/sched/isolation.h> #include <linux/sched/clock.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> #include "../time/tick-internal.h" #include "tree.h" @@ -175,6 +177,15 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* + * This rcu parameter is runtime-read-only. It reflects + * a minimum allowed number of objects which can be cached + * per-CPU. Object size is equal to one page. This value + * can be changed at boot time. + */ +static int rcu_min_cached_objs = 2; +module_param(rcu_min_cached_objs, int, 0444); + /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -954,7 +965,6 @@ void __rcu_irq_enter_check_tick(void) /** * rcu_nmi_enter - inform RCU of entry to NMI context - * @irq: Is this call from rcu_irq_enter? * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know @@ -990,8 +1000,11 @@ noinstr void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); // ... but is watching here. 
- if (!in_nmi()) + if (!in_nmi()) { + instrumentation_begin(); rcu_cleanup_after_idle(); + instrumentation_end(); + } instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() @@ -1638,7 +1651,7 @@ static void rcu_gp_slow(int delay) if (delay > 0 && !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) - schedule_timeout_uninterruptible(delay); + schedule_timeout_idle(delay); } static unsigned long sleep_duration; @@ -1661,7 +1674,7 @@ static void rcu_gp_torture_wait(void) duration = xchg(&sleep_duration, 0UL); if (duration > 0) { pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); - schedule_timeout_uninterruptible(duration); + schedule_timeout_idle(duration); pr_alert("%s: Wait complete\n", __func__); } } @@ -2443,6 +2456,7 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); count = -rcl.len; + rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2726,7 +2740,7 @@ static void rcu_cpu_kthread(unsigned int cpu) } *statusp = RCU_KTHREAD_YIELDING; trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; } @@ -2894,8 +2908,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) return; // Enqueued onto ->nocb_bypass, so just leave. // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. 
rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rcu_state.name, head, + if (__is_kvfree_rcu_offset((unsigned long)func)) + trace_rcu_kvfree_callback(rcu_state.name, head, (unsigned long)func, rcu_segcblist_n_cbs(&rdp->cblist)); else @@ -2957,53 +2971,53 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 - -/* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3) +#define FREE_N_CHANNELS 2 /** - * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers + * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers * @nr_records: Number of active pointers in the array - * @records: Array of the kfree_rcu() pointers * @next: Next bulk object in the block chain - * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set + * @records: Array of the kvfree_rcu() pointers */ -struct kfree_rcu_bulk_data { +struct kvfree_rcu_bulk_data { unsigned long nr_records; - void *records[KFREE_BULK_MAX_ENTR]; - struct kfree_rcu_bulk_data *next; - struct rcu_head *head_free_debug; + struct kvfree_rcu_bulk_data *next; + void *records[]; }; +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kvfree_rcu_bulk_data structure becomes exactly one page. 
+ */ +#define KVFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) + /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period + * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; - struct kfree_rcu_bulk_data *bhead_free; + struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period - * @bcached: Keeps at most one object for later reuse when build chain blocks + * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending - * @initialized: The @lock and @rcu_work fields have been initialized + * @initialized: The @rcu_work fields have been initialized + * @count: Number of objects for which GP not started * * This is a per-CPU structure. 
The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3012,28 +3026,84 @@ struct kfree_rcu_cpu_work { */ struct kfree_rcu_cpu { struct rcu_head *head; - struct kfree_rcu_bulk_data *bhead; - struct kfree_rcu_bulk_data *bcached; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - spinlock_t lock; + raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; - // Number of objects for which GP not started int count; + + /* + * A simple cache list that contains objects for + * reuse purpose. In order to save some per-cpu + * space the list is singular. Even though it is + * lockless an access has to be protected by the + * per-cpu lock. + */ + struct llist_head bkvcache; + int nr_bkv_objs; }; -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), +}; static __always_inline void -debug_rcu_head_unqueue_bulk(struct rcu_head *head) +debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - for (; head; head = head->next) - debug_rcu_head_unqueue(head); + int i; + + for (i = 0; i < bhead->nr_records; i++) + debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); #endif } +static inline struct kfree_rcu_cpu * +krc_this_cpu_lock(unsigned long *flags) +{ + struct kfree_rcu_cpu *krcp; + + local_irq_save(*flags); // For safely calling this_cpu_ptr(). 
+ krcp = this_cpu_ptr(&krc); + raw_spin_lock(&krcp->lock); + + return krcp; +} + +static inline void +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) +{ + raw_spin_unlock(&krcp->lock); + local_irq_restore(flags); +} + +static inline struct kvfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + krcp->nr_bkv_objs--; + return (struct kvfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode) +{ + // Check the limit. + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + krcp->nr_bkv_objs++; + return true; + +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -3041,38 +3111,63 @@ debug_rcu_head_unqueue_bulk(struct rcu_head *head) static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; struct rcu_head *head, *next; - struct kfree_rcu_bulk_data *bhead, *bnext; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + int i, j; krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; - spin_lock_irqsave(&krcp->lock, flags); - head = krwp->head_free; - krwp->head_free = NULL; - bhead = krwp->bhead_free; - krwp->bhead_free = NULL; - spin_unlock_irqrestore(&krcp->lock, flags); - - /* "bhead" is now private, so traverse locklessly. */ - for (; bhead; bhead = bnext) { - bnext = bhead->next; - debug_rcu_head_unqueue_bulk(bhead->head_free_debug); + raw_spin_lock_irqsave(&krcp->lock, flags); + // Channels 1 and 2. 
+ for (i = 0; i < FREE_N_CHANNELS; i++) { + bkvhead[i] = krwp->bkvhead_free[i]; + krwp->bkvhead_free[i] = NULL; + } - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, - bhead->nr_records, bhead->records); + // Channel 3. + head = krwp->head_free; + krwp->head_free = NULL; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + // Handle two first channels. + for (i = 0; i < FREE_N_CHANNELS; i++) { + for (; bkvhead[i]; bkvhead[i] = bnext) { + bnext = bkvhead[i]->next; + debug_rcu_bhead_unqueue(bkvhead[i]); + + rcu_lock_acquire(&rcu_callback_map); + if (i == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bkvhead[i]->nr_records, + bkvhead[i]->records); + + kfree_bulk(bkvhead[i]->nr_records, + bkvhead[i]->records); + } else { // vmalloc() / vfree(). + for (j = 0; j < bkvhead[i]->nr_records; j++) { + trace_rcu_invoke_kvfree_callback( + rcu_state.name, + bkvhead[i]->records[j], 0); + + vfree(bkvhead[i]->records[j]); + } + } + rcu_lock_release(&rcu_callback_map); - kfree_bulk(bhead->nr_records, bhead->records); - rcu_lock_release(&rcu_callback_map); + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bkvhead[i])) + bkvhead[i] = NULL; + krc_this_cpu_unlock(krcp, flags); - if (cmpxchg(&krcp->bcached, NULL, bhead)) - free_page((unsigned long) bhead); + if (bkvhead[i]) + free_page((unsigned long) bkvhead[i]); - cond_resched_tasks_rcu_qs(); + cond_resched_tasks_rcu_qs(); + } } /* @@ -3082,14 +3177,15 @@ static void kfree_rcu_work(struct work_struct *work) */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; + void *ptr = (void *)head - offset; next = head->next; - debug_rcu_head_unqueue(head); + debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - if 
(!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree((void *)head - offset); + if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -3105,8 +3201,8 @@ static void kfree_rcu_work(struct work_struct *work) static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; - bool queued = false; - int i; + bool repeat = false; + int i, j; lockdep_assert_held(&krcp->lock); @@ -3114,21 +3210,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krwp = &(krcp->krw_arr[i]); /* - * Try to detach bhead or head and attach it over any + * Try to detach bkvhead or head and attach it over any * available corresponding free channel. It can be that * a previous RCU batch is in progress, it means that * immediately to queue another one is not possible so * return false to tell caller to retry. */ - if ((krcp->bhead && !krwp->bhead_free) || + if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || + (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || (krcp->head && !krwp->head_free)) { - /* Channel 1. */ - if (!krwp->bhead_free) { - krwp->bhead_free = krcp->bhead; - krcp->bhead = NULL; + // Channel 1 corresponds to SLAB ptrs. + // Channel 2 corresponds to vmalloc ptrs. + for (j = 0; j < FREE_N_CHANNELS; j++) { + if (!krwp->bkvhead_free[j]) { + krwp->bkvhead_free[j] = krcp->bkvhead[j]; + krcp->bkvhead[j] = NULL; + } } - /* Channel 2. */ + // Channel 3 corresponds to emergency path. if (!krwp->head_free) { krwp->head_free = krcp->head; krcp->head = NULL; @@ -3137,17 +3237,21 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) WRITE_ONCE(krcp->count, 0); /* - * One work is per one batch, so there are two "free channels", - * "bhead_free" and "head_free" the batch can handle. It can be - * that the work is in the pending state when two channels have - * been detached following each other, one by one. 
+ * One work is per one batch, so there are three + * "free channels", the batch can handle. It can + * be that the work is in the pending state when + * channels have been detached following by each + * other. */ queue_rcu_work(system_wq, &krwp->rcu_work); - queued = true; } + + // Repeat if any "free" corresponding channel is still busy. + if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head) + repeat = true; } - return queued; + return !repeat; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, @@ -3157,14 +3261,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); return; } // Previous RCU batch still in progress, try again later. krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } /* @@ -3177,32 +3281,50 @@ static void kfree_rcu_monitor(struct work_struct *work) struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, monitor_work.work); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static inline bool -kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, - struct rcu_head *head, rcu_callback_t func) +kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) { - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; + int idx; if (unlikely(!krcp->initialized)) return false; lockdep_assert_held(&krcp->lock); + idx = !!is_vmalloc_addr(ptr); /* Check if a new block is required. 
*/ - if (!krcp->bhead || - krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { - bnode = xchg(&krcp->bcached, NULL); + if (!krcp->bkvhead[idx] || + krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { + bnode = get_cached_bnode(krcp); if (!bnode) { - WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); + /* + * To keep this path working on raw non-preemptible + * sections, prevent the optional entry into the + * allocator as it uses sleeping locks. In fact, even + * if the caller of kfree_rcu() is preemptible, this + * path still is not, as krcp->lock is a raw spinlock. + * With additional page pre-allocation in the works, + * hitting this return is going to be much less likely. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; - bnode = (struct kfree_rcu_bulk_data *) + /* + * NOTE: For one argument of kvfree_rcu() we can + * drop the lock and get the page in sleepable + * context. That would allow to maintain an array + * for the CONFIG_PREEMPT_RT as well if no cached + * pages are available. + */ + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3212,53 +3334,62 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; - bnode->next = krcp->bhead; - bnode->head_free_debug = NULL; + bnode->next = krcp->bkvhead[idx]; /* Attach it to the head. */ - krcp->bhead = bnode; + krcp->bkvhead[idx] = bnode; } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - head->func = func; - head->next = krcp->bhead->head_free_debug; - krcp->bhead->head_free_debug = head; -#endif - /* Finally insert. */ - krcp->bhead->records[krcp->bhead->nr_records++] = - (void *) head - (unsigned long) func; + krcp->bkvhead[idx]->records + [krcp->bkvhead[idx]->nr_records++] = ptr; return true; } /* - * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace - * period. 
Please note there are two paths are maintained, one is the main one - * that uses kfree_bulk() interface and second one is emergency one, that is - * used only when the main path can not be maintained temporary, due to memory - * pressure. + * Queue a request for lazy invocation of appropriate free routine after a + * grace period. Please note there are three paths are maintained, two are the + * main ones that use array of pointers interface and third one is emergency + * one, that is used only when the main path can not be maintained temporary, + * due to memory pressure. * - * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu() load. + * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + bool success; + void *ptr; - local_irq_save(flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - if (krcp->initialized) - spin_lock(&krcp->lock); + if (head) { + ptr = (void *) head - (unsigned long) func; + } else { + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + might_sleep(); + ptr = (unsigned long *) func; + } + + krcp = krc_this_cpu_lock(&flags); // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(head)) { + if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. 
WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); + + // Mark as success and leave. + success = true; goto unlock_return; } @@ -3266,10 +3397,16 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. */ - if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) { + success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); + if (!success) { + if (head == NULL) + // Inline if kvfree_rcu(one_arg) call. + goto unlock_return; + head->func = func; head->next = krcp->head; krcp->head = head; + success = true; } WRITE_ONCE(krcp->count, krcp->count + 1); @@ -3282,11 +3419,20 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } unlock_return: - if (krcp->initialized) - spin_unlock(&krcp->lock); - local_irq_restore(flags); + krc_this_cpu_unlock(krcp, flags); + + /* + * Inline kvfree() after synchronize_rcu(). We can do + * it from might_sleep() context only, so the current + * CPU can pass the QS state. + */ + if (!success) { + debug_rcu_head_unqueue((struct rcu_head *) ptr); + synchronize_rcu(); + kvfree(ptr); + } } -EXPORT_SYMBOL_GPL(kfree_call_rcu); +EXPORT_SYMBOL_GPL(kvfree_call_rcu); static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) @@ -3315,11 +3461,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count = krcp->count; - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); sc->nr_to_scan -= count; freed += count; @@ -3328,7 +3474,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - return freed; + return freed == 0 ? 
SHRINK_STOP : freed; } static struct shrinker kfree_rcu_shrinker = { @@ -3346,15 +3492,15 @@ void __init kfree_rcu_scheduler_running(void) for_each_online_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (!krcp->head || krcp->monitor_todo) { - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); continue; } krcp->monitor_todo = true; schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -3842,10 +3988,9 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; - int nbits; - unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; + bool newcpu; if (per_cpu(rcu_cpu_started, cpu)) return; @@ -3857,12 +4002,10 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); - oldmask = rnp->expmaskinitnext; + newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; - oldmask ^= rnp->expmaskinitnext; - nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ - smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */ ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? 
*/ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4249,13 +4392,23 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + struct kvfree_rcu_bulk_data *bnode; - spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (bnode) + put_cached_bnode(krcp, bnode); + else + pr_err("Failed to preallocate for %d CPU!\n", cpu); + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 43991a40b084..c96ae351688b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -41,7 +41,7 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq; /* Track rsp->gp_seq. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ @@ -73,9 +73,9 @@ struct rcu_node { unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ - int grplo; /* lowest-numbered CPU or group here. */ - int grphi; /* highest-numbered CPU or group here. */ - u8 grpnum; /* CPU/group number for next level up. */ + int grplo; /* lowest-numbered CPU here. */ + int grphi; /* highest-numbered CPU here. */ + u8 grpnum; /* group number for next level up. */ u8 level; /* root is at level 0. 
*/ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ /* exit RCU read-side critical sections */ @@ -149,7 +149,7 @@ union rcu_noqs { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq; /* Track rsp->gp_seq counter. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ @@ -171,6 +171,7 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -301,6 +302,8 @@ struct rcu_state { u8 boost ____cacheline_internodealigned_in_smp; /* Subject to priority boost. */ unsigned long gp_seq; /* Grace-period sequence #. */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ @@ -346,8 +349,6 @@ struct rcu_state { /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ /* GP start. */ - unsigned long gp_max; /* Maximum GP duration in */ - /* jiffies. */ const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 72952edad1e4..1888c0eb1216 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -403,7 +403,7 @@ retry_ipi: /* Online, so delay for a bit and try again. 
*/ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl")); - schedule_timeout_uninterruptible(1); + schedule_timeout_idle(1); goto retry_ipi; } /* CPU really is offline, so we must report its QS. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 352223664ebd..982fc5be5269 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1033,7 +1033,7 @@ static int rcu_boost_kthread(void *arg) if (spincnt > 10) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); spincnt = 0; } @@ -2005,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) /* Polling, so trace if first poll in the series. */ if (gotcbs) trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_interruptible(1); + schedule_timeout_idle(1); } else if (!needwait_gp) { /* Wait for callbacks to appear. */ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 54a6dba0280d..b5d3b4794db4 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -237,14 +237,12 @@ struct rcu_stall_chk_rdr { */ static bool check_slow_task(struct task_struct *t, void *arg) { - struct rcu_node *rnp; struct rcu_stall_chk_rdr *rscrp = arg; if (task_curr(t)) return false; // It is running, so decline to inspect it. rscrp->nesting = t->rcu_read_lock_nesting; rscrp->rs = t->rcu_read_unlock_special; - rnp = t->rcu_blocked_node; rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); return true; } @@ -468,7 +466,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) /* * OK, time to rat on our buddy... 
- * See Documentation/RCU/stallwarn.txt for info on how to debug + * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); @@ -535,7 +533,7 @@ static void print_cpu_stall(unsigned long gps) /* * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug + * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); @@ -649,6 +647,7 @@ static void check_cpu_stall(struct rcu_data *rdp) */ void show_rcu_gp_kthreads(void) { + unsigned long cbs = 0; int cpu; unsigned long j; unsigned long ja; @@ -690,9 +689,11 @@ void show_rcu_gp_kthreads(void) } for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); + cbs += data_race(rdp->n_cbs_invoked); if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } + pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 84843adfd939..2de49b5d8dd2 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -42,6 +42,7 @@ #include <linux/kprobes.h> #include <linux/slab.h> #include <linux/irq_work.h> +#include <linux/rcupdate_trace.h> #define CREATE_TRACE_POINTS @@ -207,7 +208,7 @@ void rcu_end_inkernel_boot(void) rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); - rcu_boot_ended = 1; + rcu_boot_ended = true; } /* @@ -279,6 +280,7 @@ struct lockdep_map rcu_sched_lock_map = { }; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +// Tell lockdep when RCU callbacks are being invoked. 
static struct lock_class_key rcu_callback_key; struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); @@ -390,13 +392,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, might_sleep(); continue; } - init_rcu_head_on_stack(&rs_array[i].head); - init_completion(&rs_array[i].completion); for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } } /* Wait for all callbacks to be invoked. */ @@ -407,9 +410,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { wait_for_completion(&rs_array[i].completion); - destroy_rcu_head_on_stack(&rs_array[i].head); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } } EXPORT_SYMBOL_GPL(__wait_rcu_gp); diff --git a/kernel/reboot.c b/kernel/reboot.c index 491f1347bf43..e7b78d5ae1ab 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -26,7 +26,7 @@ int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); -#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32) +#if defined(CONFIG_ARM) #define DEFAULT_REBOOT_MODE = REBOOT_HARD #else #define DEFAULT_REBOOT_MODE diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca5db40392d4..4a0e7b449b88 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6,6 +6,10 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> +#undef CREATE_TRACE_POINTS + #include "sched.h" #include <linux/nospec.h> @@ -23,9 +27,6 @@ #include "pelt.h" #include "smp.h" -#define CREATE_TRACE_POINTS -#include <trace/events/sched.h> - /* * Export tracepoints that act as a bare tracehook (ie: have no trace event * associated with them) to allow external modules to probe them. 
@@ -36,6 +37,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -75,6 +79,100 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; + +/* + * Serialization rules: + * + * Lock order: + * + * p->pi_lock + * rq->lock + * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) + * + * rq1->lock + * rq2->lock where: rq1 < rq2 + * + * Regular state: + * + * Normal scheduling state is serialized by rq->lock. __schedule() takes the + * local CPU's rq->lock, it optionally removes the task from the runqueue and + * always looks at the local rq data structures to find the most elegible task + * to run next. + * + * Task enqueue is also under rq->lock, possibly taken from another CPU. + * Wakeups from another LLC domain might use an IPI to transfer the enqueue to + * the local CPU to avoid bouncing the runqueue state around [ see + * ttwu_queue_wakelist() ] + * + * Task wakeup, specifically wakeups that involve migration, are horribly + * complicated to avoid having to take two rq->locks. + * + * Special state: + * + * System-calls and anything external will use task_rq_lock() which acquires + * both p->pi_lock and rq->lock. 
As a consequence the state they change is + * stable while holding either lock: + * + * - sched_setaffinity()/ + * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed + * - set_user_nice(): p->se.load, p->*prio + * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, + * p->se.load, p->rt_priority, + * p->dl.dl_{runtime, deadline, period, flags, bw, density} + * - sched_setnuma(): p->numa_preferred_nid + * - sched_move_task()/ + * cpu_cgroup_fork(): p->sched_task_group + * - uclamp_update_active() p->uclamp* + * + * p->state <- TASK_*: + * + * is changed locklessly using set_current_state(), __set_current_state() or + * set_special_state(), see their respective comments, or by + * try_to_wake_up(). This latter uses p->pi_lock to serialize against + * concurrent self. + * + * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: + * + * is set by activate_task() and cleared by deactivate_task(), under + * rq->lock. Non-zero indicates the task is runnable, the special + * ON_RQ_MIGRATING state is used for migration without holding both + * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). + * + * p->on_cpu <- { 0, 1 }: + * + * is set by prepare_task() and cleared by finish_task() such that it will be + * set before p is scheduled-in and cleared after p is scheduled-out, both + * under rq->lock. Non-zero indicates the task is running on its CPU. + * + * [ The astute reader will observe that it is possible for two tasks on one + * CPU to have ->on_cpu = 1 at the same time. ] + * + * task_cpu(p): is changed by set_task_cpu(), the rules are: + * + * - Don't call set_task_cpu() on a blocked task: + * + * We don't care what CPU we're not running on, this simplifies hotplug, + * the CPU assignment of blocked tasks isn't required to be valid. + * + * - for try_to_wake_up(), called under p->pi_lock: + * + * This allows try_to_wake_up() to only take one rq->lock, see its comment. 
+ * + * - for migration called under rq->lock: + * [ see task_on_rq_migrating() in task_rq_lock() ] + * + * o move_queued_task() + * o detach_task() + * + * - for migration called under double_rq_lock(): + * + * o __migrate_swap_task() + * o push_rt_task() / pull_rt_task() + * o push_dl_task() / pull_dl_task() + * o dl_task_offline_migration() + * + */ + /* * __task_rq_lock - lock the rq @p resides on. */ @@ -791,9 +889,46 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +/* + * By default RT tasks run at the maximum performance point/capacity of the + * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to + * SCHED_CAPACITY_SCALE. + * + * This knob allows admins to change the default behavior when uclamp is being + * used. In battery powered devices, particularly, running at the maximum + * capacity and frequency will increase energy consumption and shorten the + * battery life. + * + * This knob only affects RT tasks that their uclamp_se->user_defined == false. + * + * This knob will not override the system default sched_util_clamp_min defined + * above. + */ +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; + /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; +/* + * This static key is used to reduce the uclamp overhead in the fast path. It + * primarily disables the call to uclamp_rq_{inc, dec}() in + * enqueue/dequeue_task(). + * + * This allows users to continue to enable uclamp in their kernel config with + * minimum uclamp overhead in the fast path. + * + * As soon as userspace modifies any of the uclamp knobs, the static key is + * enabled, since we have an actual users that make use of uclamp + * functionality. 
+ * + * The knobs that would enable this static key are: + * + * * A task modifying its uclamp value with sched_setattr(). + * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs. + * * An admin modifying the cgroup cpu.uclamp.{min, max} + */ +DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); + /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -873,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static void __uclamp_update_util_min_rt_default(struct task_struct *p) +{ + unsigned int default_util_min; + struct uclamp_se *uc_se; + + lockdep_assert_held(&p->pi_lock); + + uc_se = &p->uclamp_req[UCLAMP_MIN]; + + /* Only sync if user didn't override the default */ + if (uc_se->user_defined) + return; + + default_util_min = sysctl_sched_uclamp_util_min_rt_default; + uclamp_se_set(uc_se, default_util_min, false); +} + +static void uclamp_update_util_min_rt_default(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + + if (!rt_task(p)) + return; + + /* Protect updates to p->uclamp_* */ + rq = task_rq_lock(p, &rf); + __uclamp_update_util_min_rt_default(p); + task_rq_unlock(rq, p, &rf); +} + +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. 
+ */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { @@ -990,10 +1183,38 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, lockdep_assert_held(&rq->lock); + /* + * If sched_uclamp_used was enabled after task @p was enqueued, + * we could end up with unbalanced call to uclamp_rq_dec_id(). + * + * In this case the uc_se->active flag should be false since no uclamp + * accounting was performed at enqueue time and we can just return + * here. + * + * Need to be careful of the following enqeueue/dequeue ordering + * problem too + * + * enqueue(taskA) + * // sched_uclamp_used gets enabled + * enqueue(taskB) + * dequeue(taskA) + * // Must not decrement bukcet->tasks here + * dequeue(taskB) + * + * where we could end up with stale data in uc_se and + * bucket[uc_se->bucket_id]. + * + * The following check here eliminates the possibility of such race. + */ + if (unlikely(!uc_se->active)) + return; + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* @@ -1021,6 +1242,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1036,6 +1266,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. 
+ * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1114,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; - int old_min, old_max; + int old_min, old_max, old_min_rt; int result; mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; + old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, ppos); if (result) @@ -1128,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, goto done; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || - sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || + sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { + result = -EINVAL; goto undo; } @@ -1144,8 +1386,15 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, update_root_tg = true; } - if (update_root_tg) + if (update_root_tg) { + static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); + } + + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { + static_branch_enable(&sched_uclamp_used); + uclamp_sync_util_min_rt_default(); + } /* * We update all RUNNABLE tasks only when task groups are in use. 
@@ -1158,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; + sysctl_sched_uclamp_util_min_rt_default = old_min_rt; done: mutex_unlock(&uclamp_mutex); @@ -1180,6 +1430,15 @@ static int uclamp_validate(struct task_struct *p, if (upper_bound > SCHED_CAPACITY_SCALE) return -EINVAL; + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + static_branch_enable(&sched_uclamp_used); + return 0; } @@ -1194,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p, */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int clamp_value = uclamp_none(clamp_id); /* Keep using defined clamps across class changes */ if (uc_se->user_defined) continue; - /* By default, RT tasks always get 100% boost */ + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - clamp_value = uclamp_none(UCLAMP_MAX); + __uclamp_update_util_min_rt_default(p); + else + uclamp_se_set(uc_se, uclamp_none(clamp_id), false); - uclamp_se_set(uc_se, clamp_value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) @@ -1225,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; + /* + * We don't need to hold task_rq_lock() when updating p->uclamp_* here + * as the task is still at its early fork stages. 
+ */ for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1237,19 +1503,33 @@ static void uclamp_fork(struct task_struct *p) } } +static void uclamp_post_fork(struct task_struct *p) +{ + uclamp_update_util_min_rt_default(p); +} + +static void __init init_uclamp_rq(struct rq *rq) +{ + enum uclamp_id clamp_id; + struct uclamp_rq *uc_rq = rq->uclamp; + + for_each_clamp_id(clamp_id) { + uc_rq[clamp_id] = (struct uclamp_rq) { + .value = uclamp_none(clamp_id) + }; + } + + rq->uclamp_flags = 0; +} + static void __init init_uclamp(void) { struct uclamp_se uc_max = {}; enum uclamp_id clamp_id; int cpu; - mutex_init(&uclamp_mutex); - - for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, - sizeof(struct uclamp_rq)*UCLAMP_CNT); - cpu_rq(cpu)->uclamp_flags = 0; - } + for_each_possible_cpu(cpu) + init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], @@ -1278,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } +static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -1311,9 +1592,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - enqueue_task(rq, p, flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1323,9 +1601,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { p->on_rq = (flags & DEQUEUE_SLEEP) ? 
0 : TASK_ON_RQ_MIGRATING; - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - dequeue_task(rq, p, flags); } @@ -1410,20 +1685,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - const struct sched_class *class; - - if (p->sched_class == rq->curr->sched_class) { + if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - } else { - for_each_class(class) { - if (class == rq->curr->sched_class) - break; - if (class == p->sched_class) { - resched_curr(rq); - break; - } - } - } + else if (p->sched_class > rq->curr->sched_class) + resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In @@ -1474,8 +1739,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, { lockdep_assert_held(&rq->lock); - WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); - dequeue_task(rq, p, DEQUEUE_NOCLOCK); + deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); @@ -1483,8 +1747,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); - enqueue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); return rq; @@ -2236,10 +2499,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, lockdep_assert_held(&rq->lock); -#ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; #endif @@ -2249,12 +2512,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, } /* - * Called in case the task @p isn't fully descheduled from its runqueue, - * in this case we must do a remote wakeup. Its a 'light' wakeup though, - * since all we need to do is flip p->state to TASK_RUNNING, since - * the task is still ->on_rq. 
+ * Consider @p being inside a wait loop: + * + * for (;;) { + * set_current_state(TASK_UNINTERRUPTIBLE); + * + * if (CONDITION) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * between set_current_state() and schedule(). In this case @p is still + * runnable, so all that needs doing is change p->state back to TASK_RUNNING in + * an atomic manner. + * + * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq + * then schedule() must still happen and p->state can be changed to + * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we + * need to do a full wakeup with enqueue. + * + * Returns: %true when the wakeup is done, + * %false otherwise. */ -static int ttwu_remote(struct task_struct *p, int wake_flags) +static int ttwu_runnable(struct task_struct *p, int wake_flags) { struct rq_flags rf; struct rq *rq; @@ -2395,6 +2677,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } + +#else /* !CONFIG_SMP */ + +static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + return false; +} + #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2402,10 +2692,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; -#if defined(CONFIG_SMP) if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; -#endif rq_lock(rq, &rf); update_rq_clock(rq); @@ -2461,8 +2749,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * migration. However the means are completely different as there is no lock * chain to provide order. 
Instead we do: * - * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_load_acquire(!X->on_cpu) + * 1) smp_store_release(X->on_cpu, 0) -- finish_task() + * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * * Example: * @@ -2502,15 +2790,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * If (@state & @p->state) @p->state = TASK_RUNNING. + * Conceptually does: + * + * If (@state & @p->state) @p->state = TASK_RUNNING. * * If the task was not queued/runnable, also place it back on a runqueue. * - * Atomic against schedule() which would dequeue a task, also see - * set_current_state(). + * This function is atomic against schedule() which would dequeue the task. + * + * It issues a full memory barrier before accessing @p->state, see the comment + * with set_current_state(). + * + * Uses p->pi_lock to serialize against concurrent wake-ups. + * + * Relies on p->pi_lock stabilizing: + * - p->sched_class + * - p->cpus_ptr + * - p->sched_task_group + * in order to do migration, see its use of select_task_rq()/set_task_cpu(). + * + * Tries really hard to only take one task_rq(p)->lock for performance. + * Takes rq->lock in: + * - ttwu_runnable() -- old rq, unavoidable, see comment there; + * - ttwu_queue() -- new rq, for enqueue of the task; + * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. * - * This function executes a full memory barrier before accessing the task - * state; see set_current_state(). + * As a consequence we race really badly with just about everything. See the + * many memory barriers and their comments for details. * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. @@ -2526,7 +2832,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) * == smp_processor_id()'. 
Together this means we can special - * case the whole 'p->on_rq && ttwu_remote()' case below + * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * * In particular: @@ -2547,8 +2853,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. + * reordered with p->state check below. This pairs with smp_store_mb() + * in set_current_state() that the waiting thread does. */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); @@ -2583,7 +2889,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; if (p->in_iowait) { @@ -2592,9 +2898,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #ifdef CONFIG_SMP - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -2613,8 +2916,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). 
*/ - smp_rmb(); + smp_acquire__after_ctrl_dep(); + + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + p->state = TASK_WAKING; /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -2962,6 +3277,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); + rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -2986,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } +void sched_post_fork(struct task_struct *p) +{ + uclamp_post_fork(p); +} + unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -3026,6 +3347,7 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); + rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); @@ -3142,8 +3464,10 @@ static inline void prepare_task(struct task_struct *next) /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. + * + * See the ttwu() WF_ON_CPU case and its ordering comment. */ - next->on_cpu = 1; + WRITE_ONCE(next->on_cpu, 1); #endif } @@ -3151,8 +3475,9 @@ static inline void finish_task(struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely + * This must be the very last reference to @prev from this CPU. After + * p->on_cpu is cleared, the task can be moved to a different CPU. We + * must ensure this doesn't happen until the switch is completely * finished. 
* * In particular, the load of prev->state in finish_task_switch() must @@ -3651,17 +3976,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -DEFINE_PER_CPU(unsigned long, thermal_pressure); - -void arch_set_thermal_pressure(struct cpumask *cpus, - unsigned long th_pressure) -{ - int cpu; - - for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -} - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -4024,8 +4338,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those loose the * opportunity to pull in more work from other CPUs. */ - if (likely((prev->sched_class == &idle_sched_class || - prev->sched_class == &fair_sched_class) && + if (likely(prev->sched_class <= &fair_sched_class && rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); @@ -4097,6 +4410,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; @@ -4116,9 +4430,16 @@ static void __sched notrace __schedule(bool preempt) /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). + * done by the caller to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) * - * The membarrier system call requires a full memory barrier + * Also, the membarrier system call requires a full memory barrier * after coming from user-space, before storing to rq->curr. 
*/ rq_lock(rq, &rf); @@ -4129,10 +4450,38 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (signal_pending_state(prev->state, prev)) { + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; + if (!preempt && prev_state) { + if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); if (prev->in_iowait) { @@ -4444,6 +4793,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -5477,6 +5827,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_nice = task_nice(p); #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. 
+ */ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif @@ -5834,7 +6189,7 @@ again: if (task_running(p_rq, p) || p->state) goto out_unlock; - yielded = curr->sched_class->yield_to_task(rq, p, preempt); + yielded = curr->sched_class->yield_to_task(rq, p); if (yielded) { schedstat_inc(rq->yld_count); /* @@ -6668,6 +7023,14 @@ void __init sched_init(void) unsigned long ptr = 0; int i; + /* Make sure the linker didn't screw up */ + BUG_ON(&idle_sched_class + 1 != &fair_sched_class || + &fair_sched_class + 1 != &rt_sched_class || + &rt_sched_class + 1 != &dl_sched_class); +#ifdef CONFIG_SMP + BUG_ON(&dl_sched_class + 1 != &stop_sched_class); +#endif + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7389,6 +7752,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; + static_branch_enable(&sched_uclamp_used); + mutex_lock(&uclamp_mutex); rcu_read_lock(); @@ -8076,4 +8441,7 @@ const u32 sched_prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#undef CREATE_TRACE_POINTS +void call_trace_sched_update_nr_running(struct rq *rq, int count) +{ + trace_sched_update_nr_running_tp(rq, count); +} diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5cc4012572ec..8cb06c8c7eb1 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -121,6 +121,30 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { + unsigned long cap, max_cap = 0; + int cpu, max_cpu = -1; + + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 1; + + /* Ensure the capacity of the CPUs fits the task. 
*/ + for_each_cpu(cpu, later_mask) { + if (!dl_task_fits_capacity(p, cpu)) { + cpumask_clear_cpu(cpu, later_mask); + + cap = capacity_orig_of(cpu); + + if (cap > max_cap || + (cpu == task_cpu(p) && cap == max_cap)) { + max_cap = cap; + max_cpu = cpu; + } + } + } + + if (cpumask_empty(later_mask)) + cpumask_set_cpu(max_cpu, later_mask); + return 1; } else { int best_cpu = cpudl_maximum(cp); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7fbaee24c824..e39008242cf4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -210,7 +210,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); - if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && + if (!uclamp_is_used() && type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; } @@ -909,11 +909,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif -static int __init sugov_register(void) -{ - return cpufreq_register_governor(&schedutil_gov); -} -core_initcall(sugov_register); +cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL extern bool sched_energy_update; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ff9435dee1df..5a55d2300452 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -520,50 +520,6 @@ void account_idle_ticks(unsigned long ticks) } /* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static u64 scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) - swap(rtime, stime); - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? 
*/ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. - */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return scaled; -} - -/* * Adjust tick based cputime random precision against scheduler runtime * accounting. * @@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, goto update; } - stime = scale_stime(stime, rtime, stime + utime); + stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); update: /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f63f337c7147..3862a28cd05d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -54,15 +54,49 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; + int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); + + if (cpumask_subset(rd->span, cpu_active_mask)) + return cpumask_weight(rd->span); + + cpus = 0; + for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; return cpus; } + +static inline unsigned long __dl_bw_capacity(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + unsigned long cap = 0; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + for_each_cpu_and(i, rd->span, cpu_active_mask) + cap += capacity_orig_of(i); + + return cap; +} + +/* + * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity + * of the CPU the task is running on rather rd's \Sum CPU capacity. 
+ */ +static inline unsigned long dl_bw_capacity(int i) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity) && + capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; + } else { + return __dl_bw_capacity(i); + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -73,6 +107,11 @@ static inline int dl_bw_cpus(int i) { return 1; } + +static inline unsigned long dl_bw_capacity(int i) +{ + return SCHED_CAPACITY_SCALE; +} #endif static inline @@ -1098,7 +1137,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) * cannot use the runtime, and so it replenishes the task. This rule * works fine for implicit deadline tasks (deadline == period), and the * CBS was designed for implicit deadline tasks. However, a task with - * constrained deadline (deadine < period) might be awakened after the + * constrained deadline (deadline < period) might be awakened after the * deadline, but before the next period. In this case, replenishing the * task would allow it to run for runtime / deadline. As in this case * deadline < period, CBS enables a task to run for more than the @@ -1604,6 +1643,7 @@ static int select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; + bool select_rq; struct rq *rq; if (sd_flag != SD_BALANCE_WAKE) @@ -1623,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + select_rq = unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + p->nr_cpus_allowed > 1; + + /* + * Take the capacity of the CPU into account to + * ensure it fits the requirement of the task. 
+ */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + select_rq |= !dl_task_fits_capacity(p, cpu); + + if (select_rq) { int target = find_later_rq(p); if (target != -1 && @@ -2430,8 +2479,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } } -const struct sched_class dl_sched_class = { - .next = &rt_sched_class, +const struct sched_class dl_sched_class + __attribute__((section("__dl_sched_class"))) = { .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -2551,11 +2600,12 @@ void sched_dl_do_global(void) int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; + int cpus, err = -1, cpu = task_cpu(p); + struct dl_bw *dl_b = dl_bw_of(cpu); + unsigned long cap; if (attr->sched_flags & SCHED_FLAG_SUGOV) return 0; @@ -2570,15 +2620,17 @@ int sched_dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. 
*/ raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, cap, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2635,6 +2687,14 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) } /* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting rediculous long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ + +/* * This function validates the new parameters of a -deadline task. 
* We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or @@ -2646,6 +2706,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + u64 period, max, min; + /* special dl tasks don't actually use any parameter */ if (attr->sched_flags & SCHED_FLAG_SUGOV) return true; @@ -2669,12 +2731,21 @@ bool __checkparam_dl(const struct sched_attr *attr) attr->sched_period & (1ULL << 63)) return false; + period = attr->sched_period; + if (!period) + period = attr->sched_deadline; + /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || + if (period < attr->sched_deadline || attr->sched_deadline < attr->sched_runtime) return false; + max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC; + min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC; + + if (period < min || period > max) + return false; + return true; } @@ -2715,19 +2786,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) #ifdef CONFIG_SMP int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) { + unsigned long flags, cap; unsigned int dest_cpu; struct dl_bw *dl_b; bool overflow; - int cpus, ret; - unsigned long flags; + int ret; dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); rcu_read_lock_sched(); dl_b = dl_bw_of(dest_cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + cap = dl_bw_capacity(dest_cpu); + overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw); if (overflow) { ret = -EBUSY; } else { @@ -2737,6 +2808,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo * We will free resources in the source root_domain * later on (see set_cpus_allowed_dl()). 
*/ + int cpus = dl_bw_cpus(dest_cpu); + __dl_add(dl_b, p->dl.dl_bw, cpus); ret = 0; } @@ -2769,16 +2842,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, bool dl_cpu_busy(unsigned int cpu) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow; - int cpus; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); + cap = dl_bw_capacity(cpu); + overflow = __dl_overflow(dl_b, cap, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 658aa7a2ae6f..1a68a0536add 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -22,8 +22,6 @@ */ #include "sched.h" -#include <trace/events/sched.h> - /* * Targeted preemption latency for CPU-bound tasks: * @@ -3094,7 +3092,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #ifdef CONFIG_SMP do { - u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); @@ -3440,16 +3438,18 @@ static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider; /* Nothing to update */ if (!delta) return; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. 
+ */ + divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; se->avg.util_sum = se->avg.util_avg * divider; @@ -3463,16 +3463,18 @@ static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider; /* Nothing to update */ if (!delta) return; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; se->avg.runnable_sum = se->avg.runnable_avg * divider; @@ -3500,7 +3502,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. */ - divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + divider = get_pelt_divider(&cfs_rq->avg); if (runnable_sum >= 0) { /* @@ -3646,7 +3648,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq->removed.nr) { unsigned long r; - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); @@ -3701,7 +3703,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. 
*/ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); /* * When we attach the @se to the @cfs_rq, we must align the decay @@ -3922,6 +3924,8 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, enqueued = cfs_rq->avg.util_est.enqueued; enqueued += _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); } /* @@ -3952,6 +3956,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + trace_sched_util_est_cfs_tp(cfs_rq); + /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -4017,6 +4023,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; done: WRITE_ONCE(p->se.avg.util_est, ue); + + trace_sched_util_est_se_tp(&p->se); } static inline int task_fits_capacity(struct task_struct *p, long capacity) @@ -4039,7 +4047,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - rq->misfit_task_load = task_h_load(p); + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. 
+ */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } #else /* CONFIG_SMP */ @@ -5614,14 +5626,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } -dequeue_throttle: - if (!se) - sub_nr_running(rq, 1); + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, 1); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; +dequeue_throttle: util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -6497,7 +6509,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) max_util = max(max_util, cpu_util); } - return em_pd_energy(pd->em_pd, max_util, sum_util); + return em_cpu_energy(pd->em_pd, max_util, sum_util); } /* @@ -7157,7 +7169,7 @@ static void yield_task_fair(struct rq *rq) set_skip_buddy(se); } -static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -7638,7 +7650,14 @@ static int detach_tasks(struct lb_env *env) switch (env->migration_type) { case migrate_load: - load = task_h_load(p); + /* + * Depending of the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. 
+ */ + load = max_t(unsigned long, task_h_load(p), 1); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) @@ -8038,7 +8057,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long max = arch_scale_cpu_capacity(cpu); @@ -8070,7 +8089,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); @@ -8692,8 +8711,14 @@ static bool update_pick_idlest(struct sched_group *idlest, case group_has_spare: /* Select group with most idle CPUs */ - if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + if (idlest_sgs->idle_cpus > sgs->idle_cpus) return false; + + /* Select group with lowest group_util */ + if (idlest_sgs->idle_cpus == sgs->idle_cpus && + idlest_sgs->group_util <= sgs->group_util) + return false; + break; } @@ -10016,7 +10041,12 @@ static void kick_ilb(unsigned int flags) { int ilb_cpu; - nohz.next_balance++; + /* + * Increase nohz.next_balance only when if full ilb is triggered but + * not if we only update stats. + */ + if (flags & NOHZ_BALANCE_KICK) + nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); @@ -10337,6 +10367,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, } } + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. 
+ */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; + /* Newly idle CPU doesn't need an update */ if (idle != CPU_NEWLY_IDLE) { update_blocked_averages(this_cpu); @@ -10357,14 +10395,6 @@ abort: if (has_blocked_load) WRITE_ONCE(nohz.has_blocked, 1); - /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) - nohz.next_balance = next_balance; - return ret; } @@ -11107,8 +11137,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class = { - .next = &idle_sched_class, +const struct sched_class fair_sched_class + __attribute__((section("__fair_sched_class"))) = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -11281,3 +11311,9 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd) #endif } EXPORT_SYMBOL_GPL(sched_trace_rd_span); + +int sched_trace_rq_nr_running(struct rq *rq) +{ + return rq ? 
rq->nr_running : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1ae95b9150d3..6bf34986f45c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -453,11 +453,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) BUG(); } -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_idle(struct rq *rq) { } @@ -465,8 +460,8 @@ static void update_curr_idle(struct rq *rq) /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class = { - /* .next is NULL */ +const struct sched_class idle_sched_class + __attribute__((section("__idle_sched_class"))) = { /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ @@ -486,8 +481,6 @@ const struct sched_class idle_sched_class = { .task_tick = task_tick_idle, - .get_rr_interval = get_rr_interval_idle, - .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, .update_curr = update_curr_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 808244f3ddd9..5a6ea03f9882 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -140,7 +140,8 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned int flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | + HK_FLAG_MISC | HK_FLAG_KTHREAD; return housekeeping_setup(str, flags); } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index de22da666ac7..d2a655643a02 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -347,7 +347,7 @@ static inline void calc_global_nohz(void) { } * * Called from the global timer code. 
*/ -void calc_global_load(unsigned long ticks) +void calc_global_load(void) { unsigned long sample_window; long active, delta; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index b4b1ff96642f..2c613e1cff3a 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -28,8 +28,6 @@ #include "sched.h" #include "pelt.h" -#include <trace/events/sched.h> - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) @@ -83,8 +81,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 @@ -264,7 +260,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load) { - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(sa); /* * Step 2: update *_avg. diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index eb034d9f024d..795e43e02afc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,6 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +static inline u32 get_pelt_divider(struct sched_avg *avg) +{ + return LOAD_AVG_MAX - 1024 + avg->period_contrib; +} + /* * When a task is dequeued, its estimated utilization should not be update if * its util_avg has not been updated at least once. 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8f45cdb6463b..e53b711bd643 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -190,7 +190,6 @@ static void group_init(struct psi_group *group) INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ - atomic_set(&group->poll_scheduled, 0); mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); @@ -199,7 +198,7 @@ static void group_init(struct psi_group *group) memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; group->polling_until = 0; - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); } void __init psi_init(void) @@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } -/* - * Schedule polling if it's not already scheduled. It's safe to call even from - * hotpath because even though kthread_queue_delayed_work takes worker->lock - * spinlock that spinlock is never contended due to poll_scheduled atomic - * preventing such competition. - */ +/* Schedule polling if it's not already scheduled. */ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) { - struct kthread_worker *kworker; + struct task_struct *task; - /* Do not reschedule if already scheduled */ - if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + /* + * Do not reschedule if already scheduled. + * Possible race with a timer scheduled after this check but before + * mod_timer below can be tolerated because group->polling_next_update + * will keep updates on schedule. 
+ */ + if (timer_pending(&group->poll_timer)) return; rcu_read_lock(); - kworker = rcu_dereference(group->poll_kworker); + task = rcu_dereference(group->poll_task); /* * kworker might be NULL in case psi_trigger_destroy races with * psi_task_change (hotpath) which can't use locks */ - if (likely(kworker)) - kthread_queue_delayed_work(kworker, &group->poll_work, delay); - else - atomic_set(&group->poll_scheduled, 0); + if (likely(task)) + mod_timer(&group->poll_timer, jiffies + delay); rcu_read_unlock(); } -static void psi_poll_work(struct kthread_work *work) +static void psi_poll_work(struct psi_group *group) { - struct kthread_delayed_work *dwork; - struct psi_group *group; u32 changed_states; u64 now; - dwork = container_of(work, struct kthread_delayed_work, work); - group = container_of(dwork, struct psi_group, poll_work); - - atomic_set(&group->poll_scheduled, 0); - mutex_lock(&group->trigger_lock); now = sched_clock(); @@ -623,6 +613,35 @@ out: mutex_unlock(&group->trigger_lock); } +static int psi_poll_worker(void *data) +{ + struct psi_group *group = (struct psi_group *)data; + struct sched_param param = { + .sched_priority = 1, + }; + + sched_setscheduler_nocheck(current, SCHED_FIFO, &param); + + while (true) { + wait_event_interruptible(group->poll_wait, + atomic_cmpxchg(&group->poll_wakeup, 1, 0) || + kthread_should_stop()); + if (kthread_should_stop()) + break; + + psi_poll_work(group); + } + return 0; +} + +static void poll_timer_fn(struct timer_list *t) +{ + struct psi_group *group = from_timer(group, t, poll_timer); + + atomic_set(&group->poll_wakeup, 1); + wake_up_interruptible(&group->poll_wait); +} + static void record_times(struct psi_group_cpu *groupc, int cpu, bool memstall_tick) { @@ -1099,22 +1118,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, mutex_lock(&group->trigger_lock); - if (!rcu_access_pointer(group->poll_kworker)) { - struct sched_param param = { - .sched_priority = 1, - }; - struct kthread_worker *kworker; + if 
(!rcu_access_pointer(group->poll_task)) { + struct task_struct *task; - kworker = kthread_create_worker(0, "psimon"); - if (IS_ERR(kworker)) { + task = kthread_create(psi_poll_worker, group, "psimon"); + if (IS_ERR(task)) { kfree(t); mutex_unlock(&group->trigger_lock); - return ERR_CAST(kworker); + return ERR_CAST(task); } - sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param); - kthread_init_delayed_work(&group->poll_work, - psi_poll_work); - rcu_assign_pointer(group->poll_kworker, kworker); + atomic_set(&group->poll_wakeup, 0); + init_waitqueue_head(&group->poll_wait); + wake_up_process(task); + timer_setup(&group->poll_timer, poll_timer_fn, 0); + rcu_assign_pointer(group->poll_task, task); } list_add(&t->node, &group->triggers); @@ -1132,7 +1149,7 @@ static void psi_trigger_destroy(struct kref *ref) { struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); struct psi_group *group = t->group; - struct kthread_worker *kworker_to_destroy = NULL; + struct task_struct *task_to_destroy = NULL; if (static_branch_likely(&psi_disabled)) return; @@ -1158,13 +1175,13 @@ static void psi_trigger_destroy(struct kref *ref) period = min(period, div_u64(tmp->win.size, UPDATES_PER_WINDOW)); group->poll_min_period = period; - /* Destroy poll_kworker when the last trigger is destroyed */ + /* Destroy poll_task when the last trigger is destroyed */ if (group->poll_states == 0) { group->polling_until = 0; - kworker_to_destroy = rcu_dereference_protected( - group->poll_kworker, + task_to_destroy = rcu_dereference_protected( + group->poll_task, lockdep_is_held(&group->trigger_lock)); - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); } } @@ -1172,25 +1189,23 @@ static void psi_trigger_destroy(struct kref *ref) /* * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_kworker RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_kworker + * poll_task RCUs 
to complete their read-side critical sections + * before destroying the trigger and optionally the poll_task */ synchronize_rcu(); /* * Destroy the kworker after releasing trigger_lock to prevent a * deadlock while waiting for psi_poll_work to acquire trigger_lock */ - if (kworker_to_destroy) { + if (task_to_destroy) { /* * After the RCU grace period has expired, the worker - * can no longer be found through group->poll_kworker. + * can no longer be found through group->poll_task. * But it might have been already scheduled before * that - deschedule it cleanly before destroying it. */ - kthread_cancel_delayed_work_sync(&group->poll_work); - atomic_set(&group->poll_scheduled, 0); - - kthread_destroy_worker(kworker_to_destroy); + del_timer_sync(&group->poll_timer); + kthread_stop(task_to_destroy); } kfree(t); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f395ddb75f38..f215eea6a966 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2429,8 +2429,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -const struct sched_class rt_sched_class = { - .next = &fair_sched_class, +const struct sched_class rt_sched_class + __attribute__((section("__rt_sched_class"))) = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 877fb08eb1b0..3fd283892761 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -67,6 +67,7 @@ #include <linux/tsacct_kern.h> #include <asm/tlb.h> +#include <asm-generic/vmlinux.lds.h> #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> @@ -75,6 +76,8 @@ #include "cpupri.h" #include "cpudeadline.h" +#include <trace/events/sched.h> + #ifdef CONFIG_SCHED_DEBUG # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else @@ -96,6 +99,7 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); 
+extern void call_trace_sched_update_nr_running(struct rq *rq, int count); /* * Helpers for converting nanosecond timing to jiffy resolution */ @@ -310,11 +314,26 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) __dl_update(dl_b, -((s32)tsk_bw / cpus)); } -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap, + u64 old_bw, u64 new_bw) { return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; +} + +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the CPU original capacity of the + * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the + * task and false otherwise. + */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime; } extern void init_dl_bw(struct dl_bw *dl_b); @@ -862,6 +881,8 @@ struct uclamp_rq { unsigned int value; struct uclamp_bucket bucket[UCLAMP_BUCKETS]; }; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ /* @@ -1182,6 +1203,16 @@ struct rq_flags { #endif }; +/* + * Lockdep annotation that avoids accidental unlocks; it's like a + * sticky/continuous lockdep_assert_held(). + * + * This avoids code that has access to 'struct rq *rq' (basically everything in + * the scheduler) from accidentally unlocking the rq if they do not also have a + * copy of the (on-stack) 'struct rq_flags rf'. + * + * Also see Documentation/locking/lockdep-design.rst. 
+ */ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(&rq->lock); @@ -1739,7 +1770,6 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) struct sched_class { - const struct sched_class *next; #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; @@ -1748,7 +1778,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); - bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); @@ -1796,7 +1826,7 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif -}; +} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { @@ -1810,17 +1840,18 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } -#ifdef CONFIG_SMP -#define sched_class_highest (&stop_sched_class) -#else -#define sched_class_highest (&dl_sched_class) -#endif +/* Defined in include/asm-generic/vmlinux.lds.h */ +extern struct sched_class __begin_sched_classes[]; +extern struct sched_class __end_sched_classes[]; + +#define sched_class_highest (__end_sched_classes - 1) +#define sched_class_lowest (__begin_sched_classes - 1) #define for_class_range(class, _from, _to) \ - for (class = (_from); class != (_to); class = class->next) + for (class = (_from); class != (_to); class--) #define for_each_class(class) \ - for_class_range(class, sched_class_highest, NULL) + for_class_range(class, sched_class_highest, sched_class_lowest) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ 
-1930,12 +1961,7 @@ extern int __init sched_tick_offload_init(void); */ static inline void sched_update_tick_dependency(struct rq *rq) { - int cpu; - - if (!tick_nohz_full_enabled()) - return; - - cpu = cpu_of(rq); + int cpu = cpu_of(rq); if (!tick_nohz_full_cpu(cpu)) return; @@ -1955,6 +1981,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count) unsigned prev_nr = rq->nr_running; rq->nr_running = prev_nr + count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { @@ -1969,6 +1998,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count) static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } + /* Check if we still need preemption */ sched_update_tick_dependency(rq); } @@ -2016,6 +2049,16 @@ void arch_scale_freq_tick(void) #endif #ifndef arch_scale_freq_capacity +/** + * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. + * @cpu: the CPU in question. + * + * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. + * + * f_curr + * ------ * SCHED_CAPACITY_SCALE + * f_max + */ static __always_inline unsigned long arch_scale_freq_capacity(int cpu) { @@ -2349,12 +2392,35 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/** + * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. 
+ * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ static __always_inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, struct task_struct *p) { - unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long min_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); if (p) { min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); @@ -2371,6 +2437,19 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, return clamp(util, min_util, max_util); } + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. 
+ */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, @@ -2378,6 +2457,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, { return util; } + +static inline bool uclamp_is_used(void) +{ + return false; +} #endif /* CONFIG_UCLAMP_TASK */ #ifdef arch_scale_freq_capacity diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4c9e9975684f..394bc8126a1e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -102,12 +102,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) BUG(); /* how!?, what priority? */ } -static unsigned int -get_rr_interval_stop(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_stop(struct rq *rq) { } @@ -115,8 +109,8 @@ static void update_curr_stop(struct rq *rq) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -const struct sched_class stop_sched_class = { - .next = &dl_sched_class, +const struct sched_class stop_sched_class + __attribute__((section("__stop_sched_class"))) = { .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, @@ -136,8 +130,6 @@ const struct sched_class stop_sched_class = { .task_tick = task_tick_stop, - .get_rr_interval = get_rr_interval_stop, - .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ba81187bb7af..007b0a6b0152 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -272,10 +272,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map, printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); while (pd) { - printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", cpumask_first(perf_domain_span(pd)), 
cpumask_pr_args(perf_domain_span(pd)), - em_pd_nr_cap_states(pd->em_pd)); + em_pd_nr_perf_states(pd->em_pd)); pd = pd->next; } @@ -313,26 +313,26 @@ static void sched_energy_set(bool has_eas) * * The complexity of the Energy Model is defined as: * - * C = nr_pd * (nr_cpus + nr_cs) + * C = nr_pd * (nr_cpus + nr_ps) * * with parameters defined as: * - nr_pd: the number of performance domains * - nr_cpus: the number of CPUs - * - nr_cs: the sum of the number of capacity states of all performance + * - nr_ps: the sum of the number of performance states of all performance * domains (for example, on a system with 2 performance domains, - * with 10 capacity states each, nr_cs = 2 * 10 = 20). + * with 10 performance states each, nr_ps = 2 * 10 = 20). * * It is generally not a good idea to use such a model in the wake-up path on * very complex platforms because of the associated scheduling overheads. The * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 capacity states each, for example. + * with per-CPU DVFS and less than 8 performance states each, for example. */ #define EM_MAX_COMPLEXITY 2048 extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); + int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; @@ -384,15 +384,15 @@ static bool build_perf_domains(const struct cpumask *cpu_map) pd = tmp; /* - * Count performance domains and capacity states for the + * Count performance domains and performance states for the * complexity check. */ nr_pd++; - nr_cs += em_pd_nr_cap_states(pd->em_pd); + nr_ps += em_pd_nr_perf_states(pd->em_pd); } /* Bail out if the Energy Model complexity is too high. 
*/ - if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { + if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", cpumask_pr_args(cpu_map)); goto free; @@ -1328,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl, sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; + sd_flags &= TOPOLOGY_SD_FLAGS; /* Apply detected topology flags */ sd_flags |= dflags; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index ba059fbfc53a..01f5d3020589 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -389,7 +389,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wq_entry->entry); + list_del_init_careful(&wq_entry->entry); return ret; } diff --git a/kernel/signal.c b/kernel/signal.c index 5ca48cc5da76..6f16f7c5d375 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -719,7 +719,7 @@ static int dequeue_synchronous_signal(kernel_siginfo_t *info) * Return the first synchronous signal in the queue. */ list_for_each_entry(q, &pending->list, list) { - /* Synchronous signals have a postive si_code */ + /* Synchronous signals have a positive si_code */ if ((q->info.si_code > SI_USER) && (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { sync = q; @@ -2529,9 +2529,6 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; - if (unlikely(current->task_works)) - task_work_run(); - if (unlikely(uprobe_deny_signal())) return false; @@ -2544,6 +2541,13 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + if (unlikely(current->task_works)) { + spin_unlock_irq(&sighand->siglock); + task_work_run(); + goto relock; + } + /* * Every stopped thread goes here after wakeup. 
Check to see if * we should notify the parent, prepare_signal(SIGCONT) encodes diff --git a/kernel/smp.c b/kernel/smp.c index aa17eedff5be..d0ae8eb6bf8b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -634,8 +634,7 @@ static int __init nrcpus(char *str) { int nr_cpus; - get_option(&str, &nr_cpus); - if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) nr_cpu_ids = nr_cpus; return 0; diff --git a/kernel/softirq.c b/kernel/softirq.c index c4201b7f42b1..bf88d7f62433 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned long pending) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS + +DEFINE_PER_CPU(int, hardirqs_enabled); +DEFINE_PER_CPU(int, hardirq_context); +EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -224,7 +230,7 @@ static inline bool lockdep_softirq_start(void) { bool in_hardirq = false; - if (lockdep_hardirq_context(current)) { + if (lockdep_hardirq_context()) { in_hardirq = true; lockdep_hardirq_exit(); } @@ -547,7 +553,10 @@ static void tasklet_action_common(struct softirq_action *a, if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) BUG(); - t->func(t->data); + if (t->use_callback) + t->callback(t); + else + t->func(t->data); tasklet_unlock(t); continue; } @@ -573,6 +582,18 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a) tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } +void tasklet_setup(struct tasklet_struct *t, + void (*callback)(struct tasklet_struct *)) +{ + t->next = NULL; + t->state = 0; + atomic_set(&t->count, 0); + t->callback = callback; + t->use_callback = true; + t->data = 0; +} +EXPORT_SYMBOL(tasklet_setup); + void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data) { @@ 
-580,6 +601,7 @@ void tasklet_init(struct tasklet_struct *t, t->state = 0; atomic_set(&t->count, 0); t->func = func; + t->use_callback = false; t->data = data; } EXPORT_SYMBOL(tasklet_init); diff --git a/kernel/stackleak.c b/kernel/stackleak.c index b193a59fc05b..a8fc9ae1d03d 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -104,19 +104,9 @@ asmlinkage void notrace stackleak_erase(void) } NOKPROBE_SYMBOL(stackleak_erase); -void __used notrace stackleak_track_stack(void) +void __used __no_caller_saved_registers notrace stackleak_track_stack(void) { - /* - * N.B. stackleak_erase() fills the kernel stack with the poison value, - * which has the register width. That code assumes that the value - * of 'lowest_stack' is aligned on the register width boundary. - * - * That is true for x86 and x86_64 because of the kernel stack - * alignment on these platforms (for details, see 'cc_stack_align' in - * arch/x86/Makefile). Take care of that when you port STACKLEAK to - * new platforms. 
- */ - unsigned long sp = (unsigned long)&sp; + unsigned long sp = current_stack_pointer; /* * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than @@ -125,6 +115,8 @@ void __used notrace stackleak_track_stack(void) */ BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH); + /* 'lowest_stack' should be aligned on the register width boundary */ + sp = ALIGN(sp, sizeof(unsigned long)); if (sp < current->lowest_stack && sp >= (unsigned long)task_stack_page(current) + sizeof(unsigned long)) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index db1ce7af2563..1b4d2dc270a5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1780,6 +1780,20 @@ static struct ctl_table kern_table[] = { .proc_handler = sched_rt_handler, }, { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "sched_rr_timeslice_ms", .data = &sysctl_sched_rr_timeslice, .maxlen = sizeof(int), @@ -1801,6 +1815,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler, }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, #endif #ifdef CONFIG_SCHED_AUTOGROUP { diff --git a/kernel/task_work.c b/kernel/task_work.c index 825f28259a19..5c0848ca1287 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -25,9 +25,10 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * 0 if succeeds or -ESRCH. 
*/ int -task_work_add(struct task_struct *task, struct callback_head *work, bool notify) +task_work_add(struct task_struct *task, struct callback_head *work, int notify) { struct callback_head *head; + unsigned long flags; do { head = READ_ONCE(task->task_works); @@ -36,8 +37,19 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) work->next = head; } while (cmpxchg(&task->task_works, head, work) != head); - if (notify) + switch (notify) { + case TWA_RESUME: set_notify_resume(task); + break; + case TWA_SIGNAL: + if (lock_task_sighand(task, &flags)) { + task->jobctl |= JOBCTL_TASK_WORK; + signal_wake_up(task, 0); + unlock_task_sighand(task, &flags); + } + break; + } + return 0; } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fa3f800d7d76..0deaf4b79fb4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -20,31 +20,6 @@ #include "timekeeping.h" /** - * struct clock_read_data - data required to read from sched_clock() - * - * @epoch_ns: sched_clock() value at last update - * @epoch_cyc: Clock cycle value at last update. - * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit - * clocks. - * @read_sched_clock: Current clock source (or dummy source when suspended). - * @mult: Multipler for scaled math conversion. - * @shift: Shift value for scaled math conversion. - * - * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=40 bytes and, when combined - * with the seqcount used to synchronize access, comfortably fits into - * a 64 byte cache line. 
- */ -struct clock_read_data { - u64 epoch_ns; - u64 epoch_cyc; - u64 sched_clock_mask; - u64 (*read_sched_clock)(void); - u32 mult; - u32 shift; -}; - -/** * struct clock_data - all data needed for sched_clock() (including * registration of a new clock source) * @@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } +struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +{ + *seq = raw_read_seqcount_latch(&cd.seq); + return cd.read_data + (*seq & 1); +} + +int sched_clock_read_retry(unsigned int seq) +{ + return read_seqcount_retry(&cd.seq, seq); +} + unsigned long long notrace sched_clock(void) { u64 cyc, res; @@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void) struct clock_read_data *rd; do { - seq = raw_read_seqcount(&cd.seq); - rd = cd.read_data + (seq & 1); + rd = sched_clock_read_begin(&seq); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); - } while (read_seqcount_retry(&cd.seq, seq)); + } while (sched_clock_read_retry(seq)); return res; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e2dc9b8858c..f0199a4ba1ad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* - * Set a per-task tick dependency. Posix CPU timers need this in order to elapse - * per task timers. + * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { - /* - * We could optimize this with just kicking the target running the task - * if that noise matters for nohz full users. 
- */ - tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); + if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) { + if (tsk == current) { + preempt_disable(); + tick_nohz_full_kick(); + preempt_enable(); + } else { + /* + * Some future tick_nohz_full_kick_task() + * should optimize this. + */ + tick_nohz_full_kick_all(); + } + } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d20d489841c8..63a632f9896c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2193,7 +2193,7 @@ EXPORT_SYMBOL(ktime_get_coarse_ts64); void do_timer(unsigned long ticks) { jiffies_64 += ticks; - calc_global_load(ticks); + calc_global_load(); } /** diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 398e6eadb861..026ac01af9da 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -43,6 +43,7 @@ #include <linux/sched/debug.h> #include <linux/slab.h> #include <linux/compat.h> +#include <linux/random.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -521,8 +522,8 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk) * Force expire obscene large timeouts to expire at the * capacity limit of the wheel. 
*/ - if (expires >= WHEEL_TIMEOUT_CUTOFF) - expires = WHEEL_TIMEOUT_MAX; + if (delta >= WHEEL_TIMEOUT_CUTOFF) + expires = clk + WHEEL_TIMEOUT_MAX; idx = calc_index(expires, LVL_DEPTH - 1); } @@ -584,7 +585,15 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) * Set the next expiry time and kick the CPU so it can reevaluate the * wheel: */ - base->next_expiry = timer->expires; + if (time_before(timer->expires, base->clk)) { + /* + * Prevent from forward_timer_base() moving the base->clk + * backward + */ + base->next_expiry = base->clk; + } else { + base->next_expiry = timer->expires; + } wake_up_nohz_cpu(base->cpu); } @@ -896,10 +905,13 @@ static inline void forward_timer_base(struct timer_base *base) * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jnow)) + if (time_after(base->next_expiry, jnow)) { base->clk = jnow; - else + } else { + if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) + return; base->clk = base->next_expiry; + } #endif } @@ -1731,6 +1743,13 @@ void update_process_times(int user_tick) scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); + + /* The current CPU might make use of net randoms without receiving IRQs + * to renew them often enough. Let's update the net_rand_state from a + * non-constant value that's not affine to the number of calls to make + * sure it's updated when there's some activity (we don't care in idle). + */ + this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick); } /** diff --git a/kernel/torture.c b/kernel/torture.c index a1a41484ff6d..1061492f14bd 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -45,6 +45,9 @@ MODULE_AUTHOR("Paul E. 
McKenney <paulmck@linux.ibm.com>"); static bool disable_onoff_at_boot; module_param(disable_onoff_at_boot, bool, 0444); +static bool ftrace_dump_at_shutdown; +module_param(ftrace_dump_at_shutdown, bool, 0444); + static char *torture_type; static int verbose; @@ -527,7 +530,8 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); - rcu_ftrace_dump(DUMP_ALL); + if (ftrace_dump_at_shutdown) + rcu_ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5ef0484513ec..7ba62d68885a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -348,7 +348,7 @@ static int __blk_trace_remove(struct request_queue *q) struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (!bt) return -EINVAL; @@ -362,9 +362,9 @@ int blk_trace_remove(struct request_queue *q) { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_remove(q); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -483,12 +483,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct dentry *dir = NULL; int ret; + lockdep_assert_held(&q->debugfs_mutex); + if (!buts->buf_size || !buts->buf_nr) return -EINVAL; - if (!blk_debugfs_root) - return -ENOENT; - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; @@ -503,7 +502,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * we can be. 
*/ if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex))) { + lockdep_is_held(&q->debugfs_mutex))) { pr_warn("Concurrent blktraces are not allowed on %s\n", buts->name); return -EBUSY; @@ -522,12 +521,29 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->msg_data) goto err; - ret = -ENOENT; - - dir = debugfs_lookup(buts->name, blk_debugfs_root); - if (!dir) + /* + * When tracing the whole disk reuse the existing debugfs directory + * created by the block layer on init. For partitions block devices, + * and scsi-generic block devices we create a temporary new debugfs + * directory that will be removed once the trace ends. + */ + if (bdev && bdev == bdev->bd_contains) + dir = q->debugfs_dir; + else bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + /* + * As blktrace relies on debugfs for its interface the debugfs directory + * is required, contrary to the usual mantra of not checking for debugfs + * files or directories. 
+ */ + if (IS_ERR_OR_NULL(dir)) { + pr_warn("debugfs_dir not present for %s so skipping\n", + buts->name); + ret = -ENOENT; + goto err; + } + bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); @@ -563,8 +579,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: - if (dir && !bt->dir) - dput(dir); if (ret) blk_trace_free(bt); return ret; @@ -597,9 +611,9 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_setup(q, name, dev, bdev, arg); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -645,7 +659,7 @@ static int __blk_trace_startstop(struct request_queue *q, int start) struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; @@ -685,9 +699,9 @@ int blk_trace_startstop(struct request_queue *q, int start) { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_startstop(q, start); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -716,7 +730,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); switch (cmd) { case BLKTRACESETUP: @@ -743,7 +757,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -754,14 +768,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex))) { + 
lockdep_is_held(&q->debugfs_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); } #ifdef CONFIG_BLK_CGROUP @@ -846,6 +860,13 @@ static void blk_add_trace_rq_issue(void *ignore, blk_trace_request_get_cgid(q, rq)); } +static void blk_add_trace_rq_merge(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, + blk_trace_request_get_cgid(q, rq)); +} + static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) @@ -1130,6 +1151,8 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); + ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); + WARN_ON(ret); ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); @@ -1176,6 +1199,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); @@ -1642,7 +1666,7 @@ static int blk_trace_remove_queue(struct request_queue *q) struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; @@ -1817,10 +1841,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + 
lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; @@ -1838,7 +1862,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); out_bdput: bdput(bdev); out: @@ -1881,10 +1905,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { if (!!value == !!bt) { ret = 0; @@ -1901,7 +1925,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); } if (ret == 0) { @@ -1916,7 +1940,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); out_bdput: bdput(bdev); out: diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1903b80db6eb..72064541bef2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2764,6 +2764,50 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } +/* List of trace_ops that have allocated trampolines */ +static LIST_HEAD(ftrace_ops_trampoline_list); + +static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_add_rcu(&ops->list, &ftrace_ops_trampoline_list); +} + +static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_del_rcu(&ops->list); +} + +/* + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols + * for 
pages allocated for ftrace purposes, even though "__builtin__ftrace" is + * not a module. + */ +#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace" +#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline" + +static void ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && + ops->trampoline) { + /* + * Record the text poke event before the ksymbol unregister + * event. + */ + perf_event_text_poke((void *)ops->trampoline, + (void *)ops->trampoline, + ops->trampoline_size, NULL, 0); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, + true, FTRACE_TRAMPOLINE_SYM); + /* Remove from kallsyms after the perf events */ + ftrace_remove_trampoline_from_kallsyms(ops); + } + + arch_ftrace_trampoline_free(ops); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2934,7 +2978,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) synchronize_rcu_tasks(); free_ops: - arch_ftrace_trampoline_free(ops); + ftrace_trampoline_free(ops); } return 0; @@ -6178,6 +6222,27 @@ struct ftrace_mod_map { unsigned int num_funcs; }; +static int ftrace_get_trampoline_kallsym(unsigned int symnum, + unsigned long *value, char *type, + char *name, char *module_name, + int *exported) +{ + struct ftrace_ops *op; + + list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) { + if (!op->trampoline || symnum--) + continue; + *value = op->trampoline; + *type = 't'; + strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + *exported = 0; + return 0; + } + + return -ERANGE; +} + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6514,6 +6579,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, { struct ftrace_mod_map *mod_map; struct ftrace_mod_func *mod_func; + int ret; preempt_disable(); list_for_each_entry_rcu(mod_map, 
&ftrace_mod_maps, list) { @@ -6540,8 +6606,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, WARN_ON(1); break; } + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); preempt_enable(); - return -ERANGE; + return ret; } #else @@ -6553,6 +6621,18 @@ allocate_ftrace_mod_map(struct module *mod, { return NULL; } +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, char *module_name, + int *exported) +{ + int ret; + + preempt_disable(); + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); + preempt_enable(); + return ret; +} #endif /* CONFIG_MODULES */ struct ftrace_init_func { @@ -6733,7 +6813,24 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void ftrace_update_trampoline(struct ftrace_ops *ops) { + unsigned long trampoline = ops->trampoline; + arch_ftrace_update_trampoline(ops); + if (ops->trampoline && ops->trampoline != trampoline && + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { + /* Add to kallsyms before the perf events */ + ftrace_add_trampoline_to_kallsyms(ops); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, false, + FTRACE_TRAMPOLINE_SYM); + /* + * Record the perf text poke event after the ksymbol register + * event. + */ + perf_event_text_poke((void *)ops->trampoline, NULL, 0, + (void *)ops->trampoline, + ops->trampoline_size); + } } void ftrace_init_trace_array(struct trace_array *tr) |