diff options
Diffstat (limited to 'kernel')
88 files changed, 3447 insertions, 1850 deletions
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 042f95534f86..68a89a9f7ccd 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -482,13 +482,21 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) prev_state = cmpxchg(&st_map->kvalue.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); - if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + switch (prev_state) { + case BPF_STRUCT_OPS_STATE_INUSE: st_map->st_ops->unreg(&st_map->kvalue.data); if (refcount_dec_and_test(&st_map->kvalue.refcnt)) bpf_map_put(map); + return 0; + case BPF_STRUCT_OPS_STATE_TOBEFREE: + return -EINPROGRESS; + case BPF_STRUCT_OPS_STATE_INIT: + return -ENOENT; + default: + WARN_ON_ONCE(1); + /* Should never happen. Treat it as not found. */ + return -ENOENT; } - - return 0; } static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 787140095e58..7787bdcb5d68 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2418,7 +2418,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env, struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - if (struct_size - bytes_offset < sizeof(int)) { + if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; @@ -4564,7 +4564,7 @@ int btf_get_info_by_fd(const struct btf *btf, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo; - struct bpf_btf_info info = {}; + struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; u32 uinfo_len; @@ -4573,6 +4573,7 @@ int btf_get_info_by_fd(const struct btf *btf, uinfo_len = attr->info.info_len; info_copy = min_t(u32, uinfo_len, sizeof(info)); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 
9a500fadbef5..4f1472409ef8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -227,6 +227,9 @@ cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); return -ENOMEM; @@ -302,8 +305,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], - *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl, *replace_pl = NULL; enum bpf_cgroup_storage_type stype; int err; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a91ad518c050..966b7b34cde0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -696,14 +696,15 @@ int bpf_get_file_flag(int flags) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. - * Return 0 on success and < 0 on error. +/* dst and src must have at least "size" number of bytes. + * Return strlen on success and < 0 on error. */ -static int bpf_obj_name_cpy(char *dst, const char *src) +int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) { - const char *end = src + BPF_OBJ_NAME_LEN; + const char *end = src + size; + const char *orig_src = src; - memset(dst, 0, BPF_OBJ_NAME_LEN); + memset(dst, 0, size); /* Copy all isalnum(), '_' and '.' chars. 
*/ while (src < end && *src) { if (!isalnum(*src) && @@ -712,11 +713,11 @@ static int bpf_obj_name_cpy(char *dst, const char *src) *dst++ = *src++; } - /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + /* No '\0' found in "size" number of bytes */ if (src == end) return -EINVAL; - return 0; + return src - orig_src; } int map_check_no_btf(const struct bpf_map *map, @@ -810,8 +811,9 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = bpf_obj_name_cpy(map->name, attr->map_name); - if (err) + err = bpf_obj_name_cpy(map->name, attr->map_name, + sizeof(attr->map_name)); + if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); @@ -1510,6 +1512,11 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + fdput(f); + return -ENOTSUPP; + } + mutex_lock(&map->freeze_mutex); if (map->writecnt) { @@ -2093,8 +2100,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); - err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); - if (err) + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, + sizeof(attr->prog_name)); + if (err < 0) goto free_prog; /* run eBPF verifier */ @@ -2787,7 +2795,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_prog_info info = {}; + struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_stats stats; char __user *uinsns; @@ -2799,6 +2807,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; @@ -3062,7 +3071,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, union bpf_attr __user *uattr) { struct 
bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_map_info info = {}; + struct bpf_map_info info; u32 info_len = attr->info.info_len; int err; @@ -3071,6 +3080,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -3354,7 +3364,7 @@ err_put: SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { - union bpf_attr attr = {}; + union bpf_attr attr; int err; if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) @@ -3366,6 +3376,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + memset(&attr, 0, sizeof(attr)); if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index be1a1c83cdd1..f2d7cea86ffe 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -471,6 +471,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) */ p++; if (p >= end) { + (*pos)++; return NULL; } else { *pos = *p; @@ -782,7 +783,7 @@ void cgroup1_release_agent(struct work_struct *work) pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) + if (!pathbuf || !agentbuf || !strlen(agentbuf)) goto out; spin_lock_irq(&css_set_lock); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 75f687301bbf..3dead0416b91 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3542,21 +3542,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? 
&psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -4400,12 +4400,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) } } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - if (!list_empty(&cset->tasks)) + if (!list_empty(&cset->tasks)) { it->task_pos = cset->tasks.next; - else if (!list_empty(&cset->mg_tasks)) + it->cur_tasks_head = &cset->tasks; + } else if (!list_empty(&cset->mg_tasks)) { it->task_pos = cset->mg_tasks.next; - else + it->cur_tasks_head = &cset->mg_tasks; + } else { it->task_pos = cset->dying_tasks.next; + it->cur_tasks_head = &cset->dying_tasks; + } it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; @@ -4463,10 +4467,14 @@ repeat: else it->task_pos = it->task_pos->next; - if (it->task_pos == it->tasks_head) + if (it->task_pos == it->tasks_head) { it->task_pos = it->mg_tasks_head->next; - if (it->task_pos == it->mg_tasks_head) + it->cur_tasks_head = it->mg_tasks_head; + } + if (it->task_pos == it->mg_tasks_head) { it->task_pos = it->dying_tasks_head->next; + it->cur_tasks_head = it->dying_tasks_head; + } if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { @@ -4485,11 +4493,12 @@ repeat: goto repeat; /* and dying leaders w/o live member threads */ - if 
(!atomic_read(&task->signal->live)) + if (it->cur_tasks_head == it->dying_tasks_head && + !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ - if (task->flags & PF_EXITING) + if (it->cur_tasks_head == it->dying_tasks_head) goto repeat; } } @@ -4595,6 +4604,9 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; + if (pos) + (*pos)++; + return css_task_iter_next(it); } @@ -4610,7 +4622,7 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, * from position 0, so we can simply keep iterating on !0 *pos. */ if (!it) { - if (WARN_ON_ONCE((*pos)++)) + if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); it = kzalloc(sizeof(*it), GFP_KERNEL); @@ -4618,10 +4630,11 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, return ERR_PTR(-ENOMEM); of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); - } else if (!(*pos)++) { + } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); - } + } else + return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } @@ -6258,6 +6271,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) return; } + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) + return; + rcu_read_lock(); while (true) { diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 0296b4bda8f1..ce430885c26c 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -198,11 +198,13 @@ void __init context_tracking_cpu_set(int cpu) if (initialized) return; +#ifdef CONFIG_HAVE_TIF_NOHZ /* * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork * This assumes that init is the only task at this early boot stage. 
*/ set_tsk_thread_flag(&init_task, TIF_NOHZ); +#endif WARN_ON_ONCE(!tasklist_empty()); initialized = true; diff --git a/kernel/cpu.c b/kernel/cpu.c index 9c706af713fb..2371292f30b0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void) static void lockdep_acquire_cpus_lock(void) { - rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); + rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); } static void lockdep_release_cpus_lock(void) { - rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_); + rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_); } /* @@ -1041,7 +1041,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) return _cpu_down(cpu, 0, target); } -static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) +static int cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; @@ -1051,11 +1051,72 @@ static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) return err; } -int cpu_down(unsigned int cpu) +/** + * cpu_device_down - Bring down a cpu device + * @dev: Pointer to the cpu device to offline + * + * This function is meant to be used by device core cpu subsystem only. + * + * Other subsystems should use remove_cpu() instead. + */ +int cpu_device_down(struct device *dev) { - return do_cpu_down(cpu, CPUHP_OFFLINE); + return cpu_down(dev->id, CPUHP_OFFLINE); +} + +int remove_cpu(unsigned int cpu) +{ + int ret; + + lock_device_hotplug(); + ret = device_offline(get_cpu_device(cpu)); + unlock_device_hotplug(); + + return ret; +} +EXPORT_SYMBOL_GPL(remove_cpu); + +void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) +{ + unsigned int cpu; + int error; + + cpu_maps_update_begin(); + + /* + * Make certain the cpu I'm about to reboot on is online. + * + * This is in line with what migrate_to_reboot_cpu() already does. 
+ */ + if (!cpu_online(primary_cpu)) + primary_cpu = cpumask_first(cpu_online_mask); + + for_each_online_cpu(cpu) { + if (cpu == primary_cpu) + continue; + + error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); + if (error) { + pr_err("Failed to offline CPU%d - error=%d", + cpu, error); + break; + } + } + + /* + * Ensure all but the reboot CPU are offline. + */ + BUG_ON(num_online_cpus() > 1); + + /* + * Make sure the CPUs won't be enabled by someone else after this + * point. Kexec will reboot to a new kernel shortly resetting + * everything along the way. + */ + cpu_hotplug_disabled++; + + cpu_maps_update_done(); } -EXPORT_SYMBOL(cpu_down); #else #define takedown_cpu NULL @@ -1124,8 +1185,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) } /* - * The caller of do_cpu_up might have raced with another - * caller. Ignore it for now. + * The caller of cpu_up() might have raced with another + * caller. Nothing to do. */ if (st->state >= target) goto out; @@ -1169,7 +1230,7 @@ out: return ret; } -static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) +static int cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; @@ -1203,16 +1264,70 @@ out: return err; } -int cpu_up(unsigned int cpu) +/** + * cpu_device_up - Bring up a cpu device + * @dev: Pointer to the cpu device to online + * + * This function is meant to be used by device core cpu subsystem only. + * + * Other subsystems should use add_cpu() instead. + */ +int cpu_device_up(struct device *dev) +{ + return cpu_up(dev->id, CPUHP_ONLINE); +} + +int add_cpu(unsigned int cpu) +{ + int ret; + + lock_device_hotplug(); + ret = device_online(get_cpu_device(cpu)); + unlock_device_hotplug(); + + return ret; +} +EXPORT_SYMBOL_GPL(add_cpu); + +/** + * bringup_hibernate_cpu - Bring up the CPU that we hibernated on + * @sleep_cpu: The cpu we hibernated on and should be brought up. 
+ * + * On some architectures like arm64, we can hibernate on any CPU, but on + * wake up the CPU we hibernated on might be offline as a side effect of + * using maxcpus= for example. + */ +int bringup_hibernate_cpu(unsigned int sleep_cpu) { - return do_cpu_up(cpu, CPUHP_ONLINE); + int ret; + + if (!cpu_online(sleep_cpu)) { + pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n"); + ret = cpu_up(sleep_cpu, CPUHP_ONLINE); + if (ret) { + pr_err("Failed to bring hibernate-CPU up!\n"); + return ret; + } + } + return 0; +} + +void bringup_nonboot_cpus(unsigned int setup_max_cpus) +{ + unsigned int cpu; + + for_each_present_cpu(cpu) { + if (num_online_cpus() >= setup_max_cpus) + break; + if (!cpu_online(cpu)) + cpu_up(cpu, CPUHP_ONLINE); + } } -EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int freeze_secondary_cpus(int primary) +int __freeze_secondary_cpus(int primary, bool suspend) { int cpu, error = 0; @@ -1237,7 +1352,7 @@ int freeze_secondary_cpus(int primary) if (cpu == primary) continue; - if (pm_wakeup_pending()) { + if (suspend && pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; @@ -2028,9 +2143,9 @@ static ssize_t write_cpuhp_target(struct device *dev, goto out; if (st->state < target) - ret = do_cpu_up(dev->id, target); + ret = cpu_up(dev->id, target); else - ret = do_cpu_down(dev->id, target); + ret = cpu_down(dev->id, target); out: unlock_device_hotplug(); return ret ? 
ret : count; diff --git a/kernel/events/core.c b/kernel/events/core.c index e453589da97c..d22e4ba59dfa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -49,6 +49,7 @@ #include <linux/sched/mm.h> #include <linux/proc_ns.h> #include <linux/mount.h> +#include <linux/min_heap.h> #include "internal.h" @@ -891,6 +892,47 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, rcu_read_unlock(); } +static int perf_cgroup_ensure_storage(struct perf_event *event, + struct cgroup_subsys_state *css) +{ + struct perf_cpu_context *cpuctx; + struct perf_event **storage; + int cpu, heap_size, ret = 0; + + /* + * Allow storage to have sufficient space for an iterator for each + * possibly nested cgroup plus an iterator for events with no cgroup. + */ + for (heap_size = 1; css; css = css->parent) + heap_size++; + + for_each_possible_cpu(cpu) { + cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu); + if (heap_size <= cpuctx->heap_size) + continue; + + storage = kmalloc_node(heap_size * sizeof(struct perf_event *), + GFP_KERNEL, cpu_to_node(cpu)); + if (!storage) { + ret = -ENOMEM; + break; + } + + raw_spin_lock_irq(&cpuctx->ctx.lock); + if (cpuctx->heap_size < heap_size) { + swap(cpuctx->heap, storage); + if (storage == cpuctx->heap_default) + storage = NULL; + cpuctx->heap_size = heap_size; + } + raw_spin_unlock_irq(&cpuctx->ctx.lock); + + kfree(storage); + } + + return ret; +} + static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) @@ -910,6 +952,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, goto out; } + ret = perf_cgroup_ensure_storage(event, css); + if (ret) + goto out; + cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -1531,6 +1577,30 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right) if (left->cpu > right->cpu) return false; +#ifdef CONFIG_CGROUP_PERF + if (left->cgrp != right->cgrp) { 
+ if (!left->cgrp || !left->cgrp->css.cgroup) { + /* + * Left has no cgroup but right does, no cgroups come + * first. + */ + return true; + } + if (!right->cgrp || !right->cgrp->css.cgroup) { + /* + * Right has no cgroup but left does, no cgroups come + * first. + */ + return false; + } + /* Two dissimilar cgroups, order by id. */ + if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id) + return true; + + return false; + } +#endif + if (left->group_index < right->group_index) return true; if (left->group_index > right->group_index) @@ -1610,25 +1680,48 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) } /* - * Get the leftmost event in the @cpu subtree. + * Get the leftmost event in the cpu/cgroup subtree. */ static struct perf_event * -perf_event_groups_first(struct perf_event_groups *groups, int cpu) +perf_event_groups_first(struct perf_event_groups *groups, int cpu, + struct cgroup *cgrp) { struct perf_event *node_event = NULL, *match = NULL; struct rb_node *node = groups->tree.rb_node; +#ifdef CONFIG_CGROUP_PERF + u64 node_cgrp_id, cgrp_id = 0; + + if (cgrp) + cgrp_id = cgrp->kn->id; +#endif while (node) { node_event = container_of(node, struct perf_event, group_node); if (cpu < node_event->cpu) { node = node->rb_left; - } else if (cpu > node_event->cpu) { + continue; + } + if (cpu > node_event->cpu) { node = node->rb_right; - } else { - match = node_event; + continue; + } +#ifdef CONFIG_CGROUP_PERF + node_cgrp_id = 0; + if (node_event->cgrp && node_event->cgrp->css.cgroup) + node_cgrp_id = node_event->cgrp->css.cgroup->kn->id; + + if (cgrp_id < node_cgrp_id) { node = node->rb_left; + continue; + } + if (cgrp_id > node_cgrp_id) { + node = node->rb_right; + continue; } +#endif + match = node_event; + node = node->rb_left; } return match; @@ -1641,12 +1734,26 @@ static struct perf_event * perf_event_groups_next(struct perf_event *event) { struct perf_event *next; +#ifdef CONFIG_CGROUP_PERF + u64 curr_cgrp_id = 0; + 
u64 next_cgrp_id = 0; +#endif next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); - if (next && next->cpu == event->cpu) - return next; + if (next == NULL || next->cpu != event->cpu) + return NULL; - return NULL; +#ifdef CONFIG_CGROUP_PERF + if (event->cgrp && event->cgrp->css.cgroup) + curr_cgrp_id = event->cgrp->css.cgroup->kn->id; + + if (next->cgrp && next->cgrp->css.cgroup) + next_cgrp_id = next->cgrp->css.cgroup->kn->id; + + if (curr_cgrp_id != next_cgrp_id) + return NULL; +#endif + return next; } /* @@ -1986,6 +2093,12 @@ static int perf_get_aux_event(struct perf_event *event, return 1; } +static inline struct list_head *get_event_list(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; +} + static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; @@ -2028,12 +2141,8 @@ static void perf_group_detach(struct perf_event *event) if (!RB_EMPTY_NODE(&event->group_node)) { add_event_to_groups(sibling, event->ctx); - if (sibling->state == PERF_EVENT_STATE_ACTIVE) { - struct list_head *list = sibling->attr.pinned ? 
- &ctx->pinned_active : &ctx->flexible_active; - - list_add_tail(&sibling->active_list, list); - } + if (sibling->state == PERF_EVENT_STATE_ACTIVE) + list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); @@ -2182,6 +2291,7 @@ __perf_remove_from_context(struct perf_event *event, if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; + ctx->rotate_necessary = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; @@ -2350,6 +2460,8 @@ event_sched_in(struct perf_event *event, { int ret = 0; + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) @@ -3077,12 +3189,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; - /* - * If we had been multiplexing, no rotations are necessary, now no events - * are active. - */ - ctx->rotate_necessary = 0; - perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) @@ -3092,6 +3198,13 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (is_active & EVENT_FLEXIBLE) { list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) group_sched_out(event, cpuctx, ctx); + + /* + * Since we cleared EVENT_FLEXIBLE, also clear + * rotate_necessary, it will be reset by + * ctx_flexible_sched_in() when needed. 
+ */ + ctx->rotate_necessary = 0; } perf_pmu_enable(ctx->pmu); } @@ -3388,71 +3501,103 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } -static int visit_groups_merge(struct perf_event_groups *groups, int cpu, - int (*func)(struct perf_event *, void *), void *data) +static bool perf_less_group_idx(const void *l, const void *r) { - struct perf_event **evt, *evt1, *evt2; - int ret; - - evt1 = perf_event_groups_first(groups, -1); - evt2 = perf_event_groups_first(groups, cpu); - - while (evt1 || evt2) { - if (evt1 && evt2) { - if (evt1->group_index < evt2->group_index) - evt = &evt1; - else - evt = &evt2; - } else if (evt1) { - evt = &evt1; - } else { - evt = &evt2; - } + const struct perf_event *le = l, *re = r; - ret = func(*evt, data); - if (ret) - return ret; + return le->group_index < re->group_index; +} - *evt = perf_event_groups_next(*evt); - } +static void swap_ptr(void *l, void *r) +{ + void **lp = l, **rp = r; - return 0; + swap(*lp, *rp); } -struct sched_in_data { - struct perf_event_context *ctx; - struct perf_cpu_context *cpuctx; - int can_add_hw; +static const struct min_heap_callbacks perf_min_heap = { + .elem_size = sizeof(struct perf_event *), + .less = perf_less_group_idx, + .swp = swap_ptr, }; -static int pinned_sched_in(struct perf_event *event, void *data) +static void __heap_add(struct min_heap *heap, struct perf_event *event) { - struct sched_in_data *sid = data; + struct perf_event **itrs = heap->data; - if (event->state <= PERF_EVENT_STATE_OFF) - return 0; + if (event) { + itrs[heap->nr] = event; + heap->nr++; + } +} - if (!event_filter_match(event)) - return 0; +static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, + struct perf_event_groups *groups, int cpu, + int (*func)(struct perf_event *, void *), + void *data) +{ +#ifdef CONFIG_CGROUP_PERF + struct cgroup_subsys_state *css = NULL; +#endif + /* Space for per CPU and/or any CPU event iterators. 
*/ + struct perf_event *itrs[2]; + struct min_heap event_heap; + struct perf_event **evt; + int ret; + + if (cpuctx) { + event_heap = (struct min_heap){ + .data = cpuctx->heap, + .nr = 0, + .size = cpuctx->heap_size, + }; + + lockdep_assert_held(&cpuctx->ctx.lock); - if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { - if (!group_sched_in(event, sid->cpuctx, sid->ctx)) - list_add_tail(&event->active_list, &sid->ctx->pinned_active); +#ifdef CONFIG_CGROUP_PERF + if (cpuctx->cgrp) + css = &cpuctx->cgrp->css; +#endif + } else { + event_heap = (struct min_heap){ + .data = itrs, + .nr = 0, + .size = ARRAY_SIZE(itrs), + }; + /* Events not within a CPU context may be on any CPU. */ + __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL)); } + evt = event_heap.data; - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL)); + +#ifdef CONFIG_CGROUP_PERF + for (; css; css = css->parent) + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup)); +#endif + + min_heapify_all(&event_heap, &perf_min_heap); + + while (event_heap.nr) { + ret = func(*evt, data); + if (ret) + return ret; + + *evt = perf_event_groups_next(*evt); + if (*evt) + min_heapify(&event_heap, 0, &perf_min_heap); + else + min_heap_pop(&event_heap, &perf_min_heap); + } return 0; } -static int flexible_sched_in(struct perf_event *event, void *data) +static int merge_sched_in(struct perf_event *event, void *data) { - struct sched_in_data *sid = data; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -3460,14 +3605,17 @@ static int flexible_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if 
(group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { - int ret = group_sched_in(event, sid->cpuctx, sid->ctx); - if (ret) { - sid->can_add_hw = 0; - sid->ctx->rotate_necessary = 1; - return 0; - } - list_add_tail(&event->active_list, &sid->ctx->flexible_active); + if (group_can_go_on(event, cpuctx, *can_add_hw)) { + if (!group_sched_in(event, cpuctx, ctx)) + list_add_tail(&event->active_list, get_event_list(event)); + } + + if (event->state == PERF_EVENT_STATE_INACTIVE) { + if (event->attr.pinned) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + *can_add_hw = 0; + ctx->rotate_necessary = 1; } return 0; @@ -3477,30 +3625,28 @@ static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct sched_in_data sid = { - .ctx = ctx, - .cpuctx = cpuctx, - .can_add_hw = 1, - }; + int can_add_hw = 1; - visit_groups_merge(&ctx->pinned_groups, + if (ctx != &cpuctx->ctx) + cpuctx = NULL; + + visit_groups_merge(cpuctx, &ctx->pinned_groups, smp_processor_id(), - pinned_sched_in, &sid); + merge_sched_in, &can_add_hw); } static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct sched_in_data sid = { - .ctx = ctx, - .cpuctx = cpuctx, - .can_add_hw = 1, - }; + int can_add_hw = 1; - visit_groups_merge(&ctx->flexible_groups, + if (ctx != &cpuctx->ctx) + cpuctx = NULL; + + visit_groups_merge(cpuctx, &ctx->flexible_groups, smp_processor_id(), - flexible_sched_in, &sid); + merge_sched_in, &can_add_hw); } static void @@ -3841,6 +3987,12 @@ ctx_event_to_rotate(struct perf_event_context *ctx) typeof(*event), group_node); } + /* + * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() + * finds there are unschedulable events, it will set it again. 
+ */ + ctx->rotate_necessary = 0; + return event; } @@ -6555,6 +6707,11 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } +static inline bool perf_sample_save_hw_index(struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -6643,6 +6800,8 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); + if (perf_sample_save_hw_index(event)) + perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* @@ -6836,6 +6995,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->br_stack) { + if (perf_sample_save_hw_index(event)) + size += sizeof(u64); + size += data->br_stack->nr * sizeof(struct perf_branch_entry); } @@ -10349,6 +10511,9 @@ skip_type: cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); __perf_mux_hrtimer_init(cpuctx, cpu); + + cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); + cpuctx->heap = cpuctx->heap_default; } got_cpu_context: @@ -10794,12 +10959,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; - if (cgroup_fd != -1) { - err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); - if (err) - goto err_ns; - } - pmu = perf_init_event(event); if (IS_ERR(pmu)) { err = PTR_ERR(pmu); @@ -10821,6 +10980,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_pmu; } + if (cgroup_fd != -1) { + err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); + if (err) + goto err_pmu; + } + err = exclusive_event_init(event); if (err) goto err_pmu; @@ -10881,12 +11046,12 @@ 
err_per_task: exclusive_event_destroy(event); err_pmu: + if (is_cgroup_event(event)) + perf_detach_cgroup(event); if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: - if (is_cgroup_event(event)) - perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); if (event->hw.target) diff --git a/kernel/exit.c b/kernel/exit.c index 2833ffb0c211..d70d47159640 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -103,17 +103,8 @@ static void __exit_signal(struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); - if (group_dead) { + if (group_dead) posix_cpu_timers_exit_group(tsk); - } else { - /* - * This can only happen if the caller is de_thread(). - * FIXME: this is the temporary hack, we should teach - * posix-cpu-timers to handle this case correctly. - */ - if (unlikely(has_group_leader_pid(tsk))) - posix_cpu_timers_exit_group(tsk); - } #endif if (group_dead) { @@ -258,6 +249,7 @@ void rcuwait_wake_up(struct rcuwait *w) wake_up_process(task); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX @@ -619,8 +611,8 @@ static void forget_original_parent(struct task_struct *father, reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { - t->real_parent = reaper; - BUG_ON((!t->ptrace) != (t->parent == father)); + RCU_INIT_POINTER(t->real_parent, reaper); + BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) diff --git a/kernel/fork.c b/kernel/fork.c index 60a1295f4384..d90af13431c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -397,8 +397,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 
1024)); + mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } @@ -1508,7 +1508,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - rcu_assign_pointer(tsk->sighand, sig); + RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; diff --git a/kernel/futex.c b/kernel/futex.c index 0cf84c8664f2..b59532862bc0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -135,8 +135,7 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers for both - * shared and private futexes in get_futex_key_refs(). + * to futex and the waiters read (see hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -331,17 +330,6 @@ static void compat_exit_robust_list(struct task_struct *curr); static inline void compat_exit_robust_list(struct task_struct *curr) { } #endif -static inline void futex_get_mm(union futex_key *key) -{ - mmgrab(key->private.mm); - /* - * Ensure futex_get_mm() implies a full barrier such that - * get_futex_key() implies a full barrier. This is relied upon - * as smp_mb(); (B), see the ordering comment above. - */ - smp_mb__after_atomic(); -} - /* * Reflects a new waiter being added to the waitqueue. */ @@ -370,6 +358,10 @@ static inline void hb_waiters_dec(struct futex_hash_bucket *hb) static inline int hb_waiters_pending(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP + /* + * Full barrier (B), see the ordering comment above. 
+ */ + smp_mb(); return atomic_read(&hb->waiters); #else return 1; @@ -385,9 +377,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { - u32 hash = jhash2((u32*)&key->both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); + return &futex_queues[hash & (futex_hashsize - 1)]; } @@ -407,70 +399,6 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } -/* - * Take a reference to the resource addressed by a key. - * Can be called while holding spinlocks. - * - */ -static void get_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) - return; - - /* - * On MMU less systems futexes are always "private" as there is no per - * process address space. We need the smp wmb nevertheless - yes, - * arch/blackfin has MMU less SMP ... - */ - if (!IS_ENABLED(CONFIG_MMU)) { - smp_mb(); /* explicit smp_mb(); (B) */ - return; - } - - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies smp_mb(); (B) */ - break; - case FUT_OFF_MMSHARED: - futex_get_mm(key); /* implies smp_mb(); (B) */ - break; - default: - /* - * Private futexes do not hold reference on an inode or - * mm, therefore the only purpose of calling get_futex_key_refs - * is because we need the barrier for the lockless waiter check. - */ - smp_mb(); /* explicit smp_mb(); (B) */ - } -} - -/* - * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. This is - * a no-op for private futexes, see comment in the get - * counterpart. 
- */ -static void drop_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) { - /* If we're here then we tried to put a key we failed to get */ - WARN_ON_ONCE(1); - return; - } - - if (!IS_ENABLED(CONFIG_MMU)) - return; - - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - iput(key->shared.inode); - break; - case FUT_OFF_MMSHARED: - mmdrop(key->private.mm); - break; - } -} - enum futex_access { FUTEX_READ, FUTEX_WRITE @@ -505,6 +433,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, return timeout; } +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? 
*/ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -517,9 +485,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, * * The key words are stored in @key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -556,7 +530,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } @@ -659,8 +632,6 @@ again: key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode; @@ -692,36 +663,8 @@ again: goto again; } - /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. 
In such a case, just retry. - * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } @@ -733,7 +676,6 @@ out: static inline void put_futex_key(union futex_key *key) { - drop_futex_key_refs(key); } /** @@ -1723,10 +1665,9 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) oparg = 1 << oparg; } - if (!access_ok(uaddr, sizeof(u32))) - return -EFAULT; - + pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); + pagefault_enable(); if (ret) return ret; @@ -1868,7 +1809,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; } - get_futex_key_refs(key2); q->key = *key2; } @@ -1890,7 +1830,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - get_futex_key_refs(key); q->key = *key; __unqueue_futex(q); @@ -2001,7 +1940,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - int drop_count = 0, task_count = 0, ret; + int task_count = 0, ret; struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; @@ -2122,7 +2061,6 @@ retry_private: */ if (ret > 0) { WARN_ON(pi_state); - drop_count++; 
task_count++; /* * If we acquired the lock, then the user space value @@ -2242,7 +2180,6 @@ retry_private: * doing so. */ requeue_pi_wake_futex(this, &key2, hb2); - drop_count++; continue; } else if (ret) { /* @@ -2263,7 +2200,6 @@ retry_private: } } requeue_futex(this, hb1, hb2, &key2); - drop_count++; } /* @@ -2278,15 +2214,6 @@ out_unlock: wake_up_q(&wake_q); hb_waiters_dec(hb2); - /* - * drop_futex_key_refs() must be called outside the spinlocks. During - * the requeue we moved futex_q's from the hash bucket at key1 to the - * one at key2 and updated their key pointer. We no longer need to - * hold the references to key1. - */ - while (--drop_count >= 0) - drop_futex_key_refs(&key1); - out_put_keys: put_futex_key(&key2); out_put_key1: @@ -2416,7 +2343,6 @@ retry: ret = 1; } - drop_futex_key_refs(&q->key); return ret; } diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index f92d9a687372..20d501af4f2e 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -43,6 +43,10 @@ config GENERIC_IRQ_MIGRATION config AUTO_IRQ_AFFINITY bool +# Interrupt injection mechanism +config GENERIC_IRQ_INJECTION + bool + # Tasklet based software resend for pending interrupts on enable_irq() config HARDIRQS_SW_RESEND bool @@ -127,6 +131,7 @@ config SPARSE_IRQ config GENERIC_IRQ_DEBUGFS bool "Expose irq internals in debugfs" depends on DEBUG_FS + select GENERIC_IRQ_INJECTION default n ---help--- diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b3fa2d87d2f3..41e7e37a0928 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -278,7 +278,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) } } if (resend) - check_irq_resend(desc); + check_irq_resend(desc, false); return ret; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index a949bd39e343..4f9f844074db 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -190,33 +190,7 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf, return -EFAULT; if 
(!strncmp(buf, "trigger", size)) { - unsigned long flags; - int err; - - /* Try the HW interface first */ - err = irq_set_irqchip_state(irq_desc_get_irq(desc), - IRQCHIP_STATE_PENDING, true); - if (!err) - return count; - - /* - * Otherwise, try to inject via the resend interface, - * which may or may not succeed. - */ - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - - if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) { - /* Can't do level nor NMIs, sorry */ - err = -EINVAL; - } else { - desc->istate |= IRQS_PENDING; - check_irq_resend(desc); - err = 0; - } - - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + int err = irq_inject_interrupt(irq_desc_get_irq(desc)); return err ? err : count; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a4ace611f47f..a8e14c80b405 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -145,6 +145,13 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags for_each_action_of_desc(desc, action) { irqreturn_t res; + /* + * If this IRQ would be threaded under force_irqthreads, mark it so. 
+ */ + if (irq_settings_can_thread(desc) && + !(action->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))) + lockdep_hardirq_threaded(); + trace_irq_handler_entry(irq, action); res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c9d8eb7f5c02..7db284b10ac9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -108,7 +108,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ -void check_irq_resend(struct irq_desc *desc); +int check_irq_resend(struct irq_desc *desc, bool inject); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); @@ -425,6 +425,10 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return desc->pending_mask; } +static inline bool handle_enforce_irqctx(struct irq_data *data) +{ + return irqd_is_handle_enforce_irqctx(data); +} bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) @@ -451,6 +455,10 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } +static inline bool handle_enforce_irqctx(struct irq_data *data) +{ + return false; +} #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 98a5f10d1900..1a7723604399 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -638,9 +638,15 @@ void irq_init_desc(unsigned int irq) int generic_handle_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_data *data; if (!desc) return -EINVAL; + + data = irq_desc_get_irq_data(desc); + if (WARN_ON_ONCE(!in_irq() && 
handle_enforce_irqctx(data))) + return -EPERM; + generic_handle_irq_desc(desc); return 0; } diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 7527e5ef6fe5..35b8d97c3a1d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -46,11 +46,11 @@ const struct fwnode_operations irqchip_fwnode_ops; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** - * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for + * __irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain * @type: Type of irqchip_fwnode. See linux/irqdomain.h - * @name: Optional user provided domain name * @id: Optional user provided id if name != NULL + * @name: Optional user provided domain name * @pa: Optional user-provided physical address * * Allocate a struct irqchip_fwid, and return a poiner to the embedded @@ -1310,6 +1310,11 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { + if (!domain->ops->alloc) { + pr_debug("domain->ops->alloc() is NULL\n"); + return -ENOSYS; + } + return domain->ops->alloc(domain, irq_base, nr_irqs, arg); } @@ -1347,11 +1352,6 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, return -EINVAL; } - if (!domain->ops->alloc) { - pr_debug("domain->ops->alloc() is NULL\n"); - return -ENOSYS; - } - if (realloc && irq_base >= 0) { virq = irq_base; } else { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7eee98c38f25..fe40c658f86f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,7 +323,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); - schedule_work(&desc->affinity_notify->work); + if (!schedule_work(&desc->affinity_notify->work)) { + /* Work was already scheduled, drop our extra ref */ + kref_put(&desc->affinity_notify->kref, + desc->affinity_notify->release); + } } irqd_set(data, 
IRQD_AFFINITY_SET); @@ -423,7 +427,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { - cancel_work_sync(&old_notify->work); + if (cancel_work_sync(&old_notify->work)) { + /* Pending work had a ref, put that one too */ + kref_put(&old_notify->kref, old_notify->release); + } kref_put(&old_notify->kref, old_notify->release); } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 98c04ca5fa43..27634f4022d0 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -47,6 +47,43 @@ static void resend_irqs(unsigned long arg) /* Tasklet to handle resend: */ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); +static int irq_sw_resend(struct irq_desc *desc) +{ + unsigned int irq = irq_desc_get_irq(desc); + + /* + * Validate whether this interrupt can be safely injected from + * non interrupt context + */ + if (handle_enforce_irqctx(&desc->irq_data)) + return -EINVAL; + + /* + * If the interrupt is running in the thread context of the parent + * irq we need to be careful, because we cannot trigger it + * directly. + */ + if (irq_settings_is_nested_thread(desc)) { + /* + * If the parent_irq is valid, we retrigger the parent, + * otherwise we do nothing. + */ + if (!desc->parent_irq) + return -EINVAL; + irq = desc->parent_irq; + } + + /* Set it pending and activate the softirq: */ + set_bit(irq, irqs_resend); + tasklet_schedule(&resend_tasklet); + return 0; +} + +#else +static int irq_sw_resend(struct irq_desc *desc) +{ + return -EINVAL; +} #endif /* @@ -54,49 +91,83 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); * * Is called with interrupts disabled and desc->lock held. */ -void check_irq_resend(struct irq_desc *desc) +int check_irq_resend(struct irq_desc *desc, bool inject) { + int err = 0; + /* - * We do not resend level type interrupts. Level type - * interrupts are resent by hardware when they are still - * active. 
Clear the pending bit so suspend/resume does not - * get confused. + * We do not resend level type interrupts. Level type interrupts + * are resent by hardware when they are still active. Clear the + * pending bit so suspend/resume does not get confused. */ if (irq_settings_is_level(desc)) { desc->istate &= ~IRQS_PENDING; - return; + return -EINVAL; } + if (desc->istate & IRQS_REPLAY) - return; - if (desc->istate & IRQS_PENDING) { - desc->istate &= ~IRQS_PENDING; + return -EBUSY; + + if (!(desc->istate & IRQS_PENDING) && !inject) + return 0; + + desc->istate &= ~IRQS_PENDING; + + if (!desc->irq_data.chip->irq_retrigger || + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) + err = irq_sw_resend(desc); + + /* If the retrigger was successful, mark it with the REPLAY bit */ + if (!err) desc->istate |= IRQS_REPLAY; + return err; +} - if (!desc->irq_data.chip->irq_retrigger || - !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { -#ifdef CONFIG_HARDIRQS_SW_RESEND - unsigned int irq = irq_desc_get_irq(desc); - - /* - * If the interrupt is running in the thread - * context of the parent irq we need to be - * careful, because we cannot trigger it - * directly. - */ - if (irq_settings_is_nested_thread(desc)) { - /* - * If the parent_irq is valid, we - * retrigger the parent, otherwise we - * do nothing. - */ - if (!desc->parent_irq) - return; - irq = desc->parent_irq; - } - /* Set it pending and activate the softirq: */ - set_bit(irq, irqs_resend); - tasklet_schedule(&resend_tasklet); -#endif - } - } +#ifdef CONFIG_GENERIC_IRQ_INJECTION +/** + * irq_inject_interrupt - Inject an interrupt for testing/error injection + * @irq: The interrupt number + * + * This function must only be used for debug and testing purposes! + * + * Especially on x86 this can cause a premature completion of an interrupt + * affinity change causing the interrupt line to become stale. Very + * unlikely, but possible. 
+ * + * The injection can fail for various reasons: + * - Interrupt is not activated + * - Interrupt is NMI type or currently replaying + * - Interrupt is level type + * - Interrupt does not support hardware retrigger and software resend is + * either not enabled or not possible for the interrupt. + */ +int irq_inject_interrupt(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + int err; + + /* Try the state injection hardware interface first */ + if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) + return 0; + + /* That failed, try via the resend mechanism */ + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return -EINVAL; + + /* + * Only try to inject when the interrupt is: + * - not NMI type + * - activated + */ + if ((desc->istate & IRQS_NMI) || !irqd_is_activated(&desc->irq_data)) + err = -EINVAL; + else + err = check_irq_resend(desc, true); + + irq_put_desc_busunlock(desc, flags); + return err; } +EXPORT_SYMBOL_GPL(irq_inject_interrupt); +#endif diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 828cc30774bc..48b5d1b6af4d 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -153,7 +153,9 @@ static void irq_work_run_list(struct llist_head *list) */ flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + lockdep_irq_work_enter(work); work->func(work); + lockdep_irq_work_exit(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. diff --git a/kernel/kthread.c b/kernel/kthread.c index b262f47046ca..bfbfa481be3a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -199,8 +199,15 @@ static void __kthread_parkme(struct kthread *self) if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; + /* + * Thread is going to call schedule(), do not preempt it, + * or the caller of kthread_park() may spend more time in + * wait_task_inactive(). 
+ */ + preempt_disable(); complete(&self->parked); - schedule(); + schedule_preempt_disabled(); + preempt_enable(); } __set_current_state(TASK_RUNNING); } @@ -245,8 +252,14 @@ static int kthread(void *_create) /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; + /* + * Thread is going to call schedule(), do not preempt it, + * or the creator may spend more time in wait_task_inactive(). + */ + preempt_disable(); complete(done); - schedule(); + schedule_preempt_disabled(); + preempt_enable(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 32406ef0d6a2..1511690e4de7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -84,12 +84,39 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... */ -static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t __lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static struct task_struct *__owner; + +static inline void lockdep_lock(void) +{ + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + arch_spin_lock(&__lock); + __owner = current; + current->lockdep_recursion++; +} + +static inline void lockdep_unlock(void) +{ + if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) + return; + + current->lockdep_recursion--; + __owner = NULL; + arch_spin_unlock(&__lock); +} + +static inline bool lockdep_assert_locked(void) +{ + return DEBUG_LOCKS_WARN_ON(__owner != current); +} + static struct task_struct *lockdep_selftest_task_struct; + static int graph_lock(void) { - arch_spin_lock(&lockdep_lock); + lockdep_lock(); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -97,27 +124,15 @@ static int graph_lock(void) * dropped already) */ if (!debug_locks) { - 
arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); return 0; } - /* prevent any recursions within lockdep from causing deadlocks */ - current->lockdep_recursion++; return 1; } -static inline int graph_unlock(void) +static inline void graph_unlock(void) { - if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { - /* - * The lockdep graph lock isn't locked while we expect it to - * be, we're confused now, bye! - */ - return DEBUG_LOCKS_WARN_ON(1); - } - - current->lockdep_recursion--; - arch_spin_unlock(&lockdep_lock); - return 0; + lockdep_unlock(); } /* @@ -128,7 +143,7 @@ static inline int debug_locks_off_graph_unlock(void) { int ret = debug_locks_off(); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); return ret; } @@ -147,6 +162,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); #define KEYHASH_SIZE (1UL << KEYHASH_BITS) static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; +unsigned long nr_zapped_classes; #ifndef CONFIG_DEBUG_LOCKDEP static #endif @@ -377,18 +393,31 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } +/* + * Split the recrursion counter in two to readily detect 'off' vs recursion. 
+ */ +#define LOCKDEP_RECURSION_BITS 16 +#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) +#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) + void lockdep_off(void) { - current->lockdep_recursion++; + current->lockdep_recursion += LOCKDEP_OFF; } EXPORT_SYMBOL(lockdep_off); void lockdep_on(void) { - current->lockdep_recursion--; + current->lockdep_recursion -= LOCKDEP_OFF; } EXPORT_SYMBOL(lockdep_on); +static inline void lockdep_recursion_finish(void) +{ + if (WARN_ON_ONCE(--current->lockdep_recursion)) + current->lockdep_recursion = 0; +} + void lockdep_set_selftest_task(struct task_struct *task) { lockdep_selftest_task_struct = task; @@ -575,6 +604,7 @@ static const char *usage_str[] = #include "lockdep_states.h" #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", + [LOCK_USAGE_STATES] = "IN-NMI", }; #endif @@ -653,7 +683,9 @@ static void print_lock_name(struct lock_class *class) printk(KERN_CONT " ("); __print_lock_name(class); - printk(KERN_CONT "){%s}", usage); + printk(KERN_CONT "){%s}-{%hd:%hd}", usage, + class->wait_type_outer ?: class->wait_type_inner, + class->wait_type_inner); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -787,6 +819,7 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } +/* used from NMI context -- must be lockless */ static inline struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { @@ -1070,13 +1103,15 @@ static inline void check_data_structures(void) { } #endif /* CONFIG_DEBUG_LOCKDEP */ +static void init_chain_block_buckets(void); + /* * Initialize the lock_classes[] array elements, the free_lock_classes list * and also the delayed_free structure. 
*/ static void init_data_structures_once(void) { - static bool ds_initialized, rcu_head_initialized; + static bool __read_mostly ds_initialized, rcu_head_initialized; int i; if (likely(rcu_head_initialized)) @@ -1100,6 +1135,7 @@ static void init_data_structures_once(void) INIT_LIST_HEAD(&lock_classes[i].locks_after); INIT_LIST_HEAD(&lock_classes[i].locks_before); } + init_chain_block_buckets(); } static inline struct hlist_head *keyhashentry(const struct lock_class_key *key) @@ -1230,6 +1266,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) WARN_ON_ONCE(!list_empty(&class->locks_before)); WARN_ON_ONCE(!list_empty(&class->locks_after)); class->name_version = count_matching_names(class); + class->wait_type_inner = lock->wait_type_inner; + class->wait_type_outer = lock->wait_type_outer; /* * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: @@ -1469,6 +1507,8 @@ static int __bfs(struct lock_list *source_entry, struct circular_queue *cq = &lock_cq; int ret = 1; + lockdep_assert_locked(); + if (match(source_entry, data)) { *target_entry = source_entry; ret = 0; @@ -1491,8 +1531,6 @@ static int __bfs(struct lock_list *source_entry, head = get_dep_list(lock, offset); - DEBUG_LOCKS_WARN_ON(!irqs_disabled()); - list_for_each_entry_rcu(entry, head, entry) { if (!lock_accessed(entry)) { unsigned int cq_depth; @@ -1719,9 +1757,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) this.class = class; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); ret = __lockdep_count_forward_deps(&this); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); return ret; @@ -1746,9 +1784,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) this.class = class; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); ret = __lockdep_count_backward_deps(&this); - arch_spin_unlock(&lockdep_lock); + 
lockdep_unlock(); raw_local_irq_restore(flags); return ret; @@ -2298,18 +2336,6 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, return 0; } -static void inc_chains(void) -{ - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } -} - #else static inline int check_irq_usage(struct task_struct *curr, @@ -2317,13 +2343,27 @@ static inline int check_irq_usage(struct task_struct *curr, { return 1; } +#endif /* CONFIG_TRACE_IRQFLAGS */ -static inline void inc_chains(void) +static void inc_chains(int irq_context) { - nr_process_chains++; + if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) + nr_hardirq_chains++; + else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT) + nr_softirq_chains++; + else + nr_process_chains++; } -#endif /* CONFIG_TRACE_IRQFLAGS */ +static void dec_chains(int irq_context) +{ + if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) + nr_hardirq_chains--; + else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT) + nr_softirq_chains--; + else + nr_process_chains--; +} static void print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) @@ -2622,8 +2662,235 @@ out_bug: struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS); -int nr_chain_hlocks; static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; +unsigned long nr_zapped_lock_chains; +unsigned int nr_free_chain_hlocks; /* Free chain_hlocks in buckets */ +unsigned int nr_lost_chain_hlocks; /* Lost chain_hlocks */ +unsigned int nr_large_chain_blocks; /* size > MAX_CHAIN_BUCKETS */ + +/* + * The first 2 chain_hlocks entries in the chain block in the bucket + * list contains the following meta data: + * + * entry[0]: + * Bit 15 - always set to 1 (it is not a class index) + * Bits 0-14 - upper 15 bits of the next block index + * entry[1] - lower 16 bits of next block index + * + * A next block index of all 1 bits means it is the end 
of the list. + * + * On the unsized bucket (bucket-0), the 3rd and 4th entries contain + * the chain block size: + * + * entry[2] - upper 16 bits of the chain block size + * entry[3] - lower 16 bits of the chain block size + */ +#define MAX_CHAIN_BUCKETS 16 +#define CHAIN_BLK_FLAG (1U << 15) +#define CHAIN_BLK_LIST_END 0xFFFFU + +static int chain_block_buckets[MAX_CHAIN_BUCKETS]; + +static inline int size_to_bucket(int size) +{ + if (size > MAX_CHAIN_BUCKETS) + return 0; + + return size - 1; +} + +/* + * Iterate all the chain blocks in a bucket. + */ +#define for_each_chain_block(bucket, prev, curr) \ + for ((prev) = -1, (curr) = chain_block_buckets[bucket]; \ + (curr) >= 0; \ + (prev) = (curr), (curr) = chain_block_next(curr)) + +/* + * next block or -1 + */ +static inline int chain_block_next(int offset) +{ + int next = chain_hlocks[offset]; + + WARN_ON_ONCE(!(next & CHAIN_BLK_FLAG)); + + if (next == CHAIN_BLK_LIST_END) + return -1; + + next &= ~CHAIN_BLK_FLAG; + next <<= 16; + next |= chain_hlocks[offset + 1]; + + return next; +} + +/* + * bucket-0 only + */ +static inline int chain_block_size(int offset) +{ + return (chain_hlocks[offset + 2] << 16) | chain_hlocks[offset + 3]; +} + +static inline void init_chain_block(int offset, int next, int bucket, int size) +{ + chain_hlocks[offset] = (next >> 16) | CHAIN_BLK_FLAG; + chain_hlocks[offset + 1] = (u16)next; + + if (size && !bucket) { + chain_hlocks[offset + 2] = size >> 16; + chain_hlocks[offset + 3] = (u16)size; + } +} + +static inline void add_chain_block(int offset, int size) +{ + int bucket = size_to_bucket(size); + int next = chain_block_buckets[bucket]; + int prev, curr; + + if (unlikely(size < 2)) { + /* + * We can't store single entries on the freelist. Leak them. + * + * One possible way out would be to uniquely mark them, other + * than with CHAIN_BLK_FLAG, such that we can recover them when + * the block before it is re-added. 
+ */ + if (size) + nr_lost_chain_hlocks++; + return; + } + + nr_free_chain_hlocks += size; + if (!bucket) { + nr_large_chain_blocks++; + + /* + * Variable sized, sort large to small. + */ + for_each_chain_block(0, prev, curr) { + if (size >= chain_block_size(curr)) + break; + } + init_chain_block(offset, curr, 0, size); + if (prev < 0) + chain_block_buckets[0] = offset; + else + init_chain_block(prev, offset, 0, 0); + return; + } + /* + * Fixed size, add to head. + */ + init_chain_block(offset, next, bucket, size); + chain_block_buckets[bucket] = offset; +} + +/* + * Only the first block in the list can be deleted. + * + * For the variable size bucket[0], the first block (the largest one) is + * returned, broken up and put back into the pool. So if a chain block of + * length > MAX_CHAIN_BUCKETS is ever used and zapped, it will just be + * queued up after the primordial chain block and never be used until the + * hlock entries in the primordial chain block is almost used up. That + * causes fragmentation and reduce allocation efficiency. That can be + * monitored by looking at the "large chain blocks" number in lockdep_stats. + */ +static inline void del_chain_block(int bucket, int size, int next) +{ + nr_free_chain_hlocks -= size; + chain_block_buckets[bucket] = next; + + if (!bucket) + nr_large_chain_blocks--; +} + +static void init_chain_block_buckets(void) +{ + int i; + + for (i = 0; i < MAX_CHAIN_BUCKETS; i++) + chain_block_buckets[i] = -1; + + add_chain_block(0, ARRAY_SIZE(chain_hlocks)); +} + +/* + * Return offset of a chain block of the right size or -1 if not found. + * + * Fairly simple worst-fit allocator with the addition of a number of size + * specific free lists. + */ +static int alloc_chain_hlocks(int req) +{ + int bucket, curr, size; + + /* + * We rely on the MSB to act as an escape bit to denote freelist + * pointers. Make sure this bit isn't set in 'normal' class_idx usage. 
+ */ + BUILD_BUG_ON((MAX_LOCKDEP_KEYS-1) & CHAIN_BLK_FLAG); + + init_data_structures_once(); + + if (nr_free_chain_hlocks < req) + return -1; + + /* + * We require a minimum of 2 (u16) entries to encode a freelist + * 'pointer'. + */ + req = max(req, 2); + bucket = size_to_bucket(req); + curr = chain_block_buckets[bucket]; + + if (bucket) { + if (curr >= 0) { + del_chain_block(bucket, req, chain_block_next(curr)); + return curr; + } + /* Try bucket 0 */ + curr = chain_block_buckets[0]; + } + + /* + * The variable sized freelist is sorted by size; the first entry is + * the largest. Use it if it fits. + */ + if (curr >= 0) { + size = chain_block_size(curr); + if (likely(size >= req)) { + del_chain_block(0, size, chain_block_next(curr)); + add_chain_block(curr + req, size - req); + return curr; + } + } + + /* + * Last resort, split a block in a larger sized bucket. + */ + for (size = MAX_CHAIN_BUCKETS; size > req; size--) { + bucket = size_to_bucket(size); + curr = chain_block_buckets[bucket]; + if (curr < 0) + continue; + + del_chain_block(bucket, size, chain_block_next(curr)); + add_chain_block(curr + req, size - req); + return curr; + } + + return -1; +} + +static inline void free_chain_hlocks(int base, int size) +{ + add_chain_block(base, max(size, 2)); +} struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { @@ -2803,7 +3070,7 @@ static inline int add_chain_cache(struct task_struct *curr, * disabled to make this an IRQ-safe lock.. for recursion reasons * lockdep won't complain about its own locking errors. 
*/ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (lockdep_assert_locked()) return 0; chain = alloc_lock_chain(); @@ -2824,15 +3091,8 @@ static inline int add_chain_cache(struct task_struct *curr, BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks)); BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes)); - if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = nr_chain_hlocks; - for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx; - chain_hlocks[chain->base + j] = lock_id; - } - chain_hlocks[chain->base + j] = class - lock_classes; - nr_chain_hlocks += chain->depth; - } else { + j = alloc_chain_hlocks(chain->depth); + if (j < 0) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2841,9 +3101,16 @@ static inline int add_chain_cache(struct task_struct *curr, return 0; } + chain->base = j; + for (j = 0; j < chain->depth - 1; j++, i++) { + int lock_id = curr->held_locks[i].class_idx; + + chain_hlocks[chain->base + j] = lock_id; + } + chain_hlocks[chain->base + j] = class - lock_classes; hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); - inc_chains(); + inc_chains(chain->irq_context); return 1; } @@ -2987,6 +3254,8 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } + +static void init_chain_block_buckets(void) { } #endif /* CONFIG_PROVE_LOCKING */ /* @@ -3081,10 +3350,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, - trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - trace_hardirqs_enabled(curr), - trace_softirqs_enabled(curr)); + lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, + lockdep_hardirqs_enabled(curr), + 
lockdep_softirqs_enabled(curr)); print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); @@ -3429,9 +3698,9 @@ void lockdep_hardirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; - current->lockdep_recursion = 1; + current->lockdep_recursion++; __trace_hardirqs_on_caller(ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); } NOKPROBE_SYMBOL(lockdep_hardirqs_on); @@ -3468,7 +3737,7 @@ NOKPROBE_SYMBOL(lockdep_hardirqs_off); /* * Softirqs will be enabled: */ -void trace_softirqs_on(unsigned long ip) +void lockdep_softirqs_on(unsigned long ip) { struct task_struct *curr = current; @@ -3487,7 +3756,7 @@ void trace_softirqs_on(unsigned long ip) return; } - current->lockdep_recursion = 1; + current->lockdep_recursion++; /* * We'll do an OFF -> ON transition: */ @@ -3502,13 +3771,13 @@ void trace_softirqs_on(unsigned long ip) */ if (curr->hardirqs_enabled) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); } /* * Softirqs were disabled: */ -void trace_softirqs_off(unsigned long ip) +void lockdep_softirqs_off(unsigned long ip) { struct task_struct *curr = current; @@ -3596,7 +3865,8 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return 2 * !!task->hardirq_context + !!task->softirq_context; + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } static int separate_irq_context(struct task_struct *curr, @@ -3682,6 +3952,113 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } +static int +print_lock_invalid_wait_context(struct task_struct *curr, + struct held_lock *hlock) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + pr_warn("\n"); + pr_warn("=============================\n"); + pr_warn("[ BUG: Invalid wait context ]\n"); + print_kernel_ident(); + 
pr_warn("-----------------------------\n"); + + pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + print_lock(hlock); + + pr_warn("other info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + pr_warn("stack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Verify the wait_type context. + * + * This check validates we takes locks in the right wait-type order; that is it + * ensures that we do not take mutexes inside spinlocks and do not attempt to + * acquire spinlocks inside raw_spinlocks and the sort. + * + * The entire thing is slightly more complex because of RCU, RCU is a lock that + * can be taken from (pretty much) any context but also has constraints. + * However when taken in a stricter environment the RCU lock does not loosen + * the constraints. + * + * Therefore we must look for the strictest environment in the lock stack and + * compare that to the lock we're trying to acquire. + */ +static int check_wait_context(struct task_struct *curr, struct held_lock *next) +{ + short next_inner = hlock_class(next)->wait_type_inner; + short next_outer = hlock_class(next)->wait_type_outer; + short curr_inner; + int depth; + + if (!curr->lockdep_depth || !next_inner || next->trylock) + return 0; + + if (!next_outer) + next_outer = next_inner; + + /* + * Find start of current irq_context.. + */ + for (depth = curr->lockdep_depth - 1; depth >= 0; depth--) { + struct held_lock *prev = curr->held_locks + depth; + if (prev->irq_context != next->irq_context) + break; + } + depth++; + + /* + * Set appropriate wait type for the context; for IRQs we have to take + * into account force_irqthread as that is implied by PREEMPT_RT. + */ + if (curr->hardirq_context) { + /* + * Check if force_irqthreads will run us threaded. + */ + if (curr->hardirq_threaded || curr->irq_config) + curr_inner = LD_WAIT_CONFIG; + else + curr_inner = LD_WAIT_SPIN; + } else if (curr->softirq_context) { + /* + * Softirqs are always threaded. 
+ */ + curr_inner = LD_WAIT_CONFIG; + } else { + curr_inner = LD_WAIT_MAX; + } + + for (; depth < curr->lockdep_depth; depth++) { + struct held_lock *prev = curr->held_locks + depth; + short prev_inner = hlock_class(prev)->wait_type_inner; + + if (prev_inner) { + /* + * We can have a bigger inner than a previous one + * when outer is smaller than inner, as with RCU. + * + * Also due to trylocks. + */ + curr_inner = min(curr_inner, prev_inner); + } + } + + if (next_outer > curr_inner) + return print_lock_invalid_wait_context(curr, next); + + return 0; +} + #else /* CONFIG_PROVE_LOCKING */ static inline int @@ -3701,13 +4078,20 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } +static inline int check_wait_context(struct task_struct *curr, + struct held_lock *next) +{ + return 0; +} + #endif /* CONFIG_PROVE_LOCKING */ /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) +void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass, + short inner, short outer) { int i; @@ -3728,6 +4112,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; + lock->wait_type_outer = outer; + lock->wait_type_inner = inner; + /* * No key, no joy, we need to hash something. 
*/ @@ -3755,13 +4142,13 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; register_lock_class(lock, subclass, 1); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } } -EXPORT_SYMBOL_GPL(lockdep_init_map); +EXPORT_SYMBOL_GPL(lockdep_init_map_waits); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3862,7 +4249,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes; - if (depth) { + if (depth) { /* we're holding locks */ hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (!references) @@ -3904,6 +4291,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif hlock->pin_count = pin_count; + if (check_wait_context(curr, hlock)) + return 0; + /* Initialize the lock usage bit */ if (!mark_usage(curr, hlock, check)) return 0; @@ -4139,7 +4529,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; } - lockdep_init_map(lock, name, key, 0); + lockdep_init_map_waits(lock, name, key, 0, + lock->wait_type_inner, + lock->wait_type_outer); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes; @@ -4437,11 +4829,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name, return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_set_class); @@ -4454,15 +4846,45 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip) return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; 
check_flags(flags); if (__lock_downgrade(lock, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_downgrade); +/* NMI context !!! */ +static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock, int subclass) +{ +#ifdef CONFIG_PROVE_LOCKING + struct lock_class *class = look_up_lock_class(lock, subclass); + + /* if it doesn't have a class (yet), it certainly hasn't been used yet */ + if (!class) + return; + + if (!(class->usage_mask & LOCK_USED)) + return; + + hlock->class_idx = class - lock_classes; + + print_usage_bug(current, hlock, LOCK_USED, LOCK_USAGE_STATES); +#endif +} + +static bool lockdep_nmi(void) +{ + if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + return false; + + if (!in_nmi()) + return false; + + return true; +} + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -4473,17 +4895,34 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(current->lockdep_recursion)) { + /* XXX allow trylock from NMI ?!? 
*/ + if (lockdep_nmi() && !trylock) { + struct held_lock hlock; + + hlock.acquire_ip = ip; + hlock.instance = lock; + hlock.nest_lock = nest_lock; + hlock.irq_context = 2; // XXX + hlock.trylock = trylock; + hlock.read = read; + hlock.check = check; + hlock.hardirqs_off = true; + hlock.references = 0; + + verify_lock_unused(lock, &hlock, subclass); + } return; + } raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0, 0); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquire); @@ -4497,11 +4936,11 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_release(lock, ip); if (__lock_release(lock, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_release); @@ -4517,9 +4956,9 @@ int lock_is_held_type(const struct lockdep_map *lock, int read) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; ret = __lock_is_held(lock, read); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); return ret; @@ -4538,9 +4977,9 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; cookie = __lock_pin_lock(lock); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); return cookie; @@ -4557,9 +4996,9 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) raw_local_irq_save(flags); 
check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_repin_lock(lock, cookie); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_repin_lock); @@ -4574,9 +5013,9 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_unpin_lock(lock, cookie); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_unpin_lock); @@ -4712,10 +5151,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_contended(lock, ip); __lock_contended(lock, ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_contended); @@ -4732,9 +5171,9 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_acquired(lock, ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquired); @@ -4768,57 +5207,33 @@ static void remove_class_from_lock_chain(struct pending_free *pf, struct lock_class *class) { #ifdef CONFIG_PROVE_LOCKING - struct lock_chain *new_chain; - u64 chain_key; int i; for (i = chain->base; i < chain->base + chain->depth; i++) { if (chain_hlocks[i] != class - lock_classes) continue; - /* The code below leaks one chain_hlock[] entry. 
*/ - if (--chain->depth > 0) { - memmove(&chain_hlocks[i], &chain_hlocks[i + 1], - (chain->base + chain->depth - i) * - sizeof(chain_hlocks[0])); - } /* * Each lock class occurs at most once in a lock chain so once * we found a match we can break out of this loop. */ - goto recalc; + goto free_lock_chain; } /* Since the chain has not been modified, return. */ return; -recalc: - chain_key = INITIAL_CHAIN_KEY; - for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); - if (chain->depth && chain->chain_key == chain_key) - return; +free_lock_chain: + free_chain_hlocks(chain->base, chain->depth); /* Overwrite the chain key for concurrent RCU readers. */ - WRITE_ONCE(chain->chain_key, chain_key); + WRITE_ONCE(chain->chain_key, INITIAL_CHAIN_KEY); + dec_chains(chain->irq_context); + /* * Note: calling hlist_del_rcu() from inside a * hlist_for_each_entry_rcu() loop is safe. */ hlist_del_rcu(&chain->entry); __set_bit(chain - lock_chains, pf->lock_chains_being_freed); - if (chain->depth == 0) - return; - /* - * If the modified lock chain matches an existing lock chain, drop - * the modified lock chain. 
- */ - if (lookup_chain_cache(chain_key)) - return; - new_chain = alloc_lock_chain(); - if (WARN_ON_ONCE(!new_chain)) { - debug_locks_off(); - return; - } - *new_chain = *chain; - hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key)); + nr_zapped_lock_chains++; #endif } @@ -4874,6 +5289,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) } remove_class_from_lock_chains(pf, class); + nr_zapped_classes++; } static void reinit_class(struct lock_class *class) @@ -4958,8 +5374,7 @@ static void free_zapped_rcu(struct rcu_head *ch) return; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - current->lockdep_recursion = 1; + lockdep_lock(); /* closed head */ pf = delayed_free.pf + (delayed_free.index ^ 1); @@ -4971,8 +5386,7 @@ static void free_zapped_rcu(struct rcu_head *ch) */ call_rcu_zapped(delayed_free.pf + delayed_free.index); - current->lockdep_recursion = 0; - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } @@ -5017,13 +5431,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) init_data_structures_once(); raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - current->lockdep_recursion = 1; + lockdep_lock(); pf = get_pending_free(); __lockdep_free_key_range(pf, start, size); call_rcu_zapped(pf); - current->lockdep_recursion = 0; - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); /* @@ -5045,10 +5457,10 @@ static void lockdep_free_key_range_imm(void *start, unsigned long size) init_data_structures_once(); raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); __lockdep_free_key_range(pf, start, size); __free_zapped_classes(pf); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } @@ -5144,10 +5556,10 @@ static void lockdep_reset_lock_imm(struct lockdep_map *lock) unsigned long flags; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); 
__lockdep_reset_lock(pf, lock); __free_zapped_classes(pf); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 18d85aebbb57..baca699b94e9 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -106,6 +106,12 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define STACK_TRACE_HASH_SIZE 16384 #endif +/* + * Bit definitions for lock_chain.irq_context + */ +#define LOCK_CHAIN_SOFTIRQ_CONTEXT (1 << 0) +#define LOCK_CHAIN_HARDIRQ_CONTEXT (1 << 1) + #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) @@ -124,17 +130,21 @@ extern const char *__get_key_name(const struct lockdep_subclass_key *key, struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); extern unsigned long nr_lock_classes; +extern unsigned long nr_zapped_classes; +extern unsigned long nr_zapped_lock_chains; extern unsigned long nr_list_entries; long lockdep_next_lockchain(long i); unsigned long lock_chain_count(void); -extern int nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; extern unsigned int nr_hardirq_chains; extern unsigned int nr_softirq_chains; extern unsigned int nr_process_chains; -extern unsigned int max_lockdep_depth; +extern unsigned int nr_free_chain_hlocks; +extern unsigned int nr_lost_chain_hlocks; +extern unsigned int nr_large_chain_blocks; +extern unsigned int max_lockdep_depth; extern unsigned int max_bfs_queue_depth; #ifdef CONFIG_PROVE_LOCKING diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 231684cfc5ae..5525cd3ba0c8 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -128,15 +128,22 @@ static int lc_show(struct seq_file *m, void *v) struct lock_chain *chain = v; struct lock_class *class; int i; + static const char * const irq_strs[] = { + [0] = "0", + 
[LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq", + [LOCK_CHAIN_SOFTIRQ_CONTEXT] = "softirq", + [LOCK_CHAIN_SOFTIRQ_CONTEXT| + LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq|softirq", + }; if (v == SEQ_START_TOKEN) { - if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS) + if (!nr_free_chain_hlocks) seq_printf(m, "(buggered) "); seq_printf(m, "all lock chains:\n"); return 0; } - seq_printf(m, "irq_context: %d\n", chain->irq_context); + seq_printf(m, "irq_context: %s\n", irq_strs[chain->irq_context]); for (i = 0; i < chain->depth; i++) { class = lock_chain_get_class(chain, i); @@ -271,8 +278,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", lock_chain_count(), MAX_LOCKDEP_CHAINS); - seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", - nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); + seq_printf(m, " dependency chain hlocks used: %11lu [max: %lu]\n", + MAX_LOCKDEP_CHAIN_HLOCKS - + (nr_free_chain_hlocks + nr_lost_chain_hlocks), + MAX_LOCKDEP_CHAIN_HLOCKS); + seq_printf(m, " dependency chain hlocks lost: %11u\n", + nr_lost_chain_hlocks); #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -336,6 +347,18 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " debug_locks: %11u\n", debug_locks); + /* + * Zappped classes and lockdep data buffers reuse statistics. 
+ */ + seq_puts(m, "\n"); + seq_printf(m, " zapped classes: %11lu\n", + nr_zapped_classes); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " zapped lock chains: %11lu\n", + nr_zapped_lock_chains); + seq_printf(m, " large chain blocks: %11u\n", + nr_large_chain_blocks); +#endif return 0; } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 99475a66c94f..5efbfc68ce99 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -618,7 +618,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { static int lock_torture_writer(void *arg) { struct lock_stress_stats *lwsp = arg; - static DEFINE_TORTURE_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); set_user_nice(current, MAX_NICE); @@ -655,7 +655,7 @@ static int lock_torture_writer(void *arg) static int lock_torture_reader(void *arg) { struct lock_stress_stats *lrsp = arg; - static DEFINE_TORTURE_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_reader task started"); set_user_nice(current, MAX_NICE); @@ -696,15 +696,16 @@ static void __torture_print_stats(char *page, if (statp[i].n_lock_fail) fail = true; sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_fail) - max = statp[i].n_lock_fail; - if (min > statp[i].n_lock_fail) - min = statp[i].n_lock_fail; + if (max < statp[i].n_lock_acquired) + max = statp[i].n_lock_acquired; + if (min > statp[i].n_lock_acquired) + min = statp[i].n_lock_acquired; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", write ? "Writes" : "Reads ", - sum, max, min, max / 2 > min ? "???" : "", + sum, max, min, + !onoff_interval && max / 2 > min ? "???" : "", fail, fail ? "!!!" 
: ""); if (fail) atomic_inc(&cxt.n_lock_torture_errors); diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 771d4ca96dda..a7276aaf2abc 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -85,7 +85,7 @@ void debug_mutex_init(struct mutex *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); #endif lock->magic = lock; } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 364d38a0c444..a008a1ba21a7 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,27 +1,29 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/atomic.h> -#include <linux/rwsem.h> #include <linux/percpu.h> +#include <linux/wait.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include <linux/errno.h> -#include "rwsem.h" - int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, - const char *name, struct lock_class_key *rwsem_key) + const char *name, struct lock_class_key *key) { sem->read_count = alloc_percpu(int); if (unlikely(!sem->read_count)) return -ENOMEM; - /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ rcu_sync_init(&sem->rss); - __init_rwsem(&sem->rw_sem, name, rwsem_key); rcuwait_init(&sem->writer); - sem->readers_block = 0; + init_waitqueue_head(&sem->waiters); + atomic_set(&sem->block, 0); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); @@ -41,73 +43,139 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem) } EXPORT_SYMBOL_GPL(percpu_free_rwsem); -int __percpu_down_read(struct percpu_rw_semaphore *sem, 
int try) +static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { + __this_cpu_inc(*sem->read_count); + /* * Due to having preemption disabled the decrement happens on * the same CPU as the increment, avoiding the * increment-on-one-CPU-and-decrement-on-another problem. * - * If the reader misses the writer's assignment of readers_block, then - * the writer is guaranteed to see the reader's increment. + * If the reader misses the writer's assignment of sem->block, then the + * writer is guaranteed to see the reader's increment. * * Conversely, any readers that increment their sem->read_count after - * the writer looks are guaranteed to see the readers_block value, - * which in turn means that they are guaranteed to immediately - * decrement their sem->read_count, so that it doesn't matter that the - * writer missed them. + * the writer looks are guaranteed to see the sem->block value, which + * in turn means that they are guaranteed to immediately decrement + * their sem->read_count, so that it doesn't matter that the writer + * missed them. */ smp_mb(); /* A matches D */ /* - * If !readers_block the critical section starts here, matched by the + * If !sem->block the critical section starts here, matched by the * release in percpu_up_write(). 
*/ - if (likely(!smp_load_acquire(&sem->readers_block))) + if (likely(!atomic_read_acquire(&sem->block))) + return true; + + __this_cpu_dec(*sem->read_count); + + /* Prod writer to re-evaluate readers_active_check() */ + rcuwait_wake_up(&sem->writer); + + return false; +} + +static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem) +{ + if (atomic_read(&sem->block)) + return false; + + return atomic_xchg(&sem->block, 1) == 0; +} + +static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader) +{ + if (reader) { + bool ret; + + preempt_disable(); + ret = __percpu_down_read_trylock(sem); + preempt_enable(); + + return ret; + } + return __percpu_down_write_trylock(sem); +} + +/* + * The return value of wait_queue_entry::func means: + * + * <0 - error, wakeup is terminated and the error is returned + * 0 - no wakeup, a next waiter is tried + * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive. + * + * We use EXCLUSIVE for both readers and writers to preserve FIFO order, + * and play games with the return value to allow waking multiple readers. + * + * Specifically, we wake readers until we've woken a single writer, or until a + * trylock fails. + */ +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, + unsigned int mode, int wake_flags, + void *key) +{ + struct task_struct *p = get_task_struct(wq_entry->private); + bool reader = wq_entry->flags & WQ_FLAG_CUSTOM; + struct percpu_rw_semaphore *sem = key; + + /* concurrent against percpu_down_write(), can get stolen */ + if (!__percpu_rwsem_trylock(sem, reader)) return 1; - /* - * Per the above comment; we still have preemption disabled and - * will thus decrement on the same CPU as we incremented. 
- */ - __percpu_up_read(sem); + list_del_init(&wq_entry->entry); + smp_store_release(&wq_entry->private, NULL); - if (try) - return 0; + wake_up_process(p); + put_task_struct(p); - /* - * We either call schedule() in the wait, or we'll fall through - * and reschedule on the preempt_enable() in percpu_down_read(). - */ - preempt_enable_no_resched(); + return !reader; /* wake (readers until) 1 writer */ +} + +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) +{ + DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); + bool wait; + spin_lock_irq(&sem->waiters.lock); /* - * Avoid lockdep for the down/up_read() we already have them. + * Serialize against the wakeup in percpu_up_write(), if we fail + * the trylock, the wakeup must see us on the list. */ - __down_read(&sem->rw_sem); - this_cpu_inc(*sem->read_count); - __up_read(&sem->rw_sem); + wait = !__percpu_rwsem_trylock(sem, reader); + if (wait) { + wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM; + __add_wait_queue_entry_tail(&sem->waiters, &wq_entry); + } + spin_unlock_irq(&sem->waiters.lock); - preempt_disable(); - return 1; + while (wait) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!smp_load_acquire(&wq_entry.private)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL_GPL(__percpu_down_read); -void __percpu_up_read(struct percpu_rw_semaphore *sem) +bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) { - smp_mb(); /* B matches C */ - /* - * In other words, if they see our decrement (presumably to aggregate - * zero, as that is the only time it matters) they will also see our - * critical section. 
- */ - __this_cpu_dec(*sem->read_count); + if (__percpu_down_read_trylock(sem)) + return true; - /* Prod writer to recheck readers_active */ - rcuwait_wake_up(&sem->writer); + if (try) + return false; + + preempt_enable(); + percpu_rwsem_wait(sem, /* .reader = */ true); + preempt_disable(); + + return true; } -EXPORT_SYMBOL_GPL(__percpu_up_read); +EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ @@ -124,6 +192,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read); * zero. If this sum is zero, then it is stable due to the fact that if any * newly arriving readers increment a given counter, they will immediately * decrement that same counter. + * + * Assumes sem->block is set. */ static bool readers_active_check(struct percpu_rw_semaphore *sem) { @@ -142,32 +212,36 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem) void percpu_down_write(struct percpu_rw_semaphore *sem) { + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + /* Notify readers to take the slow path. */ rcu_sync_enter(&sem->rss); - down_write(&sem->rw_sem); - /* - * Notify new readers to block; up until now, and thus throughout the - * longish rcu_sync_enter() above, new readers could still come in. + * Try set sem->block; this provides writer-writer exclusion. + * Having sem->block set makes new readers block. */ - WRITE_ONCE(sem->readers_block, 1); + if (!__percpu_down_write_trylock(sem)) + percpu_rwsem_wait(sem, /* .reader = */ false); - smp_mb(); /* D matches A */ + /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */ /* - * If they don't see our writer of readers_block, then we are - * guaranteed to see their sem->read_count increment, and therefore - * will wait for them. + * If they don't see our store of sem->block, then we are guaranteed to + * see their sem->read_count increment, and therefore will wait for + * them. */ - /* Wait for all now active readers to complete. 
*/ - rcuwait_wait_event(&sem->writer, readers_active_check(sem)); + /* Wait for all active readers to complete. */ + rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *sem) { + rwsem_release(&sem->dep_map, _RET_IP_); + /* * Signal the writer is done, no fast path yet. * @@ -178,12 +252,12 @@ void percpu_up_write(struct percpu_rw_semaphore *sem) * Therefore we force it through the slow path which guarantees an * acquire and thereby guarantees the critical section's consistency. */ - smp_store_release(&sem->readers_block, 0); + atomic_set_release(&sem->block, 0); /* - * Release the write lock, this will allow readers back in the game. + * Prod any pending reader/writer to make progress. */ - up_write(&sem->rw_sem); + __wake_up(&sem->waiters, TASK_NORMAL, 1, sem); /* * Once this completes (at least one RCU-sched grace period hence) the diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 851bbb10819d..c9f090d64f00 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -57,7 +57,7 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; - lock->owner = (struct task_struct *)val; + WRITE_ONCE(lock->owner, (struct task_struct *)val); } static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 0d9b6be9ecc8..f11b9bd3431d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -28,7 +28,6 @@ #include <linux/rwsem.h> #include <linux/atomic.h> -#include "rwsem.h" #include "lock_events.h" /* @@ -329,7 +328,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, * Make sure we are not reinitializing a held semaphore: */ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); + 
lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); #endif #ifdef CONFIG_DEBUG_RWSEMS sem->magic = sem; @@ -660,8 +659,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, unsigned long flags; bool ret = true; - BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); - if (need_resched()) { lockevent_inc(rwsem_opt_fail); return false; @@ -1338,7 +1335,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* * lock for reading */ -inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_semaphore *sem) { if (!rwsem_read_trylock(sem)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); @@ -1426,7 +1423,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_semaphore *sem) { long tmp; diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 2534ce49f648..e69de29bb2d1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __INTERNAL_RWSEM_H -#define __INTERNAL_RWSEM_H -#include <linux/rwsem.h> - -extern void __down_read(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); - -#endif /* __INTERNAL_RWSEM_H */ diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 472dd462a40c..b9d93087ee66 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -14,14 +14,14 @@ #include <linux/export.h> void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key, short inner) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, 
inner); #endif lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; lock->magic = SPINLOCK_MAGIC; @@ -39,7 +39,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG); #endif lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; lock->magic = RWLOCK_MAGIC; diff --git a/kernel/notifier.c b/kernel/notifier.c index 63d7501ac638..5989bbb93039 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,7 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_all(); + vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/pid.c b/kernel/pid.c index 0f4ecb57214c..647b4bb457b5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,6 +247,16 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tmp = tmp->parent; } + /* + * ENOMEM is not the most obvious choice especially for the case + * where the child subreaper has already exited and the pid + * namespace denies the creation of any new processes. But ENOMEM + * is what we have exposed to userspace for a long time and it is + * documented behavior for pid namespaces. So we can't easily + * change it even if there were an error code better suited. + */ + retval = -ENOMEM; + if (unlikely(is_child_reaper(pid))) { if (pid_ns_prepare_proc(ns)) goto out_free; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 83edf8698118..db0bed2cae26 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -1,31 +1,21 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * This module exposes the interface to kernel space for specifying - * QoS dependencies. 
It provides infrastructure for registration of: + * Power Management Quality of Service (PM QoS) support base. * - * Dependents on a QoS value : register requests - * Watchers of QoS value : get notified when target QoS value changes + * Copyright (C) 2020 Intel Corporation * - * This QoS design is best effort based. Dependents register their QoS needs. - * Watchers register to keep track of the current QoS needs of the system. + * Authors: + * Mark Gross <mgross@linux.intel.com> + * Rafael J. Wysocki <rafael.j.wysocki@intel.com> * - * There are 3 basic classes of QoS parameter: latency, timeout, throughput - * each have defined units: - * latency: usec - * timeout: usec <-- currently not used. - * throughput: kbs (kilo byte / sec) + * Provided here is an interface for specifying PM QoS dependencies. It allows + * entities depending on QoS constraints to register their requests which are + * aggregated as appropriate to produce effective constraints (target values) + * that can be monitored by entities needing to respect them, either by polling + * or through a built-in notification mechanism. * - * There are lists of pm_qos_objects each one wrapping requests, notifiers - * - * User mode requests on a QOS parameter register themselves to the - * subsystem by opening the device node /dev/... and writing there request to - * the node. As long as the process holds a file handle open to the node the - * client continues to be accounted for. Upon file release the usermode - * request is removed and a new qos target is computed. This way when the - * request that the application has is cleaned up when closes the file - * pointer or exits the pm_qos_object will get an opportunity to clean up. - * - * Mark Gross <mgross@linux.intel.com> + * In addition to the basic functionality, more specific interfaces for managing + * global CPU latency QoS requests and frequency QoS requests are provided. 
*/ /*#define DEBUG*/ @@ -54,56 +44,19 @@ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct pm_qos_object { - struct pm_qos_constraints *constraints; - struct miscdevice pm_qos_power_miscdev; - char *name; -}; - static DEFINE_SPINLOCK(pm_qos_lock); -static struct pm_qos_object null_pm_qos; - -static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); -static struct pm_qos_constraints cpu_dma_constraints = { - .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), - .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN, - .notifiers = &cpu_dma_lat_notifier, -}; -static struct pm_qos_object cpu_dma_pm_qos = { - .constraints = &cpu_dma_constraints, - .name = "cpu_dma_latency", -}; - -static struct pm_qos_object *pm_qos_array[] = { - &null_pm_qos, - &cpu_dma_pm_qos, -}; - -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos); -static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, - size_t count, loff_t *f_pos); -static int pm_qos_power_open(struct inode *inode, struct file *filp); -static int pm_qos_power_release(struct inode *inode, struct file *filp); - -static const struct file_operations pm_qos_power_fops = { - .write = pm_qos_power_write, - .read = pm_qos_power_read, - .open = pm_qos_power_open, - .release = pm_qos_power_release, - .llseek = noop_llseek, -}; - -/* unlocked internal variant */ -static inline int pm_qos_get_value(struct pm_qos_constraints *c) +/** + * pm_qos_read_value - Return the current effective constraint value. + * @c: List of PM QoS constraint requests. 
+ */ +s32 pm_qos_read_value(struct pm_qos_constraints *c) { - struct plist_node *node; - int total_value = 0; + return READ_ONCE(c->target_value); +} +static int pm_qos_get_value(struct pm_qos_constraints *c) +{ if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -114,111 +67,42 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; - case PM_QOS_SUM: - plist_for_each(node, &c->list) - total_value += node->prio; - - return total_value; - default: - /* runtime check for not using enum */ - BUG(); + WARN(1, "Unknown PM QoS type in %s\n", __func__); return PM_QOS_DEFAULT_VALUE; } } -s32 pm_qos_read_value(struct pm_qos_constraints *c) -{ - return c->target_value; -} - -static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) +static void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) { - c->target_value = value; + WRITE_ONCE(c->target_value, value); } -static int pm_qos_debug_show(struct seq_file *s, void *unused) -{ - struct pm_qos_object *qos = (struct pm_qos_object *)s->private; - struct pm_qos_constraints *c; - struct pm_qos_request *req; - char *type; - unsigned long flags; - int tot_reqs = 0; - int active_reqs = 0; - - if (IS_ERR_OR_NULL(qos)) { - pr_err("%s: bad qos param!\n", __func__); - return -EINVAL; - } - c = qos->constraints; - if (IS_ERR_OR_NULL(c)) { - pr_err("%s: Bad constraints on qos?\n", __func__); - return -EINVAL; - } - - /* Lock to ensure we have a snapshot */ - spin_lock_irqsave(&pm_qos_lock, flags); - if (plist_head_empty(&c->list)) { - seq_puts(s, "Empty!\n"); - goto out; - } - - switch (c->type) { - case PM_QOS_MIN: - type = "Minimum"; - break; - case PM_QOS_MAX: - type = "Maximum"; - break; - case PM_QOS_SUM: - type = "Sum"; - break; - default: - type = "Unknown"; - } - - plist_for_each_entry(req, &c->list, node) { - char *state = "Default"; - - if ((req->node).prio != c->default_value) { - active_reqs++; - state = "Active"; - } - 
tot_reqs++; - seq_printf(s, "%d: %d: %s\n", tot_reqs, - (req->node).prio, state); - } - - seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n", - type, pm_qos_get_value(c), active_reqs, tot_reqs); - -out: - spin_unlock_irqrestore(&pm_qos_lock, flags); - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(pm_qos_debug); - /** - * pm_qos_update_target - manages the constraints list and calls the notifiers - * if needed - * @c: constraints data struct - * @node: request to add to the list, to update or to remove - * @action: action to take on the constraints list - * @value: value of the request to add or update + * pm_qos_update_target - Update a list of PM QoS constraint requests. + * @c: List of PM QoS requests. + * @node: Target list entry. + * @action: Action to carry out (add, update or remove). + * @value: New request value for the target list entry. * - * This function returns 1 if the aggregated constraint value has changed, 0 - * otherwise. + * Update the given list of PM QoS constraint requests, @c, by carrying an + * @action involving the @node list entry and @value on it. + * + * The recognized values of @action are PM_QOS_ADD_REQ (store @value in @node + * and add it to the list), PM_QOS_UPDATE_REQ (remove @node from the list, store + * @value in it and add it to the list again), and PM_QOS_REMOVE_REQ (remove + * @node from the list, ignore @value). + * + * Return: 1 if the aggregate constraint value has changed, 0 otherwise. 
*/ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value) { - unsigned long flags; int prev_value, curr_value, new_value; - int ret; + unsigned long flags; spin_lock_irqsave(&pm_qos_lock, flags); + prev_value = pm_qos_get_value(c); if (value == PM_QOS_DEFAULT_VALUE) new_value = c->default_value; @@ -231,9 +115,8 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, break; case PM_QOS_UPDATE_REQ: /* - * to change the list, we atomically remove, reinit - * with new value and add, then see if the extremal - * changed + * To change the list, atomically remove, reinit with new value + * and add, then see if the aggregate has changed. */ plist_del(node, &c->list); /* fall through */ @@ -252,16 +135,14 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, spin_unlock_irqrestore(&pm_qos_lock, flags); trace_pm_qos_update_target(action, prev_value, curr_value); - if (prev_value != curr_value) { - ret = 1; - if (c->notifiers) - blocking_notifier_call_chain(c->notifiers, - (unsigned long)curr_value, - NULL); - } else { - ret = 0; - } - return ret; + + if (prev_value == curr_value) + return 0; + + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, curr_value, NULL); + + return 1; } /** @@ -283,14 +164,12 @@ static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, /** * pm_qos_update_flags - Update a set of PM QoS flags. - * @pqf: Set of flags to update. + * @pqf: Set of PM QoS flags to update. * @req: Request to add to the set, to modify, or to remove from the set. * @action: Action to take on the set. * @val: Value of the request to add or modify. * - * Update the given set of PM QoS flags and call notifiers if the aggregate - * value has changed. Returns 1 if the aggregate constraint value has changed, - * 0 otherwise. + * Return: 1 if the aggregate constraint value has changed, 0 otherwise. 
*/ bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, @@ -326,288 +205,180 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, spin_unlock_irqrestore(&pm_qos_lock, irqflags); trace_pm_qos_update_flags(action, prev_value, curr_value); - return prev_value != curr_value; -} -/** - * pm_qos_request - returns current system wide qos expectation - * @pm_qos_class: identification of which qos value is requested - * - * This function returns the current target value. - */ -int pm_qos_request(int pm_qos_class) -{ - return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); -} -EXPORT_SYMBOL_GPL(pm_qos_request); - -int pm_qos_request_active(struct pm_qos_request *req) -{ - return req->pm_qos_class != 0; + return prev_value != curr_value; } -EXPORT_SYMBOL_GPL(pm_qos_request_active); -static void __pm_qos_update_request(struct pm_qos_request *req, - s32 new_value) -{ - trace_pm_qos_update_request(req->pm_qos_class, new_value); +#ifdef CONFIG_CPU_IDLE +/* Definitions related to the CPU latency QoS. */ - if (new_value != req->node.prio) - pm_qos_update_target( - pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); -} +static struct pm_qos_constraints cpu_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_latency_constraints.list), + .target_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .type = PM_QOS_MIN, +}; /** - * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout - * @work: work struct for the delayed work (timeout) - * - * This cancels the timeout request by falling back to the default at timeout. + * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. 
*/ -static void pm_qos_work_fn(struct work_struct *work) +s32 cpu_latency_qos_limit(void) { - struct pm_qos_request *req = container_of(to_delayed_work(work), - struct pm_qos_request, - work); - - __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); + return pm_qos_read_value(&cpu_latency_constraints); } /** - * pm_qos_add_request - inserts new qos request into the list - * @req: pointer to a preallocated handle - * @pm_qos_class: identifies which list of qos request to use - * @value: defines the qos request + * cpu_latency_qos_request_active - Check the given PM QoS request. + * @req: PM QoS request to check. * - * This function inserts a new entry in the pm_qos_class list of requested qos - * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters and initializes the pm_qos_request - * handle. Caller needs to save this handle for later use in updates and - * removal. + * Return: 'true' if @req has been added to the CPU latency QoS list, 'false' + * otherwise. 
*/ - -void pm_qos_add_request(struct pm_qos_request *req, - int pm_qos_class, s32 value) +bool cpu_latency_qos_request_active(struct pm_qos_request *req) { - if (!req) /*guard against callers passing in null */ - return; + return req->qos == &cpu_latency_constraints; +} +EXPORT_SYMBOL_GPL(cpu_latency_qos_request_active); - if (pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); - return; - } - req->pm_qos_class = pm_qos_class; - INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); - trace_pm_qos_add_request(pm_qos_class, value); - pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, - &req->node, PM_QOS_ADD_REQ, value); +static void cpu_latency_qos_apply(struct pm_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret = pm_qos_update_target(req->qos, &req->node, action, value); + if (ret > 0) + wake_up_all_idle_cpus(); } -EXPORT_SYMBOL_GPL(pm_qos_add_request); /** - * pm_qos_update_request - modifies an existing qos request - * @req : handle to list element holding a pm_qos request to use - * @value: defines the qos request + * cpu_latency_qos_add_request - Add new CPU latency QoS request. + * @req: Pointer to a preallocated handle. + * @value: Requested constraint value. * - * Updates an existing qos request for the pm_qos_class of parameters along - * with updating the target pm_qos_class value. + * Use @value to initialize the request handle pointed to by @req, insert it as + * a new entry to the CPU latency QoS list and recompute the effective QoS + * constraint for that list. * - * Attempts are made to make this code callable on hot code paths. + * Callers need to save the handle for later use in updates and removal of the + * QoS request represented by it. 
*/ -void pm_qos_update_request(struct pm_qos_request *req, - s32 new_value) +void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) { - if (!req) /*guard against callers passing in null */ + if (!req) return; - if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + if (cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for already added request\n", __func__); return; } - cancel_delayed_work_sync(&req->work); - __pm_qos_update_request(req, new_value); + trace_pm_qos_add_request(value); + + req->qos = &cpu_latency_constraints; + cpu_latency_qos_apply(req, PM_QOS_ADD_REQ, value); } -EXPORT_SYMBOL_GPL(pm_qos_update_request); +EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); /** - * pm_qos_update_request_timeout - modifies an existing qos request temporarily. - * @req : handle to list element holding a pm_qos request to use - * @new_value: defines the temporal qos request - * @timeout_us: the effective duration of this qos request in usecs. + * cpu_latency_qos_update_request - Modify existing CPU latency QoS request. + * @req : QoS request to update. + * @new_value: New requested constraint value. * - * After timeout_us, this qos request is cancelled automatically. + * Use @new_value to update the QoS request represented by @req in the CPU + * latency QoS list along with updating the effective constraint value for that + * list. 
*/ -void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, - unsigned long timeout_us) +void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) { if (!req) return; - if (WARN(!pm_qos_request_active(req), - "%s called for unknown object.", __func__)) + + if (!cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; + } - cancel_delayed_work_sync(&req->work); + trace_pm_qos_update_request(new_value); - trace_pm_qos_update_request_timeout(req->pm_qos_class, - new_value, timeout_us); - if (new_value != req->node.prio) - pm_qos_update_target( - pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); + if (new_value == req->node.prio) + return; - schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); + cpu_latency_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); } +EXPORT_SYMBOL_GPL(cpu_latency_qos_update_request); /** - * pm_qos_remove_request - modifies an existing qos request - * @req: handle to request list element + * cpu_latency_qos_remove_request - Remove existing CPU latency QoS request. + * @req: QoS request to remove. * - * Will remove pm qos request from the list of constraints and - * recompute the current target value for the pm_qos_class. Call this - * on slow code paths. + * Remove the CPU latency QoS request represented by @req from the CPU latency + * QoS list along with updating the effective constraint value for that list. 
*/ -void pm_qos_remove_request(struct pm_qos_request *req) +void cpu_latency_qos_remove_request(struct pm_qos_request *req) { - if (!req) /*guard against callers passing in null */ + if (!req) return; - /* silent return to keep pcm code cleaner */ - if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + if (!cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; } - cancel_delayed_work_sync(&req->work); + trace_pm_qos_remove_request(PM_QOS_DEFAULT_VALUE); - trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE); - pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_REMOVE_REQ, - PM_QOS_DEFAULT_VALUE); + cpu_latency_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } -EXPORT_SYMBOL_GPL(pm_qos_remove_request); - -/** - * pm_qos_add_notifier - sets notification entry for changes to target value - * @pm_qos_class: identifies which qos target changes should be notified. - * @notifier: notifier block managed by caller. - * - * will register the notifier into a notification chain that gets called - * upon changes to the pm_qos_class target value. - */ -int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) -{ - int retval; - - retval = blocking_notifier_chain_register( - pm_qos_array[pm_qos_class]->constraints->notifiers, - notifier); - - return retval; -} -EXPORT_SYMBOL_GPL(pm_qos_add_notifier); - -/** - * pm_qos_remove_notifier - deletes notification entry from chain. - * @pm_qos_class: identifies which qos target changes are notified. - * @notifier: notifier block to be removed. - * - * will remove the notifier from the notification chain that gets called - * upon changes to the pm_qos_class target value. 
- */ -int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) -{ - int retval; +EXPORT_SYMBOL_GPL(cpu_latency_qos_remove_request); - retval = blocking_notifier_chain_unregister( - pm_qos_array[pm_qos_class]->constraints->notifiers, - notifier); +/* User space interface to the CPU latency QoS via misc device. */ - return retval; -} -EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); - -/* User space interface to PM QoS classes via misc devices */ -static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) +static int cpu_latency_qos_open(struct inode *inode, struct file *filp) { - qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; - qos->pm_qos_power_miscdev.name = qos->name; - qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - - debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos, - &pm_qos_debug_fops); + struct pm_qos_request *req; - return misc_register(&qos->pm_qos_power_miscdev); -} + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; -static int find_pm_qos_object_by_minor(int minor) -{ - int pm_qos_class; + cpu_latency_qos_add_request(req, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; - for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY; - pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { - if (minor == - pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) - return pm_qos_class; - } - return -1; + return 0; } -static int pm_qos_power_open(struct inode *inode, struct file *filp) +static int cpu_latency_qos_release(struct inode *inode, struct file *filp) { - long pm_qos_class; - - pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); - if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) { - struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - return -ENOMEM; - - pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); - filp->private_data = req; - - return 0; - } - return -EPERM; -} + struct pm_qos_request *req = filp->private_data; -static int 
pm_qos_power_release(struct inode *inode, struct file *filp) -{ - struct pm_qos_request *req; + filp->private_data = NULL; - req = filp->private_data; - pm_qos_remove_request(req); + cpu_latency_qos_remove_request(req); kfree(req); return 0; } - -static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, - size_t count, loff_t *f_pos) +static ssize_t cpu_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) { - s32 value; - unsigned long flags; struct pm_qos_request *req = filp->private_data; + unsigned long flags; + s32 value; - if (!req) - return -EINVAL; - if (!pm_qos_request_active(req)) + if (!req || !cpu_latency_qos_request_active(req)) return -EINVAL; spin_lock_irqsave(&pm_qos_lock, flags); - value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); + value = pm_qos_get_value(&cpu_latency_constraints); spin_unlock_irqrestore(&pm_qos_lock, flags); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); } -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos) +static ssize_t cpu_latency_qos_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos) { s32 value; - struct pm_qos_request *req; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) @@ -620,36 +391,38 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, return ret; } - req = filp->private_data; - pm_qos_update_request(req, value); + cpu_latency_qos_update_request(filp->private_data, value); return count; } +static const struct file_operations cpu_latency_qos_fops = { + .write = cpu_latency_qos_write, + .read = cpu_latency_qos_read, + .open = cpu_latency_qos_open, + .release = cpu_latency_qos_release, + .llseek = noop_llseek, +}; -static int __init pm_qos_power_init(void) -{ - int ret = 0; - int i; - struct dentry *d; - - BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); +static struct miscdevice 
cpu_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_dma_latency", + .fops = &cpu_latency_qos_fops, +}; - d = debugfs_create_dir("pm_qos", NULL); +static int __init cpu_latency_qos_init(void) +{ + int ret; - for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { - ret = register_pm_qos_misc(pm_qos_array[i], d); - if (ret < 0) { - pr_err("%s: %s setup failed\n", - __func__, pm_qos_array[i]->name); - return ret; - } - } + ret = misc_register(&cpu_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_latency_qos_miscdev.name); return ret; } - -late_initcall(pm_qos_power_init); +late_initcall(cpu_latency_qos_init); +#endif /* CONFIG_CPU_IDLE */ /* Definitions related to the frequency QoS below. */ diff --git a/kernel/power/user.c b/kernel/power/user.c index 77438954cc2b..58ed9478787f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -409,21 +409,7 @@ snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case SNAPSHOT_GET_IMAGE_SIZE: case SNAPSHOT_AVAIL_SWAP_SIZE: - case SNAPSHOT_ALLOC_SWAP_PAGE: { - compat_loff_t __user *uoffset = compat_ptr(arg); - loff_t offset; - mm_segment_t old_fs; - int err; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = snapshot_ioctl(file, cmd, (unsigned long) &offset); - set_fs(old_fs); - if (!err && put_user(offset, uoffset)) - err = -EFAULT; - return err; - } - + case SNAPSHOT_ALLOC_SWAP_PAGE: case SNAPSHOT_CREATE_IMAGE: return snapshot_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 82d5fba48b2f..f91f2c2cf138 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,6 +3,10 @@ # and is generally not a function of system call inputs. 
KCOV_INSTRUMENT := n +ifeq ($(CONFIG_KCSAN),y) +KBUILD_CFLAGS += -g -fno-omit-frame-pointer +endif + obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 05f936ed167a..00ddc92c5774 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -198,6 +198,13 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +extern int rcu_cpu_stall_suppress_at_boot; + +static inline bool rcu_stall_is_suppressed_at_boot(void) +{ + return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended(); +} + #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; @@ -205,6 +212,11 @@ extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot() || rcu_cpu_stall_suppress; +} + #define rcu_ftrace_dump_stall_suppress() \ do { \ if (!rcu_cpu_stall_suppress) \ @@ -218,6 +230,11 @@ do { \ } while (0) #else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */ + +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot(); +} #define rcu_ftrace_dump_stall_suppress() #define rcu_ftrace_dump_stall_unsuppress() #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ @@ -325,7 +342,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. 
*/ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ (cpu) <= rnp->grphi; \ (cpu) = cpumask_next((cpu), cpu_possible_mask)) @@ -335,7 +353,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) #define rcu_find_next_bit(rnp, cpu, mask) \ ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) #define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ - for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ (cpu) <= rnp->grphi; \ (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 5f4fd3b8777c..9a0f66133b4b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -182,7 +182,7 @@ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) { return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; + &rsclp->head != READ_ONCE(rsclp->tails[RCU_DONE_TAIL]); } /* @@ -381,8 +381,6 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, return; /* Nothing to do. 
*/ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); - rclp->head = NULL; - rclp->tail = &rclp->head; } /* diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index da94b89cd531..a4a8d097d84d 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/mm.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/err.h> @@ -611,6 +612,7 @@ kfree_perf_thread(void *arg) long me = (long)arg; struct kfree_obj *alloc_ptr; u64 start_time, end_time; + long long mem_begin, mem_during = 0; VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); @@ -626,6 +628,12 @@ kfree_perf_thread(void *arg) } do { + if (!mem_during) { + mem_during = mem_begin = si_mem_available(); + } else if (loop % (kfree_loops / 4) == 0) { + mem_during = (mem_during + si_mem_available()) / 2; + } + for (i = 0; i < kfree_alloc_num; i++) { alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) @@ -645,9 +653,11 @@ kfree_perf_thread(void *arg) else b_rcu_gp_test_finished = cur_ops->get_gp_seq(); - pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n", + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, - rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started)); + rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + if (shutdown) { smp_mb(); /* Assign before wake. 
*/ wake_up(&shutdown_wq); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1aeecc165b21..5453bd557f43 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -339,7 +339,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!rcu_fwd_cb_nodelay && + if (!READ_ONCE(rcu_fwd_cb_nodelay) && !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); @@ -375,11 +375,12 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) { int i; - i = rp->rtort_pipe_count; + i = READ_ONCE(rp->rtort_pipe_count); if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + WRITE_ONCE(rp->rtort_pipe_count, i + 1); + if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; } @@ -1015,7 +1016,8 @@ rcu_torture_writer(void *arg) if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; + WRITE_ONCE(old_rp->rtort_pipe_count, + old_rp->rtort_pipe_count + 1); switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -1067,7 +1069,8 @@ rcu_torture_writer(void *arg) if (stutter_wait("rcu_torture_writer") && !READ_ONCE(rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && - !torture_must_stop()) + !torture_must_stop() && + rcu_inkernel_boot_has_ended()) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != @@ -1290,7 +1293,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) atomic_inc(&n_rcu_torture_mberror); rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); preempt_disable(); - pipe_count = p->rtort_pipe_count; + pipe_count = 
READ_ONCE(p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; @@ -1404,14 +1407,15 @@ rcu_torture_stats_print(void) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; struct task_struct *wtp; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]); + batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } } for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { @@ -1420,9 +1424,10 @@ rcu_torture_stats_print(void) } pr_alert("%s%s ", torture_type, TORTURE_FLAG); + rtcp = rcu_access_pointer(rcu_torture_current); pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current ? "ver" : "VER", + rtcp, + rtcp && !rcu_stall_is_suppressed_at_boot() ? 
"ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), @@ -1478,7 +1483,8 @@ rcu_torture_stats_print(void) if (cur_ops->stats) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && - rcu_torture_current != NULL) { + rcu_access_pointer(rcu_torture_current) && + !rcu_stall_is_suppressed()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; @@ -1993,8 +1999,11 @@ static int rcu_torture_fwd_prog(void *args) schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); register_oom_notifier(&rcutorture_oom_nb); - rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); - rcu_torture_fwd_prog_cr(rfp); + if (!IS_ENABLED(CONFIG_TINY_RCU) || + rcu_inkernel_boot_has_ended()) + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); + if (rcu_inkernel_boot_has_ended()) + rcu_torture_fwd_prog_cr(rfp); unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ @@ -2044,6 +2053,14 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) atomic_inc(&barrier_cbs_invoked); } +/* IPI handler to get callback posted on desired CPU, if online. */ +static void rcu_torture_barrier1cb(void *rcu_void) +{ + struct rcu_head *rhp = rcu_void; + + cur_ops->call(rhp, rcu_torture_barrier_cbf); +} + /* kthread function to register callbacks used to test RCU barriers. */ static int rcu_torture_barrier_cbs(void *arg) { @@ -2067,9 +2084,11 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the following ->call(). */ - local_irq_disable(); /* Just to test no-irq call_rcu(). */ - cur_ops->call(&rcu, rcu_torture_barrier_cbf); - local_irq_enable(); + if (smp_call_function_single(myid, rcu_torture_barrier1cb, + &rcu, 1)) { + // IPI failed, so use direct call from current CPU. 
+ cur_ops->call(&rcu, rcu_torture_barrier_cbf); + } if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -2105,7 +2124,21 @@ static int rcu_torture_barrier(void *arg) pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", atomic_read(&barrier_cbs_invoked), n_barrier_cbs); - WARN_ON_ONCE(1); + WARN_ON(1); + // Wait manually for the remaining callbacks + i = 0; + do { + if (WARN_ON(i++ > HZ)) + i = INT_MIN; + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + } while (atomic_read(&barrier_cbs_invoked) != + n_barrier_cbs && + !torture_must_stop()); + smp_mb(); // Can't trust ordering if broken. + if (!torture_must_stop()) + pr_err("Recovered: barrier_cbs_invoked = %d\n", + atomic_read(&barrier_cbs_invoked)); } else { n_barrier_successes++; } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 657e6a7d1c03..0c71505f0e19 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -5,7 +5,7 @@ * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney <paulmck@linux.ibm.com> + * Authors: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - @@ -450,7 +450,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. 
*/ rcu_seq_start(&ssp->srcu_gp_seq); - state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); + state = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -534,7 +534,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) - ssp->srcu_gp_seq_needed_exp = gpseq; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); spin_unlock_irq_rcu_node(ssp); mutex_unlock(&ssp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -550,7 +550,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - snp->srcu_gp_seq_needed_exp = gpseq; + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq_rcu_node(snp); @@ -614,7 +614,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp } spin_lock_irqsave_rcu_node(ssp, flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); } @@ -660,7 +660,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp == sdp->mynode) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - snp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } @@ -674,7 +674,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ } if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); /* If grace period not 
already done and none in progress, start it. */ if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && @@ -1079,7 +1079,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return ssp->srcu_idx; + return READ_ONCE(ssp->srcu_idx); } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1130,7 +1130,9 @@ static void srcu_advance_state(struct srcu_struct *ssp) return; /* readers present, retry later. */ } srcu_flip(ssp); + spin_lock_irq_rcu_node(ssp); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + spin_unlock_irq_rcu_node(ssp); } if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d91c9156fab2..06548e2ebb72 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Read-Copy Update mechanism for mutual exclusion + * Read-Copy Update mechanism for mutual exclusion (tree-based version) * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version + * Paul E. McKenney <paulmck@linux.ibm.com> * * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. @@ -150,6 +150,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 
1 : 0; @@ -342,14 +343,17 @@ bool rcu_eqs_special_set(int cpu) { int old; int new; + int new_old; struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + new_old = atomic_read(&rdp->dynticks); do { - old = atomic_read(&rdp->dynticks); + old = new_old; if (old & RCU_DYNTICK_CTRL_CTR) return false; new = old | RCU_DYNTICK_CTRL_MASK; - } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old); + new_old = atomic_cmpxchg(&rdp->dynticks, old, new); + } while (new_old != old); return true; } @@ -410,10 +414,15 @@ static long blimit = DEFAULT_RCU_BLIMIT; static long qhimark = DEFAULT_RCU_QHIMARK; #define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ static long qlowmark = DEFAULT_RCU_QLOMARK; +#define DEFAULT_RCU_QOVLD_MULT 2 +#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) +static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ +static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); +module_param(qovld, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; @@ -818,11 +827,12 @@ static __always_inline void rcu_nmi_enter_common(bool irq) incby = 1; } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { + READ_ONCE(rdp->rcu_urgent_qs) && + !READ_ONCE(rdp->rcu_forced_tick)) { raw_spin_lock_rcu_node(rdp->mynode); // Recheck under lock. 
if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; + WRITE_ONCE(rdp->rcu_forced_tick, true); tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } raw_spin_unlock_rcu_node(rdp->mynode); @@ -899,7 +909,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - rdp->rcu_forced_tick = false; + WRITE_ONCE(rdp->rcu_forced_tick, false); } } @@ -1072,7 +1082,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || - time_after(jiffies, rcu_state.jiffies_resched))) { + time_after(jiffies, rcu_state.jiffies_resched) || + rcu_state.cbovld)) { WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); @@ -1089,8 +1100,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * So hit them over the head with the resched_cpu() hammer! 
*/ if (tick_nohz_full_cpu(rdp->cpu) && - time_after(jiffies, - READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) { + (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || + rcu_state.cbovld)) { WRITE_ONCE(*ruqp, true); resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); @@ -1113,6 +1124,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { init_irq_work(&rdp->rcu_iw, rcu_iw_handler); + atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); rdp->rcu_iw_pending = true; rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); @@ -1126,8 +1138,9 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req, - rnp->level, rnp->grplo, rnp->grphi, s); + trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + gp_seq_req, rnp->level, + rnp->grplo, rnp->grphi, s); } /* @@ -1174,7 +1187,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, TPS("Prestarted")); goto unlock_out; } - rnp->gp_seq_needed = gp_seq_req; + WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req); if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* * We just marked the leaf or internal node, and a @@ -1199,18 +1212,18 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, } trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT); - rcu_state.gp_req_activity = jiffies; - if (!rcu_state.gp_kthread) { + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); + if (!READ_ONCE(rcu_state.gp_kthread)) { trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), 
TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { - rnp_start->gp_seq_needed = rnp->gp_seq_needed; - rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed); + WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); } if (rnp != rnp_start) raw_spin_unlock_rcu_node(rnp); @@ -1235,12 +1248,13 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) } /* - * Awaken the grace-period kthread. Don't do a self-awaken (unless in - * an interrupt or softirq handler), and don't bother awakening when there - * is nothing for the grace-period kthread to do (as in several CPUs raced - * to awaken, and we lost), and finally don't try to awaken a kthread that - * has not yet been created. If all those checks are passed, track some - * debug information and awaken. + * Awaken the grace-period kthread. Don't do a self-awaken (unless in an + * interrupt or softirq handler, in which case we just might immediately + * sleep upon return, resulting in a grace-period hang), and don't bother + * awakening when there is nothing for the grace-period kthread to do + * (as in several CPUs raced to awaken, we lost), and finally don't try + * to awaken a kthread that has not yet been created. If all those checks + * are passed, track some debug information and awaken. * * So why do the self-wakeup when in an interrupt or softirq handler * in the grace-period kthread's context? 
Because the kthread might have @@ -1250,10 +1264,10 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) */ static void rcu_gp_kthread_wake(void) { - if ((current == rcu_state.gp_kthread && - !in_irq() && !in_serving_softirq()) || - !READ_ONCE(rcu_state.gp_flags) || - !rcu_state.gp_kthread) + struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); + + if ((current == t && !in_irq() && !in_serving_softirq()) || + !READ_ONCE(rcu_state.gp_flags) || !t) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); @@ -1321,7 +1335,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, rcu_lockdep_assert_cblist_protected(rdp); c = rcu_seq_snap(&rcu_state.gp_seq); - if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { /* Old request still live, so mark recent callbacks. */ (void)rcu_segcblist_accelerate(&rdp->cblist, c); return; @@ -1386,7 +1400,7 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret = false; - bool need_gp; + bool need_qs; const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist); @@ -1400,10 +1414,13 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) unlikely(READ_ONCE(rdp->gpwrap))) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ + rdp->core_needs_qs = false; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); } else { if (!offloaded) ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */ + if (rdp->core_needs_qs) + rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); } /* Now handle the beginnings of any new-to-this-CPU grace periods. */ @@ -1415,14 +1432,14 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) * go looking for one. 
*/ trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart")); - need_gp = !!(rnp->qsmask & rdp->grpmask); - rdp->cpu_no_qs.b.norm = need_gp; - rdp->core_needs_qs = need_gp; + need_qs = !!(rnp->qsmask & rdp->grpmask); + rdp->cpu_no_qs.b.norm = need_qs; + rdp->core_needs_qs = need_qs; zero_cpu_stall_ticks(rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) - rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -1651,8 +1668,7 @@ static void rcu_gp_fqs_loop(void) WRITE_ONCE(rcu_state.jiffies_kick_kthreads, jiffies + (j ? 3 * j : 2)); } - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); rcu_state.gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout_exclusive( @@ -1666,13 +1682,11 @@ static void rcu_gp_fqs_loop(void) /* If time for quiescent-state forcing, do it. */ if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || (gf & RCU_GP_FLAG_FQS)) { - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); first_gp_fqs = false; - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); @@ -1683,8 +1697,7 @@ static void rcu_gp_fqs_loop(void) cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswaitsig")); ret = 1; /* Keep old FQS timing. 
*/ j = jiffies; @@ -1701,8 +1714,9 @@ static void rcu_gp_fqs_loop(void) */ static void rcu_gp_cleanup(void) { - unsigned long gp_duration; + int cpu; bool needgp = false; + unsigned long gp_duration; unsigned long new_gp_seq; bool offloaded; struct rcu_data *rdp; @@ -1748,6 +1762,12 @@ static void rcu_gp_cleanup(void) needgp = __note_gp_changes(rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ needgp = rcu_future_gp_cleanup(rnp) || needgp; + // Reset overload indication for CPUs no longer overloaded + if (rcu_is_leaf_node(rnp)) + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + check_cb_ovld_locked(rdp, rnp); + } sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); @@ -1774,9 +1794,9 @@ static void rcu_gp_cleanup(void) rcu_segcblist_is_offloaded(&rdp->cblist); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); - rcu_state.gp_req_activity = jiffies; + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + rcu_state.gp_seq, TPS("newreq")); } else { WRITE_ONCE(rcu_state.gp_flags, @@ -1795,8 +1815,7 @@ static int __noreturn rcu_gp_kthread(void *unused) /* Handle grace-period start. 
*/ for (;;) { - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwait")); rcu_state.gp_state = RCU_GP_WAIT_GPS; swait_event_idle_exclusive(rcu_state.gp_wq, @@ -1809,8 +1828,7 @@ static int __noreturn rcu_gp_kthread(void *unused) cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwaitsig")); } @@ -1881,7 +1899,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); - rnp->qsmask &= ~mask; + WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask); trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, @@ -1904,7 +1922,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); - oldmask = rnp_c->qsmask; + oldmask = READ_ONCE(rnp_c->qsmask); } /* @@ -1987,6 +2005,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; + if (rdp->cpu == smp_processor_id()) + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2052,7 +2072,7 @@ int rcutree_dying_cpu(unsigned int cpu) return 0; blkd = !!(rnp->qsmask & rdp->grpmask); - trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), blkd ? 
TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; } @@ -2294,10 +2314,13 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) struct rcu_data *rdp; struct rcu_node *rnp; + rcu_state.cbovld = rcu_state.cbovldnext; + rcu_state.cbovldnext = false; rcu_for_each_leaf_node(rnp) { cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || rcu_preempt_blocked_readers_cgp(rnp)) { @@ -2579,11 +2602,48 @@ static void rcu_leak_callback(struct rcu_head *rhp) } /* - * Helper function for call_rcu() and friends. The cpu argument will - * normally be -1, indicating "currently running CPU". It may specify - * a CPU only if that CPU is a no-CBs CPU. Currently, only rcu_barrier() - * is expected to specify a CPU. + * Check and if necessary update the leaf rcu_node structure's + * ->cbovldmask bit corresponding to the current CPU based on that CPU's + * number of queued RCU callbacks. The caller must hold the leaf rcu_node + * structure's ->lock. */ +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp) +{ + raw_lockdep_assert_held_rcu_node(rnp); + if (qovld_calc <= 0) + return; // Early boot and wildcard value set. + if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) + WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask); + else + WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask); +} + +/* + * Check and if necessary update the leaf rcu_node structure's + * ->cbovldmask bit corresponding to the current CPU based on that CPU's + * number of queued RCU callbacks. No locks need be held, but the + * caller must have disabled interrupts. + * + * Note that this function ignores the possibility that there are a lot + * of callbacks all of which have already seen the end of their respective + * grace periods. 
This omission is due to the need for no-CBs CPUs to + * be holding ->nocb_lock to do this check, which is too heavy for a + * common-case operation. + */ +static void check_cb_ovld(struct rcu_data *rdp) +{ + struct rcu_node *const rnp = rdp->mynode; + + if (qovld_calc <= 0 || + ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) == + !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask))) + return; // Early boot wildcard value or already set correctly. + raw_spin_lock_rcu_node(rnp); + check_cb_ovld_locked(rdp, rnp); + raw_spin_unlock_rcu_node(rnp); +} + +/* Helper function for call_rcu() and friends. */ static void __call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -2621,9 +2681,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) rcu_segcblist_init(&rdp->cblist); } + check_cb_ovld(rdp); if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. - /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ + // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, @@ -2689,22 +2750,47 @@ EXPORT_SYMBOL_GPL(call_rcu); #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kfree_rcu_bulk_data structure becomes exactly one page. 
+ */ +#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3) + +/** + * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers + * @nr_records: Number of active pointers in the array + * @records: Array of the kfree_rcu() pointers + * @next: Next bulk object in the block chain + * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set + */ +struct kfree_rcu_bulk_data { + unsigned long nr_records; + void *records[KFREE_BULK_MAX_ENTR]; + struct kfree_rcu_bulk_data *next; + struct rcu_head *head_free_debug; +}; + /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period + * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; + struct kfree_rcu_bulk_data *bhead_free; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period + * @bcached: Keeps at most one object for later reuse when build chain blocks * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -2718,6 +2804,8 @@ struct kfree_rcu_cpu_work { */ struct kfree_rcu_cpu { struct rcu_head *head; + struct kfree_rcu_bulk_data *bhead; + struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; spinlock_t lock; struct delayed_work monitor_work; @@ -2727,14 +2815,24 @@ struct kfree_rcu_cpu { static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); +static __always_inline void 
+debug_rcu_head_unqueue_bulk(struct rcu_head *head) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + for (; head; head = head->next) + debug_rcu_head_unqueue(head); +#endif +} + /* * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->head_free. + * It frees all the objects queued on ->bhead_free or ->head_free. */ static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; struct rcu_head *head, *next; + struct kfree_rcu_bulk_data *bhead, *bnext; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; @@ -2744,22 +2842,44 @@ static void kfree_rcu_work(struct work_struct *work) spin_lock_irqsave(&krcp->lock, flags); head = krwp->head_free; krwp->head_free = NULL; + bhead = krwp->bhead_free; + krwp->bhead_free = NULL; spin_unlock_irqrestore(&krcp->lock, flags); - // List "head" is now private, so traverse locklessly. + /* "bhead" is now private, so traverse locklessly. */ + for (; bhead; bhead = bnext) { + bnext = bhead->next; + + debug_rcu_head_unqueue_bulk(bhead->head_free_debug); + + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, + bhead->nr_records, bhead->records); + + kfree_bulk(bhead->nr_records, bhead->records); + rcu_lock_release(&rcu_callback_map); + + if (cmpxchg(&krcp->bcached, NULL, bhead)) + free_page((unsigned long) bhead); + + cond_resched_tasks_rcu_qs(); + } + + /* + * Emergency case only. It can happen under low memory + * condition when an allocation gets failed, so the "bulk" + * path can not be temporary maintained. + */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; next = head->next; - // Potentially optimize with kfree_bulk in future. debug_rcu_head_unqueue(head); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); - if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) { - /* Could be optimized with kfree_bulk() in future. 
*/ + if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) kfree((void *)head - offset); - } rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -2774,26 +2894,48 @@ static void kfree_rcu_work(struct work_struct *work) */ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { + struct kfree_rcu_cpu_work *krwp; + bool queued = false; int i; - struct kfree_rcu_cpu_work *krwp = NULL; lockdep_assert_held(&krcp->lock); - for (i = 0; i < KFREE_N_BATCHES; i++) - if (!krcp->krw_arr[i].head_free) { - krwp = &(krcp->krw_arr[i]); - break; - } - // If a previous RCU batch is in progress, we cannot immediately - // queue another one, so return false to tell caller to retry. - if (!krwp) - return false; + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); - krwp->head_free = krcp->head; - krcp->head = NULL; - INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work); - queue_rcu_work(system_wq, &krwp->rcu_work); - return true; + /* + * Try to detach bhead or head and attach it over any + * available corresponding free channel. It can be that + * a previous RCU batch is in progress, it means that + * immediately to queue another one is not possible so + * return false to tell caller to retry. + */ + if ((krcp->bhead && !krwp->bhead_free) || + (krcp->head && !krwp->head_free)) { + /* Channel 1. */ + if (!krwp->bhead_free) { + krwp->bhead_free = krcp->bhead; + krcp->bhead = NULL; + } + + /* Channel 2. */ + if (!krwp->head_free) { + krwp->head_free = krcp->head; + krcp->head = NULL; + } + + /* + * One work is per one batch, so there are two "free channels", + * "bhead_free" and "head_free" the batch can handle. It can be + * that the work is in the pending state when two channels have + * been detached following each other, one by one. 
+ */ + queue_rcu_work(system_wq, &krwp->rcu_work); + queued = true; + } + } + + return queued; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, @@ -2830,19 +2972,65 @@ static void kfree_rcu_monitor(struct work_struct *work) spin_unlock_irqrestore(&krcp->lock, flags); } +static inline bool +kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, + struct rcu_head *head, rcu_callback_t func) +{ + struct kfree_rcu_bulk_data *bnode; + + if (unlikely(!krcp->initialized)) + return false; + + lockdep_assert_held(&krcp->lock); + + /* Check if a new block is required. */ + if (!krcp->bhead || + krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { + bnode = xchg(&krcp->bcached, NULL); + if (!bnode) { + WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); + + bnode = (struct kfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + } + + /* Switch to emergency path. */ + if (unlikely(!bnode)) + return false; + + /* Initialize the new block. */ + bnode->nr_records = 0; + bnode->next = krcp->bhead; + bnode->head_free_debug = NULL; + + /* Attach it to the head. */ + krcp->bhead = bnode; + } + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + head->func = func; + head->next = krcp->bhead->head_free_debug; + krcp->bhead->head_free_debug = head; +#endif + + /* Finally insert. */ + krcp->bhead->records[krcp->bhead->nr_records++] = + (void *) head - (unsigned long) func; + + return true; +} + /* - * Queue a request for lazy invocation of kfree() after a grace period. + * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace + * period. Please note there are two paths are maintained, one is the main one + * that uses kfree_bulk() interface and second one is emergency one, that is + * used only when the main path can not be maintained temporary, due to memory + * pressure. * * Each kfree_call_rcu() request is added to a batch. The batch will be drained - * every KFREE_DRAIN_JIFFIES number of jiffies. 
All the objects in the batch - * will be kfree'd in workqueue context. This allows us to: - * - * 1. Batch requests together to reduce the number of grace periods during - * heavy kfree_rcu() load. - * - * 2. It makes it possible to use kfree_bulk() on a large number of - * kfree_rcu() requests thus reducing cache misses and the per-object - * overhead of kfree(). + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will + * be free'd in workqueue context. This allows us to: batch requests together to + * reduce the number of grace periods during heavy kfree_rcu() load. */ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -2861,9 +3049,16 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) __func__, head); goto unlock_return; } - head->func = func; - head->next = krcp->head; - krcp->head = head; + + /* + * Under high memory pressure GFP_NOWAIT can fail, + * in that case the emergency path is maintained. + */ + if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) { + head->func = func; + head->next = krcp->head; + krcp->head = head; + } // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && @@ -3075,24 +3270,32 @@ static void rcu_barrier_trace(const char *s, int cpu, unsigned long done) /* * RCU callback function for rcu_barrier(). If we are last, wake * up the task executing rcu_barrier(). + * + * Note that the value of rcu_state.barrier_sequence must be captured + * before the atomic_dec_and_test(). Otherwise, if this CPU is not last, + * other CPUs might count the value down to zero before this CPU gets + * around to invoking rcu_barrier_trace(), which might result in bogus + * data from the next instance of rcu_barrier(). 
*/ static void rcu_barrier_callback(struct rcu_head *rhp) { + unsigned long __maybe_unused s = rcu_state.barrier_sequence; + if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { - rcu_barrier_trace(TPS("LastCB"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("LastCB"), -1, s); complete(&rcu_state.barrier_completion); } else { - rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("CB"), -1, s); } } /* * Called with preemption disabled, and from cross-cpu IRQ context. */ -static void rcu_barrier_func(void *unused) +static void rcu_barrier_func(void *cpu_in) { - struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); + uintptr_t cpu = (uintptr_t)cpu_in; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; @@ -3119,7 +3322,7 @@ static void rcu_barrier_func(void *unused) */ void rcu_barrier(void) { - int cpu; + uintptr_t cpu; struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); @@ -3142,13 +3345,14 @@ void rcu_barrier(void) rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); /* - * Initialize the count to one rather than to zero in order to - * avoid a too-soon return to zero in case of a short grace period - * (or preemption of this task). Exclude CPU-hotplug operations - * to ensure that no offline CPU has callbacks queued. + * Initialize the count to two rather than to zero in order + * to avoid a too-soon return to zero in case of an immediate + * invocation of the just-enqueued callback (or preemption of + * this task). Exclude CPU-hotplug operations to ensure that no + * offline non-offloaded CPU has callbacks queued. 
*/ init_completion(&rcu_state.barrier_completion); - atomic_set(&rcu_state.barrier_cpu_count, 1); + atomic_set(&rcu_state.barrier_cpu_count, 2); get_online_cpus(); /* @@ -3158,13 +3362,23 @@ void rcu_barrier(void) */ for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - if (!cpu_online(cpu) && + if (cpu_is_offline(cpu) && !rcu_segcblist_is_offloaded(&rdp->cblist)) continue; - if (rcu_segcblist_n_cbs(&rdp->cblist)) { + if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) { rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); - smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); + smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1); + } else if (rcu_segcblist_n_cbs(&rdp->cblist) && + cpu_is_offline(cpu)) { + rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, + rcu_state.barrier_sequence); + local_irq_disable(); + rcu_barrier_func((void *)cpu); + local_irq_enable(); + } else if (cpu_is_offline(cpu)) { + rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu, + rcu_state.barrier_sequence); } else { rcu_barrier_trace(TPS("OnlineNQ"), cpu, rcu_state.barrier_sequence); @@ -3176,7 +3390,7 @@ void rcu_barrier(void) * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) + if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count)) complete(&rcu_state.barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ @@ -3275,12 +3489,12 @@ int rcutree_prepare_cpu(unsigned int cpu) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rdp->beenonline = true; /* We have now been online. 
*/ - rdp->gp_seq = rnp->gp_seq; - rdp->gp_seq_needed = rnp->gp_seq; + rdp->gp_seq = READ_ONCE(rnp->gp_seq); + rdp->gp_seq_needed = rdp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; - rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; + rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); @@ -3378,7 +3592,7 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->qsmaskinitnext |= mask; + WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); oldmask = rnp->expmaskinitnext; rnp->expmaskinitnext |= mask; oldmask ^= rnp->expmaskinitnext; @@ -3431,7 +3645,7 @@ void rcu_report_dead(unsigned int cpu) rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } - rnp->qsmaskinitnext &= ~mask; + WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); @@ -3545,7 +3759,10 @@ static int __init rcu_spawn_gp_kthread(void) } rnp = rcu_get_root(); raw_spin_lock_irqsave_rcu_node(rnp, flags); - rcu_state.gp_kthread = t; + WRITE_ONCE(rcu_state.gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); + // Reset .gp_activity and .gp_req_activity before setting .gp_kthread. 
+ smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); rcu_spawn_nocb_kthreads(); @@ -3769,8 +3986,11 @@ static void __init kfree_rcu_batch_init(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_init(&krcp->lock); - for (i = 0; i < KFREE_N_BATCHES; i++) + for (i = 0; i < KFREE_N_BATCHES; i++) { + INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } @@ -3809,6 +4029,13 @@ void __init rcu_init(void) rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_par_gp_wq); srcu_init(); + + /* Fill in default value for rcutree.qovld boot parameter. */ + /* -After- the rcu_node ->lock fields are initialized! */ + if (qovld < 0) + qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; + else + qovld_calc = qovld; } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0c87e4c161c2..9dc2ec021da5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -68,6 +68,8 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. */ + unsigned long cbovldmask; + /* CPUs experiencing callback overload. */ unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ @@ -321,6 +323,8 @@ struct rcu_state { atomic_t expedited_need_qs; /* # CPUs left to check in. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ + u8 cbovld; /* Callback overload now? */ + u8 cbovldnext; /* ^ ^ next time? */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). 
*/ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dcbd75791f39..1a617b9dffb0 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -314,7 +314,7 @@ static bool exp_funnel_lock(unsigned long s) sync_exp_work_done(s)); return true; } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ + WRITE_ONCE(rnp->exp_seq_rq, s); /* Followers can wait on us. */ spin_unlock(&rnp->exp_lock); trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level, rnp->grplo, rnp->grphi, TPS("nxtlvl")); @@ -485,6 +485,7 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit) static void synchronize_rcu_expedited_wait(void) { int cpu; + unsigned long j; unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; @@ -496,7 +497,7 @@ static void synchronize_rcu_expedited_wait(void) trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; - if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) { if (synchronize_rcu_expedited_wait_once(1)) return; rcu_for_each_leaf_node(rnp) { @@ -508,12 +509,16 @@ static void synchronize_rcu_expedited_wait(void) tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); } } + j = READ_ONCE(jiffies_till_first_fqs); + if (synchronize_rcu_expedited_wait_once(j + HZ)) + return; + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)); } for (;;) { if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) continue; panic_on_rcu_stall(); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", @@ -589,7 +594,7 @@ static void rcu_exp_wait_wake(unsigned long s) spin_lock(&rnp->exp_lock); /* Recheck, avoid hang in case someone just arrived. */ if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) - rnp->exp_seq_rq = s; + WRITE_ONCE(rnp->exp_seq_rq, s); spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. 
*/ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c6ea81cd4189..097635c41135 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -56,6 +56,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); if (qlowmark != DEFAULT_RCU_QLOMARK) pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); + if (qovld != DEFAULT_RCU_QOVLD) + pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld); if (jiffies_till_first_fqs != ULONG_MAX) pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); if (jiffies_till_next_fqs != ULONG_MAX) @@ -753,7 +755,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) raw_lockdep_assert_held_rcu_node(rnp); pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, - (long)rnp->gp_seq, (long)rnp->completedqs); + (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs); for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); @@ -1032,18 +1034,18 @@ static int rcu_boost_kthread(void *arg) trace_rcu_utilization(TPS("Start boost kthread@init")); for (;;) { - rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); rcu_wait(rnp->boost_tasks || rnp->exp_tasks); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); - rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); if (more2boost) spincnt++; else spincnt = 0; if (spincnt > 10) { - rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; + WRITE_ONCE(rnp->boost_kthread_status, 
RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); schedule_timeout_interruptible(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); @@ -1077,12 +1079,12 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - ULONG_CMP_GE(jiffies, rnp->boost_time))) { + (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, - rnp->boost_kthread_status); + READ_ONCE(rnp->boost_kthread_status)); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -1486,6 +1488,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0); * flag the contention. */ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) + __acquires(&rdp->nocb_bypass_lock) { lockdep_assert_irqs_disabled(); if (raw_spin_trylock(&rdp->nocb_bypass_lock)) @@ -1529,6 +1532,7 @@ static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) * Release the specified rcu_data structure's ->nocb_bypass_lock. */ static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) + __releases(&rdp->nocb_bypass_lock) { lockdep_assert_irqs_disabled(); raw_spin_unlock(&rdp->nocb_bypass_lock); @@ -1577,8 +1581,7 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) { lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist) && - cpu_online(rdp->cpu)) + if (rcu_segcblist_is_offloaded(&rdp->cblist)) lockdep_assert_held(&rdp->nocb_lock); } @@ -1930,6 +1933,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) struct rcu_data *rdp; struct rcu_node *rnp; unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. 
+ bool wasempty = false; /* * Each pass through the following loop checks for CBs and for the @@ -1969,10 +1973,13 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ needwake_gp = rcu_advance_cbs(rnp, rdp); + wasempty = rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL); raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ } // Need to wait on some grace period? - WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist, + WARN_ON_ONCE(wasempty && + !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)); if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { if (!needwait_gp || diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 55f9b84790d3..119ed6afd20f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -102,7 +102,7 @@ static void record_gp_stall_check_time(void) unsigned long j = jiffies; unsigned long j1; - rcu_state.gp_start = j; + WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); /* Record ->gp_start before ->jiffies_stall. */ smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ @@ -383,7 +383,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) return; /* @@ -452,7 +452,7 @@ static void print_cpu_stall(void) /* Kick and suppress, if so configured. 
*/ rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) return; /* @@ -504,7 +504,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) || !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); @@ -578,6 +578,7 @@ void show_rcu_gp_kthreads(void) unsigned long jw; struct rcu_data *rdp; struct rcu_node *rnp; + struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; ja = j - READ_ONCE(rcu_state.gp_activity); @@ -585,28 +586,28 @@ void show_rcu_gp_kthreads(void) jw = j - READ_ONCE(rcu_state.gp_wake_time); pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, - rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, + rcu_state.gp_state, t ? 
t->state : 0x1ffffL, ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), (long)READ_ONCE(rcu_state.gp_seq), (long)READ_ONCE(rcu_get_root()->gp_seq_needed), READ_ONCE(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { - if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) + if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), + READ_ONCE(rnp->gp_seq_needed))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)rnp->gp_seq, - (long)rnp->gp_seq_needed); + rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq), + (long)READ_ONCE(rnp->gp_seq_needed)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->gpwrap || - ULONG_CMP_GE(rcu_state.gp_seq, - rdp->gp_seq_needed)) + if (READ_ONCE(rdp->gpwrap) || + ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), + READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)rdp->gp_seq_needed); + cpu, (long)READ_ONCE(rdp->gp_seq_needed)); } } for_each_possible_cpu(cpu) { @@ -631,7 +632,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, static atomic_t warned = ATOMIC_INIT(0); if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || + !smp_load_acquire(&rcu_state.gp_kthread)) // Get stable kthread. return; j = jiffies; /* Expensive access, and in common case don't get here. 
*/ if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || @@ -642,7 +645,8 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_lock_irqsave_rcu_node(rnp, flags); j = jiffies; if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_read(&warned)) { @@ -655,9 +659,10 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ j = jiffies; if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rcu_state.gp_req_activity + gpssdelay) || - time_before(j, rcu_state.gp_activity + gpssdelay) || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_xchg(&warned, 1)) { if (rnp_root != rnp) /* irqs remain disabled. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 6c4b862f57d6..28a8bdc5072f 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -183,6 +183,8 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); +static bool rcu_boot_ended __read_mostly; + /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -191,7 +193,17 @@ void rcu_end_inkernel_boot(void) rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); + rcu_boot_ended = 1; +} + +/* + * Let rcutorture know when it is OK to turn it up to eleven. 
+ */ +bool rcu_inkernel_boot_has_ended(void) +{ + return rcu_boot_ended; } +EXPORT_SYMBOL_GPL(rcu_inkernel_boot_has_ended); #endif /* #ifndef CONFIG_TINY_RCU */ @@ -227,18 +239,30 @@ core_initcall(rcu_set_runtime_mode); #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +struct lockdep_map rcu_lock_map = { + .name = "rcu_read_lock", + .key = &rcu_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* XXX PREEMPT_RCU ? */ +}; EXPORT_SYMBOL_GPL(rcu_lock_map); static struct lock_class_key rcu_bh_lock_key; -struct lockdep_map rcu_bh_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); +struct lockdep_map rcu_bh_lock_map = { + .name = "rcu_read_lock_bh", + .key = &rcu_bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_LOCK also makes BH preemptible */ +}; EXPORT_SYMBOL_GPL(rcu_bh_lock_map); static struct lock_class_key rcu_sched_lock_key; -struct lockdep_map rcu_sched_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); +struct lockdep_map rcu_sched_lock_map = { + .name = "rcu_read_lock_sched", + .key = &rcu_sched_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_SPIN, +}; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); static struct lock_class_key rcu_callback_key; @@ -464,13 +488,19 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_ftrace_dump __read_mostly; module_param(rcu_cpu_stall_ftrace_dump, int, 0644); -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings. 
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ +// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall +// warnings. Also used by rcutorture even if stall warnings are excluded. +int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. +EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); +module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); + #ifdef CONFIG_TASKS_RCU /* @@ -528,7 +558,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) rhp->func = func; raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); needwake = !rcu_tasks_cbs_head; - *rcu_tasks_cbs_tail = rhp; + WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); /* We can't create the thread unless interrupts are enabled. */ @@ -658,7 +688,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* If there were none, wait a bit and start over. */ if (!list) { wait_event_interruptible(rcu_tasks_cbs_wq, - rcu_tasks_cbs_head); + READ_ONCE(rcu_tasks_cbs_head)); if (!rcu_tasks_cbs_head) { WARN_ON(signal_pending(current)); schedule_timeout_interruptible(HZ/10); @@ -801,7 +831,7 @@ static int __init rcu_spawn_tasks_kthread(void) core_initcall(rcu_spawn_tasks_kthread); /* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) { preempt_disable(); current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); @@ -809,7 +839,7 @@ void exit_tasks_rcu_start(void) } /* Do the srcu_read_unlock() for the above synchronize_srcu(). 
*/ -void exit_tasks_rcu_finish(void) +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) { preempt_disable(); __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a1ad5b7d5521..a778554f9dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -29,12 +29,12 @@ void complete(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (x->done != UINT_MAX) x->done++; - __wake_up_locked(&x->wait, TASK_NORMAL, 1); - spin_unlock_irqrestore(&x->wait.lock, flags); + swake_up_locked(&x->wait); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -58,10 +58,12 @@ void complete_all(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + lockdep_assert_RT_in_threaded_ctx(); + + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done = UINT_MAX; - __wake_up_locked(&x->wait, TASK_NORMAL, 0); - spin_unlock_irqrestore(&x->wait.lock, flags); + swake_up_all_locked(&x->wait); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -70,20 +72,20 @@ do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { - DECLARE_WAITQUEUE(wait, current); + DECLARE_SWAITQUEUE(wait); - __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } + __prepare_to_swait(&x->wait, &wait); __set_current_state(state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); timeout = action(timeout); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); + __finish_swait(&x->wait, &wait); if (!x->done) return timeout; } @@ -100,9 +102,9 @@ __wait_for_common(struct completion *x, complete_acquire(x); - 
spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); complete_release(x); @@ -291,12 +293,12 @@ bool try_wait_for_completion(struct completion *x) if (!READ_ONCE(x->done)) return false; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = false; else if (x->done != UINT_MAX) x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -322,8 +324,8 @@ bool completion_done(struct completion *x) * otherwise we can end up freeing the completion before complete() * is done referencing it. */ - spin_lock_irqsave(&x->wait.lock, flags); - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a9983da4408..a2694ba82874 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -269,7 +269,6 @@ static void __hrtick_start(void *arg) rq_lock(rq, &rf); __hrtick_restart(rq); - rq->hrtick_csd_pending = 0; rq_unlock(rq, &rf); } @@ -293,12 +292,10 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_set_expires(timer, time); - if (rq == this_rq()) { + if (rq == this_rq()) __hrtick_restart(rq); - } else if (!rq->hrtick_csd_pending) { + else smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); - rq->hrtick_csd_pending = 1; - } } #else @@ -322,8 +319,6 @@ void hrtick_start(struct rq *rq, u64 delay) static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq->hrtick_csd_pending = 0; - rq->hrtick_csd.flags = 0; rq->hrtick_csd.func = __hrtick_start; rq->hrtick_csd.info = rq; @@ -761,7 +756,6 @@ static void set_load_weight(struct task_struct *p, bool update_load) if 
(task_has_idle_policy(p)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; - p->se.runnable_weight = load->weight; return; } @@ -774,7 +768,6 @@ static void set_load_weight(struct task_struct *p, bool update_load) } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; - p->se.runnable_weight = load->weight; } } @@ -1652,7 +1645,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + /* + * Picking a ~random cpu helps in cases where we are changing affinity + * for groups of tasks (ie. cpuset), so that load balancing is not + * immediately required to distribute the tasks within their new mask. + */ + dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; @@ -3578,6 +3576,17 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +DEFINE_PER_CPU(unsigned long, thermal_pressure); + +void arch_set_thermal_pressure(struct cpumask *cpus, + unsigned long th_pressure) +{ + int cpu; + + for_each_cpu(cpu, cpus) + WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); +} + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
@@ -3588,12 +3597,16 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; + unsigned long thermal_pressure; + arch_scale_freq_tick(); sched_clock_tick(); rq_lock(rq, &rf); update_rq_clock(rq); + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); calc_global_load_tick(rq); psi_task_tick(rq); @@ -3671,7 +3684,6 @@ static void sched_tick_remote(struct work_struct *work) if (cpu_is_offline(cpu)) goto out_unlock; - curr = rq->curr; update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -4074,6 +4086,8 @@ static void __sched notrace __schedule(bool preempt) */ ++*switch_count; + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + trace_sched_switch(preempt, prev, next); /* Also unlocks the rq: */ diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 1a2719e1350a..0033731a0797 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -41,8 +41,67 @@ static int convert_prio(int prio) return cpupri; } +static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, int idx) +{ + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + int skip = 0; + + if (!atomic_read(&(vec)->count)) + skip = 1; + /* + * When looking at the vector, we need to read the counter, + * do a memory barrier, then read the mask. + * + * Note: This is still all racey, but we can deal with it. + * Ideally, we only want to look at masks that are set. + * + * If a mask is not set, then the only thing wrong is that we + * did a little more work than necessary. + * + * If we read a zero count but the mask is set, because of the + * memory barriers, that can only happen when the highest prio + * task for a run queue has left the run queue, in which case, + * it will be followed by a pull. 
If the task we are processing + * fails to find a proper place to go, that pull request will + * pull this task if the run queue is running at a lower + * priority. + */ + smp_rmb(); + + /* Need to do the rmb for every iteration */ + if (skip) + return 0; + + if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) + return 0; + + if (lowest_mask) { + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); + + /* + * We have to ensure that we have at least one bit + * still set in the array, since the map could have + * been concurrently emptied between the first and + * second reads of vec->mask. If we hit this + * condition, simply act as though we never hit this + * priority level and continue on. + */ + if (cpumask_empty(lowest_mask)) + return 0; + } + + return 1; +} + +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask) +{ + return cpupri_find_fitness(cp, p, lowest_mask, NULL); +} + /** - * cpupri_find - find the best (lowest-pri) CPU in the system + * cpupri_find_fitness - find the best (lowest-pri) CPU in the system * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) @@ -58,84 +117,59 @@ static int convert_prio(int prio) * * Return: (int)bool - CPUs were found */ -int cpupri_find(struct cpupri *cp, struct task_struct *p, +int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask, bool (*fitness_fn)(struct task_struct *p, int cpu)) { - int idx = 0; int task_pri = convert_prio(p->prio); + int idx, cpu; BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { - struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; - int skip = 0; - - if (!atomic_read(&(vec)->count)) - skip = 1; - /* - * When looking at the vector, we need to read the counter, - * do a memory barrier, then read the mask. - * - * Note: This is still all racey, but we can deal with it. - * Ideally, we only want to look at masks that are set. 
- * - * If a mask is not set, then the only thing wrong is that we - * did a little more work than necessary. - * - * If we read a zero count but the mask is set, because of the - * memory barriers, that can only happen when the highest prio - * task for a run queue has left the run queue, in which case, - * it will be followed by a pull. If the task we are processing - * fails to find a proper place to go, that pull request will - * pull this task if the run queue is running at a lower - * priority. - */ - smp_rmb(); - /* Need to do the rmb for every iteration */ - if (skip) + if (!__cpupri_find(cp, p, lowest_mask, idx)) continue; - if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) - continue; + if (!lowest_mask || !fitness_fn) + return 1; - if (lowest_mask) { - int cpu; - - cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); - - /* - * We have to ensure that we have at least one bit - * still set in the array, since the map could have - * been concurrently emptied between the first and - * second reads of vec->mask. If we hit this - * condition, simply act as though we never hit this - * priority level and continue on. - */ - if (cpumask_empty(lowest_mask)) - continue; - - if (!fitness_fn) - return 1; - - /* Ensure the capacity of the CPUs fit the task */ - for_each_cpu(cpu, lowest_mask) { - if (!fitness_fn(p, cpu)) - cpumask_clear_cpu(cpu, lowest_mask); - } - - /* - * If no CPU at the current priority can fit the task - * continue looking - */ - if (cpumask_empty(lowest_mask)) - continue; + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); } + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) + continue; + return 1; } + /* + * If we failed to find a fitting lowest_mask, kick off a new search + * but without taking into account any fitness criteria this time. 
+ * + * This rule favours honouring priority over fitting the task in the + * correct CPU (Capacity Awareness being the only user now). + * The idea is that if a higher priority task can run, then it should + * run even if this ends up being on unfitting CPU. + * + * The cost of this trade-off is not entirely clear and will probably + * be good for some workloads and bad for others. + * + * The main idea here is that if some CPUs were overcommitted, we try + * to spread which is what the scheduler traditionally did. Sys admins + * must do proper RT planning to avoid overloading the system if they + * really care. + */ + if (fitness_fn) + return cpupri_find(cp, p, lowest_mask); + return 0; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 32dd520db11f..efbb492bb94c 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -19,8 +19,10 @@ struct cpupri { #ifdef CONFIG_SMP int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask, - bool (*fitness_fn)(struct task_struct *p, int cpu)); + struct cpumask *lowest_mask); +int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cff3e656566d..dac9104d126f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -909,8 +909,10 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) } while (read_seqcount_retry(&vtime->seqcount, seq)); } -static int vtime_state_check(struct vtime *vtime, int cpu) +static int vtime_state_fetch(struct vtime *vtime, int cpu) { + int state = READ_ONCE(vtime->state); + /* * We raced against a context switch, fetch the * kcpustat task again. 
@@ -927,10 +929,10 @@ static int vtime_state_check(struct vtime *vtime, int cpu) * * Case 1) is ok but 2) is not. So wait for a safe VTIME state. */ - if (vtime->state == VTIME_INACTIVE) + if (state == VTIME_INACTIVE) return -EAGAIN; - return 0; + return state; } static u64 kcpustat_user_vtime(struct vtime *vtime) @@ -949,14 +951,15 @@ static int kcpustat_field_vtime(u64 *cpustat, { struct vtime *vtime = &tsk->vtime; unsigned int seq; - int err; do { + int state; + seq = read_seqcount_begin(&vtime->seqcount); - err = vtime_state_check(vtime, cpu); - if (err < 0) - return err; + state = vtime_state_fetch(vtime, cpu); + if (state < 0) + return state; *val = cpustat[usage]; @@ -969,7 +972,7 @@ static int kcpustat_field_vtime(u64 *cpustat, */ switch (usage) { case CPUTIME_SYSTEM: - if (vtime->state == VTIME_SYS) + if (state == VTIME_SYS) *val += vtime->stime + vtime_delta(vtime); break; case CPUTIME_USER: @@ -981,11 +984,11 @@ static int kcpustat_field_vtime(u64 *cpustat, *val += kcpustat_user_vtime(vtime); break; case CPUTIME_GUEST: - if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0) + if (state == VTIME_GUEST && task_nice(tsk) <= 0) *val += vtime->gtime + vtime_delta(vtime); break; case CPUTIME_GUEST_NICE: - if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0) + if (state == VTIME_GUEST && task_nice(tsk) > 0) *val += vtime->gtime + vtime_delta(vtime); break; default: @@ -1036,23 +1039,23 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, { struct vtime *vtime = &tsk->vtime; unsigned int seq; - int err; do { u64 *cpustat; u64 delta; + int state; seq = read_seqcount_begin(&vtime->seqcount); - err = vtime_state_check(vtime, cpu); - if (err < 0) - return err; + state = vtime_state_fetch(vtime, cpu); + if (state < 0) + return state; *dst = *src; cpustat = dst->cpustat; /* Task is sleeping, dead or idle, nothing to add */ - if (vtime->state < VTIME_SYS) + if (state < VTIME_SYS) continue; delta = vtime_delta(vtime); @@ -1061,15 +1064,15 @@ static int 
kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, * Task runs either in user (including guest) or kernel space, * add pending nohz time to the right place. */ - if (vtime->state == VTIME_SYS) { + if (state == VTIME_SYS) { cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; - } else if (vtime->state == VTIME_USER) { + } else if (state == VTIME_USER) { if (task_nice(tsk) > 0) cpustat[CPUTIME_NICE] += vtime->utime + delta; else cpustat[CPUTIME_USER] += vtime->utime + delta; } else { - WARN_ON_ONCE(vtime->state != VTIME_GUEST); + WARN_ON_ONCE(state != VTIME_GUEST); if (task_nice(tsk) > 0) { cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; cpustat[CPUTIME_NICE] += vtime->gtime + delta; @@ -1080,7 +1083,7 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, } } while (read_seqcount_retry(&vtime->seqcount, seq)); - return err; + return 0; } void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 43323f875cb9..504d2f51b0d6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -153,7 +153,7 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; @@ -334,6 +334,8 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) return dl_rq->root.rb_leftmost == &dl_se->rb_node; } +static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); + void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) { raw_spin_lock_init(&dl_b->dl_runtime_lock); @@ -2496,7 +2498,7 @@ int sched_dl_global_validate(void) return ret; } -void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) { if (global_rt_runtime() == RUNTIME_INF) { dl_rq->bw_ratio = 1 << RATIO_SHIFT; diff --git a/kernel/sched/debug.c 
b/kernel/sched/debug.c index 879d3ccf3806..8331bc04aea2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -402,11 +402,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } P(se->load.weight); - P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); - P(se->avg.runnable_load_avg); + P(se->avg.runnable_avg); #endif #undef PN_SCHEDSTAT @@ -524,11 +523,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); - SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->avg.runnable_load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", + cfs_rq->avg.runnable_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", @@ -537,8 +535,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", cfs_rq->removed.util_avg); - SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", - cfs_rq->removed.runnable_sum); + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg", + cfs_rq->removed.runnable_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); @@ -947,13 +945,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); - P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); - P(se.avg.runnable_load_sum); + P(se.avg.runnable_sum); P(se.avg.util_sum); P(se.avg.load_avg); - P(se.avg.runnable_load_avg); + P(se.avg.runnable_avg); P(se.avg.util_avg); 
P(se.avg.last_update_time); P(se.avg.util_est.ewma); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c8a379c357e..d7fb20adabeb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -86,6 +86,19 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +int sched_thermal_decay_shift; +static int __init setup_sched_thermal_decay_shift(char *str) +{ + int _shift = 0; + + if (kstrtoint(str, 0, &_shift)) + pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); + + sched_thermal_decay_shift = clamp(_shift, 0, 10); + return 1; +} +__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); + #ifdef CONFIG_SMP /* * For asym packing, by default the lower numbered CPU has higher priority. @@ -741,9 +754,7 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); - - se->runnable_weight = se->load.weight; + sa->load_avg = scale_load_down(se->load.weight); /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -796,6 +807,8 @@ void post_init_entity_util_avg(struct task_struct *p) } } + sa->runnable_avg = cpu_scale; + if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: @@ -1473,36 +1486,51 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); - -static unsigned long cpu_runnable_load(struct rq *rq) -{ - return cfs_rq_runnable_load_avg(&rq->cfs); -} +/* + * 'numa_type' describes the node at the moment of load balancing. + */ +enum numa_type { + /* The node has spare capacity that can be used to run more tasks. 
*/ + node_has_spare = 0, + /* + * The node is fully used and the tasks don't compete for more CPU + * cycles. Nevertheless, some tasks might wait before running. + */ + node_fully_busy, + /* + * The node is overloaded and can't provide expected CPU cycles to all + * tasks. + */ + node_overloaded +}; /* Cached statistics for all CPUs within a node */ struct numa_stats { unsigned long load; - + unsigned long util; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; + unsigned int nr_running; + unsigned int weight; + enum numa_type node_type; + int idle_cpu; }; -/* - * XXX borrowed from update_sg_lb_stats - */ -static void update_numa_stats(struct numa_stats *ns, int nid) +static inline bool is_core_idle(int cpu) { - int cpu; +#ifdef CONFIG_SCHED_SMT + int sibling; - memset(ns, 0, sizeof(*ns)); - for_each_cpu(cpu, cpumask_of_node(nid)) { - struct rq *rq = cpu_rq(cpu); + for_each_cpu(sibling, cpu_smt_mask(cpu)) { + if (cpu == sibling) + continue; - ns->load += cpu_runnable_load(rq); - ns->compute_capacity += capacity_of(cpu); + if (!idle_cpu(cpu)) + return false; } +#endif + return true; } struct task_numa_env { @@ -1521,20 +1549,128 @@ struct task_numa_env { int best_cpu; }; +static unsigned long cpu_load(struct rq *rq); +static unsigned long cpu_util(int cpu); +static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); + +static inline enum +numa_type numa_classify(unsigned int imbalance_pct, + struct numa_stats *ns) +{ + if ((ns->nr_running > ns->weight) && + ((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) + return node_overloaded; + + if ((ns->nr_running < ns->weight) || + ((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) + return node_has_spare; + + return node_fully_busy; +} + +#ifdef CONFIG_SCHED_SMT +/* Forward declarations of select_idle_sibling helpers */ +static inline bool test_idle_cores(int cpu, bool def); +static inline int numa_idle_core(int idle_core, int cpu) +{ + if 
(!static_branch_likely(&sched_smt_present) || + idle_core >= 0 || !test_idle_cores(cpu, false)) + return idle_core; + + /* + * Prefer cores instead of packing HT siblings + * and triggering future load balancing. + */ + if (is_core_idle(cpu)) + idle_core = cpu; + + return idle_core; +} +#else +static inline int numa_idle_core(int idle_core, int cpu) +{ + return idle_core; +} +#endif + +/* + * Gather all necessary information to make NUMA balancing placement + * decisions that are compatible with standard load balancer. This + * borrows code and logic from update_sg_lb_stats but sharing a + * common implementation is impractical. + */ +static void update_numa_stats(struct task_numa_env *env, + struct numa_stats *ns, int nid, + bool find_idle) +{ + int cpu, idle_core = -1; + + memset(ns, 0, sizeof(*ns)); + ns->idle_cpu = -1; + + rcu_read_lock(); + for_each_cpu(cpu, cpumask_of_node(nid)) { + struct rq *rq = cpu_rq(cpu); + + ns->load += cpu_load(rq); + ns->util += cpu_util(cpu); + ns->nr_running += rq->cfs.h_nr_running; + ns->compute_capacity += capacity_of(cpu); + + if (find_idle && !rq->nr_running && idle_cpu(cpu)) { + if (READ_ONCE(rq->numa_migrate_on) || + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) + continue; + + if (ns->idle_cpu == -1) + ns->idle_cpu = cpu; + + idle_core = numa_idle_core(idle_core, cpu); + } + } + rcu_read_unlock(); + + ns->weight = cpumask_weight(cpumask_of_node(nid)); + + ns->node_type = numa_classify(env->imbalance_pct, ns); + + if (idle_core >= 0) + ns->idle_cpu = idle_core; +} + static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { struct rq *rq = cpu_rq(env->dst_cpu); - /* Bail out if run-queue part of active NUMA balance. */ - if (xchg(&rq->numa_migrate_on, 1)) + /* Check if run-queue part of active NUMA balance. */ + if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { + int cpu; + int start = env->dst_cpu; + + /* Find alternative idle CPU. 
*/ + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { + if (cpu == env->best_cpu || !idle_cpu(cpu) || + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { + continue; + } + + env->dst_cpu = cpu; + rq = cpu_rq(env->dst_cpu); + if (!xchg(&rq->numa_migrate_on, 1)) + goto assign; + } + + /* Failed to find an alternative idle CPU */ return; + } +assign: /* * Clear previous best_cpu/rq numa-migrate flag, since task now * found a better CPU to move/swap. */ - if (env->best_cpu != -1) { + if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) { rq = cpu_rq(env->best_cpu); WRITE_ONCE(rq->numa_migrate_on, 0); } @@ -1590,7 +1726,7 @@ static bool load_too_imbalanced(long src_load, long dst_load, * into account that it might be best if task running on the dst_cpu should * be exchanged with the source task */ -static void task_numa_compare(struct task_numa_env *env, +static bool task_numa_compare(struct task_numa_env *env, long taskimp, long groupimp, bool maymove) { struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); @@ -1601,9 +1737,10 @@ static void task_numa_compare(struct task_numa_env *env, int dist = env->dist; long moveimp = imp; long load; + bool stopsearch = false; if (READ_ONCE(dst_rq->numa_migrate_on)) - return; + return false; rcu_read_lock(); cur = rcu_dereference(dst_rq->curr); @@ -1614,8 +1751,10 @@ static void task_numa_compare(struct task_numa_env *env, * Because we have preemption enabled we can get migrated around and * end try selecting ourselves (current == env->p) as a swap candidate. */ - if (cur == env->p) + if (cur == env->p) { + stopsearch = true; goto unlock; + } if (!cur) { if (maymove && moveimp >= env->best_imp) @@ -1624,18 +1763,27 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; } + /* Skip this swap candidate if cannot move to the source cpu. 
*/ + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) + goto unlock; + + /* + * Skip this swap candidate if it is not moving to its preferred + * node and the best task is. + */ + if (env->best_task && + env->best_task->numa_preferred_nid == env->src_nid && + cur->numa_preferred_nid != env->src_nid) { + goto unlock; + } + /* * "imp" is the fault differential for the source task between the * source and destination node. Calculate the total differential for * the source task and potential destination task. The more negative * the value is, the more remote accesses that would be expected to * be incurred if the tasks were swapped. - */ - /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) - goto unlock; - - /* + * * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ @@ -1662,6 +1810,19 @@ static void task_numa_compare(struct task_numa_env *env, task_weight(cur, env->dst_nid, dist); } + /* Discourage picking a task already on its preferred node */ + if (cur->numa_preferred_nid == env->dst_nid) + imp -= imp / 16; + + /* + * Encourage picking a task that moves to its preferred node. + * This potentially makes imp larger than it's maximum of + * 1998 (see SMALLIMP and task_weight for why) but in this + * case, it does not matter. + */ + if (cur->numa_preferred_nid == env->src_nid) + imp += imp / 8; + if (maymove && moveimp > imp && moveimp > env->best_imp) { imp = moveimp; cur = NULL; @@ -1669,6 +1830,15 @@ static void task_numa_compare(struct task_numa_env *env, } /* + * Prefer swapping with a task moving to its preferred node over a + * task that is not. 
+ */ + if (env->best_task && cur->numa_preferred_nid == env->src_nid && + env->best_task->numa_preferred_nid != env->src_nid) { + goto assign; + } + + /* * If the NUMA importance is less than SMALLIMP, * task migration might only result in ping pong * of tasks and also hurt performance due to cache @@ -1691,42 +1861,95 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; assign: - /* - * One idle CPU per node is evaluated for a task numa move. - * Call select_idle_sibling to maybe find a better one. - */ + /* Evaluate an idle CPU for a task numa move. */ if (!cur) { + int cpu = env->dst_stats.idle_cpu; + + /* Nothing cached so current CPU went idle since the search. */ + if (cpu < 0) + cpu = env->dst_cpu; + /* - * select_idle_siblings() uses an per-CPU cpumask that - * can be used from IRQ context. + * If the CPU is no longer truly idle and the previous best CPU + * is, keep using it. */ - local_irq_disable(); - env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, - env->dst_cpu); - local_irq_enable(); + if (!idle_cpu(cpu) && env->best_cpu >= 0 && + idle_cpu(env->best_cpu)) { + cpu = env->best_cpu; + } + + env->dst_cpu = cpu; } task_numa_assign(env, cur, imp); + + /* + * If a move to idle is allowed because there is capacity or load + * balance improves then stop the search. While a better swap + * candidate may exist, a search is not free. + */ + if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu)) + stopsearch = true; + + /* + * If a swap candidate must be identified and the current best task + * moves its preferred node then stop the search. 
+ */ + if (!maymove && env->best_task && + env->best_task->numa_preferred_nid == env->src_nid) { + stopsearch = true; + } unlock: rcu_read_unlock(); + + return stopsearch; } static void task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp) { - long src_load, dst_load, load; bool maymove = false; int cpu; - load = task_h_load(env->p); - dst_load = env->dst_stats.load + load; - src_load = env->src_stats.load - load; - /* - * If the improvement from just moving env->p direction is better - * than swapping tasks around, check if a move is possible. + * If dst node has spare capacity, then check if there is an + * imbalance that would be overruled by the load balancer. */ - maymove = !load_too_imbalanced(src_load, dst_load, env); + if (env->dst_stats.node_type == node_has_spare) { + unsigned int imbalance; + int src_running, dst_running; + + /* + * Would movement cause an imbalance? Note that if src has + * more running tasks that the imbalance is ignored as the + * move improves the imbalance from the perspective of the + * CPU load balancer. + * */ + src_running = env->src_stats.nr_running - 1; + dst_running = env->dst_stats.nr_running + 1; + imbalance = max(0, dst_running - src_running); + imbalance = adjust_numa_imbalance(imbalance, src_running); + + /* Use idle CPU if there is no imbalance */ + if (!imbalance) { + maymove = true; + if (env->dst_stats.idle_cpu >= 0) { + env->dst_cpu = env->dst_stats.idle_cpu; + task_numa_assign(env, NULL, 0); + return; + } + } + } else { + long src_load, dst_load, load; + /* + * If the improvement from just moving env->p direction is better + * than swapping tasks around, check if a move is possible. 
+ */ + load = task_h_load(env->p); + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + maymove = !load_too_imbalanced(src_load, dst_load, env); + } for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ @@ -1734,7 +1957,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, continue; env->dst_cpu = cpu; - task_numa_compare(env, taskimp, groupimp, maymove); + if (task_numa_compare(env, taskimp, groupimp, maymove)) + break; } } @@ -1788,10 +2012,10 @@ static int task_numa_migrate(struct task_struct *p) dist = env.dist = node_distance(env.src_nid, env.dst_nid); taskweight = task_weight(p, env.src_nid, dist); groupweight = group_weight(p, env.src_nid, dist); - update_numa_stats(&env.src_stats, env.src_nid); + update_numa_stats(&env, &env.src_stats, env.src_nid, false); taskimp = task_weight(p, env.dst_nid, dist) - taskweight; groupimp = group_weight(p, env.dst_nid, dist) - groupweight; - update_numa_stats(&env.dst_stats, env.dst_nid); + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); @@ -1824,7 +2048,7 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; - update_numa_stats(&env.dst_stats, env.dst_nid); + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); task_numa_find_cpu(&env, taskimp, groupimp); } } @@ -1848,15 +2072,17 @@ static int task_numa_migrate(struct task_struct *p) } /* No better CPU than the current one was found. 
*/ - if (env.best_cpu == -1) + if (env.best_cpu == -1) { + trace_sched_stick_numa(p, env.src_cpu, NULL, -1); return -EAGAIN; + } best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) - trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); + trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu); return ret; } @@ -1864,7 +2090,7 @@ static int task_numa_migrate(struct task_struct *p) WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) - trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); + trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu); put_task_struct(env.best_task); return ret; } @@ -2835,25 +3061,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SMP static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_weight += se->runnable_weight; - - cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; - cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; -} - -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_weight -= se->runnable_weight; - - sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); - sub_positive(&cfs_rq->avg.runnable_load_sum, - se_runnable(se) * se->avg.runnable_load_sum); -} - -static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { cfs_rq->avg.load_avg += se->avg.load_avg; @@ -2868,28 +3075,22 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) } #else static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static inline void 
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight, unsigned long runnable) + unsigned long weight) { if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); account_entity_dequeue(cfs_rq, se); - dequeue_runnable_load_avg(cfs_rq, se); } dequeue_load_avg(cfs_rq, se); - se->runnable_weight = runnable; update_load_set(&se->load, weight); #ifdef CONFIG_SMP @@ -2897,16 +3098,13 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); - se->avg.runnable_load_avg = - div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); } while (0); #endif enqueue_load_avg(cfs_rq, se); - if (se->on_rq) { + if (se->on_rq) account_entity_enqueue(cfs_rq, se); - enqueue_runnable_load_avg(cfs_rq, se); - } + } void reweight_task(struct task_struct *p, int prio) @@ -2916,7 +3114,7 @@ void reweight_task(struct task_struct *p, int prio) struct load_weight *load = &se->load; unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight, weight); + reweight_entity(cfs_rq, se, weight); load->inv_weight = sched_prio_to_wmult[prio]; } @@ -3028,50 +3226,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } - -/* - * This calculates the effective runnable weight for a group entity based on - * the group entity weight calculated above. - * - * Because of the above approximation (2), our group entity weight is - * an load_avg based ratio (3). This means that it includes blocked load and - * does not represent the runnable weight. 
- * - * Approximate the group entity's runnable weight per ratio from the group - * runqueue: - * - * grq->avg.runnable_load_avg - * ge->runnable_weight = ge->load.weight * -------------------------- (7) - * grq->avg.load_avg - * - * However, analogous to above, since the avg numbers are slow, this leads to - * transients in the from-idle case. Instead we use: - * - * ge->runnable_weight = ge->load.weight * - * - * max(grq->avg.runnable_load_avg, grq->runnable_weight) - * ----------------------------------------------------- (8) - * max(grq->avg.load_avg, grq->load.weight) - * - * Where these max() serve both to use the 'instant' values to fix the slow - * from-idle and avoid the /0 on to-idle, similar to (6). - */ -static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) -{ - long runnable, load_avg; - - load_avg = max(cfs_rq->avg.load_avg, - scale_load_down(cfs_rq->load.weight)); - - runnable = max(cfs_rq->avg.runnable_load_avg, - scale_load_down(cfs_rq->runnable_weight)); - - runnable *= shares; - if (load_avg) - runnable /= load_avg; - - return clamp_t(long, runnable, MIN_SHARES, shares); -} #endif /* CONFIG_SMP */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -3083,7 +3237,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); static void update_cfs_group(struct sched_entity *se) { struct cfs_rq *gcfs_rq = group_cfs_rq(se); - long shares, runnable; + long shares; if (!gcfs_rq) return; @@ -3092,16 +3246,15 @@ static void update_cfs_group(struct sched_entity *se) return; #ifndef CONFIG_SMP - runnable = shares = READ_ONCE(gcfs_rq->tg->shares); + shares = READ_ONCE(gcfs_rq->tg->shares); if (likely(se->load.weight == shares)) return; #else shares = calc_group_shares(gcfs_rq); - runnable = calc_group_runnable(gcfs_rq, shares); #endif - reweight_entity(cfs_rq_of(se), se, shares, runnable); + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -3226,11 +3379,11 @@ void set_task_rq_fair(struct 
sched_entity *se, * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * - * Per the above update_tg_cfs_util() is trivial and simply copies the running - * sum over (but still wrong, because the group entity and group rq do not have - * their PELT windows aligned). + * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial + * and simply copies the running/runnable sum over (but still wrong, because + * the group entity and group rq do not have their PELT windows aligned). * - * However, update_tg_cfs_runnable() is more complex. So we have: + * However, update_tg_cfs_load() is more complex. So we have: * * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) * @@ -3313,9 +3466,35 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { + long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* + * The relation between sum and avg is: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * however, the PELT windows are not aligned between grq and gse. 
+ */ + + /* Set new sched_entity's runnable */ + se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; + se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq runnable */ + add_positive(&cfs_rq->avg.runnable_avg, delta); + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX; +} + +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) +{ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; - unsigned long runnable_load_avg, load_avg; - u64 runnable_load_sum, load_sum = 0; + unsigned long load_avg; + u64 load_sum = 0; s64 delta_sum; if (!runnable_sum) @@ -3363,20 +3542,6 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf se->avg.load_avg = load_avg; add_positive(&cfs_rq->avg.load_avg, delta_avg); add_positive(&cfs_rq->avg.load_sum, delta_sum); - - runnable_load_sum = (s64)se_runnable(se) * runnable_sum; - runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - - if (se->on_rq) { - delta_sum = runnable_load_sum - - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); - add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); - } - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3405,6 +3570,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_util(cfs_rq, se, gcfs_rq); update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); + update_tg_cfs_load(cfs_rq, se, gcfs_rq); trace_pelt_cfs_tp(cfs_rq); trace_pelt_se_tp(se); @@ -3474,7 +3640,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - unsigned long removed_load = 0, removed_util = 0, 
removed_runnable_sum = 0; + unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0; @@ -3485,7 +3651,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); swap(cfs_rq->removed.load_avg, removed_load); - swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); + swap(cfs_rq->removed.runnable_avg, removed_runnable); cfs_rq->removed.nr = 0; raw_spin_unlock(&cfs_rq->removed.lock); @@ -3497,7 +3663,16 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * divider); - add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); + r = removed_runnable; + sub_positive(&sa->runnable_avg, r); + sub_positive(&sa->runnable_sum, r * divider); + + /* + * removed_runnable is the unweighted version of removed_load so we + * can use it to estimate removed_load_sum. + */ + add_tg_cfs_propagate(cfs_rq, + -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); decayed = 1; } @@ -3542,17 +3717,19 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ se->avg.util_sum = se->avg.util_avg * divider; + se->avg.runnable_sum = se->avg.runnable_avg * divider; + se->avg.load_sum = divider; if (se_weight(se)) { se->avg.load_sum = div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); } - se->avg.runnable_load_sum = se->avg.load_sum; - enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + cfs_rq->avg.runnable_avg += se->avg.runnable_avg; + cfs_rq->avg.runnable_sum += se->avg.runnable_sum; add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); @@ -3574,6 +3751,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, 
se->avg.util_sum); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -3680,13 +3859,13 @@ static void remove_entity_load_avg(struct sched_entity *se) ++cfs_rq->removed.nr; cfs_rq->removed.util_avg += se->avg.util_avg; cfs_rq->removed.load_avg += se->avg.load_avg; - cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ + cfs_rq->removed.runnable_avg += se->avg.runnable_avg; raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) { - return cfs_rq->avg.runnable_load_avg; + return cfs_rq->avg.runnable_avg; } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3957,6 +4136,7 @@ static inline void check_schedstat_required(void) #endif } +static inline bool cfs_bandwidth_used(void); /* * MIGRATION @@ -4021,8 +4201,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + se_update_runnable(se); update_cfs_group(se); - enqueue_runnable_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) @@ -4035,10 +4215,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) { + /* + * When bandwidth control is enabled, cfs might have been removed + * because of a parent been throttled but cfs->nr_running > 1. Try to + * add it unconditionnally. 
+ */ + if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) list_add_leaf_cfs_rq(cfs_rq); + + if (cfs_rq->nr_running == 1) check_enqueue_throttle(cfs_rq); - } } static void __clear_buddies_last(struct sched_entity *se) @@ -4105,7 +4291,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * of its group cfs_rq. */ update_load_avg(cfs_rq, se, UPDATE_TG); - dequeue_runnable_load_avg(cfs_rq, se); + se_update_runnable(se); update_stats_dequeue(cfs_rq, se, flags); @@ -4541,8 +4727,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!se->on_rq) break; - if (dequeue) + if (dequeue) { dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + } else { + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + } + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; @@ -4610,8 +4801,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) enqueue = 0; cfs_rq = cfs_rq_of(se); - if (enqueue) + if (enqueue) { enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + } else { + update_load_avg(cfs_rq, se, 0); + se_update_runnable(se); + } + cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; @@ -4619,11 +4815,22 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) break; } - assert_list_leaf_cfs_rq(rq); - if (!se) add_nr_running(rq, task_delta); + /* + * The cfs_rq_throttled() breaks in the above iteration can result in + * incomplete leaf list maintenance, resulting in triggering the + * assertion below. 
+ */ + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + list_add_leaf_cfs_rq(cfs_rq); + } + + assert_list_leaf_cfs_rq(rq); + /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); @@ -5258,32 +5465,32 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running increment below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto enqueue_throttle; + flags = ENQUEUE_WAKEUP; } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + update_cfs_group(se); + cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - break; - - update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_group(se); + goto enqueue_throttle; } +enqueue_throttle: if (!se) { add_nr_running(rq, 1); /* @@ -5344,17 +5551,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running decrement below. 
- */ - if (cfs_rq_throttled(cfs_rq)) - break; cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto dequeue_throttle; + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5372,16 +5575,21 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + update_cfs_group(se); + cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - break; + goto dequeue_throttle; - update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_group(se); } +dequeue_throttle: if (!se) sub_nr_running(rq, 1); @@ -5447,6 +5655,29 @@ static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) return load; } +static unsigned long cpu_runnable(struct rq *rq) +{ + return cfs_rq_runnable_avg(&rq->cfs); +} + +static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + unsigned int runnable; + + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_runnable(rq); + + cfs_rq = &rq->cfs; + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + + /* Discount task's runnable from CPU's runnable */ + lsub_positive(&runnable, p->se.avg.runnable_avg); + + return runnable; +} + static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -5786,10 +6017,12 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { - __cpumask_clear_cpu(cpu, cpus); - if (!available_idle_cpu(cpu)) + if (!available_idle_cpu(cpu)) { idle = false; + break; 
+ } } + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); if (idle) return core; @@ -5894,6 +6127,40 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } /* + * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which + * the task fits. If no CPU is big enough, but there are idle ones, try to + * maximize capacity. + */ +static int +select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) +{ + unsigned long best_cap = 0; + int cpu, best_cpu = -1; + struct cpumask *cpus; + + sync_entity_load_avg(&p->se); + + cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + for_each_cpu_wrap(cpu, cpus, target) { + unsigned long cpu_cap = capacity_of(cpu); + + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + continue; + if (task_fits_capacity(p, cpu_cap)) + return cpu; + + if (cpu_cap > best_cap) { + best_cap = cpu_cap; + best_cpu = cpu; + } + } + + return best_cpu; +} + +/* * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) @@ -5901,6 +6168,28 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) struct sched_domain *sd; int i, recent_used_cpu; + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + /* + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. + */ + if (!sd) + goto symmetric; + + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? 
i : target; + } + +symmetric: if (available_idle_cpu(target) || sched_idle_cpu(target)) return target; @@ -6101,33 +6390,6 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) } /* - * Disable WAKE_AFFINE in the case where task @p doesn't fit in the - * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. - * - * In that case WAKE_AFFINE doesn't make sense and we'll let - * BALANCE_WAKE sort things out. - */ -static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) -{ - long min_cap, max_cap; - - if (!static_branch_unlikely(&sched_asym_cpucapacity)) - return 0; - - min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); - max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; - - /* Minimum capacity is close to max, no need to abort wake_affine */ - if (max_cap - min_cap < max_cap >> 3) - return 0; - - /* Bring task utilization in sync with prev_cpu */ - sync_entity_load_avg(&p->se); - - return !task_fits_capacity(p, min_cap); -} - -/* * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) * to @dst_cpu. 
*/ @@ -6391,8 +6653,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = prev_cpu; } - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && - cpumask_test_cpu(cpu, p->cpus_ptr); + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } rcu_read_lock(); @@ -7506,6 +7767,9 @@ static inline bool others_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; + if (thermal_load_avg(rq)) + return true; + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ if (READ_ONCE(rq->avg_irq.util_avg)) return true; @@ -7531,6 +7795,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; u64 now = rq_clock_pelt(rq); + unsigned long thermal_pressure; bool decayed; /* @@ -7539,8 +7804,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) */ curr_class = rq->curr->sched_class; + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) @@ -7562,7 +7830,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.util_sum) return false; - if (cfs_rq->avg.runnable_load_sum) + if (cfs_rq->avg.runnable_sum) return false; return true; @@ -7700,7 +7968,8 @@ struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long group_capacity; - unsigned long group_util; /* Total utilization of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ unsigned int sum_nr_running; /* Nr of tasks running in the group */ unsigned int sum_h_nr_running; /* Nr of 
CFS tasks running in the group */ unsigned int idle_cpus; @@ -7763,8 +8032,15 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) if (unlikely(irq >= max)) return 1; + /* + * avg_rt.util_avg and avg_dl.util_avg track binary signals + * (running and not running) with weights 0 and 1024 respectively. + * avg_thermal.load_avg tracks thermal pressure and the weighted + * average uses the actual delta max capacity(load). + */ used = READ_ONCE(rq->avg_rt.util_avg); used += READ_ONCE(rq->avg_dl.util_avg); + used += thermal_load_avg(rq); if (unlikely(used >= max)) return 1; @@ -7921,6 +8197,10 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) if (sgs->sum_nr_running < sgs->group_weight) return true; + if ((sgs->group_capacity * imbalance_pct) < + (sgs->group_runnable * 100)) + return false; + if ((sgs->group_capacity * 100) > (sgs->group_util * imbalance_pct)) return true; @@ -7946,6 +8226,10 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) (sgs->group_util * imbalance_pct)) return true; + if ((sgs->group_capacity * imbalance_pct) < + (sgs->group_runnable * 100)) + return true; + return false; } @@ -8040,6 +8324,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += cpu_load(rq); sgs->group_util += cpu_util(i); + sgs->group_runnable += cpu_runnable(rq); sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; @@ -8315,6 +8600,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_load += cpu_load_without(rq, p); sgs->group_util += cpu_util_without(i, p); + sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; @@ -8337,13 +8623,16 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_capacity = group->sgc->capacity; + sgs->group_weight = group->group_weight; + sgs->group_type = group_classify(sd->imbalance_pct, 
group, sgs); /* * Computing avg_load makes sense only when group is fully busy or * overloaded */ - if (sgs->group_type < group_fully_busy) + if (sgs->group_type == group_fully_busy || + sgs->group_type == group_overloaded) sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / sgs->group_capacity; } @@ -8626,6 +8915,21 @@ next_group: } } +static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) +{ + unsigned int imbalance_min; + + /* + * Allow a small imbalance based on a simple pair of communicating + * tasks that remain local when the source domain is almost idle. + */ + imbalance_min = 2; + if (src_nr_running <= imbalance_min) + return 0; + + return imbalance; +} + /** * calculate_imbalance - Calculate the amount of imbalance present within the * groups of a given sched_domain during load balance. @@ -8722,24 +9026,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* Consider allowing a small imbalance between NUMA groups */ - if (env->sd->flags & SD_NUMA) { - unsigned int imbalance_min; - - /* - * Compute an allowed imbalance based on a simple - * pair of communicating tasks that should remain - * local and ignore them. - * - * NOTE: Generally this would have been based on - * the domain size and this was evaluated. However, - * the benefit is similar across a range of workloads - * and machines but scaling by the domain size adds - * the risk that lower domains have to be rebalanced. - */ - imbalance_min = 2; - if (busiest->sum_nr_running <= imbalance_min) - env->imbalance = 0; - } + if (env->sd->flags & SD_NUMA) + env->imbalance = adjust_numa_imbalance(env->imbalance, + busiest->sum_nr_running); return; } @@ -9025,6 +9314,14 @@ static struct rq *find_busiest_queue(struct lb_env *env, case migrate_util: util = cpu_util(cpu_of(rq)); + /* + * Don't try to pull utilization from a CPU with one + * running task. Whatever its utilization, we will fail + * detach the task. 
+ */ + if (nr_running <= 1) + continue; + if (busiest_util < util) { busiest_util = util; busiest = rq; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index bd006b79b360..b647d04d9c8b 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -121,8 +121,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); - sa->runnable_load_sum = - decay_load(sa->runnable_load_sum, periods); + sa->runnable_sum = + decay_load(sa->runnable_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -149,7 +149,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa, if (load) sa->load_sum += load * contrib; if (runnable) - sa->runnable_load_sum += runnable * contrib; + sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT; if (running) sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; @@ -238,7 +238,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, } static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) +___update_load_avg(struct sched_avg *sa, unsigned long load) { u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; @@ -246,7 +246,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * Step 2: update *_avg. 
*/ sa->load_avg = div_u64(load * sa->load_sum, divider); - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); + sa->runnable_avg = div_u64(sa->runnable_sum, divider); WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } @@ -254,33 +254,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * sched_entity: * * task: - * se_runnable() == se_weight() + * se_weight() = se->load.weight + * se_runnable() = !!on_rq * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * se_runnable() = grq->h_nr_running * - * load_sum := runnable_sum - * load_avg = se_weight(se) * runnable_avg + * runnable_sum = se_runnable() * runnable = grq->runnable_sum + * runnable_avg = runnable_sum * - * runnable_load_sum := runnable_sum - * runnable_load_avg = se_runnable(se) * runnable_avg - * - * XXX collapse load_sum and runnable_load_sum + * load_sum := runnable + * load_avg = se_weight(se) * load_sum * * cfq_rq: * + * runnable_sum = \Sum se->avg.runnable_sum + * runnable_avg = \Sum se->avg.runnable_avg + * * load_sum = \Sum se_weight(se) * se->avg.load_sum * load_avg = \Sum se->avg.load_avg - * - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum - * runnable_load_avg = \Sum se->avg.runable_load_avg */ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { if (___update_load_sum(now, &se->avg, 0, 0, 0)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + ___update_load_avg(&se->avg, se_weight(se)); trace_pelt_se_tp(se); return 1; } @@ -290,10 +289,10 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, + if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se), cfs_rq->curr == se)) { - 
___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + ___update_load_avg(&se->avg, se_weight(se)); cfs_se_util_change(&se->avg); trace_pelt_se_tp(se); return 1; @@ -306,10 +305,10 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - scale_load_down(cfs_rq->runnable_weight), + cfs_rq->h_nr_running, cfs_rq->curr != NULL)) { - ___update_load_avg(&cfs_rq->avg, 1, 1); + ___update_load_avg(&cfs_rq->avg, 1); trace_pelt_cfs_tp(cfs_rq); return 1; } @@ -322,9 +321,9 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum * - * load_avg and runnable_load_avg are not supported and meaningless. + * load_avg and runnable_avg are not supported and meaningless. * */ @@ -335,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) running, running)) { - ___update_load_avg(&rq->avg_rt, 1, 1); + ___update_load_avg(&rq->avg_rt, 1); trace_pelt_rt_tp(rq); return 1; } @@ -348,7 +347,9 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum + * + * load_avg and runnable_avg are not supported and meaningless. 
* */ @@ -359,7 +360,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) running, running)) { - ___update_load_avg(&rq->avg_dl, 1, 1); + ___update_load_avg(&rq->avg_dl, 1); trace_pelt_dl_tp(rq); return 1; } @@ -367,13 +368,46 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +/* + * thermal: + * + * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked + * + * util_avg and runnable_load_avg are not supported and meaningless. + * + * Unlike rt/dl utilization tracking that track time spent by a cpu + * running a rt/dl task through util_avg, the average thermal pressure is + * tracked through load_avg. This is because thermal pressure signal is + * time weighted "delta" capacity unlike util_avg which is binary. + * "delta capacity" = actual capacity - + * capped capacity a cpu due to a thermal event. + */ + +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + if (___update_load_sum(now, &rq->avg_thermal, + capacity, + capacity, + capacity)) { + ___update_load_avg(&rq->avg_thermal, 1); + trace_pelt_thermal_tp(rq); + return 1; + } + + return 0; +} +#endif + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* * irq: * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum - * runnable_load_sum = load_sum + * runnable_sum = util_sum + * + * load_avg and runnable_avg are not supported and meaningless. 
* */ @@ -410,7 +444,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) 1); if (ret) { - ___update_load_avg(&rq->avg_irq, 1, 1); + ___update_load_avg(&rq->avg_irq, 1); trace_pelt_irq_tp(rq); } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index afff644da065..eb034d9f024d 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -7,6 +7,26 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return READ_ONCE(rq->avg_thermal.load_avg); +} +#else +static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return 0; +} +#endif + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); #else @@ -159,6 +179,17 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) } static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return 0; +} + +static inline int update_irq_load_avg(struct rq *rq, u64 running) { return 0; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 028520702717..8f45cdb6463b 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state) case PSI_MEM_FULL: return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; case PSI_CPU_SOME: - return tasks[NR_RUNNING] > 1; + return tasks[NR_RUNNING] > tasks[NR_ONCPU]; case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -669,13 +669,14 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_NONIDLE] += delta; } -static u32 
psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set, + bool wake_clock) { struct psi_group_cpu *groupc; + u32 state_mask = 0; unsigned int t, m; enum psi_states s; - u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -695,10 +696,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu, if (!(m & (1 << t))) continue; if (groupc->tasks[t] == 0 && !psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } groupc->tasks[t]--; @@ -717,7 +718,11 @@ static u32 psi_group_change(struct psi_group *group, int cpu, write_seqcount_end(&groupc->seq); - return state_mask; + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -744,27 +749,32 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter) return &psi_system; } -void psi_task_change(struct task_struct *task, int clear, int set) +static void psi_flags_change(struct task_struct *task, int clear, int set) { - int cpu = task_cpu(task); - struct psi_group *group; - bool wake_clock = true; - void *iter = NULL; - - if (!task->pid) - return; - if (((task->psi_flags & set) || (task->psi_flags & clear) != clear) && !psi_bug) { printk_deferred(KERN_ERR "psi: inconsistent task state! 
task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", - task->pid, task->comm, cpu, + task->pid, task->comm, task_cpu(task), task->psi_flags, clear, set); psi_bug = 1; } task->psi_flags &= ~clear; task->psi_flags |= set; +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + struct psi_group *group; + bool wake_clock = true; + void *iter = NULL; + + if (!task->pid) + return; + + psi_flags_change(task, clear, set); /* * Periodic aggregation shuts off if there is a period of no @@ -777,14 +787,51 @@ void psi_task_change(struct task_struct *task, int clear, int set) wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; - while ((group = iterate_groups(task, &iter))) { - u32 state_mask = psi_group_change(group, cpu, clear, set); + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set, wake_clock); +} + +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep) +{ + struct psi_group *group, *common = NULL; + int cpu = task_cpu(prev); + void *iter; + + if (next->pid) { + psi_flags_change(next, 0, TSK_ONCPU); + /* + * When moving state between tasks, the group that + * contains them both does not change: we can stop + * updating the tree once we reach the first common + * ancestor. Iterate @next's ancestors until we + * encounter @prev's state. + */ + iter = NULL; + while ((group = iterate_groups(next, &iter))) { + if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + common = group; + break; + } + + psi_group_change(group, cpu, 0, TSK_ONCPU, true); + } + } + + /* + * If this is a voluntary sleep, dequeue will have taken care + * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We + * only need to deal with it during preemption. 
+ */ + if (sleep) + return; - if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1); + if (prev->pid) { + psi_flags_change(prev, TSK_ONCPU, 0); - if (wake_clock && !delayed_work_pending(&group->avgs_work)) - schedule_delayed_work(&group->avgs_work, PSI_FREQ); + iter = NULL; + while ((group = iterate_groups(prev, &iter)) && group != common) + psi_group_change(group, cpu, TSK_ONCPU, 0, true); } } @@ -818,17 +865,17 @@ void psi_memstall_enter(unsigned long *flags) if (static_branch_likely(&psi_disabled)) return; - *flags = current->flags & PF_MEMSTALL; + *flags = current->in_memstall; if (*flags) return; /* - * PF_MEMSTALL setting & accounting needs to be atomic wrt + * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags |= PF_MEMSTALL; + current->in_memstall = 1; psi_task_change(current, 0, TSK_MEMSTALL); rq_unlock_irq(rq, &rf); @@ -851,13 +898,13 @@ void psi_memstall_leave(unsigned long *flags) if (*flags) return; /* - * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could * race with CPU migration. 
*/ rq = this_rq_lock_irq(&rf); - current->flags &= ~PF_MEMSTALL; + current->in_memstall = 0; psi_task_change(current, TSK_MEMSTALL, 0); rq_unlock_irq(rq, &rf); @@ -916,12 +963,14 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) + if (task_on_rq_queued(task)) { task_flags = TSK_RUNNING; - else if (task->in_iowait) + if (task_current(rq, task)) + task_flags |= TSK_ONCPU; + } else if (task->in_iowait) task_flags = TSK_IOWAIT; - if (task->flags & PF_MEMSTALL) + if (task->in_memstall) task_flags |= TSK_MEMSTALL; if (task_flags) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4043abe45459..df11d88c9895 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1475,6 +1475,13 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) int target = find_lowest_rq(p); /* + * Bail out if we were forcing a migration to find a better + * fitting CPU but our search failed. + */ + if (!test && target != -1 && !rt_task_fits_capacity(p, target)) + goto out_unlock; + + /* * Don't bother moving it if the destination CPU is * not running a lower priority task. */ @@ -1482,6 +1489,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) p->prio < cpu_rq(target)->rt.highest_prio.curr) cpu = target; } + +out_unlock: rcu_read_unlock(); out: @@ -1495,7 +1504,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; /* @@ -1503,7 +1512,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * see if it is pushed or pulled somewhere else. 
*/ if (p->nr_cpus_allowed != 1 && - cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) + cpupri_find(&rq->rd->cpupri, p, NULL)) return; /* @@ -1647,8 +1656,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr) && - rt_task_fits_capacity(p, cpu)) + cpumask_test_cpu(cpu, p->cpus_ptr)) return 1; return 0; @@ -1682,6 +1690,7 @@ static int find_lowest_rq(struct task_struct *task) struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + int ret; /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) @@ -1690,8 +1699,22 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, - rt_task_fits_capacity)) + /* + * If we're on asym system ensure we consider the different capacities + * of the CPUs when searching for the lowest_mask. 
+ */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + + ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, + task, lowest_mask, + rt_task_fits_capacity); + } else { + + ret = cpupri_find(&task_rq(task)->rd->cpupri, + task, lowest_mask); + } + + if (!ret) return -1; /* No targets found */ /* @@ -2202,7 +2225,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio); - if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) + if (need_to_push) push_rt_tasks(rq); } @@ -2274,10 +2297,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - bool need_to_push = rq->rt.overloaded || - !rt_task_fits_capacity(p, cpu_of(rq)); - - if (p->nr_cpus_allowed > 1 && need_to_push) + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) @@ -2449,10 +2469,11 @@ const struct sched_class rt_sched_class = { */ static DEFINE_MUTEX(rt_constraints_mutex); -/* Must be called with tasklist_lock held */ static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_struct *g, *p; + struct task_struct *task; + struct css_task_iter it; + int ret = 0; /* * Autogroups do not have RT tasks; see autogroup_create(). @@ -2460,12 +2481,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg) if (task_group_is_autogroup(tg)) return 0; - for_each_process_thread(g, p) { - if (rt_task(p) && task_group(p) == tg) - return 1; - } + css_task_iter_start(&tg->css, 0, &it); + while (!ret && (task = css_task_iter_next(&it))) + ret |= rt_task(task); + css_task_iter_end(&it); - return 0; + return ret; } struct rt_schedulable_data { @@ -2496,9 +2517,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) return -EINVAL; /* - * Ensure we don't starve existing RT tasks. 
+ * Ensure we don't starve existing RT tasks if runtime turns zero. */ - if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + if (rt_bandwidth_enabled() && !runtime && + tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; total = to_ratio(period, runtime); @@ -2564,7 +2586,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg, return -EINVAL; mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); err = __rt_schedulable(tg, rt_period, rt_runtime); if (err) goto unlock; @@ -2582,7 +2603,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg, } raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: - read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return err; @@ -2641,9 +2661,7 @@ static int sched_rt_global_constraints(void) int ret = 0; mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); ret = __rt_schedulable(NULL, 0, 0); - read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9ea647835fd6..0f616bf7bce3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -118,7 +118,13 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust); #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) -# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ +}) #else # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) (w) @@ -305,7 +311,6 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } -extern void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); extern int sched_dl_global_validate(void); extern void 
sched_dl_do_global(void); @@ -489,7 +494,6 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned long runnable_weight; unsigned int nr_running; unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ @@ -528,7 +532,7 @@ struct cfs_rq { int nr; unsigned long load_avg; unsigned long util_avg; - unsigned long runnable_sum; + unsigned long runnable_avg; } removed; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -688,8 +692,30 @@ struct dl_rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) + +static inline void se_update_runnable(struct sched_entity *se) +{ + if (!entity_is_task(se)) + se->runnable_weight = se->my_q->h_nr_running; +} + +static inline long se_runnable(struct sched_entity *se) +{ + if (entity_is_task(se)) + return !!se->on_rq; + else + return se->runnable_weight; +} + #else #define entity_is_task(se) 1 + +static inline void se_update_runnable(struct sched_entity *se) {} + +static inline long se_runnable(struct sched_entity *se) +{ + return !!se->on_rq; +} #endif #ifdef CONFIG_SMP @@ -701,10 +727,6 @@ static inline long se_weight(struct sched_entity *se) return scale_load_down(se->load.weight); } -static inline long se_runnable(struct sched_entity *se) -{ - return scale_load_down(se->runnable_weight); -} static inline bool sched_asym_prefer(int a, int b) { @@ -944,6 +966,9 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif +#ifdef CONFIG_SCHED_THERMAL_PRESSURE + struct sched_avg avg_thermal; +#endif u64 idle_stamp; u64 avg_idle; @@ -967,7 +992,6 @@ struct rq { #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP - int hrtick_csd_pending; call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; @@ -1107,6 +1131,24 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +/** + * By default the decay is the default pelt decay 
period. + * The decay shift can change the decay period in + * multiples of 32. + * Decay shift Decay period(ms) + * 0 32 + * 1 64 + * 2 128 + * 3 256 + * 4 512 + */ +extern int sched_thermal_decay_shift; + +static inline u64 rq_clock_thermal(struct rq *rq) +{ + return rq_clock_task(rq) >> sched_thermal_decay_shift; +} + static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_held(&rq->lock); @@ -1337,8 +1379,6 @@ extern void sched_ttwu_pending(void); for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) -#define for_each_lower_domain(sd) for (; sd; sd = sd->child) - /** * highest_flag_domain - Return highest sched_domain containing flag. * @cpu: The CPU whose highest level of sched domain is to @@ -1869,7 +1909,6 @@ extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); -extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) @@ -1968,6 +2007,13 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ +#ifndef arch_scale_freq_tick +static __always_inline +void arch_scale_freq_tick(void) +{ +} +#endif + #ifndef arch_scale_freq_capacity static __always_inline unsigned long arch_scale_freq_capacity(int cpu) @@ -2492,3 +2538,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } #endif + +void swake_up_all_locked(struct swait_queue_head *q); +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index ba683fe81a6e..33d0daf83842 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -70,7 +70,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) return; if (!wakeup || p->sched_psi_wake_requeue) { - if 
(p->flags & PF_MEMSTALL) + if (p->in_memstall) set |= TSK_MEMSTALL; if (p->sched_psi_wake_requeue) p->sched_psi_wake_requeue = 0; @@ -90,9 +90,17 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) return; if (!sleep) { - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) clear |= TSK_MEMSTALL; } else { + /* + * When a task sleeps, schedule() dequeues it before + * switching to the next one. Merge the clearing of + * TSK_RUNNING and TSK_ONCPU to save an unnecessary + * psi_task_change() call in psi_sched_switch(). + */ + clear |= TSK_ONCPU; + if (p->in_iowait) set |= TSK_IOWAIT; } @@ -109,14 +117,14 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) * deregister its sleep-persistent psi states from the old * queue, and let psi_enqueue() know it has to requeue. */ - if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + if (unlikely(p->in_iowait || p->in_memstall)) { struct rq_flags rf; struct rq *rq; int clear = 0; if (p->in_iowait) clear |= TSK_IOWAIT; - if (p->flags & PF_MEMSTALL) + if (p->in_memstall) clear |= TSK_MEMSTALL; rq = __task_rq_lock(p, &rf); @@ -126,18 +134,31 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) } } +static inline void psi_sched_switch(struct task_struct *prev, + struct task_struct *next, + bool sleep) +{ + if (static_branch_likely(&psi_disabled)) + return; + + psi_task_switch(prev, next, sleep); +} + static inline void psi_task_tick(struct rq *rq) { if (static_branch_likely(&psi_disabled)) return; - if (unlikely(rq->curr->flags & PF_MEMSTALL)) + if (unlikely(rq->curr->in_memstall)) psi_memstall_tick(rq->curr, cpu_of(rq)); } #else /* CONFIG_PSI */ static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} static inline void psi_dequeue(struct task_struct *p, bool sleep) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_sched_switch(struct task_struct *prev, + struct task_struct *next, + bool sleep) {} static inline void psi_task_tick(struct rq 
*rq) {} #endif /* CONFIG_PSI */ diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e83a3f8449f6..e1c655f928c7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -32,6 +32,19 @@ void swake_up_locked(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_locked); +/* + * Wake up all waiters. This is an interface which is solely exposed for + * completions and not for general usage. + * + * It is intentionally different from swake_up_all() to allow usage from + * hard interrupt context and interrupt disabled regions. + */ +void swake_up_all_locked(struct swait_queue_head *q) +{ + while (!list_empty(&q->task_list)) + swake_up_locked(q); +} + void swake_up_one(struct swait_queue_head *q) { unsigned long flags; @@ -69,7 +82,7 @@ void swake_up_all(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_all); -static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) { wait->task = current; if (list_empty(&wait->task_list)) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dfb64c08a407..8344757bba6e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -317,8 +317,9 @@ static void sched_energy_set(bool has_eas) * EAS can be used on a root domain if it meets all the following conditions: * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. - * 3. the EM complexity is low enough to keep scheduling overheads low; - * 4. schedutil is driving the frequency of all CPUs of the rd; + * 3. no SMT is detected. + * 4. the EM complexity is low enough to keep scheduling overheads low; + * 5. 
schedutil is driving the frequency of all CPUs of the rd; * * The complexity of the Energy Model is defined as: * @@ -360,6 +361,13 @@ static bool build_perf_domains(const struct cpumask *cpu_map) goto free; } + /* EAS definitely does *not* handle SMT */ + if (sched_smt_active()) { + pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", + cpumask_pr_args(cpu_map)); + goto free; + } + for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) @@ -1374,18 +1382,9 @@ sd_init(struct sched_domain_topology_level *tl, * Convert topological properties into behaviour. */ - if (sd->flags & SD_ASYM_CPUCAPACITY) { - struct sched_domain *t = sd; - - /* - * Don't attempt to spread across CPUs of different capacities. - */ - if (sd->child) - sd->child->flags &= ~SD_PREFER_SIBLING; - - for_each_lower_domain(t) - t->flags |= SD_BALANCE_WAKE; - } + /* Don't attempt to spread across CPUs of different capacities. */ + if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) + sd->child->flags &= ~SD_PREFER_SIBLING; if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b6ea3dcb57bf..ec5c606bc3a1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -528,8 +528,12 @@ static long seccomp_attach_filter(unsigned int flags, int ret; ret = seccomp_can_sync_threads(); - if (ret) - return ret; + if (ret) { + if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) + return -ESRCH; + else + return ret; + } } /* Set log flag, if present. */ @@ -1221,6 +1225,7 @@ static const struct file_operations seccomp_notify_ops = { .poll = seccomp_notify_poll, .release = seccomp_notify_release, .unlocked_ioctl = seccomp_notify_ioctl, + .compat_ioctl = seccomp_notify_ioctl, }; static struct file *init_listener(struct seccomp_filter *filter) @@ -1288,10 +1293,12 @@ static long seccomp_set_mode_filter(unsigned int flags, * In the successful case, NEW_LISTENER returns the new listener fd. 
* But in the failure case, TSYNC returns the thread that died. If you * combine these two flags, there's no way to tell whether something - * succeeded or failed. So, let's disallow this combination. + * succeeded or failed. So, let's disallow this combination if the user + * has not explicitly requested no errors from TSYNC. */ if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && - (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER)) + (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) && + ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0)) return -EINVAL; /* Prepare the new filter before holding any locks. */ diff --git a/kernel/smp.c b/kernel/smp.c index d0ada39eb4d4..786092aabdcd 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -329,6 +329,11 @@ EXPORT_SYMBOL(smp_call_function_single); * (ie: embedded in an object) and is responsible for synchronizing it * such that the IPIs performed on the @csd are strictly serialized. * + * If the function is called with one csd which has not yet been + * processed by previous call to smp_call_function_single_async(), the + * function will return immediately with -EBUSY showing that the csd + * object is still in progress. + * * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. */ @@ -338,14 +343,17 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) preempt_disable(); - /* We could deadlock if we have to wait here with interrupts disabled! 
*/ - if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK)) - csd_lock_wait(csd); + if (csd->flags & CSD_FLAG_LOCK) { + err = -EBUSY; + goto out; + } csd->flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd, csd->func, csd->info); + +out: preempt_enable(); return err; @@ -589,20 +597,13 @@ void __init setup_nr_cpu_ids(void) void __init smp_init(void) { int num_nodes, num_cpus; - unsigned int cpu; idle_threads_init(); cpuhp_threads_init(); pr_info("Bringing up secondary CPUs ...\n"); - /* FIXME: This should be done in userspace --RR */ - for_each_present_cpu(cpu) { - if (num_online_cpus() >= setup_max_cpus) - break; - if (!cpu_online(cpu)) - cpu_up(cpu); - } + bringup_nonboot_cpus(setup_max_cpus); num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 0427a86743a4..a47c6dd57452 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -126,7 +126,7 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) * Were softirqs turned off above: */ if (softirq_count() == (cnt & SOFTIRQ_MASK)) - trace_softirqs_off(ip); + lockdep_softirqs_off(ip); raw_local_irq_restore(flags); if (preempt_count() == cnt) { @@ -147,7 +147,7 @@ static void __local_bh_enable(unsigned int cnt) trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); if (softirq_count() == (cnt & SOFTIRQ_MASK)) - trace_softirqs_on(_RET_IP_); + lockdep_softirqs_on(_RET_IP_); __preempt_count_sub(cnt); } @@ -174,7 +174,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) * Are softirqs going to be turned on now: */ if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) - trace_softirqs_on(ip); + lockdep_softirqs_on(ip); /* * Keep preemption disabled until we are done with * softirq processing: @@ -224,9 +224,9 @@ static inline bool lockdep_softirq_start(void) { bool in_hardirq = false; - if (trace_hardirq_context(current)) { + if (lockdep_hardirq_context(current)) { in_hardirq = true; - trace_hardirq_exit(); + lockdep_hardirq_exit(); } 
lockdep_softirq_enter(); @@ -239,7 +239,7 @@ static inline void lockdep_softirq_end(bool in_hardirq) lockdep_softirq_exit(); if (in_hardirq) - trace_hardirq_enter(); + lockdep_hardirq_enter(); } #else static inline bool lockdep_softirq_start(void) { return false; } @@ -414,7 +414,8 @@ void irq_exit(void) tick_irq_exit(); rcu_irq_exit(); - trace_hardirq_exit(); /* must be last! */ + /* must be last! */ + lockdep_hardirq_exit(); } /* diff --git a/kernel/sys.c b/kernel/sys.c index f9bc5c303e3f..d325f3ab624a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -47,6 +47,7 @@ #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/time_namespace.h> #include <linux/binfmts.h> #include <linux/sched.h> @@ -2546,6 +2547,7 @@ static int do_sysinfo(struct sysinfo *info) memset(info, 0, sizeof(struct sysinfo)); ktime_get_boottime_ts64(&tp); + timens_add_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); diff --git a/kernel/task_work.c b/kernel/task_work.c index 0fef395662a6..825f28259a19 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -97,16 +97,26 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ - raw_spin_lock_irq(&task->pi_lock); do { + head = NULL; work = READ_ONCE(task->task_works); - head = !work && (task->flags & PF_EXITING) ? - &work_exited : NULL; + if (!work) { + if (task->flags & PF_EXITING) + head = &work_exited; + else + break; + } } while (cmpxchg(&task->task_works, work, head) != work); - raw_spin_unlock_irq(&task->pi_lock); if (!work) break; + /* + * Synchronize with task_work_cancel(). It can not remove + * the first entry == work, cmpxchg(task_works) must fail. + * But it can remove another entry from the ->next list. 
+ */ + raw_spin_lock_irq(&task->pi_lock); + raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 428beb69426a..7cb09c4cf21c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -928,6 +928,15 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_arch_init(cs); +#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE + if (cs->vdso_clock_mode < 0 || + cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { + pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", + cs->name, cs->vdso_clock_mode); + cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; + } +#endif + /* Initialize mult/shift and max_idle_ns */ __clocksource_update_freq_scale(cs, scale, freq); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3a609e7344f3..d0a5ba37aff4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -311,7 +311,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div) div >>= 1; } tmp >>= sft; - do_div(tmp, (unsigned long) div); + do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); @@ -1404,7 +1404,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = softtimer ? 
HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; - timer->is_hard = !softtimer; + timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } @@ -1514,7 +1514,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); + lockdep_hrtimer_enter(timer); + restart = fn(timer); + + lockdep_hrtimer_exit(timer); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index d23b434c2ca7..eddcf4970444 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -58,7 +58,8 @@ static struct clocksource clocksource_jiffies = { .max_cycles = 10, }; -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); +__cacheline_aligned_in_smp seqcount_t jiffies_seq; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) @@ -67,9 +68,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); ret = jiffies_64; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 12858507d75a..e6ba064ce773 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -8,6 +8,7 @@ #include <linux/user_namespace.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> +#include <linux/clocksource.h> #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> @@ -172,8 +173,8 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. 
* - * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces - * the time namespace handling path. + * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * enforces the time namespace handling path. */ static void timens_setup_vdso_data(struct vdso_data *vdata, struct time_namespace *ns) @@ -183,7 +184,7 @@ static void timens_setup_vdso_data(struct vdso_data *vdata, struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); vdata->seq = 1; - vdata->clock_mode = VCLOCK_TIMENS; + vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8ff6da77a01f..2fd3b3fa68bf 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -118,6 +118,16 @@ static inline int validate_clock_permissions(const clockid_t clock) return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL; } +static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer) +{ + return CPUCLOCK_PERTHREAD(timer->it_clock) ? PIDTYPE_PID : PIDTYPE_TGID; +} + +static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer) +{ + return pid_task(timer->it.cpu.pid, cpu_timer_pid_type(timer)); +} + /* * Update expiry time from increment, and increase overrun count, * given the current clock sample. @@ -336,9 +346,7 @@ static void __thread_group_cputime(struct task_struct *tsk, u64 *samples) /* * Sample a process (thread group) clock for the given task clkid. If the * group's cputime accounting is already enabled, read the atomic - * store. Otherwise a full update is required. Task's sighand lock must be - * held to protect the task traversal on a full update. clkid is already - * validated. + * store. Otherwise a full update is required. clkid is already validated. 
*/ static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, bool start) @@ -393,7 +401,12 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); - new_timer->it.cpu.task = p; + new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer)); + /* + * get_task_for_clock() took a reference on @p. Drop it as the timer + * holds a reference on the pid of @p. + */ + put_task_struct(p); return 0; } @@ -406,13 +419,15 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) static int posix_cpu_timer_del(struct k_itimer *timer) { struct cpu_timer *ctmr = &timer->it.cpu; - struct task_struct *p = ctmr->task; struct sighand_struct *sighand; + struct task_struct *p; unsigned long flags; int ret = 0; - if (WARN_ON_ONCE(!p)) - return -EINVAL; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; /* * Protect against sighand release/switch in exit/exec and process/ @@ -434,8 +449,10 @@ static int posix_cpu_timer_del(struct k_itimer *timer) unlock_task_sighand(p, &flags); } +out: + rcu_read_unlock(); if (!ret) - put_task_struct(p); + put_pid(ctmr->pid); return ret; } @@ -484,12 +501,11 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the sighand lock held. 
*/ -static void arm_timer(struct k_itimer *timer) +static void arm_timer(struct k_itimer *timer, struct task_struct *p) { int clkidx = CPUCLOCK_WHICH(timer->it_clock); struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); - struct task_struct *p = ctmr->task; struct posix_cputimer_base *base; if (CPUCLOCK_PERTHREAD(timer->it_clock)) @@ -564,13 +580,21 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); u64 old_expires, new_expires, old_incr, val; struct cpu_timer *ctmr = &timer->it.cpu; - struct task_struct *p = ctmr->task; struct sighand_struct *sighand; + struct task_struct *p; unsigned long flags; int ret = 0; - if (WARN_ON_ONCE(!p)) - return -EINVAL; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) { + /* + * If p has just been reaped, we can no + * longer get any information about it at all. + */ + rcu_read_unlock(); + return -ESRCH; + } /* * Use the to_ktime conversion because that clamps the maximum @@ -587,8 +611,10 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(sighand == NULL)) + if (unlikely(sighand == NULL)) { + rcu_read_unlock(); return -ESRCH; + } /* * Disarm any old timer after extracting its expiry time. 
@@ -662,7 +688,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, */ cpu_timer_setexpires(ctmr, new_expires); if (new_expires != 0 && val < new_expires) { - arm_timer(timer); + arm_timer(timer, p); } unlock_task_sighand(p, &flags); @@ -693,6 +719,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = 0; out: + rcu_read_unlock(); if (old) old->it_interval = ns_to_timespec64(old_incr); @@ -704,10 +731,12 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); struct cpu_timer *ctmr = &timer->it.cpu; u64 now, expires = cpu_timer_getexpires(ctmr); - struct task_struct *p = ctmr->task; + struct task_struct *p; - if (WARN_ON_ONCE(!p)) - return; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; /* * Easy part: convert the reload time. @@ -715,36 +744,15 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp itp->it_interval = ktime_to_timespec64(timer->it_interval); if (!expires) - return; + goto out; /* * Sample the clock to take the difference with the expiry time. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + if (CPUCLOCK_PERTHREAD(timer->it_clock)) now = cpu_clock_sample(clkid, p); - } else { - struct sighand_struct *sighand; - unsigned long flags; - - /* - * Protect against sighand release/switch in exit/exec and - * also make timer sampling safe if it ends up calling - * thread_group_cputime(). - */ - sighand = lock_task_sighand(p, &flags); - if (unlikely(sighand == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - * Disarm the timer, nothing else to do. 
- */ - cpu_timer_setexpires(ctmr, 0); - return; - } else { - now = cpu_clock_sample_group(clkid, p, false); - unlock_task_sighand(p, &flags); - } - } + else + now = cpu_clock_sample_group(clkid, p, false); if (now < expires) { itp->it_value = ns_to_timespec64(expires - now); @@ -756,6 +764,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp itp->it_value.tv_nsec = 1; itp->it_value.tv_sec = 0; } +out: + rcu_read_unlock(); } #define MAX_COLLECTED 20 @@ -976,56 +986,38 @@ static void check_process_timers(struct task_struct *tsk, static void posix_cpu_timer_rearm(struct k_itimer *timer) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct cpu_timer *ctmr = &timer->it.cpu; - struct task_struct *p = ctmr->task; + struct task_struct *p; struct sighand_struct *sighand; unsigned long flags; u64 now; - if (WARN_ON_ONCE(!p)) - return; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; /* * Fetch the current sample and update the timer's expiry time. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + if (CPUCLOCK_PERTHREAD(timer->it_clock)) now = cpu_clock_sample(clkid, p); - bump_cpu_timer(timer, now); - if (unlikely(p->exit_state)) - return; - - /* Protect timer list r/w in arm_timer() */ - sighand = lock_task_sighand(p, &flags); - if (!sighand) - return; - } else { - /* - * Protect arm_timer() and timer sampling in case of call to - * thread_group_cputime(). - */ - sighand = lock_task_sighand(p, &flags); - if (unlikely(sighand == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - */ - cpu_timer_setexpires(ctmr, 0); - return; - } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - /* If the process is dying, no need to rearm */ - goto unlock; - } + else now = cpu_clock_sample_group(clkid, p, true); - bump_cpu_timer(timer, now); - /* Leave the sighand locked for the call below. 
*/ - } + + bump_cpu_timer(timer, now); + + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) + goto out; /* * Now re-arm for the new expiry time. */ - arm_timer(timer); -unlock: + arm_timer(timer, p); unlock_task_sighand(p, &flags); +out: + rcu_read_unlock(); } /** @@ -1126,8 +1118,11 @@ void run_posix_cpu_timers(void) if (!fastpath_timer_check(tsk)) return; - if (!lock_task_sighand(tsk, &flags)) + lockdep_posixtimer_enter(); + if (!lock_task_sighand(tsk, &flags)) { + lockdep_posixtimer_exit(); return; + } /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1169,6 +1164,7 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } + lockdep_posixtimer_exit(); } /* diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ff0eb30de346..07709ac30439 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -121,7 +121,8 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, { struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash) { + hlist_for_each_entry_rcu(timer, head, t_hash, + lockdep_is_held(&hash_lock)) { if ((timer->it_signal == sig) && (timer->it_id == id)) return timer; } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index e4332e3e2d56..fa3f800d7d76 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -208,7 +208,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (sched_clock_timer.function != NULL) { /* update timeout for clock wrap */ - hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, + HRTIMER_MODE_REL_HARD); } r = rate; @@ -254,9 +255,9 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. 
*/ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); sched_clock_timer.function = sched_clock_poll; - hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } /* @@ -293,7 +294,7 @@ void sched_clock_resume(void) struct clock_read_data *rd = &cd.read_data[0]; rd->epoch_cyc = cd.actual_read_sched_clock(); - hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); rd->read_sched_clock = cd.actual_read_sched_clock; } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7e5d3524e924..6c9c342dd0e5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -84,13 +84,15 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -162,9 +164,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); next = tick_next_period; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a792d21cac64..3e2dc9b8858c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -65,7 +65,8 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevaluate with jiffies_lock held */ - 
write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, last_jiffies_update); if (delta >= tick_period) { @@ -91,10 +92,12 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } else { - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return; } - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -105,12 +108,14 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ if (last_jiffies_update == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return period; } @@ -240,6 +245,7 @@ static void nohz_full_kick_func(struct irq_work *work) static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_func, + .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ), }; /* @@ -676,10 +682,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ca69290bee2a..9ebaab13339d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1005,9 +1005,8 @@ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) 
((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) return -EOVERFLOW; tmp *= mult; - rem *= mult; - do_div(rem, div); + rem = div64_u64(rem * mult, div); *base = tmp + rem; return 0; } @@ -2397,8 +2396,10 @@ EXPORT_SYMBOL(hardpps); */ void xtime_update(unsigned long ticks) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); do_timer(ticks); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 141ab3ab0354..099737f6f10c 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -25,7 +25,8 @@ static inline void sched_clock_resume(void) { } extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -extern seqlock_t jiffies_lock; +extern raw_spinlock_t jiffies_lock; +extern seqcount_t jiffies_seq; #define CS_NAME_LEN 32 diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4820823515e9..a5221abb4594 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -944,6 +944,7 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, #define MOD_TIMER_PENDING_ONLY 0x01 #define MOD_TIMER_REDUCE 0x02 +#define MOD_TIMER_NOTPENDING 0x04 static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) @@ -960,7 +961,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option * the timer is re-modified to have the same timeout or ends up in the * same array bucket then just return: */ - if (timer_pending(timer)) { + if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { /* * The downside of this optimization is that it can result in * larger granularity than you would get from adding a new @@ -1133,7 +1134,7 @@ EXPORT_SYMBOL(timer_reduce); void add_timer(struct timer_list *timer) { BUG_ON(timer_pending(timer)); - mod_timer(timer, timer->expires); + 
__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer); @@ -1828,21 +1829,23 @@ static void process_timeout(struct timer_list *t) * schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). + * Make the current task sleep until @timeout jiffies have elapsed. + * The function behavior depends on the current task state + * (see also set_current_state() description): * - * You can set the task state as follows - + * %TASK_RUNNING - the scheduler is called, but the task does not sleep + * at all. That happens because sched_submit_work() does nothing for + * tasks in %TASK_RUNNING state. * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process())". + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * - * The current task state is guaranteed to be TASK_RUNNING when this + * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule @@ -1850,7 +1853,7 @@ static void process_timeout(struct timer_list *t) * value will be %MAX_SCHEDULE_TIMEOUT. * * Returns 0 when the timer has expired otherwise the remaining time in - * jiffies will be returned. In all cases the return value is guaranteed + * jiffies will be returned. In all cases the return value is guaranteed * to be non-negative. 
*/ signed long __sched schedule_timeout(signed long timeout) @@ -1891,7 +1894,7 @@ signed long __sched schedule_timeout(signed long timeout) timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, 0); + __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); schedule(); del_singleshot_timer_sync(&timer.timer); diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 9577c89179cd..54ce6eb2ca36 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -71,13 +71,15 @@ void update_vsyscall(struct timekeeper *tk) { struct vdso_data *vdata = __arch_get_k_vdso_data(); struct vdso_timestamp *vdso_ts; + s32 clock_mode; u64 nsec; /* copy vsyscall data */ vdso_write_begin(vdata); - vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk); - vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk); + clock_mode = tk->tkr_mono.clock->vdso_clock_mode; + vdata[CS_HRES_COARSE].clock_mode = clock_mode; + vdata[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; @@ -103,10 +105,10 @@ void update_vsyscall(struct timekeeper *tk) WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); /* - * Architectures can opt out of updating the high resolution part - * of the VDSO. + * If the current clocksource is not VDSO capable, then spare the + * update of the high reolution parts. */ - if (__arch_update_vdso_data()) + if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); diff --git a/kernel/torture.c b/kernel/torture.c index 7c13f5558b71..a1a41484ff6d 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -42,6 +42,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. 
McKenney <paulmck@linux.ibm.com>"); +static bool disable_onoff_at_boot; +module_param(disable_onoff_at_boot, bool, 0444); + static char *torture_type; static int verbose; @@ -84,6 +87,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, { unsigned long delta; int ret; + char *s; unsigned long starttime; if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) @@ -97,12 +101,18 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, torture_type, cpu); starttime = jiffies; (*n_offl_attempts)++; - ret = cpu_down(cpu); + ret = remove_cpu(cpu); if (ret) { + s = ""; + if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) { + // PCI probe frequently disables hotplug during boot. + (*n_offl_attempts)--; + s = " (-EBUSY forgiven during boot)"; + } if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); + "torture_onoff task: offline %d failed%s: errno %d\n", + torture_type, cpu, s, ret); } else { if (verbose > 1) pr_alert("%s" TORTURE_FLAG @@ -137,6 +147,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, { unsigned long delta; int ret; + char *s; unsigned long starttime; if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) @@ -148,12 +159,18 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, torture_type, cpu); starttime = jiffies; (*n_onl_attempts)++; - ret = cpu_up(cpu); + ret = add_cpu(cpu); if (ret) { + s = ""; + if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) { + // PCI probe frequently disables hotplug during boot. 
+ (*n_onl_attempts)--; + s = " (-EBUSY forgiven during boot)"; + } if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); + "torture_onoff task: online %d failed%s: errno %d\n", + torture_type, cpu, s, ret); } else { if (verbose > 1) pr_alert("%s" TORTURE_FLAG @@ -192,17 +209,18 @@ torture_onoff(void *arg) for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); - if (!IS_MODULE(CONFIG_TORTURE_TEST)) + if (!IS_MODULE(CONFIG_TORTURE_TEST)) { for_each_possible_cpu(cpu) { if (cpu_online(cpu)) continue; - ret = cpu_up(cpu); + ret = add_cpu(cpu); if (ret && verbose) { pr_alert("%s" TORTURE_FLAG "%s: Initial online %d: errno %d\n", __func__, torture_type, cpu, ret); } } + } if (maxcpu == 0) { VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); @@ -215,6 +233,10 @@ torture_onoff(void *arg) VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); } while (!torture_must_stop()) { + if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) { + schedule_timeout_interruptible(HZ / 10); + continue; + } cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); if (!torture_offline(cpu, &n_offline_attempts, &n_offline_successes, diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4560878f0bac..ca39dc3230cb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1896,8 +1896,11 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (bt == NULL) + if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); + } if (ret == 0) { if (attr == &dev_attr_act_mask) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19e793aa441a..68250d433bd7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -732,7 +732,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) if (unlikely(!nmi_uaccess_okay())) return -EPERM; - if 
(in_nmi()) { + if (irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f7ee102868a..fd81c7de77a7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1547,6 +1547,8 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) rec = bsearch(&key, pg->records, pg->index, sizeof(struct dyn_ftrace), ftrace_cmp_recs); + if (rec) + break; } return rec; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 301db4406bc3..4e01c448b4b4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1411,14 +1411,16 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; rcu_read_lock(); retry: - if (req_cpu == WORK_CPU_UNBOUND) - cpu = wq_select_unbound_cpu(raw_smp_processor_id()); - /* pwq which will be used unless @work is executing elsewhere */ - if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - else + if (wq->flags & WQ_UNBOUND) { + if (req_cpu == WORK_CPU_UNBOUND) + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); + } else { + if (req_cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + } /* * If @work was previously on a different pool, it might still be |