summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/bpf_struct_ops.c14
-rw-r--r--kernel/bpf/btf.c5
-rw-r--r--kernel/bpf/cgroup.c7
-rw-r--r--kernel/bpf/syscall.c39
-rw-r--r--kernel/cgroup/cgroup-v1.c3
-rw-r--r--kernel/cgroup/cgroup.c43
-rw-r--r--kernel/context_tracking.c2
-rw-r--r--kernel/cpu.c147
-rw-r--r--kernel/events/core.c357
-rw-r--r--kernel/exit.c16
-rw-r--r--kernel/fork.c6
-rw-r--r--kernel/futex.c194
-rw-r--r--kernel/irq/Kconfig5
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/irq/debugfs.c28
-rw-r--r--kernel/irq/handle.c7
-rw-r--r--kernel/irq/internals.h10
-rw-r--r--kernel/irq/irqdesc.c6
-rw-r--r--kernel/irq/irqdomain.c14
-rw-r--r--kernel/irq/manage.c11
-rw-r--r--kernel/irq/resend.c143
-rw-r--r--kernel/irq_work.c2
-rw-r--r--kernel/kthread.c17
-rw-r--r--kernel/locking/lockdep.c686
-rw-r--r--kernel/locking/lockdep_internals.h14
-rw-r--r--kernel/locking/lockdep_proc.c31
-rw-r--r--kernel/locking/locktorture.c15
-rw-r--r--kernel/locking/mutex-debug.c2
-rw-r--r--kernel/locking/percpu-rwsem.c194
-rw-r--r--kernel/locking/rtmutex.c2
-rw-r--r--kernel/locking/rwsem.c9
-rw-r--r--kernel/locking/rwsem.h10
-rw-r--r--kernel/locking/spinlock_debug.c6
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/pid.c10
-rw-r--r--kernel/power/qos.c559
-rw-r--r--kernel/power/user.c16
-rw-r--r--kernel/rcu/Makefile4
-rw-r--r--kernel/rcu/rcu.h23
-rw-r--r--kernel/rcu/rcu_segcblist.c4
-rw-r--r--kernel/rcu/rcuperf.c14
-rw-r--r--kernel/rcu/rcutorture.c67
-rw-r--r--kernel/rcu/srcutree.c18
-rw-r--r--kernel/rcu/tree.c453
-rw-r--r--kernel/rcu/tree.h4
-rw-r--r--kernel/rcu/tree_exp.h13
-rw-r--r--kernel/rcu/tree_plugin.h25
-rw-r--r--kernel/rcu/tree_stall.h41
-rw-r--r--kernel/rcu/update.c52
-rw-r--r--kernel/sched/completion.c36
-rw-r--r--kernel/sched/core.c36
-rw-r--r--kernel/sched/cpupri.c158
-rw-r--r--kernel/sched/cpupri.h6
-rw-r--r--kernel/sched/cputime.c41
-rw-r--r--kernel/sched/deadline.c6
-rw-r--r--kernel/sched/debug.c17
-rw-r--r--kernel/sched/fair.c793
-rw-r--r--kernel/sched/pelt.c90
-rw-r--r--kernel/sched/pelt.h31
-rw-r--r--kernel/sched/psi.c111
-rw-r--r--kernel/sched/rt.c66
-rw-r--r--kernel/sched/sched.h73
-rw-r--r--kernel/sched/stats.h31
-rw-r--r--kernel/sched/swait.c15
-rw-r--r--kernel/sched/topology.c27
-rw-r--r--kernel/seccomp.c15
-rw-r--r--kernel/smp.c23
-rw-r--r--kernel/softirq.c15
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/task_work.c18
-rw-r--r--kernel/time/clocksource.c9
-rw-r--r--kernel/time/hrtimer.c8
-rw-r--r--kernel/time/jiffies.c7
-rw-r--r--kernel/time/namespace.c7
-rw-r--r--kernel/time/posix-cpu-timers.c154
-rw-r--r--kernel/time/posix-timers.c3
-rw-r--r--kernel/time/sched_clock.c9
-rw-r--r--kernel/time/tick-common.c10
-rw-r--r--kernel/time/tick-sched.c20
-rw-r--r--kernel/time/timekeeping.c9
-rw-r--r--kernel/time/timekeeping.h3
-rw-r--r--kernel/time/timer.c23
-rw-r--r--kernel/time/vsyscall.c12
-rw-r--r--kernel/torture.c38
-rw-r--r--kernel/trace/blktrace.c5
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--kernel/trace/ftrace.c2
-rw-r--r--kernel/workqueue.c14
88 files changed, 3447 insertions, 1850 deletions
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 042f95534f86..68a89a9f7ccd 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -482,13 +482,21 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
- if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) {
+ switch (prev_state) {
+ case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
if (refcount_dec_and_test(&st_map->kvalue.refcnt))
bpf_map_put(map);
+ return 0;
+ case BPF_STRUCT_OPS_STATE_TOBEFREE:
+ return -EINPROGRESS;
+ case BPF_STRUCT_OPS_STATE_INIT:
+ return -ENOENT;
+ default:
+ WARN_ON_ONCE(1);
+ /* Should never happen. Treat it as not found. */
+ return -ENOENT;
}
-
- return 0;
}
static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 787140095e58..7787bdcb5d68 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2418,7 +2418,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env,
struct_size = struct_type->size;
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
- if (struct_size - bytes_offset < sizeof(int)) {
+ if (struct_size - bytes_offset < member_type->size) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
@@ -4564,7 +4564,7 @@ int btf_get_info_by_fd(const struct btf *btf,
union bpf_attr __user *uattr)
{
struct bpf_btf_info __user *uinfo;
- struct bpf_btf_info info = {};
+ struct bpf_btf_info info;
u32 info_copy, btf_copy;
void __user *ubtf;
u32 uinfo_len;
@@ -4573,6 +4573,7 @@ int btf_get_info_by_fd(const struct btf *btf,
uinfo_len = attr->info.info_len;
info_copy = min_t(u32, uinfo_len, sizeof(info));
+ memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_copy))
return -EFAULT;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9a500fadbef5..4f1472409ef8 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -227,6 +227,9 @@ cleanup:
for (i = 0; i < NR; i++)
bpf_prog_array_free(arrays[i]);
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_put(p);
+
percpu_ref_exit(&cgrp->bpf.refcnt);
return -ENOMEM;
@@ -302,8 +305,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
- struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
- *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_prog_list *pl, *replace_pl = NULL;
enum bpf_cgroup_storage_type stype;
int err;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a91ad518c050..966b7b34cde0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -696,14 +696,15 @@ int bpf_get_file_flag(int flags)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
- * Return 0 on success and < 0 on error.
+/* dst and src must have at least "size" number of bytes.
+ * Return strlen on success and < 0 on error.
*/
-static int bpf_obj_name_cpy(char *dst, const char *src)
+int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
{
- const char *end = src + BPF_OBJ_NAME_LEN;
+ const char *end = src + size;
+ const char *orig_src = src;
- memset(dst, 0, BPF_OBJ_NAME_LEN);
+ memset(dst, 0, size);
/* Copy all isalnum(), '_' and '.' chars. */
while (src < end && *src) {
if (!isalnum(*src) &&
@@ -712,11 +713,11 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
*dst++ = *src++;
}
- /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+ /* No '\0' found in "size" number of bytes */
if (src == end)
return -EINVAL;
- return 0;
+ return src - orig_src;
}
int map_check_no_btf(const struct bpf_map *map,
@@ -810,8 +811,9 @@ static int map_create(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = bpf_obj_name_cpy(map->name, attr->map_name);
- if (err)
+ err = bpf_obj_name_cpy(map->name, attr->map_name,
+ sizeof(attr->map_name));
+ if (err < 0)
goto free_map;
atomic64_set(&map->refcnt, 1);
@@ -1510,6 +1512,11 @@ static int map_freeze(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ fdput(f);
+ return -ENOTSUPP;
+ }
+
mutex_lock(&map->freeze_mutex);
if (map->writecnt) {
@@ -2093,8 +2100,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
goto free_prog;
prog->aux->load_time = ktime_get_boottime_ns();
- err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
- if (err)
+ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
+ sizeof(attr->prog_name));
+ if (err < 0)
goto free_prog;
/* run eBPF verifier */
@@ -2787,7 +2795,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
union bpf_attr __user *uattr)
{
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
- struct bpf_prog_info info = {};
+ struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
struct bpf_prog_stats stats;
char __user *uinsns;
@@ -2799,6 +2807,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
return err;
info_len = min_t(u32, sizeof(info), info_len);
+ memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_len))
return -EFAULT;
@@ -3062,7 +3071,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
union bpf_attr __user *uattr)
{
struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
- struct bpf_map_info info = {};
+ struct bpf_map_info info;
u32 info_len = attr->info.info_len;
int err;
@@ -3071,6 +3080,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
return err;
info_len = min_t(u32, sizeof(info), info_len);
+ memset(&info, 0, sizeof(info));
info.type = map->map_type;
info.id = map->id;
info.key_size = map->key_size;
@@ -3354,7 +3364,7 @@ err_put:
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
- union bpf_attr attr = {};
+ union bpf_attr attr;
int err;
if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
@@ -3366,6 +3376,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
size = min_t(u32, size, sizeof(attr));
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ memset(&attr, 0, sizeof(attr));
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index be1a1c83cdd1..f2d7cea86ffe 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -471,6 +471,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
*/
p++;
if (p >= end) {
+ (*pos)++;
return NULL;
} else {
*pos = *p;
@@ -782,7 +783,7 @@ void cgroup1_release_agent(struct work_struct *work)
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!pathbuf || !agentbuf)
+ if (!pathbuf || !agentbuf || !strlen(agentbuf))
goto out;
spin_lock_irq(&css_set_lock);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 75f687301bbf..3dead0416b91 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3542,21 +3542,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_CPU);
}
@@ -4400,12 +4400,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
}
} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
- if (!list_empty(&cset->tasks))
+ if (!list_empty(&cset->tasks)) {
it->task_pos = cset->tasks.next;
- else if (!list_empty(&cset->mg_tasks))
+ it->cur_tasks_head = &cset->tasks;
+ } else if (!list_empty(&cset->mg_tasks)) {
it->task_pos = cset->mg_tasks.next;
- else
+ it->cur_tasks_head = &cset->mg_tasks;
+ } else {
it->task_pos = cset->dying_tasks.next;
+ it->cur_tasks_head = &cset->dying_tasks;
+ }
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
@@ -4463,10 +4467,14 @@ repeat:
else
it->task_pos = it->task_pos->next;
- if (it->task_pos == it->tasks_head)
+ if (it->task_pos == it->tasks_head) {
it->task_pos = it->mg_tasks_head->next;
- if (it->task_pos == it->mg_tasks_head)
+ it->cur_tasks_head = it->mg_tasks_head;
+ }
+ if (it->task_pos == it->mg_tasks_head) {
it->task_pos = it->dying_tasks_head->next;
+ it->cur_tasks_head = it->dying_tasks_head;
+ }
if (it->task_pos == it->dying_tasks_head)
css_task_iter_advance_css_set(it);
} else {
@@ -4485,11 +4493,12 @@ repeat:
goto repeat;
/* and dying leaders w/o live member threads */
- if (!atomic_read(&task->signal->live))
+ if (it->cur_tasks_head == it->dying_tasks_head &&
+ !atomic_read(&task->signal->live))
goto repeat;
} else {
/* skip all dying ones */
- if (task->flags & PF_EXITING)
+ if (it->cur_tasks_head == it->dying_tasks_head)
goto repeat;
}
}
@@ -4595,6 +4604,9 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
struct kernfs_open_file *of = s->private;
struct css_task_iter *it = of->priv;
+ if (pos)
+ (*pos)++;
+
return css_task_iter_next(it);
}
@@ -4610,7 +4622,7 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
* from position 0, so we can simply keep iterating on !0 *pos.
*/
if (!it) {
- if (WARN_ON_ONCE((*pos)++))
+ if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
it = kzalloc(sizeof(*it), GFP_KERNEL);
@@ -4618,10 +4630,11 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
return ERR_PTR(-ENOMEM);
of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
- } else if (!(*pos)++) {
+ } else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
- }
+ } else
+ return it->cur_task;
return cgroup_procs_next(s, NULL, NULL);
}
@@ -6258,6 +6271,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
return;
}
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt())
+ return;
+
rcu_read_lock();
while (true) {
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 0296b4bda8f1..ce430885c26c 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -198,11 +198,13 @@ void __init context_tracking_cpu_set(int cpu)
if (initialized)
return;
+#ifdef CONFIG_HAVE_TIF_NOHZ
/*
* Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork
* This assumes that init is the only task at this early boot stage.
*/
set_tsk_thread_flag(&init_task, TIF_NOHZ);
+#endif
WARN_ON_ONCE(!tasklist_empty());
initialized = true;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9c706af713fb..2371292f30b0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void)
static void lockdep_acquire_cpus_lock(void)
{
- rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
+ rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
}
static void lockdep_release_cpus_lock(void)
{
- rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_);
+ rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}
/*
@@ -1041,7 +1041,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
return _cpu_down(cpu, 0, target);
}
-static int do_cpu_down(unsigned int cpu, enum cpuhp_state target)
+static int cpu_down(unsigned int cpu, enum cpuhp_state target)
{
int err;
@@ -1051,11 +1051,72 @@ static int do_cpu_down(unsigned int cpu, enum cpuhp_state target)
return err;
}
-int cpu_down(unsigned int cpu)
+/**
+ * cpu_device_down - Bring down a cpu device
+ * @dev: Pointer to the cpu device to offline
+ *
+ * This function is meant to be used by device core cpu subsystem only.
+ *
+ * Other subsystems should use remove_cpu() instead.
+ */
+int cpu_device_down(struct device *dev)
{
- return do_cpu_down(cpu, CPUHP_OFFLINE);
+ return cpu_down(dev->id, CPUHP_OFFLINE);
+}
+
+int remove_cpu(unsigned int cpu)
+{
+ int ret;
+
+ lock_device_hotplug();
+ ret = device_offline(get_cpu_device(cpu));
+ unlock_device_hotplug();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(remove_cpu);
+
+void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
+{
+ unsigned int cpu;
+ int error;
+
+ cpu_maps_update_begin();
+
+ /*
+ * Make certain the cpu I'm about to reboot on is online.
+ *
+ * This is inline to what migrate_to_reboot_cpu() already do.
+ */
+ if (!cpu_online(primary_cpu))
+ primary_cpu = cpumask_first(cpu_online_mask);
+
+ for_each_online_cpu(cpu) {
+ if (cpu == primary_cpu)
+ continue;
+
+ error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (error) {
+ pr_err("Failed to offline CPU%d - error=%d",
+ cpu, error);
+ break;
+ }
+ }
+
+ /*
+ * Ensure all but the reboot CPU are offline.
+ */
+ BUG_ON(num_online_cpus() > 1);
+
+ /*
+ * Make sure the CPUs won't be enabled by someone else after this
+ * point. Kexec will reboot to a new kernel shortly resetting
+ * everything along the way.
+ */
+ cpu_hotplug_disabled++;
+
+ cpu_maps_update_done();
}
-EXPORT_SYMBOL(cpu_down);
#else
#define takedown_cpu NULL
@@ -1124,8 +1185,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
}
/*
- * The caller of do_cpu_up might have raced with another
- * caller. Ignore it for now.
+ * The caller of cpu_up() might have raced with another
+ * caller. Nothing to do.
*/
if (st->state >= target)
goto out;
@@ -1169,7 +1230,7 @@ out:
return ret;
}
-static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
+static int cpu_up(unsigned int cpu, enum cpuhp_state target)
{
int err = 0;
@@ -1203,16 +1264,70 @@ out:
return err;
}
-int cpu_up(unsigned int cpu)
+/**
+ * cpu_device_up - Bring up a cpu device
+ * @dev: Pointer to the cpu device to online
+ *
+ * This function is meant to be used by device core cpu subsystem only.
+ *
+ * Other subsystems should use add_cpu() instead.
+ */
+int cpu_device_up(struct device *dev)
+{
+ return cpu_up(dev->id, CPUHP_ONLINE);
+}
+
+int add_cpu(unsigned int cpu)
+{
+ int ret;
+
+ lock_device_hotplug();
+ ret = device_online(get_cpu_device(cpu));
+ unlock_device_hotplug();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(add_cpu);
+
+/**
+ * bringup_hibernate_cpu - Bring up the CPU that we hibernated on
+ * @sleep_cpu: The cpu we hibernated on and should be brought up.
+ *
+ * On some architectures like arm64, we can hibernate on any CPU, but on
+ * wake up the CPU we hibernated on might be offline as a side effect of
+ * using maxcpus= for example.
+ */
+int bringup_hibernate_cpu(unsigned int sleep_cpu)
{
- return do_cpu_up(cpu, CPUHP_ONLINE);
+ int ret;
+
+ if (!cpu_online(sleep_cpu)) {
+ pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");
+ ret = cpu_up(sleep_cpu, CPUHP_ONLINE);
+ if (ret) {
+ pr_err("Failed to bring hibernate-CPU up!\n");
+ return ret;
+ }
+ }
+ return 0;
+}
+
+void bringup_nonboot_cpus(unsigned int setup_max_cpus)
+{
+ unsigned int cpu;
+
+ for_each_present_cpu(cpu) {
+ if (num_online_cpus() >= setup_max_cpus)
+ break;
+ if (!cpu_online(cpu))
+ cpu_up(cpu, CPUHP_ONLINE);
+ }
}
-EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
-int freeze_secondary_cpus(int primary)
+int __freeze_secondary_cpus(int primary, bool suspend)
{
int cpu, error = 0;
@@ -1237,7 +1352,7 @@ int freeze_secondary_cpus(int primary)
if (cpu == primary)
continue;
- if (pm_wakeup_pending()) {
+ if (suspend && pm_wakeup_pending()) {
pr_info("Wakeup pending. Abort CPU freeze\n");
error = -EBUSY;
break;
@@ -2028,9 +2143,9 @@ static ssize_t write_cpuhp_target(struct device *dev,
goto out;
if (st->state < target)
- ret = do_cpu_up(dev->id, target);
+ ret = cpu_up(dev->id, target);
else
- ret = do_cpu_down(dev->id, target);
+ ret = cpu_down(dev->id, target);
out:
unlock_device_hotplug();
return ret ? ret : count;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e453589da97c..d22e4ba59dfa 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,6 +49,7 @@
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
+#include <linux/min_heap.h>
#include "internal.h"
@@ -891,6 +892,47 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
rcu_read_unlock();
}
+static int perf_cgroup_ensure_storage(struct perf_event *event,
+ struct cgroup_subsys_state *css)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event **storage;
+ int cpu, heap_size, ret = 0;
+
+ /*
+ * Allow storage to have sufficent space for an iterator for each
+ * possibly nested cgroup plus an iterator for events with no cgroup.
+ */
+ for (heap_size = 1; css; css = css->parent)
+ heap_size++;
+
+ for_each_possible_cpu(cpu) {
+ cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ if (heap_size <= cpuctx->heap_size)
+ continue;
+
+ storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!storage) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ raw_spin_lock_irq(&cpuctx->ctx.lock);
+ if (cpuctx->heap_size < heap_size) {
+ swap(cpuctx->heap, storage);
+ if (storage == cpuctx->heap_default)
+ storage = NULL;
+ cpuctx->heap_size = heap_size;
+ }
+ raw_spin_unlock_irq(&cpuctx->ctx.lock);
+
+ kfree(storage);
+ }
+
+ return ret;
+}
+
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
@@ -910,6 +952,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
goto out;
}
+ ret = perf_cgroup_ensure_storage(event, css);
+ if (ret)
+ goto out;
+
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
@@ -1531,6 +1577,30 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
if (left->cpu > right->cpu)
return false;
+#ifdef CONFIG_CGROUP_PERF
+ if (left->cgrp != right->cgrp) {
+ if (!left->cgrp || !left->cgrp->css.cgroup) {
+ /*
+ * Left has no cgroup but right does, no cgroups come
+ * first.
+ */
+ return true;
+ }
+ if (!right->cgrp || !right->cgrp->css.cgroup) {
+ /*
+ * Right has no cgroup but left does, no cgroups come
+ * first.
+ */
+ return false;
+ }
+ /* Two dissimilar cgroups, order by id. */
+ if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
+ return true;
+
+ return false;
+ }
+#endif
+
if (left->group_index < right->group_index)
return true;
if (left->group_index > right->group_index)
@@ -1610,25 +1680,48 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
}
/*
- * Get the leftmost event in the @cpu subtree.
+ * Get the leftmost event in the cpu/cgroup subtree.
*/
static struct perf_event *
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
+ struct cgroup *cgrp)
{
struct perf_event *node_event = NULL, *match = NULL;
struct rb_node *node = groups->tree.rb_node;
+#ifdef CONFIG_CGROUP_PERF
+ u64 node_cgrp_id, cgrp_id = 0;
+
+ if (cgrp)
+ cgrp_id = cgrp->kn->id;
+#endif
while (node) {
node_event = container_of(node, struct perf_event, group_node);
if (cpu < node_event->cpu) {
node = node->rb_left;
- } else if (cpu > node_event->cpu) {
+ continue;
+ }
+ if (cpu > node_event->cpu) {
node = node->rb_right;
- } else {
- match = node_event;
+ continue;
+ }
+#ifdef CONFIG_CGROUP_PERF
+ node_cgrp_id = 0;
+ if (node_event->cgrp && node_event->cgrp->css.cgroup)
+ node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+
+ if (cgrp_id < node_cgrp_id) {
node = node->rb_left;
+ continue;
+ }
+ if (cgrp_id > node_cgrp_id) {
+ node = node->rb_right;
+ continue;
}
+#endif
+ match = node_event;
+ node = node->rb_left;
}
return match;
@@ -1641,12 +1734,26 @@ static struct perf_event *
perf_event_groups_next(struct perf_event *event)
{
struct perf_event *next;
+#ifdef CONFIG_CGROUP_PERF
+ u64 curr_cgrp_id = 0;
+ u64 next_cgrp_id = 0;
+#endif
next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
- if (next && next->cpu == event->cpu)
- return next;
+ if (next == NULL || next->cpu != event->cpu)
+ return NULL;
- return NULL;
+#ifdef CONFIG_CGROUP_PERF
+ if (event->cgrp && event->cgrp->css.cgroup)
+ curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+
+ if (next->cgrp && next->cgrp->css.cgroup)
+ next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+
+ if (curr_cgrp_id != next_cgrp_id)
+ return NULL;
+#endif
+ return next;
}
/*
@@ -1986,6 +2093,12 @@ static int perf_get_aux_event(struct perf_event *event,
return 1;
}
+static inline struct list_head *get_event_list(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+}
+
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
@@ -2028,12 +2141,8 @@ static void perf_group_detach(struct perf_event *event)
if (!RB_EMPTY_NODE(&event->group_node)) {
add_event_to_groups(sibling, event->ctx);
- if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
- struct list_head *list = sibling->attr.pinned ?
- &ctx->pinned_active : &ctx->flexible_active;
-
- list_add_tail(&sibling->active_list, list);
- }
+ if (sibling->state == PERF_EVENT_STATE_ACTIVE)
+ list_add_tail(&sibling->active_list, get_event_list(sibling));
}
WARN_ON_ONCE(sibling->ctx != event->ctx);
@@ -2182,6 +2291,7 @@ __perf_remove_from_context(struct perf_event *event,
if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
+ ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
@@ -2350,6 +2460,8 @@ event_sched_in(struct perf_event *event,
{
int ret = 0;
+ WARN_ON_ONCE(event->ctx != ctx);
+
lockdep_assert_held(&ctx->lock);
if (event->state <= PERF_EVENT_STATE_OFF)
@@ -3077,12 +3189,6 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
- /*
- * If we had been multiplexing, no rotations are necessary, now no events
- * are active.
- */
- ctx->rotate_necessary = 0;
-
perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
@@ -3092,6 +3198,13 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (is_active & EVENT_FLEXIBLE) {
list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
group_sched_out(event, cpuctx, ctx);
+
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, is will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ ctx->rotate_necessary = 0;
}
perf_pmu_enable(ctx->pmu);
}
@@ -3388,71 +3501,103 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}
-static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
- int (*func)(struct perf_event *, void *), void *data)
+static bool perf_less_group_idx(const void *l, const void *r)
{
- struct perf_event **evt, *evt1, *evt2;
- int ret;
-
- evt1 = perf_event_groups_first(groups, -1);
- evt2 = perf_event_groups_first(groups, cpu);
-
- while (evt1 || evt2) {
- if (evt1 && evt2) {
- if (evt1->group_index < evt2->group_index)
- evt = &evt1;
- else
- evt = &evt2;
- } else if (evt1) {
- evt = &evt1;
- } else {
- evt = &evt2;
- }
+ const struct perf_event *le = l, *re = r;
- ret = func(*evt, data);
- if (ret)
- return ret;
+ return le->group_index < re->group_index;
+}
- *evt = perf_event_groups_next(*evt);
- }
+static void swap_ptr(void *l, void *r)
+{
+ void **lp = l, **rp = r;
- return 0;
+ swap(*lp, *rp);
}
-struct sched_in_data {
- struct perf_event_context *ctx;
- struct perf_cpu_context *cpuctx;
- int can_add_hw;
+static const struct min_heap_callbacks perf_min_heap = {
+ .elem_size = sizeof(struct perf_event *),
+ .less = perf_less_group_idx,
+ .swp = swap_ptr,
};
-static int pinned_sched_in(struct perf_event *event, void *data)
+static void __heap_add(struct min_heap *heap, struct perf_event *event)
{
- struct sched_in_data *sid = data;
+ struct perf_event **itrs = heap->data;
- if (event->state <= PERF_EVENT_STATE_OFF)
- return 0;
+ if (event) {
+ itrs[heap->nr] = event;
+ heap->nr++;
+ }
+}
- if (!event_filter_match(event))
- return 0;
+static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+ struct perf_event_groups *groups, int cpu,
+ int (*func)(struct perf_event *, void *),
+ void *data)
+{
+#ifdef CONFIG_CGROUP_PERF
+ struct cgroup_subsys_state *css = NULL;
+#endif
+ /* Space for per CPU and/or any CPU event iterators. */
+ struct perf_event *itrs[2];
+ struct min_heap event_heap;
+ struct perf_event **evt;
+ int ret;
+
+ if (cpuctx) {
+ event_heap = (struct min_heap){
+ .data = cpuctx->heap,
+ .nr = 0,
+ .size = cpuctx->heap_size,
+ };
+
+ lockdep_assert_held(&cpuctx->ctx.lock);
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
- list_add_tail(&event->active_list, &sid->ctx->pinned_active);
+#ifdef CONFIG_CGROUP_PERF
+ if (cpuctx->cgrp)
+ css = &cpuctx->cgrp->css;
+#endif
+ } else {
+ event_heap = (struct min_heap){
+ .data = itrs,
+ .nr = 0,
+ .size = ARRAY_SIZE(itrs),
+ };
+ /* Events not within a CPU context may be on any CPU. */
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
}
+ evt = event_heap.data;
- /*
- * If this pinned group hasn't been scheduled,
- * put it in error state.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+
+#ifdef CONFIG_CGROUP_PERF
+ for (; css; css = css->parent)
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+#endif
+
+ min_heapify_all(&event_heap, &perf_min_heap);
+
+ while (event_heap.nr) {
+ ret = func(*evt, data);
+ if (ret)
+ return ret;
+
+ *evt = perf_event_groups_next(*evt);
+ if (*evt)
+ min_heapify(&event_heap, 0, &perf_min_heap);
+ else
+ min_heap_pop(&event_heap, &perf_min_heap);
+ }
return 0;
}
-static int flexible_sched_in(struct perf_event *event, void *data)
+static int merge_sched_in(struct perf_event *event, void *data)
{
- struct sched_in_data *sid = data;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ int *can_add_hw = data;
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -3460,14 +3605,17 @@ static int flexible_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
- int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
- if (ret) {
- sid->can_add_hw = 0;
- sid->ctx->rotate_necessary = 1;
- return 0;
- }
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
+ if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+ if (!group_sched_in(event, cpuctx, ctx))
+ list_add_tail(&event->active_list, get_event_list(event));
+ }
+
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ if (event->attr.pinned)
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+
+ *can_add_hw = 0;
+ ctx->rotate_necessary = 1;
}
return 0;
@@ -3477,30 +3625,28 @@ static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
- struct sched_in_data sid = {
- .ctx = ctx,
- .cpuctx = cpuctx,
- .can_add_hw = 1,
- };
+ int can_add_hw = 1;
- visit_groups_merge(&ctx->pinned_groups,
+ if (ctx != &cpuctx->ctx)
+ cpuctx = NULL;
+
+ visit_groups_merge(cpuctx, &ctx->pinned_groups,
smp_processor_id(),
- pinned_sched_in, &sid);
+ merge_sched_in, &can_add_hw);
}
static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
- struct sched_in_data sid = {
- .ctx = ctx,
- .cpuctx = cpuctx,
- .can_add_hw = 1,
- };
+ int can_add_hw = 1;
- visit_groups_merge(&ctx->flexible_groups,
+ if (ctx != &cpuctx->ctx)
+ cpuctx = NULL;
+
+ visit_groups_merge(cpuctx, &ctx->flexible_groups,
smp_processor_id(),
- flexible_sched_in, &sid);
+ merge_sched_in, &can_add_hw);
}
static void
@@ -3841,6 +3987,12 @@ ctx_event_to_rotate(struct perf_event_context *ctx)
typeof(*event), group_node);
}
+ /*
+ * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
+ * finds there are unschedulable events, it will set it again.
+ */
+ ctx->rotate_necessary = 0;
+
return event;
}
@@ -6555,6 +6707,11 @@ static void perf_output_read(struct perf_output_handle *handle,
perf_output_read_one(handle, event, enabled, running);
}
+static inline bool perf_sample_save_hw_index(struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
+}
+
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
@@ -6643,6 +6800,8 @@ void perf_output_sample(struct perf_output_handle *handle,
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
+ if (perf_sample_save_hw_index(event))
+ perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
} else {
/*
@@ -6836,6 +6995,9 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
int size = sizeof(u64); /* nr */
if (data->br_stack) {
+ if (perf_sample_save_hw_index(event))
+ size += sizeof(u64);
+
size += data->br_stack->nr
* sizeof(struct perf_branch_entry);
}
@@ -10349,6 +10511,9 @@ skip_type:
cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
__perf_mux_hrtimer_init(cpuctx, cpu);
+
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
got_cpu_context:
@@ -10794,12 +10959,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
- if (cgroup_fd != -1) {
- err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
- if (err)
- goto err_ns;
- }
-
pmu = perf_init_event(event);
if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
@@ -10821,6 +10980,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_pmu;
}
+ if (cgroup_fd != -1) {
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ if (err)
+ goto err_pmu;
+ }
+
err = exclusive_event_init(event);
if (err)
goto err_pmu;
@@ -10881,12 +11046,12 @@ err_per_task:
exclusive_event_destroy(event);
err_pmu:
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
if (event->destroy)
event->destroy(event);
module_put(pmu->module);
err_ns:
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
if (event->ns)
put_pid_ns(event->ns);
if (event->hw.target)
diff --git a/kernel/exit.c b/kernel/exit.c
index 2833ffb0c211..d70d47159640 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -103,17 +103,8 @@ static void __exit_signal(struct task_struct *tsk)
#ifdef CONFIG_POSIX_TIMERS
posix_cpu_timers_exit(tsk);
- if (group_dead) {
+ if (group_dead)
posix_cpu_timers_exit_group(tsk);
- } else {
- /*
- * This can only happen if the caller is de_thread().
- * FIXME: this is the temporary hack, we should teach
- * posix-cpu-timers to handle this case correctly.
- */
- if (unlikely(has_group_leader_pid(tsk)))
- posix_cpu_timers_exit_group(tsk);
- }
#endif
if (group_dead) {
@@ -258,6 +249,7 @@ void rcuwait_wake_up(struct rcuwait *w)
wake_up_process(task);
rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(rcuwait_wake_up);
/*
* Determine if a process group is "orphaned", according to the POSIX
@@ -619,8 +611,8 @@ static void forget_original_parent(struct task_struct *father,
reaper = find_new_reaper(father, reaper);
list_for_each_entry(p, &father->children, sibling) {
for_each_thread(p, t) {
- t->real_parent = reaper;
- BUG_ON((!t->ptrace) != (t->parent == father));
+ RCU_INIT_POINTER(t->real_parent, reaper);
+ BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
if (likely(!t->ptrace))
t->parent = t->real_parent;
if (t->pdeath_signal)
diff --git a/kernel/fork.c b/kernel/fork.c
index 60a1295f4384..d90af13431c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -397,8 +397,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
THREAD_SIZE / 1024 * account);
- mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
+ mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
}
@@ -1508,7 +1508,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
- rcu_assign_pointer(tsk->sighand, sig);
+ RCU_INIT_POINTER(tsk->sighand, sig);
if (!sig)
return -ENOMEM;
diff --git a/kernel/futex.c b/kernel/futex.c
index 0cf84c8664f2..b59532862bc0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -135,8 +135,7 @@
*
* Where (A) orders the waiters increment and the futex value read through
* atomic operations (see hb_waiters_inc) and where (B) orders the write
- * to futex and the waiters read -- this is done by the barriers for both
- * shared and private futexes in get_futex_key_refs().
+ * to futex and the waiters read (see hb_waiters_pending()).
*
* This yields the following case (where X:=waiters, Y:=futex):
*
@@ -331,17 +330,6 @@ static void compat_exit_robust_list(struct task_struct *curr);
static inline void compat_exit_robust_list(struct task_struct *curr) { }
#endif
-static inline void futex_get_mm(union futex_key *key)
-{
- mmgrab(key->private.mm);
- /*
- * Ensure futex_get_mm() implies a full barrier such that
- * get_futex_key() implies a full barrier. This is relied upon
- * as smp_mb(); (B), see the ordering comment above.
- */
- smp_mb__after_atomic();
-}
-
/*
* Reflects a new waiter being added to the waitqueue.
*/
@@ -370,6 +358,10 @@ static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
+ /*
+ * Full barrier (B), see the ordering comment above.
+ */
+ smp_mb();
return atomic_read(&hb->waiters);
#else
return 1;
@@ -385,9 +377,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
*/
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
- u32 hash = jhash2((u32*)&key->both.word,
- (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
key->both.offset);
+
return &futex_queues[hash & (futex_hashsize - 1)];
}
@@ -407,70 +399,6 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
&& key1->both.offset == key2->both.offset);
}
-/*
- * Take a reference to the resource addressed by a key.
- * Can be called while holding spinlocks.
- *
- */
-static void get_futex_key_refs(union futex_key *key)
-{
- if (!key->both.ptr)
- return;
-
- /*
- * On MMU less systems futexes are always "private" as there is no per
- * process address space. We need the smp wmb nevertheless - yes,
- * arch/blackfin has MMU less SMP ...
- */
- if (!IS_ENABLED(CONFIG_MMU)) {
- smp_mb(); /* explicit smp_mb(); (B) */
- return;
- }
-
- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
- case FUT_OFF_INODE:
- ihold(key->shared.inode); /* implies smp_mb(); (B) */
- break;
- case FUT_OFF_MMSHARED:
- futex_get_mm(key); /* implies smp_mb(); (B) */
- break;
- default:
- /*
- * Private futexes do not hold reference on an inode or
- * mm, therefore the only purpose of calling get_futex_key_refs
- * is because we need the barrier for the lockless waiter check.
- */
- smp_mb(); /* explicit smp_mb(); (B) */
- }
-}
-
-/*
- * Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held. This is
- * a no-op for private futexes, see comment in the get
- * counterpart.
- */
-static void drop_futex_key_refs(union futex_key *key)
-{
- if (!key->both.ptr) {
- /* If we're here then we tried to put a key we failed to get */
- WARN_ON_ONCE(1);
- return;
- }
-
- if (!IS_ENABLED(CONFIG_MMU))
- return;
-
- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
- case FUT_OFF_INODE:
- iput(key->shared.inode);
- break;
- case FUT_OFF_MMSHARED:
- mmdrop(key->private.mm);
- break;
- }
-}
-
enum futex_access {
FUTEX_READ,
FUTEX_WRITE
@@ -505,6 +433,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
return timeout;
}
+/*
+ * Generate a machine wide unique identifier for this inode.
+ *
+ * This relies on u64 not wrapping in the life-time of the machine; which with
+ * 1ns resolution means almost 585 years.
+ *
+ * This further relies on the fact that a well formed program will not unmap
+ * the file while it has a (shared) futex waiting on it. This mapping will have
+ * a file reference which pins the mount and inode.
+ *
+ * If for some reason an inode gets evicted and read back in again, it will get
+ * a new sequence number and will _NOT_ match, even though it is the exact same
+ * file.
+ *
+ * It is important that match_futex() will never have a false-positive, esp.
+ * for PI futexes that can mess up the state. The above argues that false-negatives
+ * are only possible for malformed programs.
+ */
+static u64 get_inode_sequence_number(struct inode *inode)
+{
+ static atomic64_t i_seq;
+ u64 old;
+
+ /* Does the inode already have a sequence number? */
+ old = atomic64_read(&inode->i_sequence);
+ if (likely(old))
+ return old;
+
+ for (;;) {
+ u64 new = atomic64_add_return(1, &i_seq);
+ if (WARN_ON_ONCE(!new))
+ continue;
+
+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
+ if (old)
+ return old;
+ return new;
+ }
+}
+
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
@@ -517,9 +485,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
*
* The key words are stored in @key on success.
*
- * For shared mappings, it's (page->index, file_inode(vma->vm_file),
- * offset_within_page). For private mappings, it's (uaddr, current->mm).
- * We can usually work out the index without swapping in the page.
+ * For shared mappings (when @fshared), the key is:
+ * ( inode->i_sequence, page->index, offset_within_page )
+ * [ also see get_inode_sequence_number() ]
+ *
+ * For private mappings (or when !@fshared), the key is:
+ * ( current->mm, address, 0 )
+ *
+ * This allows (cross process, where applicable) identification of the futex
+ * without keeping the page pinned for the duration of the FUTEX_WAIT.
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
@@ -556,7 +530,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
if (!fshared) {
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key); /* implies smp_mb(); (B) */
return 0;
}
@@ -659,8 +632,6 @@ again:
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key); /* implies smp_mb(); (B) */
-
} else {
struct inode *inode;
@@ -692,36 +663,8 @@ again:
goto again;
}
- /*
- * Take a reference unless it is about to be freed. Previously
- * this reference was taken by ihold under the page lock
- * pinning the inode in place so i_lock was unnecessary. The
- * only way for this check to fail is if the inode was
- * truncated in parallel which is almost certainly an
- * application bug. In such a case, just retry.
- *
- * We are not calling into get_futex_key_refs() in file-backed
- * cases, therefore a successful atomic_inc return below will
- * guarantee that get_futex_key() will still imply smp_mb(); (B).
- */
- if (!atomic_inc_not_zero(&inode->i_count)) {
- rcu_read_unlock();
- put_page(page);
-
- goto again;
- }
-
- /* Should be impossible but lets be paranoid for now */
- if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
- err = -EFAULT;
- rcu_read_unlock();
- iput(inode);
-
- goto out;
- }
-
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = inode;
+ key->shared.i_seq = get_inode_sequence_number(inode);
key->shared.pgoff = basepage_index(tail);
rcu_read_unlock();
}
@@ -733,7 +676,6 @@ out:
static inline void put_futex_key(union futex_key *key)
{
- drop_futex_key_refs(key);
}
/**
@@ -1723,10 +1665,9 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
oparg = 1 << oparg;
}
- if (!access_ok(uaddr, sizeof(u32)))
- return -EFAULT;
-
+ pagefault_disable();
ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
+ pagefault_enable();
if (ret)
return ret;
@@ -1868,7 +1809,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
}
- get_futex_key_refs(key2);
q->key = *key2;
}
@@ -1890,7 +1830,6 @@ static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
struct futex_hash_bucket *hb)
{
- get_futex_key_refs(key);
q->key = *key;
__unqueue_futex(q);
@@ -2001,7 +1940,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
u32 *cmpval, int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- int drop_count = 0, task_count = 0, ret;
+ int task_count = 0, ret;
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
@@ -2122,7 +2061,6 @@ retry_private:
*/
if (ret > 0) {
WARN_ON(pi_state);
- drop_count++;
task_count++;
/*
* If we acquired the lock, then the user space value
@@ -2242,7 +2180,6 @@ retry_private:
* doing so.
*/
requeue_pi_wake_futex(this, &key2, hb2);
- drop_count++;
continue;
} else if (ret) {
/*
@@ -2263,7 +2200,6 @@ retry_private:
}
}
requeue_futex(this, hb1, hb2, &key2);
- drop_count++;
}
/*
@@ -2278,15 +2214,6 @@ out_unlock:
wake_up_q(&wake_q);
hb_waiters_dec(hb2);
- /*
- * drop_futex_key_refs() must be called outside the spinlocks. During
- * the requeue we moved futex_q's from the hash bucket at key1 to the
- * one at key2 and updated their key pointer. We no longer need to
- * hold the references to key1.
- */
- while (--drop_count >= 0)
- drop_futex_key_refs(&key1);
-
out_put_keys:
put_futex_key(&key2);
out_put_key1:
@@ -2416,7 +2343,6 @@ retry:
ret = 1;
}
- drop_futex_key_refs(&q->key);
return ret;
}
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index f92d9a687372..20d501af4f2e 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -43,6 +43,10 @@ config GENERIC_IRQ_MIGRATION
config AUTO_IRQ_AFFINITY
bool
+# Interrupt injection mechanism
+config GENERIC_IRQ_INJECTION
+ bool
+
# Tasklet based software resend for pending interrupts on enable_irq()
config HARDIRQS_SW_RESEND
bool
@@ -127,6 +131,7 @@ config SPARSE_IRQ
config GENERIC_IRQ_DEBUGFS
bool "Expose irq internals in debugfs"
depends on DEBUG_FS
+ select GENERIC_IRQ_INJECTION
default n
---help---
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b3fa2d87d2f3..41e7e37a0928 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -278,7 +278,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
}
}
if (resend)
- check_irq_resend(desc);
+ check_irq_resend(desc, false);
return ret;
}
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index a949bd39e343..4f9f844074db 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -190,33 +190,7 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
return -EFAULT;
if (!strncmp(buf, "trigger", size)) {
- unsigned long flags;
- int err;
-
- /* Try the HW interface first */
- err = irq_set_irqchip_state(irq_desc_get_irq(desc),
- IRQCHIP_STATE_PENDING, true);
- if (!err)
- return count;
-
- /*
- * Otherwise, try to inject via the resend interface,
- * which may or may not succeed.
- */
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, flags);
-
- if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) {
- /* Can't do level nor NMIs, sorry */
- err = -EINVAL;
- } else {
- desc->istate |= IRQS_PENDING;
- check_irq_resend(desc);
- err = 0;
- }
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- chip_bus_sync_unlock(desc);
+ int err = irq_inject_interrupt(irq_desc_get_irq(desc));
return err ? err : count;
}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a4ace611f47f..a8e14c80b405 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -145,6 +145,13 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
for_each_action_of_desc(desc, action) {
irqreturn_t res;
+ /*
+ * If this IRQ would be threaded under force_irqthreads, mark it so.
+ */
+ if (irq_settings_can_thread(desc) &&
+ !(action->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)))
+ lockdep_hardirq_threaded();
+
trace_irq_handler_entry(irq, action);
res = action->handler(irq, action->dev_id);
trace_irq_handler_exit(irq, action, res);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c9d8eb7f5c02..7db284b10ac9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -108,7 +108,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
-void check_irq_resend(struct irq_desc *desc);
+int check_irq_resend(struct irq_desc *desc, bool inject);
bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
@@ -425,6 +425,10 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
{
return desc->pending_mask;
}
+static inline bool handle_enforce_irqctx(struct irq_data *data)
+{
+ return irqd_is_handle_enforce_irqctx(data);
+}
bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
#else /* CONFIG_GENERIC_PENDING_IRQ */
static inline bool irq_can_move_pcntxt(struct irq_data *data)
@@ -451,6 +455,10 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
{
return false;
}
+static inline bool handle_enforce_irqctx(struct irq_data *data)
+{
+ return false;
+}
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 98a5f10d1900..1a7723604399 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -638,9 +638,15 @@ void irq_init_desc(unsigned int irq)
int generic_handle_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_data *data;
if (!desc)
return -EINVAL;
+
+ data = irq_desc_get_irq_data(desc);
+ if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data)))
+ return -EPERM;
+
generic_handle_irq_desc(desc);
return 0;
}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 7527e5ef6fe5..35b8d97c3a1d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -46,11 +46,11 @@ const struct fwnode_operations irqchip_fwnode_ops;
EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
/**
- * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
+ * __irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
* identifying an irq domain
* @type: Type of irqchip_fwnode. See linux/irqdomain.h
- * @name: Optional user provided domain name
* @id: Optional user provided id if name != NULL
+ * @name: Optional user provided domain name
* @pa: Optional user-provided physical address
*
* Allocate a struct irqchip_fwid, and return a poiner to the embedded
@@ -1310,6 +1310,11 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
unsigned int irq_base,
unsigned int nr_irqs, void *arg)
{
+ if (!domain->ops->alloc) {
+ pr_debug("domain->ops->alloc() is NULL\n");
+ return -ENOSYS;
+ }
+
return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
}
@@ -1347,11 +1352,6 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
return -EINVAL;
}
- if (!domain->ops->alloc) {
- pr_debug("domain->ops->alloc() is NULL\n");
- return -ENOSYS;
- }
-
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7eee98c38f25..fe40c658f86f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -323,7 +323,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (desc->affinity_notify) {
kref_get(&desc->affinity_notify->kref);
- schedule_work(&desc->affinity_notify->work);
+ if (!schedule_work(&desc->affinity_notify->work)) {
+ /* Work was already scheduled, drop our extra ref */
+ kref_put(&desc->affinity_notify->kref,
+ desc->affinity_notify->release);
+ }
}
irqd_set(data, IRQD_AFFINITY_SET);
@@ -423,7 +427,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (old_notify) {
- cancel_work_sync(&old_notify->work);
+ if (cancel_work_sync(&old_notify->work)) {
+ /* Pending work had a ref, put that one too */
+ kref_put(&old_notify->kref, old_notify->release);
+ }
kref_put(&old_notify->kref, old_notify->release);
}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 98c04ca5fa43..27634f4022d0 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -47,6 +47,43 @@ static void resend_irqs(unsigned long arg)
/* Tasklet to handle resend: */
static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+static int irq_sw_resend(struct irq_desc *desc)
+{
+ unsigned int irq = irq_desc_get_irq(desc);
+
+ /*
+ * Validate whether this interrupt can be safely injected from
+ * non interrupt context
+ */
+ if (handle_enforce_irqctx(&desc->irq_data))
+ return -EINVAL;
+
+ /*
+ * If the interrupt is running in the thread context of the parent
+ * irq we need to be careful, because we cannot trigger it
+ * directly.
+ */
+ if (irq_settings_is_nested_thread(desc)) {
+ /*
+ * If the parent_irq is valid, we retrigger the parent,
+ * otherwise we do nothing.
+ */
+ if (!desc->parent_irq)
+ return -EINVAL;
+ irq = desc->parent_irq;
+ }
+
+ /* Set it pending and activate the softirq: */
+ set_bit(irq, irqs_resend);
+ tasklet_schedule(&resend_tasklet);
+ return 0;
+}
+
+#else
+static int irq_sw_resend(struct irq_desc *desc)
+{
+ return -EINVAL;
+}
#endif
/*
@@ -54,49 +91,83 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
*
* Is called with interrupts disabled and desc->lock held.
*/
-void check_irq_resend(struct irq_desc *desc)
+int check_irq_resend(struct irq_desc *desc, bool inject)
{
+ int err = 0;
+
/*
- * We do not resend level type interrupts. Level type
- * interrupts are resent by hardware when they are still
- * active. Clear the pending bit so suspend/resume does not
- * get confused.
+ * We do not resend level type interrupts. Level type interrupts
+ * are resent by hardware when they are still active. Clear the
+ * pending bit so suspend/resume does not get confused.
*/
if (irq_settings_is_level(desc)) {
desc->istate &= ~IRQS_PENDING;
- return;
+ return -EINVAL;
}
+
if (desc->istate & IRQS_REPLAY)
- return;
- if (desc->istate & IRQS_PENDING) {
- desc->istate &= ~IRQS_PENDING;
+ return -EBUSY;
+
+ if (!(desc->istate & IRQS_PENDING) && !inject)
+ return 0;
+
+ desc->istate &= ~IRQS_PENDING;
+
+ if (!desc->irq_data.chip->irq_retrigger ||
+ !desc->irq_data.chip->irq_retrigger(&desc->irq_data))
+ err = irq_sw_resend(desc);
+
+ /* If the retrigger was successfull, mark it with the REPLAY bit */
+ if (!err)
desc->istate |= IRQS_REPLAY;
+ return err;
+}
- if (!desc->irq_data.chip->irq_retrigger ||
- !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
-#ifdef CONFIG_HARDIRQS_SW_RESEND
- unsigned int irq = irq_desc_get_irq(desc);
-
- /*
- * If the interrupt is running in the thread
- * context of the parent irq we need to be
- * careful, because we cannot trigger it
- * directly.
- */
- if (irq_settings_is_nested_thread(desc)) {
- /*
- * If the parent_irq is valid, we
- * retrigger the parent, otherwise we
- * do nothing.
- */
- if (!desc->parent_irq)
- return;
- irq = desc->parent_irq;
- }
- /* Set it pending and activate the softirq: */
- set_bit(irq, irqs_resend);
- tasklet_schedule(&resend_tasklet);
-#endif
- }
- }
+#ifdef CONFIG_GENERIC_IRQ_INJECTION
+/**
+ * irq_inject_interrupt - Inject an interrupt for testing/error injection
+ * @irq: The interrupt number
+ *
+ * This function must only be used for debug and testing purposes!
+ *
+ * Especially on x86 this can cause a premature completion of an interrupt
+ * affinity change causing the interrupt line to become stale. Very
+ * unlikely, but possible.
+ *
+ * The injection can fail for various reasons:
+ * - Interrupt is not activated
+ * - Interrupt is NMI type or currently replaying
+ * - Interrupt is level type
+ * - Interrupt does not support hardware retrigger and software resend is
+ * either not enabled or not possible for the interrupt.
+ */
+int irq_inject_interrupt(unsigned int irq)
+{
+ struct irq_desc *desc;
+ unsigned long flags;
+ int err;
+
+ /* Try the state injection hardware interface first */
+ if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true))
+ return 0;
+
+ /* That failed, try via the resend mechanism */
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return -EINVAL;
+
+ /*
+ * Only try to inject when the interrupt is:
+ * - not NMI type
+ * - activated
+ */
+ if ((desc->istate & IRQS_NMI) || !irqd_is_activated(&desc->irq_data))
+ err = -EINVAL;
+ else
+ err = check_irq_resend(desc, true);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
}
+EXPORT_SYMBOL_GPL(irq_inject_interrupt);
+#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 828cc30774bc..48b5d1b6af4d 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -153,7 +153,9 @@ static void irq_work_run_list(struct llist_head *list)
*/
flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+ lockdep_irq_work_enter(work);
work->func(work);
+ lockdep_irq_work_exit(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b262f47046ca..bfbfa481be3a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -199,8 +199,15 @@ static void __kthread_parkme(struct kthread *self)
if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
break;
+ /*
+ * Thread is going to call schedule(), do not preempt it,
+ * or the caller of kthread_park() may spend more time in
+ * wait_task_inactive().
+ */
+ preempt_disable();
complete(&self->parked);
- schedule();
+ schedule_preempt_disabled();
+ preempt_enable();
}
__set_current_state(TASK_RUNNING);
}
@@ -245,8 +252,14 @@ static int kthread(void *_create)
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
+ /*
+ * Thread is going to call schedule(), do not preempt it,
+ * or the creator may spend more time in wait_task_inactive().
+ */
+ preempt_disable();
complete(done);
- schedule();
+ schedule_preempt_disabled();
+ preempt_enable();
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 32406ef0d6a2..1511690e4de7 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -84,12 +84,39 @@ module_param(lock_stat, int, 0644);
* to use a raw spinlock - we really dont want the spinlock
* code to recurse back into the lockdep code...
*/
-static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t __lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static struct task_struct *__owner;
+
+static inline void lockdep_lock(void)
+{
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ arch_spin_lock(&__lock);
+ __owner = current;
+ current->lockdep_recursion++;
+}
+
+static inline void lockdep_unlock(void)
+{
+ if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current))
+ return;
+
+ current->lockdep_recursion--;
+ __owner = NULL;
+ arch_spin_unlock(&__lock);
+}
+
+static inline bool lockdep_assert_locked(void)
+{
+ return DEBUG_LOCKS_WARN_ON(__owner != current);
+}
+
static struct task_struct *lockdep_selftest_task_struct;
+
static int graph_lock(void)
{
- arch_spin_lock(&lockdep_lock);
+ lockdep_lock();
/*
* Make sure that if another CPU detected a bug while
* walking the graph we dont change it (while the other
@@ -97,27 +124,15 @@ static int graph_lock(void)
* dropped already)
*/
if (!debug_locks) {
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
return 0;
}
- /* prevent any recursions within lockdep from causing deadlocks */
- current->lockdep_recursion++;
return 1;
}
-static inline int graph_unlock(void)
+static inline void graph_unlock(void)
{
- if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
- /*
- * The lockdep graph lock isn't locked while we expect it to
- * be, we're confused now, bye!
- */
- return DEBUG_LOCKS_WARN_ON(1);
- }
-
- current->lockdep_recursion--;
- arch_spin_unlock(&lockdep_lock);
- return 0;
+ lockdep_unlock();
}
/*
@@ -128,7 +143,7 @@ static inline int debug_locks_off_graph_unlock(void)
{
int ret = debug_locks_off();
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
return ret;
}
@@ -147,6 +162,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
#define KEYHASH_SIZE (1UL << KEYHASH_BITS)
static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
+unsigned long nr_zapped_classes;
#ifndef CONFIG_DEBUG_LOCKDEP
static
#endif
@@ -377,18 +393,31 @@ void lockdep_init_task(struct task_struct *task)
task->lockdep_recursion = 0;
}
+/*
+ * Split the recrursion counter in two to readily detect 'off' vs recursion.
+ */
+#define LOCKDEP_RECURSION_BITS 16
+#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS)
+#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1)
+
void lockdep_off(void)
{
- current->lockdep_recursion++;
+ current->lockdep_recursion += LOCKDEP_OFF;
}
EXPORT_SYMBOL(lockdep_off);
void lockdep_on(void)
{
- current->lockdep_recursion--;
+ current->lockdep_recursion -= LOCKDEP_OFF;
}
EXPORT_SYMBOL(lockdep_on);
+static inline void lockdep_recursion_finish(void)
+{
+ if (WARN_ON_ONCE(--current->lockdep_recursion))
+ current->lockdep_recursion = 0;
+}
+
void lockdep_set_selftest_task(struct task_struct *task)
{
lockdep_selftest_task_struct = task;
@@ -575,6 +604,7 @@ static const char *usage_str[] =
#include "lockdep_states.h"
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
+ [LOCK_USAGE_STATES] = "IN-NMI",
};
#endif
@@ -653,7 +683,9 @@ static void print_lock_name(struct lock_class *class)
printk(KERN_CONT " (");
__print_lock_name(class);
- printk(KERN_CONT "){%s}", usage);
+ printk(KERN_CONT "){%s}-{%hd:%hd}", usage,
+ class->wait_type_outer ?: class->wait_type_inner,
+ class->wait_type_inner);
}
static void print_lockdep_cache(struct lockdep_map *lock)
@@ -787,6 +819,7 @@ static int count_matching_names(struct lock_class *new_class)
return count + 1;
}
+/* used from NMI context -- must be lockless */
static inline struct lock_class *
look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
@@ -1070,13 +1103,15 @@ static inline void check_data_structures(void) { }
#endif /* CONFIG_DEBUG_LOCKDEP */
+static void init_chain_block_buckets(void);
+
/*
* Initialize the lock_classes[] array elements, the free_lock_classes list
* and also the delayed_free structure.
*/
static void init_data_structures_once(void)
{
- static bool ds_initialized, rcu_head_initialized;
+ static bool __read_mostly ds_initialized, rcu_head_initialized;
int i;
if (likely(rcu_head_initialized))
@@ -1100,6 +1135,7 @@ static void init_data_structures_once(void)
INIT_LIST_HEAD(&lock_classes[i].locks_after);
INIT_LIST_HEAD(&lock_classes[i].locks_before);
}
+ init_chain_block_buckets();
}
static inline struct hlist_head *keyhashentry(const struct lock_class_key *key)
@@ -1230,6 +1266,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
WARN_ON_ONCE(!list_empty(&class->locks_before));
WARN_ON_ONCE(!list_empty(&class->locks_after));
class->name_version = count_matching_names(class);
+ class->wait_type_inner = lock->wait_type_inner;
+ class->wait_type_outer = lock->wait_type_outer;
/*
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
@@ -1469,6 +1507,8 @@ static int __bfs(struct lock_list *source_entry,
struct circular_queue *cq = &lock_cq;
int ret = 1;
+ lockdep_assert_locked();
+
if (match(source_entry, data)) {
*target_entry = source_entry;
ret = 0;
@@ -1491,8 +1531,6 @@ static int __bfs(struct lock_list *source_entry,
head = get_dep_list(lock, offset);
- DEBUG_LOCKS_WARN_ON(!irqs_disabled());
-
list_for_each_entry_rcu(entry, head, entry) {
if (!lock_accessed(entry)) {
unsigned int cq_depth;
@@ -1719,9 +1757,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
this.class = class;
raw_local_irq_save(flags);
- arch_spin_lock(&lockdep_lock);
+ lockdep_lock();
ret = __lockdep_count_forward_deps(&this);
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
raw_local_irq_restore(flags);
return ret;
@@ -1746,9 +1784,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
this.class = class;
raw_local_irq_save(flags);
- arch_spin_lock(&lockdep_lock);
+ lockdep_lock();
ret = __lockdep_count_backward_deps(&this);
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
raw_local_irq_restore(flags);
return ret;
@@ -2298,18 +2336,6 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
return 0;
}
-static void inc_chains(void)
-{
- if (current->hardirq_context)
- nr_hardirq_chains++;
- else {
- if (current->softirq_context)
- nr_softirq_chains++;
- else
- nr_process_chains++;
- }
-}
-
#else
static inline int check_irq_usage(struct task_struct *curr,
@@ -2317,13 +2343,27 @@ static inline int check_irq_usage(struct task_struct *curr,
{
return 1;
}
+#endif /* CONFIG_TRACE_IRQFLAGS */
-static inline void inc_chains(void)
+static void inc_chains(int irq_context)
{
- nr_process_chains++;
+ if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
+ nr_hardirq_chains++;
+ else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT)
+ nr_softirq_chains++;
+ else
+ nr_process_chains++;
}
-#endif /* CONFIG_TRACE_IRQFLAGS */
+static void dec_chains(int irq_context)
+{
+ if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
+ nr_hardirq_chains--;
+ else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT)
+ nr_softirq_chains--;
+ else
+ nr_process_chains--;
+}
static void
print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
@@ -2622,8 +2662,235 @@ out_bug:
struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS);
-int nr_chain_hlocks;
static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+unsigned long nr_zapped_lock_chains;
+unsigned int nr_free_chain_hlocks; /* Free chain_hlocks in buckets */
+unsigned int nr_lost_chain_hlocks; /* Lost chain_hlocks */
+unsigned int nr_large_chain_blocks; /* size > MAX_CHAIN_BUCKETS */
+
+/*
+ * The first 2 chain_hlocks entries in the chain block in the bucket
+ * list contains the following meta data:
+ *
+ * entry[0]:
+ * Bit 15 - always set to 1 (it is not a class index)
+ * Bits 0-14 - upper 15 bits of the next block index
+ * entry[1] - lower 16 bits of next block index
+ *
+ * A next block index of all 1 bits means it is the end of the list.
+ *
+ * On the unsized bucket (bucket-0), the 3rd and 4th entries contain
+ * the chain block size:
+ *
+ * entry[2] - upper 16 bits of the chain block size
+ * entry[3] - lower 16 bits of the chain block size
+ */
+#define MAX_CHAIN_BUCKETS 16
+#define CHAIN_BLK_FLAG (1U << 15)
+#define CHAIN_BLK_LIST_END 0xFFFFU
+
+static int chain_block_buckets[MAX_CHAIN_BUCKETS];
+
+static inline int size_to_bucket(int size)
+{
+ if (size > MAX_CHAIN_BUCKETS)
+ return 0;
+
+ return size - 1;
+}
+
+/*
+ * Iterate all the chain blocks in a bucket.
+ */
+#define for_each_chain_block(bucket, prev, curr) \
+ for ((prev) = -1, (curr) = chain_block_buckets[bucket]; \
+ (curr) >= 0; \
+ (prev) = (curr), (curr) = chain_block_next(curr))
+
+/*
+ * next block or -1
+ */
+static inline int chain_block_next(int offset)
+{
+ int next = chain_hlocks[offset];
+
+ WARN_ON_ONCE(!(next & CHAIN_BLK_FLAG));
+
+ if (next == CHAIN_BLK_LIST_END)
+ return -1;
+
+ next &= ~CHAIN_BLK_FLAG;
+ next <<= 16;
+ next |= chain_hlocks[offset + 1];
+
+ return next;
+}
+
+/*
+ * bucket-0 only
+ */
+static inline int chain_block_size(int offset)
+{
+ return (chain_hlocks[offset + 2] << 16) | chain_hlocks[offset + 3];
+}
+
+static inline void init_chain_block(int offset, int next, int bucket, int size)
+{
+ chain_hlocks[offset] = (next >> 16) | CHAIN_BLK_FLAG;
+ chain_hlocks[offset + 1] = (u16)next;
+
+ if (size && !bucket) {
+ chain_hlocks[offset + 2] = size >> 16;
+ chain_hlocks[offset + 3] = (u16)size;
+ }
+}
+
+static inline void add_chain_block(int offset, int size)
+{
+ int bucket = size_to_bucket(size);
+ int next = chain_block_buckets[bucket];
+ int prev, curr;
+
+ if (unlikely(size < 2)) {
+ /*
+ * We can't store single entries on the freelist. Leak them.
+ *
+ * One possible way out would be to uniquely mark them, other
+ * than with CHAIN_BLK_FLAG, such that we can recover them when
+ * the block before it is re-added.
+ */
+ if (size)
+ nr_lost_chain_hlocks++;
+ return;
+ }
+
+ nr_free_chain_hlocks += size;
+ if (!bucket) {
+ nr_large_chain_blocks++;
+
+ /*
+ * Variable sized, sort large to small.
+ */
+ for_each_chain_block(0, prev, curr) {
+ if (size >= chain_block_size(curr))
+ break;
+ }
+ init_chain_block(offset, curr, 0, size);
+ if (prev < 0)
+ chain_block_buckets[0] = offset;
+ else
+ init_chain_block(prev, offset, 0, 0);
+ return;
+ }
+ /*
+ * Fixed size, add to head.
+ */
+ init_chain_block(offset, next, bucket, size);
+ chain_block_buckets[bucket] = offset;
+}
+
+/*
+ * Only the first block in the list can be deleted.
+ *
+ * For the variable size bucket[0], the first block (the largest one) is
+ * returned, broken up and put back into the pool. So if a chain block of
+ * length > MAX_CHAIN_BUCKETS is ever used and zapped, it will just be
+ * queued up after the primordial chain block and never be used until the
+ * hlock entries in the primordial chain block is almost used up. That
+ * causes fragmentation and reduce allocation efficiency. That can be
+ * monitored by looking at the "large chain blocks" number in lockdep_stats.
+ */
+static inline void del_chain_block(int bucket, int size, int next)
+{
+ nr_free_chain_hlocks -= size;
+ chain_block_buckets[bucket] = next;
+
+ if (!bucket)
+ nr_large_chain_blocks--;
+}
+
+static void init_chain_block_buckets(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_CHAIN_BUCKETS; i++)
+ chain_block_buckets[i] = -1;
+
+ add_chain_block(0, ARRAY_SIZE(chain_hlocks));
+}
+
+/*
+ * Return offset of a chain block of the right size or -1 if not found.
+ *
+ * Fairly simple worst-fit allocator with the addition of a number of size
+ * specific free lists.
+ */
+static int alloc_chain_hlocks(int req)
+{
+ int bucket, curr, size;
+
+ /*
+ * We rely on the MSB to act as an escape bit to denote freelist
+ * pointers. Make sure this bit isn't set in 'normal' class_idx usage.
+ */
+ BUILD_BUG_ON((MAX_LOCKDEP_KEYS-1) & CHAIN_BLK_FLAG);
+
+ init_data_structures_once();
+
+ if (nr_free_chain_hlocks < req)
+ return -1;
+
+ /*
+ * We require a minimum of 2 (u16) entries to encode a freelist
+ * 'pointer'.
+ */
+ req = max(req, 2);
+ bucket = size_to_bucket(req);
+ curr = chain_block_buckets[bucket];
+
+ if (bucket) {
+ if (curr >= 0) {
+ del_chain_block(bucket, req, chain_block_next(curr));
+ return curr;
+ }
+ /* Try bucket 0 */
+ curr = chain_block_buckets[0];
+ }
+
+ /*
+ * The variable sized freelist is sorted by size; the first entry is
+ * the largest. Use it if it fits.
+ */
+ if (curr >= 0) {
+ size = chain_block_size(curr);
+ if (likely(size >= req)) {
+ del_chain_block(0, size, chain_block_next(curr));
+ add_chain_block(curr + req, size - req);
+ return curr;
+ }
+ }
+
+ /*
+ * Last resort, split a block in a larger sized bucket.
+ */
+ for (size = MAX_CHAIN_BUCKETS; size > req; size--) {
+ bucket = size_to_bucket(size);
+ curr = chain_block_buckets[bucket];
+ if (curr < 0)
+ continue;
+
+ del_chain_block(bucket, size, chain_block_next(curr));
+ add_chain_block(curr + req, size - req);
+ return curr;
+ }
+
+ return -1;
+}
+
+static inline void free_chain_hlocks(int base, int size)
+{
+ add_chain_block(base, max(size, 2));
+}
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
{
@@ -2803,7 +3070,7 @@ static inline int add_chain_cache(struct task_struct *curr,
* disabled to make this an IRQ-safe lock.. for recursion reasons
* lockdep won't complain about its own locking errors.
*/
- if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ if (lockdep_assert_locked())
return 0;
chain = alloc_lock_chain();
@@ -2824,15 +3091,8 @@ static inline int add_chain_cache(struct task_struct *curr,
BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks));
BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes));
- if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
- chain->base = nr_chain_hlocks;
- for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class_idx;
- chain_hlocks[chain->base + j] = lock_id;
- }
- chain_hlocks[chain->base + j] = class - lock_classes;
- nr_chain_hlocks += chain->depth;
- } else {
+ j = alloc_chain_hlocks(chain->depth);
+ if (j < 0) {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -2841,9 +3101,16 @@ static inline int add_chain_cache(struct task_struct *curr,
return 0;
}
+ chain->base = j;
+ for (j = 0; j < chain->depth - 1; j++, i++) {
+ int lock_id = curr->held_locks[i].class_idx;
+
+ chain_hlocks[chain->base + j] = lock_id;
+ }
+ chain_hlocks[chain->base + j] = class - lock_classes;
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
- inc_chains();
+ inc_chains(chain->irq_context);
return 1;
}
@@ -2987,6 +3254,8 @@ static inline int validate_chain(struct task_struct *curr,
{
return 1;
}
+
+static void init_chain_block_buckets(void) { }
#endif /* CONFIG_PROVE_LOCKING */
/*
@@ -3081,10 +3350,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
curr->comm, task_pid_nr(curr),
- trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
- trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
- trace_hardirqs_enabled(curr),
- trace_softirqs_enabled(curr));
+ lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+ lockdep_hardirqs_enabled(curr),
+ lockdep_softirqs_enabled(curr));
print_lock(this);
pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
@@ -3429,9 +3698,9 @@ void lockdep_hardirqs_on(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
return;
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
__trace_hardirqs_on_caller(ip);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
}
NOKPROBE_SYMBOL(lockdep_hardirqs_on);
@@ -3468,7 +3737,7 @@ NOKPROBE_SYMBOL(lockdep_hardirqs_off);
/*
* Softirqs will be enabled:
*/
-void trace_softirqs_on(unsigned long ip)
+void lockdep_softirqs_on(unsigned long ip)
{
struct task_struct *curr = current;
@@ -3487,7 +3756,7 @@ void trace_softirqs_on(unsigned long ip)
return;
}
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
/*
* We'll do an OFF -> ON transition:
*/
@@ -3502,13 +3771,13 @@ void trace_softirqs_on(unsigned long ip)
*/
if (curr->hardirqs_enabled)
mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
}
/*
* Softirqs were disabled:
*/
-void trace_softirqs_off(unsigned long ip)
+void lockdep_softirqs_off(unsigned long ip)
{
struct task_struct *curr = current;
@@ -3596,7 +3865,8 @@ lock_used:
static inline unsigned int task_irq_context(struct task_struct *task)
{
- return 2 * !!task->hardirq_context + !!task->softirq_context;
+ return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context +
+ LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
}
static int separate_irq_context(struct task_struct *curr,
@@ -3682,6 +3952,113 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
+static int
+print_lock_invalid_wait_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ pr_warn("\n");
+ pr_warn("=============================\n");
+ pr_warn("[ BUG: Invalid wait context ]\n");
+ print_kernel_ident();
+ pr_warn("-----------------------------\n");
+
+ pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+ print_lock(hlock);
+
+ pr_warn("other info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ pr_warn("stack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Verify the wait_type context.
+ *
+ * This check validates we takes locks in the right wait-type order; that is it
+ * ensures that we do not take mutexes inside spinlocks and do not attempt to
+ * acquire spinlocks inside raw_spinlocks and the sort.
+ *
+ * The entire thing is slightly more complex because of RCU, RCU is a lock that
+ * can be taken from (pretty much) any context but also has constraints.
+ * However when taken in a stricter environment the RCU lock does not loosen
+ * the constraints.
+ *
+ * Therefore we must look for the strictest environment in the lock stack and
+ * compare that to the lock we're trying to acquire.
+ */
+static int check_wait_context(struct task_struct *curr, struct held_lock *next)
+{
+ short next_inner = hlock_class(next)->wait_type_inner;
+ short next_outer = hlock_class(next)->wait_type_outer;
+ short curr_inner;
+ int depth;
+
+ if (!curr->lockdep_depth || !next_inner || next->trylock)
+ return 0;
+
+ if (!next_outer)
+ next_outer = next_inner;
+
+ /*
+ * Find start of current irq_context..
+ */
+ for (depth = curr->lockdep_depth - 1; depth >= 0; depth--) {
+ struct held_lock *prev = curr->held_locks + depth;
+ if (prev->irq_context != next->irq_context)
+ break;
+ }
+ depth++;
+
+ /*
+ * Set appropriate wait type for the context; for IRQs we have to take
+ * into account force_irqthread as that is implied by PREEMPT_RT.
+ */
+ if (curr->hardirq_context) {
+ /*
+ * Check if force_irqthreads will run us threaded.
+ */
+ if (curr->hardirq_threaded || curr->irq_config)
+ curr_inner = LD_WAIT_CONFIG;
+ else
+ curr_inner = LD_WAIT_SPIN;
+ } else if (curr->softirq_context) {
+ /*
+ * Softirqs are always threaded.
+ */
+ curr_inner = LD_WAIT_CONFIG;
+ } else {
+ curr_inner = LD_WAIT_MAX;
+ }
+
+ for (; depth < curr->lockdep_depth; depth++) {
+ struct held_lock *prev = curr->held_locks + depth;
+ short prev_inner = hlock_class(prev)->wait_type_inner;
+
+ if (prev_inner) {
+ /*
+ * We can have a bigger inner than a previous one
+ * when outer is smaller than inner, as with RCU.
+ *
+ * Also due to trylocks.
+ */
+ curr_inner = min(curr_inner, prev_inner);
+ }
+ }
+
+ if (next_outer > curr_inner)
+ return print_lock_invalid_wait_context(curr, next);
+
+ return 0;
+}
+
#else /* CONFIG_PROVE_LOCKING */
static inline int
@@ -3701,13 +4078,20 @@ static inline int separate_irq_context(struct task_struct *curr,
return 0;
}
+static inline int check_wait_context(struct task_struct *curr,
+ struct held_lock *next)
+{
+ return 0;
+}
+
#endif /* CONFIG_PROVE_LOCKING */
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map(struct lockdep_map *lock, const char *name,
- struct lock_class_key *key, int subclass)
+void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass,
+ short inner, short outer)
{
int i;
@@ -3728,6 +4112,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
lock->name = name;
+ lock->wait_type_outer = outer;
+ lock->wait_type_inner = inner;
+
/*
* No key, no joy, we need to hash something.
*/
@@ -3755,13 +4142,13 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
return;
raw_local_irq_save(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
register_lock_class(lock, subclass, 1);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
}
-EXPORT_SYMBOL_GPL(lockdep_init_map);
+EXPORT_SYMBOL_GPL(lockdep_init_map_waits);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
@@ -3862,7 +4249,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes;
- if (depth) {
+ if (depth) { /* we're holding locks */
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (!references)
@@ -3904,6 +4291,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
hlock->pin_count = pin_count;
+ if (check_wait_context(curr, hlock))
+ return 0;
+
/* Initialize the lock usage bit */
if (!mark_usage(curr, hlock, check))
return 0;
@@ -4139,7 +4529,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
}
- lockdep_init_map(lock, name, key, 0);
+ lockdep_init_map_waits(lock, name, key, 0,
+ lock->wait_type_inner,
+ lock->wait_type_outer);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes;
@@ -4437,11 +4829,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
return;
raw_local_irq_save(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
check_flags(flags);
if (__lock_set_class(lock, name, key, subclass, ip))
check_chain_key(current);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_set_class);
@@ -4454,15 +4846,45 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
return;
raw_local_irq_save(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
check_flags(flags);
if (__lock_downgrade(lock, ip))
check_chain_key(current);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_downgrade);
+/* NMI context !!! */
+static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock, int subclass)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ struct lock_class *class = look_up_lock_class(lock, subclass);
+
+ /* if it doesn't have a class (yet), it certainly hasn't been used yet */
+ if (!class)
+ return;
+
+ if (!(class->usage_mask & LOCK_USED))
+ return;
+
+ hlock->class_idx = class - lock_classes;
+
+ print_usage_bug(current, hlock, LOCK_USED, LOCK_USAGE_STATES);
+#endif
+}
+
+static bool lockdep_nmi(void)
+{
+ if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)
+ return false;
+
+ if (!in_nmi())
+ return false;
+
+ return true;
+}
+
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
@@ -4473,17 +4895,34 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(current->lockdep_recursion)) {
+ /* XXX allow trylock from NMI ?!? */
+ if (lockdep_nmi() && !trylock) {
+ struct held_lock hlock;
+
+ hlock.acquire_ip = ip;
+ hlock.instance = lock;
+ hlock.nest_lock = nest_lock;
+ hlock.irq_context = 2; // XXX
+ hlock.trylock = trylock;
+ hlock.read = read;
+ hlock.check = check;
+ hlock.hardirqs_off = true;
+ hlock.references = 0;
+
+ verify_lock_unused(lock, &hlock, subclass);
+ }
return;
+ }
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
__lock_acquire(lock, subclass, trylock, read, check,
irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_acquire);
@@ -4497,11 +4936,11 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
trace_lock_release(lock, ip);
if (__lock_release(lock, ip))
check_chain_key(current);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_release);
@@ -4517,9 +4956,9 @@ int lock_is_held_type(const struct lockdep_map *lock, int read)
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
ret = __lock_is_held(lock, read);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
return ret;
@@ -4538,9 +4977,9 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
cookie = __lock_pin_lock(lock);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
return cookie;
@@ -4557,9 +4996,9 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
__lock_repin_lock(lock, cookie);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_repin_lock);
@@ -4574,9 +5013,9 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
__lock_unpin_lock(lock, cookie);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_unpin_lock);
@@ -4712,10 +5151,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
trace_lock_contended(lock, ip);
__lock_contended(lock, ip);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_contended);
@@ -4732,9 +5171,9 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ current->lockdep_recursion++;
__lock_acquired(lock, ip);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_acquired);
@@ -4768,57 +5207,33 @@ static void remove_class_from_lock_chain(struct pending_free *pf,
struct lock_class *class)
{
#ifdef CONFIG_PROVE_LOCKING
- struct lock_chain *new_chain;
- u64 chain_key;
int i;
for (i = chain->base; i < chain->base + chain->depth; i++) {
if (chain_hlocks[i] != class - lock_classes)
continue;
- /* The code below leaks one chain_hlock[] entry. */
- if (--chain->depth > 0) {
- memmove(&chain_hlocks[i], &chain_hlocks[i + 1],
- (chain->base + chain->depth - i) *
- sizeof(chain_hlocks[0]));
- }
/*
* Each lock class occurs at most once in a lock chain so once
* we found a match we can break out of this loop.
*/
- goto recalc;
+ goto free_lock_chain;
}
/* Since the chain has not been modified, return. */
return;
-recalc:
- chain_key = INITIAL_CHAIN_KEY;
- for (i = chain->base; i < chain->base + chain->depth; i++)
- chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
- if (chain->depth && chain->chain_key == chain_key)
- return;
+free_lock_chain:
+ free_chain_hlocks(chain->base, chain->depth);
/* Overwrite the chain key for concurrent RCU readers. */
- WRITE_ONCE(chain->chain_key, chain_key);
+ WRITE_ONCE(chain->chain_key, INITIAL_CHAIN_KEY);
+ dec_chains(chain->irq_context);
+
/*
* Note: calling hlist_del_rcu() from inside a
* hlist_for_each_entry_rcu() loop is safe.
*/
hlist_del_rcu(&chain->entry);
__set_bit(chain - lock_chains, pf->lock_chains_being_freed);
- if (chain->depth == 0)
- return;
- /*
- * If the modified lock chain matches an existing lock chain, drop
- * the modified lock chain.
- */
- if (lookup_chain_cache(chain_key))
- return;
- new_chain = alloc_lock_chain();
- if (WARN_ON_ONCE(!new_chain)) {
- debug_locks_off();
- return;
- }
- *new_chain = *chain;
- hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key));
+ nr_zapped_lock_chains++;
#endif
}
@@ -4874,6 +5289,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
}
remove_class_from_lock_chains(pf, class);
+ nr_zapped_classes++;
}
static void reinit_class(struct lock_class *class)
@@ -4958,8 +5374,7 @@ static void free_zapped_rcu(struct rcu_head *ch)
return;
raw_local_irq_save(flags);
- arch_spin_lock(&lockdep_lock);
- current->lockdep_recursion = 1;
+ lockdep_lock();
/* closed head */
pf = delayed_free.pf + (delayed_free.index ^ 1);
@@ -4971,8 +5386,7 @@ static void free_zapped_rcu(struct rcu_head *ch)
*/
call_rcu_zapped(delayed_free.pf + delayed_free.index);
- current->lockdep_recursion = 0;
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
raw_local_irq_restore(flags);
}
@@ -5017,13 +5431,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
init_data_structures_once();
raw_local_irq_save(flags);
- arch_spin_lock(&lockdep_lock);
- current->lockdep_recursion = 1;
+ lockdep_lock();
pf = get_pending_free();
__lockdep_free_key_range(pf, start, size);
call_rcu_zapped(pf);
- current->lockdep_recursion = 0;
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
raw_local_irq_restore(flags);
/*
@@ -5045,10 +5457,10 @@ static void lockdep_free_key_range_imm(void *start, unsigned long size)
init_data_structures_once();
raw_local_irq_save(flags);
- arch_spin_lock(&lockdep_lock);
+ lockdep_lock();
__lockdep_free_key_range(pf, start, size);
__free_zapped_classes(pf);
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
raw_local_irq_restore(flags);
}
@@ -5144,10 +5556,10 @@ static void lockdep_reset_lock_imm(struct lockdep_map *lock)
unsigned long flags;
raw_local_irq_save(flags);
- arch_spin_lock(&lockdep_lock);
+ lockdep_lock();
__lockdep_reset_lock(pf, lock);
__free_zapped_classes(pf);
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
raw_local_irq_restore(flags);
}
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 18d85aebbb57..baca699b94e9 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -106,6 +106,12 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define STACK_TRACE_HASH_SIZE 16384
#endif
+/*
+ * Bit definitions for lock_chain.irq_context
+ */
+#define LOCK_CHAIN_SOFTIRQ_CONTEXT (1 << 0)
+#define LOCK_CHAIN_HARDIRQ_CONTEXT (1 << 1)
+
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
@@ -124,17 +130,21 @@ extern const char *__get_key_name(const struct lockdep_subclass_key *key,
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
extern unsigned long nr_lock_classes;
+extern unsigned long nr_zapped_classes;
+extern unsigned long nr_zapped_lock_chains;
extern unsigned long nr_list_entries;
long lockdep_next_lockchain(long i);
unsigned long lock_chain_count(void);
-extern int nr_chain_hlocks;
extern unsigned long nr_stack_trace_entries;
extern unsigned int nr_hardirq_chains;
extern unsigned int nr_softirq_chains;
extern unsigned int nr_process_chains;
-extern unsigned int max_lockdep_depth;
+extern unsigned int nr_free_chain_hlocks;
+extern unsigned int nr_lost_chain_hlocks;
+extern unsigned int nr_large_chain_blocks;
+extern unsigned int max_lockdep_depth;
extern unsigned int max_bfs_queue_depth;
#ifdef CONFIG_PROVE_LOCKING
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 231684cfc5ae..5525cd3ba0c8 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -128,15 +128,22 @@ static int lc_show(struct seq_file *m, void *v)
struct lock_chain *chain = v;
struct lock_class *class;
int i;
+ static const char * const irq_strs[] = {
+ [0] = "0",
+ [LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq",
+ [LOCK_CHAIN_SOFTIRQ_CONTEXT] = "softirq",
+ [LOCK_CHAIN_SOFTIRQ_CONTEXT|
+ LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq|softirq",
+ };
if (v == SEQ_START_TOKEN) {
- if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)
+ if (!nr_free_chain_hlocks)
seq_printf(m, "(buggered) ");
seq_printf(m, "all lock chains:\n");
return 0;
}
- seq_printf(m, "irq_context: %d\n", chain->irq_context);
+ seq_printf(m, "irq_context: %s\n", irq_strs[chain->irq_context]);
for (i = 0; i < chain->depth; i++) {
class = lock_chain_get_class(chain, i);
@@ -271,8 +278,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
lock_chain_count(), MAX_LOCKDEP_CHAINS);
- seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
- nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
+ seq_printf(m, " dependency chain hlocks used: %11lu [max: %lu]\n",
+ MAX_LOCKDEP_CHAIN_HLOCKS -
+ (nr_free_chain_hlocks + nr_lost_chain_hlocks),
+ MAX_LOCKDEP_CHAIN_HLOCKS);
+ seq_printf(m, " dependency chain hlocks lost: %11u\n",
+ nr_lost_chain_hlocks);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -336,6 +347,18 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
+ /*
+ * Zappped classes and lockdep data buffers reuse statistics.
+ */
+ seq_puts(m, "\n");
+ seq_printf(m, " zapped classes: %11lu\n",
+ nr_zapped_classes);
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " zapped lock chains: %11lu\n",
+ nr_zapped_lock_chains);
+ seq_printf(m, " large chain blocks: %11u\n",
+ nr_large_chain_blocks);
+#endif
return 0;
}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 99475a66c94f..5efbfc68ce99 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -618,7 +618,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
static int lock_torture_writer(void *arg)
{
struct lock_stress_stats *lwsp = arg;
- static DEFINE_TORTURE_RANDOM(rand);
+ DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
set_user_nice(current, MAX_NICE);
@@ -655,7 +655,7 @@ static int lock_torture_writer(void *arg)
static int lock_torture_reader(void *arg)
{
struct lock_stress_stats *lrsp = arg;
- static DEFINE_TORTURE_RANDOM(rand);
+ DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_reader task started");
set_user_nice(current, MAX_NICE);
@@ -696,15 +696,16 @@ static void __torture_print_stats(char *page,
if (statp[i].n_lock_fail)
fail = true;
sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_fail)
- max = statp[i].n_lock_fail;
- if (min > statp[i].n_lock_fail)
- min = statp[i].n_lock_fail;
+ if (max < statp[i].n_lock_acquired)
+ max = statp[i].n_lock_acquired;
+ if (min > statp[i].n_lock_acquired)
+ min = statp[i].n_lock_acquired;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
write ? "Writes" : "Reads ",
- sum, max, min, max / 2 > min ? "???" : "",
+ sum, max, min,
+ !onoff_interval && max / 2 > min ? "???" : "",
fail, fail ? "!!!" : "");
if (fail)
atomic_inc(&cxt.n_lock_torture_errors);
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 771d4ca96dda..a7276aaf2abc 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -85,7 +85,7 @@ void debug_mutex_init(struct mutex *lock, const char *name,
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
#endif
lock->magic = lock;
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 364d38a0c444..a008a1ba21a7 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,27 +1,29 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
-#include <linux/rwsem.h>
#include <linux/percpu.h>
+#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/errno.h>
-#include "rwsem.h"
-
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
- const char *name, struct lock_class_key *rwsem_key)
+ const char *name, struct lock_class_key *key)
{
sem->read_count = alloc_percpu(int);
if (unlikely(!sem->read_count))
return -ENOMEM;
- /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
rcu_sync_init(&sem->rss);
- __init_rwsem(&sem->rw_sem, name, rwsem_key);
rcuwait_init(&sem->writer);
- sem->readers_block = 0;
+ init_waitqueue_head(&sem->waiters);
+ atomic_set(&sem->block, 0);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -41,73 +43,139 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
+static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
+ __this_cpu_inc(*sem->read_count);
+
/*
* Due to having preemption disabled the decrement happens on
* the same CPU as the increment, avoiding the
* increment-on-one-CPU-and-decrement-on-another problem.
*
- * If the reader misses the writer's assignment of readers_block, then
- * the writer is guaranteed to see the reader's increment.
+ * If the reader misses the writer's assignment of sem->block, then the
+ * writer is guaranteed to see the reader's increment.
*
* Conversely, any readers that increment their sem->read_count after
- * the writer looks are guaranteed to see the readers_block value,
- * which in turn means that they are guaranteed to immediately
- * decrement their sem->read_count, so that it doesn't matter that the
- * writer missed them.
+ * the writer looks are guaranteed to see the sem->block value, which
+ * in turn means that they are guaranteed to immediately decrement
+ * their sem->read_count, so that it doesn't matter that the writer
+ * missed them.
*/
smp_mb(); /* A matches D */
/*
- * If !readers_block the critical section starts here, matched by the
+ * If !sem->block the critical section starts here, matched by the
* release in percpu_up_write().
*/
- if (likely(!smp_load_acquire(&sem->readers_block)))
+ if (likely(!atomic_read_acquire(&sem->block)))
+ return true;
+
+ __this_cpu_dec(*sem->read_count);
+
+ /* Prod writer to re-evaluate readers_active_check() */
+ rcuwait_wake_up(&sem->writer);
+
+ return false;
+}
+
+static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
+{
+ if (atomic_read(&sem->block))
+ return false;
+
+ return atomic_xchg(&sem->block, 1) == 0;
+}
+
+static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
+{
+ if (reader) {
+ bool ret;
+
+ preempt_disable();
+ ret = __percpu_down_read_trylock(sem);
+ preempt_enable();
+
+ return ret;
+ }
+ return __percpu_down_write_trylock(sem);
+}
+
+/*
+ * The return value of wait_queue_entry::func means:
+ *
+ * <0 - error, wakeup is terminated and the error is returned
+ * 0 - no wakeup, a next waiter is tried
+ * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
+ *
+ * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
+ * and play games with the return value to allow waking multiple readers.
+ *
+ * Specifically, we wake readers until we've woken a single writer, or until a
+ * trylock fails.
+ */
+static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
+ unsigned int mode, int wake_flags,
+ void *key)
+{
+ struct task_struct *p = get_task_struct(wq_entry->private);
+ bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
+ struct percpu_rw_semaphore *sem = key;
+
+ /* concurrent against percpu_down_write(), can get stolen */
+ if (!__percpu_rwsem_trylock(sem, reader))
return 1;
- /*
- * Per the above comment; we still have preemption disabled and
- * will thus decrement on the same CPU as we incremented.
- */
- __percpu_up_read(sem);
+ list_del_init(&wq_entry->entry);
+ smp_store_release(&wq_entry->private, NULL);
- if (try)
- return 0;
+ wake_up_process(p);
+ put_task_struct(p);
- /*
- * We either call schedule() in the wait, or we'll fall through
- * and reschedule on the preempt_enable() in percpu_down_read().
- */
- preempt_enable_no_resched();
+ return !reader; /* wake (readers until) 1 writer */
+}
+
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
+{
+ DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
+ bool wait;
+ spin_lock_irq(&sem->waiters.lock);
/*
- * Avoid lockdep for the down/up_read() we already have them.
+ * Serialize against the wakeup in percpu_up_write(), if we fail
+ * the trylock, the wakeup must see us on the list.
*/
- __down_read(&sem->rw_sem);
- this_cpu_inc(*sem->read_count);
- __up_read(&sem->rw_sem);
+ wait = !__percpu_rwsem_trylock(sem, reader);
+ if (wait) {
+ wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
+ __add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
+ }
+ spin_unlock_irq(&sem->waiters.lock);
- preempt_disable();
- return 1;
+ while (wait) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!smp_load_acquire(&wq_entry.private))
+ break;
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
}
-EXPORT_SYMBOL_GPL(__percpu_down_read);
-void __percpu_up_read(struct percpu_rw_semaphore *sem)
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
- smp_mb(); /* B matches C */
- /*
- * In other words, if they see our decrement (presumably to aggregate
- * zero, as that is the only time it matters) they will also see our
- * critical section.
- */
- __this_cpu_dec(*sem->read_count);
+ if (__percpu_down_read_trylock(sem))
+ return true;
- /* Prod writer to recheck readers_active */
- rcuwait_wake_up(&sem->writer);
+ if (try)
+ return false;
+
+ preempt_enable();
+ percpu_rwsem_wait(sem, /* .reader = */ true);
+ preempt_disable();
+
+ return true;
}
-EXPORT_SYMBOL_GPL(__percpu_up_read);
+EXPORT_SYMBOL_GPL(__percpu_down_read);
#define per_cpu_sum(var) \
({ \
@@ -124,6 +192,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
* zero. If this sum is zero, then it is stable due to the fact that if any
* newly arriving readers increment a given counter, they will immediately
* decrement that same counter.
+ *
+ * Assumes sem->block is set.
*/
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
@@ -142,32 +212,36 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
- down_write(&sem->rw_sem);
-
/*
- * Notify new readers to block; up until now, and thus throughout the
- * longish rcu_sync_enter() above, new readers could still come in.
+ * Try set sem->block; this provides writer-writer exclusion.
+ * Having sem->block set makes new readers block.
*/
- WRITE_ONCE(sem->readers_block, 1);
+ if (!__percpu_down_write_trylock(sem))
+ percpu_rwsem_wait(sem, /* .reader = */ false);
- smp_mb(); /* D matches A */
+ /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
/*
- * If they don't see our writer of readers_block, then we are
- * guaranteed to see their sem->read_count increment, and therefore
- * will wait for them.
+ * If they don't see our store of sem->block, then we are guaranteed to
+ * see their sem->read_count increment, and therefore will wait for
+ * them.
*/
- /* Wait for all now active readers to complete. */
- rcuwait_wait_event(&sem->writer, readers_active_check(sem));
+ /* Wait for all active readers to complete. */
+ rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(percpu_down_write);
void percpu_up_write(struct percpu_rw_semaphore *sem)
{
+ rwsem_release(&sem->dep_map, _RET_IP_);
+
/*
* Signal the writer is done, no fast path yet.
*
@@ -178,12 +252,12 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
* Therefore we force it through the slow path which guarantees an
* acquire and thereby guarantees the critical section's consistency.
*/
- smp_store_release(&sem->readers_block, 0);
+ atomic_set_release(&sem->block, 0);
/*
- * Release the write lock, this will allow readers back in the game.
+ * Prod any pending reader/writer to make progress.
*/
- up_write(&sem->rw_sem);
+ __wake_up(&sem->waiters, TASK_NORMAL, 1, sem);
/*
* Once this completes (at least one RCU-sched grace period hence) the
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 851bbb10819d..c9f090d64f00 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -57,7 +57,7 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
- lock->owner = (struct task_struct *)val;
+ WRITE_ONCE(lock->owner, (struct task_struct *)val);
}
static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 0d9b6be9ecc8..f11b9bd3431d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -28,7 +28,6 @@
#include <linux/rwsem.h>
#include <linux/atomic.h>
-#include "rwsem.h"
#include "lock_events.h"
/*
@@ -329,7 +328,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
* Make sure we are not reinitializing a held semaphore:
*/
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
+ lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
#endif
#ifdef CONFIG_DEBUG_RWSEMS
sem->magic = sem;
@@ -660,8 +659,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
unsigned long flags;
bool ret = true;
- BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
-
if (need_resched()) {
lockevent_inc(rwsem_opt_fail);
return false;
@@ -1338,7 +1335,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
/*
* lock for reading
*/
-inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct rw_semaphore *sem)
{
if (!rwsem_read_trylock(sem)) {
rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
@@ -1426,7 +1423,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
/*
* unlock after reading
*/
-inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct rw_semaphore *sem)
{
long tmp;
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 2534ce49f648..e69de29bb2d1 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,10 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __INTERNAL_RWSEM_H
-#define __INTERNAL_RWSEM_H
-#include <linux/rwsem.h>
-
-extern void __down_read(struct rw_semaphore *sem);
-extern void __up_read(struct rw_semaphore *sem);
-
-#endif /* __INTERNAL_RWSEM_H */
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 472dd462a40c..b9d93087ee66 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -14,14 +14,14 @@
#include <linux/export.h>
void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
- struct lock_class_key *key)
+ struct lock_class_key *key, short inner)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, inner);
#endif
lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
lock->magic = SPINLOCK_MAGIC;
@@ -39,7 +39,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG);
#endif
lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED;
lock->magic = RWLOCK_MAGIC;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 63d7501ac638..5989bbb93039 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -519,7 +519,7 @@ NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
- vmalloc_sync_all();
+ vmalloc_sync_mappings();
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
diff --git a/kernel/pid.c b/kernel/pid.c
index 0f4ecb57214c..647b4bb457b5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -247,6 +247,16 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
tmp = tmp->parent;
}
+ /*
+ * ENOMEM is not the most obvious choice especially for the case
+ * where the child subreaper has already exited and the pid
+ * namespace denies the creation of any new processes. But ENOMEM
+ * is what we have exposed to userspace for a long time and it is
+ * documented behavior for pid namespaces. So we can't easily
+ * change it even if there were an error code better suited.
+ */
+ retval = -ENOMEM;
+
if (unlikely(is_child_reaper(pid))) {
if (pid_ns_prepare_proc(ns))
goto out_free;
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 83edf8698118..db0bed2cae26 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -1,31 +1,21 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * This module exposes the interface to kernel space for specifying
- * QoS dependencies. It provides infrastructure for registration of:
+ * Power Management Quality of Service (PM QoS) support base.
*
- * Dependents on a QoS value : register requests
- * Watchers of QoS value : get notified when target QoS value changes
+ * Copyright (C) 2020 Intel Corporation
*
- * This QoS design is best effort based. Dependents register their QoS needs.
- * Watchers register to keep track of the current QoS needs of the system.
+ * Authors:
+ * Mark Gross <mgross@linux.intel.com>
+ * Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*
- * There are 3 basic classes of QoS parameter: latency, timeout, throughput
- * each have defined units:
- * latency: usec
- * timeout: usec <-- currently not used.
- * throughput: kbs (kilo byte / sec)
+ * Provided here is an interface for specifying PM QoS dependencies. It allows
+ * entities depending on QoS constraints to register their requests which are
+ * aggregated as appropriate to produce effective constraints (target values)
+ * that can be monitored by entities needing to respect them, either by polling
+ * or through a built-in notification mechanism.
*
- * There are lists of pm_qos_objects each one wrapping requests, notifiers
- *
- * User mode requests on a QOS parameter register themselves to the
- * subsystem by opening the device node /dev/... and writing there request to
- * the node. As long as the process holds a file handle open to the node the
- * client continues to be accounted for. Upon file release the usermode
- * request is removed and a new qos target is computed. This way when the
- * request that the application has is cleaned up when closes the file
- * pointer or exits the pm_qos_object will get an opportunity to clean up.
- *
- * Mark Gross <mgross@linux.intel.com>
+ * In addition to the basic functionality, more specific interfaces for managing
+ * global CPU latency QoS requests and frequency QoS requests are provided.
*/
/*#define DEBUG*/
@@ -54,56 +44,19 @@
* or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
* held, taken with _irqsave. One lock to rule them all
*/
-struct pm_qos_object {
- struct pm_qos_constraints *constraints;
- struct miscdevice pm_qos_power_miscdev;
- char *name;
-};
-
static DEFINE_SPINLOCK(pm_qos_lock);
-static struct pm_qos_object null_pm_qos;
-
-static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
-static struct pm_qos_constraints cpu_dma_constraints = {
- .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
- .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
- .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
- .type = PM_QOS_MIN,
- .notifiers = &cpu_dma_lat_notifier,
-};
-static struct pm_qos_object cpu_dma_pm_qos = {
- .constraints = &cpu_dma_constraints,
- .name = "cpu_dma_latency",
-};
-
-static struct pm_qos_object *pm_qos_array[] = {
- &null_pm_qos,
- &cpu_dma_pm_qos,
-};
-
-static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
- size_t count, loff_t *f_pos);
-static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
- size_t count, loff_t *f_pos);
-static int pm_qos_power_open(struct inode *inode, struct file *filp);
-static int pm_qos_power_release(struct inode *inode, struct file *filp);
-
-static const struct file_operations pm_qos_power_fops = {
- .write = pm_qos_power_write,
- .read = pm_qos_power_read,
- .open = pm_qos_power_open,
- .release = pm_qos_power_release,
- .llseek = noop_llseek,
-};
-
-/* unlocked internal variant */
-static inline int pm_qos_get_value(struct pm_qos_constraints *c)
+/**
+ * pm_qos_read_value - Return the current effective constraint value.
+ * @c: List of PM QoS constraint requests.
+ */
+s32 pm_qos_read_value(struct pm_qos_constraints *c)
{
- struct plist_node *node;
- int total_value = 0;
+ return READ_ONCE(c->target_value);
+}
+static int pm_qos_get_value(struct pm_qos_constraints *c)
+{
if (plist_head_empty(&c->list))
return c->no_constraint_value;
@@ -114,111 +67,42 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
case PM_QOS_MAX:
return plist_last(&c->list)->prio;
- case PM_QOS_SUM:
- plist_for_each(node, &c->list)
- total_value += node->prio;
-
- return total_value;
-
default:
- /* runtime check for not using enum */
- BUG();
+ WARN(1, "Unknown PM QoS type in %s\n", __func__);
return PM_QOS_DEFAULT_VALUE;
}
}
-s32 pm_qos_read_value(struct pm_qos_constraints *c)
-{
- return c->target_value;
-}
-
-static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
+static void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
{
- c->target_value = value;
+ WRITE_ONCE(c->target_value, value);
}
-static int pm_qos_debug_show(struct seq_file *s, void *unused)
-{
- struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
- struct pm_qos_constraints *c;
- struct pm_qos_request *req;
- char *type;
- unsigned long flags;
- int tot_reqs = 0;
- int active_reqs = 0;
-
- if (IS_ERR_OR_NULL(qos)) {
- pr_err("%s: bad qos param!\n", __func__);
- return -EINVAL;
- }
- c = qos->constraints;
- if (IS_ERR_OR_NULL(c)) {
- pr_err("%s: Bad constraints on qos?\n", __func__);
- return -EINVAL;
- }
-
- /* Lock to ensure we have a snapshot */
- spin_lock_irqsave(&pm_qos_lock, flags);
- if (plist_head_empty(&c->list)) {
- seq_puts(s, "Empty!\n");
- goto out;
- }
-
- switch (c->type) {
- case PM_QOS_MIN:
- type = "Minimum";
- break;
- case PM_QOS_MAX:
- type = "Maximum";
- break;
- case PM_QOS_SUM:
- type = "Sum";
- break;
- default:
- type = "Unknown";
- }
-
- plist_for_each_entry(req, &c->list, node) {
- char *state = "Default";
-
- if ((req->node).prio != c->default_value) {
- active_reqs++;
- state = "Active";
- }
- tot_reqs++;
- seq_printf(s, "%d: %d: %s\n", tot_reqs,
- (req->node).prio, state);
- }
-
- seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
- type, pm_qos_get_value(c), active_reqs, tot_reqs);
-
-out:
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(pm_qos_debug);
-
/**
- * pm_qos_update_target - manages the constraints list and calls the notifiers
- * if needed
- * @c: constraints data struct
- * @node: request to add to the list, to update or to remove
- * @action: action to take on the constraints list
- * @value: value of the request to add or update
+ * pm_qos_update_target - Update a list of PM QoS constraint requests.
+ * @c: List of PM QoS requests.
+ * @node: Target list entry.
+ * @action: Action to carry out (add, update or remove).
+ * @value: New request value for the target list entry.
*
- * This function returns 1 if the aggregated constraint value has changed, 0
- * otherwise.
+ * Update the given list of PM QoS constraint requests, @c, by carrying an
+ * @action involving the @node list entry and @value on it.
+ *
+ * The recognized values of @action are PM_QOS_ADD_REQ (store @value in @node
+ * and add it to the list), PM_QOS_UPDATE_REQ (remove @node from the list, store
+ * @value in it and add it to the list again), and PM_QOS_REMOVE_REQ (remove
+ * @node from the list, ignore @value).
+ *
+ * Return: 1 if the aggregate constraint value has changed, 0 otherwise.
*/
int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
enum pm_qos_req_action action, int value)
{
- unsigned long flags;
int prev_value, curr_value, new_value;
- int ret;
+ unsigned long flags;
spin_lock_irqsave(&pm_qos_lock, flags);
+
prev_value = pm_qos_get_value(c);
if (value == PM_QOS_DEFAULT_VALUE)
new_value = c->default_value;
@@ -231,9 +115,8 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
break;
case PM_QOS_UPDATE_REQ:
/*
- * to change the list, we atomically remove, reinit
- * with new value and add, then see if the extremal
- * changed
+ * To change the list, atomically remove, reinit with new value
+ * and add, then see if the aggregate has changed.
*/
plist_del(node, &c->list);
/* fall through */
@@ -252,16 +135,14 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
spin_unlock_irqrestore(&pm_qos_lock, flags);
trace_pm_qos_update_target(action, prev_value, curr_value);
- if (prev_value != curr_value) {
- ret = 1;
- if (c->notifiers)
- blocking_notifier_call_chain(c->notifiers,
- (unsigned long)curr_value,
- NULL);
- } else {
- ret = 0;
- }
- return ret;
+
+ if (prev_value == curr_value)
+ return 0;
+
+ if (c->notifiers)
+ blocking_notifier_call_chain(c->notifiers, curr_value, NULL);
+
+ return 1;
}
/**
@@ -283,14 +164,12 @@ static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
/**
* pm_qos_update_flags - Update a set of PM QoS flags.
- * @pqf: Set of flags to update.
+ * @pqf: Set of PM QoS flags to update.
* @req: Request to add to the set, to modify, or to remove from the set.
* @action: Action to take on the set.
* @val: Value of the request to add or modify.
*
- * Update the given set of PM QoS flags and call notifiers if the aggregate
- * value has changed. Returns 1 if the aggregate constraint value has changed,
- * 0 otherwise.
+ * Return: 1 if the aggregate constraint value has changed, 0 otherwise.
*/
bool pm_qos_update_flags(struct pm_qos_flags *pqf,
struct pm_qos_flags_request *req,
@@ -326,288 +205,180 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
spin_unlock_irqrestore(&pm_qos_lock, irqflags);
trace_pm_qos_update_flags(action, prev_value, curr_value);
- return prev_value != curr_value;
-}
-/**
- * pm_qos_request - returns current system wide qos expectation
- * @pm_qos_class: identification of which qos value is requested
- *
- * This function returns the current target value.
- */
-int pm_qos_request(int pm_qos_class)
-{
- return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
-}
-EXPORT_SYMBOL_GPL(pm_qos_request);
-
-int pm_qos_request_active(struct pm_qos_request *req)
-{
- return req->pm_qos_class != 0;
+ return prev_value != curr_value;
}
-EXPORT_SYMBOL_GPL(pm_qos_request_active);
-static void __pm_qos_update_request(struct pm_qos_request *req,
- s32 new_value)
-{
- trace_pm_qos_update_request(req->pm_qos_class, new_value);
+#ifdef CONFIG_CPU_IDLE
+/* Definitions related to the CPU latency QoS. */
- if (new_value != req->node.prio)
- pm_qos_update_target(
- pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
-}
+static struct pm_qos_constraints cpu_latency_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_latency_constraints.list),
+ .target_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE,
+ .default_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+};
/**
- * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
- * @work: work struct for the delayed work (timeout)
- *
- * This cancels the timeout request by falling back to the default at timeout.
+ * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit.
*/
-static void pm_qos_work_fn(struct work_struct *work)
+s32 cpu_latency_qos_limit(void)
{
- struct pm_qos_request *req = container_of(to_delayed_work(work),
- struct pm_qos_request,
- work);
-
- __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
+ return pm_qos_read_value(&cpu_latency_constraints);
}
/**
- * pm_qos_add_request - inserts new qos request into the list
- * @req: pointer to a preallocated handle
- * @pm_qos_class: identifies which list of qos request to use
- * @value: defines the qos request
+ * cpu_latency_qos_request_active - Check the given PM QoS request.
+ * @req: PM QoS request to check.
*
- * This function inserts a new entry in the pm_qos_class list of requested qos
- * performance characteristics. It recomputes the aggregate QoS expectations
- * for the pm_qos_class of parameters and initializes the pm_qos_request
- * handle. Caller needs to save this handle for later use in updates and
- * removal.
+ * Return: 'true' if @req has been added to the CPU latency QoS list, 'false'
+ * otherwise.
*/
-
-void pm_qos_add_request(struct pm_qos_request *req,
- int pm_qos_class, s32 value)
+bool cpu_latency_qos_request_active(struct pm_qos_request *req)
{
- if (!req) /*guard against callers passing in null */
- return;
+ return req->qos == &cpu_latency_constraints;
+}
+EXPORT_SYMBOL_GPL(cpu_latency_qos_request_active);
- if (pm_qos_request_active(req)) {
- WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
- return;
- }
- req->pm_qos_class = pm_qos_class;
- INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
- trace_pm_qos_add_request(pm_qos_class, value);
- pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
- &req->node, PM_QOS_ADD_REQ, value);
+static void cpu_latency_qos_apply(struct pm_qos_request *req,
+ enum pm_qos_req_action action, s32 value)
+{
+ int ret = pm_qos_update_target(req->qos, &req->node, action, value);
+ if (ret > 0)
+ wake_up_all_idle_cpus();
}
-EXPORT_SYMBOL_GPL(pm_qos_add_request);
/**
- * pm_qos_update_request - modifies an existing qos request
- * @req : handle to list element holding a pm_qos request to use
- * @value: defines the qos request
+ * cpu_latency_qos_add_request - Add new CPU latency QoS request.
+ * @req: Pointer to a preallocated handle.
+ * @value: Requested constraint value.
*
- * Updates an existing qos request for the pm_qos_class of parameters along
- * with updating the target pm_qos_class value.
+ * Use @value to initialize the request handle pointed to by @req, insert it as
+ * a new entry to the CPU latency QoS list and recompute the effective QoS
+ * constraint for that list.
*
- * Attempts are made to make this code callable on hot code paths.
+ * Callers need to save the handle for later use in updates and removal of the
+ * QoS request represented by it.
*/
-void pm_qos_update_request(struct pm_qos_request *req,
- s32 new_value)
+void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value)
{
- if (!req) /*guard against callers passing in null */
+ if (!req)
return;
- if (!pm_qos_request_active(req)) {
- WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
+ if (cpu_latency_qos_request_active(req)) {
+ WARN(1, KERN_ERR "%s called for already added request\n", __func__);
return;
}
- cancel_delayed_work_sync(&req->work);
- __pm_qos_update_request(req, new_value);
+ trace_pm_qos_add_request(value);
+
+ req->qos = &cpu_latency_constraints;
+ cpu_latency_qos_apply(req, PM_QOS_ADD_REQ, value);
}
-EXPORT_SYMBOL_GPL(pm_qos_update_request);
+EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request);
/**
- * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
- * @req : handle to list element holding a pm_qos request to use
- * @new_value: defines the temporal qos request
- * @timeout_us: the effective duration of this qos request in usecs.
+ * cpu_latency_qos_update_request - Modify existing CPU latency QoS request.
+ * @req : QoS request to update.
+ * @new_value: New requested constraint value.
*
- * After timeout_us, this qos request is cancelled automatically.
+ * Use @new_value to update the QoS request represented by @req in the CPU
+ * latency QoS list along with updating the effective constraint value for that
+ * list.
*/
-void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
- unsigned long timeout_us)
+void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value)
{
if (!req)
return;
- if (WARN(!pm_qos_request_active(req),
- "%s called for unknown object.", __func__))
+
+ if (!cpu_latency_qos_request_active(req)) {
+ WARN(1, KERN_ERR "%s called for unknown object\n", __func__);
return;
+ }
- cancel_delayed_work_sync(&req->work);
+ trace_pm_qos_update_request(new_value);
- trace_pm_qos_update_request_timeout(req->pm_qos_class,
- new_value, timeout_us);
- if (new_value != req->node.prio)
- pm_qos_update_target(
- pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
+ if (new_value == req->node.prio)
+ return;
- schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
+ cpu_latency_qos_apply(req, PM_QOS_UPDATE_REQ, new_value);
}
+EXPORT_SYMBOL_GPL(cpu_latency_qos_update_request);
/**
- * pm_qos_remove_request - modifies an existing qos request
- * @req: handle to request list element
+ * cpu_latency_qos_remove_request - Remove existing CPU latency QoS request.
+ * @req: QoS request to remove.
*
- * Will remove pm qos request from the list of constraints and
- * recompute the current target value for the pm_qos_class. Call this
- * on slow code paths.
+ * Remove the CPU latency QoS request represented by @req from the CPU latency
+ * QoS list along with updating the effective constraint value for that list.
*/
-void pm_qos_remove_request(struct pm_qos_request *req)
+void cpu_latency_qos_remove_request(struct pm_qos_request *req)
{
- if (!req) /*guard against callers passing in null */
+ if (!req)
return;
- /* silent return to keep pcm code cleaner */
- if (!pm_qos_request_active(req)) {
- WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+ if (!cpu_latency_qos_request_active(req)) {
+ WARN(1, KERN_ERR "%s called for unknown object\n", __func__);
return;
}
- cancel_delayed_work_sync(&req->work);
+ trace_pm_qos_remove_request(PM_QOS_DEFAULT_VALUE);
- trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
- pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_REMOVE_REQ,
- PM_QOS_DEFAULT_VALUE);
+ cpu_latency_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
-EXPORT_SYMBOL_GPL(pm_qos_remove_request);
-
-/**
- * pm_qos_add_notifier - sets notification entry for changes to target value
- * @pm_qos_class: identifies which qos target changes should be notified.
- * @notifier: notifier block managed by caller.
- *
- * will register the notifier into a notification chain that gets called
- * upon changes to the pm_qos_class target value.
- */
-int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
-{
- int retval;
-
- retval = blocking_notifier_chain_register(
- pm_qos_array[pm_qos_class]->constraints->notifiers,
- notifier);
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
-
-/**
- * pm_qos_remove_notifier - deletes notification entry from chain.
- * @pm_qos_class: identifies which qos target changes are notified.
- * @notifier: notifier block to be removed.
- *
- * will remove the notifier from the notification chain that gets called
- * upon changes to the pm_qos_class target value.
- */
-int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
-{
- int retval;
+EXPORT_SYMBOL_GPL(cpu_latency_qos_remove_request);
- retval = blocking_notifier_chain_unregister(
- pm_qos_array[pm_qos_class]->constraints->notifiers,
- notifier);
+/* User space interface to the CPU latency QoS via misc device. */
- return retval;
-}
-EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
-
-/* User space interface to PM QoS classes via misc devices */
-static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
+static int cpu_latency_qos_open(struct inode *inode, struct file *filp)
{
- qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
- qos->pm_qos_power_miscdev.name = qos->name;
- qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
-
- debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos,
- &pm_qos_debug_fops);
+ struct pm_qos_request *req;
- return misc_register(&qos->pm_qos_power_miscdev);
-}
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
-static int find_pm_qos_object_by_minor(int minor)
-{
- int pm_qos_class;
+ cpu_latency_qos_add_request(req, PM_QOS_DEFAULT_VALUE);
+ filp->private_data = req;
- for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY;
- pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
- if (minor ==
- pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
- return pm_qos_class;
- }
- return -1;
+ return 0;
}
-static int pm_qos_power_open(struct inode *inode, struct file *filp)
+static int cpu_latency_qos_release(struct inode *inode, struct file *filp)
{
- long pm_qos_class;
-
- pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
- if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) {
- struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
- if (!req)
- return -ENOMEM;
-
- pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
- filp->private_data = req;
-
- return 0;
- }
- return -EPERM;
-}
+ struct pm_qos_request *req = filp->private_data;
-static int pm_qos_power_release(struct inode *inode, struct file *filp)
-{
- struct pm_qos_request *req;
+ filp->private_data = NULL;
- req = filp->private_data;
- pm_qos_remove_request(req);
+ cpu_latency_qos_remove_request(req);
kfree(req);
return 0;
}
-
-static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
- size_t count, loff_t *f_pos)
+static ssize_t cpu_latency_qos_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos)
{
- s32 value;
- unsigned long flags;
struct pm_qos_request *req = filp->private_data;
+ unsigned long flags;
+ s32 value;
- if (!req)
- return -EINVAL;
- if (!pm_qos_request_active(req))
+ if (!req || !cpu_latency_qos_request_active(req))
return -EINVAL;
spin_lock_irqsave(&pm_qos_lock, flags);
- value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
+ value = pm_qos_get_value(&cpu_latency_constraints);
spin_unlock_irqrestore(&pm_qos_lock, flags);
return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
}
-static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
- size_t count, loff_t *f_pos)
+static ssize_t cpu_latency_qos_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *f_pos)
{
s32 value;
- struct pm_qos_request *req;
if (count == sizeof(s32)) {
if (copy_from_user(&value, buf, sizeof(s32)))
@@ -620,36 +391,38 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
return ret;
}
- req = filp->private_data;
- pm_qos_update_request(req, value);
+ cpu_latency_qos_update_request(filp->private_data, value);
return count;
}
+static const struct file_operations cpu_latency_qos_fops = {
+ .write = cpu_latency_qos_write,
+ .read = cpu_latency_qos_read,
+ .open = cpu_latency_qos_open,
+ .release = cpu_latency_qos_release,
+ .llseek = noop_llseek,
+};
-static int __init pm_qos_power_init(void)
-{
- int ret = 0;
- int i;
- struct dentry *d;
-
- BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
+static struct miscdevice cpu_latency_qos_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "cpu_dma_latency",
+ .fops = &cpu_latency_qos_fops,
+};
- d = debugfs_create_dir("pm_qos", NULL);
+static int __init cpu_latency_qos_init(void)
+{
+ int ret;
- for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
- ret = register_pm_qos_misc(pm_qos_array[i], d);
- if (ret < 0) {
- pr_err("%s: %s setup failed\n",
- __func__, pm_qos_array[i]->name);
- return ret;
- }
- }
+ ret = misc_register(&cpu_latency_qos_miscdev);
+ if (ret < 0)
+ pr_err("%s: %s setup failed\n", __func__,
+ cpu_latency_qos_miscdev.name);
return ret;
}
-
-late_initcall(pm_qos_power_init);
+late_initcall(cpu_latency_qos_init);
+#endif /* CONFIG_CPU_IDLE */
/* Definitions related to the frequency QoS below. */
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 77438954cc2b..58ed9478787f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -409,21 +409,7 @@ snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
switch (cmd) {
case SNAPSHOT_GET_IMAGE_SIZE:
case SNAPSHOT_AVAIL_SWAP_SIZE:
- case SNAPSHOT_ALLOC_SWAP_PAGE: {
- compat_loff_t __user *uoffset = compat_ptr(arg);
- loff_t offset;
- mm_segment_t old_fs;
- int err;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
- set_fs(old_fs);
- if (!err && put_user(offset, uoffset))
- err = -EFAULT;
- return err;
- }
-
+ case SNAPSHOT_ALLOC_SWAP_PAGE:
case SNAPSHOT_CREATE_IMAGE:
return snapshot_ioctl(file, cmd,
(unsigned long) compat_ptr(arg));
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 82d5fba48b2f..f91f2c2cf138 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,6 +3,10 @@
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
+ifeq ($(CONFIG_KCSAN),y)
+KBUILD_CFLAGS += -g -fno-omit-frame-pointer
+endif
+
obj-y += update.o sync.o
obj-$(CONFIG_TREE_SRCU) += srcutree.o
obj-$(CONFIG_TINY_SRCU) += srcutiny.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 05f936ed167a..00ddc92c5774 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -198,6 +198,13 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+extern int rcu_cpu_stall_suppress_at_boot;
+
+static inline bool rcu_stall_is_suppressed_at_boot(void)
+{
+ return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended();
+}
+
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_ftrace_dump;
@@ -205,6 +212,11 @@ extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
int rcu_jiffies_till_stall_check(void);
+static inline bool rcu_stall_is_suppressed(void)
+{
+ return rcu_stall_is_suppressed_at_boot() || rcu_cpu_stall_suppress;
+}
+
#define rcu_ftrace_dump_stall_suppress() \
do { \
if (!rcu_cpu_stall_suppress) \
@@ -218,6 +230,11 @@ do { \
} while (0)
#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */
+
+static inline bool rcu_stall_is_suppressed(void)
+{
+ return rcu_stall_is_suppressed_at_boot();
+}
#define rcu_ftrace_dump_stall_suppress()
#define rcu_ftrace_dump_stall_unsuppress()
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
@@ -325,7 +342,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* Iterate over all possible CPUs in a leaf RCU node.
*/
#define for_each_leaf_node_possible_cpu(rnp, cpu) \
- for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
+ for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \
+ (cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
(cpu) <= rnp->grphi; \
(cpu) = cpumask_next((cpu), cpu_possible_mask))
@@ -335,7 +353,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
#define rcu_find_next_bit(rnp, cpu, mask) \
((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu)))
#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \
- for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \
+ for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \
+ (cpu) = rcu_find_next_bit((rnp), 0, (mask)); \
(cpu) <= rnp->grphi; \
(cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 5f4fd3b8777c..9a0f66133b4b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -182,7 +182,7 @@ void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
{
return rcu_segcblist_is_enabled(rsclp) &&
- &rsclp->head != rsclp->tails[RCU_DONE_TAIL];
+ &rsclp->head != READ_ONCE(rsclp->tails[RCU_DONE_TAIL]);
}
/*
@@ -381,8 +381,6 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
return; /* Nothing to do. */
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
- rclp->head = NULL;
- rclp->tail = &rclp->head;
}
/*
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index da94b89cd531..a4a8d097d84d 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/err.h>
@@ -611,6 +612,7 @@ kfree_perf_thread(void *arg)
long me = (long)arg;
struct kfree_obj *alloc_ptr;
u64 start_time, end_time;
+ long long mem_begin, mem_during = 0;
VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
@@ -626,6 +628,12 @@ kfree_perf_thread(void *arg)
}
do {
+ if (!mem_during) {
+ mem_during = mem_begin = si_mem_available();
+ } else if (loop % (kfree_loops / 4) == 0) {
+ mem_during = (mem_during + si_mem_available()) / 2;
+ }
+
for (i = 0; i < kfree_alloc_num; i++) {
alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
if (!alloc_ptr)
@@ -645,9 +653,11 @@ kfree_perf_thread(void *arg)
else
b_rcu_gp_test_finished = cur_ops->get_gp_seq();
- pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n",
+ pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n",
(unsigned long long)(end_time - start_time), kfree_loops,
- rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started));
+ rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started),
+ (mem_begin - mem_during) >> (20 - PAGE_SHIFT));
+
if (shutdown) {
smp_mb(); /* Assign before wake. */
wake_up(&shutdown_wq);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 1aeecc165b21..5453bd557f43 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -339,7 +339,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!rcu_fwd_cb_nodelay &&
+ if (!READ_ONCE(rcu_fwd_cb_nodelay) &&
!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
@@ -375,11 +375,12 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp)
{
int i;
- i = rp->rtort_pipe_count;
+ i = READ_ONCE(rp->rtort_pipe_count);
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ WRITE_ONCE(rp->rtort_pipe_count, i + 1);
+ if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
rp->rtort_mbtest = 0;
return true;
}
@@ -1015,7 +1016,8 @@ rcu_torture_writer(void *arg)
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
- old_rp->rtort_pipe_count++;
+ WRITE_ONCE(old_rp->rtort_pipe_count,
+ old_rp->rtort_pipe_count + 1);
switch (synctype[torture_random(&rand) % nsynctypes]) {
case RTWS_DEF_FREE:
rcu_torture_writer_state = RTWS_DEF_FREE;
@@ -1067,7 +1069,8 @@ rcu_torture_writer(void *arg)
if (stutter_wait("rcu_torture_writer") &&
!READ_ONCE(rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
- !torture_must_stop())
+ !torture_must_stop() &&
+ rcu_inkernel_boot_has_ended())
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) !=
@@ -1290,7 +1293,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
atomic_inc(&n_rcu_torture_mberror);
rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp);
preempt_disable();
- pipe_count = p->rtort_pipe_count;
+ pipe_count = READ_ONCE(p->rtort_pipe_count);
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
@@ -1404,14 +1407,15 @@ rcu_torture_stats_print(void)
int i;
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ struct rcu_torture *rtcp;
static unsigned long rtcv_snap = ULONG_MAX;
static bool splatted;
struct task_struct *wtp;
for_each_possible_cpu(cpu) {
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
- pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
- batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+ pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]);
+ batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
}
for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
@@ -1420,9 +1424,10 @@ rcu_torture_stats_print(void)
}
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ rtcp = rcu_access_pointer(rcu_torture_current);
pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
- rcu_torture_current,
- rcu_torture_current ? "ver" : "VER",
+ rtcp,
+ rtcp && !rcu_stall_is_suppressed_at_boot() ? "ver" : "VER",
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
atomic_read(&n_rcu_torture_alloc),
@@ -1478,7 +1483,8 @@ rcu_torture_stats_print(void)
if (cur_ops->stats)
cur_ops->stats();
if (rtcv_snap == rcu_torture_current_version &&
- rcu_torture_current != NULL) {
+ rcu_access_pointer(rcu_torture_current) &&
+ !rcu_stall_is_suppressed()) {
int __maybe_unused flags = 0;
unsigned long __maybe_unused gp_seq = 0;
@@ -1993,8 +1999,11 @@ static int rcu_torture_fwd_prog(void *args)
schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
WRITE_ONCE(rcu_fwd_emergency_stop, false);
register_oom_notifier(&rcutorture_oom_nb);
- rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
- rcu_torture_fwd_prog_cr(rfp);
+ if (!IS_ENABLED(CONFIG_TINY_RCU) ||
+ rcu_inkernel_boot_has_ended())
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
+ if (rcu_inkernel_boot_has_ended())
+ rcu_torture_fwd_prog_cr(rfp);
unregister_oom_notifier(&rcutorture_oom_nb);
/* Avoid slow periods, better to test when busy. */
@@ -2044,6 +2053,14 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
atomic_inc(&barrier_cbs_invoked);
}
+/* IPI handler to get callback posted on desired CPU, if online. */
+static void rcu_torture_barrier1cb(void *rcu_void)
+{
+ struct rcu_head *rhp = rcu_void;
+
+ cur_ops->call(rhp, rcu_torture_barrier_cbf);
+}
+
/* kthread function to register callbacks used to test RCU barriers. */
static int rcu_torture_barrier_cbs(void *arg)
{
@@ -2067,9 +2084,11 @@ static int rcu_torture_barrier_cbs(void *arg)
* The above smp_load_acquire() ensures barrier_phase load
* is ordered before the following ->call().
*/
- local_irq_disable(); /* Just to test no-irq call_rcu(). */
- cur_ops->call(&rcu, rcu_torture_barrier_cbf);
- local_irq_enable();
+ if (smp_call_function_single(myid, rcu_torture_barrier1cb,
+ &rcu, 1)) {
+ // IPI failed, so use direct call from current CPU.
+ cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+ }
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
} while (!torture_must_stop());
@@ -2105,7 +2124,21 @@ static int rcu_torture_barrier(void *arg)
pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
atomic_read(&barrier_cbs_invoked),
n_barrier_cbs);
- WARN_ON_ONCE(1);
+ WARN_ON(1);
+ // Wait manually for the remaining callbacks
+ i = 0;
+ do {
+ if (WARN_ON(i++ > HZ))
+ i = INT_MIN;
+ schedule_timeout_interruptible(1);
+ cur_ops->cb_barrier();
+ } while (atomic_read(&barrier_cbs_invoked) !=
+ n_barrier_cbs &&
+ !torture_must_stop());
+ smp_mb(); // Can't trust ordering if broken.
+ if (!torture_must_stop())
+ pr_err("Recovered: barrier_cbs_invoked = %d\n",
+ atomic_read(&barrier_cbs_invoked));
} else {
n_barrier_successes++;
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 657e6a7d1c03..0c71505f0e19 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -5,7 +5,7 @@
* Copyright (C) IBM Corporation, 2006
* Copyright (C) Fujitsu, 2012
*
- * Author: Paul McKenney <paulmck@linux.ibm.com>
+ * Authors: Paul McKenney <paulmck@linux.ibm.com>
* Lai Jiangshan <laijs@cn.fujitsu.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
@@ -450,7 +450,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
rcu_seq_start(&ssp->srcu_gp_seq);
- state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
+ state = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
@@ -534,7 +534,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
- ssp->srcu_gp_seq_needed_exp = gpseq;
+ WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
spin_unlock_irq_rcu_node(ssp);
mutex_unlock(&ssp->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
@@ -550,7 +550,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
snp->srcu_have_cbs[idx] = gpseq;
rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
- snp->srcu_gp_seq_needed_exp = gpseq;
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
mask = snp->srcu_data_have_cbs[idx];
snp->srcu_data_have_cbs[idx] = 0;
spin_unlock_irq_rcu_node(snp);
@@ -614,7 +614,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
}
spin_lock_irqsave_rcu_node(ssp, flags);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- ssp->srcu_gp_seq_needed_exp = s;
+ WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(ssp, flags);
}
@@ -660,7 +660,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
if (snp == sdp->mynode)
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
- snp->srcu_gp_seq_needed_exp = s;
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(snp, flags);
}
@@ -674,7 +674,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
}
if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- ssp->srcu_gp_seq_needed_exp = s;
+ WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
/* If grace period not already done and none in progress, start it. */
if (!rcu_seq_done(&ssp->srcu_gp_seq, s) &&
@@ -1079,7 +1079,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
*/
unsigned long srcu_batches_completed(struct srcu_struct *ssp)
{
- return ssp->srcu_idx;
+ return READ_ONCE(ssp->srcu_idx);
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);
@@ -1130,7 +1130,9 @@ static void srcu_advance_state(struct srcu_struct *ssp)
return; /* readers present, retry later. */
}
srcu_flip(ssp);
+ spin_lock_irq_rcu_node(ssp);
rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
+ spin_unlock_irq_rcu_node(ssp);
}
if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d91c9156fab2..06548e2ebb72 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1,12 +1,12 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * Read-Copy Update mechanism for mutual exclusion
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
*
* Copyright IBM Corporation, 2008
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
- * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*
* Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
@@ -150,6 +150,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
+static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
/* rcuc/rcub kthread realtime priority */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
@@ -342,14 +343,17 @@ bool rcu_eqs_special_set(int cpu)
{
int old;
int new;
+ int new_old;
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ new_old = atomic_read(&rdp->dynticks);
do {
- old = atomic_read(&rdp->dynticks);
+ old = new_old;
if (old & RCU_DYNTICK_CTRL_CTR)
return false;
new = old | RCU_DYNTICK_CTRL_MASK;
- } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old);
+ new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
+ } while (new_old != old);
return true;
}
@@ -410,10 +414,15 @@ static long blimit = DEFAULT_RCU_BLIMIT;
static long qhimark = DEFAULT_RCU_QHIMARK;
#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
static long qlowmark = DEFAULT_RCU_QLOMARK;
+#define DEFAULT_RCU_QOVLD_MULT 2
+#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
+static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */
+static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */
module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
+module_param(qovld, long, 0444);
static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
@@ -818,11 +827,12 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
incby = 1;
} else if (tick_nohz_full_cpu(rdp->cpu) &&
rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
- READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
+ READ_ONCE(rdp->rcu_urgent_qs) &&
+ !READ_ONCE(rdp->rcu_forced_tick)) {
raw_spin_lock_rcu_node(rdp->mynode);
// Recheck under lock.
if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
- rdp->rcu_forced_tick = true;
+ WRITE_ONCE(rdp->rcu_forced_tick, true);
tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
}
raw_spin_unlock_rcu_node(rdp->mynode);
@@ -899,7 +909,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
- rdp->rcu_forced_tick = false;
+ WRITE_ONCE(rdp->rcu_forced_tick, false);
}
}
@@ -1072,7 +1082,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
if (!READ_ONCE(*rnhqp) &&
(time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
- time_after(jiffies, rcu_state.jiffies_resched))) {
+ time_after(jiffies, rcu_state.jiffies_resched) ||
+ rcu_state.cbovld)) {
WRITE_ONCE(*rnhqp, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
smp_store_release(ruqp, true);
@@ -1089,8 +1100,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* So hit them over the head with the resched_cpu() hammer!
*/
if (tick_nohz_full_cpu(rdp->cpu) &&
- time_after(jiffies,
- READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
+ (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
+ rcu_state.cbovld)) {
WRITE_ONCE(*ruqp, true);
resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
@@ -1113,6 +1124,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
(rnp->ffmask & rdp->grpmask)) {
init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
+ atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ);
rdp->rcu_iw_pending = true;
rdp->rcu_iw_gp_seq = rnp->gp_seq;
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
@@ -1126,8 +1138,9 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
unsigned long gp_seq_req, const char *s)
{
- trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req,
- rnp->level, rnp->grplo, rnp->grphi, s);
+ trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ gp_seq_req, rnp->level,
+ rnp->grplo, rnp->grphi, s);
}
/*
@@ -1174,7 +1187,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
TPS("Prestarted"));
goto unlock_out;
}
- rnp->gp_seq_needed = gp_seq_req;
+ WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req);
if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
/*
* We just marked the leaf or internal node, and a
@@ -1199,18 +1212,18 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
}
trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
- rcu_state.gp_req_activity = jiffies;
- if (!rcu_state.gp_kthread) {
+ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
+ if (!READ_ONCE(rcu_state.gp_kthread)) {
trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
goto unlock_out;
}
- trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq"));
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
ret = true; /* Caller must wake GP kthread. */
unlock_out:
/* Push furthest requested GP to leaf node and rcu_data structure. */
if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
- rnp_start->gp_seq_needed = rnp->gp_seq_needed;
- rdp->gp_seq_needed = rnp->gp_seq_needed;
+ WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed);
+ WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
}
if (rnp != rnp_start)
raw_spin_unlock_rcu_node(rnp);
@@ -1235,12 +1248,13 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
}
/*
- * Awaken the grace-period kthread. Don't do a self-awaken (unless in
- * an interrupt or softirq handler), and don't bother awakening when there
- * is nothing for the grace-period kthread to do (as in several CPUs raced
- * to awaken, and we lost), and finally don't try to awaken a kthread that
- * has not yet been created. If all those checks are passed, track some
- * debug information and awaken.
+ * Awaken the grace-period kthread. Don't do a self-awaken (unless in an
+ * interrupt or softirq handler, in which case we just might immediately
+ * sleep upon return, resulting in a grace-period hang), and don't bother
+ * awakening when there is nothing for the grace-period kthread to do
+ * (as in several CPUs raced to awaken, we lost), and finally don't try
+ * to awaken a kthread that has not yet been created. If all those checks
+ * are passed, track some debug information and awaken.
*
* So why do the self-wakeup when in an interrupt or softirq handler
* in the grace-period kthread's context? Because the kthread might have
@@ -1250,10 +1264,10 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
*/
static void rcu_gp_kthread_wake(void)
{
- if ((current == rcu_state.gp_kthread &&
- !in_irq() && !in_serving_softirq()) ||
- !READ_ONCE(rcu_state.gp_flags) ||
- !rcu_state.gp_kthread)
+ struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
+
+ if ((current == t && !in_irq() && !in_serving_softirq()) ||
+ !READ_ONCE(rcu_state.gp_flags) || !t)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
@@ -1321,7 +1335,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
rcu_lockdep_assert_cblist_protected(rdp);
c = rcu_seq_snap(&rcu_state.gp_seq);
- if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
+ if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
/* Old request still live, so mark recent callbacks. */
(void)rcu_segcblist_accelerate(&rdp->cblist, c);
return;
@@ -1386,7 +1400,7 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
bool ret = false;
- bool need_gp;
+ bool need_qs;
const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
rcu_segcblist_is_offloaded(&rdp->cblist);
@@ -1400,10 +1414,13 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
unlikely(READ_ONCE(rdp->gpwrap))) {
if (!offloaded)
ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
+ rdp->core_needs_qs = false;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
} else {
if (!offloaded)
ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
+ if (rdp->core_needs_qs)
+ rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
}
/* Now handle the beginnings of any new-to-this-CPU grace periods. */
@@ -1415,14 +1432,14 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
* go looking for one.
*/
trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
- need_gp = !!(rnp->qsmask & rdp->grpmask);
- rdp->cpu_no_qs.b.norm = need_gp;
- rdp->core_needs_qs = need_gp;
+ need_qs = !!(rnp->qsmask & rdp->grpmask);
+ rdp->cpu_no_qs.b.norm = need_qs;
+ rdp->core_needs_qs = need_qs;
zero_cpu_stall_ticks(rdp);
}
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
- rdp->gp_seq_needed = rnp->gp_seq_needed;
+ WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
return ret;
@@ -1651,8 +1668,7 @@ static void rcu_gp_fqs_loop(void)
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
jiffies + (j ? 3 * j : 2));
}
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
rcu_state.gp_state = RCU_GP_WAIT_FQS;
ret = swait_event_idle_timeout_exclusive(
@@ -1666,13 +1682,11 @@ static void rcu_gp_fqs_loop(void)
/* If time for quiescent-state forcing, do it. */
if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) ||
(gf & RCU_GP_FLAG_FQS)) {
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqsstart"));
rcu_gp_fqs(first_gp_fqs);
first_gp_fqs = false;
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqsend"));
cond_resched_tasks_rcu_qs();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
@@ -1683,8 +1697,7 @@ static void rcu_gp_fqs_loop(void)
cond_resched_tasks_rcu_qs();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
WARN_ON(signal_pending(current));
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswaitsig"));
ret = 1; /* Keep old FQS timing. */
j = jiffies;
@@ -1701,8 +1714,9 @@ static void rcu_gp_fqs_loop(void)
*/
static void rcu_gp_cleanup(void)
{
- unsigned long gp_duration;
+ int cpu;
bool needgp = false;
+ unsigned long gp_duration;
unsigned long new_gp_seq;
bool offloaded;
struct rcu_data *rdp;
@@ -1748,6 +1762,12 @@ static void rcu_gp_cleanup(void)
needgp = __note_gp_changes(rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
needgp = rcu_future_gp_cleanup(rnp) || needgp;
+ // Reset overload indication for CPUs no longer overloaded
+ if (rcu_is_leaf_node(rnp))
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ check_cb_ovld_locked(rdp, rnp);
+ }
sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq_rcu_node(rnp);
rcu_nocb_gp_cleanup(sq);
@@ -1774,9 +1794,9 @@ static void rcu_gp_cleanup(void)
rcu_segcblist_is_offloaded(&rdp->cblist);
if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
- rcu_state.gp_req_activity = jiffies;
+ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ rcu_state.gp_seq,
TPS("newreq"));
} else {
WRITE_ONCE(rcu_state.gp_flags,
@@ -1795,8 +1815,7 @@ static int __noreturn rcu_gp_kthread(void *unused)
/* Handle grace-period start. */
for (;;) {
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwait"));
rcu_state.gp_state = RCU_GP_WAIT_GPS;
swait_event_idle_exclusive(rcu_state.gp_wq,
@@ -1809,8 +1828,7 @@ static int __noreturn rcu_gp_kthread(void *unused)
cond_resched_tasks_rcu_qs();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
WARN_ON(signal_pending(current));
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwaitsig"));
}
@@ -1881,7 +1899,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
rcu_preempt_blocked_readers_cgp(rnp));
- rnp->qsmask &= ~mask;
+ WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
mask, rnp->qsmask, rnp->level,
rnp->grplo, rnp->grphi,
@@ -1904,7 +1922,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
rnp_c = rnp;
rnp = rnp->parent;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- oldmask = rnp_c->qsmask;
+ oldmask = READ_ONCE(rnp_c->qsmask);
}
/*
@@ -1987,6 +2005,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
return;
}
mask = rdp->grpmask;
+ if (rdp->cpu == smp_processor_id())
+ rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
@@ -2052,7 +2072,7 @@ int rcutree_dying_cpu(unsigned int cpu)
return 0;
blkd = !!(rnp->qsmask & rdp->grpmask);
- trace_rcu_grace_period(rcu_state.name, rnp->gp_seq,
+ trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
return 0;
}
@@ -2294,10 +2314,13 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
struct rcu_data *rdp;
struct rcu_node *rnp;
+ rcu_state.cbovld = rcu_state.cbovldnext;
+ rcu_state.cbovldnext = false;
rcu_for_each_leaf_node(rnp) {
cond_resched_tasks_rcu_qs();
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rcu_state.cbovldnext |= !!rnp->cbovldmask;
if (rnp->qsmask == 0) {
if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
rcu_preempt_blocked_readers_cgp(rnp)) {
@@ -2579,11 +2602,48 @@ static void rcu_leak_callback(struct rcu_head *rhp)
}
/*
- * Helper function for call_rcu() and friends. The cpu argument will
- * normally be -1, indicating "currently running CPU". It may specify
- * a CPU only if that CPU is a no-CBs CPU. Currently, only rcu_barrier()
- * is expected to specify a CPU.
+ * Check and if necessary update the leaf rcu_node structure's
+ * ->cbovldmask bit corresponding to the current CPU based on that CPU's
+ * number of queued RCU callbacks. The caller must hold the leaf rcu_node
+ * structure's ->lock.
*/
+static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp)
+{
+ raw_lockdep_assert_held_rcu_node(rnp);
+ if (qovld_calc <= 0)
+ return; // Early boot and wildcard value set.
+ if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc)
+ WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask);
+ else
+ WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask);
+}
+
+/*
+ * Check and if necessary update the leaf rcu_node structure's
+ * ->cbovldmask bit corresponding to the current CPU based on that CPU's
+ * number of queued RCU callbacks. No locks need be held, but the
+ * caller must have disabled interrupts.
+ *
+ * Note that this function ignores the possibility that there are a lot
+ * of callbacks all of which have already seen the end of their respective
+ * grace periods. This omission is due to the need for no-CBs CPUs to
+ * be holding ->nocb_lock to do this check, which is too heavy for a
+ * common-case operation.
+ */
+static void check_cb_ovld(struct rcu_data *rdp)
+{
+ struct rcu_node *const rnp = rdp->mynode;
+
+ if (qovld_calc <= 0 ||
+ ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) ==
+ !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask)))
+ return; // Early boot wildcard value or already set correctly.
+ raw_spin_lock_rcu_node(rnp);
+ check_cb_ovld_locked(rdp, rnp);
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+/* Helper function for call_rcu() and friends. */
static void
__call_rcu(struct rcu_head *head, rcu_callback_t func)
{
@@ -2621,9 +2681,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
rcu_segcblist_init(&rdp->cblist);
}
+ check_cb_ovld(rdp);
if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
return; // Enqueued onto ->nocb_bypass, so just leave.
- /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
+ // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head);
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
@@ -2689,22 +2750,47 @@ EXPORT_SYMBOL_GPL(call_rcu);
#define KFREE_DRAIN_JIFFIES (HZ / 50)
#define KFREE_N_BATCHES 2
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is based on the fact that the size of
+ * kfree_rcu_bulk_data structure becomes exactly one page.
+ */
+#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3)
+
+/**
+ * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
+ * @nr_records: Number of active pointers in the array
+ * @records: Array of the kfree_rcu() pointers
+ * @next: Next bulk object in the block chain
+ * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set
+ */
+struct kfree_rcu_bulk_data {
+ unsigned long nr_records;
+ void *records[KFREE_BULK_MAX_ENTR];
+ struct kfree_rcu_bulk_data *next;
+ struct rcu_head *head_free_debug;
+};
+
/**
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
+ struct kfree_rcu_bulk_data *bhead_free;
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
+ * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
+ * @bcached: Keeps at most one object for later reuse when build chain blocks
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
@@ -2718,6 +2804,8 @@ struct kfree_rcu_cpu_work {
*/
struct kfree_rcu_cpu {
struct rcu_head *head;
+ struct kfree_rcu_bulk_data *bhead;
+ struct kfree_rcu_bulk_data *bcached;
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
spinlock_t lock;
struct delayed_work monitor_work;
@@ -2727,14 +2815,24 @@ struct kfree_rcu_cpu {
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+static __always_inline void
+debug_rcu_head_unqueue_bulk(struct rcu_head *head)
+{
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+ for (; head; head = head->next)
+ debug_rcu_head_unqueue(head);
+#endif
+}
+
/*
* This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->head_free.
+ * It frees all the objects queued on ->bhead_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
struct rcu_head *head, *next;
+ struct kfree_rcu_bulk_data *bhead, *bnext;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
@@ -2744,22 +2842,44 @@ static void kfree_rcu_work(struct work_struct *work)
spin_lock_irqsave(&krcp->lock, flags);
head = krwp->head_free;
krwp->head_free = NULL;
+ bhead = krwp->bhead_free;
+ krwp->bhead_free = NULL;
spin_unlock_irqrestore(&krcp->lock, flags);
- // List "head" is now private, so traverse locklessly.
+ /* "bhead" is now private, so traverse locklessly. */
+ for (; bhead; bhead = bnext) {
+ bnext = bhead->next;
+
+ debug_rcu_head_unqueue_bulk(bhead->head_free_debug);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
+ bhead->nr_records, bhead->records);
+
+ kfree_bulk(bhead->nr_records, bhead->records);
+ rcu_lock_release(&rcu_callback_map);
+
+ if (cmpxchg(&krcp->bcached, NULL, bhead))
+ free_page((unsigned long) bhead);
+
+ cond_resched_tasks_rcu_qs();
+ }
+
+ /*
+ * Emergency case only. It can happen under low memory
+ * condition when an allocation gets failed, so the "bulk"
+ * path can not be temporary maintained.
+ */
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
next = head->next;
- // Potentially optimize with kfree_bulk in future.
debug_rcu_head_unqueue(head);
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
- if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
- /* Could be optimized with kfree_bulk() in future. */
+ if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
kfree((void *)head - offset);
- }
rcu_lock_release(&rcu_callback_map);
cond_resched_tasks_rcu_qs();
@@ -2774,26 +2894,48 @@ static void kfree_rcu_work(struct work_struct *work)
*/
static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
{
+ struct kfree_rcu_cpu_work *krwp;
+ bool queued = false;
int i;
- struct kfree_rcu_cpu_work *krwp = NULL;
lockdep_assert_held(&krcp->lock);
- for (i = 0; i < KFREE_N_BATCHES; i++)
- if (!krcp->krw_arr[i].head_free) {
- krwp = &(krcp->krw_arr[i]);
- break;
- }
- // If a previous RCU batch is in progress, we cannot immediately
- // queue another one, so return false to tell caller to retry.
- if (!krwp)
- return false;
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ krwp = &(krcp->krw_arr[i]);
- krwp->head_free = krcp->head;
- krcp->head = NULL;
- INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
- queue_rcu_work(system_wq, &krwp->rcu_work);
- return true;
+ /*
+ * Try to detach bhead or head and attach it over any
+ * available corresponding free channel. It can be that
+ * a previous RCU batch is in progress, it means that
+ * immediately to queue another one is not possible so
+ * return false to tell caller to retry.
+ */
+ if ((krcp->bhead && !krwp->bhead_free) ||
+ (krcp->head && !krwp->head_free)) {
+ /* Channel 1. */
+ if (!krwp->bhead_free) {
+ krwp->bhead_free = krcp->bhead;
+ krcp->bhead = NULL;
+ }
+
+ /* Channel 2. */
+ if (!krwp->head_free) {
+ krwp->head_free = krcp->head;
+ krcp->head = NULL;
+ }
+
+ /*
+ * One work is per one batch, so there are two "free channels",
+ * "bhead_free" and "head_free" the batch can handle. It can be
+ * that the work is in the pending state when two channels have
+ * been detached following each other, one by one.
+ */
+ queue_rcu_work(system_wq, &krwp->rcu_work);
+ queued = true;
+ }
+ }
+
+ return queued;
}
static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
@@ -2830,19 +2972,65 @@ static void kfree_rcu_monitor(struct work_struct *work)
spin_unlock_irqrestore(&krcp->lock, flags);
}
+static inline bool
+kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
+ struct rcu_head *head, rcu_callback_t func)
+{
+ struct kfree_rcu_bulk_data *bnode;
+
+ if (unlikely(!krcp->initialized))
+ return false;
+
+ lockdep_assert_held(&krcp->lock);
+
+ /* Check if a new block is required. */
+ if (!krcp->bhead ||
+ krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
+ bnode = xchg(&krcp->bcached, NULL);
+ if (!bnode) {
+ WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
+
+ bnode = (struct kfree_rcu_bulk_data *)
+ __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ }
+
+ /* Switch to emergency path. */
+ if (unlikely(!bnode))
+ return false;
+
+ /* Initialize the new block. */
+ bnode->nr_records = 0;
+ bnode->next = krcp->bhead;
+ bnode->head_free_debug = NULL;
+
+ /* Attach it to the head. */
+ krcp->bhead = bnode;
+ }
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+ head->func = func;
+ head->next = krcp->bhead->head_free_debug;
+ krcp->bhead->head_free_debug = head;
+#endif
+
+ /* Finally insert. */
+ krcp->bhead->records[krcp->bhead->nr_records++] =
+ (void *) head - (unsigned long) func;
+
+ return true;
+}
+
/*
- * Queue a request for lazy invocation of kfree() after a grace period.
+ * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
+ * period. Please note there are two paths are maintained, one is the main one
+ * that uses kfree_bulk() interface and second one is emergency one, that is
+ * used only when the main path can not be maintained temporary, due to memory
+ * pressure.
*
* Each kfree_call_rcu() request is added to a batch. The batch will be drained
- * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
- * will be kfree'd in workqueue context. This allows us to:
- *
- * 1. Batch requests together to reduce the number of grace periods during
- * heavy kfree_rcu() load.
- *
- * 2. It makes it possible to use kfree_bulk() on a large number of
- * kfree_rcu() requests thus reducing cache misses and the per-object
- * overhead of kfree().
+ * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
+ * be free'd in workqueue context. This allows us to: batch requests together to
+ * reduce the number of grace periods during heavy kfree_rcu() load.
*/
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
@@ -2861,9 +3049,16 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
__func__, head);
goto unlock_return;
}
- head->func = func;
- head->next = krcp->head;
- krcp->head = head;
+
+ /*
+ * Under high memory pressure GFP_NOWAIT can fail,
+ * in that case the emergency path is maintained.
+ */
+ if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
+ head->func = func;
+ head->next = krcp->head;
+ krcp->head = head;
+ }
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
@@ -3075,24 +3270,32 @@ static void rcu_barrier_trace(const char *s, int cpu, unsigned long done)
/*
* RCU callback function for rcu_barrier(). If we are last, wake
* up the task executing rcu_barrier().
+ *
+ * Note that the value of rcu_state.barrier_sequence must be captured
+ * before the atomic_dec_and_test(). Otherwise, if this CPU is not last,
+ * other CPUs might count the value down to zero before this CPU gets
+ * around to invoking rcu_barrier_trace(), which might result in bogus
+ * data from the next instance of rcu_barrier().
*/
static void rcu_barrier_callback(struct rcu_head *rhp)
{
+ unsigned long __maybe_unused s = rcu_state.barrier_sequence;
+
if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
- rcu_barrier_trace(TPS("LastCB"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("LastCB"), -1, s);
complete(&rcu_state.barrier_completion);
} else {
- rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("CB"), -1, s);
}
}
/*
* Called with preemption disabled, and from cross-cpu IRQ context.
*/
-static void rcu_barrier_func(void *unused)
+static void rcu_barrier_func(void *cpu_in)
{
- struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
+ uintptr_t cpu = (uintptr_t)cpu_in;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
@@ -3119,7 +3322,7 @@ static void rcu_barrier_func(void *unused)
*/
void rcu_barrier(void)
{
- int cpu;
+ uintptr_t cpu;
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
@@ -3142,13 +3345,14 @@ void rcu_barrier(void)
rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
/*
- * Initialize the count to one rather than to zero in order to
- * avoid a too-soon return to zero in case of a short grace period
- * (or preemption of this task). Exclude CPU-hotplug operations
- * to ensure that no offline CPU has callbacks queued.
+ * Initialize the count to two rather than to zero in order
+ * to avoid a too-soon return to zero in case of an immediate
+ * invocation of the just-enqueued callback (or preemption of
+ * this task). Exclude CPU-hotplug operations to ensure that no
+ * offline non-offloaded CPU has callbacks queued.
*/
init_completion(&rcu_state.barrier_completion);
- atomic_set(&rcu_state.barrier_cpu_count, 1);
+ atomic_set(&rcu_state.barrier_cpu_count, 2);
get_online_cpus();
/*
@@ -3158,13 +3362,23 @@ void rcu_barrier(void)
*/
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (!cpu_online(cpu) &&
+ if (cpu_is_offline(cpu) &&
!rcu_segcblist_is_offloaded(&rdp->cblist))
continue;
- if (rcu_segcblist_n_cbs(&rdp->cblist)) {
+ if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
rcu_barrier_trace(TPS("OnlineQ"), cpu,
rcu_state.barrier_sequence);
- smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
+ smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
+ } else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
+ cpu_is_offline(cpu)) {
+ rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
+ rcu_state.barrier_sequence);
+ local_irq_disable();
+ rcu_barrier_func((void *)cpu);
+ local_irq_enable();
+ } else if (cpu_is_offline(cpu)) {
+ rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
+ rcu_state.barrier_sequence);
} else {
rcu_barrier_trace(TPS("OnlineNQ"), cpu,
rcu_state.barrier_sequence);
@@ -3176,7 +3390,7 @@ void rcu_barrier(void)
* Now that we have an rcu_barrier_callback() callback on each
* CPU, and thus each counted, remove the initial count.
*/
- if (atomic_dec_and_test(&rcu_state.barrier_cpu_count))
+ if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count))
complete(&rcu_state.barrier_completion);
/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
@@ -3275,12 +3489,12 @@ int rcutree_prepare_cpu(unsigned int cpu)
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
rdp->beenonline = true; /* We have now been online. */
- rdp->gp_seq = rnp->gp_seq;
- rdp->gp_seq_needed = rnp->gp_seq;
+ rdp->gp_seq = READ_ONCE(rnp->gp_seq);
+ rdp->gp_seq_needed = rdp->gp_seq;
rdp->cpu_no_qs.b.norm = true;
rdp->core_needs_qs = false;
rdp->rcu_iw_pending = false;
- rdp->rcu_iw_gp_seq = rnp->gp_seq - 1;
+ rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_prepare_kthreads(cpu);
@@ -3378,7 +3592,7 @@ void rcu_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- rnp->qsmaskinitnext |= mask;
+ WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
oldmask = rnp->expmaskinitnext;
rnp->expmaskinitnext |= mask;
oldmask ^= rnp->expmaskinitnext;
@@ -3431,7 +3645,7 @@ void rcu_report_dead(unsigned int cpu)
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
- rnp->qsmaskinitnext &= ~mask;
+ WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
raw_spin_unlock(&rcu_state.ofl_lock);
@@ -3545,7 +3759,10 @@ static int __init rcu_spawn_gp_kthread(void)
}
rnp = rcu_get_root();
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- rcu_state.gp_kthread = t;
+ WRITE_ONCE(rcu_state.gp_activity, jiffies);
+ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
+ // Reset .gp_activity and .gp_req_activity before setting .gp_kthread.
+ smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
rcu_spawn_nocb_kthreads();
@@ -3769,8 +3986,11 @@ static void __init kfree_rcu_batch_init(void)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
spin_lock_init(&krcp->lock);
- for (i = 0; i < KFREE_N_BATCHES; i++)
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
+ }
+
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
krcp->initialized = true;
}
@@ -3809,6 +4029,13 @@ void __init rcu_init(void)
rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_par_gp_wq);
srcu_init();
+
+ /* Fill in default value for rcutree.qovld boot parameter. */
+ /* -After- the rcu_node ->lock fields are initialized! */
+ if (qovld < 0)
+ qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
+ else
+ qovld_calc = qovld;
}
#include "tree_stall.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 0c87e4c161c2..9dc2ec021da5 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -68,6 +68,8 @@ struct rcu_node {
/* Online CPUs for next expedited GP. */
/* Any CPU that has ever been online will */
/* have its bit set. */
+ unsigned long cbovldmask;
+ /* CPUs experiencing callback overload. */
unsigned long ffmask; /* Fully functional CPUs. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
@@ -321,6 +323,8 @@ struct rcu_state {
atomic_t expedited_need_qs; /* # CPUs left to check in. */
struct swait_queue_head expedited_wq; /* Wait for check-ins. */
int ncpus_snap; /* # CPUs seen last time. */
+ u8 cbovld; /* Callback overload now? */
+ u8 cbovldnext; /* ^ ^ next time? */
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index dcbd75791f39..1a617b9dffb0 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -314,7 +314,7 @@ static bool exp_funnel_lock(unsigned long s)
sync_exp_work_done(s));
return true;
}
- rnp->exp_seq_rq = s; /* Followers can wait on us. */
+ WRITE_ONCE(rnp->exp_seq_rq, s); /* Followers can wait on us. */
spin_unlock(&rnp->exp_lock);
trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level,
rnp->grplo, rnp->grphi, TPS("nxtlvl"));
@@ -485,6 +485,7 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit)
static void synchronize_rcu_expedited_wait(void)
{
int cpu;
+ unsigned long j;
unsigned long jiffies_stall;
unsigned long jiffies_start;
unsigned long mask;
@@ -496,7 +497,7 @@ static void synchronize_rcu_expedited_wait(void)
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
jiffies_stall = rcu_jiffies_till_stall_check();
jiffies_start = jiffies;
- if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) {
if (synchronize_rcu_expedited_wait_once(1))
return;
rcu_for_each_leaf_node(rnp) {
@@ -508,12 +509,16 @@ static void synchronize_rcu_expedited_wait(void)
tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
}
}
+ j = READ_ONCE(jiffies_till_first_fqs);
+ if (synchronize_rcu_expedited_wait_once(j + HZ))
+ return;
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT));
}
for (;;) {
if (synchronize_rcu_expedited_wait_once(jiffies_stall))
return;
- if (rcu_cpu_stall_suppress)
+ if (rcu_stall_is_suppressed())
continue;
panic_on_rcu_stall();
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
@@ -589,7 +594,7 @@ static void rcu_exp_wait_wake(unsigned long s)
spin_lock(&rnp->exp_lock);
/* Recheck, avoid hang in case someone just arrived. */
if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
- rnp->exp_seq_rq = s;
+ WRITE_ONCE(rnp->exp_seq_rq, s);
spin_unlock(&rnp->exp_lock);
}
smp_mb(); /* All above changes before wakeup. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c6ea81cd4189..097635c41135 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -56,6 +56,8 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
if (qlowmark != DEFAULT_RCU_QLOMARK)
pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
+ if (qovld != DEFAULT_RCU_QOVLD)
+ pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld);
if (jiffies_till_first_fqs != ULONG_MAX)
pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
if (jiffies_till_next_fqs != ULONG_MAX)
@@ -753,7 +755,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
raw_lockdep_assert_held_rcu_node(rnp);
pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
__func__, rnp->grplo, rnp->grphi, rnp->level,
- (long)rnp->gp_seq, (long)rnp->completedqs);
+ (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs);
for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
@@ -1032,18 +1034,18 @@ static int rcu_boost_kthread(void *arg)
trace_rcu_utilization(TPS("Start boost kthread@init"));
for (;;) {
- rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
+ WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING);
trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
- rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
+ WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING);
more2boost = rcu_boost(rnp);
if (more2boost)
spincnt++;
else
spincnt = 0;
if (spincnt > 10) {
- rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
+ WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
schedule_timeout_interruptible(2);
trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
@@ -1077,12 +1079,12 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
(rnp->gp_tasks != NULL &&
rnp->boost_tasks == NULL &&
rnp->qsmask == 0 &&
- ULONG_CMP_GE(jiffies, rnp->boost_time))) {
+ (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) {
if (rnp->exp_tasks == NULL)
rnp->boost_tasks = rnp->gp_tasks;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_wake_cond(rnp->boost_kthread_task,
- rnp->boost_kthread_status);
+ READ_ONCE(rnp->boost_kthread_status));
} else {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -1486,6 +1488,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0);
* flag the contention.
*/
static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+ __acquires(&rdp->nocb_bypass_lock)
{
lockdep_assert_irqs_disabled();
if (raw_spin_trylock(&rdp->nocb_bypass_lock))
@@ -1529,6 +1532,7 @@ static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
* Release the specified rcu_data structure's ->nocb_bypass_lock.
*/
static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+ __releases(&rdp->nocb_bypass_lock)
{
lockdep_assert_irqs_disabled();
raw_spin_unlock(&rdp->nocb_bypass_lock);
@@ -1577,8 +1581,7 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
{
lockdep_assert_irqs_disabled();
- if (rcu_segcblist_is_offloaded(&rdp->cblist) &&
- cpu_online(rdp->cpu))
+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
lockdep_assert_held(&rdp->nocb_lock);
}
@@ -1930,6 +1933,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
struct rcu_data *rdp;
struct rcu_node *rnp;
unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
+ bool wasempty = false;
/*
* Each pass through the following loop checks for CBs and for the
@@ -1969,10 +1973,13 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
needwake_gp = rcu_advance_cbs(rnp, rdp);
+ wasempty = rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL);
raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
}
// Need to wait on some grace period?
- WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist,
+ WARN_ON_ONCE(wasempty &&
+ !rcu_segcblist_restempty(&rdp->cblist,
RCU_NEXT_READY_TAIL));
if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
if (!needwait_gp ||
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 55f9b84790d3..119ed6afd20f 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -102,7 +102,7 @@ static void record_gp_stall_check_time(void)
unsigned long j = jiffies;
unsigned long j1;
- rcu_state.gp_start = j;
+ WRITE_ONCE(rcu_state.gp_start, j);
j1 = rcu_jiffies_till_stall_check();
/* Record ->gp_start before ->jiffies_stall. */
smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
@@ -383,7 +383,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
- if (rcu_cpu_stall_suppress)
+ if (rcu_stall_is_suppressed())
return;
/*
@@ -452,7 +452,7 @@ static void print_cpu_stall(void)
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
- if (rcu_cpu_stall_suppress)
+ if (rcu_stall_is_suppressed())
return;
/*
@@ -504,7 +504,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
unsigned long js;
struct rcu_node *rnp;
- if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
+ if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) ||
!rcu_gp_in_progress())
return;
rcu_stall_kick_kthreads();
@@ -578,6 +578,7 @@ void show_rcu_gp_kthreads(void)
unsigned long jw;
struct rcu_data *rdp;
struct rcu_node *rnp;
+ struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
j = jiffies;
ja = j - READ_ONCE(rcu_state.gp_activity);
@@ -585,28 +586,28 @@ void show_rcu_gp_kthreads(void)
jw = j - READ_ONCE(rcu_state.gp_wake_time);
pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state,
- rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
+ rcu_state.gp_state, t ? t->state : 0x1ffffL,
ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
(long)READ_ONCE(rcu_state.gp_seq),
(long)READ_ONCE(rcu_get_root()->gp_seq_needed),
READ_ONCE(rcu_state.gp_flags));
rcu_for_each_node_breadth_first(rnp) {
- if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
+ if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
+ READ_ONCE(rnp->gp_seq_needed)))
continue;
pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
- rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
- (long)rnp->gp_seq_needed);
+ rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq),
+ (long)READ_ONCE(rnp->gp_seq_needed));
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rdp->gpwrap ||
- ULONG_CMP_GE(rcu_state.gp_seq,
- rdp->gp_seq_needed))
+ if (READ_ONCE(rdp->gpwrap) ||
+ ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
+ READ_ONCE(rdp->gp_seq_needed)))
continue;
pr_info("\tcpu %d ->gp_seq_needed %ld\n",
- cpu, (long)rdp->gp_seq_needed);
+ cpu, (long)READ_ONCE(rdp->gp_seq_needed));
}
}
for_each_possible_cpu(cpu) {
@@ -631,7 +632,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
static atomic_t warned = ATOMIC_INIT(0);
if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
+ ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq),
+ READ_ONCE(rnp_root->gp_seq_needed)) ||
+ !smp_load_acquire(&rcu_state.gp_kthread)) // Get stable kthread.
return;
j = jiffies; /* Expensive access, and in common case don't get here. */
if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
@@ -642,7 +645,8 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
raw_spin_lock_irqsave_rcu_node(rnp, flags);
j = jiffies;
if (rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
+ ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq),
+ READ_ONCE(rnp_root->gp_seq_needed)) ||
time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
atomic_read(&warned)) {
@@ -655,9 +659,10 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
j = jiffies;
if (rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
- time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
- time_before(j, rcu_state.gp_activity + gpssdelay) ||
+ ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq),
+ READ_ONCE(rnp_root->gp_seq_needed)) ||
+ time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
+ time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
atomic_xchg(&warned, 1)) {
if (rnp_root != rnp)
/* irqs remain disabled. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 6c4b862f57d6..28a8bdc5072f 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -183,6 +183,8 @@ void rcu_unexpedite_gp(void)
}
EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
+static bool rcu_boot_ended __read_mostly;
+
/*
* Inform RCU of the end of the in-kernel boot sequence.
*/
@@ -191,7 +193,17 @@ void rcu_end_inkernel_boot(void)
rcu_unexpedite_gp();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
+ rcu_boot_ended = 1;
+}
+
+/*
+ * Let rcutorture know when it is OK to turn it up to eleven.
+ */
+bool rcu_inkernel_boot_has_ended(void)
+{
+ return rcu_boot_ended;
}
+EXPORT_SYMBOL_GPL(rcu_inkernel_boot_has_ended);
#endif /* #ifndef CONFIG_TINY_RCU */
@@ -227,18 +239,30 @@ core_initcall(rcu_set_runtime_mode);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
-struct lockdep_map rcu_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+struct lockdep_map rcu_lock_map = {
+ .name = "rcu_read_lock",
+ .key = &rcu_lock_key,
+ .wait_type_outer = LD_WAIT_FREE,
+ .wait_type_inner = LD_WAIT_CONFIG, /* XXX PREEMPT_RCU ? */
+};
EXPORT_SYMBOL_GPL(rcu_lock_map);
static struct lock_class_key rcu_bh_lock_key;
-struct lockdep_map rcu_bh_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
+struct lockdep_map rcu_bh_lock_map = {
+ .name = "rcu_read_lock_bh",
+ .key = &rcu_bh_lock_key,
+ .wait_type_outer = LD_WAIT_FREE,
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_LOCK also makes BH preemptible */
+};
EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
static struct lock_class_key rcu_sched_lock_key;
-struct lockdep_map rcu_sched_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
+struct lockdep_map rcu_sched_lock_map = {
+ .name = "rcu_read_lock_sched",
+ .key = &rcu_sched_lock_key,
+ .wait_type_outer = LD_WAIT_FREE,
+ .wait_type_inner = LD_WAIT_SPIN,
+};
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
static struct lock_class_key rcu_callback_key;
@@ -464,13 +488,19 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
#ifdef CONFIG_RCU_STALL_COMMON
int rcu_cpu_stall_ftrace_dump __read_mostly;
module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
-int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings.
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall
+// warnings. Also used by rcutorture even if stall warnings are excluded.
+int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls.
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot);
+module_param(rcu_cpu_stall_suppress_at_boot, int, 0444);
+
#ifdef CONFIG_TASKS_RCU
/*
@@ -528,7 +558,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
rhp->func = func;
raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
needwake = !rcu_tasks_cbs_head;
- *rcu_tasks_cbs_tail = rhp;
+ WRITE_ONCE(*rcu_tasks_cbs_tail, rhp);
rcu_tasks_cbs_tail = &rhp->next;
raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
/* We can't create the thread unless interrupts are enabled. */
@@ -658,7 +688,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
/* If there were none, wait a bit and start over. */
if (!list) {
wait_event_interruptible(rcu_tasks_cbs_wq,
- rcu_tasks_cbs_head);
+ READ_ONCE(rcu_tasks_cbs_head));
if (!rcu_tasks_cbs_head) {
WARN_ON(signal_pending(current));
schedule_timeout_interruptible(HZ/10);
@@ -801,7 +831,7 @@ static int __init rcu_spawn_tasks_kthread(void)
core_initcall(rcu_spawn_tasks_kthread);
/* Do the srcu_read_lock() for the above synchronize_srcu(). */
-void exit_tasks_rcu_start(void)
+void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
{
preempt_disable();
current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
@@ -809,7 +839,7 @@ void exit_tasks_rcu_start(void)
}
/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
-void exit_tasks_rcu_finish(void)
+void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
{
preempt_disable();
__srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a1ad5b7d5521..a778554f9dad 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -29,12 +29,12 @@ void complete(struct completion *x)
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
if (x->done != UINT_MAX)
x->done++;
- __wake_up_locked(&x->wait, TASK_NORMAL, 1);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ swake_up_locked(&x->wait);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
@@ -58,10 +58,12 @@ void complete_all(struct completion *x)
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
+ lockdep_assert_RT_in_threaded_ctx();
+
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
x->done = UINT_MAX;
- __wake_up_locked(&x->wait, TASK_NORMAL, 0);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ swake_up_all_locked(&x->wait);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
@@ -70,20 +72,20 @@ do_wait_for_common(struct completion *x,
long (*action)(long), long timeout, int state)
{
if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
+ DECLARE_SWAITQUEUE(wait);
- __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
+ __prepare_to_swait(&x->wait, &wait);
__set_current_state(state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
- __remove_wait_queue(&x->wait, &wait);
+ __finish_swait(&x->wait, &wait);
if (!x->done)
return timeout;
}
@@ -100,9 +102,9 @@ __wait_for_common(struct completion *x,
complete_acquire(x);
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
complete_release(x);
@@ -291,12 +293,12 @@ bool try_wait_for_completion(struct completion *x)
if (!READ_ONCE(x->done))
return false;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = false;
else if (x->done != UINT_MAX)
x->done--;
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(try_wait_for_completion);
@@ -322,8 +324,8 @@ bool completion_done(struct completion *x)
* otherwise we can end up freeing the completion before complete()
* is done referencing it.
*/
- spin_lock_irqsave(&x->wait.lock, flags);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1a9983da4408..a2694ba82874 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -269,7 +269,6 @@ static void __hrtick_start(void *arg)
rq_lock(rq, &rf);
__hrtick_restart(rq);
- rq->hrtick_csd_pending = 0;
rq_unlock(rq, &rf);
}
@@ -293,12 +292,10 @@ void hrtick_start(struct rq *rq, u64 delay)
hrtimer_set_expires(timer, time);
- if (rq == this_rq()) {
+ if (rq == this_rq())
__hrtick_restart(rq);
- } else if (!rq->hrtick_csd_pending) {
+ else
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
- rq->hrtick_csd_pending = 1;
- }
}
#else
@@ -322,8 +319,6 @@ void hrtick_start(struct rq *rq, u64 delay)
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
- rq->hrtick_csd_pending = 0;
-
rq->hrtick_csd.flags = 0;
rq->hrtick_csd.func = __hrtick_start;
rq->hrtick_csd.info = rq;
@@ -761,7 +756,6 @@ static void set_load_weight(struct task_struct *p, bool update_load)
if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
- p->se.runnable_weight = load->weight;
return;
}
@@ -774,7 +768,6 @@ static void set_load_weight(struct task_struct *p, bool update_load)
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
- p->se.runnable_weight = load->weight;
}
}
@@ -1652,7 +1645,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(p->cpus_ptr, new_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+ /*
+ * Picking a ~random cpu helps in cases where we are changing affinity
+ * for groups of tasks (ie. cpuset), so that load balancing is not
+ * immediately required to distribute the tasks within their new mask.
+ */
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
@@ -3578,6 +3576,17 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+DEFINE_PER_CPU(unsigned long, thermal_pressure);
+
+void arch_set_thermal_pressure(struct cpumask *cpus,
+ unsigned long th_pressure)
+{
+ int cpu;
+
+ for_each_cpu(cpu, cpus)
+ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
+}
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3588,12 +3597,16 @@ void scheduler_tick(void)
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
struct rq_flags rf;
+ unsigned long thermal_pressure;
+ arch_scale_freq_tick();
sched_clock_tick();
rq_lock(rq, &rf);
update_rq_clock(rq);
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
curr->sched_class->task_tick(rq, curr, 0);
calc_global_load_tick(rq);
psi_task_tick(rq);
@@ -3671,7 +3684,6 @@ static void sched_tick_remote(struct work_struct *work)
if (cpu_is_offline(cpu))
goto out_unlock;
- curr = rq->curr;
update_rq_clock(rq);
if (!is_idle_task(curr)) {
@@ -4074,6 +4086,8 @@ static void __sched notrace __schedule(bool preempt)
*/
++*switch_count;
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+
trace_sched_switch(preempt, prev, next);
/* Also unlocks the rq: */
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 1a2719e1350a..0033731a0797 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -41,8 +41,67 @@ static int convert_prio(int prio)
return cpupri;
}
+static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask, int idx)
+{
+ struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
+ int skip = 0;
+
+ if (!atomic_read(&(vec)->count))
+ skip = 1;
+ /*
+ * When looking at the vector, we need to read the counter,
+ * do a memory barrier, then read the mask.
+ *
+ * Note: This is still all racey, but we can deal with it.
+ * Ideally, we only want to look at masks that are set.
+ *
+ * If a mask is not set, then the only thing wrong is that we
+ * did a little more work than necessary.
+ *
+ * If we read a zero count but the mask is set, because of the
+ * memory barriers, that can only happen when the highest prio
+ * task for a run queue has left the run queue, in which case,
+ * it will be followed by a pull. If the task we are processing
+ * fails to find a proper place to go, that pull request will
+ * pull this task if the run queue is running at a lower
+ * priority.
+ */
+ smp_rmb();
+
+ /* Need to do the rmb for every iteration */
+ if (skip)
+ return 0;
+
+ if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+ return 0;
+
+ if (lowest_mask) {
+ cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+
+ /*
+ * We have to ensure that we have at least one bit
+ * still set in the array, since the map could have
+ * been concurrently emptied between the first and
+ * second reads of vec->mask. If we hit this
+ * condition, simply act as though we never hit this
+ * priority level and continue on.
+ */
+ if (cpumask_empty(lowest_mask))
+ return 0;
+ }
+
+ return 1;
+}
+
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask)
+{
+ return cpupri_find_fitness(cp, p, lowest_mask, NULL);
+}
+
/**
- * cpupri_find - find the best (lowest-pri) CPU in the system
+ * cpupri_find_fitness - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
@@ -58,84 +117,59 @@ static int convert_prio(int prio)
*
* Return: (int)bool - CPUs were found
*/
-int cpupri_find(struct cpupri *cp, struct task_struct *p,
+int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask,
bool (*fitness_fn)(struct task_struct *p, int cpu))
{
- int idx = 0;
int task_pri = convert_prio(p->prio);
+ int idx, cpu;
BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
- struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
- int skip = 0;
-
- if (!atomic_read(&(vec)->count))
- skip = 1;
- /*
- * When looking at the vector, we need to read the counter,
- * do a memory barrier, then read the mask.
- *
- * Note: This is still all racey, but we can deal with it.
- * Ideally, we only want to look at masks that are set.
- *
- * If a mask is not set, then the only thing wrong is that we
- * did a little more work than necessary.
- *
- * If we read a zero count but the mask is set, because of the
- * memory barriers, that can only happen when the highest prio
- * task for a run queue has left the run queue, in which case,
- * it will be followed by a pull. If the task we are processing
- * fails to find a proper place to go, that pull request will
- * pull this task if the run queue is running at a lower
- * priority.
- */
- smp_rmb();
- /* Need to do the rmb for every iteration */
- if (skip)
+ if (!__cpupri_find(cp, p, lowest_mask, idx))
continue;
- if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
- continue;
+ if (!lowest_mask || !fitness_fn)
+ return 1;
- if (lowest_mask) {
- int cpu;
-
- cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
-
- /*
- * We have to ensure that we have at least one bit
- * still set in the array, since the map could have
- * been concurrently emptied between the first and
- * second reads of vec->mask. If we hit this
- * condition, simply act as though we never hit this
- * priority level and continue on.
- */
- if (cpumask_empty(lowest_mask))
- continue;
-
- if (!fitness_fn)
- return 1;
-
- /* Ensure the capacity of the CPUs fit the task */
- for_each_cpu(cpu, lowest_mask) {
- if (!fitness_fn(p, cpu))
- cpumask_clear_cpu(cpu, lowest_mask);
- }
-
- /*
- * If no CPU at the current priority can fit the task
- * continue looking
- */
- if (cpumask_empty(lowest_mask))
- continue;
+ /* Ensure the capacity of the CPUs fit the task */
+ for_each_cpu(cpu, lowest_mask) {
+ if (!fitness_fn(p, cpu))
+ cpumask_clear_cpu(cpu, lowest_mask);
}
+ /*
+ * If no CPU at the current priority can fit the task
+ * continue looking
+ */
+ if (cpumask_empty(lowest_mask))
+ continue;
+
return 1;
}
+ /*
+ * If we failed to find a fitting lowest_mask, kick off a new search
+ * but without taking into account any fitness criteria this time.
+ *
+ * This rule favours honouring priority over fitting the task in the
+ * correct CPU (Capacity Awareness being the only user now).
+ * The idea is that if a higher priority task can run, then it should
+ * run even if this ends up being on unfitting CPU.
+ *
+ * The cost of this trade-off is not entirely clear and will probably
+ * be good for some workloads and bad for others.
+ *
+ * The main idea here is that if some CPUs were overcommitted, we try
+ * to spread which is what the scheduler traditionally did. Sys admins
+ * must do proper RT planning to avoid overloading the system if they
+ * really care.
+ */
+ if (fitness_fn)
+ return cpupri_find(cp, p, lowest_mask);
+
return 0;
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 32dd520db11f..efbb492bb94c 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -19,8 +19,10 @@ struct cpupri {
#ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp, struct task_struct *p,
- struct cpumask *lowest_mask,
- bool (*fitness_fn)(struct task_struct *p, int cpu));
+ struct cpumask *lowest_mask);
+int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu));
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index cff3e656566d..dac9104d126f 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -909,8 +909,10 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
} while (read_seqcount_retry(&vtime->seqcount, seq));
}
-static int vtime_state_check(struct vtime *vtime, int cpu)
+static int vtime_state_fetch(struct vtime *vtime, int cpu)
{
+ int state = READ_ONCE(vtime->state);
+
/*
* We raced against a context switch, fetch the
* kcpustat task again.
@@ -927,10 +929,10 @@ static int vtime_state_check(struct vtime *vtime, int cpu)
*
* Case 1) is ok but 2) is not. So wait for a safe VTIME state.
*/
- if (vtime->state == VTIME_INACTIVE)
+ if (state == VTIME_INACTIVE)
return -EAGAIN;
- return 0;
+ return state;
}
static u64 kcpustat_user_vtime(struct vtime *vtime)
@@ -949,14 +951,15 @@ static int kcpustat_field_vtime(u64 *cpustat,
{
struct vtime *vtime = &tsk->vtime;
unsigned int seq;
- int err;
do {
+ int state;
+
seq = read_seqcount_begin(&vtime->seqcount);
- err = vtime_state_check(vtime, cpu);
- if (err < 0)
- return err;
+ state = vtime_state_fetch(vtime, cpu);
+ if (state < 0)
+ return state;
*val = cpustat[usage];
@@ -969,7 +972,7 @@ static int kcpustat_field_vtime(u64 *cpustat,
*/
switch (usage) {
case CPUTIME_SYSTEM:
- if (vtime->state == VTIME_SYS)
+ if (state == VTIME_SYS)
*val += vtime->stime + vtime_delta(vtime);
break;
case CPUTIME_USER:
@@ -981,11 +984,11 @@ static int kcpustat_field_vtime(u64 *cpustat,
*val += kcpustat_user_vtime(vtime);
break;
case CPUTIME_GUEST:
- if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0)
+ if (state == VTIME_GUEST && task_nice(tsk) <= 0)
*val += vtime->gtime + vtime_delta(vtime);
break;
case CPUTIME_GUEST_NICE:
- if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0)
+ if (state == VTIME_GUEST && task_nice(tsk) > 0)
*val += vtime->gtime + vtime_delta(vtime);
break;
default:
@@ -1036,23 +1039,23 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
{
struct vtime *vtime = &tsk->vtime;
unsigned int seq;
- int err;
do {
u64 *cpustat;
u64 delta;
+ int state;
seq = read_seqcount_begin(&vtime->seqcount);
- err = vtime_state_check(vtime, cpu);
- if (err < 0)
- return err;
+ state = vtime_state_fetch(vtime, cpu);
+ if (state < 0)
+ return state;
*dst = *src;
cpustat = dst->cpustat;
/* Task is sleeping, dead or idle, nothing to add */
- if (vtime->state < VTIME_SYS)
+ if (state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
@@ -1061,15 +1064,15 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
* Task runs either in user (including guest) or kernel space,
* add pending nohz time to the right place.
*/
- if (vtime->state == VTIME_SYS) {
+ if (state == VTIME_SYS) {
cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
- } else if (vtime->state == VTIME_USER) {
+ } else if (state == VTIME_USER) {
if (task_nice(tsk) > 0)
cpustat[CPUTIME_NICE] += vtime->utime + delta;
else
cpustat[CPUTIME_USER] += vtime->utime + delta;
} else {
- WARN_ON_ONCE(vtime->state != VTIME_GUEST);
+ WARN_ON_ONCE(state != VTIME_GUEST);
if (task_nice(tsk) > 0) {
cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
cpustat[CPUTIME_NICE] += vtime->gtime + delta;
@@ -1080,7 +1083,7 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
}
} while (read_seqcount_retry(&vtime->seqcount, seq));
- return err;
+ return 0;
}
void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 43323f875cb9..504d2f51b0d6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -153,7 +153,7 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
__sub_running_bw(dl_se->dl_bw, dl_rq);
}
-void dl_change_utilization(struct task_struct *p, u64 new_bw)
+static void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
struct rq *rq;
@@ -334,6 +334,8 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
return dl_rq->root.rb_leftmost == &dl_se->rb_node;
}
+static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
+
void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
{
raw_spin_lock_init(&dl_b->dl_runtime_lock);
@@ -2496,7 +2498,7 @@ int sched_dl_global_validate(void)
return ret;
}
-void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
+static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
{
if (global_rt_runtime() == RUNTIME_INF) {
dl_rq->bw_ratio = 1 << RATIO_SHIFT;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 879d3ccf3806..8331bc04aea2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -402,11 +402,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
}
P(se->load.weight);
- P(se->runnable_weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
- P(se->avg.runnable_load_avg);
+ P(se->avg.runnable_avg);
#endif
#undef PN_SCHEDSTAT
@@ -524,11 +523,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
- SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
- cfs_rq->avg.runnable_load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
+ cfs_rq->avg.runnable_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
@@ -537,8 +535,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
cfs_rq->removed.util_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
- cfs_rq->removed.runnable_sum);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg",
+ cfs_rq->removed.runnable_avg);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
@@ -947,13 +945,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
- P(se.runnable_weight);
#ifdef CONFIG_SMP
P(se.avg.load_sum);
- P(se.avg.runnable_load_sum);
+ P(se.avg.runnable_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
- P(se.avg.runnable_load_avg);
+ P(se.avg.runnable_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
P(se.avg.util_est.ewma);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3c8a379c357e..d7fb20adabeb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -86,6 +86,19 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+int sched_thermal_decay_shift;
+static int __init setup_sched_thermal_decay_shift(char *str)
+{
+ int _shift = 0;
+
+ if (kstrtoint(str, 0, &_shift))
+ pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
+
+ sched_thermal_decay_shift = clamp(_shift, 0, 10);
+ return 1;
+}
+__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
+
#ifdef CONFIG_SMP
/*
* For asym packing, by default the lower numbered CPU has higher priority.
@@ -741,9 +754,7 @@ void init_entity_runnable_average(struct sched_entity *se)
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
- sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
-
- se->runnable_weight = se->load.weight;
+ sa->load_avg = scale_load_down(se->load.weight);
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
@@ -796,6 +807,8 @@ void post_init_entity_util_avg(struct task_struct *p)
}
}
+ sa->runnable_avg = cpu_scale;
+
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
@@ -1473,36 +1486,51 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-
-static unsigned long cpu_runnable_load(struct rq *rq)
-{
- return cfs_rq_runnable_load_avg(&rq->cfs);
-}
+/*
+ * 'numa_type' describes the node at the moment of load balancing.
+ */
+enum numa_type {
+ /* The node has spare capacity that can be used to run more tasks. */
+ node_has_spare = 0,
+ /*
+ * The node is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ node_fully_busy,
+ /*
+ * The node is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ node_overloaded
+};
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
-
+ unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
+ unsigned int nr_running;
+ unsigned int weight;
+ enum numa_type node_type;
+ int idle_cpu;
};
-/*
- * XXX borrowed from update_sg_lb_stats
- */
-static void update_numa_stats(struct numa_stats *ns, int nid)
+static inline bool is_core_idle(int cpu)
{
- int cpu;
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
- memset(ns, 0, sizeof(*ns));
- for_each_cpu(cpu, cpumask_of_node(nid)) {
- struct rq *rq = cpu_rq(cpu);
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
- ns->load += cpu_runnable_load(rq);
- ns->compute_capacity += capacity_of(cpu);
+ if (!idle_cpu(cpu))
+ return false;
}
+#endif
+ return true;
}
struct task_numa_env {
@@ -1521,20 +1549,128 @@ struct task_numa_env {
int best_cpu;
};
+static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_util(int cpu);
+static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+
+static inline enum
+numa_type numa_classify(unsigned int imbalance_pct,
+ struct numa_stats *ns)
+{
+ if ((ns->nr_running > ns->weight) &&
+ ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+ return node_overloaded;
+
+ if ((ns->nr_running < ns->weight) ||
+ ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+ return node_has_spare;
+
+ return node_fully_busy;
+}
+
+#ifdef CONFIG_SCHED_SMT
+/* Forward declarations of select_idle_sibling helpers */
+static inline bool test_idle_cores(int cpu, bool def);
+static inline int numa_idle_core(int idle_core, int cpu)
+{
+ if (!static_branch_likely(&sched_smt_present) ||
+ idle_core >= 0 || !test_idle_cores(cpu, false))
+ return idle_core;
+
+ /*
+ * Prefer cores instead of packing HT siblings
+ * and triggering future load balancing.
+ */
+ if (is_core_idle(cpu))
+ idle_core = cpu;
+
+ return idle_core;
+}
+#else
+static inline int numa_idle_core(int idle_core, int cpu)
+{
+ return idle_core;
+}
+#endif
+
+/*
+ * Gather all necessary information to make NUMA balancing placement
+ * decisions that are compatible with standard load balancer. This
+ * borrows code and logic from update_sg_lb_stats but sharing a
+ * common implementation is impractical.
+ */
+static void update_numa_stats(struct task_numa_env *env,
+ struct numa_stats *ns, int nid,
+ bool find_idle)
+{
+ int cpu, idle_core = -1;
+
+ memset(ns, 0, sizeof(*ns));
+ ns->idle_cpu = -1;
+
+ rcu_read_lock();
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->load += cpu_load(rq);
+ ns->util += cpu_util(cpu);
+ ns->nr_running += rq->cfs.h_nr_running;
+ ns->compute_capacity += capacity_of(cpu);
+
+ if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+ if (READ_ONCE(rq->numa_migrate_on) ||
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr))
+ continue;
+
+ if (ns->idle_cpu == -1)
+ ns->idle_cpu = cpu;
+
+ idle_core = numa_idle_core(idle_core, cpu);
+ }
+ }
+ rcu_read_unlock();
+
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
+
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
+
+ if (idle_core >= 0)
+ ns->idle_cpu = idle_core;
+}
+
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
struct rq *rq = cpu_rq(env->dst_cpu);
- /* Bail out if run-queue part of active NUMA balance. */
- if (xchg(&rq->numa_migrate_on, 1))
+ /* Check if run-queue part of active NUMA balance. */
+ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
+ int cpu;
+ int start = env->dst_cpu;
+
+ /* Find alternative idle CPU. */
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+ if (cpu == env->best_cpu || !idle_cpu(cpu) ||
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
+ continue;
+ }
+
+ env->dst_cpu = cpu;
+ rq = cpu_rq(env->dst_cpu);
+ if (!xchg(&rq->numa_migrate_on, 1))
+ goto assign;
+ }
+
+ /* Failed to find an alternative idle CPU */
return;
+ }
+assign:
/*
* Clear previous best_cpu/rq numa-migrate flag, since task now
* found a better CPU to move/swap.
*/
- if (env->best_cpu != -1) {
+ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
rq = cpu_rq(env->best_cpu);
WRITE_ONCE(rq->numa_migrate_on, 0);
}
@@ -1590,7 +1726,7 @@ static bool load_too_imbalanced(long src_load, long dst_load,
* into account that it might be best if task running on the dst_cpu should
* be exchanged with the source task
*/
-static void task_numa_compare(struct task_numa_env *env,
+static bool task_numa_compare(struct task_numa_env *env,
long taskimp, long groupimp, bool maymove)
{
struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
@@ -1601,9 +1737,10 @@ static void task_numa_compare(struct task_numa_env *env,
int dist = env->dist;
long moveimp = imp;
long load;
+ bool stopsearch = false;
if (READ_ONCE(dst_rq->numa_migrate_on))
- return;
+ return false;
rcu_read_lock();
cur = rcu_dereference(dst_rq->curr);
@@ -1614,8 +1751,10 @@ static void task_numa_compare(struct task_numa_env *env,
* Because we have preemption enabled we can get migrated around and
* end try selecting ourselves (current == env->p) as a swap candidate.
*/
- if (cur == env->p)
+ if (cur == env->p) {
+ stopsearch = true;
goto unlock;
+ }
if (!cur) {
if (maymove && moveimp >= env->best_imp)
@@ -1624,18 +1763,27 @@ static void task_numa_compare(struct task_numa_env *env,
goto unlock;
}
+ /* Skip this swap candidate if cannot move to the source cpu. */
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
+ goto unlock;
+
+ /*
+ * Skip this swap candidate if it is not moving to its preferred
+ * node and the best task is.
+ */
+ if (env->best_task &&
+ env->best_task->numa_preferred_nid == env->src_nid &&
+ cur->numa_preferred_nid != env->src_nid) {
+ goto unlock;
+ }
+
/*
* "imp" is the fault differential for the source task between the
* source and destination node. Calculate the total differential for
* the source task and potential destination task. The more negative
* the value is, the more remote accesses that would be expected to
* be incurred if the tasks were swapped.
- */
- /* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
- goto unlock;
-
- /*
+ *
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
@@ -1662,6 +1810,19 @@ static void task_numa_compare(struct task_numa_env *env,
task_weight(cur, env->dst_nid, dist);
}
+ /* Discourage picking a task already on its preferred node */
+ if (cur->numa_preferred_nid == env->dst_nid)
+ imp -= imp / 16;
+
+ /*
+ * Encourage picking a task that moves to its preferred node.
+ * This potentially makes imp larger than it's maximum of
+ * 1998 (see SMALLIMP and task_weight for why) but in this
+ * case, it does not matter.
+ */
+ if (cur->numa_preferred_nid == env->src_nid)
+ imp += imp / 8;
+
if (maymove && moveimp > imp && moveimp > env->best_imp) {
imp = moveimp;
cur = NULL;
@@ -1669,6 +1830,15 @@ static void task_numa_compare(struct task_numa_env *env,
}
/*
+ * Prefer swapping with a task moving to its preferred node over a
+ * task that is not.
+ */
+ if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
+ env->best_task->numa_preferred_nid != env->src_nid) {
+ goto assign;
+ }
+
+ /*
* If the NUMA importance is less than SMALLIMP,
* task migration might only result in ping pong
* of tasks and also hurt performance due to cache
@@ -1691,42 +1861,95 @@ static void task_numa_compare(struct task_numa_env *env,
goto unlock;
assign:
- /*
- * One idle CPU per node is evaluated for a task numa move.
- * Call select_idle_sibling to maybe find a better one.
- */
+ /* Evaluate an idle CPU for a task numa move. */
if (!cur) {
+ int cpu = env->dst_stats.idle_cpu;
+
+ /* Nothing cached so current CPU went idle since the search. */
+ if (cpu < 0)
+ cpu = env->dst_cpu;
+
/*
- * select_idle_siblings() uses an per-CPU cpumask that
- * can be used from IRQ context.
+ * If the CPU is no longer truly idle and the previous best CPU
+ * is, keep using it.
*/
- local_irq_disable();
- env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
- env->dst_cpu);
- local_irq_enable();
+ if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
+ idle_cpu(env->best_cpu)) {
+ cpu = env->best_cpu;
+ }
+
+ env->dst_cpu = cpu;
}
task_numa_assign(env, cur, imp);
+
+ /*
+ * If a move to idle is allowed because there is capacity or load
+ * balance improves then stop the search. While a better swap
+ * candidate may exist, a search is not free.
+ */
+ if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
+ stopsearch = true;
+
+ /*
+ * If a swap candidate must be identified and the current best task
+ * moves its preferred node then stop the search.
+ */
+ if (!maymove && env->best_task &&
+ env->best_task->numa_preferred_nid == env->src_nid) {
+ stopsearch = true;
+ }
unlock:
rcu_read_unlock();
+
+ return stopsearch;
}
static void task_numa_find_cpu(struct task_numa_env *env,
long taskimp, long groupimp)
{
- long src_load, dst_load, load;
bool maymove = false;
int cpu;
- load = task_h_load(env->p);
- dst_load = env->dst_stats.load + load;
- src_load = env->src_stats.load - load;
-
/*
- * If the improvement from just moving env->p direction is better
- * than swapping tasks around, check if a move is possible.
+ * If dst node has spare capacity, then check if there is an
+ * imbalance that would be overruled by the load balancer.
*/
- maymove = !load_too_imbalanced(src_load, dst_load, env);
+ if (env->dst_stats.node_type == node_has_spare) {
+ unsigned int imbalance;
+ int src_running, dst_running;
+
+ /*
+ * Would movement cause an imbalance? Note that if src has
+ * more running tasks that the imbalance is ignored as the
+ * move improves the imbalance from the perspective of the
+ * CPU load balancer.
+ * */
+ src_running = env->src_stats.nr_running - 1;
+ dst_running = env->dst_stats.nr_running + 1;
+ imbalance = max(0, dst_running - src_running);
+ imbalance = adjust_numa_imbalance(imbalance, src_running);
+
+ /* Use idle CPU if there is no imbalance */
+ if (!imbalance) {
+ maymove = true;
+ if (env->dst_stats.idle_cpu >= 0) {
+ env->dst_cpu = env->dst_stats.idle_cpu;
+ task_numa_assign(env, NULL, 0);
+ return;
+ }
+ }
+ } else {
+ long src_load, dst_load, load;
+ /*
+ * If the improvement from just moving env->p direction is better
+ * than swapping tasks around, check if a move is possible.
+ */
+ load = task_h_load(env->p);
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
+ }
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
@@ -1734,7 +1957,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
continue;
env->dst_cpu = cpu;
- task_numa_compare(env, taskimp, groupimp, maymove);
+ if (task_numa_compare(env, taskimp, groupimp, maymove))
+ break;
}
}
@@ -1788,10 +2012,10 @@ static int task_numa_migrate(struct task_struct *p)
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
- update_numa_stats(&env.src_stats, env.src_nid);
+ update_numa_stats(&env, &env.src_stats, env.src_nid, false);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1824,7 +2048,7 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@@ -1848,15 +2072,17 @@ static int task_numa_migrate(struct task_struct *p)
}
/* No better CPU than the current one was found. */
- if (env.best_cpu == -1)
+ if (env.best_cpu == -1) {
+ trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
return -EAGAIN;
+ }
best_rq = cpu_rq(env.best_cpu);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
- trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
+ trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
return ret;
}
@@ -1864,7 +2090,7 @@ static int task_numa_migrate(struct task_struct *p)
WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
- trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
+ trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
put_task_struct(env.best_task);
return ret;
}
@@ -2835,25 +3061,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_SMP
static inline void
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- cfs_rq->runnable_weight += se->runnable_weight;
-
- cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
- cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
-}
-
-static inline void
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- cfs_rq->runnable_weight -= se->runnable_weight;
-
- sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
- sub_positive(&cfs_rq->avg.runnable_load_sum,
- se_runnable(se) * se->avg.runnable_load_sum);
-}
-
-static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
cfs_rq->avg.load_avg += se->avg.load_avg;
@@ -2868,28 +3075,22 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#else
static inline void
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-static inline void
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- unsigned long weight, unsigned long runnable)
+ unsigned long weight)
{
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
account_entity_dequeue(cfs_rq, se);
- dequeue_runnable_load_avg(cfs_rq, se);
}
dequeue_load_avg(cfs_rq, se);
- se->runnable_weight = runnable;
update_load_set(&se->load, weight);
#ifdef CONFIG_SMP
@@ -2897,16 +3098,13 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
- se->avg.runnable_load_avg =
- div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
} while (0);
#endif
enqueue_load_avg(cfs_rq, se);
- if (se->on_rq) {
+ if (se->on_rq)
account_entity_enqueue(cfs_rq, se);
- enqueue_runnable_load_avg(cfs_rq, se);
- }
+
}
void reweight_task(struct task_struct *p, int prio)
@@ -2916,7 +3114,7 @@ void reweight_task(struct task_struct *p, int prio)
struct load_weight *load = &se->load;
unsigned long weight = scale_load(sched_prio_to_weight[prio]);
- reweight_entity(cfs_rq, se, weight, weight);
+ reweight_entity(cfs_rq, se, weight);
load->inv_weight = sched_prio_to_wmult[prio];
}
@@ -3028,50 +3226,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
*/
return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
-
-/*
- * This calculates the effective runnable weight for a group entity based on
- * the group entity weight calculated above.
- *
- * Because of the above approximation (2), our group entity weight is
- * an load_avg based ratio (3). This means that it includes blocked load and
- * does not represent the runnable weight.
- *
- * Approximate the group entity's runnable weight per ratio from the group
- * runqueue:
- *
- * grq->avg.runnable_load_avg
- * ge->runnable_weight = ge->load.weight * -------------------------- (7)
- * grq->avg.load_avg
- *
- * However, analogous to above, since the avg numbers are slow, this leads to
- * transients in the from-idle case. Instead we use:
- *
- * ge->runnable_weight = ge->load.weight *
- *
- * max(grq->avg.runnable_load_avg, grq->runnable_weight)
- * ----------------------------------------------------- (8)
- * max(grq->avg.load_avg, grq->load.weight)
- *
- * Where these max() serve both to use the 'instant' values to fix the slow
- * from-idle and avoid the /0 on to-idle, similar to (6).
- */
-static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
-{
- long runnable, load_avg;
-
- load_avg = max(cfs_rq->avg.load_avg,
- scale_load_down(cfs_rq->load.weight));
-
- runnable = max(cfs_rq->avg.runnable_load_avg,
- scale_load_down(cfs_rq->runnable_weight));
-
- runnable *= shares;
- if (load_avg)
- runnable /= load_avg;
-
- return clamp_t(long, runnable, MIN_SHARES, shares);
-}
#endif /* CONFIG_SMP */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -3083,7 +3237,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
static void update_cfs_group(struct sched_entity *se)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long shares, runnable;
+ long shares;
if (!gcfs_rq)
return;
@@ -3092,16 +3246,15 @@ static void update_cfs_group(struct sched_entity *se)
return;
#ifndef CONFIG_SMP
- runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
+ shares = READ_ONCE(gcfs_rq->tg->shares);
if (likely(se->load.weight == shares))
return;
#else
shares = calc_group_shares(gcfs_rq);
- runnable = calc_group_runnable(gcfs_rq, shares);
#endif
- reweight_entity(cfs_rq_of(se), se, shares, runnable);
+ reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -3226,11 +3379,11 @@ void set_task_rq_fair(struct sched_entity *se,
* _IFF_ we look at the pure running and runnable sums. Because they
* represent the very same entity, just at different points in the hierarchy.
*
- * Per the above update_tg_cfs_util() is trivial and simply copies the running
- * sum over (but still wrong, because the group entity and group rq do not have
- * their PELT windows aligned).
+ * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
+ * and simply copies the running/runnable sum over (but still wrong, because
+ * the group entity and group rq do not have their PELT windows aligned).
*
- * However, update_tg_cfs_runnable() is more complex. So we have:
+ * However, update_tg_cfs_load() is more complex. So we have:
*
* ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
*
@@ -3313,9 +3466,35 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
+ long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /*
+ * The relation between sum and avg is:
+ *
+ * LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * however, the PELT windows are not aligned between grq and gse.
+ */
+
+ /* Set new sched_entity's runnable */
+ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
+ se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq runnable */
+ add_positive(&cfs_rq->avg.runnable_avg, delta);
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
+}
+
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
+{
long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
- unsigned long runnable_load_avg, load_avg;
- u64 runnable_load_sum, load_sum = 0;
+ unsigned long load_avg;
+ u64 load_sum = 0;
s64 delta_sum;
if (!runnable_sum)
@@ -3363,20 +3542,6 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
se->avg.load_avg = load_avg;
add_positive(&cfs_rq->avg.load_avg, delta_avg);
add_positive(&cfs_rq->avg.load_sum, delta_sum);
-
- runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
- runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
-
- if (se->on_rq) {
- delta_sum = runnable_load_sum -
- se_weight(se) * se->avg.runnable_load_sum;
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
- add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
- add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
- }
-
- se->avg.runnable_load_sum = runnable_sum;
- se->avg.runnable_load_avg = runnable_load_avg;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3405,6 +3570,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
update_tg_cfs_util(cfs_rq, se, gcfs_rq);
update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_load(cfs_rq, se, gcfs_rq);
trace_pelt_cfs_tp(cfs_rq);
trace_pelt_se_tp(se);
@@ -3474,7 +3640,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
struct sched_avg *sa = &cfs_rq->avg;
int decayed = 0;
@@ -3485,7 +3651,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
swap(cfs_rq->removed.load_avg, removed_load);
- swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
+ swap(cfs_rq->removed.runnable_avg, removed_runnable);
cfs_rq->removed.nr = 0;
raw_spin_unlock(&cfs_rq->removed.lock);
@@ -3497,7 +3663,16 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * divider);
- add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
+ r = removed_runnable;
+ sub_positive(&sa->runnable_avg, r);
+ sub_positive(&sa->runnable_sum, r * divider);
+
+ /*
+ * removed_runnable is the unweighted version of removed_load so we
+ * can use it to estimate removed_load_sum.
+ */
+ add_tg_cfs_propagate(cfs_rq,
+ -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
decayed = 1;
}
@@ -3542,17 +3717,19 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
se->avg.util_sum = se->avg.util_avg * divider;
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
+
se->avg.load_sum = divider;
if (se_weight(se)) {
se->avg.load_sum =
div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
}
- se->avg.runnable_load_sum = se->avg.load_sum;
-
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
+ cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
@@ -3574,6 +3751,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -3680,13 +3859,13 @@ static void remove_entity_load_avg(struct sched_entity *se)
++cfs_rq->removed.nr;
cfs_rq->removed.util_avg += se->avg.util_avg;
cfs_rq->removed.load_avg += se->avg.load_avg;
- cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
+ cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
{
- return cfs_rq->avg.runnable_load_avg;
+ return cfs_rq->avg.runnable_avg;
}
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3957,6 +4136,7 @@ static inline void check_schedstat_required(void)
#endif
}
+static inline bool cfs_bandwidth_used(void);
/*
* MIGRATION
@@ -4021,8 +4201,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* - Add its new weight to cfs_rq->load.weight
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+ se_update_runnable(se);
update_cfs_group(se);
- enqueue_runnable_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
@@ -4035,10 +4215,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- if (cfs_rq->nr_running == 1) {
+ /*
+ * When bandwidth control is enabled, cfs might have been removed
+ * because of a parent been throttled but cfs->nr_running > 1. Try to
+ * add it unconditionnally.
+ */
+ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
list_add_leaf_cfs_rq(cfs_rq);
+
+ if (cfs_rq->nr_running == 1)
check_enqueue_throttle(cfs_rq);
- }
}
static void __clear_buddies_last(struct sched_entity *se)
@@ -4105,7 +4291,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* of its group cfs_rq.
*/
update_load_avg(cfs_rq, se, UPDATE_TG);
- dequeue_runnable_load_avg(cfs_rq, se);
+ se_update_runnable(se);
update_stats_dequeue(cfs_rq, se, flags);
@@ -4541,8 +4727,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!se->on_rq)
break;
- if (dequeue)
+ if (dequeue) {
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+ } else {
+ update_load_avg(qcfs_rq, se, 0);
+ se_update_runnable(se);
+ }
+
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
@@ -4610,8 +4801,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
enqueue = 0;
cfs_rq = cfs_rq_of(se);
- if (enqueue)
+ if (enqueue) {
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ } else {
+ update_load_avg(cfs_rq, se, 0);
+ se_update_runnable(se);
+ }
+
cfs_rq->h_nr_running += task_delta;
cfs_rq->idle_h_nr_running += idle_task_delta;
@@ -4619,11 +4815,22 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
break;
}
- assert_list_leaf_cfs_rq(rq);
-
if (!se)
add_nr_running(rq, task_delta);
+ /*
+ * The cfs_rq_throttled() breaks in the above iteration can result in
+ * incomplete leaf list maintenance, resulting in triggering the
+ * assertion below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ list_add_leaf_cfs_rq(cfs_rq);
+ }
+
+ assert_list_leaf_cfs_rq(rq);
+
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
@@ -5258,32 +5465,32 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running increment below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto enqueue_throttle;
+
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
+
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ se_update_runnable(se);
+ update_cfs_group(se);
+
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- break;
-
- update_load_avg(cfs_rq, se, UPDATE_TG);
- update_cfs_group(se);
+ goto enqueue_throttle;
}
+enqueue_throttle:
if (!se) {
add_nr_running(rq, 1);
/*
@@ -5344,17 +5551,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running decrement below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto dequeue_throttle;
+
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
@@ -5372,16 +5575,21 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
+
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ se_update_runnable(se);
+ update_cfs_group(se);
+
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- break;
+ goto dequeue_throttle;
- update_load_avg(cfs_rq, se, UPDATE_TG);
- update_cfs_group(se);
}
+dequeue_throttle:
if (!se)
sub_nr_running(rq, 1);
@@ -5447,6 +5655,29 @@ static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
return load;
}
+static unsigned long cpu_runnable(struct rq *rq)
+{
+ return cfs_rq_runnable_avg(&rq->cfs);
+}
+
+static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq;
+ unsigned int runnable;
+
+ /* Task has no contribution or is new */
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_runnable(rq);
+
+ cfs_rq = &rq->cfs;
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+
+ /* Discount task's runnable from CPU's runnable */
+ lsub_positive(&runnable, p->se.avg.runnable_avg);
+
+ return runnable;
+}
+
static unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity;
@@ -5786,10 +6017,12 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
- __cpumask_clear_cpu(cpu, cpus);
- if (!available_idle_cpu(cpu))
+ if (!available_idle_cpu(cpu)) {
idle = false;
+ break;
+ }
}
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
if (idle)
return core;
@@ -5894,6 +6127,40 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
}
/*
+ * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
+ * the task fits. If no CPU is big enough, but there are idle ones, try to
+ * maximize capacity.
+ */
+static int
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ unsigned long best_cap = 0;
+ int cpu, best_cpu = -1;
+ struct cpumask *cpus;
+
+ sync_entity_load_avg(&p->se);
+
+ cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
+ unsigned long cpu_cap = capacity_of(cpu);
+
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ continue;
+ if (task_fits_capacity(p, cpu_cap))
+ return cpu;
+
+ if (cpu_cap > best_cap) {
+ best_cap = cpu_cap;
+ best_cpu = cpu;
+ }
+ }
+
+ return best_cpu;
+}
+
+/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
@@ -5901,6 +6168,28 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
+ /*
+ * For asymmetric CPU capacity systems, our domain of interest is
+ * sd_asym_cpucapacity rather than sd_llc.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ /*
+ * On an asymmetric CPU capacity system where an exclusive
+ * cpuset defines a symmetric island (i.e. one unique
+ * capacity_orig value through the cpuset), the key will be set
+ * but the CPUs within that cpuset will not have a domain with
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+ * capacity path.
+ */
+ if (!sd)
+ goto symmetric;
+
+ i = select_idle_capacity(p, sd, target);
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ }
+
+symmetric:
if (available_idle_cpu(target) || sched_idle_cpu(target))
return target;
@@ -6101,33 +6390,6 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
}
/*
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
- *
- * In that case WAKE_AFFINE doesn't make sense and we'll let
- * BALANCE_WAKE sort things out.
- */
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
-{
- long min_cap, max_cap;
-
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
- return 0;
-
- min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
-
- /* Minimum capacity is close to max, no need to abort wake_affine */
- if (max_cap - min_cap < max_cap >> 3)
- return 0;
-
- /* Bring task utilization in sync with prev_cpu */
- sync_entity_load_avg(&p->se);
-
- return !task_fits_capacity(p, min_cap);
-}
-
-/*
* Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
* to @dst_cpu.
*/
@@ -6391,8 +6653,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
new_cpu = prev_cpu;
}
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
- cpumask_test_cpu(cpu, p->cpus_ptr);
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
}
rcu_read_lock();
@@ -7506,6 +7767,9 @@ static inline bool others_have_blocked(struct rq *rq)
if (READ_ONCE(rq->avg_dl.util_avg))
return true;
+ if (thermal_load_avg(rq))
+ return true;
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
if (READ_ONCE(rq->avg_irq.util_avg))
return true;
@@ -7531,6 +7795,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
{
const struct sched_class *curr_class;
u64 now = rq_clock_pelt(rq);
+ unsigned long thermal_pressure;
bool decayed;
/*
@@ -7539,8 +7804,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
*/
curr_class = rq->curr->sched_class;
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
@@ -7562,7 +7830,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->avg.util_sum)
return false;
- if (cfs_rq->avg.runnable_load_sum)
+ if (cfs_rq->avg.runnable_sum)
return false;
return true;
@@ -7700,7 +7968,8 @@ struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long group_capacity;
- unsigned long group_util; /* Total utilization of the group */
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
unsigned int sum_nr_running; /* Nr of tasks running in the group */
unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
unsigned int idle_cpus;
@@ -7763,8 +8032,15 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
if (unlikely(irq >= max))
return 1;
+ /*
+ * avg_rt.util_avg and avg_dl.util_avg track binary signals
+ * (running and not running) with weights 0 and 1024 respectively.
+ * avg_thermal.load_avg tracks thermal pressure and the weighted
+ * average uses the actual delta max capacity(load).
+ */
used = READ_ONCE(rq->avg_rt.util_avg);
used += READ_ONCE(rq->avg_dl.util_avg);
+ used += thermal_load_avg(rq);
if (unlikely(used >= max))
return 1;
@@ -7921,6 +8197,10 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
if (sgs->sum_nr_running < sgs->group_weight)
return true;
+ if ((sgs->group_capacity * imbalance_pct) <
+ (sgs->group_runnable * 100))
+ return false;
+
if ((sgs->group_capacity * 100) >
(sgs->group_util * imbalance_pct))
return true;
@@ -7946,6 +8226,10 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
(sgs->group_util * imbalance_pct))
return true;
+ if ((sgs->group_capacity * imbalance_pct) <
+ (sgs->group_runnable * 100))
+ return true;
+
return false;
}
@@ -8040,6 +8324,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
+ sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
nr_running = rq->nr_running;
@@ -8315,6 +8600,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_load += cpu_load_without(rq, p);
sgs->group_util += cpu_util_without(i, p);
+ sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
@@ -8337,13 +8623,16 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_capacity = group->sgc->capacity;
+ sgs->group_weight = group->group_weight;
+
sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
/*
* Computing avg_load makes sense only when group is fully busy or
* overloaded
*/
- if (sgs->group_type < group_fully_busy)
+ if (sgs->group_type == group_fully_busy ||
+ sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
sgs->group_capacity;
}
@@ -8626,6 +8915,21 @@ next_group:
}
}
+static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
+{
+ unsigned int imbalance_min;
+
+ /*
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the source domain is almost idle.
+ */
+ imbalance_min = 2;
+ if (src_nr_running <= imbalance_min)
+ return 0;
+
+ return imbalance;
+}
+
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
@@ -8722,24 +9026,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
/* Consider allowing a small imbalance between NUMA groups */
- if (env->sd->flags & SD_NUMA) {
- unsigned int imbalance_min;
-
- /*
- * Compute an allowed imbalance based on a simple
- * pair of communicating tasks that should remain
- * local and ignore them.
- *
- * NOTE: Generally this would have been based on
- * the domain size and this was evaluated. However,
- * the benefit is similar across a range of workloads
- * and machines but scaling by the domain size adds
- * the risk that lower domains have to be rebalanced.
- */
- imbalance_min = 2;
- if (busiest->sum_nr_running <= imbalance_min)
- env->imbalance = 0;
- }
+ if (env->sd->flags & SD_NUMA)
+ env->imbalance = adjust_numa_imbalance(env->imbalance,
+ busiest->sum_nr_running);
return;
}
@@ -9025,6 +9314,14 @@ static struct rq *find_busiest_queue(struct lb_env *env,
case migrate_util:
util = cpu_util(cpu_of(rq));
+ /*
+ * Don't try to pull utilization from a CPU with one
+ * running task. Whatever its utilization, we will fail
+ * detach the task.
+ */
+ if (nr_running <= 1)
+ continue;
+
if (busiest_util < util) {
busiest_util = util;
busiest = rq;
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index bd006b79b360..b647d04d9c8b 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -121,8 +121,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
- sa->runnable_load_sum =
- decay_load(sa->runnable_load_sum, periods);
+ sa->runnable_sum =
+ decay_load(sa->runnable_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
/*
@@ -149,7 +149,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
if (load)
sa->load_sum += load * contrib;
if (runnable)
- sa->runnable_load_sum += runnable * contrib;
+ sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
if (running)
sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
@@ -238,7 +238,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
}
static __always_inline void
-___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
+___update_load_avg(struct sched_avg *sa, unsigned long load)
{
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
@@ -246,7 +246,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
* Step 2: update *_avg.
*/
sa->load_avg = div_u64(load * sa->load_sum, divider);
- sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
+ sa->runnable_avg = div_u64(sa->runnable_sum, divider);
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}
@@ -254,33 +254,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
* sched_entity:
*
* task:
- * se_runnable() == se_weight()
+ * se_weight() = se->load.weight
+ * se_runnable() = !!on_rq
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
- * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ * se_runnable() = grq->h_nr_running
*
- * load_sum := runnable_sum
- * load_avg = se_weight(se) * runnable_avg
+ * runnable_sum = se_runnable() * runnable = grq->runnable_sum
+ * runnable_avg = runnable_sum
*
- * runnable_load_sum := runnable_sum
- * runnable_load_avg = se_runnable(se) * runnable_avg
- *
- * XXX collapse load_sum and runnable_load_sum
+ * load_sum := runnable
+ * load_avg = se_weight(se) * load_sum
*
* cfq_rq:
*
+ * runnable_sum = \Sum se->avg.runnable_sum
+ * runnable_avg = \Sum se->avg.runnable_avg
+ *
* load_sum = \Sum se_weight(se) * se->avg.load_sum
* load_avg = \Sum se->avg.load_avg
- *
- * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
- * runnable_load_avg = \Sum se->avg.runable_load_avg
*/
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
- ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ ___update_load_avg(&se->avg, se_weight(se));
trace_pelt_se_tp(se);
return 1;
}
@@ -290,10 +289,10 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
+ if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
cfs_rq->curr == se)) {
- ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ ___update_load_avg(&se->avg, se_weight(se));
cfs_se_util_change(&se->avg);
trace_pelt_se_tp(se);
return 1;
@@ -306,10 +305,10 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- scale_load_down(cfs_rq->runnable_weight),
+ cfs_rq->h_nr_running,
cfs_rq->curr != NULL)) {
- ___update_load_avg(&cfs_rq->avg, 1, 1);
+ ___update_load_avg(&cfs_rq->avg, 1);
trace_pelt_cfs_tp(cfs_rq);
return 1;
}
@@ -322,9 +321,9 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
- * runnable_load_sum = load_sum
+ * runnable_sum = util_sum
*
- * load_avg and runnable_load_avg are not supported and meaningless.
+ * load_avg and runnable_avg are not supported and meaningless.
*
*/
@@ -335,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
running,
running)) {
- ___update_load_avg(&rq->avg_rt, 1, 1);
+ ___update_load_avg(&rq->avg_rt, 1);
trace_pelt_rt_tp(rq);
return 1;
}
@@ -348,7 +347,9 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
- * runnable_load_sum = load_sum
+ * runnable_sum = util_sum
+ *
+ * load_avg and runnable_avg are not supported and meaningless.
*
*/
@@ -359,7 +360,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
running,
running)) {
- ___update_load_avg(&rq->avg_dl, 1, 1);
+ ___update_load_avg(&rq->avg_dl, 1);
trace_pelt_dl_tp(rq);
return 1;
}
@@ -367,13 +368,46 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+/*
+ * thermal:
+ *
+ * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
+ *
+ * util_avg and runnable_load_avg are not supported and meaningless.
+ *
+ * Unlike rt/dl utilization tracking that track time spent by a cpu
+ * running a rt/dl task through util_avg, the average thermal pressure is
+ * tracked through load_avg. This is because thermal pressure signal is
+ * time weighted "delta" capacity unlike util_avg which is binary.
+ * "delta capacity" = actual capacity -
+ * capped capacity a cpu due to a thermal event.
+ */
+
+int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ if (___update_load_sum(now, &rq->avg_thermal,
+ capacity,
+ capacity,
+ capacity)) {
+ ___update_load_avg(&rq->avg_thermal, 1);
+ trace_pelt_thermal_tp(rq);
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
* irq:
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
- * runnable_load_sum = load_sum
+ * runnable_sum = util_sum
+ *
+ * load_avg and runnable_avg are not supported and meaningless.
*
*/
@@ -410,7 +444,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
1);
if (ret) {
- ___update_load_avg(&rq->avg_irq, 1, 1);
+ ___update_load_avg(&rq->avg_irq, 1);
trace_pelt_irq_tp(rq);
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index afff644da065..eb034d9f024d 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -7,6 +7,26 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_thermal.load_avg);
+}
+#else
+static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ return 0;
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
#else
@@ -159,6 +179,17 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
}
static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ return 0;
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return 0;
+}
+
+static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 028520702717..8f45cdb6463b 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
case PSI_MEM_FULL:
return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
case PSI_CPU_SOME:
- return tasks[NR_RUNNING] > 1;
+ return tasks[NR_RUNNING] > tasks[NR_ONCPU];
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -669,13 +669,14 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
groupc->times[PSI_NONIDLE] += delta;
}
-static u32 psi_group_change(struct psi_group *group, int cpu,
- unsigned int clear, unsigned int set)
+static void psi_group_change(struct psi_group *group, int cpu,
+ unsigned int clear, unsigned int set,
+ bool wake_clock)
{
struct psi_group_cpu *groupc;
+ u32 state_mask = 0;
unsigned int t, m;
enum psi_states s;
- u32 state_mask = 0;
groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -695,10 +696,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
if (!(m & (1 << t)))
continue;
if (groupc->tasks[t] == 0 && !psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- clear, set);
+ groupc->tasks[3], clear, set);
psi_bug = 1;
}
groupc->tasks[t]--;
@@ -717,7 +718,11 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
write_seqcount_end(&groupc->seq);
- return state_mask;
+ if (state_mask & group->poll_states)
+ psi_schedule_poll_work(group, 1);
+
+ if (wake_clock && !delayed_work_pending(&group->avgs_work))
+ schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -744,27 +749,32 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
return &psi_system;
}
-void psi_task_change(struct task_struct *task, int clear, int set)
+static void psi_flags_change(struct task_struct *task, int clear, int set)
{
- int cpu = task_cpu(task);
- struct psi_group *group;
- bool wake_clock = true;
- void *iter = NULL;
-
- if (!task->pid)
- return;
-
if (((task->psi_flags & set) ||
(task->psi_flags & clear) != clear) &&
!psi_bug) {
printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
- task->pid, task->comm, cpu,
+ task->pid, task->comm, task_cpu(task),
task->psi_flags, clear, set);
psi_bug = 1;
}
task->psi_flags &= ~clear;
task->psi_flags |= set;
+}
+
+void psi_task_change(struct task_struct *task, int clear, int set)
+{
+ int cpu = task_cpu(task);
+ struct psi_group *group;
+ bool wake_clock = true;
+ void *iter = NULL;
+
+ if (!task->pid)
+ return;
+
+ psi_flags_change(task, clear, set);
/*
* Periodic aggregation shuts off if there is a period of no
@@ -777,14 +787,51 @@ void psi_task_change(struct task_struct *task, int clear, int set)
wq_worker_last_func(task) == psi_avgs_work))
wake_clock = false;
- while ((group = iterate_groups(task, &iter))) {
- u32 state_mask = psi_group_change(group, cpu, clear, set);
+ while ((group = iterate_groups(task, &iter)))
+ psi_group_change(group, cpu, clear, set, wake_clock);
+}
+
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ bool sleep)
+{
+ struct psi_group *group, *common = NULL;
+ int cpu = task_cpu(prev);
+ void *iter;
+
+ if (next->pid) {
+ psi_flags_change(next, 0, TSK_ONCPU);
+ /*
+ * When moving state between tasks, the group that
+ * contains them both does not change: we can stop
+ * updating the tree once we reach the first common
+ * ancestor. Iterate @next's ancestors until we
+ * encounter @prev's state.
+ */
+ iter = NULL;
+ while ((group = iterate_groups(next, &iter))) {
+ if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+ common = group;
+ break;
+ }
+
+ psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+ }
+ }
+
+ /*
+ * If this is a voluntary sleep, dequeue will have taken care
+ * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
+ * only need to deal with it during preemption.
+ */
+ if (sleep)
+ return;
- if (state_mask & group->poll_states)
- psi_schedule_poll_work(group, 1);
+ if (prev->pid) {
+ psi_flags_change(prev, TSK_ONCPU, 0);
- if (wake_clock && !delayed_work_pending(&group->avgs_work))
- schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+ iter = NULL;
+ while ((group = iterate_groups(prev, &iter)) && group != common)
+ psi_group_change(group, cpu, TSK_ONCPU, 0, true);
}
}
@@ -818,17 +865,17 @@ void psi_memstall_enter(unsigned long *flags)
if (static_branch_likely(&psi_disabled))
return;
- *flags = current->flags & PF_MEMSTALL;
+ *flags = current->in_memstall;
if (*flags)
return;
/*
- * PF_MEMSTALL setting & accounting needs to be atomic wrt
+ * in_memstall setting & accounting needs to be atomic wrt
* changes to the task's scheduling state, otherwise we can
* race with CPU migration.
*/
rq = this_rq_lock_irq(&rf);
- current->flags |= PF_MEMSTALL;
+ current->in_memstall = 1;
psi_task_change(current, 0, TSK_MEMSTALL);
rq_unlock_irq(rq, &rf);
@@ -851,13 +898,13 @@ void psi_memstall_leave(unsigned long *flags)
if (*flags)
return;
/*
- * PF_MEMSTALL clearing & accounting needs to be atomic wrt
+ * in_memstall clearing & accounting needs to be atomic wrt
* changes to the task's scheduling state, otherwise we could
* race with CPU migration.
*/
rq = this_rq_lock_irq(&rf);
- current->flags &= ~PF_MEMSTALL;
+ current->in_memstall = 0;
psi_task_change(current, TSK_MEMSTALL, 0);
rq_unlock_irq(rq, &rf);
@@ -916,12 +963,14 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
rq = task_rq_lock(task, &rf);
- if (task_on_rq_queued(task))
+ if (task_on_rq_queued(task)) {
task_flags = TSK_RUNNING;
- else if (task->in_iowait)
+ if (task_current(rq, task))
+ task_flags |= TSK_ONCPU;
+ } else if (task->in_iowait)
task_flags = TSK_IOWAIT;
- if (task->flags & PF_MEMSTALL)
+ if (task->in_memstall)
task_flags |= TSK_MEMSTALL;
if (task_flags)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4043abe45459..df11d88c9895 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1475,6 +1475,13 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
int target = find_lowest_rq(p);
/*
+ * Bail out if we were forcing a migration to find a better
+ * fitting CPU but our search failed.
+ */
+ if (!test && target != -1 && !rt_task_fits_capacity(p, target))
+ goto out_unlock;
+
+ /*
* Don't bother moving it if the destination CPU is
* not running a lower priority task.
*/
@@ -1482,6 +1489,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
p->prio < cpu_rq(target)->rt.highest_prio.curr)
cpu = target;
}
+
+out_unlock:
rcu_read_unlock();
out:
@@ -1495,7 +1504,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL))
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
/*
@@ -1503,7 +1512,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* see if it is pushed or pulled somewhere else.
*/
if (p->nr_cpus_allowed != 1 &&
- cpupri_find(&rq->rd->cpupri, p, NULL, NULL))
+ cpupri_find(&rq->rd->cpupri, p, NULL))
return;
/*
@@ -1647,8 +1656,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, p->cpus_ptr) &&
- rt_task_fits_capacity(p, cpu))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
@@ -1682,6 +1690,7 @@ static int find_lowest_rq(struct task_struct *task)
struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ int ret;
/* Make sure the mask is initialized first */
if (unlikely(!lowest_mask))
@@ -1690,8 +1699,22 @@ static int find_lowest_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
- if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask,
- rt_task_fits_capacity))
+ /*
+ * If we're on asym system ensure we consider the different capacities
+ * of the CPUs when searching for the lowest_mask.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+
+ ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
+ task, lowest_mask,
+ rt_task_fits_capacity);
+ } else {
+
+ ret = cpupri_find(&task_rq(task)->rd->cpupri,
+ task, lowest_mask);
+ }
+
+ if (!ret)
return -1; /* No targets found */
/*
@@ -2202,7 +2225,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
(rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio);
- if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq)))
+ if (need_to_push)
push_rt_tasks(rq);
}
@@ -2274,10 +2297,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- bool need_to_push = rq->rt.overloaded ||
- !rt_task_fits_capacity(p, cpu_of(rq));
-
- if (p->nr_cpus_allowed > 1 && need_to_push)
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
@@ -2449,10 +2469,11 @@ const struct sched_class rt_sched_class = {
*/
static DEFINE_MUTEX(rt_constraints_mutex);
-/* Must be called with tasklist_lock held */
static inline int tg_has_rt_tasks(struct task_group *tg)
{
- struct task_struct *g, *p;
+ struct task_struct *task;
+ struct css_task_iter it;
+ int ret = 0;
/*
* Autogroups do not have RT tasks; see autogroup_create().
@@ -2460,12 +2481,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
if (task_group_is_autogroup(tg))
return 0;
- for_each_process_thread(g, p) {
- if (rt_task(p) && task_group(p) == tg)
- return 1;
- }
+ css_task_iter_start(&tg->css, 0, &it);
+ while (!ret && (task = css_task_iter_next(&it)))
+ ret |= rt_task(task);
+ css_task_iter_end(&it);
- return 0;
+ return ret;
}
struct rt_schedulable_data {
@@ -2496,9 +2517,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
return -EINVAL;
/*
- * Ensure we don't starve existing RT tasks.
+ * Ensure we don't starve existing RT tasks if runtime turns zero.
*/
- if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ if (rt_bandwidth_enabled() && !runtime &&
+ tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
return -EBUSY;
total = to_ratio(period, runtime);
@@ -2564,7 +2586,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
return -EINVAL;
mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
if (err)
goto unlock;
@@ -2582,7 +2603,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
}
raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
- read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return err;
@@ -2641,9 +2661,7 @@ static int sched_rt_global_constraints(void)
int ret = 0;
mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
ret = __rt_schedulable(NULL, 0, 0);
- read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return ret;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9ea647835fd6..0f616bf7bce3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -118,7 +118,13 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust);
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
-# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w) \
+({ \
+ unsigned long __w = (w); \
+ if (__w) \
+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
+ __w; \
+})
#else
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) (w)
@@ -305,7 +311,6 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
-extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
extern void init_dl_bw(struct dl_bw *dl_b);
extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
@@ -489,7 +494,6 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned long runnable_weight;
unsigned int nr_running;
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
@@ -528,7 +532,7 @@ struct cfs_rq {
int nr;
unsigned long load_avg;
unsigned long util_avg;
- unsigned long runnable_sum;
+ unsigned long runnable_avg;
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -688,8 +692,30 @@ struct dl_rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
+
+static inline void se_update_runnable(struct sched_entity *se)
+{
+ if (!entity_is_task(se))
+ se->runnable_weight = se->my_q->h_nr_running;
+}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ if (entity_is_task(se))
+ return !!se->on_rq;
+ else
+ return se->runnable_weight;
+}
+
#else
#define entity_is_task(se) 1
+
+static inline void se_update_runnable(struct sched_entity *se) {}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ return !!se->on_rq;
+}
#endif
#ifdef CONFIG_SMP
@@ -701,10 +727,6 @@ static inline long se_weight(struct sched_entity *se)
return scale_load_down(se->load.weight);
}
-static inline long se_runnable(struct sched_entity *se)
-{
- return scale_load_down(se->runnable_weight);
-}
static inline bool sched_asym_prefer(int a, int b)
{
@@ -944,6 +966,9 @@ struct rq {
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+ struct sched_avg avg_thermal;
+#endif
u64 idle_stamp;
u64 avg_idle;
@@ -967,7 +992,6 @@ struct rq {
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
- int hrtick_csd_pending;
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
@@ -1107,6 +1131,24 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
+/**
+ * By default the decay is the default pelt decay period.
+ * The decay shift can change the decay period in
+ * multiples of 32.
+ * Decay shift Decay period(ms)
+ * 0 32
+ * 1 64
+ * 2 128
+ * 3 256
+ * 4 512
+ */
+extern int sched_thermal_decay_shift;
+
+static inline u64 rq_clock_thermal(struct rq *rq)
+{
+ return rq_clock_task(rq) >> sched_thermal_decay_shift;
+}
+
static inline void rq_clock_skip_update(struct rq *rq)
{
lockdep_assert_held(&rq->lock);
@@ -1337,8 +1379,6 @@ extern void sched_ttwu_pending(void);
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
-#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
-
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The CPU whose highest level of sched domain is to
@@ -1869,7 +1909,6 @@ extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
-extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
@@ -1968,6 +2007,13 @@ static inline int hrtick_enabled(struct rq *rq)
#endif /* CONFIG_SCHED_HRTICK */
+#ifndef arch_scale_freq_tick
+static __always_inline
+void arch_scale_freq_tick(void)
+{
+}
+#endif
+
#ifndef arch_scale_freq_capacity
static __always_inline
unsigned long arch_scale_freq_capacity(int cpu)
@@ -2492,3 +2538,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
return true;
}
#endif
+
+void swake_up_all_locked(struct swait_queue_head *q);
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index ba683fe81a6e..33d0daf83842 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -70,7 +70,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
return;
if (!wakeup || p->sched_psi_wake_requeue) {
- if (p->flags & PF_MEMSTALL)
+ if (p->in_memstall)
set |= TSK_MEMSTALL;
if (p->sched_psi_wake_requeue)
p->sched_psi_wake_requeue = 0;
@@ -90,9 +90,17 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
return;
if (!sleep) {
- if (p->flags & PF_MEMSTALL)
+ if (p->in_memstall)
clear |= TSK_MEMSTALL;
} else {
+ /*
+ * When a task sleeps, schedule() dequeues it before
+ * switching to the next one. Merge the clearing of
+ * TSK_RUNNING and TSK_ONCPU to save an unnecessary
+ * psi_task_change() call in psi_sched_switch().
+ */
+ clear |= TSK_ONCPU;
+
if (p->in_iowait)
set |= TSK_IOWAIT;
}
@@ -109,14 +117,14 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
* deregister its sleep-persistent psi states from the old
* queue, and let psi_enqueue() know it has to requeue.
*/
- if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) {
+ if (unlikely(p->in_iowait || p->in_memstall)) {
struct rq_flags rf;
struct rq *rq;
int clear = 0;
if (p->in_iowait)
clear |= TSK_IOWAIT;
- if (p->flags & PF_MEMSTALL)
+ if (p->in_memstall)
clear |= TSK_MEMSTALL;
rq = __task_rq_lock(p, &rf);
@@ -126,18 +134,31 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
}
}
+static inline void psi_sched_switch(struct task_struct *prev,
+ struct task_struct *next,
+ bool sleep)
+{
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ psi_task_switch(prev, next, sleep);
+}
+
static inline void psi_task_tick(struct rq *rq)
{
if (static_branch_likely(&psi_disabled))
return;
- if (unlikely(rq->curr->flags & PF_MEMSTALL))
+ if (unlikely(rq->curr->in_memstall))
psi_memstall_tick(rq->curr, cpu_of(rq));
}
#else /* CONFIG_PSI */
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_sched_switch(struct task_struct *prev,
+ struct task_struct *next,
+ bool sleep) {}
static inline void psi_task_tick(struct rq *rq) {}
#endif /* CONFIG_PSI */
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index e83a3f8449f6..e1c655f928c7 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -32,6 +32,19 @@ void swake_up_locked(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_locked);
+/*
+ * Wake up all waiters. This is an interface which is solely exposed for
+ * completions and not for general usage.
+ *
+ * It is intentionally different from swake_up_all() to allow usage from
+ * hard interrupt context and interrupt disabled regions.
+ */
+void swake_up_all_locked(struct swait_queue_head *q)
+{
+ while (!list_empty(&q->task_list))
+ swake_up_locked(q);
+}
+
void swake_up_one(struct swait_queue_head *q)
{
unsigned long flags;
@@ -69,7 +82,7 @@ void swake_up_all(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_all);
-static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
wait->task = current;
if (list_empty(&wait->task_list))
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index dfb64c08a407..8344757bba6e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -317,8 +317,9 @@ static void sched_energy_set(bool has_eas)
* EAS can be used on a root domain if it meets all the following conditions:
* 1. an Energy Model (EM) is available;
* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
- * 3. the EM complexity is low enough to keep scheduling overheads low;
- * 4. schedutil is driving the frequency of all CPUs of the rd;
+ * 3. no SMT is detected.
+ * 4. the EM complexity is low enough to keep scheduling overheads low;
+ * 5. schedutil is driving the frequency of all CPUs of the rd;
*
* The complexity of the Energy Model is defined as:
*
@@ -360,6 +361,13 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
goto free;
}
+ /* EAS definitely does *not* handle SMT */
+ if (sched_smt_active()) {
+ pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
+ cpumask_pr_args(cpu_map));
+ goto free;
+ }
+
for_each_cpu(i, cpu_map) {
/* Skip already covered CPUs. */
if (find_pd(pd, i))
@@ -1374,18 +1382,9 @@ sd_init(struct sched_domain_topology_level *tl,
* Convert topological properties into behaviour.
*/
- if (sd->flags & SD_ASYM_CPUCAPACITY) {
- struct sched_domain *t = sd;
-
- /*
- * Don't attempt to spread across CPUs of different capacities.
- */
- if (sd->child)
- sd->child->flags &= ~SD_PREFER_SIBLING;
-
- for_each_lower_domain(t)
- t->flags |= SD_BALANCE_WAKE;
- }
+ /* Don't attempt to spread across CPUs of different capacities. */
+ if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
+ sd->child->flags &= ~SD_PREFER_SIBLING;
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b6ea3dcb57bf..ec5c606bc3a1 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -528,8 +528,12 @@ static long seccomp_attach_filter(unsigned int flags,
int ret;
ret = seccomp_can_sync_threads();
- if (ret)
- return ret;
+ if (ret) {
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
+ return -ESRCH;
+ else
+ return ret;
+ }
}
/* Set log flag, if present. */
@@ -1221,6 +1225,7 @@ static const struct file_operations seccomp_notify_ops = {
.poll = seccomp_notify_poll,
.release = seccomp_notify_release,
.unlocked_ioctl = seccomp_notify_ioctl,
+ .compat_ioctl = seccomp_notify_ioctl,
};
static struct file *init_listener(struct seccomp_filter *filter)
@@ -1288,10 +1293,12 @@ static long seccomp_set_mode_filter(unsigned int flags,
* In the successful case, NEW_LISTENER returns the new listener fd.
* But in the failure case, TSYNC returns the thread that died. If you
* combine these two flags, there's no way to tell whether something
- * succeeded or failed. So, let's disallow this combination.
+ * succeeded or failed. So, let's disallow this combination if the user
+ * has not explicitly requested no errors from TSYNC.
*/
if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
- (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER))
+ (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) &&
+ ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
return -EINVAL;
/* Prepare the new filter before holding any locks. */
diff --git a/kernel/smp.c b/kernel/smp.c
index d0ada39eb4d4..786092aabdcd 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -329,6 +329,11 @@ EXPORT_SYMBOL(smp_call_function_single);
* (ie: embedded in an object) and is responsible for synchronizing it
* such that the IPIs performed on the @csd are strictly serialized.
*
+ * If the function is called with one csd which has not yet been
+ * processed by previous call to smp_call_function_single_async(), the
+ * function will return immediately with -EBUSY showing that the csd
+ * object is still in progress.
+ *
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
*/
@@ -338,14 +343,17 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
preempt_disable();
- /* We could deadlock if we have to wait here with interrupts disabled! */
- if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK))
- csd_lock_wait(csd);
+ if (csd->flags & CSD_FLAG_LOCK) {
+ err = -EBUSY;
+ goto out;
+ }
csd->flags = CSD_FLAG_LOCK;
smp_wmb();
err = generic_exec_single(cpu, csd, csd->func, csd->info);
+
+out:
preempt_enable();
return err;
@@ -589,20 +597,13 @@ void __init setup_nr_cpu_ids(void)
void __init smp_init(void)
{
int num_nodes, num_cpus;
- unsigned int cpu;
idle_threads_init();
cpuhp_threads_init();
pr_info("Bringing up secondary CPUs ...\n");
- /* FIXME: This should be done in userspace --RR */
- for_each_present_cpu(cpu) {
- if (num_online_cpus() >= setup_max_cpus)
- break;
- if (!cpu_online(cpu))
- cpu_up(cpu);
- }
+ bringup_nonboot_cpus(setup_max_cpus);
num_nodes = num_online_nodes();
num_cpus = num_online_cpus();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0427a86743a4..a47c6dd57452 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -126,7 +126,7 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
* Were softirqs turned off above:
*/
if (softirq_count() == (cnt & SOFTIRQ_MASK))
- trace_softirqs_off(ip);
+ lockdep_softirqs_off(ip);
raw_local_irq_restore(flags);
if (preempt_count() == cnt) {
@@ -147,7 +147,7 @@ static void __local_bh_enable(unsigned int cnt)
trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
if (softirq_count() == (cnt & SOFTIRQ_MASK))
- trace_softirqs_on(_RET_IP_);
+ lockdep_softirqs_on(_RET_IP_);
__preempt_count_sub(cnt);
}
@@ -174,7 +174,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
* Are softirqs going to be turned on now:
*/
if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
- trace_softirqs_on(ip);
+ lockdep_softirqs_on(ip);
/*
* Keep preemption disabled until we are done with
* softirq processing:
@@ -224,9 +224,9 @@ static inline bool lockdep_softirq_start(void)
{
bool in_hardirq = false;
- if (trace_hardirq_context(current)) {
+ if (lockdep_hardirq_context(current)) {
in_hardirq = true;
- trace_hardirq_exit();
+ lockdep_hardirq_exit();
}
lockdep_softirq_enter();
@@ -239,7 +239,7 @@ static inline void lockdep_softirq_end(bool in_hardirq)
lockdep_softirq_exit();
if (in_hardirq)
- trace_hardirq_enter();
+ lockdep_hardirq_enter();
}
#else
static inline bool lockdep_softirq_start(void) { return false; }
@@ -414,7 +414,8 @@ void irq_exit(void)
tick_irq_exit();
rcu_irq_exit();
- trace_hardirq_exit(); /* must be last! */
+ /* must be last! */
+ lockdep_hardirq_exit();
}
/*
diff --git a/kernel/sys.c b/kernel/sys.c
index f9bc5c303e3f..d325f3ab624a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -47,6 +47,7 @@
#include <linux/syscalls.h>
#include <linux/kprobes.h>
#include <linux/user_namespace.h>
+#include <linux/time_namespace.h>
#include <linux/binfmts.h>
#include <linux/sched.h>
@@ -2546,6 +2547,7 @@ static int do_sysinfo(struct sysinfo *info)
memset(info, 0, sizeof(struct sysinfo));
ktime_get_boottime_ts64(&tp);
+ timens_add_boottime(&tp);
info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 0fef395662a6..825f28259a19 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -97,16 +97,26 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
- raw_spin_lock_irq(&task->pi_lock);
do {
+ head = NULL;
work = READ_ONCE(task->task_works);
- head = !work && (task->flags & PF_EXITING) ?
- &work_exited : NULL;
+ if (!work) {
+ if (task->flags & PF_EXITING)
+ head = &work_exited;
+ else
+ break;
+ }
} while (cmpxchg(&task->task_works, work, head) != work);
- raw_spin_unlock_irq(&task->pi_lock);
if (!work)
break;
+ /*
+ * Synchronize with task_work_cancel(). It can not remove
+ * the first entry == work, cmpxchg(task_works) must fail.
+ * But it can remove another entry from the ->next list.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ raw_spin_unlock_irq(&task->pi_lock);
do {
next = work->next;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 428beb69426a..7cb09c4cf21c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -928,6 +928,15 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
clocksource_arch_init(cs);
+#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE
+ if (cs->vdso_clock_mode < 0 ||
+ cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
+ pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
+ cs->name, cs->vdso_clock_mode);
+ cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
+ }
+#endif
+
/* Initialize mult/shift and max_idle_ns */
__clocksource_update_freq_scale(cs, scale, freq);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3a609e7344f3..d0a5ba37aff4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -311,7 +311,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div)
div >>= 1;
}
tmp >>= sft;
- do_div(tmp, (unsigned long) div);
+ do_div(tmp, (u32) div);
return dclc < 0 ? -tmp : tmp;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
@@ -1404,7 +1404,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
base += hrtimer_clockid_to_base(clock_id);
timer->is_soft = softtimer;
- timer->is_hard = !softtimer;
+ timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
}
@@ -1514,7 +1514,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
*/
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
trace_hrtimer_expire_entry(timer, now);
+ lockdep_hrtimer_enter(timer);
+
restart = fn(timer);
+
+ lockdep_hrtimer_exit(timer);
trace_hrtimer_expire_exit(timer);
raw_spin_lock_irq(&cpu_base->lock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index d23b434c2ca7..eddcf4970444 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -58,7 +58,8 @@ static struct clocksource clocksource_jiffies = {
.max_cycles = 10,
};
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
+__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
+__cacheline_aligned_in_smp seqcount_t jiffies_seq;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
@@ -67,9 +68,9 @@ u64 get_jiffies_64(void)
u64 ret;
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
ret = jiffies_64;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
return ret;
}
EXPORT_SYMBOL(get_jiffies_64);
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 12858507d75a..e6ba064ce773 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -8,6 +8,7 @@
#include <linux/user_namespace.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
+#include <linux/clocksource.h>
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
@@ -172,8 +173,8 @@ static struct timens_offset offset_from_ts(struct timespec64 off)
* for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
* update to finish and for 'seq' to become even anyway.
*
- * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces
- * the time namespace handling path.
+ * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which
+ * enforces the time namespace handling path.
*/
static void timens_setup_vdso_data(struct vdso_data *vdata,
struct time_namespace *ns)
@@ -183,7 +184,7 @@ static void timens_setup_vdso_data(struct vdso_data *vdata,
struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
vdata->seq = 1;
- vdata->clock_mode = VCLOCK_TIMENS;
+ vdata->clock_mode = VDSO_CLOCKMODE_TIMENS;
offset[CLOCK_MONOTONIC] = monotonic;
offset[CLOCK_MONOTONIC_RAW] = monotonic;
offset[CLOCK_MONOTONIC_COARSE] = monotonic;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 8ff6da77a01f..2fd3b3fa68bf 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -118,6 +118,16 @@ static inline int validate_clock_permissions(const clockid_t clock)
return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL;
}
+static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer)
+{
+ return CPUCLOCK_PERTHREAD(timer->it_clock) ? PIDTYPE_PID : PIDTYPE_TGID;
+}
+
+static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer)
+{
+ return pid_task(timer->it.cpu.pid, cpu_timer_pid_type(timer));
+}
+
/*
* Update expiry time from increment, and increase overrun count,
* given the current clock sample.
@@ -336,9 +346,7 @@ static void __thread_group_cputime(struct task_struct *tsk, u64 *samples)
/*
* Sample a process (thread group) clock for the given task clkid. If the
* group's cputime accounting is already enabled, read the atomic
- * store. Otherwise a full update is required. Task's sighand lock must be
- * held to protect the task traversal on a full update. clkid is already
- * validated.
+ * store. Otherwise a full update is required. clkid is already validated.
*/
static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p,
bool start)
@@ -393,7 +401,12 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
new_timer->kclock = &clock_posix_cpu;
timerqueue_init(&new_timer->it.cpu.node);
- new_timer->it.cpu.task = p;
+ new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer));
+ /*
+ * get_task_for_clock() took a reference on @p. Drop it as the timer
+ * holds a reference on the pid of @p.
+ */
+ put_task_struct(p);
return 0;
}
@@ -406,13 +419,15 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
static int posix_cpu_timer_del(struct k_itimer *timer)
{
struct cpu_timer *ctmr = &timer->it.cpu;
- struct task_struct *p = ctmr->task;
struct sighand_struct *sighand;
+ struct task_struct *p;
unsigned long flags;
int ret = 0;
- if (WARN_ON_ONCE(!p))
- return -EINVAL;
+ rcu_read_lock();
+ p = cpu_timer_task_rcu(timer);
+ if (!p)
+ goto out;
/*
* Protect against sighand release/switch in exit/exec and process/
@@ -434,8 +449,10 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
unlock_task_sighand(p, &flags);
}
+out:
+ rcu_read_unlock();
if (!ret)
- put_task_struct(p);
+ put_pid(ctmr->pid);
return ret;
}
@@ -484,12 +501,11 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
* Insert the timer on the appropriate list before any timers that
* expire later. This must be called with the sighand lock held.
*/
-static void arm_timer(struct k_itimer *timer)
+static void arm_timer(struct k_itimer *timer, struct task_struct *p)
{
int clkidx = CPUCLOCK_WHICH(timer->it_clock);
struct cpu_timer *ctmr = &timer->it.cpu;
u64 newexp = cpu_timer_getexpires(ctmr);
- struct task_struct *p = ctmr->task;
struct posix_cputimer_base *base;
if (CPUCLOCK_PERTHREAD(timer->it_clock))
@@ -564,13 +580,21 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
u64 old_expires, new_expires, old_incr, val;
struct cpu_timer *ctmr = &timer->it.cpu;
- struct task_struct *p = ctmr->task;
struct sighand_struct *sighand;
+ struct task_struct *p;
unsigned long flags;
int ret = 0;
- if (WARN_ON_ONCE(!p))
- return -EINVAL;
+ rcu_read_lock();
+ p = cpu_timer_task_rcu(timer);
+ if (!p) {
+ /*
+ * If p has just been reaped, we can no
+ * longer get any information about it at all.
+ */
+ rcu_read_unlock();
+ return -ESRCH;
+ }
/*
* Use the to_ktime conversion because that clamps the maximum
@@ -587,8 +611,10 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* If p has just been reaped, we can no
* longer get any information about it at all.
*/
- if (unlikely(sighand == NULL))
+ if (unlikely(sighand == NULL)) {
+ rcu_read_unlock();
return -ESRCH;
+ }
/*
* Disarm any old timer after extracting its expiry time.
@@ -662,7 +688,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
*/
cpu_timer_setexpires(ctmr, new_expires);
if (new_expires != 0 && val < new_expires) {
- arm_timer(timer);
+ arm_timer(timer, p);
}
unlock_task_sighand(p, &flags);
@@ -693,6 +719,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
ret = 0;
out:
+ rcu_read_unlock();
if (old)
old->it_interval = ns_to_timespec64(old_incr);
@@ -704,10 +731,12 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
struct cpu_timer *ctmr = &timer->it.cpu;
u64 now, expires = cpu_timer_getexpires(ctmr);
- struct task_struct *p = ctmr->task;
+ struct task_struct *p;
- if (WARN_ON_ONCE(!p))
- return;
+ rcu_read_lock();
+ p = cpu_timer_task_rcu(timer);
+ if (!p)
+ goto out;
/*
* Easy part: convert the reload time.
@@ -715,36 +744,15 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
itp->it_interval = ktime_to_timespec64(timer->it_interval);
if (!expires)
- return;
+ goto out;
/*
* Sample the clock to take the difference with the expiry time.
*/
- if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
now = cpu_clock_sample(clkid, p);
- } else {
- struct sighand_struct *sighand;
- unsigned long flags;
-
- /*
- * Protect against sighand release/switch in exit/exec and
- * also make timer sampling safe if it ends up calling
- * thread_group_cputime().
- */
- sighand = lock_task_sighand(p, &flags);
- if (unlikely(sighand == NULL)) {
- /*
- * The process has been reaped.
- * We can't even collect a sample any more.
- * Disarm the timer, nothing else to do.
- */
- cpu_timer_setexpires(ctmr, 0);
- return;
- } else {
- now = cpu_clock_sample_group(clkid, p, false);
- unlock_task_sighand(p, &flags);
- }
- }
+ else
+ now = cpu_clock_sample_group(clkid, p, false);
if (now < expires) {
itp->it_value = ns_to_timespec64(expires - now);
@@ -756,6 +764,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
itp->it_value.tv_nsec = 1;
itp->it_value.tv_sec = 0;
}
+out:
+ rcu_read_unlock();
}
#define MAX_COLLECTED 20
@@ -976,56 +986,38 @@ static void check_process_timers(struct task_struct *tsk,
static void posix_cpu_timer_rearm(struct k_itimer *timer)
{
clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
- struct cpu_timer *ctmr = &timer->it.cpu;
- struct task_struct *p = ctmr->task;
+ struct task_struct *p;
struct sighand_struct *sighand;
unsigned long flags;
u64 now;
- if (WARN_ON_ONCE(!p))
- return;
+ rcu_read_lock();
+ p = cpu_timer_task_rcu(timer);
+ if (!p)
+ goto out;
/*
* Fetch the current sample and update the timer's expiry time.
*/
- if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
now = cpu_clock_sample(clkid, p);
- bump_cpu_timer(timer, now);
- if (unlikely(p->exit_state))
- return;
-
- /* Protect timer list r/w in arm_timer() */
- sighand = lock_task_sighand(p, &flags);
- if (!sighand)
- return;
- } else {
- /*
- * Protect arm_timer() and timer sampling in case of call to
- * thread_group_cputime().
- */
- sighand = lock_task_sighand(p, &flags);
- if (unlikely(sighand == NULL)) {
- /*
- * The process has been reaped.
- * We can't even collect a sample any more.
- */
- cpu_timer_setexpires(ctmr, 0);
- return;
- } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
- /* If the process is dying, no need to rearm */
- goto unlock;
- }
+ else
now = cpu_clock_sample_group(clkid, p, true);
- bump_cpu_timer(timer, now);
- /* Leave the sighand locked for the call below. */
- }
+
+ bump_cpu_timer(timer, now);
+
+ /* Protect timer list r/w in arm_timer() */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL))
+ goto out;
/*
* Now re-arm for the new expiry time.
*/
- arm_timer(timer);
-unlock:
+ arm_timer(timer, p);
unlock_task_sighand(p, &flags);
+out:
+ rcu_read_unlock();
}
/**
@@ -1126,8 +1118,11 @@ void run_posix_cpu_timers(void)
if (!fastpath_timer_check(tsk))
return;
- if (!lock_task_sighand(tsk, &flags))
+ lockdep_posixtimer_enter();
+ if (!lock_task_sighand(tsk, &flags)) {
+ lockdep_posixtimer_exit();
return;
+ }
/*
* Here we take off tsk->signal->cpu_timers[N] and
* tsk->cpu_timers[N] all the timers that are firing, and
@@ -1169,6 +1164,7 @@ void run_posix_cpu_timers(void)
cpu_timer_fire(timer);
spin_unlock(&timer->it_lock);
}
+ lockdep_posixtimer_exit();
}
/*
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index ff0eb30de346..07709ac30439 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -121,7 +121,8 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head,
{
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash) {
+ hlist_for_each_entry_rcu(timer, head, t_hash,
+ lockdep_is_held(&hash_lock)) {
if ((timer->it_signal == sig) && (timer->it_id == id))
return timer;
}
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index e4332e3e2d56..fa3f800d7d76 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -208,7 +208,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
if (sched_clock_timer.function != NULL) {
/* update timeout for clock wrap */
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt,
+ HRTIMER_MODE_REL_HARD);
}
r = rate;
@@ -254,9 +255,9 @@ void __init generic_sched_clock_init(void)
* Start the timer to keep sched_clock() properly updated and
* sets the initial epoch.
*/
- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
sched_clock_timer.function = sched_clock_poll;
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}
/*
@@ -293,7 +294,7 @@ void sched_clock_resume(void)
struct clock_read_data *rd = &cd.read_data[0];
rd->epoch_cyc = cd.actual_read_sched_clock();
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
rd->read_sched_clock = cd.actual_read_sched_clock;
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7e5d3524e924..6c9c342dd0e5 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -84,13 +84,15 @@ int tick_is_oneshot_available(void)
static void tick_periodic(int cpu)
{
if (tick_do_timer_cpu == cpu) {
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
/* Keep track of the next tick event */
tick_next_period = ktime_add(tick_next_period, tick_period);
do_timer(1);
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -162,9 +164,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
ktime_t next;
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
next = tick_next_period;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a792d21cac64..3e2dc9b8858c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -65,7 +65,8 @@ static void tick_do_update_jiffies64(ktime_t now)
return;
/* Reevaluate with jiffies_lock held */
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
delta = ktime_sub(now, last_jiffies_update);
if (delta >= tick_period) {
@@ -91,10 +92,12 @@ static void tick_do_update_jiffies64(ktime_t now)
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
} else {
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
return;
}
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -105,12 +108,14 @@ static ktime_t tick_init_jiffy_update(void)
{
ktime_t period;
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
/* Did we start the jiffies update yet ? */
if (last_jiffies_update == 0)
last_jiffies_update = tick_next_period;
period = last_jiffies_update;
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
return period;
}
@@ -240,6 +245,7 @@ static void nohz_full_kick_func(struct irq_work *work)
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
.func = nohz_full_kick_func,
+ .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ),
};
/*
@@ -676,10 +682,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
/* Read jiffies and the time when jiffies were updated last */
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
basemono = last_jiffies_update;
basejiff = jiffies;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
ts->last_jiffies = basejiff;
ts->timer_expires_base = basemono;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ca69290bee2a..9ebaab13339d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1005,9 +1005,8 @@ static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
((int)sizeof(u64)*8 - fls64(mult) < fls64(rem)))
return -EOVERFLOW;
tmp *= mult;
- rem *= mult;
- do_div(rem, div);
+ rem = div64_u64(rem * mult, div);
*base = tmp + rem;
return 0;
}
@@ -2397,8 +2396,10 @@ EXPORT_SYMBOL(hardpps);
*/
void xtime_update(unsigned long ticks)
{
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
do_timer(ticks);
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 141ab3ab0354..099737f6f10c 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -25,7 +25,8 @@ static inline void sched_clock_resume(void) { }
extern void do_timer(unsigned long ticks);
extern void update_wall_time(void);
-extern seqlock_t jiffies_lock;
+extern raw_spinlock_t jiffies_lock;
+extern seqcount_t jiffies_seq;
#define CS_NAME_LEN 32
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4820823515e9..a5221abb4594 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -944,6 +944,7 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
#define MOD_TIMER_PENDING_ONLY 0x01
#define MOD_TIMER_REDUCE 0x02
+#define MOD_TIMER_NOTPENDING 0x04
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
@@ -960,7 +961,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
* the timer is re-modified to have the same timeout or ends up in the
* same array bucket then just return:
*/
- if (timer_pending(timer)) {
+ if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
/*
* The downside of this optimization is that it can result in
* larger granularity than you would get from adding a new
@@ -1133,7 +1134,7 @@ EXPORT_SYMBOL(timer_reduce);
void add_timer(struct timer_list *timer)
{
BUG_ON(timer_pending(timer));
- mod_timer(timer, timer->expires);
+ __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);
@@ -1828,21 +1829,23 @@ static void process_timeout(struct timer_list *t)
* schedule_timeout - sleep until timeout
* @timeout: timeout value in jiffies
*
- * Make the current task sleep until @timeout jiffies have
- * elapsed. The routine will return immediately unless
- * the current task state has been set (see set_current_state()).
+ * Make the current task sleep until @timeout jiffies have elapsed.
+ * The function behavior depends on the current task state
+ * (see also set_current_state() description):
*
- * You can set the task state as follows -
+ * %TASK_RUNNING - the scheduler is called, but the task does not sleep
+ * at all. That happens because sched_submit_work() does nothing for
+ * tasks in %TASK_RUNNING state.
*
* %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
* pass before the routine returns unless the current task is explicitly
- * woken up, (e.g. by wake_up_process())".
+ * woken up, (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
* delivered to the current task or the current task is explicitly woken
* up.
*
- * The current task state is guaranteed to be TASK_RUNNING when this
+ * The current task state is guaranteed to be %TASK_RUNNING when this
* routine returns.
*
* Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
@@ -1850,7 +1853,7 @@ static void process_timeout(struct timer_list *t)
* value will be %MAX_SCHEDULE_TIMEOUT.
*
* Returns 0 when the timer has expired otherwise the remaining time in
- * jiffies will be returned. In all cases the return value is guaranteed
+ * jiffies will be returned. In all cases the return value is guaranteed
* to be non-negative.
*/
signed long __sched schedule_timeout(signed long timeout)
@@ -1891,7 +1894,7 @@ signed long __sched schedule_timeout(signed long timeout)
timer.task = current;
timer_setup_on_stack(&timer.timer, process_timeout, 0);
- __mod_timer(&timer.timer, expire, 0);
+ __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
schedule();
del_singleshot_timer_sync(&timer.timer);
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 9577c89179cd..54ce6eb2ca36 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -71,13 +71,15 @@ void update_vsyscall(struct timekeeper *tk)
{
struct vdso_data *vdata = __arch_get_k_vdso_data();
struct vdso_timestamp *vdso_ts;
+ s32 clock_mode;
u64 nsec;
/* copy vsyscall data */
vdso_write_begin(vdata);
- vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk);
- vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk);
+ clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
+ vdata[CS_HRES_COARSE].clock_mode = clock_mode;
+ vdata[CS_RAW].clock_mode = clock_mode;
/* CLOCK_REALTIME also required for time() */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
@@ -103,10 +105,10 @@ void update_vsyscall(struct timekeeper *tk)
WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
/*
- * Architectures can opt out of updating the high resolution part
- * of the VDSO.
+ * If the current clocksource is not VDSO capable, then spare the
+ * update of the high reolution parts.
*/
- if (__arch_update_vdso_data())
+ if (clock_mode != VDSO_CLOCKMODE_NONE)
update_vdso_data(vdata, tk);
__arch_update_vsyscall(vdata, tk);
diff --git a/kernel/torture.c b/kernel/torture.c
index 7c13f5558b71..a1a41484ff6d 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -42,6 +42,9 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
+static bool disable_onoff_at_boot;
+module_param(disable_onoff_at_boot, bool, 0444);
+
static char *torture_type;
static int verbose;
@@ -84,6 +87,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
{
unsigned long delta;
int ret;
+ char *s;
unsigned long starttime;
if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
@@ -97,12 +101,18 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
torture_type, cpu);
starttime = jiffies;
(*n_offl_attempts)++;
- ret = cpu_down(cpu);
+ ret = remove_cpu(cpu);
if (ret) {
+ s = "";
+ if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) {
+ // PCI probe frequently disables hotplug during boot.
+ (*n_offl_attempts)--;
+ s = " (-EBUSY forgiven during boot)";
+ }
if (verbose)
pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: offline %d failed: errno %d\n",
- torture_type, cpu, ret);
+ "torture_onoff task: offline %d failed%s: errno %d\n",
+ torture_type, cpu, s, ret);
} else {
if (verbose > 1)
pr_alert("%s" TORTURE_FLAG
@@ -137,6 +147,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
{
unsigned long delta;
int ret;
+ char *s;
unsigned long starttime;
if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
@@ -148,12 +159,18 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
torture_type, cpu);
starttime = jiffies;
(*n_onl_attempts)++;
- ret = cpu_up(cpu);
+ ret = add_cpu(cpu);
if (ret) {
+ s = "";
+ if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) {
+ // PCI probe frequently disables hotplug during boot.
+ (*n_onl_attempts)--;
+ s = " (-EBUSY forgiven during boot)";
+ }
if (verbose)
pr_alert("%s" TORTURE_FLAG
- "torture_onoff task: online %d failed: errno %d\n",
- torture_type, cpu, ret);
+ "torture_onoff task: online %d failed%s: errno %d\n",
+ torture_type, cpu, s, ret);
} else {
if (verbose > 1)
pr_alert("%s" TORTURE_FLAG
@@ -192,17 +209,18 @@ torture_onoff(void *arg)
for_each_online_cpu(cpu)
maxcpu = cpu;
WARN_ON(maxcpu < 0);
- if (!IS_MODULE(CONFIG_TORTURE_TEST))
+ if (!IS_MODULE(CONFIG_TORTURE_TEST)) {
for_each_possible_cpu(cpu) {
if (cpu_online(cpu))
continue;
- ret = cpu_up(cpu);
+ ret = add_cpu(cpu);
if (ret && verbose) {
pr_alert("%s" TORTURE_FLAG
"%s: Initial online %d: errno %d\n",
__func__, torture_type, cpu, ret);
}
}
+ }
if (maxcpu == 0) {
VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
@@ -215,6 +233,10 @@ torture_onoff(void *arg)
VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
}
while (!torture_must_stop()) {
+ if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) {
+ schedule_timeout_interruptible(HZ / 10);
+ continue;
+ }
cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
if (!torture_offline(cpu,
&n_offline_attempts, &n_offline_successes,
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 4560878f0bac..ca39dc3230cb 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1896,8 +1896,11 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
ret = 0;
- if (bt == NULL)
+ if (bt == NULL) {
ret = blk_trace_setup_queue(q, bdev);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
+ }
if (ret == 0) {
if (attr == &dev_attr_act_mask)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 19e793aa441a..68250d433bd7 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -732,7 +732,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
- if (in_nmi()) {
+ if (irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
* the error is lost in deferred irq_work.
*/
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3f7ee102868a..fd81c7de77a7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1547,6 +1547,8 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
rec = bsearch(&key, pg->records, pg->index,
sizeof(struct dyn_ftrace),
ftrace_cmp_recs);
+ if (rec)
+ break;
}
return rec;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 301db4406bc3..4e01c448b4b4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1411,14 +1411,16 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
return;
rcu_read_lock();
retry:
- if (req_cpu == WORK_CPU_UNBOUND)
- cpu = wq_select_unbound_cpu(raw_smp_processor_id());
-
/* pwq which will be used unless @work is executing elsewhere */
- if (!(wq->flags & WQ_UNBOUND))
- pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
- else
+ if (wq->flags & WQ_UNBOUND) {
+ if (req_cpu == WORK_CPU_UNBOUND)
+ cpu = wq_select_unbound_cpu(raw_smp_processor_id());
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
+ } else {
+ if (req_cpu == WORK_CPU_UNBOUND)
+ cpu = raw_smp_processor_id();
+ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
+ }
/*
* If @work was previously on a different pool, it might still be