summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks13
-rw-r--r--kernel/compat.c6
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/events/core.c56
-rw-r--r--kernel/events/internal.h9
-rw-r--r--kernel/events/ring_buffer.c16
-rw-r--r--kernel/fork.c8
-rw-r--r--kernel/futex.c35
-rw-r--r--kernel/irq/chip.c14
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/manage.c31
-rw-r--r--kernel/irq/migration.c9
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lglock.c22
-rw-r--r--kernel/locking/lockdep.c6
-rw-r--r--kernel/locking/lockdep_proc.c22
-rw-r--r--kernel/locking/locktorture.c14
-rw-r--r--kernel/locking/mcs_spinlock.h1
-rw-r--r--kernel/locking/qrwlock.c30
-rw-r--r--kernel/locking/qspinlock.c473
-rw-r--r--kernel/locking/qspinlock_paravirt.h325
-rw-r--r--kernel/locking/rtmutex.c13
-rw-r--r--kernel/locking/rwsem-xadd.c44
-rw-r--r--kernel/module.c3
-rw-r--r--kernel/rcu/rcutorture.c103
-rw-r--r--kernel/rcu/srcu.c10
-rw-r--r--kernel/rcu/tiny.c38
-rw-r--r--kernel/rcu/tiny_plugin.h12
-rw-r--r--kernel/rcu/tree.c365
-rw-r--r--kernel/rcu/tree.h35
-rw-r--r--kernel/rcu/tree_plugin.h216
-rw-r--r--kernel/rcu/tree_trace.c6
-rw-r--r--kernel/rcu/update.c30
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/auto_group.c6
-rw-r--r--kernel/sched/auto_group.h2
-rw-r--r--kernel/sched/core.c143
-rw-r--r--kernel/sched/cputime.c2
-rw-r--r--kernel/sched/deadline.c51
-rw-r--r--kernel/sched/debug.c11
-rw-r--r--kernel/sched/fair.c374
-rw-r--r--kernel/sched/loadavg.c (renamed from kernel/sched/proc.c)236
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h11
-rw-r--r--kernel/sched/stats.h15
-rw-r--r--kernel/sched/wait.c8
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/stop_machine.c42
-rw-r--r--kernel/sys.c8
-rw-r--r--kernel/time/hrtimer.c14
-rw-r--r--kernel/time/posix-cpu-timers.c87
-rw-r--r--kernel/torture.c26
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace_events_filter.c11
-rw-r--r--kernel/watchdog.c2
55 files changed, 2131 insertions, 904 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 08561f1acd13..ebdb0043203a 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER
def_bool y
depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
-config ARCH_USE_QUEUE_RWLOCK
+config ARCH_USE_QUEUED_SPINLOCKS
bool
-config QUEUE_RWLOCK
- def_bool y if ARCH_USE_QUEUE_RWLOCK
+config QUEUED_SPINLOCKS
+ def_bool y if ARCH_USE_QUEUED_SPINLOCKS
+ depends on SMP
+
+config ARCH_USE_QUEUED_RWLOCKS
+ bool
+
+config QUEUED_RWLOCKS
+ def_bool y if ARCH_USE_QUEUED_RWLOCKS
depends on SMP
diff --git a/kernel/compat.c b/kernel/compat.c
index 24f00610c575..333d364be29d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -912,7 +912,8 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
* bitmap. We must however ensure the end of the
* kernel bitmap is zeroed.
*/
- if (nr_compat_longs-- > 0) {
+ if (nr_compat_longs) {
+ nr_compat_longs--;
if (__get_user(um, umask))
return -EFAULT;
} else {
@@ -954,7 +955,8 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
* We dont want to write past the end of the userspace
* bitmap.
*/
- if (nr_compat_longs-- > 0) {
+ if (nr_compat_longs) {
+ nr_compat_longs--;
if (__put_user(um, umask))
return -EFAULT;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 94bbe4695232..9c9c9fab16cc 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -398,7 +398,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
- smpboot_unpark_threads(cpu);
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
goto out_release;
}
@@ -463,6 +462,7 @@ static int smpboot_thread_call(struct notifier_block *nfb,
switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_FAILED:
case CPU_ONLINE:
smpboot_unpark_threads(cpu);
break;
@@ -479,7 +479,7 @@ static struct notifier_block smpboot_thread_notifier = {
.priority = CPU_PRI_SMPBOOT,
};
-void __cpuinit smpboot_thread_init(void)
+void smpboot_thread_init(void)
{
register_cpu_notifier(&smpboot_thread_notifier);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1c6c2826af1e..8e13f3e54ec3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3418,7 +3418,6 @@ static void free_event_rcu(struct rcu_head *head)
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
- perf_event_free_bpf_prog(event);
kfree(event);
}
@@ -3549,6 +3548,8 @@ static void __free_event(struct perf_event *event)
put_callchain_buffers();
}
+ perf_event_free_bpf_prog(event);
+
if (event->destroy)
event->destroy(event);
@@ -4306,20 +4307,20 @@ static void ring_buffer_attach(struct perf_event *event,
WARN_ON_ONCE(event->rcu_pending);
old_rb = event->rb;
- event->rcu_batches = get_state_synchronize_rcu();
- event->rcu_pending = 1;
-
spin_lock_irqsave(&old_rb->event_lock, flags);
list_del_rcu(&event->rb_entry);
spin_unlock_irqrestore(&old_rb->event_lock, flags);
- }
- if (event->rcu_pending && rb) {
- cond_synchronize_rcu(event->rcu_batches);
- event->rcu_pending = 0;
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
}
if (rb) {
+ if (event->rcu_pending) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
+
spin_lock_irqsave(&rb->event_lock, flags);
list_add_rcu(&event->rb_entry, &rb->event_list);
spin_unlock_irqrestore(&rb->event_lock, flags);
@@ -5356,9 +5357,9 @@ void perf_prepare_sample(struct perf_event_header *header,
}
}
-static void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+void perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct perf_output_handle handle;
struct perf_event_header header;
@@ -5950,6 +5951,39 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
}
/*
+ * Lost/dropped samples logging
+ */
+void perf_log_lost_samples(struct perf_event *event, u64 lost)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ struct {
+ struct perf_event_header header;
+ u64 lost;
+ } lost_samples_event = {
+ .header = {
+ .type = PERF_RECORD_LOST_SAMPLES,
+ .misc = 0,
+ .size = sizeof(lost_samples_event),
+ },
+ .lost = lost,
+ };
+
+ perf_event_header__init_id(&lost_samples_event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ lost_samples_event.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, lost_samples_event);
+ perf_event__output_id_sample(event, &handle, &sample);
+ perf_output_end(&handle);
+}
+
+/*
* IRQ throttle logging
*/
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 9f6ce9ba4a04..2deb24c7a40d 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -72,15 +72,6 @@ static inline bool rb_has_aux(struct ring_buffer *rb)
void perf_event_aux_event(struct perf_event *event, unsigned long head,
unsigned long size, u64 flags);
-extern void
-perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
- struct perf_event *event);
-extern void
-perf_event__output_id_sample(struct perf_event *event,
- struct perf_output_handle *handle,
- struct perf_sample_data *sample);
-
extern struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 232f00f273cb..96472824a752 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle,
perf_output_get_handle(handle);
do {
- tail = ACCESS_ONCE(rb->user_page->data_tail);
+ tail = READ_ONCE_CTRL(rb->user_page->data_tail);
offset = head = local_read(&rb->head);
if (!rb->overwrite &&
unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
@@ -493,6 +493,20 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
}
+ /*
+ * In overwrite mode, PMUs that don't support SG may not handle more
+ * than one contiguous allocation, since they rely on PMI to do double
+ * buffering. In this case, the entire buffer has to be one contiguous
+ * chunk.
+ */
+ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
+ overwrite) {
+ struct page *page = virt_to_page(rb->aux_pages[0]);
+
+ if (page_private(page) != max_order)
+ goto out;
+ }
+
rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
overwrite);
if (!rb->aux_priv)
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..0bb88b555550 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,10 +1091,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
unsigned long cpu_limit;
- /* Thread group counters. */
- thread_group_cputime_init(sig);
-
- cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
sig->cputimer.running = 1;
@@ -1396,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
+
+ p->pagefault_disabled = 0;
+
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
diff --git a/kernel/futex.c b/kernel/futex.c
index 720eacff6b58..ea6ca0bca525 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
/*
* The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
*/
-static void wake_futex(struct futex_q *q)
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
{
struct task_struct *p = q->task;
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
return;
/*
- * We set q->lock_ptr = NULL _before_ we wake up the task. If
- * a non-futex wake up happens on another CPU then the task
- * might exit and p would dereference a non-existing task
- * struct. Prevent this by holding a reference on p across the
- * wake up.
+ * Queue the task for later wakeup for after we've released
+ * the hb->lock. wake_q_add() grabs reference to p.
*/
- get_task_struct(p);
-
+ wake_q_add(wake_q, p);
__unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as
@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q)
*/
smp_wmb();
q->lock_ptr = NULL;
-
- wake_up_state(p, TASK_NORMAL);
- put_task_struct(p);
}
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
int ret;
+ WAKE_Q(wake_q);
if (!bitset)
return -EINVAL;
@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++ret >= nr_wake)
break;
}
}
spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
out_put_key:
put_futex_key(&key);
out:
@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
+ WAKE_Q(wake_q);
retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1318,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@@ -1334,7 +1332,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++op_ret >= nr_wake2)
break;
}
@@ -1344,6 +1342,7 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
out_put_keys:
put_futex_key(&key2);
out_put_key1:
@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
+ WAKE_Q(wake_q);
if (requeue_pi) {
/*
@@ -1679,7 +1679,7 @@ retry_private:
* woken by futex_unlock_pi().
*/
if (++task_count <= nr_wake && !requeue_pi) {
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
continue;
}
@@ -1719,6 +1719,7 @@ retry_private:
out_unlock:
free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
hb_waiters_dec(hb2);
/*
@@ -2055,7 +2056,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
{
/*
* The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using set_mb() and
+ * wake it. set_current_state() is implemented using smp_store_mb() and
* queue_me() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index eb9a4ea394ab..55016b2151f3 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -950,6 +950,20 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
}
/**
+ * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @dest: The vcpu affinity information
+ */
+int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_vcpu_affinity)
+ return data->chip->irq_set_vcpu_affinity(data, vcpu_info);
+
+ return -ENOSYS;
+}
+
+/**
* irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
* @data: Pointer to interrupt specific data
* @on: Whether to set or reset the wake-up capability of this irq
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index df553b0af936..b93d434e70bd 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -59,8 +59,6 @@ enum {
#include "debug.h"
#include "settings.h"
-#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
-
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e68932bb308e..b1c7e8f46bfb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -256,6 +256,37 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+/**
+ * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ * @irq: interrupt number to set affinity
+ * @vcpu_info: vCPU specific data
+ *
+ * This function uses the vCPU specific data to set the vCPU
+ * affinity for an irq. The vCPU specific data is passed from
+ * outside, such as KVM. One example code path is as below:
+ * KVM -> IOMMU -> irq_set_vcpu_affinity().
+ */
+int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ struct irq_data *data;
+ struct irq_chip *chip;
+ int ret = -ENOSYS;
+
+ if (!desc)
+ return -EINVAL;
+
+ data = irq_desc_get_irq_data(desc);
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
+ irq_put_desc_unlock(desc, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
+
static void irq_affinity_notify(struct work_struct *work)
{
struct irq_affinity_notify *notify =
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index ca3f4aaff707..dd203e276b07 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,7 +7,7 @@
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
- struct irq_chip *chip = idata->chip;
+ struct irq_chip *chip = desc->irq_data.chip;
if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
return;
@@ -52,6 +52,13 @@ void irq_move_irq(struct irq_data *idata)
{
bool masked;
+ /*
+ * Get top level irq_data when CONFIG_IRQ_DOMAIN_HIERARCHY is enabled,
+ * and it should be optimized away when CONFIG_IRQ_DOMAIN_HIERARCHY is
+ * disabled. So we avoid an "#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY" here.
+ */
+ idata = irq_desc_get_irq_data(irq_data_to_desc(idata));
+
if (likely(!irqd_is_setaffinity_pending(idata)))
return;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index de7a416cca2a..7dd5c9918e4c 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
@@ -25,5 +26,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
-obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..951cfcd10b4a 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
}
EXPORT_SYMBOL(lg_local_unlock_cpu);
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
+{
+ BUG_ON(cpu1 == cpu2);
+
+ /* lock in cpu order, just like lg_global_lock */
+ if (cpu2 < cpu1)
+ swap(cpu1, cpu2);
+
+ preempt_disable();
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
+}
+
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
+{
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
+ preempt_enable();
+}
+
void lg_global_lock(struct lglock *lg)
{
int i;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a0831e1b99f4..456614136f1a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3900,7 +3900,8 @@ static void zap_class(struct lock_class *class)
list_del_rcu(&class->hash_entry);
list_del_rcu(&class->lock_entry);
- class->key = NULL;
+ RCU_INIT_POINTER(class->key, NULL);
+ RCU_INIT_POINTER(class->name, NULL);
}
static inline int within(const void *addr, void *start, unsigned long size)
@@ -4066,8 +4067,7 @@ void __init lockdep_info(void)
#ifdef CONFIG_DEBUG_LOCKDEP
if (lockdep_init_error) {
- printk("WARNING: lockdep init error! lock-%s was acquired"
- "before lockdep_init\n", lock_init_error);
+ printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
printk("Call stack leading to lockdep invocation was:\n");
print_stack_trace(&lockdep_init_trace, 0);
}
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index ef43ac4bafb5..d83d798bef95 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -426,10 +426,12 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
{
- char name[39];
- struct lock_class *class;
+ struct lockdep_subclass_key *ckey;
struct lock_class_stats *stats;
+ struct lock_class *class;
+ const char *cname;
int i, namelen;
+ char name[39];
class = data->class;
stats = &data->stats;
@@ -440,15 +442,25 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
if (class->subclass)
namelen -= 2;
- if (!class->name) {
+ rcu_read_lock_sched();
+ cname = rcu_dereference_sched(class->name);
+ ckey = rcu_dereference_sched(class->key);
+
+ if (!cname && !ckey) {
+ rcu_read_unlock_sched();
+ return;
+
+ } else if (!cname) {
char str[KSYM_NAME_LEN];
const char *key_name;
- key_name = __get_key_name(class->key, str);
+ key_name = __get_key_name(ckey, str);
snprintf(name, namelen, "%s", key_name);
} else {
- snprintf(name, namelen, "%s", class->name);
+ snprintf(name, namelen, "%s", cname);
}
+ rcu_read_unlock_sched();
+
namelen = strlen(name);
if (class->name_version > 1) {
snprintf(name+namelen, 3, "#%d", class->name_version);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ec8cce259779..32244186f1f2 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -122,12 +122,12 @@ static int torture_lock_busted_write_lock(void)
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_us = 100;
+ const unsigned long longdelay_ms = 100;
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_us)))
- mdelay(longdelay_us);
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
preempt_schedule(); /* Allow test to be preempted. */
@@ -160,14 +160,14 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_us = 100;
+ const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_us)))
- mdelay(longdelay_us);
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2 * shortdelay_us)))
udelay(shortdelay_us);
@@ -309,7 +309,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
static void torture_rwlock_read_unlock_irq(void)
__releases(torture_rwlock)
{
- write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+ read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
}
static struct lock_torture_ops rw_lock_irq_ops = {
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 75e114bdf3f2..fd91aaa4554c 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,6 +17,7 @@
struct mcs_spinlock {
struct mcs_spinlock *next;
int locked; /* 1 if lock acquired */
+ int count; /* nesting count, see qspinlock.c */
};
#ifndef arch_mcs_spin_lock_contended
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f956ede7f90d..6c5da483966b 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,5 +1,5 @@
/*
- * Queue read/write lock
+ * Queued read/write locks
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -22,6 +22,26 @@
#include <linux/hardirq.h>
#include <asm/qrwlock.h>
+/*
+ * This internal data structure is used for optimizing access to some of
+ * the subfields within the atomic_t cnts.
+ */
+struct __qrwlock {
+ union {
+ atomic_t cnts;
+ struct {
+#ifdef __LITTLE_ENDIAN
+ u8 wmode; /* Writer mode */
+ u8 rcnts[3]; /* Reader counts */
+#else
+ u8 rcnts[3]; /* Reader counts */
+ u8 wmode; /* Writer mode */
+#endif
+ };
+ };
+ arch_spinlock_t lock;
+};
+
/**
* rspin_until_writer_unlock - inc reader count & spin until writer is gone
* @lock : Pointer to queue rwlock structure
@@ -107,10 +127,10 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
* or wait for a previous writer to go away.
*/
for (;;) {
- cnts = atomic_read(&lock->cnts);
- if (!(cnts & _QW_WMASK) &&
- (atomic_cmpxchg(&lock->cnts, cnts,
- cnts | _QW_WAITING) == cnts))
+ struct __qrwlock *l = (struct __qrwlock *)lock;
+
+ if (!READ_ONCE(l->wmode) &&
+ (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0))
break;
cpu_relax_lowlatency();
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 000000000000..38c49202d532
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,473 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ * Peter Zijlstra <peterz@infradead.org>
+ */
+
+#ifndef _GEN_PV_LOCK_SLOWPATH
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/byteorder.h>
+#include <asm/qspinlock.h>
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. The paper below provides a good description for this kind
+ * of lock.
+ *
+ * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to make
+ * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
+ * API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * Since a spinlock disables recursion of its own context and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+#include "mcs_spinlock.h"
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES 8
+#else
+#define MAX_NODES 4
+#endif
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+
+static inline u32 encode_tail(int cpu, int idx)
+{
+ u32 tail;
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+ BUG_ON(idx > 3);
+#endif
+ tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+ tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+ return tail;
+}
+
+static inline struct mcs_spinlock *decode_tail(u32 tail)
+{
+ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+ int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+ return per_cpu_ptr(&mcs_nodes[idx], cpu);
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ *
+ * This internal structure is also used by the set_locked function which
+ * is not restricted to _Q_PENDING_BITS == 8.
+ */
+struct __qspinlock {
+ union {
+ atomic_t val;
+#ifdef __LITTLE_ENDIAN
+ struct {
+ u8 locked;
+ u8 pending;
+ };
+ struct {
+ u16 locked_pending;
+ u16 tail;
+ };
+#else
+ struct {
+ u16 tail;
+ u16 locked_pending;
+ };
+ struct {
+ u8 reserved[2];
+ u8 pending;
+ u8 locked;
+ };
+#endif
+ };
+};
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ u32 old, new, val = atomic_read(&lock->val);
+
+ for (;;) {
+ new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+ old = atomic_cmpxchg(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+ return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+
+
+/*
+ * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
+
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
+
+#define pv_enabled() false
+
+#define pv_init_node __pv_init_node
+#define pv_wait_node __pv_wait_node
+#define pv_kick_node __pv_kick_node
+#define pv_wait_head __pv_wait_head
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
+#endif
+
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+
+/**
+ * queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ * fast : slow : unlock
+ * : :
+ * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ * : | ^--------.------. / :
+ * : v \ \ | :
+ * pending : (0,1,1) +--> (0,1,0) \ | :
+ * : | ^--' | | :
+ * : v | | :
+ * uncontended : (n,x,y) +--> (n,0,0) --' | :
+ * queue : | ^--' | :
+ * : v | :
+ * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
+ * queue : ^--' :
+ */
+void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ u32 new, old, tail;
+ int idx;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+ if (pv_enabled())
+ goto queue;
+
+ if (virt_queued_spin_lock(lock))
+ return;
+
+ /*
+ * wait for in-progress pending->locked hand-overs
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ if (val == _Q_PENDING_VAL) {
+ while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+ cpu_relax();
+ }
+
+ /*
+ * trylock || pending
+ *
+ * 0,0,0 -> 0,0,1 ; trylock
+ * 0,0,1 -> 0,1,1 ; pending
+ */
+ for (;;) {
+ /*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ new = _Q_LOCKED_VAL;
+ if (val == new)
+ new |= _Q_PENDING_VAL;
+
+ old = atomic_cmpxchg(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+
+ /*
+ * we won the trylock
+ */
+ if (new == _Q_LOCKED_VAL)
+ return;
+
+ /*
+ * we're pending, wait for the owner to go away.
+ *
+ * *,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all clear_pending_set_locked()
+ * implementations imply full barriers.
+ */
+ while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
+ cpu_relax();
+
+ /*
+ * take ownership and clear the pending bit.
+ *
+ * *,1,0 -> *,0,1
+ */
+ clear_pending_set_locked(lock);
+ return;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+ * queuing.
+ */
+queue:
+ node = this_cpu_ptr(&mcs_nodes[0]);
+ idx = node->count++;
+ tail = encode_tail(smp_processor_id(), idx);
+
+ node += idx;
+ node->locked = 0;
+ node->next = NULL;
+ pv_init_node(node);
+
+ /*
+ * We touched a (possibly) cold cacheline in the per-cpu queue node;
+ * attempt the trylock once more in the hope someone let go while we
+ * weren't watching.
+ */
+ if (queued_spin_trylock(lock))
+ goto release;
+
+ /*
+ * We have already touched the queueing cacheline; don't bother with
+ * pending stuff.
+ *
+ * p,*,* -> n,*,*
+ */
+ old = xchg_tail(lock, tail);
+
+ /*
+ * if there was a previous node; link it and wait until reaching the
+ * head of the waitqueue.
+ */
+ if (old & _Q_TAIL_MASK) {
+ prev = decode_tail(old);
+ WRITE_ONCE(prev->next, node);
+
+ pv_wait_node(node);
+ arch_mcs_spin_lock_contended(&node->locked);
+ }
+
+ /*
+ * we're at the head of the waitqueue, wait for the owner & pending to
+ * go away.
+ *
+ * *,x,y -> *,0,0
+ *
+ * this wait loop must use a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because the set_locked() function below
+ * does not imply a full barrier.
+ *
+ */
+ pv_wait_head(lock, node);
+ while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+ cpu_relax();
+
+ /*
+ * claim the lock:
+ *
+ * n,0,0 -> 0,0,1 : lock, uncontended
+ * *,0,0 -> *,0,1 : lock, contended
+ *
+ * If the queue head is the only one in the queue (lock value == tail),
+ * clear the tail code and grab the lock. Otherwise, we only need
+ * to grab the lock.
+ */
+ for (;;) {
+ if (val != tail) {
+ set_locked(lock);
+ break;
+ }
+ old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+ if (old == val)
+ goto release; /* No contention */
+
+ val = old;
+ }
+
+ /*
+ * contended path; wait for next, release.
+ */
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+ pv_kick_node(next);
+
+release:
+ /*
+ * release the node
+ */
+ this_cpu_dec(mcs_nodes[0].count);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+/*
+ * Generate the paravirt code for queued_spin_unlock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+
+#undef pv_enabled
+#define pv_enabled() true
+
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head
+
+#undef queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
+
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+
+#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644
index 000000000000..04ab18151cc8
--- /dev/null
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -0,0 +1,325 @@
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ * pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ * pv_kick(cpu) -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+
+#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+
+enum vcpu_state {
+ vcpu_running = 0,
+ vcpu_halted,
+};
+
+struct pv_node {
+ struct mcs_spinlock mcs;
+ struct mcs_spinlock __res[3];
+
+ int cpu;
+ u8 state;
+};
+
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+ struct qspinlock *lock;
+ struct pv_node *node;
+};
+
+#define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN (PAGE_SIZE / sizeof(struct pv_hash_entry))
+
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ */
+void __init __pv_init_lock_hash(void)
+{
+ int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+
+ if (pv_hash_size < PV_HE_MIN)
+ pv_hash_size = PV_HE_MIN;
+
+ /*
+ * Allocate space from bootmem which should be page-size aligned
+ * and hence cacheline aligned.
+ */
+ pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+ sizeof(struct pv_hash_entry),
+ pv_hash_size, 0, HASH_EARLY,
+ &pv_lock_hash_bits, NULL,
+ pv_hash_size, pv_hash_size);
+}
+
+#define for_each_hash_entry(he, offset, hash) \
+ for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0; \
+ offset < (1 << pv_lock_hash_bits); \
+ offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+
+ for_each_hash_entry(he, offset, hash) {
+ if (!cmpxchg(&he->lock, NULL, lock)) {
+ WRITE_ONCE(he->node, node);
+ return &he->lock;
+ }
+ }
+ /*
+ * Hard assume there is a free entry for us.
+ *
+ * This is guaranteed by ensuring every blocked lock only ever consumes
+ * a single entry, and since we only have 4 nesting levels per CPU
+ * and allocated 4*nr_possible_cpus(), this must be so.
+ *
+ * The single entry is guaranteed by having the lock owner unhash
+ * before it releases.
+ */
+ BUG();
+}
+
+static struct pv_node *pv_unhash(struct qspinlock *lock)
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+ struct pv_node *node;
+
+ for_each_hash_entry(he, offset, hash) {
+ if (READ_ONCE(he->lock) == lock) {
+ node = READ_ONCE(he->node);
+ WRITE_ONCE(he->lock, NULL);
+ return node;
+ }
+ }
+ /*
+ * Hard assume we'll find an entry.
+ *
+ * This guarantees a limited lookup time and is itself guaranteed by
+ * having the lock owner do the unhash -- IFF the unlock sees the
+ * SLOW flag, there MUST be a hash entry.
+ */
+ BUG();
+}
+
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+
+ BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+
+ pn->cpu = smp_processor_id();
+ pn->state = vcpu_running;
+}
+
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to wake the vcpu again.
+ */
+static void pv_wait_node(struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ int loop;
+
+ for (;;) {
+ for (loop = SPIN_THRESHOLD; loop; loop--) {
+ if (READ_ONCE(node->locked))
+ return;
+ cpu_relax();
+ }
+
+ /*
+ * Order pn->state vs pn->locked thusly:
+ *
+ * [S] pn->state = vcpu_halted [S] next->locked = 1
+ * MB MB
+ * [L] pn->locked [RmW] pn->state = vcpu_running
+ *
+ * Matches the xchg() from pv_kick_node().
+ */
+ smp_store_mb(pn->state, vcpu_halted);
+
+ if (!READ_ONCE(node->locked))
+ pv_wait(&pn->state, vcpu_halted);
+
+ /*
+ * Reset the vCPU state to avoid unncessary CPU kicking
+ */
+ WRITE_ONCE(pn->state, vcpu_running);
+
+ /*
+ * If the locked flag is still not set after wakeup, it is a
+ * spurious wakeup and the vCPU should wait again. However,
+ * there is a pretty high overhead for CPU halting and kicking.
+ * So it is better to spin for a while in the hope that the
+ * MCS lock will be released soon.
+ */
+ }
+ /*
+ * By now our node->locked should be 1 and our caller will not actually
+ * spin-wait for it. We do however rely on our caller to do a
+ * load-acquire for us.
+ */
+}
+
+/*
+ * Called after setting next->locked = 1, used to wake those stuck in
+ * pv_wait_node().
+ */
+static void pv_kick_node(struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+
+ /*
+ * Note that because node->locked is already set, this actual
+ * mcs_spinlock entry could be re-used already.
+ *
+ * This should be fine however, kicking people for no reason is
+ * harmless.
+ *
+ * See the comment in pv_wait_node().
+ */
+ if (xchg(&pn->state, vcpu_running) == vcpu_halted)
+ pv_kick(pn->cpu);
+}
+
+/*
+ * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ */
+static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ struct __qspinlock *l = (void *)lock;
+ struct qspinlock **lp = NULL;
+ int loop;
+
+ for (;;) {
+ for (loop = SPIN_THRESHOLD; loop; loop--) {
+ if (!READ_ONCE(l->locked))
+ return;
+ cpu_relax();
+ }
+
+ WRITE_ONCE(pn->state, vcpu_halted);
+ if (!lp) { /* ONCE */
+ lp = pv_hash(lock, pn);
+ /*
+ * lp must be set before setting _Q_SLOW_VAL
+ *
+ * [S] lp = lock [RmW] l = l->locked = 0
+ * MB MB
+ * [S] l->locked = _Q_SLOW_VAL [L] lp
+ *
+ * Matches the cmpxchg() in __pv_queued_spin_unlock().
+ */
+ if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+ /*
+ * The lock is free and _Q_SLOW_VAL has never
+ * been set. Therefore we need to unhash before
+ * getting the lock.
+ */
+ WRITE_ONCE(*lp, NULL);
+ return;
+ }
+ }
+ pv_wait(&l->locked, _Q_SLOW_VAL);
+
+ /*
+ * The unlocker should have freed the lock before kicking the
+ * CPU. So if the lock is still not free, it is a spurious
+ * wakeup and so the vCPU should wait again after spinning for
+ * a while.
+ */
+ }
+
+ /*
+ * Lock is unlocked now; the caller will acquire it without waiting.
+ * As with pv_wait_node() we rely on the caller to do a load-acquire
+ * for us.
+ */
+}
+
+/*
+ * PV version of the unlock function to be used in stead of
+ * queued_spin_unlock().
+ */
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+ struct pv_node *node;
+
+ /*
+ * We must not unlock if SLOW, because in that case we must first
+ * unhash. Otherwise it would be possible to have multiple @lock
+ * entries, which would be BAD.
+ */
+ if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL))
+ return;
+
+ /*
+ * Since the above failed to release, this must be the SLOW path.
+ * Therefore start by looking up the blocked node and unhashing it.
+ */
+ node = pv_unhash(lock);
+
+ /*
+ * Now that we have a reference to the (likely) blocked pv_node,
+ * release the lock.
+ */
+ smp_store_release(&l->locked, 0);
+
+ /*
+ * At this point the memory pointed at by lock can be freed/reused,
+ * however we can still use the pv_node to kick the CPU.
+ */
+ if (READ_ONCE(node->state) == vcpu_halted)
+ pv_kick(node->cpu);
+}
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() near the top of the file to make sure
+ * that the callee-save thunk and the real unlock function are close
+ * to each other sharing consecutive instruction cachelines.
+ */
+#include <asm/qspinlock_paravirt.h>
+
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8b678cac7fbe..36573e96a477 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -70,10 +70,10 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
}
/*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
+ * We can speed up the acquire/release, if there's no debugging state to be
+ * set up.
*/
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+#ifndef CONFIG_DEBUG_RT_MUTEXES
# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
@@ -1440,10 +1440,17 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
*
* @lock: the rt_mutex to be locked
*
+ * This function can only be called in thread context. It's safe to
+ * call it from atomic regions, but not from hard interrupt or soft
+ * interrupt context.
+ *
* Returns 1 on success and 0 on contention
*/
int __sched rt_mutex_trylock(struct rt_mutex *lock)
{
+ if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+ return 0;
+
return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
}
EXPORT_SYMBOL_GPL(rt_mutex_trylock);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 3417d0172a5d..0f189714e457 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -409,11 +409,24 @@ done:
return taken;
}
+/*
+ * Return true if the rwsem has active spinner
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+ return osq_is_locked(&sem->osq);
+}
+
#else
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
return false;
}
+
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+ return false;
+}
#endif
/*
@@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ /*
+ * If a spinner is present, it is not necessary to do the wakeup.
+ * Try to do wakeup only if the trylock succeeds to minimize
+ * spinlock contention which may introduce too much delay in the
+ * unlock operation.
+ *
+ * spinning writer up_write/up_read caller
+ * --------------- -----------------------
+ * [S] osq_unlock() [L] osq
+ * MB RMB
+ * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+ *
+ * Here, it is important to make sure that there won't be a missed
+ * wakeup while the rwsem is free and the only spinning writer goes
+ * to sleep without taking the rwsem. Even when the spinning writer
+ * is just going to break out of the waiting loop, it will still do
+ * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
+ * rwsem_has_spinner() is true, it will guarantee at least one
+ * trylock attempt on the rwsem later on.
+ */
+ if (rwsem_has_spinner(sem)) {
+ /*
+ * The smp_rmb() here is to make sure that the spinner
+ * state is consulted before reading the wait_lock.
+ */
+ smp_rmb();
+ if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+ return sem;
+ goto locked;
+ }
raw_spin_lock_irqsave(&sem->wait_lock, flags);
+locked:
/* do nothing if list empty */
if (!list_empty(&sem->wait_list))
diff --git a/kernel/module.c b/kernel/module.c
index 42a1d2afb217..cfc9e843a924 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3370,6 +3370,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+
/* we can't deallocate the module until we clear memory protection */
unset_module_init_ro_nx(mod);
unset_module_core_ro_nx(mod);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8dbe27611ec3..59e32684c23b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -241,6 +241,7 @@ rcu_torture_free(struct rcu_torture *p)
struct rcu_torture_ops {
int ttype;
void (*init)(void);
+ void (*cleanup)(void);
int (*readlock)(void);
void (*read_delay)(struct torture_random_state *rrsp);
void (*readunlock)(int idx);
@@ -477,10 +478,12 @@ static struct rcu_torture_ops rcu_busted_ops = {
*/
DEFINE_STATIC_SRCU(srcu_ctl);
+static struct srcu_struct srcu_ctld;
+static struct srcu_struct *srcu_ctlp = &srcu_ctl;
-static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
{
- return srcu_read_lock(&srcu_ctl);
+ return srcu_read_lock(srcu_ctlp);
}
static void srcu_read_delay(struct torture_random_state *rrsp)
@@ -499,49 +502,49 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
rcu_read_delay(rrsp);
}
-static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
{
- srcu_read_unlock(&srcu_ctl, idx);
+ srcu_read_unlock(srcu_ctlp, idx);
}
static unsigned long srcu_torture_completed(void)
{
- return srcu_batches_completed(&srcu_ctl);
+ return srcu_batches_completed(srcu_ctlp);
}
static void srcu_torture_deferred_free(struct rcu_torture *rp)
{
- call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+ call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb);
}
static void srcu_torture_synchronize(void)
{
- synchronize_srcu(&srcu_ctl);
+ synchronize_srcu(srcu_ctlp);
}
static void srcu_torture_call(struct rcu_head *head,
void (*func)(struct rcu_head *head))
{
- call_srcu(&srcu_ctl, head, func);
+ call_srcu(srcu_ctlp, head, func);
}
static void srcu_torture_barrier(void)
{
- srcu_barrier(&srcu_ctl);
+ srcu_barrier(srcu_ctlp);
}
static void srcu_torture_stats(void)
{
int cpu;
- int idx = srcu_ctl.completed & 0x1;
+ int idx = srcu_ctlp->completed & 0x1;
pr_alert("%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
long c0, c1;
- c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
- c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+ c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
+ c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
}
pr_cont("\n");
@@ -549,7 +552,7 @@ static void srcu_torture_stats(void)
static void srcu_torture_synchronize_expedited(void)
{
- synchronize_srcu_expedited(&srcu_ctl);
+ synchronize_srcu_expedited(srcu_ctlp);
}
static struct rcu_torture_ops srcu_ops = {
@@ -569,6 +572,38 @@ static struct rcu_torture_ops srcu_ops = {
.name = "srcu"
};
+static void srcu_torture_init(void)
+{
+ rcu_sync_torture_init();
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ srcu_ctlp = &srcu_ctld;
+}
+
+static void srcu_torture_cleanup(void)
+{
+ cleanup_srcu_struct(&srcu_ctld);
+ srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
+}
+
+/* As above, but dynamically allocated. */
+static struct rcu_torture_ops srcud_ops = {
+ .ttype = SRCU_FLAVOR,
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .started = NULL,
+ .completed = srcu_torture_completed,
+ .deferred_free = srcu_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .exp_sync = srcu_torture_synchronize_expedited,
+ .call = srcu_torture_call,
+ .cb_barrier = srcu_torture_barrier,
+ .stats = srcu_torture_stats,
+ .name = "srcud"
+};
+
/*
* Definitions for sched torture testing.
*/
@@ -672,8 +707,8 @@ static void rcu_torture_boost_cb(struct rcu_head *head)
struct rcu_boost_inflight *rbip =
container_of(head, struct rcu_boost_inflight, rcu);
- smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
- rbip->inflight = 0;
+ /* Ensure RCU-core accesses precede clearing ->inflight */
+ smp_store_release(&rbip->inflight, 0);
}
static int rcu_torture_boost(void *arg)
@@ -710,9 +745,9 @@ static int rcu_torture_boost(void *arg)
call_rcu_time = jiffies;
while (ULONG_CMP_LT(jiffies, endtime)) {
/* If we don't have a callback in flight, post one. */
- if (!rbi.inflight) {
- smp_mb(); /* RCU core before ->inflight = 1. */
- rbi.inflight = 1;
+ if (!smp_load_acquire(&rbi.inflight)) {
+ /* RCU core before ->inflight = 1. */
+ smp_store_release(&rbi.inflight, 1);
call_rcu(&rbi.rcu, rcu_torture_boost_cb);
if (jiffies - call_rcu_time >
test_boost_duration * HZ - HZ / 2) {
@@ -751,11 +786,10 @@ checkwait: stutter_wait("rcu_torture_boost");
} while (!torture_must_stop());
/* Clean up and exit. */
- while (!kthread_should_stop() || rbi.inflight) {
+ while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
torture_shutdown_absorb("rcu_torture_boost");
schedule_timeout_uninterruptible(1);
}
- smp_mb(); /* order accesses to ->inflight before stack-frame death. */
destroy_rcu_head_on_stack(&rbi.rcu);
torture_kthread_stopping("rcu_torture_boost");
return 0;
@@ -1054,7 +1088,7 @@ static void rcu_torture_timer(unsigned long unused)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(&srcu_ctl));
+ srcu_read_lock_held(srcu_ctlp));
if (p == NULL) {
/* Leave because rcu_torture_writer is not yet underway */
cur_ops->readunlock(idx);
@@ -1128,7 +1162,7 @@ rcu_torture_reader(void *arg)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(&srcu_ctl));
+ srcu_read_lock_held(srcu_ctlp));
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
cur_ops->readunlock(idx);
@@ -1413,12 +1447,15 @@ static int rcu_torture_barrier_cbs(void *arg)
do {
wait_event(barrier_cbs_wq[myid],
(newphase =
- ACCESS_ONCE(barrier_phase)) != lastphase ||
+ smp_load_acquire(&barrier_phase)) != lastphase ||
torture_must_stop());
lastphase = newphase;
- smp_mb(); /* ensure barrier_phase load before ->call(). */
if (torture_must_stop())
break;
+ /*
+ * The above smp_load_acquire() ensures barrier_phase load
+ * is ordered before the folloiwng ->call().
+ */
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
@@ -1439,8 +1476,8 @@ static int rcu_torture_barrier(void *arg)
do {
atomic_set(&barrier_cbs_invoked, 0);
atomic_set(&barrier_cbs_count, n_barrier_cbs);
- smp_mb(); /* Ensure barrier_phase after prior assignments. */
- barrier_phase = !barrier_phase;
+ /* Ensure barrier_phase ordered after prior assignments. */
+ smp_store_release(&barrier_phase, !barrier_phase);
for (i = 0; i < n_barrier_cbs; i++)
wake_up(&barrier_cbs_wq[i]);
wait_event(barrier_wq,
@@ -1588,10 +1625,14 @@ rcu_torture_cleanup(void)
rcutorture_booster_cleanup(i);
}
- /* Wait for all RCU callbacks to fire. */
-
+ /*
+ * Wait for all RCU callbacks to fire, then do flavor-specific
+ * cleanup operations.
+ */
if (cur_ops->cb_barrier != NULL)
cur_ops->cb_barrier();
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
@@ -1668,8 +1709,8 @@ rcu_torture_init(void)
int cpu;
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
- &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
- RCUTORTURE_TASKS_OPS
+ &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
+ &sched_ops, RCUTORTURE_TASKS_OPS
};
if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@@ -1701,7 +1742,7 @@ rcu_torture_init(void)
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
- nrealreaders = num_online_cpus() - 1;
+ nrealreaders = num_online_cpus() - 2 - nreaders;
if (nrealreaders <= 0)
nrealreaders = 1;
}
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index cad76e76b4e7..fb33d35ee0b7 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -151,7 +151,7 @@ static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
unsigned long t;
for_each_possible_cpu(cpu) {
- t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+ t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
sum += t;
}
return sum;
@@ -168,7 +168,7 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
unsigned long t;
for_each_possible_cpu(cpu) {
- t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+ t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
sum += t;
}
return sum;
@@ -265,8 +265,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
- sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+ sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+ sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
}
return sum;
}
@@ -296,7 +296,7 @@ int __srcu_read_lock(struct srcu_struct *sp)
{
int idx;
- idx = ACCESS_ONCE(sp->completed) & 0x1;
+ idx = READ_ONCE(sp->completed) & 0x1;
preempt_disable();
__this_cpu_inc(sp->per_cpu_ref->c[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 069742d61c68..591af0cb7b9f 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -49,39 +49,6 @@ static void __call_rcu(struct rcu_head *head,
#include "tiny_plugin.h"
-/*
- * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode.
- */
-void rcu_idle_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-/*
- * Exit an interrupt handler towards idle.
- */
-void rcu_irq_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_exit);
-
-/*
- * Exit idle, so that we are no longer in an extended quiescent state.
- */
-void rcu_idle_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
-/*
- * Enter an interrupt handler, moving away from idle.
- */
-void rcu_irq_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_enter);
-
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
/*
@@ -170,6 +137,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
+ if (rcp->donetail == &rcp->rcucblist) {
+ /* No callbacks ready, so just leave. */
+ local_irq_restore(flags);
+ return;
+ }
RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
list = rcp->rcucblist;
rcp->rcucblist = *rcp->donetail;
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index f94e209a10d6..e492a5253e0f 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,16 +144,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
return;
rcp->ticks_this_gp++;
j = jiffies;
- js = ACCESS_ONCE(rcp->jiffies_stall);
+ js = READ_ONCE(rcp->jiffies_stall);
if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
jiffies - rcp->gp_start, rcp->qlen);
dump_stack();
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
- 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
} else if (ULONG_CMP_GE(j, js)) {
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+ WRITE_ONCE(rcp->jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
}
}
@@ -161,7 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
{
rcp->ticks_this_gp = 0;
rcp->gp_start = jiffies;
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+ WRITE_ONCE(rcp->jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
}
static void check_cpu_stalls(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8cf7304b2867..add042926a66 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,7 +91,7 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
DEFINE_RCU_TPS(sname) \
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
struct rcu_state sname##_state = { \
.level = { &sname##_state.node[0] }, \
.rda = &sname##_data, \
@@ -110,11 +110,18 @@ struct rcu_state sname##_state = { \
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
-static struct rcu_state *rcu_state_p;
+static struct rcu_state *const rcu_state_p;
+static struct rcu_data __percpu *const rcu_data_p;
LIST_HEAD(rcu_struct_flavors);
-/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
-static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+/* Dump rcu_node combining tree at boot to verify correct setup. */
+static bool dump_tree;
+module_param(dump_tree, bool, 0444);
+/* Control rcu_node-tree auto-balancing at boot time. */
+static bool rcu_fanout_exact;
+module_param(rcu_fanout_exact, bool, 0444);
+/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
@@ -159,17 +166,46 @@ static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
/* rcuc/rcub kthread realtime priority */
+#ifdef CONFIG_RCU_KTHREAD_PRIO
static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
+static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
+#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
module_param(kthread_prio, int, 0644);
/* Delay in jiffies for grace-period initialization delays, debug only. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
+static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
+module_param(gp_preinit_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+static const int gp_preinit_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+
#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
module_param(gp_init_delay, int, 0644);
#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
static const int gp_init_delay;
#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-#define PER_RCU_NODE_PERIOD 10 /* Number of grace periods between delays. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
+static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
+module_param(gp_cleanup_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+static const int gp_cleanup_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+
+/*
+ * Number of grace periods between delays, normalized by the duration of
+ * the delay. The longer the the delay, the more the grace periods between
+ * each delay. The reason for this normalization is that it means that,
+ * for non-zero delays, the overall slowdown of grace periods is constant
+ * regardless of the duration of the delay. This arrangement balances
+ * the need for long delays to increase some race probabilities with the
+ * need for fast grace periods to increase other race probabilities.
+ */
+#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
/*
* Track the rcutorture test sequence number and the update version
@@ -191,17 +227,17 @@ unsigned long rcutorture_vernum;
*/
unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
- return ACCESS_ONCE(rnp->qsmaskinitnext);
+ return READ_ONCE(rnp->qsmaskinitnext);
}
/*
- * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
+ * Return true if an RCU grace period is in progress. The READ_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
*/
static int rcu_gp_in_progress(struct rcu_state *rsp)
{
- return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+ return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum);
}
/*
@@ -278,8 +314,8 @@ static void rcu_momentary_dyntick_idle(void)
if (!(resched_mask & rsp->flavor_mask))
continue;
smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
- if (ACCESS_ONCE(rdp->mynode->completed) !=
- ACCESS_ONCE(rdp->cond_resched_completed))
+ if (READ_ONCE(rdp->mynode->completed) !=
+ READ_ONCE(rdp->cond_resched_completed))
continue;
/*
@@ -491,9 +527,9 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
break;
}
if (rsp != NULL) {
- *flags = ACCESS_ONCE(rsp->gp_flags);
- *gpnum = ACCESS_ONCE(rsp->gpnum);
- *completed = ACCESS_ONCE(rsp->completed);
+ *flags = READ_ONCE(rsp->gp_flags);
+ *gpnum = READ_ONCE(rsp->gpnum);
+ *completed = READ_ONCE(rsp->completed);
return;
}
*flags = 0;
@@ -539,10 +575,10 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
static int rcu_future_needs_gp(struct rcu_state *rsp)
{
struct rcu_node *rnp = rcu_get_root(rsp);
- int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+ int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
int *fp = &rnp->need_future_gp[idx];
- return ACCESS_ONCE(*fp);
+ return READ_ONCE(*fp);
}
/*
@@ -565,7 +601,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
return 1; /* Yes, this CPU has newly registered callbacks. */
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
- ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+ ULONG_CMP_LT(READ_ONCE(rsp->completed),
rdp->nxtcompleted[i]))
return 1; /* Yes, CBs for future grace period. */
return 0; /* No grace period needed. */
@@ -585,7 +621,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
- if (!user && !is_idle_task(current)) {
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
idle_task(smp_processor_id());
@@ -604,7 +641,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
smp_mb__after_atomic(); /* Force ordering with next sojourn. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ atomic_read(&rdtp->dynticks) & 0x1);
rcu_dynticks_task_enter();
/*
@@ -630,7 +668,8 @@ static void rcu_eqs_enter(bool user)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (oldval & DYNTICK_TASK_NEST_MASK) == 0);
if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_nesting = 0;
rcu_eqs_enter_common(oldval, user);
@@ -703,7 +742,8 @@ void rcu_irq_exit(void)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting--;
- WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ rdtp->dynticks_nesting < 0);
if (rdtp->dynticks_nesting)
trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
else
@@ -728,10 +768,12 @@ static void rcu_eqs_exit_common(long long oldval, int user)
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !(atomic_read(&rdtp->dynticks) & 0x1));
rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
- if (!user && !is_idle_task(current)) {
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
idle_task(smp_processor_id());
@@ -755,7 +797,7 @@ static void rcu_eqs_exit(bool user)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE(oldval < 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
if (oldval & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
@@ -828,7 +870,8 @@ void rcu_irq_enter(void)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
- WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ rdtp->dynticks_nesting == 0);
if (oldval)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
@@ -1011,9 +1054,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
return 1;
} else {
- if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+ if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
rdp->mynode->gpnum))
- ACCESS_ONCE(rdp->gpwrap) = true;
+ WRITE_ONCE(rdp->gpwrap, true);
return 0;
}
}
@@ -1093,12 +1136,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
if (ULONG_CMP_GE(jiffies,
rdp->rsp->gp_start + jiffies_till_sched_qs) ||
ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
- if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
- ACCESS_ONCE(rdp->cond_resched_completed) =
- ACCESS_ONCE(rdp->mynode->completed);
+ if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+ WRITE_ONCE(rdp->cond_resched_completed,
+ READ_ONCE(rdp->mynode->completed));
smp_mb(); /* ->cond_resched_completed before *rcrmp. */
- ACCESS_ONCE(*rcrmp) =
- ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+ WRITE_ONCE(*rcrmp,
+ READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
rdp->rsp->jiffies_resched += 5; /* Enable beating. */
} else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
@@ -1119,9 +1162,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
rsp->gp_start = j;
smp_wmb(); /* Record start time before stall time. */
j1 = rcu_jiffies_till_stall_check();
- ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
+ WRITE_ONCE(rsp->jiffies_stall, j + j1);
rsp->jiffies_resched = j + j1 / 2;
- rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+ rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs);
}
/*
@@ -1133,10 +1176,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
unsigned long j;
j = jiffies;
- gpa = ACCESS_ONCE(rsp->gp_activity);
+ gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies!\n",
- rsp->name, j - gpa);
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n",
+ rsp->name, j - gpa,
+ rsp->gpnum, rsp->completed, rsp->gp_flags);
}
/*
@@ -1173,12 +1217,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
/* Only let one CPU complain about others per time interval. */
raw_spin_lock_irqsave(&rnp->lock, flags);
- delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+ delta = jiffies - READ_ONCE(rsp->jiffies_stall);
if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rsp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -1212,12 +1257,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
if (ndetected) {
rcu_dump_cpu_stacks(rsp);
} else {
- if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
- ACCESS_ONCE(rsp->completed) == gpnum) {
+ if (READ_ONCE(rsp->gpnum) != gpnum ||
+ READ_ONCE(rsp->completed) == gpnum) {
pr_err("INFO: Stall ended before state dump start\n");
} else {
j = jiffies;
- gpa = ACCESS_ONCE(rsp->gp_activity);
+ gpa = READ_ONCE(rsp->gp_activity);
pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
rsp->name, j - gpa, j, gpa,
jiffies_till_next_fqs,
@@ -1262,9 +1307,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
- if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
- 3 * rcu_jiffies_till_stall_check() + 3;
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
+ WRITE_ONCE(rsp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -1307,20 +1352,20 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
* Given this check, comparisons of jiffies, rsp->jiffies_stall,
* and rsp->gp_start suffice to forestall false positives.
*/
- gpnum = ACCESS_ONCE(rsp->gpnum);
+ gpnum = READ_ONCE(rsp->gpnum);
smp_rmb(); /* Pick up ->gpnum first... */
- js = ACCESS_ONCE(rsp->jiffies_stall);
+ js = READ_ONCE(rsp->jiffies_stall);
smp_rmb(); /* ...then ->jiffies_stall before the rest... */
- gps = ACCESS_ONCE(rsp->gp_start);
+ gps = READ_ONCE(rsp->gp_start);
smp_rmb(); /* ...and finally ->gp_start before ->completed. */
- completed = ACCESS_ONCE(rsp->completed);
+ completed = READ_ONCE(rsp->completed);
if (ULONG_CMP_GE(completed, gpnum) ||
ULONG_CMP_LT(j, js) ||
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
if (rcu_gp_in_progress(rsp) &&
- (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+ (READ_ONCE(rnp->qsmask) & rdp->grpmask)) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
@@ -1347,7 +1392,7 @@ void rcu_cpu_stall_reset(void)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
+ WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2);
}
/*
@@ -1457,7 +1502,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* doing some extra useless work.
*/
if (rnp->gpnum != rnp->completed ||
- ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
+ READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
goto out;
@@ -1542,7 +1587,7 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
static void rcu_gp_kthread_wake(struct rcu_state *rsp)
{
if (current == rsp->gp_kthread ||
- !ACCESS_ONCE(rsp->gp_flags) ||
+ !READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
wake_up(&rsp->gp_wq);
@@ -1677,7 +1722,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
- !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+ !unlikely(READ_ONCE(rdp->gpwrap))) {
/* No grace period end, so just accelerate recent callbacks. */
ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1692,7 +1737,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
}
- if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+ if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) {
/*
* If the current grace period is waiting for this CPU,
* set up to detect a quiescent state, otherwise don't
@@ -1704,7 +1749,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
- ACCESS_ONCE(rdp->gpwrap) = false;
+ WRITE_ONCE(rdp->gpwrap, false);
}
return ret;
}
@@ -1717,9 +1762,9 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
local_irq_save(flags);
rnp = rdp->mynode;
- if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
- rdp->completed == ACCESS_ONCE(rnp->completed) &&
- !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
+ if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
+ rdp->completed == READ_ONCE(rnp->completed) &&
+ !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
!raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
local_irq_restore(flags);
return;
@@ -1731,6 +1776,13 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
rcu_gp_kthread_wake(rsp);
}
+static void rcu_gp_slow(struct rcu_state *rsp, int delay)
+{
+ if (delay > 0 &&
+ !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+ schedule_timeout_uninterruptible(delay);
+}
+
/*
* Initialize a new grace period. Return 0 if no grace period required.
*/
@@ -1740,15 +1792,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- if (!ACCESS_ONCE(rsp->gp_flags)) {
+ if (!READ_ONCE(rsp->gp_flags)) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq(&rnp->lock);
return 0;
}
- ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
+ WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
/*
@@ -1773,6 +1825,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
* will handle subsequent offline CPUs.
*/
rcu_for_each_leaf_node(rsp, rnp) {
+ rcu_gp_slow(rsp, gp_preinit_delay);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1829,14 +1882,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
* process finishes, because this kthread handles both.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
+ rcu_gp_slow(rsp, gp_init_delay);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
- ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
+ WRITE_ONCE(rnp->gpnum, rsp->gpnum);
if (WARN_ON_ONCE(rnp->completed != rsp->completed))
- ACCESS_ONCE(rnp->completed) = rsp->completed;
+ WRITE_ONCE(rnp->completed, rsp->completed);
if (rnp == rdp->mynode)
(void)__note_gp_changes(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
@@ -1845,10 +1899,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
- if (gp_init_delay > 0 &&
- !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD)))
- schedule_timeout_uninterruptible(gp_init_delay);
+ WRITE_ONCE(rsp->gp_activity, jiffies);
}
return 1;
@@ -1864,7 +1915,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
@@ -1882,11 +1933,11 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
}
/* Clear flag to prevent immediate re-entry. */
- if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- ACCESS_ONCE(rsp->gp_flags) =
- ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
+ WRITE_ONCE(rsp->gp_flags,
+ READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq(&rnp->lock);
}
return fqs_state;
@@ -1903,7 +1954,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
gp_duration = jiffies - rsp->gp_start;
@@ -1934,7 +1985,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
smp_mb__after_unlock_lock();
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
WARN_ON_ONCE(rnp->qsmask);
- ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+ WRITE_ONCE(rnp->completed, rsp->gpnum);
rdp = this_cpu_ptr(rsp->rda);
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
@@ -1942,7 +1993,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
+ rcu_gp_slow(rsp, gp_cleanup_delay);
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
@@ -1950,16 +2002,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
rcu_nocb_gp_set(rnp, nocb);
/* Declare grace period done. */
- ACCESS_ONCE(rsp->completed) = rsp->gpnum;
+ WRITE_ONCE(rsp->completed, rsp->gpnum);
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
/* Advance CBs to reduce false positives below. */
needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
if (needgp || cpu_needs_another_gp(rsp, rdp)) {
- ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+ WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("newreq"));
}
raw_spin_unlock_irq(&rnp->lock);
@@ -1983,20 +2035,20 @@ static int __noreturn rcu_gp_kthread(void *arg)
/* Handle grace-period start. */
for (;;) {
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
wait_event_interruptible(rsp->gp_wq,
- ACCESS_ONCE(rsp->gp_flags) &
+ READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("reqwaitsig"));
}
@@ -2012,39 +2064,39 @@ static int __noreturn rcu_gp_kthread(void *arg)
if (!ret)
rsp->jiffies_force_qs = jiffies + j;
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
- ((gf = ACCESS_ONCE(rsp->gp_flags)) &
+ ((gf = READ_ONCE(rsp->gp_flags)) &
RCU_GP_FLAG_FQS) ||
- (!ACCESS_ONCE(rnp->qsmask) &&
+ (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp)),
j);
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
- if (!ACCESS_ONCE(rnp->qsmask) &&
+ if (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp))
break;
/* If time for quiescent-state forcing, do it. */
if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
(gf & RCU_GP_FLAG_FQS)) {
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqsstart"));
fqs_state = rcu_gp_fqs(rsp, fqs_state);
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqsend"));
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
} else {
/* Deal with stray signal. */
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqswaitsig"));
}
j = jiffies_till_next_fqs;
@@ -2086,8 +2138,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
*/
return false;
}
- ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
- trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
+ WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
+ trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
TPS("newreq"));
/*
@@ -2137,6 +2189,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+ WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
rcu_gp_kthread_wake(rsp);
}
@@ -2334,8 +2387,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Send the specified CPU's RCU callbacks to the orphanage. The
* specified CPU must be offline, and the caller must hold the
@@ -2346,7 +2397,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
/* No-CBs CPUs do not have orphanable callbacks. */
- if (rcu_is_nocb_cpu(rdp->cpu))
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
return;
/*
@@ -2359,7 +2410,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
rsp->qlen += rdp->qlen;
rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen_lazy = 0;
- ACCESS_ONCE(rdp->qlen) = 0;
+ WRITE_ONCE(rdp->qlen, 0);
}
/*
@@ -2405,7 +2456,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
/* No-CBs CPUs are handled specially. */
- if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
return;
/* Do the accounting first. */
@@ -2452,6 +2504,9 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return;
+
RCU_TRACE(mask = rdp->grpmask);
trace_rcu_grace_period(rsp->name,
rnp->gpnum + 1 - !!(rnp->qsmask & mask),
@@ -2480,7 +2535,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
- if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
return;
for (;;) {
mask = rnp->grpmask;
@@ -2511,6 +2567,9 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return;
+
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2532,6 +2591,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return;
+
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
@@ -2546,26 +2608,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
cpu, rdp->qlen, rdp->nxtlist);
}
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
-{
-}
-
-static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
-}
-
-static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Thottle as specified by rdp->blimit.
@@ -2580,7 +2622,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* If no callbacks are ready, just return. */
if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
- trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+ trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist),
need_resched(), is_idle_task(current),
rcu_is_callbacks_kthread());
return;
@@ -2636,7 +2678,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
}
smp_mb(); /* List handling before counting for rcu_barrier(). */
rdp->qlen_lazy -= count_lazy;
- ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
+ WRITE_ONCE(rdp->qlen, rdp->qlen - count);
rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
@@ -2730,10 +2772,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
mask = 0;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- if (!rcu_gp_in_progress(rsp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
if (rnp->qsmask == 0) {
if (rcu_state_p == &rcu_sched_state ||
rsp != rcu_state_p ||
@@ -2763,8 +2801,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
bit = 1;
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
if ((rnp->qsmask & bit) != 0) {
- if ((rnp->qsmaskinit & bit) == 0)
- *isidle = false; /* Pending hotplug. */
if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
mask |= bit;
}
@@ -2793,7 +2829,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
/* Funnel through hierarchy to reduce memory contention. */
rnp = __this_cpu_read(rsp->rda->mynode);
for (; rnp != NULL; rnp = rnp->parent) {
- ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+ ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
@@ -2809,13 +2845,12 @@ static void force_quiescent_state(struct rcu_state *rsp)
raw_spin_lock_irqsave(&rnp_old->lock, flags);
smp_mb__after_unlock_lock();
raw_spin_unlock(&rnp_old->fqslock);
- if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
rsp->n_force_qs_lh++;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
- ACCESS_ONCE(rsp->gp_flags) =
- ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
+ WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
rcu_gp_kthread_wake(rsp);
}
@@ -2881,7 +2916,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*/
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
{
- if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
+ if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
return;
if (likely(!rsp->boost)) {
rcu_do_batch(rsp, rdp);
@@ -2972,7 +3007,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
if (debug_rcu_head_queue(head)) {
/* Probable double call_rcu(), so leak the callback. */
- ACCESS_ONCE(head->func) = rcu_leak_callback;
+ WRITE_ONCE(head->func, rcu_leak_callback);
WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
return;
}
@@ -3011,7 +3046,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
if (!likely(rdp->nxtlist))
init_default_callback_list(rdp);
}
- ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
+ WRITE_ONCE(rdp->qlen, rdp->qlen + 1);
if (lazy)
rdp->qlen_lazy++;
else
@@ -3287,7 +3322,7 @@ void synchronize_sched_expedited(void)
if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
(ulong)atomic_long_read(&rsp->expedited_done) +
ULONG_MAX / 8)) {
- synchronize_sched();
+ wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_wrap);
return;
}
@@ -3450,14 +3485,14 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
}
/* Has another RCU grace period completed? */
- if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
+ if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
rdp->n_rp_gp_completed++;
return 1;
}
/* Has a new RCU grace period started? */
- if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
- unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
+ if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
+ unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */
rdp->n_rp_gp_started++;
return 1;
}
@@ -3493,7 +3528,7 @@ static int rcu_pending(void)
* non-NULL, store an indication of whether all callbacks are lazy.
* (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
+static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
{
bool al = true;
bool hc = false;
@@ -3564,7 +3599,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
{
int cpu;
struct rcu_data *rdp;
- unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+ unsigned long snap = READ_ONCE(rsp->n_barrier_done);
unsigned long snap_done;
_rcu_barrier_trace(rsp, "Begin", -1, snap);
@@ -3606,10 +3641,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
/*
* Increment ->n_barrier_done to avoid duplicate work. Use
- * ACCESS_ONCE() to prevent the compiler from speculating
+ * WRITE_ONCE() to prevent the compiler from speculating
* the increment to precede the early-exit check.
*/
- ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+ WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3645,7 +3680,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
__call_rcu(&rdp->barrier_head,
rcu_barrier_callback, rsp, cpu, 0);
}
- } else if (ACCESS_ONCE(rdp->qlen)) {
+ } else if (READ_ONCE(rdp->qlen)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
rsp->n_barrier_done);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3665,7 +3700,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Increment ->n_barrier_done to prevent duplicate work. */
smp_mb(); /* Keep increment after above mechanism. */
- ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+ WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3780,7 +3815,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
rdp->passed_quiesce = false;
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
rdp->qs_pending = false;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -3924,16 +3959,16 @@ void rcu_scheduler_starting(void)
/*
* Compute the per-level fanout, either using the exact fanout specified
- * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
*/
static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
int i;
- if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
+ if (rcu_fanout_exact) {
rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
for (i = rcu_num_lvls - 2; i >= 0; i--)
- rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+ rsp->levelspread[i] = RCU_FANOUT;
} else {
int ccur;
int cprv;
@@ -3971,9 +4006,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
- /* Silence gcc 4.8 warning about array index out of range. */
- if (rcu_num_lvls > RCU_NUM_LVLS)
- panic("rcu_init_one: rcu_num_lvls overflow");
+ /* Silence gcc 4.8 false positive about array index out of range. */
+ if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
+ panic("rcu_init_one: rcu_num_lvls out of range");
/* Initialize the level-tracking arrays. */
@@ -4059,7 +4094,7 @@ static void __init rcu_init_geometry(void)
jiffies_till_next_fqs = d;
/* If the compile-time values are accurate, just leave. */
- if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
+ if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
nr_cpu_ids == NR_CPUS)
return;
pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
@@ -4073,7 +4108,7 @@ static void __init rcu_init_geometry(void)
rcu_capacity[0] = 1;
rcu_capacity[1] = rcu_fanout_leaf;
for (i = 2; i <= MAX_RCU_LVLS; i++)
- rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+ rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
/*
* The boot-time rcu_fanout_leaf parameter is only permitted
@@ -4083,7 +4118,7 @@ static void __init rcu_init_geometry(void)
* the configured number of CPUs. Complain and fall back to the
* compile-time values if these limits are exceeded.
*/
- if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+ if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
n > rcu_capacity[MAX_RCU_LVLS]) {
WARN_ON(1);
@@ -4109,6 +4144,28 @@ static void __init rcu_init_geometry(void)
rcu_num_nodes -= n;
}
+/*
+ * Dump out the structure of the rcu_node combining tree associated
+ * with the rcu_state structure referenced by rsp.
+ */
+static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
+{
+ int level = 0;
+ struct rcu_node *rnp;
+
+ pr_info("rcu_node tree layout dump\n");
+ pr_info(" ");
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (rnp->level != level) {
+ pr_cont("\n");
+ pr_info(" ");
+ level = rnp->level;
+ }
+ pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum);
+ }
+ pr_cont("\n");
+}
+
void __init rcu_init(void)
{
int cpu;
@@ -4119,6 +4176,8 @@ void __init rcu_init(void)
rcu_init_geometry();
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ if (dump_tree)
+ rcu_dump_rcu_node_tree(&rcu_sched_state);
__rcu_init_preempt();
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a69d3dab2ec4..4adb7ca0bf47 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -35,11 +35,33 @@
* In practice, this did work well going from three levels to four.
* Of course, your mileage may vary.
*/
+
#define MAX_RCU_LVLS 4
-#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
-#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#ifdef CONFIG_RCU_FANOUT
+#define RCU_FANOUT CONFIG_RCU_FANOUT
+#else /* #ifdef CONFIG_RCU_FANOUT */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT 64
+# else
+# define RCU_FANOUT 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT */
+
+#ifdef CONFIG_RCU_FANOUT_LEAF
+#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
+#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT_LEAF 64
+# else
+# define RCU_FANOUT_LEAF 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
+
+#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
+#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
+#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
#if NR_CPUS <= RCU_FANOUT_1
# define RCU_NUM_LVLS 1
@@ -170,7 +192,6 @@ struct rcu_node {
/* if there is no such task. If there */
/* is no current expedited grace period, */
/* then there can cannot be any such task. */
-#ifdef CONFIG_RCU_BOOST
struct list_head *boost_tasks;
/* Pointer to first task that needs to be */
/* priority boosted, or NULL if no priority */
@@ -208,7 +229,6 @@ struct rcu_node {
unsigned long n_balk_nos;
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
wait_queue_head_t nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -519,14 +539,11 @@ extern struct list_head rcu_struct_flavors;
* RCU implementation internal declarations:
*/
extern struct rcu_state rcu_sched_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
extern struct rcu_state rcu_bh_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
#ifdef CONFIG_PREEMPT_RCU
extern struct rcu_state rcu_preempt_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d72fa24f2312..013485fb2b06 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -43,7 +43,17 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
-#endif /* #ifdef CONFIG_RCU_BOOST */
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
+ * all uses are in dead code. Provide a definition to keep the compiler
+ * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
+ * This probably needs to be excluded from -rt builds.
+ */
+#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -60,11 +70,11 @@ static void __init rcu_bootup_announce_oddness(void)
{
if (IS_ENABLED(CONFIG_RCU_TRACE))
pr_info("\tRCU debugfs-based tracing is enabled.\n");
- if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
- (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
+ if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
+ (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
- CONFIG_RCU_FANOUT);
- if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
+ RCU_FANOUT);
+ if (rcu_fanout_exact)
pr_info("\tHierarchical RCU autobalancing is disabled.\n");
if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
@@ -76,10 +86,10 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tAdditional per-CPU info printed with stalls.\n");
if (NUM_RCU_LVL_4 != 0)
pr_info("\tFour-level hierarchy is enabled.\n");
- if (CONFIG_RCU_FANOUT_LEAF != 16)
+ if (RCU_FANOUT_LEAF != 16)
pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
- CONFIG_RCU_FANOUT_LEAF);
- if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+ RCU_FANOUT_LEAF);
+ if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
@@ -90,7 +100,8 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-static struct rcu_state *rcu_state_p = &rcu_preempt_state;
+static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -116,11 +127,11 @@ static void __init rcu_bootup_announce(void)
*/
static void rcu_preempt_qs(void)
{
- if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+ if (!__this_cpu_read(rcu_data_p->passed_quiesce)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
- __this_cpu_read(rcu_preempt_data.gpnum),
+ __this_cpu_read(rcu_data_p->gpnum),
TPS("cpuqs"));
- __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
+ __this_cpu_write(rcu_data_p->passed_quiesce, 1);
barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
current->rcu_read_unlock_special.b.need_qs = false;
}
@@ -150,7 +161,7 @@ static void rcu_preempt_note_context_switch(void)
!t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = this_cpu_ptr(rcu_preempt_state.rda);
+ rdp = this_cpu_ptr(rcu_state_p->rda);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
@@ -180,10 +191,9 @@ static void rcu_preempt_note_context_switch(void)
if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
rnp->gp_tasks = &t->rcu_node_entry;
-#ifdef CONFIG_RCU_BOOST
- if (rnp->boost_tasks != NULL)
+ if (IS_ENABLED(CONFIG_RCU_BOOST) &&
+ rnp->boost_tasks != NULL)
rnp->boost_tasks = rnp->gp_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
} else {
list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
if (rnp->qsmask & rdp->grpmask)
@@ -263,9 +273,7 @@ void rcu_read_unlock_special(struct task_struct *t)
bool empty_exp_now;
unsigned long flags;
struct list_head *np;
-#ifdef CONFIG_RCU_BOOST
bool drop_boost_mutex = false;
-#endif /* #ifdef CONFIG_RCU_BOOST */
struct rcu_node *rnp;
union rcu_special special;
@@ -307,9 +315,11 @@ void rcu_read_unlock_special(struct task_struct *t)
t->rcu_read_unlock_special.b.blocked = false;
/*
- * Remove this task from the list it blocked on. The
- * task can migrate while we acquire the lock, but at
- * most one time. So at most two passes through loop.
+ * Remove this task from the list it blocked on. The task
+ * now remains queued on the rcu_node corresponding to
+ * the CPU it first blocked on, so the first attempt to
+ * acquire the task's rcu_node's ->lock will succeed.
+ * Keep the loop and add a WARN_ON() out of sheer paranoia.
*/
for (;;) {
rnp = t->rcu_blocked_node;
@@ -317,6 +327,7 @@ void rcu_read_unlock_special(struct task_struct *t)
smp_mb__after_unlock_lock();
if (rnp == t->rcu_blocked_node)
break;
+ WARN_ON_ONCE(1);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
@@ -331,12 +342,12 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp->gp_tasks = np;
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
-#ifdef CONFIG_RCU_BOOST
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp->boost_tasks = np;
- /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
- drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
-#endif /* #ifdef CONFIG_RCU_BOOST */
+ if (IS_ENABLED(CONFIG_RCU_BOOST)) {
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
+ /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ }
/*
* If this was the last task on the current list, and if
@@ -353,24 +364,21 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp->grplo,
rnp->grphi,
!!rnp->gp_tasks);
- rcu_report_unblock_qs_rnp(&rcu_preempt_state,
- rnp, flags);
+ rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
} else {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
-#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
- if (drop_boost_mutex)
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
rt_mutex_unlock(&rnp->boost_mtx);
-#endif /* #ifdef CONFIG_RCU_BOOST */
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
- rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
+ rcu_report_exp_rnp(rcu_state_p, rnp, true);
} else {
local_irq_restore(flags);
}
@@ -390,7 +398,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- t = list_entry(rnp->gp_tasks,
+ t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
sched_show_task(t);
@@ -447,7 +455,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
if (!rcu_preempt_blocked_readers_cgp(rnp))
return 0;
rcu_print_task_stall_begin(rnp);
- t = list_entry(rnp->gp_tasks,
+ t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
pr_cont(" P%d", t->pid);
@@ -491,8 +499,8 @@ static void rcu_preempt_check_callbacks(void)
return;
}
if (t->rcu_read_lock_nesting > 0 &&
- __this_cpu_read(rcu_preempt_data.qs_pending) &&
- !__this_cpu_read(rcu_preempt_data.passed_quiesce))
+ __this_cpu_read(rcu_data_p->qs_pending) &&
+ !__this_cpu_read(rcu_data_p->passed_quiesce))
t->rcu_read_unlock_special.b.need_qs = true;
}
@@ -500,7 +508,7 @@ static void rcu_preempt_check_callbacks(void)
static void rcu_preempt_do_callbacks(void)
{
- rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
+ rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
}
#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -510,7 +518,7 @@ static void rcu_preempt_do_callbacks(void)
*/
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
- __call_rcu(head, func, &rcu_preempt_state, -1, 0);
+ __call_rcu(head, func, rcu_state_p, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -570,7 +578,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
return !rcu_preempted_readers_exp(rnp) &&
- ACCESS_ONCE(rnp->expmask) == 0;
+ READ_ONCE(rnp->expmask) == 0;
}
/*
@@ -711,12 +719,12 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
void synchronize_rcu_expedited(void)
{
struct rcu_node *rnp;
- struct rcu_state *rsp = &rcu_preempt_state;
+ struct rcu_state *rsp = rcu_state_p;
unsigned long snap;
int trycount = 0;
smp_mb(); /* Caller's modifications seen first by other CPUs. */
- snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
+ snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
smp_mb(); /* Above access cannot bleed into critical section. */
/*
@@ -740,7 +748,7 @@ void synchronize_rcu_expedited(void)
*/
while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
if (ULONG_CMP_LT(snap,
- ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+ READ_ONCE(sync_rcu_preempt_exp_count))) {
put_online_cpus();
goto mb_ret; /* Others did our work for us. */
}
@@ -752,7 +760,7 @@ void synchronize_rcu_expedited(void)
return;
}
}
- if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+ if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
put_online_cpus();
goto unlock_mb_ret; /* Others did our work for us. */
}
@@ -780,8 +788,7 @@ void synchronize_rcu_expedited(void)
/* Clean up and exit. */
smp_mb(); /* ensure expedited GP seen before counter increment. */
- ACCESS_ONCE(sync_rcu_preempt_exp_count) =
- sync_rcu_preempt_exp_count + 1;
+ WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
unlock_mb_ret:
mutex_unlock(&sync_rcu_preempt_exp_mutex);
mb_ret:
@@ -799,7 +806,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
*/
void rcu_barrier(void)
{
- _rcu_barrier(&rcu_preempt_state);
+ _rcu_barrier(rcu_state_p);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
@@ -808,7 +815,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
*/
static void __init __rcu_init_preempt(void)
{
- rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
+ rcu_init_one(rcu_state_p, rcu_data_p);
}
/*
@@ -831,7 +838,8 @@ void exit_rcu(void)
#else /* #ifdef CONFIG_PREEMPT_RCU */
-static struct rcu_state *rcu_state_p = &rcu_sched_state;
+static struct rcu_state *const rcu_state_p = &rcu_sched_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
/*
* Tell them what RCU they are running.
@@ -994,8 +1002,8 @@ static int rcu_boost(struct rcu_node *rnp)
struct task_struct *t;
struct list_head *tb;
- if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
- ACCESS_ONCE(rnp->boost_tasks) == NULL)
+ if (READ_ONCE(rnp->exp_tasks) == NULL &&
+ READ_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1048,8 +1056,8 @@ static int rcu_boost(struct rcu_node *rnp)
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
- return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
- ACCESS_ONCE(rnp->boost_tasks) != NULL;
+ return READ_ONCE(rnp->exp_tasks) != NULL ||
+ READ_ONCE(rnp->boost_tasks) != NULL;
}
/*
@@ -1173,7 +1181,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct sched_param sp;
struct task_struct *t;
- if (&rcu_preempt_state != rsp)
+ if (rcu_state_p != rsp)
return 0;
if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
@@ -1367,13 +1375,12 @@ static void rcu_prepare_kthreads(int cpu)
* Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
* any flavor of RCU.
*/
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
- return rcu_cpu_has_callbacks(NULL);
+ return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
+ ? 0 : rcu_cpu_has_callbacks(NULL);
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1460,7 +1467,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
* callbacks not yet ready to invoke.
*/
if ((rdp->completed != rnp->completed ||
- unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
+ unlikely(READ_ONCE(rdp->gpwrap))) &&
rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
note_gp_changes(rsp, rdp);
@@ -1478,12 +1485,16 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
*
* The caller must have disabled interrupts.
*/
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
unsigned long dj;
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
+ *nextevt = KTIME_MAX;
+ return 0;
+ }
+
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1511,7 +1522,6 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
*nextevt = basemono + dj * TICK_NSEC;
return 0;
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Prepare a CPU for idle from an RCU perspective. The first major task
@@ -1525,7 +1535,6 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
*/
static void rcu_prepare_for_idle(void)
{
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
bool needwake;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
@@ -1533,8 +1542,11 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
+ return;
+
/* Handle nohz enablement switches conservatively. */
- tne = ACCESS_ONCE(tick_nohz_active);
+ tne = READ_ONCE(tick_nohz_active);
if (tne != rdtp->tick_nohz_enabled_snap) {
if (rcu_cpu_has_callbacks(NULL))
invoke_rcu_core(); /* force nohz to see update. */
@@ -1580,7 +1592,6 @@ static void rcu_prepare_for_idle(void)
if (needwake)
rcu_gp_kthread_wake(rsp);
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
/*
@@ -1590,12 +1601,11 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
- if (rcu_is_nocb_cpu(smp_processor_id()))
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+ rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
/*
@@ -1760,7 +1770,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
- ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
+ READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
fast_no_hz);
}
@@ -1898,11 +1908,11 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
{
struct rcu_data *rdp_leader = rdp->nocb_leader;
- if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+ if (!READ_ONCE(rdp_leader->nocb_kthread))
return;
- if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
- ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
+ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
wake_up(&rdp_leader->nocb_wq);
}
}
@@ -1934,14 +1944,14 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
ret = atomic_long_read(&rdp->nocb_q_count);
#ifdef CONFIG_PROVE_RCU
- rhp = ACCESS_ONCE(rdp->nocb_head);
+ rhp = READ_ONCE(rdp->nocb_head);
if (!rhp)
- rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+ rhp = READ_ONCE(rdp->nocb_gp_head);
if (!rhp)
- rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+ rhp = READ_ONCE(rdp->nocb_follower_head);
/* Having no rcuo kthread but CBs after scheduler starts is bad! */
- if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
+ if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
rcu_scheduler_fully_active) {
/* RCU callback enqueued before CPU first came online??? */
pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
@@ -1975,12 +1985,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
atomic_long_add(rhcount, &rdp->nocb_q_count);
/* rcu_barrier() relies on ->nocb_q_count add before xchg. */
old_rhpp = xchg(&rdp->nocb_tail, rhtp);
- ACCESS_ONCE(*old_rhpp) = rhp;
+ WRITE_ONCE(*old_rhpp, rhp);
atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
/* If we are not being polled and there is a kthread, awaken it ... */
- t = ACCESS_ONCE(rdp->nocb_kthread);
+ t = READ_ONCE(rdp->nocb_kthread);
if (rcu_nocb_poll || !t) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeNotPoll"));
@@ -2118,7 +2128,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
for (;;) {
wait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
- (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+ (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
if (likely(d))
break;
WARN_ON(signal_pending(current));
@@ -2145,7 +2155,7 @@ wait_again:
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
wait_event_interruptible(my_rdp->nocb_wq,
- !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
+ !READ_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2159,12 +2169,12 @@ wait_again:
*/
gotcbs = false;
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+ rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
continue; /* No CBs here, try next follower. */
/* Move callbacks to wait-for-GP list, which is empty. */
- ACCESS_ONCE(rdp->nocb_head) = NULL;
+ WRITE_ONCE(rdp->nocb_head, NULL);
rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
gotcbs = true;
}
@@ -2184,7 +2194,7 @@ wait_again:
my_rdp->nocb_leader_sleep = true;
smp_mb(); /* Ensure _sleep true before scan. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
- if (ACCESS_ONCE(rdp->nocb_head)) {
+ if (READ_ONCE(rdp->nocb_head)) {
/* Found CB, so short-circuit next wait. */
my_rdp->nocb_leader_sleep = false;
break;
@@ -2205,7 +2215,7 @@ wait_again:
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (ACCESS_ONCE(rdp->nocb_head))
+ if (READ_ONCE(rdp->nocb_head))
my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
@@ -2241,7 +2251,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"FollowerSleep");
wait_event_interruptible(rdp->nocb_wq,
- ACCESS_ONCE(rdp->nocb_follower_head));
+ READ_ONCE(rdp->nocb_follower_head));
} else if (firsttime) {
/* Don't drown trace log with "Poll"! */
firsttime = false;
@@ -2282,10 +2292,10 @@ static int rcu_nocb_kthread(void *arg)
nocb_follower_wait(rdp);
/* Pull the ready-to-invoke callbacks onto local list. */
- list = ACCESS_ONCE(rdp->nocb_follower_head);
+ list = READ_ONCE(rdp->nocb_follower_head);
BUG_ON(!list);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
- ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+ WRITE_ONCE(rdp->nocb_follower_head, NULL);
tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
/* Each pass through the following loop invokes a callback. */
@@ -2324,7 +2334,7 @@ static int rcu_nocb_kthread(void *arg)
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
- return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+ return READ_ONCE(rdp->nocb_defer_wakeup);
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
@@ -2334,8 +2344,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
- ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
- ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
+ ndw = READ_ONCE(rdp->nocb_defer_wakeup);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
@@ -2448,7 +2458,7 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
t = kthread_run(rcu_nocb_kthread, rdp_spawn,
"rcuo%c/%d", rsp->abbr, cpu);
BUG_ON(IS_ERR(t));
- ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
+ WRITE_ONCE(rdp_spawn->nocb_kthread, t);
}
/*
@@ -2663,7 +2673,7 @@ static void rcu_sysidle_enter(int irq)
/* Record start of fully idle period. */
j = jiffies;
- ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+ WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
smp_mb__after_atomic();
@@ -2681,7 +2691,7 @@ static void rcu_sysidle_enter(int irq)
*/
void rcu_sysidle_force_exit(void)
{
- int oldstate = ACCESS_ONCE(full_sysidle_state);
+ int oldstate = READ_ONCE(full_sysidle_state);
int newoldstate;
/*
@@ -2794,7 +2804,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
smp_mb(); /* Read counters before timestamps. */
/* Pick up timestamps. */
- j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+ j = READ_ONCE(rdtp->dynticks_idle_jiffies);
/* If this CPU entered idle more recently, update maxj timestamp. */
if (ULONG_CMP_LT(*maxj, j))
*maxj = j;
@@ -2831,11 +2841,11 @@ static unsigned long rcu_sysidle_delay(void)
static void rcu_sysidle(unsigned long j)
{
/* Check the current state. */
- switch (ACCESS_ONCE(full_sysidle_state)) {
+ switch (READ_ONCE(full_sysidle_state)) {
case RCU_SYSIDLE_NOT:
/* First time all are idle, so note a short idle period. */
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+ WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
break;
case RCU_SYSIDLE_SHORT:
@@ -2873,7 +2883,7 @@ static void rcu_sysidle_cancel(void)
{
smp_mb();
if (full_sysidle_state > RCU_SYSIDLE_SHORT)
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+ WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
}
/*
@@ -2925,7 +2935,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
smp_mb(); /* grace period precedes setting inuse. */
rshp = container_of(rhp, struct rcu_sysidle_head, rh);
- ACCESS_ONCE(rshp->inuse) = 0;
+ WRITE_ONCE(rshp->inuse, 0);
}
/*
@@ -2936,7 +2946,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
bool rcu_sys_is_idle(void)
{
static struct rcu_sysidle_head rsh;
- int rss = ACCESS_ONCE(full_sysidle_state);
+ int rss = READ_ONCE(full_sysidle_state);
if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
return false;
@@ -2964,7 +2974,7 @@ bool rcu_sys_is_idle(void)
}
rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
oldrss = rss;
- rss = ACCESS_ONCE(full_sysidle_state);
+ rss = READ_ONCE(full_sysidle_state);
}
}
@@ -3048,10 +3058,10 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(smp_processor_id()) &&
(!rcu_gp_in_progress(rsp) ||
- ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
- return 1;
+ ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
+ return true;
#endif /* #ifdef CONFIG_NO_HZ_FULL */
- return 0;
+ return false;
}
/*
@@ -3077,7 +3087,7 @@ static void rcu_bind_gp_kthread(void)
static void rcu_dynticks_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
@@ -3085,6 +3095,6 @@ static void rcu_dynticks_task_enter(void)
static void rcu_dynticks_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index f92361efd0f5..3ea7ffc7d5c4 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -277,7 +277,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
+ READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
@@ -323,8 +323,8 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
struct rcu_node *rnp = &rsp->node[0];
raw_spin_lock_irqsave(&rnp->lock, flags);
- completed = ACCESS_ONCE(rsp->completed);
- gpnum = ACCESS_ONCE(rsp->gpnum);
+ completed = READ_ONCE(rsp->completed);
+ gpnum = READ_ONCE(rsp->gpnum);
if (completed == gpnum)
gpage = 0;
else
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1f133350da01..afaecb7a799a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -150,14 +150,14 @@ void __rcu_read_unlock(void)
barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
+ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
barrier(); /* ->rcu_read_unlock_special load before assign */
t->rcu_read_lock_nesting = 0;
}
#ifdef CONFIG_PROVE_LOCKING
{
- int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+ int rrln = READ_ONCE(t->rcu_read_lock_nesting);
WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
}
@@ -389,17 +389,17 @@ module_param(rcu_cpu_stall_timeout, int, 0644);
int rcu_jiffies_till_stall_check(void)
{
- int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+ int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
/*
* Limit check must be consistent with the Kconfig limits
* for CONFIG_RCU_CPU_STALL_TIMEOUT.
*/
if (till_stall_check < 3) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+ WRITE_ONCE(rcu_cpu_stall_timeout, 3);
till_stall_check = 3;
} else if (till_stall_check > 300) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+ WRITE_ONCE(rcu_cpu_stall_timeout, 300);
till_stall_check = 300;
}
return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
@@ -550,12 +550,12 @@ static void check_holdout_task(struct task_struct *t,
{
int cpu;
- if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
- t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
- !ACCESS_ONCE(t->on_rq) ||
+ if (!READ_ONCE(t->rcu_tasks_holdout) ||
+ t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
+ !READ_ONCE(t->on_rq) ||
(IS_ENABLED(CONFIG_NO_HZ_FULL) &&
!is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
- ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+ WRITE_ONCE(t->rcu_tasks_holdout, false);
list_del_init(&t->rcu_tasks_holdout_list);
put_task_struct(t);
return;
@@ -639,11 +639,11 @@ static int __noreturn rcu_tasks_kthread(void *arg)
*/
rcu_read_lock();
for_each_process_thread(g, t) {
- if (t != current && ACCESS_ONCE(t->on_rq) &&
+ if (t != current && READ_ONCE(t->on_rq) &&
!is_idle_task(t)) {
get_task_struct(t);
- t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
- ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+ t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
+ WRITE_ONCE(t->rcu_tasks_holdout, true);
list_add(&t->rcu_tasks_holdout_list,
&rcu_tasks_holdouts);
}
@@ -672,7 +672,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
struct task_struct *t1;
schedule_timeout_interruptible(HZ);
- rtst = ACCESS_ONCE(rcu_task_stall_timeout);
+ rtst = READ_ONCE(rcu_task_stall_timeout);
needreport = rtst > 0 &&
time_after(jiffies, lastreport + rtst);
if (needreport)
@@ -728,7 +728,7 @@ static void rcu_spawn_tasks_kthread(void)
static struct task_struct *rcu_tasks_kthread_ptr;
struct task_struct *t;
- if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
+ if (READ_ONCE(rcu_tasks_kthread_ptr)) {
smp_mb(); /* Ensure caller sees full kthread. */
return;
}
@@ -740,7 +740,7 @@ static void rcu_spawn_tasks_kthread(void)
t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
BUG_ON(IS_ERR(t));
smp_mb(); /* Ensure others see full kthread. */
- ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
+ WRITE_ONCE(rcu_tasks_kthread_ptr, t);
mutex_unlock(&rcu_tasks_kthread_mutex);
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be87024875..67687973ce80 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd669d..750ed601ddf7 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
-
#include "sched.h"
#include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
p->signal->autogroup = autogroup_kref_get(ag);
- if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ if (!READ_ONCE(sysctl_sched_autogroup_enabled))
goto out;
for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
- int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+ int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9f25ce70c77..c9a707b59331 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -490,7 +490,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
@@ -520,6 +520,52 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ struct wake_q_node *node = &task->wake_q;
+
+ /*
+ * Atomically grab the task, if ->wake_q is !nil already it means
+ * its already queued (either by us or someone else) and will get the
+ * wakeup due to that.
+ *
+ * This cmpxchg() implies a full barrier, which pairs with the write
+ * barrier implied by the wakeup in wake_up_list().
+ */
+ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+ return;
+
+ get_task_struct(task);
+
+ /*
+ * The head is context local, there can be no concurrency.
+ */
+ *head->lastp = node;
+ head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+ struct wake_q_node *node = head->first;
+
+ while (node != WAKE_Q_TAIL) {
+ struct task_struct *task;
+
+ task = container_of(node, struct task_struct, wake_q);
+ BUG_ON(!task);
+ /* task can safely be re-inserted now */
+ node = node->next;
+ task->wake_q.next = NULL;
+
+ /*
+ * wake_up_process() implies a wmb() to pair with the queueing
+ * in wake_q_add() so as not to miss wakeups.
+ */
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+}
+
/*
* resched_curr - mark rq's current task 'to be rescheduled now'.
*
@@ -1027,7 +1073,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+ perf_event_task_migrate(p);
}
__set_task_cpu(p, new_cpu);
@@ -2083,12 +2129,15 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
/**
* preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
+ static_key_slow_inc(&preempt_notifier_key);
hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2097,15 +2146,16 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
* preempt_notifier_unregister - no longer interested in preemption notifications
* @notifier: notifier struct to unregister
*
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
*/
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
hlist_del(&notifier->link);
+ static_key_slow_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
@@ -2113,9 +2163,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_in_preempt_notifiers(curr);
+}
+
static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
- struct task_struct *next)
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
{
struct preempt_notifier *notifier;
@@ -2123,13 +2179,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
notifier->ops->sched_out(notifier, next);
}
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_out_preempt_notifiers(curr, next);
+}
+
#else /* !CONFIG_PREEMPT_NOTIFIERS */
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
-static void
+static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
@@ -2375,9 +2439,9 @@ unsigned long nr_iowait_cpu(int cpu)
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
- struct rq *this = this_rq();
- *nr_waiters = atomic_read(&this->nr_iowait);
- *load = this->cpu_load[0];
+ struct rq *rq = this_rq();
+ *nr_waiters = atomic_read(&rq->nr_iowait);
+ *load = rq->load.weight;
}
#ifdef CONFIG_SMP
@@ -2475,6 +2539,7 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
+ calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -2503,7 +2568,7 @@ void scheduler_tick(void)
u64 scheduler_tick_max_deferment(void)
{
struct rq *rq = this_rq();
- unsigned long next, now = ACCESS_ONCE(jiffies);
+ unsigned long next, now = READ_ONCE(jiffies);
next = rq->last_sched_tick + HZ;
@@ -2704,9 +2769,7 @@ again:
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
*
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
*/
static void __sched __schedule(void)
{
@@ -2715,7 +2778,6 @@ static void __sched __schedule(void)
struct rq *rq;
int cpu;
- preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_note_context_switch();
@@ -2779,8 +2841,6 @@ static void __sched __schedule(void)
raw_spin_unlock_irq(&rq->lock);
post_schedule(rq);
-
- sched_preempt_enable_no_resched();
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -2801,7 +2861,9 @@ asmlinkage __visible void __sched schedule(void)
sched_submit_work(tsk);
do {
+ preempt_disable();
__schedule();
+ sched_preempt_enable_no_resched();
} while (need_resched());
}
EXPORT_SYMBOL(schedule);
@@ -2840,15 +2902,14 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_active_enter();
__schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ preempt_active_exit();
/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
- barrier();
} while (need_resched());
}
@@ -2872,9 +2933,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
/**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing
@@ -2887,7 +2947,7 @@ EXPORT_SYMBOL(preempt_schedule);
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
enum ctx_state prev_ctx;
@@ -2895,7 +2955,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
return;
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Use raw __prempt_count() ops that don't call function.
+ * We can't call functions before disabling preemption which
+ * disarm preemption tracing recursions.
+ */
+ __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+ barrier();
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
@@ -2905,12 +2971,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
__schedule();
exception_exit(prev_ctx);
- __preempt_count_sub(PREEMPT_ACTIVE);
barrier();
+ __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
} while (need_resched());
}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#endif /* CONFIG_PREEMPT */
@@ -2930,17 +2995,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
prev_state = exception_enter();
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_active_enter();
local_irq_enable();
__schedule();
local_irq_disable();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
+ preempt_active_exit();
} while (need_resched());
exception_exit(prev_state);
@@ -3018,7 +3077,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- p->dl.dl_throttled = 0;
enqueue_flag = ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
@@ -4367,10 +4425,7 @@ long __sched io_schedule_timeout(long timeout)
long ret;
current->in_iowait = 1;
- if (old_iowait)
- blk_schedule_flush_plug(current);
- else
- blk_flush_plug(current);
+ blk_schedule_flush_plug(current);
delayacct_blkio_start();
rq = raw_rq();
@@ -5295,7 +5350,7 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
@@ -7713,11 +7768,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
u64 rt_runtime, rt_period;
- rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+ rt_period = rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..f5a64ffad176 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
{
cputime_t old;
- while (new > (old = ACCESS_ONCE(*counter)))
+ while (new > (old = READ_ONCE(*counter)))
cmpxchg_cputime(counter, old, new);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 21d6907d2b9f..eac20c557a55 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -632,7 +632,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
}
static
-int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
return (dl_se->runtime <= 0);
}
@@ -676,7 +676,7 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
- if (dl_runtime_exceeded(rq, dl_se)) {
+ if (dl_runtime_exceeded(dl_se)) {
dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -987,7 +987,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
/*
* If we are dealing with a -deadline task, we must
@@ -1004,7 +1004,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
(p->nr_cpus_allowed > 1)) {
int target = find_later_rq(p);
- if (target != -1)
+ if (target != -1 &&
+ dl_time_before(p->dl.deadline,
+ cpu_rq(target)->dl.earliest_dl.curr))
cpu = target;
}
rcu_read_unlock();
@@ -1222,6 +1224,32 @@ next_node:
return NULL;
}
+/*
+ * Return the earliest pushable rq's task, which is suitable to be executed
+ * on the CPU, NULL otherwise:
+ */
+static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
+{
+ struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
+ struct task_struct *p = NULL;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+next_node:
+ if (next_node) {
+ p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+
+ if (pick_dl_task(rq, p, cpu))
+ return p;
+
+ next_node = rb_next(next_node);
+ goto next_node;
+ }
+
+ return NULL;
+}
+
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
static int find_later_rq(struct task_struct *task)
@@ -1325,6 +1353,17 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
+ if (!dl_time_before(task->dl.deadline,
+ later_rq->dl.earliest_dl.curr)) {
+ /*
+ * Target rq has tasks of equal or earlier deadline,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ later_rq = NULL;
+ break;
+ }
+
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
@@ -1506,7 +1545,7 @@ static int pull_dl_task(struct rq *this_rq)
if (src_rq->dl.dl_nr_running <= 1)
goto skip;
- p = pick_next_earliest_dl_task(src_rq, this_cpu);
+ p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
/*
* We found a task to be pulled if:
@@ -1651,7 +1690,7 @@ static void rq_offline_dl(struct rq *rq)
cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
-void init_sched_dl_class(void)
+void __init init_sched_dl_class(void)
{
unsigned int i;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f94724eda407..315c68e015d9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -132,12 +132,14 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
p->prio);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.vruntime),
+ SPLIT_NS(p->se.statistics.wait_sum),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(p->se.statistics.sum_sleep_runtime));
#else
- SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
- 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ 0LL, 0L,
+ SPLIT_NS(p->se.sum_exec_runtime),
+ 0LL, 0L);
#endif
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d", task_node(p));
@@ -156,7 +158,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
" task PID tree-key switches prio"
- " exec-runtime sum-exec sum-sleep\n"
+ " wait-time sum-exec sum-sleep\n"
"------------------------------------------------------"
"----------------------------------------------------\n");
@@ -580,6 +582,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
+ PN(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start);
PN(se.statistics.sleep_start);
PN(se.statistics.block_start);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69be2825262d..40a7fcbf491e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
{
- unsigned int cpus = min_t(int, num_online_cpus(), 8);
+ unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
unsigned int factor;
switch (sysctl_sched_tunable_scaling) {
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- int factor = get_update_sysctl_factor();
+ unsigned int factor = get_update_sysctl_factor();
if (ret || !write)
return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
- unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+ unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
@@ -1198,11 +1198,9 @@ static void task_numa_assign(struct task_numa_env *env,
static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
{
+ long imb, old_imb;
+ long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
- long orig_src_load;
- long load_a, load_b;
- long moved_load;
- long imb;
/*
* The load is corrected for the CPU capacity available on each node.
@@ -1215,39 +1213,30 @@ static bool load_too_imbalanced(long src_load, long dst_load,
dst_capacity = env->dst_stats.compute_capacity;
/* We care about the slope of the imbalance, not the direction. */
- load_a = dst_load;
- load_b = src_load;
- if (load_a < load_b)
- swap(load_a, load_b);
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
/* Is the difference below the threshold? */
- imb = load_a * src_capacity * 100 -
- load_b * dst_capacity * env->imbalance_pct;
+ imb = dst_load * src_capacity * 100 -
+ src_load * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
/*
* The imbalance is above the allowed threshold.
- * Allow a move that brings us closer to a balanced situation,
- * without moving things past the point of balance.
+ * Compare it with the old imbalance.
*/
orig_src_load = env->src_stats.load;
+ orig_dst_load = env->dst_stats.load;
- /*
- * In a task swap, there will be one load moving from src to dst,
- * and another moving back. This is the net sum of both moves.
- * A simple task move will always have a positive value.
- * Allow the move if it brings the system closer to a balanced
- * situation, without crossing over the balance point.
- */
- moved_load = orig_src_load - src_load;
+ if (orig_dst_load < orig_src_load)
+ swap(orig_dst_load, orig_src_load);
- if (moved_load > 0)
- /* Moving src -> dst. Did we overshoot balance? */
- return src_load * dst_capacity < dst_load * src_capacity;
- else
- /* Moving dst -> src. Did we overshoot balance? */
- return dst_load * src_capacity < src_load * dst_capacity;
+ old_imb = orig_dst_load * src_capacity * 100 -
+ orig_src_load * dst_capacity * env->imbalance_pct;
+
+ /* Would this change make things worse? */
+ return (imb > old_imb);
}
/*
@@ -1409,6 +1398,30 @@ static void task_numa_find_cpu(struct task_numa_env *env,
}
}
+/* Only move tasks to a NUMA node less busy than the current node. */
+static bool numa_has_capacity(struct task_numa_env *env)
+{
+ struct numa_stats *src = &env->src_stats;
+ struct numa_stats *dst = &env->dst_stats;
+
+ if (src->has_free_capacity && !dst->has_free_capacity)
+ return false;
+
+ /*
+ * Only consider a task move if the source has a higher load
+ * than the destination, corrected for CPU capacity on each node.
+ *
+ * src->load dst->load
+ * --------------------- vs ---------------------
+ * src->compute_capacity dst->compute_capacity
+ */
+ if (src->load * dst->compute_capacity >
+ dst->load * src->compute_capacity)
+ return true;
+
+ return false;
+}
+
static int task_numa_migrate(struct task_struct *p)
{
struct task_numa_env env = {
@@ -1463,7 +1476,8 @@ static int task_numa_migrate(struct task_struct *p)
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
- task_numa_find_cpu(&env, taskimp, groupimp);
+ if (numa_has_capacity(&env))
+ task_numa_find_cpu(&env, taskimp, groupimp);
/*
* Look at other nodes in these cases:
@@ -1494,7 +1508,8 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
- task_numa_find_cpu(&env, taskimp, groupimp);
+ if (numa_has_capacity(&env))
+ task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@@ -1794,7 +1809,12 @@ static void task_numa_placement(struct task_struct *p)
u64 runtime, period;
spinlock_t *group_lock = NULL;
- seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ /*
+ * The p->mm->numa_scan_seq field gets updated without
+ * exclusive access. Use READ_ONCE() here to ensure
+ * that the field is read in a single access:
+ */
+ seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
@@ -1938,7 +1958,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
}
rcu_read_lock();
- tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+ tsk = READ_ONCE(cpu_rq(cpu)->curr);
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
@@ -2107,7 +2127,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
static void reset_ptenuma_scan(struct task_struct *p)
{
- ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ /*
+ * We only did a read acquisition of the mmap sem, so
+ * p->mm->numa_scan_seq is written to without exclusive access
+ * and the update is not guaranteed to be atomic. That's not
+ * much of an issue though, since this is just used for
+ * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+ * expensive, to avoid any form of compiler optimizations:
+ */
+ WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}
@@ -2181,7 +2209,7 @@ void task_numa_work(struct callback_head *work)
}
for (; vma; vma = vma->vm_next) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
- is_vm_hugetlb_page(vma)) {
+ is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;
}
@@ -4303,6 +4331,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' arrray crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
+{
+ int i, scale;
+
+ this_rq->nr_load_updates++;
+
+ /* Update our load: */
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+ }
+
+ sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
+ unsigned long load = this_rq->cfs.runnable_load_avg;
+ unsigned long pending_updates;
+
+ /*
+ * bail if there's load or we're actually up-to-date.
+ */
+ if (load || curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
+ __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+ unsigned long load = this_rq->cfs.runnable_load_avg;
+ /*
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ */
+ this_rq->last_load_update_tick = jiffies;
+ __update_cpu_load(this_rq, load, 1);
+}
+
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
@@ -4355,7 +4566,7 @@ static unsigned long capacity_orig_of(int cpu)
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+ unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
unsigned long load_avg = rq->cfs.runnable_load_avg;
if (nr_running)
@@ -5106,18 +5317,21 @@ again:
* entity, update_curr() will update its vruntime, otherwise
* forget we've ever seen it.
*/
- if (curr && curr->on_rq)
- update_curr(cfs_rq);
- else
- curr = NULL;
+ if (curr) {
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
- /*
- * This call to check_cfs_rq_runtime() will do the throttle and
- * dequeue its entity in the parent(s). Therefore the 'simple'
- * nr_running test will indeed be correct.
- */
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto simple;
+ /*
+ * This call to check_cfs_rq_runtime() will do the
+ * throttle and dequeue its entity in the parent(s).
+ * Therefore the 'simple' nr_running test will indeed
+ * be correct.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto simple;
+ }
se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
@@ -5447,10 +5661,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
}
#ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
+/*
+ * Returns true if the destination node is the preferred node.
+ * Needs to match fbq_classify_rq(): if there is a runnable task
+ * that is not on its preferred node, we should identify it.
+ */
static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ unsigned long src_faults, dst_faults;
int src_nid, dst_nid;
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5464,29 +5683,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
- if (numa_group) {
- /* Task is already in the group's interleave set. */
- if (node_isset(src_nid, numa_group->active_nodes))
- return false;
-
- /* Task is moving into the group's interleave set. */
- if (node_isset(dst_nid, numa_group->active_nodes))
- return true;
-
- return group_faults(p, dst_nid) > group_faults(p, src_nid);
- }
-
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
return true;
- return task_faults(p, dst_nid) > task_faults(p, src_nid);
+ /* Migrating away from the preferred node is bad. */
+ if (src_nid == p->numa_preferred_nid)
+ return false;
+
+ if (numa_group) {
+ src_faults = group_faults(p, src_nid);
+ dst_faults = group_faults(p, dst_nid);
+ } else {
+ src_faults = task_faults(p, src_nid);
+ dst_faults = task_faults(p, dst_nid);
+ }
+
+ return dst_faults > src_faults;
}
static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ unsigned long src_faults, dst_faults;
int src_nid, dst_nid;
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5501,23 +5721,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
- if (numa_group) {
- /* Task is moving within/into the group's interleave set. */
- if (node_isset(dst_nid, numa_group->active_nodes))
- return false;
+ /* Migrating away from the preferred node is bad. */
+ if (src_nid == p->numa_preferred_nid)
+ return true;
- /* Task is moving out of the group's interleave set. */
- if (node_isset(src_nid, numa_group->active_nodes))
- return true;
+ /* Encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
+ return false;
- return group_faults(p, dst_nid) < group_faults(p, src_nid);
+ if (numa_group) {
+ src_faults = group_faults(p, src_nid);
+ dst_faults = group_faults(p, dst_nid);
+ } else {
+ src_faults = task_faults(p, src_nid);
+ dst_faults = task_faults(p, dst_nid);
}
- /* Migrating away from the preferred node is always bad. */
- if (src_nid == p->numa_preferred_nid)
- return true;
-
- return task_faults(p, dst_nid) < task_faults(p, src_nid);
+ return dst_faults < src_faults;
}
#else
@@ -6017,8 +6237,8 @@ static unsigned long scale_rt_capacity(int cpu)
* Since we're reading these variables without serialization make sure
* we read them once before doing sanity checks on them.
*/
- age_stamp = ACCESS_ONCE(rq->age_stamp);
- avg = ACCESS_ONCE(rq->rt_avg);
+ age_stamp = READ_ONCE(rq->age_stamp);
+ avg = READ_ONCE(rq->rt_avg);
delta = __rq_clock_broken(rq) - age_stamp;
if (unlikely(delta < 0))
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 8ecd552fe4f2..ef7159012cf3 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
/*
- * kernel/sched/proc.c
+ * kernel/sched/loadavg.c
*
- * Kernel load calculations, forked from sched/core.c
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. Its a silly number but people think its important. We go through
+ * great pains to make it work on big machines and tickless kernels.
*/
#include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
long nr_active, delta = 0;
nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
+ nr_active += (long)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
delta = calc_load_fold_active(this_rq);
if (delta) {
int idx = calc_load_write_idx();
+
atomic_long_add(delta, &calc_load_idle[idx]);
}
}
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
unsigned long result = 1UL << frac_bits;
- if (n) for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
+ if (n) {
+ for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
}
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
}
return result;
@@ -285,7 +290,6 @@ static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
unsigned long active, unsigned int n)
{
-
return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
*/
void calc_global_load(unsigned long ticks)
{
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
}
/*
- * Called from update_cpu_load() to periodically update this CPU's
+ * Called from scheduler_tick() to periodically update this CPU's
* active count.
*/
-static void calc_load_account_active(struct rq *this_rq)
+void calc_global_load_tick(struct rq *this_rq)
{
long delta;
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
this_rq->calc_load_update += LOAD_FREQ;
}
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
-{
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-
- sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
- return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
- return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long load = get_rq_runnable_load(this_rq);
- unsigned long pending_updates;
-
- /*
- * bail if there's load or we're actually up-to-date.
- */
- if (load || curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- __update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
- struct rq *this_rq = this_rq();
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long pending_updates;
-
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- raw_spin_lock(&this_rq->lock);
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * We were idle, this means load 0, the current load might be
- * !0 due to remote wakeups and the sort.
- */
- __update_cpu_load(this_rq, 0, pending_updates);
- }
- raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
- unsigned long load = get_rq_runnable_load(this_rq);
- /*
- * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
- */
- this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1);
-
- calc_load_account_active(this_rq);
-}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e43da5391dcd..7d7093c51f8d 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1327,7 +1327,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
/*
* If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f9a58ef373b4..aea7c1f393cb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
+extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq);
+
+#ifdef CONFIG_SMP
extern void update_cpu_load_active(struct rq *this_rq);
+#else
+static inline void update_cpu_load_active(struct rq *this_rq) { }
+#endif
/*
* Helpers for converting nanosecond timing to jiffy resolution
@@ -708,7 +714,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static inline u64 __rq_clock_broken(struct rq *rq)
{
- return ACCESS_ONCE(rq->clock);
+ return READ_ONCE(rq->clock);
}
static inline u64 rq_clock(struct rq *rq)
@@ -1285,7 +1291,6 @@ extern void update_max_interval(void);
extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void init_sched_dl_class(void);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
@@ -1299,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
-extern void update_idle_cpu_load(struct rq *this_rq);
-
extern void init_task_runnable_average(struct task_struct *p);
static inline void add_nr_running(struct rq *rq, unsigned count)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339656..077ebbd5e10f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running))
return false;
/*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.utime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->cputime_atomic.utime);
}
/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.stime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->cputime_atomic.stime);
}
/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.sum_exec_runtime += ns;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
}
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f36..052e02672d12 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
* condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
* an event.
*/
- set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+ smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
return timeout;
}
@@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
* doesn't imply write barrier and the users expects write
* barrier semantics on wakeup functions. The following
* smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
- * and is paired with set_mb() in wait_woken().
+ * and is paired with smp_store_mb() in wait_woken().
*/
smp_wmb(); /* C */
wait->flags |= WQ_FLAG_WOKEN;
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
__sched int bit_wait_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
__sched int bit_wait_io_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
diff --git a/kernel/signal.c b/kernel/signal.c
index d51c5ddd855c..f19833b5db3c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
* RETURNS:
* %true if @mask is set, %false if made noop because @task was dying.
*/
-bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
{
BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
-void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
{
BUG_ON(mask & ~JOBCTL_PENDING_MASK);
@@ -2000,7 +2000,7 @@ static bool do_signal_stop(int signr)
struct signal_struct *sig = current->signal;
if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
- unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+ unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
struct task_struct *t;
/* signr will be recorded in task->jobctl for retries */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 695f0c6cd169..fd643d8c4b42 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -211,25 +211,6 @@ static int multi_cpu_stop(void *data)
return err;
}
-struct irq_cpu_stop_queue_work_info {
- int cpu1;
- int cpu2;
- struct cpu_stop_work *work1;
- struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
- struct irq_cpu_stop_queue_work_info *info = arg;
- cpu_stop_queue_work(info->cpu1, info->work1);
- cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
/**
* stop_two_cpus - stops two cpus
* @cpu1: the cpu to stop
@@ -245,7 +226,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
{
struct cpu_stop_done done;
struct cpu_stop_work work1, work2;
- struct irq_cpu_stop_queue_work_info call_args;
struct multi_stop_data msdata;
preempt_disable();
@@ -262,13 +242,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
.done = &done
};
- call_args = (struct irq_cpu_stop_queue_work_info){
- .cpu1 = cpu1,
- .cpu2 = cpu2,
- .work1 = &work1,
- .work2 = &work2,
- };
-
cpu_stop_init_done(&done, 2);
set_state(&msdata, MULTI_STOP_PREPARE);
@@ -285,16 +258,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
return -ENOENT;
}
- lg_local_lock(&stop_cpus_lock);
- /*
- * Queuing needs to be done by the lowest numbered CPU, to ensure
- * that works are always queued in the same order on every CPU.
- * This prevents deadlocks.
- */
- smp_call_function_single(min(cpu1, cpu2),
- &irq_cpu_stop_queue_work,
- &call_args, 1);
- lg_local_unlock(&stop_cpus_lock);
+ lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+ cpu_stop_queue_work(cpu1, &work1);
+ cpu_stop_queue_work(cpu2, &work2);
+ lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
preempt_enable();
wait_for_completion(&done.completion);
diff --git a/kernel/sys.c b/kernel/sys.c
index a4e372b798a5..8571296b7ddb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -92,10 +92,10 @@
# define SET_TSC_CTL(a) (-EINVAL)
#endif
#ifndef MPX_ENABLE_MANAGEMENT
-# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL)
+# define MPX_ENABLE_MANAGEMENT() (-EINVAL)
#endif
#ifndef MPX_DISABLE_MANAGEMENT
-# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
+# define MPX_DISABLE_MANAGEMENT() (-EINVAL)
#endif
#ifndef GET_FP_MODE
# define GET_FP_MODE(a) (-EINVAL)
@@ -2230,12 +2230,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_MPX_ENABLE_MANAGEMENT:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = MPX_ENABLE_MANAGEMENT(me);
+ error = MPX_ENABLE_MANAGEMENT();
break;
case PR_MPX_DISABLE_MANAGEMENT:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = MPX_DISABLE_MANAGEMENT(me);
+ error = MPX_DISABLE_MANAGEMENT();
break;
case PR_SET_FP_MODE:
error = SET_FP_MODE(me, arg2);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index db5c9508ed95..5c7ae4b641c4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -272,21 +272,23 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
/*
* Divide a ktime value by a nanosecond value
*/
-u64 __ktime_divns(const ktime_t kt, s64 div)
+s64 __ktime_divns(const ktime_t kt, s64 div)
{
- u64 dclc;
int sft = 0;
+ s64 dclc;
+ u64 tmp;
dclc = ktime_to_ns(kt);
+ tmp = dclc < 0 ? -dclc : dclc;
+
/* Make sure the divisor is less than 2^32: */
while (div >> 32) {
sft++;
div >>= 1;
}
- dclc >>= sft;
- do_div(dclc, (unsigned long) div);
-
- return dclc;
+ tmp >>= sft;
+ do_div(tmp, (unsigned long) div);
+ return dclc < 0 ? -tmp : tmp;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG >= 64 */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da74abf0..892e3dae0aac 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
return 0;
}
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- if (b->utime > a->utime)
- a->utime = b->utime;
+ u64 curr_cputime;
+retry:
+ curr_cputime = atomic64_read(cputime);
+ if (sum_cputime > curr_cputime) {
+ if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+ goto retry;
+ }
+}
- if (b->stime > a->stime)
- a->stime = b->stime;
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+{
+ __update_gt_cputime(&cputime_atomic->utime, sum->utime);
+ __update_gt_cputime(&cputime_atomic->stime, sum->stime);
+ __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
+}
- if (b->sum_exec_runtime > a->sum_exec_runtime)
- a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+ struct task_cputime_atomic *atomic_times)
+{
+ times->utime = atomic64_read(&atomic_times->utime);
+ times->stime = atomic64_read(&atomic_times->stime);
+ times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
}
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
struct task_cputime sum;
- unsigned long flags;
- if (!cputimer->running) {
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running)) {
/*
* The POSIX timer interface allows for absolute time expiry
* values through the TIMER_ABSTIME flag, therefore we have
- * to synchronize the timer to the clock every time we start
- * it.
+ * to synchronize the timer to the clock every time we start it.
*/
thread_group_cputime(tsk, &sum);
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 1;
- update_gt_cputime(&cputimer->cputime, &sum);
- } else
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- *times = cputimer->cputime;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ update_gt_cputime(&cputimer->cputime_atomic, &sum);
+
+ /*
+ * We're setting cputimer->running without a lock. Ensure
+ * this only gets written to in one operation. We set
+ * running after update_gt_cputime() as a small optimization,
+ * but barriers are not required because update_gt_cputime()
+ * can handle concurrent updates.
+ */
+ WRITE_ONCE(cputimer->running, 1);
+ }
+ sample_cputime_atomic(times, &cputimer->cputime_atomic);
}
/*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
if (!task_cputime_zero(&tsk->cputime_expires))
return false;
- if (tsk->signal->cputimer.running)
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(tsk->signal->cputimer.running))
return false;
return true;
@@ -852,10 +876,10 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
}
}
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
{
struct thread_group_cputimer *cputimer = &sig->cputimer;
- unsigned long flags;
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 0;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ /* Turn off cputimer->running. This is done without locking. */
+ WRITE_ONCE(cputimer->running, 0);
}
static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
cputime_t x;
if (psecs >= hard) {
/*
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
}
sig = tsk->signal;
- if (sig->cputimer.running) {
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(sig->cputimer.running)) {
struct task_cputime group_sample;
- raw_spin_lock(&sig->cputimer.lock);
- group_sample = sig->cputimer.cputime;
- raw_spin_unlock(&sig->cputimer.lock);
+ sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* If there are any active process wide timers (POSIX 1.b, itimers,
* RLIMIT_CPU) cputimer must be running.
*/
- if (tsk->signal->cputimer.running)
+ if (READ_ONCE(tsk->signal->cputimer.running))
check_process_timers(tsk, &firing);
/*
diff --git a/kernel/torture.c b/kernel/torture.c
index dd70993c266c..3e4840633d3e 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -409,7 +409,7 @@ static void (*torture_shutdown_hook)(void);
*/
void torture_shutdown_absorb(const char *title)
{
- while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ while (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
pr_notice("torture thread %s parking due to system shutdown\n",
title);
schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -480,9 +480,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1,
unsigned long unused2, void *unused3)
{
mutex_lock(&fullstop_mutex);
- if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+ if (READ_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
- ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+ WRITE_ONCE(fullstop, FULLSTOP_SHUTDOWN);
} else {
pr_warn("Concurrent rmmod and shutdown illegal!\n");
}
@@ -523,13 +523,13 @@ static int stutter;
*/
void stutter_wait(const char *title)
{
- while (ACCESS_ONCE(stutter_pause_test) ||
- (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+ while (READ_ONCE(stutter_pause_test) ||
+ (torture_runnable && !READ_ONCE(*torture_runnable))) {
if (stutter_pause_test)
- if (ACCESS_ONCE(stutter_pause_test) == 1)
+ if (READ_ONCE(stutter_pause_test) == 1)
schedule_timeout_interruptible(1);
else
- while (ACCESS_ONCE(stutter_pause_test))
+ while (READ_ONCE(stutter_pause_test))
cond_resched();
else
schedule_timeout_interruptible(round_jiffies_relative(HZ));
@@ -549,14 +549,14 @@ static int torture_stutter(void *arg)
if (!torture_must_stop()) {
if (stutter > 1) {
schedule_timeout_interruptible(stutter - 1);
- ACCESS_ONCE(stutter_pause_test) = 2;
+ WRITE_ONCE(stutter_pause_test, 2);
}
schedule_timeout_interruptible(1);
- ACCESS_ONCE(stutter_pause_test) = 1;
+ WRITE_ONCE(stutter_pause_test, 1);
}
if (!torture_must_stop())
schedule_timeout_interruptible(stutter);
- ACCESS_ONCE(stutter_pause_test) = 0;
+ WRITE_ONCE(stutter_pause_test, 0);
torture_shutdown_absorb("torture_stutter");
} while (!torture_must_stop());
torture_kthread_stopping("torture_stutter");
@@ -642,13 +642,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
bool torture_cleanup_begin(void)
{
mutex_lock(&fullstop_mutex);
- if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ if (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
pr_warn("Concurrent rmmod and shutdown illegal!\n");
mutex_unlock(&fullstop_mutex);
schedule_timeout_uninterruptible(10);
return true;
}
- ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+ WRITE_ONCE(fullstop, FULLSTOP_RMMOD);
mutex_unlock(&fullstop_mutex);
torture_shutdown_cleanup();
torture_shuffle_cleanup();
@@ -681,7 +681,7 @@ EXPORT_SYMBOL_GPL(torture_must_stop);
*/
bool torture_must_stop_irq(void)
{
- return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+ return READ_ONCE(fullstop) != FULLSTOP_DONTSTOP;
}
EXPORT_SYMBOL_GPL(torture_must_stop_irq);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 13d945c0d03f..1b28df2d9104 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -450,7 +450,7 @@ static int __init ring_buffer_benchmark_init(void)
if (producer_fifo >= 0) {
struct sched_param param = {
- .sched_priority = consumer_fifo
+ .sched_priority = producer_fifo
};
sched_setscheduler(producer, SCHED_FIFO, &param);
} else
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ced69da0ff55..7f2e97ce71a7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1369,19 +1369,26 @@ static int check_preds(struct filter_parse_state *ps)
{
int n_normal_preds = 0, n_logical_preds = 0;
struct postfix_elt *elt;
+ int cnt = 0;
list_for_each_entry(elt, &ps->postfix, list) {
- if (elt->op == OP_NONE)
+ if (elt->op == OP_NONE) {
+ cnt++;
continue;
+ }
if (elt->op == OP_AND || elt->op == OP_OR) {
n_logical_preds++;
+ cnt--;
continue;
}
+ if (elt->op != OP_NOT)
+ cnt--;
n_normal_preds++;
+ WARN_ON_ONCE(cnt < 0);
}
- if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+ if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) {
parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
return -EINVAL;
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 506edcc500c4..581a68a04c64 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -621,7 +621,7 @@ void watchdog_nmi_enable_all(void)
put_online_cpus();
unlock:
- mutex_lock(&watchdog_proc_mutex);
+ mutex_unlock(&watchdog_proc_mutex);
}
void watchdog_nmi_disable_all(void)