summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/verifier.c5
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/locking/lockdep.c176
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/rcu/Kconfig242
-rw-r--r--kernel/rcu/Kconfig.debug82
-rw-r--r--kernel/rcu/Makefile2
-rw-r--r--kernel/rcu/rcu.h277
-rw-r--r--kernel/rcu/rcuperf.c129
-rw-r--r--kernel/rcu/rcutorture.c21
-rw-r--r--kernel/rcu/srcu.c661
-rw-r--r--kernel/rcu/srcutiny.c86
-rw-r--r--kernel/rcu/srcutree.c187
-rw-r--r--kernel/rcu/tiny.c54
-rw-r--r--kernel/rcu/tiny_plugin.h123
-rw-r--r--kernel/rcu/tree.c195
-rw-r--r--kernel/rcu/tree.h109
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h573
-rw-r--r--kernel/rcu/tree_trace.c494
-rw-r--r--kernel/rcu/update.c77
-rw-r--r--kernel/sched/core.c8
-rw-r--r--kernel/signal.c20
-rw-r--r--kernel/sysctl_binary.c4
-rw-r--r--kernel/time/Kconfig50
-rw-r--r--kernel/time/timekeeping.c71
-rw-r--r--kernel/trace/blktrace.c4
-rw-r--r--kernel/trace/ftrace.c3
-rw-r--r--kernel/trace/trace.c3
-rw-r--r--kernel/trace/trace_functions.c12
-rw-r--r--kernel/trace/trace_kprobe.c14
-rw-r--r--kernel/trace/trace_stack.c6
33 files changed, 1352 insertions, 2358 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 339c8a1371de..a8a725697bed 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -989,6 +989,11 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d leaks addr into mem\n", insn->src_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 2831480c63a2..ee97196bb151 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -580,7 +580,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
int ret = -ENOMEM, max_order = 0;
if (!has_aux(event))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
/*
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index ae1a3ba24df5..154ffb489b93 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -38,6 +38,7 @@
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>
+#include <linux/frame.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -874,7 +875,7 @@ int kexec_load_disabled;
* only when panic_cpu holds the current CPU number; this is the only CPU
* which processes crash_kexec routines.
*/
-void __crash_kexec(struct pt_regs *regs)
+void __noclone __crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
@@ -896,6 +897,7 @@ void __crash_kexec(struct pt_regs *regs)
mutex_unlock(&kexec_mutex);
}
}
+STACK_FRAME_NON_STANDARD(__crash_kexec);
void crash_kexec(struct pt_regs *regs)
{
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c0e31bfee25c..7d2499bec5fe 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1157,18 +1157,18 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
if (debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("======================================================\n");
pr_warn("WARNING: possible circular locking dependency detected\n");
print_kernel_ident();
pr_warn("------------------------------------------------------\n");
- printk("%s/%d is trying to acquire lock:\n",
+ pr_warn("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(check_src);
- printk("\nbut task is already holding lock:\n");
+ pr_warn("\nbut task is already holding lock:\n");
print_lock(check_tgt);
- printk("\nwhich lock already depends on the new lock.\n\n");
- printk("\nthe existing dependency chain (in reverse order) is:\n");
+ pr_warn("\nwhich lock already depends on the new lock.\n\n");
+ pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
print_circular_bug_entry(entry, depth);
@@ -1495,13 +1495,13 @@ print_bad_irq_dependency(struct task_struct *curr,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("=====================================================\n");
pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
irqclass, irqclass);
print_kernel_ident();
pr_warn("-----------------------------------------------------\n");
- printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
+ pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
@@ -1509,46 +1509,46 @@ print_bad_irq_dependency(struct task_struct *curr,
curr->softirqs_enabled);
print_lock(next);
- printk("\nand this task is already holding:\n");
+ pr_warn("\nand this task is already holding:\n");
print_lock(prev);
- printk("which would create a new lock dependency:\n");
+ pr_warn("which would create a new lock dependency:\n");
print_lock_name(hlock_class(prev));
- printk(KERN_CONT " ->");
+ pr_cont(" ->");
print_lock_name(hlock_class(next));
- printk(KERN_CONT "\n");
+ pr_cont("\n");
- printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
+ pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
print_lock_name(backwards_entry->class);
- printk("\n... which became %s-irq-safe at:\n", irqclass);
+ pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
- printk("\nto a %s-irq-unsafe lock:\n", irqclass);
+ pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
print_lock_name(forwards_entry->class);
- printk("\n... which became %s-irq-unsafe at:\n", irqclass);
- printk("...");
+ pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
+ pr_warn("...");
print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
- printk("\nother info that might help us debug this:\n\n");
+ pr_warn("\nother info that might help us debug this:\n\n");
print_irq_lock_scenario(backwards_entry, forwards_entry,
hlock_class(prev), hlock_class(next));
lockdep_print_held_locks(curr);
- printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
+ pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
if (!save_trace(&prev_root->trace))
return 0;
print_shortest_lock_dependencies(backwards_entry, prev_root);
- printk("\nthe dependencies between the lock to be acquired");
- printk(" and %s-irq-unsafe lock:\n", irqclass);
+ pr_warn("\nthe dependencies between the lock to be acquired");
+ pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
if (!save_trace(&next_root->trace))
return 0;
print_shortest_lock_dependencies(forwards_entry, next_root);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -1724,22 +1724,22 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("============================================\n");
pr_warn("WARNING: possible recursive locking detected\n");
print_kernel_ident();
pr_warn("--------------------------------------------\n");
- printk("%s/%d is trying to acquire lock:\n",
+ pr_warn("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(next);
- printk("\nbut task is already holding lock:\n");
+ pr_warn("\nbut task is already holding lock:\n");
print_lock(prev);
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
print_deadlock_scenario(next, prev);
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -2074,21 +2074,21 @@ static void print_collision(struct task_struct *curr,
struct held_lock *hlock_next,
struct lock_chain *chain)
{
- printk("\n");
+ pr_warn("\n");
pr_warn("============================\n");
pr_warn("WARNING: chain_key collision\n");
print_kernel_ident();
pr_warn("----------------------------\n");
- printk("%s/%d: ", current->comm, task_pid_nr(current));
- printk("Hash chain already cached but the contents don't match!\n");
+ pr_warn("%s/%d: ", current->comm, task_pid_nr(current));
+ pr_warn("Hash chain already cached but the contents don't match!\n");
- printk("Held locks:");
+ pr_warn("Held locks:");
print_chain_keys_held_locks(curr, hlock_next);
- printk("Locks in cached chain:");
+ pr_warn("Locks in cached chain:");
print_chain_keys_chain(chain);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
}
#endif
@@ -2373,16 +2373,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("================================\n");
pr_warn("WARNING: inconsistent lock state\n");
print_kernel_ident();
pr_warn("--------------------------------\n");
- printk("inconsistent {%s} -> {%s} usage.\n",
+ pr_warn("inconsistent {%s} -> {%s} usage.\n",
usage_str[prev_bit], usage_str[new_bit]);
- printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+ pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
curr->comm, task_pid_nr(curr),
trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
@@ -2390,16 +2390,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
trace_softirqs_enabled(curr));
print_lock(this);
- printk("{%s} state was registered at:\n", usage_str[prev_bit]);
+ pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
print_irqtrace_events(curr);
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
print_usage_bug_scenario(this);
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -2438,28 +2438,28 @@ print_irq_inversion_bug(struct task_struct *curr,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("========================================================\n");
pr_warn("WARNING: possible irq lock inversion dependency detected\n");
print_kernel_ident();
pr_warn("--------------------------------------------------------\n");
- printk("%s/%d just changed the state of lock:\n",
+ pr_warn("%s/%d just changed the state of lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(this);
if (forwards)
- printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
+ pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
- printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
+ pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
print_lock_name(other->class);
- printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
+ pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
/* Find a middle lock (if one exists) */
depth = get_lock_depth(other);
do {
if (depth == 0 && (entry != root)) {
- printk("lockdep:%s bad path found in chain graph\n", __func__);
+ pr_warn("lockdep:%s bad path found in chain graph\n", __func__);
break;
}
middle = entry;
@@ -2475,12 +2475,12 @@ print_irq_inversion_bug(struct task_struct *curr,
lockdep_print_held_locks(curr);
- printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
+ pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
if (!save_trace(&root->trace))
return 0;
print_shortest_lock_dependencies(other, root);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -3189,25 +3189,25 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
if (debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("==================================\n");
pr_warn("WARNING: Nested lock was not taken\n");
print_kernel_ident();
pr_warn("----------------------------------\n");
- printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+ pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
print_lock(hlock);
- printk("\nbut this task is not holding:\n");
- printk("%s\n", hlock->nest_lock->name);
+ pr_warn("\nbut this task is not holding:\n");
+ pr_warn("%s\n", hlock->nest_lock->name);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -3402,21 +3402,21 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
if (debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("=====================================\n");
pr_warn("WARNING: bad unlock balance detected!\n");
print_kernel_ident();
pr_warn("-------------------------------------\n");
- printk("%s/%d is trying to release lock (",
+ pr_warn("%s/%d is trying to release lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
- printk(KERN_CONT ") at:\n");
+ pr_cont(") at:\n");
print_ip_sym(ip);
- printk("but there are no more locks to release!\n");
- printk("\nother info that might help us debug this:\n");
+ pr_warn("but there are no more locks to release!\n");
+ pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -3974,21 +3974,21 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
if (debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("=================================\n");
pr_warn("WARNING: bad contention detected!\n");
print_kernel_ident();
pr_warn("---------------------------------\n");
- printk("%s/%d is trying to contend lock (",
+ pr_warn("%s/%d is trying to contend lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
- printk(KERN_CONT ") at:\n");
+ pr_cont(") at:\n");
print_ip_sym(ip);
- printk("but there are no locks held!\n");
- printk("\nother info that might help us debug this:\n");
+ pr_warn("but there are no locks held!\n");
+ pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -4318,17 +4318,17 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
if (debug_locks_silent)
return;
- printk("\n");
+ pr_warn("\n");
pr_warn("=========================\n");
pr_warn("WARNING: held lock freed!\n");
print_kernel_ident();
pr_warn("-------------------------\n");
- printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+ pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
print_lock(hlock);
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
}
@@ -4376,14 +4376,14 @@ static void print_held_locks_bug(void)
if (debug_locks_silent)
return;
- printk("\n");
+ pr_warn("\n");
pr_warn("====================================\n");
pr_warn("WARNING: %s/%d still has locks held!\n",
current->comm, task_pid_nr(current));
print_kernel_ident();
pr_warn("------------------------------------\n");
lockdep_print_held_locks(current);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
}
@@ -4402,10 +4402,10 @@ void debug_show_all_locks(void)
int unlock = 1;
if (unlikely(!debug_locks)) {
- printk("INFO: lockdep is turned off.\n");
+ pr_warn("INFO: lockdep is turned off.\n");
return;
}
- printk("\nShowing all locks held in the system:\n");
+ pr_warn("\nShowing all locks held in the system:\n");
/*
* Here we try to get the tasklist_lock as hard as possible,
@@ -4416,18 +4416,18 @@ void debug_show_all_locks(void)
retry:
if (!read_trylock(&tasklist_lock)) {
if (count == 10)
- printk("hm, tasklist_lock locked, retrying... ");
+ pr_warn("hm, tasklist_lock locked, retrying... ");
if (count) {
count--;
- printk(" #%d", 10-count);
+ pr_cont(" #%d", 10-count);
mdelay(200);
goto retry;
}
- printk(" ignoring it.\n");
+ pr_cont(" ignoring it.\n");
unlock = 0;
} else {
if (count != 10)
- printk(KERN_CONT " locked it.\n");
+ pr_cont(" locked it.\n");
}
do_each_thread(g, p) {
@@ -4445,7 +4445,7 @@ retry:
unlock = 1;
} while_each_thread(g, p);
- printk("\n");
+ pr_warn("\n");
pr_warn("=============================================\n\n");
if (unlock)
@@ -4475,12 +4475,12 @@ asmlinkage __visible void lockdep_sys_exit(void)
if (unlikely(curr->lockdep_depth)) {
if (!debug_locks_off())
return;
- printk("\n");
+ pr_warn("\n");
pr_warn("================================================\n");
pr_warn("WARNING: lock held when returning to user space!\n");
print_kernel_ident();
pr_warn("------------------------------------------------\n");
- printk("%s/%d is leaving the kernel with locks still held!\n",
+ pr_warn("%s/%d is leaving the kernel with locks still held!\n",
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
}
@@ -4490,19 +4490,15 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
-#ifndef CONFIG_PROVE_RCU_REPEATEDLY
- if (!debug_locks_off())
- return;
-#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
/* Note: the following can be executed concurrently, so be careful. */
- printk("\n");
+ pr_warn("\n");
pr_warn("=============================\n");
pr_warn("WARNING: suspicious RCU usage\n");
print_kernel_ident();
pr_warn("-----------------------------\n");
- printk("%s:%d %s!\n", file, line, s);
- printk("\nother info that might help us debug this:\n\n");
- printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_warn("%s:%d %s!\n", file, line, s);
+ pr_warn("\nother info that might help us debug this:\n\n");
+ pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
: !rcu_is_watching()
@@ -4529,10 +4525,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
* rcu_read_lock_bh() and so on from extended quiescent states.
*/
if (!rcu_is_watching())
- printk("RCU used illegally from extended quiescent state!\n");
+ pr_warn("RCU used illegally from extended quiescent state!\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f80fd33639e0..57d22571f306 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -225,14 +225,14 @@ static struct block_device *hib_resume_bdev;
struct hib_bio_batch {
atomic_t count;
wait_queue_head_t wait;
- int error;
+ blk_status_t error;
};
static void hib_init_batch(struct hib_bio_batch *hb)
{
atomic_set(&hb->count, 0);
init_waitqueue_head(&hb->wait);
- hb->error = 0;
+ hb->error = BLK_STS_OK;
}
static void hib_end_io(struct bio *bio)
@@ -240,7 +240,7 @@ static void hib_end_io(struct bio *bio)
struct hib_bio_batch *hb = bio->bi_private;
struct page *page = bio->bi_io_vec[0].bv_page;
- if (bio->bi_error) {
+ if (bio->bi_status) {
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
@@ -253,8 +253,8 @@ static void hib_end_io(struct bio *bio)
flush_icache_range((unsigned long)page_address(page),
(unsigned long)page_address(page) + PAGE_SIZE);
- if (bio->bi_error && !hb->error)
- hb->error = bio->bi_error;
+ if (bio->bi_status && !hb->error)
+ hb->error = bio->bi_status;
if (atomic_dec_and_test(&hb->count))
wake_up(&hb->wait);
@@ -293,10 +293,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
return error;
}
-static int hib_wait_io(struct hib_bio_batch *hb)
+static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
{
wait_event(hb->wait, atomic_read(&hb->count) == 0);
- return hb->error;
+ return blk_status_to_errno(hb->error);
}
/*
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
new file mode 100644
index 000000000000..be90c945063f
--- /dev/null
+++ b/kernel/rcu/Kconfig
@@ -0,0 +1,242 @@
+#
+# RCU-related configuration options
+#
+
+menu "RCU Subsystem"
+
+config TREE_RCU
+ bool
+ default y if !PREEMPT && SMP
+ help
+ This option selects the RCU implementation that is
+ designed for very large SMP systems with hundreds or
+ thousands of CPUs. It also scales down nicely to
+ smaller systems.
+
+config PREEMPT_RCU
+ bool
+ default y if PREEMPT
+ help
+ This option selects the RCU implementation that is
+ designed for very large SMP systems with hundreds or
+ thousands of CPUs, but for which real-time response
+ is also required. It also scales down nicely to
+ smaller systems.
+
+ Select this option if you are unsure.
+
+config TINY_RCU
+ bool
+ default y if !PREEMPT && !SMP
+ help
+ This option selects the RCU implementation that is
+ designed for UP systems from which real-time response
+ is not required. This option greatly reduces the
+ memory footprint of RCU.
+
+config RCU_EXPERT
+ bool "Make expert-level adjustments to RCU configuration"
+ default n
+ help
+ This option needs to be enabled if you wish to make
+ expert-level adjustments to RCU configuration. By default,
+ no such adjustments can be made, which has the often-beneficial
+ side-effect of preventing "make oldconfig" from asking you all
+ sorts of detailed questions about how you would like numerous
+ obscure RCU options to be set up.
+
+ Say Y if you need to make expert-level adjustments to RCU.
+
+ Say N if you are unsure.
+
+config SRCU
+ bool
+ help
+ This option selects the sleepable version of RCU. This version
+ permits arbitrary sleeping or blocking within RCU read-side critical
+ sections.
+
+config TINY_SRCU
+ bool
+ default y if SRCU && TINY_RCU
+ help
+ This option selects the single-CPU non-preemptible version of SRCU.
+
+config TREE_SRCU
+ bool
+ default y if SRCU && !TINY_RCU
+ help
+ This option selects the full-fledged version of SRCU.
+
+config TASKS_RCU
+ bool
+ default n
+ select SRCU
+ help
+ This option enables a task-based RCU implementation that uses
+ only voluntary context switch (not preemption!), idle, and
+ user-mode execution as quiescent states.
+
+config RCU_STALL_COMMON
+ def_bool ( TREE_RCU || PREEMPT_RCU )
+ help
+ This option enables RCU CPU stall code that is common between
+ the TINY and TREE variants of RCU. The purpose is to allow
+ the tiny variants to disable RCU CPU stall warnings, while
+ making these warnings mandatory for the tree variants.
+
+config RCU_NEED_SEGCBLIST
+ def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
+
+config CONTEXT_TRACKING
+ bool
+
+config CONTEXT_TRACKING_FORCE
+ bool "Force context tracking"
+ depends on CONTEXT_TRACKING
+ default y if !NO_HZ_FULL
+ help
+ The major prerequisite for full dynticks to work is
+ support for the context tracking subsystem. But there are also
+ other dependencies to provide in order to make full
+ dynticks work.
+
+ This option exists for testing when an arch implements the
+ context tracking backend but doesn't yet fulfill all the
+ requirements to make the full dynticks feature work.
+ Without the full dynticks, there is no way to test the support
+ for context tracking and the subsystems that rely on it: RCU
+ userspace extended quiescent state and tickless cputime
+ accounting. This option copes with the absence of the full
+ dynticks subsystem by forcing the context tracking on all
+ CPUs in the system.
+
+ Say Y only if you're working on the development of an
+ architecture backend for the context tracking.
+
+ Say N otherwise, this option brings an overhead that you
+ don't want in production.
+
+
+config RCU_FANOUT
+ int "Tree-based hierarchical RCU fanout value"
+ range 2 64 if 64BIT
+ range 2 32 if !64BIT
+ depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+ default 64 if 64BIT
+ default 32 if !64BIT
+ help
+ This option controls the fanout of hierarchical implementations
+ of RCU, allowing RCU to work efficiently on machines with
+ large numbers of CPUs. This value must be at least the fourth
+ root of NR_CPUS, which allows NR_CPUS to be insanely large.
+ The default value of RCU_FANOUT should be used for production
+ systems, but if you are stress-testing the RCU implementation
+ itself, small RCU_FANOUT values allow you to test large-system
+ code paths on small(er) systems.
+
+ Select a specific number if testing RCU itself.
+ Take the default if unsure.
+
+config RCU_FANOUT_LEAF
+ int "Tree-based hierarchical RCU leaf-level fanout value"
+ range 2 64 if 64BIT
+ range 2 32 if !64BIT
+ depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+ default 16
+ help
+ This option controls the leaf-level fanout of hierarchical
+ implementations of RCU, and allows trading off cache misses
+ against lock contention. Systems that synchronize their
+ scheduling-clock interrupts for energy-efficiency reasons will
+ want the default because the smaller leaf-level fanout keeps
+ lock contention levels acceptably low. Very large systems
+ (hundreds or thousands of CPUs) will instead want to set this
+ value to the maximum value possible in order to reduce the
+ number of cache misses incurred during RCU's grace-period
+ initialization. These systems tend to run CPU-bound, and thus
+ are not helped by synchronized interrupts, and thus tend to
+ skew them, which reduces lock contention enough that large
+ leaf-level fanouts work well. That said, setting leaf-level
+ fanout to a large number will likely cause problematic
+ lock contention on the leaf-level rcu_node structures unless
+ you boot with the skew_tick kernel parameter.
+
+ Select a specific number if testing RCU itself.
+
+ Select the maximum permissible value for large systems, but
+ please understand that you may also need to set the skew_tick
+ kernel boot parameter to avoid contention on the rcu_node
+ structure's locks.
+
+ Take the default if unsure.
+
+config RCU_FAST_NO_HZ
+ bool "Accelerate last non-dyntick-idle CPU's grace periods"
+ depends on NO_HZ_COMMON && SMP && RCU_EXPERT
+ default n
+ help
+ This option permits CPUs to enter dynticks-idle state even if
+ they have RCU callbacks queued, and prevents RCU from waking
+ these CPUs up more than roughly once every four jiffies (by
+ default, you can adjust this using the rcutree.rcu_idle_gp_delay
+ parameter), thus improving energy efficiency. On the other
+ hand, this option increases the duration of RCU grace periods,
+ for example, slowing down synchronize_rcu().
+
+ Say Y if energy efficiency is critically important, and you
+ don't care about increased grace-period durations.
+
+ Say N if you are unsure.
+
+config RCU_BOOST
+ bool "Enable RCU priority boosting"
+ depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
+ default n
+ help
+ This option boosts the priority of preempted RCU readers that
+ block the current preemptible RCU grace period for too long.
+ This option also prevents heavy loads from blocking RCU
+ callback invocation for all flavors of RCU.
+
+ Say Y here if you are working with real-time apps or heavy loads.
+ Say N here if you are unsure.
+
+config RCU_BOOST_DELAY
+ int "Milliseconds to delay boosting after RCU grace-period start"
+ range 0 3000
+ depends on RCU_BOOST
+ default 500
+ help
+ This option specifies the time to wait after the beginning of
+ a given grace period before priority-boosting preempted RCU
+ readers blocking that grace period. Note that any RCU reader
+ blocking an expedited RCU grace period is boosted immediately.
+
+ Accept the default if unsure.
+
+config RCU_NOCB_CPU
+ bool "Offload RCU callback processing from boot-selected CPUs"
+ depends on TREE_RCU || PREEMPT_RCU
+ depends on RCU_EXPERT || NO_HZ_FULL
+ default n
+ help
+ Use this option to reduce OS jitter for aggressive HPC or
+ real-time workloads. It can also be used to offload RCU
+ callback invocation to energy-efficient CPUs in battery-powered
+ asymmetric multiprocessors.
+
+ This option offloads callback invocation from the set of
+ CPUs specified at boot time by the rcu_nocbs parameter.
+ For each such CPU, a kthread ("rcuox/N") will be created to
+ invoke callbacks, where the "N" is the CPU being offloaded,
+ and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
+ "s" for RCU-sched. Nothing prevents this kthread from running
+ on the specified CPUs, but (1) the kthreads may be preempted
+ between each callback, and (2) affinity or cgroups can be used
+ to force the kthreads to run on whatever set of CPUs is desired.
+
+ Say Y here if you want to help to debug reduced OS jitter.
+ Say N here if you are unsure.
+
+endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
new file mode 100644
index 000000000000..0ec7d1d33a14
--- /dev/null
+++ b/kernel/rcu/Kconfig.debug
@@ -0,0 +1,82 @@
+#
+# RCU-related debugging configuration options
+#
+
+menu "RCU Debugging"
+
+config PROVE_RCU
+ def_bool PROVE_LOCKING
+
+config TORTURE_TEST
+ tristate
+ default n
+
+config RCU_PERF_TEST
+ tristate "performance tests for RCU"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ select SRCU
+ select TASKS_RCU
+ default n
+ help
+ This option provides a kernel module that runs performance
+ tests on the RCU infrastructure. The kernel module may be built
+ after the fact on the running kernel to be tested, if desired.
+
+ Say Y here if you want RCU performance tests to be built into
+ the kernel.
+ Say M if you want the RCU performance tests to build as a module.
+ Say N if you are unsure.
+
+config RCU_TORTURE_TEST
+ tristate "torture tests for RCU"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ select SRCU
+ select TASKS_RCU
+ default n
+ help
+ This option provides a kernel module that runs torture tests
+ on the RCU infrastructure. The kernel module may be built
+ after the fact on the running kernel to be tested, if desired.
+
+ Say Y here if you want RCU torture tests to be built into
+ the kernel.
+ Say M if you want the RCU torture tests to build as a module.
+ Say N if you are unsure.
+
+config RCU_CPU_STALL_TIMEOUT
+ int "RCU CPU stall timeout in seconds"
+ depends on RCU_STALL_COMMON
+ range 3 300
+ default 21
+ help
+ If a given RCU grace period extends more than the specified
+ number of seconds, a CPU stall warning is printed. If the
+ RCU grace period persists, additional CPU stall warnings are
+ printed at more widely spaced intervals.
+
+config RCU_TRACE
+ bool "Enable tracing for RCU"
+ depends on DEBUG_KERNEL
+ default y if TREE_RCU
+ select TRACE_CLOCK
+ help
+ This option enables additional tracepoints for ftrace-style
+ event tracing.
+
+ Say Y here if you want to enable RCU tracing.
+ Say N if you are unsure.
+
+config RCU_EQS_DEBUG
+ bool "Provide debugging asserts for adding NO_HZ support to an arch"
+ depends on DEBUG_KERNEL
+ help
+ This option provides consistency checks in RCU's handling of
+ NO_HZ. These checks have proven quite helpful in detecting
+ bugs in arch-specific NO_HZ code.
+
+ Say N here if you need ultimate kernel/user switch latencies.
+ Say Y if you are unsure.
+
+endmenu # "RCU Debugging"
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 23803c7d5180..13c0fc852767 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,13 +3,11 @@
KCOV_INSTRUMENT := n
obj-y += update.o sync.o
-obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
obj-$(CONFIG_TREE_SRCU) += srcutree.o
obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_PREEMPT_RCU) += tree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
obj-$(CONFIG_TINY_RCU) += tiny.o
obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 73e16ec4054b..808b8c85f626 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -212,6 +212,18 @@ int rcu_jiffies_till_stall_check(void);
*/
#define TPS(x) tracepoint_string(x)
+/*
+ * Dump the ftrace buffer, but only one time per callsite per boot.
+ */
+#define rcu_ftrace_dump(oops_dump_mode) \
+do { \
+ static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
+ \
+ if (!atomic_read(&___rfd_beenhere) && \
+ !atomic_xchg(&___rfd_beenhere, 1)) \
+ ftrace_dump(oops_dump_mode); \
+} while (0)
+
void rcu_early_boot_tests(void);
void rcu_test_sync_prims(void);
@@ -291,6 +303,271 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
cpu <= rnp->grphi; \
cpu = cpumask_next((cpu), cpu_possible_mask))
+/*
+ * Wrappers for the rcu_node::lock acquire and release.
+ *
+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
+ * different lock values.  This in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier
+ * and, most importantly, that transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ *
+ * Because ->lock of struct rcu_node is a __private field, one should use
+ * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
+ */
+#define raw_spin_lock_rcu_node(p) \
+do { \
+ raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
+
+#define raw_spin_lock_irq_rcu_node(p) \
+do { \
+ raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_irq_rcu_node(p) \
+ raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+
+#define raw_spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
+
+#define raw_spin_trylock_rcu_node(p) \
+({ \
+ bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
+#ifdef CONFIG_TINY_RCU
+/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
+static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
+{
+ return true;
+}
+static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
+{
+ return false;
+}
+
+static inline void rcu_expedite_gp(void)
+{
+}
+
+static inline void rcu_unexpedite_gp(void)
+{
+}
+#else /* #ifdef CONFIG_TINY_RCU */
+bool rcu_gp_is_normal(void); /* Internal RCU use. */
+bool rcu_gp_is_expedited(void); /* Internal RCU use. */
+void rcu_expedite_gp(void);
+void rcu_unexpedite_gp(void);
+void rcupdate_announce_bootup_oddness(void);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+#define RCU_SCHEDULER_INACTIVE 0
+#define RCU_SCHEDULER_INIT 1
+#define RCU_SCHEDULER_RUNNING 2
+
+#ifdef CONFIG_TINY_RCU
+static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
+#else /* #ifdef CONFIG_TINY_RCU */
+void rcu_request_urgent_qs_task(struct task_struct *t);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+enum rcutorture_type {
+ RCU_FLAVOR,
+ RCU_BH_FLAVOR,
+ RCU_SCHED_FLAVOR,
+ RCU_TASKS_FLAVOR,
+ SRCU_FLAVOR,
+ INVALID_RCU_FLAVOR
+};
+
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
+ unsigned long *gpnum, unsigned long *completed);
+void rcutorture_record_test_transition(void);
+void rcutorture_record_progress(unsigned long vernum);
+void do_trace_rcu_torture_read(const char *rcutorturename,
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
+#else
+static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
+ int *flags,
+ unsigned long *gpnum,
+ unsigned long *completed)
+{
+ *flags = 0;
+ *gpnum = 0;
+ *completed = 0;
+}
+static inline void rcutorture_record_test_transition(void)
+{
+}
+static inline void rcutorture_record_progress(unsigned long vernum)
+{
+}
+#ifdef CONFIG_RCU_TRACE
+void do_trace_rcu_torture_read(const char *rcutorturename,
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
+#else
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
+#endif
+#endif
+
+#ifdef CONFIG_TINY_SRCU
+
+static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
+ struct srcu_struct *sp, int *flags,
+ unsigned long *gpnum,
+ unsigned long *completed)
+{
+ if (test_type != SRCU_FLAVOR)
+ return;
+ *flags = 0;
+ *completed = sp->srcu_idx;
+ *gpnum = *completed;
+}
+
+#elif defined(CONFIG_TREE_SRCU)
+
+void srcutorture_get_gp_data(enum rcutorture_type test_type,
+ struct srcu_struct *sp, int *flags,
+ unsigned long *gpnum, unsigned long *completed);
+
+#endif
+
+#ifdef CONFIG_TINY_RCU
+
+/*
+ * Return the number of grace periods started.
+ */
+static inline unsigned long rcu_batches_started(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of bottom-half grace periods started.
+ */
+static inline unsigned long rcu_batches_started_bh(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of sched grace periods started.
+ */
+static inline unsigned long rcu_batches_started_sched(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of bottom-half grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed_bh(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of sched grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed_sched(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of expedited grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of expedited sched grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed_sched(void)
+{
+ return 0;
+}
+
+static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
+{
+ return 0;
+}
+
+static inline void rcu_force_quiescent_state(void)
+{
+}
+
+static inline void rcu_bh_force_quiescent_state(void)
+{
+}
+
+static inline void rcu_sched_force_quiescent_state(void)
+{
+}
+
+static inline void show_rcu_gp_kthreads(void)
+{
+}
+
+#else /* #ifdef CONFIG_TINY_RCU */
+extern unsigned long rcutorture_testseq;
+extern unsigned long rcutorture_vernum;
+unsigned long rcu_batches_started(void);
+unsigned long rcu_batches_started_bh(void);
+unsigned long rcu_batches_started_sched(void);
+unsigned long rcu_batches_completed(void);
+unsigned long rcu_batches_completed_bh(void);
+unsigned long rcu_batches_completed_sched(void);
+unsigned long rcu_exp_batches_completed(void);
+unsigned long rcu_exp_batches_completed_sched(void);
+unsigned long srcu_batches_completed(struct srcu_struct *sp);
+void show_rcu_gp_kthreads(void);
+void rcu_force_quiescent_state(void);
+void rcu_bh_force_quiescent_state(void);
+void rcu_sched_force_quiescent_state(void);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+bool rcu_is_nocb_cpu(int cpu);
+#else
+static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
+#endif
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index a4a86fb47e4a..3cc18110b612 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -48,6 +48,8 @@
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include "rcu.h"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
@@ -59,12 +61,16 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
#define VERBOSE_PERFOUT_ERRSTRING(s) \
do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
+torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
+torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
-torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, nreaders, 0, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
-torture_param(bool, shutdown, false, "Shutdown at end of performance tests.");
+torture_param(bool, shutdown, !IS_ENABLED(MODULE),
+ "Shutdown at end of performance tests.");
torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
+torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
static char *perf_type = "rcu";
module_param(perf_type, charp, 0444);
@@ -86,13 +92,16 @@ static u64 t_rcu_perf_writer_started;
static u64 t_rcu_perf_writer_finished;
static unsigned long b_rcu_perf_writer_started;
static unsigned long b_rcu_perf_writer_finished;
+static DEFINE_PER_CPU(atomic_t, n_async_inflight);
static int rcu_perf_writer_state;
#define RTWS_INIT 0
-#define RTWS_EXP_SYNC 1
-#define RTWS_SYNC 2
-#define RTWS_IDLE 2
-#define RTWS_STOPPING 3
+#define RTWS_ASYNC 1
+#define RTWS_BARRIER 2
+#define RTWS_EXP_SYNC 3
+#define RTWS_SYNC 4
+#define RTWS_IDLE 5
+#define RTWS_STOPPING 6
#define MAX_MEAS 10000
#define MIN_MEAS 100
@@ -114,6 +123,8 @@ struct rcu_perf_ops {
unsigned long (*started)(void);
unsigned long (*completed)(void);
unsigned long (*exp_completed)(void);
+ void (*async)(struct rcu_head *head, rcu_callback_t func);
+ void (*gp_barrier)(void);
void (*sync)(void);
void (*exp_sync)(void);
const char *name;
@@ -153,6 +164,8 @@ static struct rcu_perf_ops rcu_ops = {
.started = rcu_batches_started,
.completed = rcu_batches_completed,
.exp_completed = rcu_exp_batches_completed,
+ .async = call_rcu,
+ .gp_barrier = rcu_barrier,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
.name = "rcu"
@@ -181,6 +194,8 @@ static struct rcu_perf_ops rcu_bh_ops = {
.started = rcu_batches_started_bh,
.completed = rcu_batches_completed_bh,
.exp_completed = rcu_exp_batches_completed_sched,
+ .async = call_rcu_bh,
+ .gp_barrier = rcu_barrier_bh,
.sync = synchronize_rcu_bh,
.exp_sync = synchronize_rcu_bh_expedited,
.name = "rcu_bh"
@@ -208,6 +223,16 @@ static unsigned long srcu_perf_completed(void)
return srcu_batches_completed(srcu_ctlp);
}
+static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+ call_srcu(srcu_ctlp, head, func);
+}
+
+static void srcu_rcu_barrier(void)
+{
+ srcu_barrier(srcu_ctlp);
+}
+
static void srcu_perf_synchronize(void)
{
synchronize_srcu(srcu_ctlp);
@@ -226,11 +251,42 @@ static struct rcu_perf_ops srcu_ops = {
.started = NULL,
.completed = srcu_perf_completed,
.exp_completed = srcu_perf_completed,
+ .async = srcu_call_rcu,
+ .gp_barrier = srcu_rcu_barrier,
.sync = srcu_perf_synchronize,
.exp_sync = srcu_perf_synchronize_expedited,
.name = "srcu"
};
+static struct srcu_struct srcud;
+
+static void srcu_sync_perf_init(void)
+{
+ srcu_ctlp = &srcud;
+ init_srcu_struct(srcu_ctlp);
+}
+
+static void srcu_sync_perf_cleanup(void)
+{
+ cleanup_srcu_struct(srcu_ctlp);
+}
+
+static struct rcu_perf_ops srcud_ops = {
+ .ptype = SRCU_FLAVOR,
+ .init = srcu_sync_perf_init,
+ .cleanup = srcu_sync_perf_cleanup,
+ .readlock = srcu_perf_read_lock,
+ .readunlock = srcu_perf_read_unlock,
+ .started = NULL,
+ .completed = srcu_perf_completed,
+ .exp_completed = srcu_perf_completed,
+ .async = srcu_call_rcu,
+ .gp_barrier = srcu_rcu_barrier,
+ .sync = srcu_perf_synchronize,
+ .exp_sync = srcu_perf_synchronize_expedited,
+ .name = "srcud"
+};
+
/*
* Definitions for sched perf testing.
*/
@@ -254,6 +310,8 @@ static struct rcu_perf_ops sched_ops = {
.started = rcu_batches_started_sched,
.completed = rcu_batches_completed_sched,
.exp_completed = rcu_exp_batches_completed_sched,
+ .async = call_rcu_sched,
+ .gp_barrier = rcu_barrier_sched,
.sync = synchronize_sched,
.exp_sync = synchronize_sched_expedited,
.name = "sched"
@@ -281,6 +339,8 @@ static struct rcu_perf_ops tasks_ops = {
.readunlock = tasks_perf_read_unlock,
.started = rcu_no_completed,
.completed = rcu_no_completed,
+ .async = call_rcu_tasks,
+ .gp_barrier = rcu_barrier_tasks,
.sync = synchronize_rcu_tasks,
.exp_sync = synchronize_rcu_tasks,
.name = "tasks"
@@ -344,6 +404,15 @@ rcu_perf_reader(void *arg)
}
/*
+ * Callback function for asynchronous grace periods from rcu_perf_writer().
+ */
+static void rcu_perf_async_cb(struct rcu_head *rhp)
+{
+ atomic_dec(this_cpu_ptr(&n_async_inflight));
+ kfree(rhp);
+}
+
+/*
* RCU perf writer kthread. Repeatedly does a grace period.
*/
static int
@@ -352,6 +421,7 @@ rcu_perf_writer(void *arg)
int i = 0;
int i_max;
long me = (long)arg;
+ struct rcu_head *rhp = NULL;
struct sched_param sp;
bool started = false, done = false, alldone = false;
u64 t;
@@ -380,9 +450,27 @@ rcu_perf_writer(void *arg)
}
do {
+ if (writer_holdoff)
+ udelay(writer_holdoff);
wdp = &wdpp[i];
*wdp = ktime_get_mono_fast_ns();
- if (gp_exp) {
+ if (gp_async) {
+retry:
+ if (!rhp)
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
+ rcu_perf_writer_state = RTWS_ASYNC;
+ atomic_inc(this_cpu_ptr(&n_async_inflight));
+ cur_ops->async(rhp, rcu_perf_async_cb);
+ rhp = NULL;
+ } else if (!kthread_should_stop()) {
+ rcu_perf_writer_state = RTWS_BARRIER;
+ cur_ops->gp_barrier();
+ goto retry;
+ } else {
+ kfree(rhp); /* Because we are stopping. */
+ }
+ } else if (gp_exp) {
rcu_perf_writer_state = RTWS_EXP_SYNC;
cur_ops->exp_sync();
} else {
@@ -429,6 +517,10 @@ rcu_perf_writer(void *arg)
i++;
rcu_perf_wait_shutdown();
} while (!torture_must_stop());
+ if (gp_async) {
+ rcu_perf_writer_state = RTWS_BARRIER;
+ cur_ops->gp_barrier();
+ }
rcu_perf_writer_state = RTWS_STOPPING;
writer_n_durations[me] = i_max;
torture_kthread_stopping("rcu_perf_writer");
@@ -452,6 +544,17 @@ rcu_perf_cleanup(void)
u64 *wdp;
u64 *wdpp;
+ /*
+ * Would like warning at start, but everything is expedited
+ * during the mid-boot phase, so have to wait till the end.
+ */
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
+ VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ if (rcu_gp_is_normal() && gp_exp)
+ VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ if (gp_exp && gp_async)
+ VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!");
+
if (torture_cleanup_begin())
return;
@@ -554,7 +657,7 @@ rcu_perf_init(void)
long i;
int firsterr = 0;
static struct rcu_perf_ops *perf_ops[] = {
- &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+ &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
RCUPERF_TASKS_OPS
};
@@ -624,16 +727,6 @@ rcu_perf_init(void)
firsterr = -ENOMEM;
goto unwind;
}
- if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) {
- VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
- firsterr = -EINVAL;
- goto unwind;
- }
- if (rcu_gp_is_normal() && gp_exp) {
- VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
- firsterr = -EINVAL;
- goto unwind;
- }
for (i = 0; i < nrealwriters; i++) {
writer_durations[i] =
kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ae6e574d4cf5..b8f7f8ce8575 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -52,6 +52,8 @@
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include "rcu.h"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
@@ -562,31 +564,19 @@ static void srcu_torture_stats(void)
int __maybe_unused cpu;
int idx;
-#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU)
#ifdef CONFIG_TREE_SRCU
idx = srcu_ctlp->srcu_idx & 0x1;
-#else /* #ifdef CONFIG_TREE_SRCU */
- idx = srcu_ctlp->completed & 0x1;
-#endif /* #else #ifdef CONFIG_TREE_SRCU */
pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
unsigned long l0, l1;
unsigned long u0, u1;
long c0, c1;
-#ifdef CONFIG_TREE_SRCU
struct srcu_data *counts;
counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
u0 = counts->srcu_unlock_count[!idx];
u1 = counts->srcu_unlock_count[idx];
-#else /* #ifdef CONFIG_TREE_SRCU */
- struct srcu_array *counts;
-
- counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
- u0 = counts->unlock_count[!idx];
- u1 = counts->unlock_count[idx];
-#endif /* #else #ifdef CONFIG_TREE_SRCU */
/*
* Make sure that a lock is always counted if the corresponding
@@ -594,13 +584,8 @@ static void srcu_torture_stats(void)
*/
smp_rmb();
-#ifdef CONFIG_TREE_SRCU
l0 = counts->srcu_lock_count[!idx];
l1 = counts->srcu_lock_count[idx];
-#else /* #ifdef CONFIG_TREE_SRCU */
- l0 = counts->lock_count[!idx];
- l1 = counts->lock_count[idx];
-#endif /* #else #ifdef CONFIG_TREE_SRCU */
c0 = l0 - u0;
c1 = l1 - u1;
@@ -609,7 +594,7 @@ static void srcu_torture_stats(void)
pr_cont("\n");
#elif defined(CONFIG_TINY_SRCU)
idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
- pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n",
+ pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
torture_type, TORTURE_FLAG, idx,
READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
deleted file mode 100644
index dea03614263f..000000000000
--- a/kernel/rcu/srcu.c
+++ /dev/null
@@ -1,661 +0,0 @@
-/*
- * Sleepable Read-Copy Update mechanism for mutual exclusion.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (C) IBM Corporation, 2006
- * Copyright (C) Fujitsu, 2012
- *
- * Author: Paul McKenney <paulmck@us.ibm.com>
- * Lai Jiangshan <laijs@cn.fujitsu.com>
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU/ *.txt
- *
- */
-
-#include <linux/export.h>
-#include <linux/mutex.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/delay.h>
-#include <linux/srcu.h>
-
-#include "rcu.h"
-
-/*
- * Initialize an rcu_batch structure to empty.
- */
-static inline void rcu_batch_init(struct rcu_batch *b)
-{
- b->head = NULL;
- b->tail = &b->head;
-}
-
-/*
- * Enqueue a callback onto the tail of the specified rcu_batch structure.
- */
-static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
-{
- *b->tail = head;
- b->tail = &head->next;
-}
-
-/*
- * Is the specified rcu_batch structure empty?
- */
-static inline bool rcu_batch_empty(struct rcu_batch *b)
-{
- return b->tail == &b->head;
-}
-
-/*
- * Remove the callback at the head of the specified rcu_batch structure
- * and return a pointer to it, or return NULL if the structure is empty.
- */
-static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
-{
- struct rcu_head *head;
-
- if (rcu_batch_empty(b))
- return NULL;
-
- head = b->head;
- b->head = head->next;
- if (b->tail == &head->next)
- rcu_batch_init(b);
-
- return head;
-}
-
-/*
- * Move all callbacks from the rcu_batch structure specified by "from" to
- * the structure specified by "to".
- */
-static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
-{
- if (!rcu_batch_empty(from)) {
- *to->tail = from->head;
- to->tail = from->tail;
- rcu_batch_init(from);
- }
-}
-
-static int init_srcu_struct_fields(struct srcu_struct *sp)
-{
- sp->completed = 0;
- spin_lock_init(&sp->queue_lock);
- sp->running = false;
- rcu_batch_init(&sp->batch_queue);
- rcu_batch_init(&sp->batch_check0);
- rcu_batch_init(&sp->batch_check1);
- rcu_batch_init(&sp->batch_done);
- INIT_DELAYED_WORK(&sp->work, process_srcu);
- sp->per_cpu_ref = alloc_percpu(struct srcu_array);
- return sp->per_cpu_ref ? 0 : -ENOMEM;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
- struct lock_class_key *key)
-{
- /* Don't re-initialize a lock while it is held. */
- debug_check_no_locks_freed((void *)sp, sizeof(*sp));
- lockdep_init_map(&sp->dep_map, name, key, 0);
- return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(__init_srcu_struct);
-
-#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/**
- * init_srcu_struct - initialize a sleep-RCU structure
- * @sp: structure to initialize.
- *
- * Must invoke this on a given srcu_struct before passing that srcu_struct
- * to any other function. Each srcu_struct represents a separate domain
- * of SRCU protection.
- */
-int init_srcu_struct(struct srcu_struct *sp)
-{
- return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(init_srcu_struct);
-
-#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/*
- * Returns approximate total of the readers' ->lock_count[] values for the
- * rank of per-CPU counters specified by idx.
- */
-static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
-{
- int cpu;
- unsigned long sum = 0;
-
- for_each_possible_cpu(cpu) {
- struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
- sum += READ_ONCE(cpuc->lock_count[idx]);
- }
- return sum;
-}
-
-/*
- * Returns approximate total of the readers' ->unlock_count[] values for the
- * rank of per-CPU counters specified by idx.
- */
-static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
-{
- int cpu;
- unsigned long sum = 0;
-
- for_each_possible_cpu(cpu) {
- struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
- sum += READ_ONCE(cpuc->unlock_count[idx]);
- }
- return sum;
-}
-
-/*
- * Return true if the number of pre-existing readers is determined to
- * be zero.
- */
-static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
-{
- unsigned long unlocks;
-
- unlocks = srcu_readers_unlock_idx(sp, idx);
-
- /*
- * Make sure that a lock is always counted if the corresponding unlock
- * is counted. Needs to be a smp_mb() as the read side may contain a
- * read from a variable that is written to before the synchronize_srcu()
- * in the write side. In this case smp_mb()s A and B act like the store
- * buffering pattern.
- *
- * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
- * synchronize_srcu() from being executed before the grace period ends.
- */
- smp_mb(); /* A */
-
- /*
- * If the locks are the same as the unlocks, then there must have
- * been no readers on this index at some time in between. This does not
- * mean that there are no more readers, as one could have read the
- * current index but not have incremented the lock counter yet.
- *
- * Possible bug: There is no guarantee that there haven't been ULONG_MAX
- * increments of ->lock_count[] since the unlocks were counted, meaning
- * that this could return true even if there are still active readers.
- * Since there are no memory barriers around srcu_flip(), the CPU is not
- * required to increment ->completed before running
- * srcu_readers_unlock_idx(), which means that there could be an
- * arbitrarily large number of critical sections that execute after
- * srcu_readers_unlock_idx() but use the old value of ->completed.
- */
- return srcu_readers_lock_idx(sp, idx) == unlocks;
-}
-
-/**
- * srcu_readers_active - returns true if there are readers. and false
- * otherwise
- * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
- *
- * Note that this is not an atomic primitive, and can therefore suffer
- * severe errors when invoked on an active srcu_struct. That said, it
- * can be useful as an error check at cleanup time.
- */
-static bool srcu_readers_active(struct srcu_struct *sp)
-{
- int cpu;
- unsigned long sum = 0;
-
- for_each_possible_cpu(cpu) {
- struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
- sum += READ_ONCE(cpuc->lock_count[0]);
- sum += READ_ONCE(cpuc->lock_count[1]);
- sum -= READ_ONCE(cpuc->unlock_count[0]);
- sum -= READ_ONCE(cpuc->unlock_count[1]);
- }
- return sum;
-}
-
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this only after you are finished using a given srcu_struct
- * that was initialized via init_srcu_struct(). This code does some
- * probabalistic checking, spotting late uses of srcu_read_lock(),
- * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
- * If any such late uses are detected, the per-CPU memory associated with
- * the srcu_struct is simply leaked and WARN_ON() is invoked. If the
- * caller frees the srcu_struct itself, a use-after-free crash will likely
- * ensue, but at least there will be a warning printed.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
-{
- if (WARN_ON(srcu_readers_active(sp)))
- return; /* Leakage unless caller handles error. */
- free_percpu(sp->per_cpu_ref);
- sp->per_cpu_ref = NULL;
-}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-
-/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct.
- * Returns an index that must be passed to the matching srcu_read_unlock().
- */
-int __srcu_read_lock(struct srcu_struct *sp)
-{
- int idx;
-
- idx = READ_ONCE(sp->completed) & 0x1;
- this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
- smp_mb(); /* B */ /* Avoid leaking the critical section. */
- return idx;
-}
-EXPORT_SYMBOL_GPL(__srcu_read_lock);
-
-/*
- * Removes the count for the old reader from the appropriate per-CPU
- * element of the srcu_struct. Note that this may well be a different
- * CPU than that which was incremented by the corresponding srcu_read_lock().
- */
-void __srcu_read_unlock(struct srcu_struct *sp, int idx)
-{
- smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
-}
-EXPORT_SYMBOL_GPL(__srcu_read_unlock);
-
-/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after 10 microseconds,
- * we repeatedly block for 1-millisecond time periods. This approach
- * has done well in testing, so there is no need for a config parameter.
- */
-#define SRCU_RETRY_CHECK_DELAY 5
-#define SYNCHRONIZE_SRCU_TRYCOUNT 2
-#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
-
-/*
- * @@@ Wait until all pre-existing readers complete. Such readers
- * will have used the index specified by "idx".
- * the caller should ensures the ->completed is not changed while checking
- * and idx = (->completed & 1) ^ 1
- */
-static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
-{
- for (;;) {
- if (srcu_readers_active_idx_check(sp, idx))
- return true;
- if (--trycount <= 0)
- return false;
- udelay(SRCU_RETRY_CHECK_DELAY);
- }
-}
-
-/*
- * Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->(un)lock_count[] arrays. This allows
- * us to wait for pre-existing readers in a starvation-free manner.
- */
-static void srcu_flip(struct srcu_struct *sp)
-{
- WRITE_ONCE(sp->completed, sp->completed + 1);
-
- /*
- * Ensure that if the updater misses an __srcu_read_unlock()
- * increment, that task's next __srcu_read_lock() will see the
- * above counter update. Note that both this memory barrier
- * and the one in srcu_readers_active_idx_check() provide the
- * guarantee for __srcu_read_lock().
- */
- smp_mb(); /* D */ /* Pairs with C. */
-}
-
-/*
- * Enqueue an SRCU callback on the specified srcu_struct structure,
- * initiating grace-period processing if it is not already running.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing SRCU read-side critical section. On systems with
- * more than one CPU, this means that when "func()" is invoked, each CPU
- * is guaranteed to have executed a full memory barrier since the end of
- * its last corresponding SRCU read-side critical section whose beginning
- * preceded the call to call_rcu(). It also means that each CPU executing
- * an SRCU read-side critical section that continues beyond the start of
- * "func()" must have executed a memory barrier after the call_rcu()
- * but before the beginning of that SRCU read-side critical section.
- * Note that these guarantees include CPUs that are offline, idle, or
- * executing in user mode, as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting SRCU callback function "func()", then both CPU A and CPU
- * B are guaranteed to execute a full memory barrier during the time
- * interval between the call to call_rcu() and the invocation of "func()".
- * This guarantee applies even if CPU A and CPU B are the same CPU (but
- * again only if the system has more than one CPU).
- *
- * Of course, these guarantees apply only for invocations of call_srcu(),
- * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
- * srcu_struct structure.
- */
-void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
- rcu_callback_t func)
-{
- unsigned long flags;
-
- head->next = NULL;
- head->func = func;
- spin_lock_irqsave(&sp->queue_lock, flags);
- smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
- rcu_batch_queue(&sp->batch_queue, head);
- if (!sp->running) {
- sp->running = true;
- queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
- }
- spin_unlock_irqrestore(&sp->queue_lock, flags);
-}
-EXPORT_SYMBOL_GPL(call_srcu);
-
-static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
-static void srcu_reschedule(struct srcu_struct *sp);
-
-/*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
- */
-static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
-{
- struct rcu_synchronize rcu;
- struct rcu_head *head = &rcu.head;
- bool done = false;
-
- RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
- lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
-
- might_sleep();
- init_completion(&rcu.completion);
-
- head->next = NULL;
- head->func = wakeme_after_rcu;
- spin_lock_irq(&sp->queue_lock);
- smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
- if (!sp->running) {
- /* steal the processing owner */
- sp->running = true;
- rcu_batch_queue(&sp->batch_check0, head);
- spin_unlock_irq(&sp->queue_lock);
-
- srcu_advance_batches(sp, trycount);
- if (!rcu_batch_empty(&sp->batch_done)) {
- BUG_ON(sp->batch_done.head != head);
- rcu_batch_dequeue(&sp->batch_done);
- done = true;
- }
- /* give the processing owner to work_struct */
- srcu_reschedule(sp);
- } else {
- rcu_batch_queue(&sp->batch_queue, head);
- spin_unlock_irq(&sp->queue_lock);
- }
-
- if (!done) {
- wait_for_completion(&rcu.completion);
- smp_mb(); /* Caller's later accesses after GP. */
- }
-
-}
-
-/**
- * synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
- *
- * Wait for the count to drain to zero of both indexes. To avoid the
- * possible starvation of synchronize_srcu(), it waits for the count of
- * the index=((->completed & 1) ^ 1) to drain to zero at first,
- * and then flip the completed and wait for the count of the other index.
- *
- * Can block; must be called from process context.
- *
- * Note that it is illegal to call synchronize_srcu() from the corresponding
- * SRCU read-side critical section; doing so will result in deadlock.
- * However, it is perfectly legal to call synchronize_srcu() on one
- * srcu_struct from some other srcu_struct's read-side critical section,
- * as long as the resulting graph of srcu_structs is acyclic.
- *
- * There are memory-ordering constraints implied by synchronize_srcu().
- * On systems with more than one CPU, when synchronize_srcu() returns,
- * each CPU is guaranteed to have executed a full memory barrier since
- * the end of its last corresponding SRCU-sched read-side critical section
- * whose beginning preceded the call to synchronize_srcu(). In addition,
- * each CPU having an SRCU read-side critical section that extends beyond
- * the return from synchronize_srcu() is guaranteed to have executed a
- * full memory barrier after the beginning of synchronize_srcu() and before
- * the beginning of that SRCU read-side critical section. Note that these
- * guarantees include CPUs that are offline, idle, or executing in user mode,
- * as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked synchronize_srcu(), which returned
- * to its caller on CPU B, then both CPU A and CPU B are guaranteed
- * to have executed a full memory barrier during the execution of
- * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
- * are the same CPU, but again only if the system has more than one CPU.
- *
- * Of course, these memory-ordering guarantees apply only when
- * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
- * passed the same srcu_struct structure.
- */
-void synchronize_srcu(struct srcu_struct *sp)
-{
- __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal())
- ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
- : SYNCHRONIZE_SRCU_TRYCOUNT);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu);
-
-/**
- * synchronize_srcu_expedited - Brute-force SRCU grace period
- * @sp: srcu_struct with which to synchronize.
- *
- * Wait for an SRCU grace period to elapse, but be more aggressive about
- * spinning rather than blocking when waiting.
- *
- * Note that synchronize_srcu_expedited() has the same deadlock and
- * memory-ordering properties as does synchronize_srcu().
- */
-void synchronize_srcu_expedited(struct srcu_struct *sp)
-{
- __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
-
-/**
- * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
- * @sp: srcu_struct on which to wait for in-flight callbacks.
- */
-void srcu_barrier(struct srcu_struct *sp)
-{
- synchronize_srcu(sp);
-}
-EXPORT_SYMBOL_GPL(srcu_barrier);
-
-/**
- * srcu_batches_completed - return batches completed.
- * @sp: srcu_struct on which to report batch completion.
- *
- * Report the number of batches, correlated with, but not necessarily
- * precisely the same as, the number of grace periods that have elapsed.
- */
-unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
- return sp->completed;
-}
-EXPORT_SYMBOL_GPL(srcu_batches_completed);
-
-#define SRCU_CALLBACK_BATCH 10
-#define SRCU_INTERVAL 1
-
-/*
- * Move any new SRCU callbacks to the first stage of the SRCU grace
- * period pipeline.
- */
-static void srcu_collect_new(struct srcu_struct *sp)
-{
- if (!rcu_batch_empty(&sp->batch_queue)) {
- spin_lock_irq(&sp->queue_lock);
- rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
- spin_unlock_irq(&sp->queue_lock);
- }
-}
-
-/*
- * Core SRCU state machine. Advance callbacks from ->batch_check0 to
- * ->batch_check1 and then to ->batch_done as readers drain.
- */
-static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
-{
- int idx = 1 ^ (sp->completed & 1);
-
- /*
- * Because readers might be delayed for an extended period after
- * fetching ->completed for their index, at any point in time there
- * might well be readers using both idx=0 and idx=1. We therefore
- * need to wait for readers to clear from both index values before
- * invoking a callback.
- */
-
- if (rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_check1))
- return; /* no callbacks need to be advanced */
-
- if (!try_check_zero(sp, idx, trycount))
- return; /* failed to advance, will try after SRCU_INTERVAL */
-
- /*
- * The callbacks in ->batch_check1 have already done with their
- * first zero check and flip back when they were enqueued on
- * ->batch_check0 in a previous invocation of srcu_advance_batches().
- * (Presumably try_check_zero() returned false during that
- * invocation, leaving the callbacks stranded on ->batch_check1.)
- * They are therefore ready to invoke, so move them to ->batch_done.
- */
- rcu_batch_move(&sp->batch_done, &sp->batch_check1);
-
- if (rcu_batch_empty(&sp->batch_check0))
- return; /* no callbacks need to be advanced */
- srcu_flip(sp);
-
- /*
- * The callbacks in ->batch_check0 just finished their
- * first check zero and flip, so move them to ->batch_check1
- * for future checking on the other idx.
- */
- rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
-
- /*
- * SRCU read-side critical sections are normally short, so check
- * at least twice in quick succession after a flip.
- */
- trycount = trycount < 2 ? 2 : trycount;
- if (!try_check_zero(sp, idx^1, trycount))
- return; /* failed to advance, will try after SRCU_INTERVAL */
-
- /*
- * The callbacks in ->batch_check1 have now waited for all
- * pre-existing readers using both idx values. They are therefore
- * ready to invoke, so move them to ->batch_done.
- */
- rcu_batch_move(&sp->batch_done, &sp->batch_check1);
-}
-
-/*
- * Invoke a limited number of SRCU callbacks that have passed through
- * their grace period. If there are more to do, SRCU will reschedule
- * the workqueue. Note that needed memory barriers have been executed
- * in this task's context by srcu_readers_active_idx_check().
- */
-static void srcu_invoke_callbacks(struct srcu_struct *sp)
-{
- int i;
- struct rcu_head *head;
-
- for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
- head = rcu_batch_dequeue(&sp->batch_done);
- if (!head)
- break;
- local_bh_disable();
- head->func(head);
- local_bh_enable();
- }
-}
-
-/*
- * Finished one round of SRCU grace period. Start another if there are
- * more SRCU callbacks queued, otherwise put SRCU into not-running state.
- */
-static void srcu_reschedule(struct srcu_struct *sp)
-{
- bool pending = true;
-
- if (rcu_batch_empty(&sp->batch_done) &&
- rcu_batch_empty(&sp->batch_check1) &&
- rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_queue)) {
- spin_lock_irq(&sp->queue_lock);
- if (rcu_batch_empty(&sp->batch_done) &&
- rcu_batch_empty(&sp->batch_check1) &&
- rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_queue)) {
- sp->running = false;
- pending = false;
- }
- spin_unlock_irq(&sp->queue_lock);
- }
-
- if (pending)
- queue_delayed_work(system_power_efficient_wq,
- &sp->work, SRCU_INTERVAL);
-}
-
-/*
- * This is the work-queue function that handles SRCU grace periods.
- */
-void process_srcu(struct work_struct *work)
-{
- struct srcu_struct *sp;
-
- sp = container_of(work, struct srcu_struct, work.work);
-
- srcu_collect_new(sp);
- srcu_advance_batches(sp, 1);
- srcu_invoke_callbacks(sp);
- srcu_reschedule(sp);
-}
-EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 32798eb14853..1a1c1047d2ed 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -38,8 +38,8 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
sp->srcu_lock_nesting[0] = 0;
sp->srcu_lock_nesting[1] = 0;
init_swait_queue_head(&sp->srcu_wq);
- sp->srcu_gp_seq = 0;
- rcu_segcblist_init(&sp->srcu_cblist);
+ sp->srcu_cb_head = NULL;
+ sp->srcu_cb_tail = &sp->srcu_cb_head;
sp->srcu_gp_running = false;
sp->srcu_gp_waiting = false;
sp->srcu_idx = 0;
@@ -88,30 +88,14 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
{
WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
flush_work(&sp->srcu_work);
- WARN_ON(rcu_seq_state(sp->srcu_gp_seq));
WARN_ON(sp->srcu_gp_running);
WARN_ON(sp->srcu_gp_waiting);
- WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist));
+ WARN_ON(sp->srcu_cb_head);
+ WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct. Can be invoked from irq/bh handlers, but the matching
- * __srcu_read_unlock() must be in the same handler instance. Returns an
- * index that must be passed to the matching srcu_read_unlock().
- */
-int __srcu_read_lock(struct srcu_struct *sp)
-{
- int idx;
-
- idx = READ_ONCE(sp->srcu_idx);
- WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
- return idx;
-}
-EXPORT_SYMBOL_GPL(__srcu_read_lock);
-
-/*
* Removes the count for the old reader from the appropriate element of
* the srcu_struct.
*/
@@ -133,52 +117,44 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
void srcu_drive_gp(struct work_struct *wp)
{
int idx;
- struct rcu_cblist ready_cbs;
- struct srcu_struct *sp;
+ struct rcu_head *lh;
struct rcu_head *rhp;
+ struct srcu_struct *sp;
sp = container_of(wp, struct srcu_struct, srcu_work);
- if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist))
+ if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head))
return; /* Already running or nothing to do. */
- /* Tag recently arrived callbacks and wait for readers. */
+ /* Remove recently arrived callbacks and wait for readers. */
WRITE_ONCE(sp->srcu_gp_running, true);
- rcu_segcblist_accelerate(&sp->srcu_cblist,
- rcu_seq_snap(&sp->srcu_gp_seq));
- rcu_seq_start(&sp->srcu_gp_seq);
+ local_irq_disable();
+ lh = sp->srcu_cb_head;
+ sp->srcu_cb_head = NULL;
+ sp->srcu_cb_tail = &sp->srcu_cb_head;
+ local_irq_enable();
idx = sp->srcu_idx;
WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
- rcu_seq_end(&sp->srcu_gp_seq);
-
- /* Update callback list based on GP, and invoke ready callbacks. */
- rcu_segcblist_advance(&sp->srcu_cblist,
- rcu_seq_current(&sp->srcu_gp_seq));
- if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) {
- rcu_cblist_init(&ready_cbs);
- local_irq_disable();
- rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs);
- local_irq_enable();
- rhp = rcu_cblist_dequeue(&ready_cbs);
- for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
- local_bh_disable();
- rhp->func(rhp);
- local_bh_enable();
- }
- local_irq_disable();
- rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs);
- local_irq_enable();
+
+ /* Invoke the callbacks we removed above. */
+ while (lh) {
+ rhp = lh;
+ lh = lh->next;
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
}
- WRITE_ONCE(sp->srcu_gp_running, false);
/*
- * If more callbacks, reschedule ourselves. This can race with
- * a call_srcu() at interrupt level, but the ->srcu_gp_running
- * checks will straighten that out.
+ * Enable rescheduling, and if there are more callbacks,
+ * reschedule ourselves. This can race with a call_srcu()
+ * at interrupt level, but the ->srcu_gp_running checks will
+ * straighten that out.
*/
- if (!rcu_segcblist_empty(&sp->srcu_cblist))
+ WRITE_ONCE(sp->srcu_gp_running, false);
+ if (READ_ONCE(sp->srcu_cb_head))
schedule_work(&sp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -187,14 +163,16 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp);
* Enqueue an SRCU callback on the specified srcu_struct structure,
* initiating grace-period processing if it is not already running.
*/
-void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
rcu_callback_t func)
{
unsigned long flags;
- head->func = func;
+ rhp->func = func;
+ rhp->next = NULL;
local_irq_save(flags);
- rcu_segcblist_enqueue(&sp->srcu_cblist, head, false);
+ *sp->srcu_cb_tail = rhp;
+ sp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
if (!READ_ONCE(sp->srcu_gp_running))
schedule_work(&sp->srcu_work);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 157654fa436a..d0ca524bf042 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -40,9 +40,15 @@
#include "rcu.h"
#include "rcu_segcblist.h"
-ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */
+/* Holdoff in nanoseconds for auto-expediting. */
+#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000)
+static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
module_param(exp_holdoff, ulong, 0444);
+/* Overflow-check frequency. N bits roughly says every 2**N grace periods. */
+static ulong counter_wrap_check = (ULONG_MAX >> 2);
+module_param(counter_wrap_check, ulong, 0444);
+
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
@@ -70,7 +76,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
/* Each pass through this loop initializes one srcu_node structure. */
rcu_for_each_node_breadth_first(sp, snp) {
- spin_lock_init(&snp->lock);
+ raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
@@ -104,7 +110,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
snp_first = sp->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(sp->sda, cpu);
- spin_lock_init(&sdp->lock);
+ raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
@@ -163,7 +169,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)sp, sizeof(*sp));
lockdep_init_map(&sp->dep_map, name, key, 0);
- spin_lock_init(&sp->gp_lock);
+ raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
return init_srcu_struct_fields(sp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -180,7 +186,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
*/
int init_srcu_struct(struct srcu_struct *sp)
{
- spin_lock_init(&sp->gp_lock);
+ raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
return init_srcu_struct_fields(sp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -191,7 +197,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
* First-use initialization of statically allocated srcu_struct
* structure. Wiring up the combining tree is more than can be
* done with compile-time initialization, so this check is added
- * to each update-side SRCU primitive. Use ->gp_lock, which -is-
+ * to each update-side SRCU primitive. Use sp->lock, which -is-
* compile-time initialized, to resolve races involving multiple
* CPUs trying to garner first-use privileges.
*/
@@ -203,13 +209,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp)
/* The smp_load_acquire() pairs with the smp_store_release(). */
if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave(&sp->gp_lock, flags);
+ raw_spin_lock_irqsave_rcu_node(sp, flags);
if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore(&sp->gp_lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
return;
}
init_srcu_struct_fields(sp, true);
- spin_unlock_irqrestore(&sp->gp_lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -275,15 +281,20 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
* not mean that there are no more readers, as one could have read
* the current index but not have incremented the lock counter yet.
*
- * Possible bug: There is no guarantee that there haven't been
- * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were
- * counted, meaning that this could return true even if there are
- * still active readers. Since there are no memory barriers around
- * srcu_flip(), the CPU is not required to increment ->srcu_idx
- * before running srcu_readers_unlock_idx(), which means that there
- * could be an arbitrarily large number of critical sections that
- * execute after srcu_readers_unlock_idx() but use the old value
- * of ->srcu_idx.
+ * So suppose that the updater is preempted here for so long
+ * that more than ULONG_MAX non-nested readers come and go in
+ * the meantime. It turns out that this cannot result in overflow
+ * because if a reader modifies its unlock count after we read it
+ * above, then that reader's next load of ->srcu_idx is guaranteed
+ * to get the new value, which will cause it to operate on the
+ * other bank of counters, where it cannot contribute to the
+ * overflow of these counters. This means that there is a maximum
+ * of 2*NR_CPUS increments, which cannot overflow given current
+ * systems, especially not on 64-bit systems.
+ *
+ * OK, how about nesting? This does impose a limit on nesting
+ * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
+ * especially on 64-bit systems.
*/
return srcu_readers_lock_idx(sp, idx) == unlocks;
}
@@ -400,8 +411,7 @@ static void srcu_gp_start(struct srcu_struct *sp)
struct srcu_data *sdp = this_cpu_ptr(sp->sda);
int state;
- RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock),
- "Invoked srcu_gp_start() without ->gp_lock!");
+ lockdep_assert_held(&sp->lock);
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
@@ -489,17 +499,20 @@ static void srcu_gp_end(struct srcu_struct *sp)
{
unsigned long cbdelay;
bool cbs;
+ int cpu;
+ unsigned long flags;
unsigned long gpseq;
int idx;
int idxnext;
unsigned long mask;
+ struct srcu_data *sdp;
struct srcu_node *snp;
/* Prevent more than one additional grace period. */
mutex_lock(&sp->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq(&sp->gp_lock);
+ raw_spin_lock_irq_rcu_node(sp);
idx = rcu_seq_state(sp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
cbdelay = srcu_get_delay(sp);
@@ -508,7 +521,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
gpseq = rcu_seq_current(&sp->srcu_gp_seq);
if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
sp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
mutex_unlock(&sp->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
@@ -516,7 +529,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
rcu_for_each_node_breadth_first(sp, snp) {
- spin_lock_irq(&snp->lock);
+ raw_spin_lock_irq_rcu_node(snp);
cbs = false;
if (snp >= sp->level[rcu_num_lvls - 1])
cbs = snp->srcu_have_cbs[idx] == gpseq;
@@ -526,28 +539,37 @@ static void srcu_gp_end(struct srcu_struct *sp)
snp->srcu_gp_seq_needed_exp = gpseq;
mask = snp->srcu_data_have_cbs[idx];
snp->srcu_data_have_cbs[idx] = 0;
- spin_unlock_irq(&snp->lock);
- if (cbs) {
- smp_mb(); /* GP end before CB invocation. */
+ raw_spin_unlock_irq_rcu_node(snp);
+ if (cbs)
srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
- }
+
+ /* Occasionally prevent srcu_data counter wrap. */
+ if (!(gpseq & counter_wrap_check))
+ for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
+ sdp = per_cpu_ptr(sp->sda, cpu);
+ raw_spin_lock_irqsave_rcu_node(sdp, flags);
+ if (ULONG_CMP_GE(gpseq,
+ sdp->srcu_gp_seq_needed + 100))
+ sdp->srcu_gp_seq_needed = gpseq;
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
+ }
}
/* Callback initiation done, allow grace periods after next. */
mutex_unlock(&sp->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq(&sp->gp_lock);
+ raw_spin_lock_irq_rcu_node(sp);
gpseq = rcu_seq_current(&sp->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
srcu_gp_start(sp);
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
/* Throttle expedited grace periods: Should be rare! */
srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
? 0 : SRCU_INTERVAL);
} else {
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
}
}
@@ -567,18 +589,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
return;
- spin_lock_irqsave(&snp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(snp, flags);
if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
- spin_unlock_irqrestore(&snp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
return;
}
WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore(&snp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
}
- spin_lock_irqsave(&sp->gp_lock, flags);
+ raw_spin_lock_irqsave_rcu_node(sp, flags);
if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
sp->srcu_gp_seq_needed_exp = s;
- spin_unlock_irqrestore(&sp->gp_lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -600,14 +622,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
for (; snp != NULL; snp = snp->srcu_parent) {
if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
return; /* GP already done and CBs recorded. */
- spin_lock_irqsave(&snp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(snp, flags);
if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
snp_seq = snp->srcu_have_cbs[idx];
if (snp == sdp->mynode && snp_seq == s)
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- spin_unlock_irqrestore(&snp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
if (snp == sdp->mynode && snp_seq != s) {
- smp_mb(); /* CBs after GP! */
srcu_schedule_cbs_sdp(sdp, do_norm
? SRCU_INTERVAL
: 0);
@@ -622,11 +643,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
snp->srcu_gp_seq_needed_exp = s;
- spin_unlock_irqrestore(&snp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
}
/* Top of tree, must ensure the grace period will be started. */
- spin_lock_irqsave(&sp->gp_lock, flags);
+ raw_spin_lock_irqsave_rcu_node(sp, flags);
if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
@@ -645,7 +666,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
queue_delayed_work(system_power_efficient_wq, &sp->work,
srcu_get_delay(sp));
}
- spin_unlock_irqrestore(&sp->gp_lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -671,6 +692,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
*/
static void srcu_flip(struct srcu_struct *sp)
{
+ /*
+ * Ensure that if this updater saw a given reader's increment
+ * from __srcu_read_lock(), that reader was using an old value
+ * of ->srcu_idx. Also ensure that if a given reader sees the
+ * new value of ->srcu_idx, this updater's earlier scans cannot
+ * have seen that reader's increments (which is OK, because this
+ * grace period need not wait on that reader).
+ */
+ smp_mb(); /* E */ /* Pairs with B and C. */
+
WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
/*
@@ -745,6 +776,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
}
/*
+ * SRCU callback function to leak a callback.
+ */
+static void srcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
+/*
* Enqueue an SRCU callback on the srcu_data structure associated with
* the current CPU and the specified srcu_struct structure, initiating
* grace-period processing if it is not already running.
@@ -782,10 +820,16 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
struct srcu_data *sdp;
check_init_srcu_struct(sp);
+ if (debug_rcu_head_queue(rhp)) {
+ /* Probable double call_srcu(), so leak the callback. */
+ WRITE_ONCE(rhp->func, srcu_leak_callback);
+ WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n");
+ return;
+ }
rhp->func = func;
local_irq_save(flags);
sdp = this_cpu_ptr(sp->sda);
- spin_lock(&sdp->lock);
+ raw_spin_lock_rcu_node(sdp);
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
@@ -799,13 +843,30 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
sdp->srcu_gp_seq_needed_exp = s;
needexp = true;
}
- spin_unlock_irqrestore(&sdp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
if (needgp)
srcu_funnel_gp_start(sp, sdp, s, do_norm);
else if (needexp)
srcu_funnel_exp_start(sp, sdp->mynode, s);
}
+/**
+ * call_srcu() - Queue a callback for invocation after an SRCU grace period
+ * @sp: srcu_struct in which to queue the callback
+ * @rhp: structure to be used for queueing the SRCU callback.
+ * @func: function to be invoked after the SRCU grace period
+ *
+ * The callback function will be invoked some time after a full SRCU
+ * grace period elapses, in other words after all pre-existing SRCU
+ * read-side critical sections have completed. However, the callback
+ * function might well execute concurrently with other SRCU read-side
+ * critical sections that started after call_srcu() was invoked. SRCU
+ * read-side critical sections are delimited by srcu_read_lock() and
+ * srcu_read_unlock(), and may be nested.
+ *
+ * The callback will be invoked from process context, but must nevertheless
+ * be fast and must not block.
+ */
void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
rcu_callback_t func)
{
@@ -953,13 +1014,16 @@ void srcu_barrier(struct srcu_struct *sp)
*/
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(sp->sda, cpu);
- spin_lock_irq(&sdp->lock);
+ raw_spin_lock_irq_rcu_node(sdp);
atomic_inc(&sp->srcu_barrier_cpu_cnt);
sdp->srcu_barrier_head.func = srcu_barrier_cb;
+ debug_rcu_head_queue(&sdp->srcu_barrier_head);
if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
- &sdp->srcu_barrier_head, 0))
+ &sdp->srcu_barrier_head, 0)) {
+ debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
atomic_dec(&sp->srcu_barrier_cpu_cnt);
- spin_unlock_irq(&sdp->lock);
+ }
+ raw_spin_unlock_irq_rcu_node(sdp);
}
/* Remove the initial count, at which point reaching zero can happen. */
@@ -1008,17 +1072,17 @@ static void srcu_advance_state(struct srcu_struct *sp)
*/
idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq(&sp->gp_lock);
+ raw_spin_lock_irq_rcu_node(sp);
if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
mutex_unlock(&sp->srcu_gp_mutex);
return;
}
idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(sp);
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
if (idx != SRCU_STATE_IDLE) {
mutex_unlock(&sp->srcu_gp_mutex);
return; /* Someone else started the grace period. */
@@ -1067,22 +1131,22 @@ static void srcu_invoke_callbacks(struct work_struct *work)
sdp = container_of(work, struct srcu_data, work.work);
sp = sdp->sp;
rcu_cblist_init(&ready_cbs);
- spin_lock_irq(&sdp->lock);
- smp_mb(); /* Old grace periods before callback invocation! */
+ raw_spin_lock_irq_rcu_node(sdp);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
- spin_unlock_irq(&sdp->lock);
+ raw_spin_unlock_irq_rcu_node(sdp);
return; /* Someone else on the job or nothing to do. */
}
/* We are on the job! Extract and invoke ready callbacks. */
sdp->srcu_cblist_invoking = true;
rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
- spin_unlock_irq(&sdp->lock);
+ raw_spin_unlock_irq_rcu_node(sdp);
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
+ debug_rcu_head_unqueue(rhp);
local_bh_disable();
rhp->func(rhp);
local_bh_enable();
@@ -1092,13 +1156,13 @@ static void srcu_invoke_callbacks(struct work_struct *work)
* Update counts, accelerate new callbacks, and if needed,
* schedule another round of callback invocation.
*/
- spin_lock_irq(&sdp->lock);
+ raw_spin_lock_irq_rcu_node(sdp);
rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
rcu_seq_snap(&sp->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
- spin_unlock_irq(&sdp->lock);
+ raw_spin_unlock_irq_rcu_node(sdp);
if (more)
srcu_schedule_cbs_sdp(sdp, 0);
}
@@ -1111,7 +1175,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq(&sp->gp_lock);
+ raw_spin_lock_irq_rcu_node(sp);
if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
@@ -1121,7 +1185,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
/* Outstanding request and no GP. Start one. */
srcu_gp_start(sp);
}
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
if (pushgp)
queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
@@ -1152,3 +1216,12 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
*gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+
+static int __init srcu_bootup_announce(void)
+{
+ pr_info("Hierarchical SRCU implementation.\n");
+ if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
+ pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+ return 0;
+}
+early_initcall(srcu_bootup_announce);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index e5385731e391..f8488965250f 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,15 +35,26 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
-#include <linux/trace_events.h>
#include "rcu.h"
-/* Forward declarations for tiny_plugin.h. */
-struct rcu_ctrlblk;
-static void __call_rcu(struct rcu_head *head,
- rcu_callback_t func,
- struct rcu_ctrlblk *rcp);
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .donetail = &rcu_bh_ctrlblk.rcucblist,
+ .curtail = &rcu_bh_ctrlblk.rcucblist,
+};
#include "tiny_plugin.h"
@@ -59,19 +70,6 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL(rcu_barrier_sched);
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
-
-/*
- * Test whether RCU thinks that the current CPU is idle.
- */
-bool notrace __rcu_is_watching(void)
-{
- return true;
-}
-EXPORT_SYMBOL(__rcu_is_watching);
-
-#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
-
/*
* Helper function for rcu_sched_qs() and rcu_bh_qs().
* Also irqs are disabled to avoid confusion due to interrupt handlers
@@ -79,7 +77,6 @@ EXPORT_SYMBOL(__rcu_is_watching);
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
- RCU_TRACE(reset_cpu_stall_ticks(rcp);)
if (rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
return 1;
@@ -125,7 +122,6 @@ void rcu_bh_qs(void)
*/
void rcu_check_callbacks(int user)
{
- RCU_TRACE(check_cpu_stalls();)
if (user)
rcu_sched_qs();
else if (!in_softirq())
@@ -140,10 +136,8 @@ void rcu_check_callbacks(int user)
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
- const char *rn = NULL;
struct rcu_head *next, *list;
unsigned long flags;
- RCU_TRACE(int cb_count = 0;)
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
@@ -152,7 +146,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
return;
}
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
list = rcp->rcucblist;
rcp->rcucblist = *rcp->donetail;
*rcp->donetail = NULL;
@@ -162,22 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
/* Invoke the callbacks on the local list. */
- RCU_TRACE(rn = rcp->name;)
while (list) {
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
local_bh_disable();
- __rcu_reclaim(rn, list);
+ __rcu_reclaim("", list);
local_bh_enable();
list = next;
- RCU_TRACE(cb_count++;)
}
- RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
- RCU_TRACE(trace_rcu_batch_end(rcp->name,
- cb_count, 0, need_resched(),
- is_idle_task(current),
- false));
}
static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
@@ -221,7 +207,6 @@ static void __call_rcu(struct rcu_head *head,
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
- RCU_TRACE(rcp->qlen++;)
local_irq_restore(flags);
if (unlikely(is_idle_task(current))) {
@@ -254,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
-
rcu_early_boot_tests();
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 371034e77f87..f0a01b2a3062 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -22,36 +22,6 @@
* Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
-#include <linux/kthread.h>
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
- struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
- struct rcu_head **curtail; /* ->next pointer of last CB. */
- RCU_TRACE(long qlen); /* Number of pending CBs. */
- RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
- RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
- RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
- RCU_TRACE(const char *name); /* Name of RCU type. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
- .donetail = &rcu_sched_ctrlblk.rcucblist,
- .curtail = &rcu_sched_ctrlblk.rcucblist,
- RCU_TRACE(.name = "rcu_sched")
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .donetail = &rcu_bh_ctrlblk.rcucblist,
- .curtail = &rcu_bh_ctrlblk.rcucblist,
- RCU_TRACE(.name = "rcu_bh")
-};
-
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
#include <linux/kernel_stat.h>
@@ -75,96 +45,3 @@ void __init rcu_scheduler_starting(void)
}
#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
-
-#ifdef CONFIG_RCU_TRACE
-
-static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcp->qlen -= n;
- local_irq_restore(flags);
-}
-
-/*
- * Dump statistics for TINY_RCU, such as they are.
- */
-static int show_tiny_stats(struct seq_file *m, void *unused)
-{
- seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
- seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
- return 0;
-}
-
-static int show_tiny_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_tiny_stats, NULL);
-}
-
-static const struct file_operations show_tiny_stats_fops = {
- .owner = THIS_MODULE,
- .open = show_tiny_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutiny_trace_init(void)
-{
- struct dentry *retval;
-
- rcudir = debugfs_create_dir("rcu", NULL);
- if (!rcudir)
- goto free_out;
- retval = debugfs_create_file("rcudata", 0444, rcudir,
- NULL, &show_tiny_stats_fops);
- if (!retval)
- goto free_out;
- return 0;
-free_out:
- debugfs_remove_recursive(rcudir);
- return 1;
-}
-device_initcall(rcutiny_trace_init);
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- unsigned long j;
- unsigned long js;
-
- if (rcu_cpu_stall_suppress)
- return;
- rcp->ticks_this_gp++;
- j = jiffies;
- js = READ_ONCE(rcp->jiffies_stall);
- if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
- pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
- rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
- jiffies - rcp->gp_start, rcp->qlen);
- dump_stack();
- WRITE_ONCE(rcp->jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
- } else if (ULONG_CMP_GE(j, js)) {
- WRITE_ONCE(rcp->jiffies_stall,
- jiffies + rcu_jiffies_till_stall_check());
- }
-}
-
-static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
-{
- rcp->ticks_this_gp = 0;
- rcp->gp_start = jiffies;
- WRITE_ONCE(rcp->jiffies_stall,
- jiffies + rcu_jiffies_till_stall_check());
-}
-
-static void check_cpu_stalls(void)
-{
- RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
- RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e354e475e645..51d4c3acf32d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -168,35 +168,17 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp,
static void sync_sched_exp_online_cleanup(int cpu);
/* rcuc/rcub kthread realtime priority */
-#ifdef CONFIG_RCU_KTHREAD_PRIO
-static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
-#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
-#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
module_param(kthread_prio, int, 0644);
/* Delay in jiffies for grace-period initialization delays, debug only. */
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
-static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
-module_param(gp_preinit_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
-static const int gp_preinit_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
-
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
-static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
-module_param(gp_init_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-static const int gp_init_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
-static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
-module_param(gp_cleanup_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
-static const int gp_cleanup_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+static int gp_preinit_delay;
+module_param(gp_preinit_delay, int, 0444);
+static int gp_init_delay;
+module_param(gp_init_delay, int, 0444);
+static int gp_cleanup_delay;
+module_param(gp_cleanup_delay, int, 0444);
/*
* Number of grace periods between delays, normalized by the duration of
@@ -250,6 +232,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
*/
void rcu_sched_qs(void)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!");
if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
return;
trace_rcu_grace_period(TPS("rcu_sched"),
@@ -265,6 +248,7 @@ void rcu_sched_qs(void)
void rcu_bh_qs(void)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_bh"),
__this_cpu_read(rcu_bh_data.gpnum),
@@ -286,10 +270,6 @@ void rcu_bh_qs(void)
static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
- .dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};
/*
@@ -478,7 +458,7 @@ void rcu_note_context_switch(bool preempt)
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs();
- rcu_preempt_note_context_switch();
+ rcu_preempt_note_context_switch(preempt);
/* Load rcu_urgent_qs before other flags. */
if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
goto out;
@@ -534,9 +514,12 @@ void rcu_all_qs(void)
}
EXPORT_SYMBOL_GPL(rcu_all_qs);
-static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
-static long qhimark = 10000; /* If this many pending, ignore blimit. */
-static long qlowmark = 100; /* Once only this many pending, use blimit. */
+#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */
+static long blimit = DEFAULT_RCU_BLIMIT;
+#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
+static long qhimark = DEFAULT_RCU_QHIMARK;
+#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
+static long qlowmark = DEFAULT_RCU_QLOMARK;
module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
@@ -559,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644);
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
-static void force_qs_rnp(struct rcu_state *rsp,
- int (*f)(struct rcu_data *rsp, bool *isidle,
- unsigned long *maxj),
- bool *isidle, unsigned long *maxj);
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(void);
@@ -757,6 +737,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
int *fp = &rnp->need_future_gp[idx];
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!");
return READ_ONCE(*fp);
}
@@ -768,6 +749,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!");
if (rcu_gp_in_progress(rsp))
return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
@@ -794,6 +776,7 @@ static void rcu_eqs_enter_common(bool user)
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!");
trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
!user && !is_idle_task(current)) {
@@ -864,7 +847,6 @@ void rcu_idle_enter(void)
local_irq_save(flags);
rcu_eqs_enter(false);
- rcu_sysidle_enter(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -914,7 +896,6 @@ void rcu_irq_exit(void)
trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
rdtp->dynticks_nesting--;
}
- rcu_sysidle_enter(1);
}
/*
@@ -967,6 +948,7 @@ static void rcu_eqs_exit(bool user)
struct rcu_dynticks *rdtp;
long long oldval;
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
@@ -995,7 +977,6 @@ void rcu_idle_exit(void)
local_irq_save(flags);
rcu_eqs_exit(false);
- rcu_sysidle_exit(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -1047,7 +1028,6 @@ void rcu_irq_enter(void)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
rcu_eqs_exit_common(oldval, true);
- rcu_sysidle_exit(1);
}
/*
@@ -1130,22 +1110,11 @@ void rcu_nmi_exit(void)
}
/**
- * __rcu_is_watching - are RCU read-side critical sections safe?
- *
- * Return true if RCU is watching the running CPU, which means that
- * this CPU can safely enter RCU read-side critical sections. Unlike
- * rcu_is_watching(), the caller of __rcu_is_watching() must have at
- * least disabled preemption.
- */
-bool notrace __rcu_is_watching(void)
-{
- return !rcu_dynticks_curr_cpu_in_eqs();
-}
-
-/**
* rcu_is_watching - see if RCU thinks that the current CPU is idle
*
- * If the current CPU is in its idle loop and is neither in an interrupt
+ * Return true if RCU is watching the running CPU, which means that this
+ * CPU can safely enter RCU read-side critical sections. In other words,
+ * if the current CPU is not in its idle loop or is in an interrupt
* or NMI handler, return true.
*/
bool notrace rcu_is_watching(void)
@@ -1153,7 +1122,7 @@ bool notrace rcu_is_watching(void)
bool ret;
preempt_disable_notrace();
- ret = __rcu_is_watching();
+ ret = !rcu_dynticks_curr_cpu_in_eqs();
preempt_enable_notrace();
return ret;
}
@@ -1237,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void)
* credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state.
*/
-static int dyntick_save_progress_counter(struct rcu_data *rdp,
- bool *isidle, unsigned long *maxj)
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
- rcu_sysidle_check_cpu(rdp, isidle, maxj);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
@@ -1258,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
* idle state since the last call to dyntick_save_progress_counter()
* for this same CPU, or by virtue of having been offline.
*/
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
- bool *isidle, unsigned long *maxj)
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
unsigned long jtsq;
bool *rnhqp;
@@ -1674,6 +1640,8 @@ void rcu_cpu_stall_reset(void)
static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
struct rcu_node *rnp)
{
+ lockdep_assert_held(&rnp->lock);
+
/*
* If RCU is idle, we just wait for the next grace period.
* But we can only be sure that RCU is idle if we are looking
@@ -1719,6 +1687,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
bool ret = false;
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+ lockdep_assert_held(&rnp->lock);
+
/*
* Pick up grace-period number for new callbacks. If this
* grace period is already marked as needed, return to the caller.
@@ -1845,6 +1815,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
{
bool ret = false;
+ lockdep_assert_held(&rnp->lock);
+
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
@@ -1883,6 +1855,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
+ lockdep_assert_held(&rnp->lock);
+
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
@@ -1909,6 +1883,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
bool ret;
bool need_gp;
+ lockdep_assert_held(&rnp->lock);
+
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
!unlikely(READ_ONCE(rdp->gpwrap))) {
@@ -2115,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
*/
static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
{
- bool isidle = false;
- unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
WRITE_ONCE(rsp->gp_activity, jiffies);
rsp->n_force_qs++;
if (first_time) {
/* Collect dyntick-idle snapshots. */
- if (is_sysidle_rcu_state(rsp)) {
- isidle = true;
- maxj = jiffies - ULONG_MAX / 4;
- }
- force_qs_rnp(rsp, dyntick_save_progress_counter,
- &isidle, &maxj);
- rcu_sysidle_report_gp(rsp, isidle, maxj);
+ force_qs_rnp(rsp, dyntick_save_progress_counter);
} else {
/* Handle dyntick-idle and offline CPUs. */
- isidle = true;
- force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
+ force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
}
/* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -2341,6 +2308,7 @@ static bool
rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
+ lockdep_assert_held(&rnp->lock);
if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
/*
* Either we have not yet spawned the grace-period
@@ -2402,6 +2370,7 @@ static bool rcu_start_gp(struct rcu_state *rsp)
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
+ lockdep_assert_held(&rcu_get_root(rsp)->lock);
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
@@ -2426,6 +2395,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
unsigned long oldmask = 0;
struct rcu_node *rnp_c;
+ lockdep_assert_held(&rnp->lock);
+
/* Walk up the rcu_node hierarchy. */
for (;;) {
if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
@@ -2486,6 +2457,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
unsigned long mask;
struct rcu_node *rnp_p;
+ lockdep_assert_held(&rnp->lock);
if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2599,6 +2571,8 @@ static void
rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
+ lockdep_assert_held(&rsp->orphan_lock);
+
/* No-CBs CPUs do not have orphanable callbacks. */
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
return;
@@ -2639,6 +2613,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
{
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+ lockdep_assert_held(&rsp->orphan_lock);
+
/* No-CBs CPUs are handled specially. */
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2705,6 +2681,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
+ lockdep_assert_held(&rnp->lock);
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
return;
@@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user)
*
* The caller must have suppressed start of new grace periods.
*/
-static void force_qs_rnp(struct rcu_state *rsp,
- int (*f)(struct rcu_data *rsp, bool *isidle,
- unsigned long *maxj),
- bool *isidle, unsigned long *maxj)
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
{
int cpu;
unsigned long flags;
@@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
if ((rnp->qsmask & bit) != 0) {
- if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+ if (f(per_cpu_ptr(rsp->rda, cpu)))
mask |= bit;
}
}
@@ -3143,9 +3117,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
if (debug_rcu_head_queue(head)) {
- /* Probable double call_rcu(), so leak the callback. */
+ /*
+ * Probable double call_rcu(), so leak the callback.
+ * Use rcu:rcu_callback trace event to find the previous
+ * time callback was passed to __call_rcu().
+ */
+ WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n",
+ head, head->func);
WRITE_ONCE(head->func, rcu_leak_callback);
- WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
return;
}
head->func = func;
@@ -3194,8 +3173,24 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
local_irq_restore(flags);
}
-/*
- * Queue an RCU-sched callback for invocation after a grace period.
+/**
+ * call_rcu_sched() - Queue an RCU callback for invocation after a sched grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_sched() assumes
+ * that the read-side critical sections end on enabling of preemption
+ * or on voluntary preemption.
+ * RCU read-side critical sections are delimited by :
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
+ *
+ * These may be nested.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
{
@@ -3203,8 +3198,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(call_rcu_sched);
-/*
- * Queue an RCU callback for invocation after a quicker grace period.
+/**
+ * call_rcu_bh() - Queue an RCU callback for invocation after a quicker grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by :
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
+ * OR
+ * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
+ * These may be nested.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
{
@@ -3280,12 +3293,6 @@ static inline int rcu_blocking_is_gp(void)
* to have executed a full memory barrier during the execution of
* synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
- *
- * This primitive provides the guarantees made by the (now removed)
- * synchronize_kernel() API. In contrast, synchronize_rcu() only
- * guarantees that rcu_read_lock() sections will have completed.
- * In "classic RCU", these two guarantees happen to be one and
- * the same, but can differ in realtime RCU implementations.
*/
void synchronize_sched(void)
{
@@ -3578,8 +3585,14 @@ static void rcu_barrier_func(void *type)
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
_rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
- atomic_inc(&rsp->barrier_cpu_count);
- rsp->call(&rdp->barrier_head, rcu_barrier_callback);
+ rdp->barrier_head.func = rcu_barrier_callback;
+ debug_rcu_head_queue(&rdp->barrier_head);
+ if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
+ atomic_inc(&rsp->barrier_cpu_count);
+ } else {
+ debug_rcu_head_unqueue(&rdp->barrier_head);
+ _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
+ }
}
/*
@@ -3698,6 +3711,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
+ lockdep_assert_held(&rnp->lock);
for (;;) {
mask = rnp->grpmask;
rnp = rnp->parent;
@@ -3753,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
!init_nocb_callback_list(rdp))
rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_sysidle_init_percpu_data(rdp->dynticks);
rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ba38262c3554..9af0f31d6847 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -45,14 +45,6 @@ struct rcu_dynticks {
bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- long long dynticks_idle_nesting;
- /* irq/process nesting level from idle. */
- atomic_t dynticks_idle; /* Even value for idle, else odd. */
- /* "Idle" excludes userspace execution. */
- unsigned long dynticks_idle_jiffies;
- /* End of last non-NMI non-idle period. */
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
#ifdef CONFIG_RCU_FAST_NO_HZ
bool all_lazy; /* Are all CPU's CBs lazy? */
unsigned long nonlazy_posted;
@@ -160,19 +152,6 @@ struct rcu_node {
/* Number of tasks boosted for expedited GP. */
unsigned long n_normal_boosts;
/* Number of tasks boosted for normal GP. */
- unsigned long n_balk_blkd_tasks;
- /* Refused to boost: no blocked tasks. */
- unsigned long n_balk_exp_gp_tasks;
- /* Refused to boost: nothing blocking GP. */
- unsigned long n_balk_boost_tasks;
- /* Refused to boost: already boosting. */
- unsigned long n_balk_notblocked;
- /* Refused to boost: RCU RS CS still running. */
- unsigned long n_balk_notyet;
- /* Refused to boost: not yet time. */
- unsigned long n_balk_nos;
- /* Refused to boost: not sure why, though. */
- /* This can happen due to race conditions. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -312,9 +291,9 @@ struct rcu_data {
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
-#define RCU_NOGP_WAKE_NOT 0
-#define RCU_NOGP_WAKE 1
-#define RCU_NOGP_WAKE_FORCE 2
+#define RCU_NOCB_WAKE_NOT 0
+#define RCU_NOCB_WAKE 1
+#define RCU_NOCB_WAKE_FORCE 2
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -477,7 +456,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
-static void rcu_preempt_note_context_switch(void);
+static void rcu_preempt_note_context_switch(bool preempt);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
@@ -529,15 +508,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
-static void rcu_sysidle_enter(int irq);
-static void rcu_sysidle_exit(int irq);
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj);
-static bool is_sysidle_rcu_state(struct rcu_state *rsp);
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj);
static void rcu_bind_gp_kthread(void);
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
static void rcu_dynticks_task_enter(void);
static void rcu_dynticks_task_exit(void);
@@ -551,75 +522,3 @@ void srcu_offline_cpu(unsigned int cpu) { }
#endif /* #else #ifdef CONFIG_SRCU */
#endif /* #ifndef RCU_TREE_NONCORE */
-
-#ifdef CONFIG_RCU_TRACE
-/* Read out queue lengths for tracing. */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
-#ifdef CONFIG_RCU_NOCB_CPU
- *ql = atomic_long_read(&rdp->nocb_q_count);
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
- *ql = 0;
- *qll = 0;
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-}
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-/*
- * Wrappers for the rcu_node::lock acquire and release.
- *
- * Because the rcu_nodes form a tree, the tree traversal locking will observe
- * different lock values, this in turn means that an UNLOCK of one level
- * followed by a LOCK of another level does not imply a full memory barrier;
- * and most importantly transitivity is lost.
- *
- * In order to restore full ordering between tree levels, augment the regular
- * lock acquire functions with smp_mb__after_unlock_lock().
- *
- * As ->lock of struct rcu_node is a __private field, therefore one should use
- * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
- */
-static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_lock(&ACCESS_PRIVATE(rnp, lock));
- smp_mb__after_unlock_lock();
-}
-
-static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock));
-}
-
-static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock));
- smp_mb__after_unlock_lock();
-}
-
-static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock));
-}
-
-#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
-do { \
- typecheck(unsigned long, flags); \
- raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \
-do { \
- typecheck(unsigned long, flags); \
- raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \
-} while (0)
-
-static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
-{
- bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock));
-
- if (locked)
- smp_mb__after_unlock_lock();
- return locked;
-}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index e513b4ab1197..dd21ca47e4b4 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
*
* Caller must hold the rcu_state's exp_mutex.
*/
-static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
return rnp->exp_tasks == NULL &&
READ_ONCE(rnp->expmask) == 0;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c9a48657512a..908b309d60d7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
static void __init rcu_bootup_announce_oddness(void)
{
if (IS_ENABLED(CONFIG_RCU_TRACE))
- pr_info("\tRCU debugfs-based tracing is enabled.\n");
+ pr_info("\tRCU event tracing is enabled.\n");
if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
(!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
@@ -90,8 +90,32 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
- if (IS_ENABLED(CONFIG_RCU_BOOST))
- pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
+#ifdef CONFIG_RCU_BOOST
+ pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY);
+#endif
+ if (blimit != DEFAULT_RCU_BLIMIT)
+ pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
+ if (qhimark != DEFAULT_RCU_QHIMARK)
+ pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
+ if (qlowmark != DEFAULT_RCU_QLOMARK)
+ pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
+ if (jiffies_till_first_fqs != ULONG_MAX)
+ pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
+ if (jiffies_till_next_fqs != ULONG_MAX)
+ pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
+ if (rcu_kick_kthreads)
+ pr_info("\tKick kthreads if too-long grace period.\n");
+ if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
+ pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
+ if (gp_preinit_delay)
+ pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
+ if (gp_init_delay)
+ pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
+ if (gp_cleanup_delay)
+ pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
+ pr_info("\tRCU debug extended QS entry/exit.\n");
+ rcupdate_announce_bootup_oddness();
}
#ifdef CONFIG_PREEMPT_RCU
@@ -155,6 +179,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
(rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
struct task_struct *t = current;
+ lockdep_assert_held(&rnp->lock);
+
/*
* Decide where to queue the newly blocked task. In theory,
* this could be an if-statement. In practice, when I tried
@@ -263,6 +289,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static void rcu_preempt_qs(void)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n");
if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data_p->gpnum),
@@ -286,12 +313,14 @@ static void rcu_preempt_qs(void)
*
* Caller must disable interrupts.
*/
-static void rcu_preempt_note_context_switch(void)
+static void rcu_preempt_note_context_switch(bool preempt)
{
struct task_struct *t = current;
struct rcu_data *rdp;
struct rcu_node *rnp;
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
+ WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
if (t->rcu_read_lock_nesting > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -607,6 +636,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
if (rcu_preempt_has_tasks(rnp))
rnp->gp_tasks = rnp->blkd_tasks.next;
@@ -643,8 +673,37 @@ static void rcu_preempt_do_callbacks(void)
#endif /* #ifdef CONFIG_RCU_BOOST */
-/*
- * Queue a preemptible-RCU callback for invocation after a grace period.
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed. However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked. RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical sections. On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu(). It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section. Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
@@ -663,8 +722,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
* synchronize_rcu() was waiting. RCU read-side critical sections are
* delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
*
- * See the description of synchronize_sched() for more detailed information
- * on memory ordering guarantees.
+ * See the description of synchronize_sched() for more detailed
+ * information on memory-ordering guarantees. However, please note
+ * that -only- the memory-ordering guarantees apply. For example,
+ * synchronize_rcu() is -not- guaranteed to wait on things like code
+ * protected by preempt_disable(); instead, synchronize_rcu() is -only-
+ * guaranteed to wait on RCU read-side critical sections, that is, sections
+ * of code protected by rcu_read_lock().
*/
void synchronize_rcu(void)
{
@@ -738,7 +802,7 @@ static void __init rcu_bootup_announce(void)
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
-static void rcu_preempt_note_context_switch(void)
+static void rcu_preempt_note_context_switch(bool preempt)
{
}
@@ -835,33 +899,6 @@ void exit_rcu(void)
#include "../locking/rtmutex_common.h"
-#ifdef CONFIG_RCU_TRACE
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
- if (!rcu_preempt_has_tasks(rnp))
- rnp->n_balk_blkd_tasks++;
- else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
- rnp->n_balk_exp_gp_tasks++;
- else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
- rnp->n_balk_boost_tasks++;
- else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
- rnp->n_balk_notblocked++;
- else if (rnp->gp_tasks != NULL &&
- ULONG_CMP_LT(jiffies, rnp->boost_time))
- rnp->n_balk_notyet++;
- else
- rnp->n_balk_nos++;
-}
-
-#else /* #ifdef CONFIG_RCU_TRACE */
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
static void rcu_wake_cond(struct task_struct *t, int status)
{
/*
@@ -992,8 +1029,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
{
struct task_struct *t;
+ lockdep_assert_held(&rnp->lock);
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
- rnp->n_balk_exp_gp_tasks++;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
@@ -1009,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
if (t)
rcu_wake_cond(t, rnp->boost_kthread_status);
} else {
- rcu_initiate_boost_trace(rnp);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -1260,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
- return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
- ? 0 : rcu_cpu_has_callbacks(NULL);
+ return rcu_cpu_has_callbacks(NULL);
}
/*
@@ -1372,10 +1407,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
unsigned long dj;
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
- *nextevt = KTIME_MAX;
- return 0;
- }
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1424,8 +1456,8 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
- rcu_is_nocb_cpu(smp_processor_id()))
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
/* Handle nohz enablement switches conservatively. */
@@ -1479,8 +1511,8 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
- rcu_is_nocb_cpu(smp_processor_id()))
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
@@ -1747,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
@@ -1755,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu)
return cpumask_test_cpu(cpu, rcu_nocb_mask);
return false;
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Kick the leader kthread for this NOCB group.
@@ -1769,6 +1799,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+ smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
}
}
@@ -1860,7 +1891,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -1874,7 +1905,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -2023,6 +2054,7 @@ wait_again:
* nocb_gp_head, where they await a grace period.
*/
gotcbs = false;
+ smp_mb(); /* wakeup before ->nocb_head reads. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
@@ -2201,8 +2233,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
- wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
@@ -2212,10 +2244,6 @@ void __init rcu_init_nohz(void)
bool need_rcu_nocb_mask = true;
struct rcu_state *rsp;
-#ifdef CONFIG_RCU_NOCB_CPU_NONE
- need_rcu_nocb_mask = false;
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
-
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
need_rcu_nocb_mask = true;
@@ -2231,14 +2259,6 @@ void __init rcu_init_nohz(void)
if (!have_rcu_nocb_mask)
return;
-#ifdef CONFIG_RCU_NOCB_CPU_ZERO
- pr_info("\tOffload RCU callbacks from CPU 0\n");
- cpumask_set_cpu(0, rcu_nocb_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
-#ifdef CONFIG_RCU_NOCB_CPU_ALL
- pr_info("\tOffload RCU callbacks from all CPUs\n");
- cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running)
cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
@@ -2491,421 +2511,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
#endif /* #ifdef CONFIG_NO_HZ_FULL */
}
-
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-
-static int full_sysidle_state; /* Current system-idle state. */
-#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
-#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
-#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
-#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
-#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
-
-/*
- * Invoked to note exit from irq or task transition to idle. Note that
- * usermode execution does -not- count as idle here! After all, we want
- * to detect full-system idle states, not RCU quiescent states and grace
- * periods. The caller must have disabled interrupts.
- */
-static void rcu_sysidle_enter(int irq)
-{
- unsigned long j;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- /* Adjust nesting, check for fully idle. */
- if (irq) {
- rdtp->dynticks_idle_nesting--;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
- if (rdtp->dynticks_idle_nesting != 0)
- return; /* Still not fully idle. */
- } else {
- if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
- DYNTICK_TASK_NEST_VALUE) {
- rdtp->dynticks_idle_nesting = 0;
- } else {
- rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
- return; /* Still not fully idle. */
- }
- }
-
- /* Record start of fully idle period. */
- j = jiffies;
- WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
- smp_mb__before_atomic();
- atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic();
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
-}
-
-/*
- * Unconditionally force exit from full system-idle state. This is
- * invoked when a normal CPU exits idle, but must be called separately
- * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
- * is that the timekeeping CPU is permitted to take scheduling-clock
- * interrupts while the system is in system-idle state, and of course
- * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
- * interrupt from any other type of interrupt.
- */
-void rcu_sysidle_force_exit(void)
-{
- int oldstate = READ_ONCE(full_sysidle_state);
- int newoldstate;
-
- /*
- * Each pass through the following loop attempts to exit full
- * system-idle state. If contention proves to be a problem,
- * a trylock-based contention tree could be used here.
- */
- while (oldstate > RCU_SYSIDLE_SHORT) {
- newoldstate = cmpxchg(&full_sysidle_state,
- oldstate, RCU_SYSIDLE_NOT);
- if (oldstate == newoldstate &&
- oldstate == RCU_SYSIDLE_FULL_NOTED) {
- rcu_kick_nohz_cpu(tick_do_timer_cpu);
- return; /* We cleared it, done! */
- }
- oldstate = newoldstate;
- }
- smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
-}
-
-/*
- * Invoked to note entry to irq or task transition from idle. Note that
- * usermode execution does -not- count as idle here! The caller must
- * have disabled interrupts.
- */
-static void rcu_sysidle_exit(int irq)
-{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- /* Adjust nesting, check for already non-idle. */
- if (irq) {
- rdtp->dynticks_idle_nesting++;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
- if (rdtp->dynticks_idle_nesting != 1)
- return; /* Already non-idle. */
- } else {
- /*
- * Allow for irq misnesting. Yes, it really is possible
- * to enter an irq handler then never leave it, and maybe
- * also vice versa. Handle both possibilities.
- */
- if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
- rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
- return; /* Already non-idle. */
- } else {
- rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
- }
- }
-
- /* Record end of idle period. */
- smp_mb__before_atomic();
- atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic();
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
-
- /*
- * If we are the timekeeping CPU, we are permitted to be non-idle
- * during a system-idle state. This must be the case, because
- * the timekeeping CPU has to take scheduling-clock interrupts
- * during the time that the system is transitioning to full
- * system-idle state. This means that the timekeeping CPU must
- * invoke rcu_sysidle_force_exit() directly if it does anything
- * more than take a scheduling-clock interrupt.
- */
- if (smp_processor_id() == tick_do_timer_cpu)
- return;
-
- /* Update system-idle state: We are clearly no longer fully idle! */
- rcu_sysidle_force_exit();
-}
-
-/*
- * Check to see if the current CPU is idle. Note that usermode execution
- * does not count as idle. The caller must have disabled interrupts,
- * and must be running on tick_do_timer_cpu.
- */
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj)
-{
- int cur;
- unsigned long j;
- struct rcu_dynticks *rdtp = rdp->dynticks;
-
- /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
- if (!tick_nohz_full_enabled())
- return;
-
- /*
- * If some other CPU has already reported non-idle, if this is
- * not the flavor of RCU that tracks sysidle state, or if this
- * is an offline or the timekeeping CPU, nothing to do.
- */
- if (!*isidle || rdp->rsp != rcu_state_p ||
- cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
- return;
- /* Verify affinity of current kthread. */
- WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
-
- /* Pick up current idle and NMI-nesting counter and check. */
- cur = atomic_read(&rdtp->dynticks_idle);
- if (cur & 0x1) {
- *isidle = false; /* We are not idle! */
- return;
- }
- smp_mb(); /* Read counters before timestamps. */
-
- /* Pick up timestamps. */
- j = READ_ONCE(rdtp->dynticks_idle_jiffies);
- /* If this CPU entered idle more recently, update maxj timestamp. */
- if (ULONG_CMP_LT(*maxj, j))
- *maxj = j;
-}
-
-/*
- * Is this the flavor of RCU that is handling full-system idle?
- */
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
- return rsp == rcu_state_p;
-}
-
-/*
- * Return a delay in jiffies based on the number of CPUs, rcu_node
- * leaf fanout, and jiffies tick rate. The idea is to allow larger
- * systems more time to transition to full-idle state in order to
- * avoid the cache thrashing that otherwise occur on the state variable.
- * Really small systems (less than a couple of tens of CPUs) should
- * instead use a single global atomically incremented counter, and later
- * versions of this will automatically reconfigure themselves accordingly.
- */
-static unsigned long rcu_sysidle_delay(void)
-{
- if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
- return 0;
- return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
-}
-
-/*
- * Advance the full-system-idle state. This is invoked when all of
- * the non-timekeeping CPUs are idle.
- */
-static void rcu_sysidle(unsigned long j)
-{
- /* Check the current state. */
- switch (READ_ONCE(full_sysidle_state)) {
- case RCU_SYSIDLE_NOT:
-
- /* First time all are idle, so note a short idle period. */
- WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
- break;
-
- case RCU_SYSIDLE_SHORT:
-
- /*
- * Idle for a bit, time to advance to next state?
- * cmpxchg failure means race with non-idle, let them win.
- */
- if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
- (void)cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
- break;
-
- case RCU_SYSIDLE_LONG:
-
- /*
- * Do an additional check pass before advancing to full.
- * cmpxchg failure means race with non-idle, let them win.
- */
- if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
- (void)cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
- break;
-
- default:
- break;
- }
-}
-
-/*
- * Found a non-idle non-timekeeping CPU, so kick the system-idle state
- * back to the beginning.
- */
-static void rcu_sysidle_cancel(void)
-{
- smp_mb();
- if (full_sysidle_state > RCU_SYSIDLE_SHORT)
- WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
-}
-
-/*
- * Update the sysidle state based on the results of a force-quiescent-state
- * scan of the CPUs' dyntick-idle state.
- */
-static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
- unsigned long maxj, bool gpkt)
-{
- if (rsp != rcu_state_p)
- return; /* Wrong flavor, ignore. */
- if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
- return; /* Running state machine from timekeeping CPU. */
- if (isidle)
- rcu_sysidle(maxj); /* More idle! */
- else
- rcu_sysidle_cancel(); /* Idle is over. */
-}
-
-/*
- * Wrapper for rcu_sysidle_report() when called from the grace-period
- * kthread's context.
- */
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj)
-{
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- rcu_sysidle_report(rsp, isidle, maxj, true);
-}
-
-/* Callback and function for forcing an RCU grace period. */
-struct rcu_sysidle_head {
- struct rcu_head rh;
- int inuse;
-};
-
-static void rcu_sysidle_cb(struct rcu_head *rhp)
-{
- struct rcu_sysidle_head *rshp;
-
- /*
- * The following memory barrier is needed to replace the
- * memory barriers that would normally be in the memory
- * allocator.
- */
- smp_mb(); /* grace period precedes setting inuse. */
-
- rshp = container_of(rhp, struct rcu_sysidle_head, rh);
- WRITE_ONCE(rshp->inuse, 0);
-}
-
-/*
- * Check to see if the system is fully idle, other than the timekeeping CPU.
- * The caller must have disabled interrupts. This is not intended to be
- * called unless tick_nohz_full_enabled().
- */
-bool rcu_sys_is_idle(void)
-{
- static struct rcu_sysidle_head rsh;
- int rss = READ_ONCE(full_sysidle_state);
-
- if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
- return false;
-
- /* Handle small-system case by doing a full scan of CPUs. */
- if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
- int oldrss = rss - 1;
-
- /*
- * One pass to advance to each state up to _FULL.
- * Give up if any pass fails to advance the state.
- */
- while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
- int cpu;
- bool isidle = true;
- unsigned long maxj = jiffies - ULONG_MAX / 4;
- struct rcu_data *rdp;
-
- /* Scan all the CPUs looking for nonidle CPUs. */
- for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
- rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
- if (!isidle)
- break;
- }
- rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
- oldrss = rss;
- rss = READ_ONCE(full_sysidle_state);
- }
- }
-
- /* If this is the first observation of an idle period, record it. */
- if (rss == RCU_SYSIDLE_FULL) {
- rss = cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
- return rss == RCU_SYSIDLE_FULL;
- }
-
- smp_mb(); /* ensure rss load happens before later caller actions. */
-
- /* If already fully idle, tell the caller (in case of races). */
- if (rss == RCU_SYSIDLE_FULL_NOTED)
- return true;
-
- /*
- * If we aren't there yet, and a grace period is not in flight,
- * initiate a grace period. Either way, tell the caller that
- * we are not there yet. We use an xchg() rather than an assignment
- * to make up for the memory barriers that would otherwise be
- * provided by the memory allocator.
- */
- if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
- !rcu_gp_in_progress(rcu_state_p) &&
- !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
- call_rcu(&rsh.rh, rcu_sysidle_cb);
- return false;
-}
-
-/*
- * Initialize dynticks sysidle state for CPUs coming online.
- */
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
- rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
-}
-
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
-static void rcu_sysidle_enter(int irq)
-{
-}
-
-static void rcu_sysidle_exit(int irq)
-{
-}
-
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj)
-{
-}
-
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
- return false;
-}
-
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj)
-{
-}
-
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -2936,13 +2541,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled())
return;
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- cpu = tick_do_timer_cpu;
- if (cpu >= 0 && cpu < nr_cpu_ids)
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
housekeeping_affine(current);
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}
/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
deleted file mode 100644
index 6cea17a1ea30..000000000000
--- a/kernel/rcu/tree_trace.c
+++ /dev/null
@@ -1,494 +0,0 @@
-/*
- * Read-Copy Update tracing for hierarchical implementation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright IBM Corporation, 2008
- * Author: Paul E. McKenney
- *
- * Papers: http://www.rdrop.com/users/paulmck/RCU
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/completion.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <linux/prefetch.h>
-
-#define RCU_TREE_NONCORE
-#include "tree.h"
-#include "rcu.h"
-
-static int r_open(struct inode *inode, struct file *file,
- const struct seq_operations *op)
-{
- int ret = seq_open(file, op);
- if (!ret) {
- struct seq_file *m = (struct seq_file *)file->private_data;
- m->private = inode->i_private;
- }
- return ret;
-}
-
-static void *r_start(struct seq_file *m, loff_t *pos)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- *pos = cpumask_next(*pos - 1, cpu_possible_mask);
- if ((*pos) < nr_cpu_ids)
- return per_cpu_ptr(rsp->rda, *pos);
- return NULL;
-}
-
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
- (*pos)++;
- return r_start(m, pos);
-}
-
-static void r_stop(struct seq_file *m, void *v)
-{
-}
-
-static int show_rcubarrier(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "bcc: %d bseq: %lu\n",
- atomic_read(&rsp->barrier_cpu_count),
- rsp->barrier_sequence);
- return 0;
-}
-
-static int rcubarrier_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcubarrier, inode->i_private);
-}
-
-static const struct file_operations rcubarrier_fops = {
- .owner = THIS_MODULE,
- .open = rcubarrier_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#ifdef CONFIG_RCU_BOOST
-
-static char convert_kthread_status(unsigned int kthread_status)
-{
- if (kthread_status > RCU_KTHREAD_MAX)
- return '?';
- return "SRWOY"[kthread_status];
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
-{
- long ql, qll;
-
- if (!rdp->beenonline)
- return;
- seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d",
- rdp->cpu,
- cpu_is_offline(rdp->cpu) ? '!' : ' ',
- ulong2long(rdp->completed), ulong2long(rdp->gpnum),
- rdp->cpu_no_qs.b.norm,
- rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
- rdp->core_needs_qs);
- seq_printf(m, " dt=%d/%llx/%d df=%lu",
- rcu_dynticks_snap(rdp->dynticks),
- rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi_nesting,
- rdp->dynticks_fqs);
- seq_printf(m, " of=%lu", rdp->offline_fqs);
- rcu_nocb_q_lengths(rdp, &ql, &qll);
- qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
- ql += rcu_segcblist_n_cbs(&rdp->cblist);
- seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
- qll, ql,
- ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
- ".R"[!rcu_segcblist_segempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL)],
- ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
- ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
-#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " kt=%d/%c ktl=%x",
- per_cpu(rcu_cpu_has_work, rdp->cpu),
- convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
- rdp->cpu)),
- per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
-#endif /* #ifdef CONFIG_RCU_BOOST */
- seq_printf(m, " b=%ld", rdp->blimit);
- seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
- rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
- rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
-}
-
-static int show_rcudata(struct seq_file *m, void *v)
-{
- print_one_rcu_data(m, (struct rcu_data *)v);
- return 0;
-}
-
-static const struct seq_operations rcudate_op = {
- .start = r_start,
- .next = r_next,
- .stop = r_stop,
- .show = show_rcudata,
-};
-
-static int rcudata_open(struct inode *inode, struct file *file)
-{
- return r_open(inode, file, &rcudate_op);
-}
-
-static const struct file_operations rcudata_fops = {
- .owner = THIS_MODULE,
- .open = rcudata_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = seq_release,
-};
-
-static int show_rcuexp(struct seq_file *m, void *v)
-{
- int cpu;
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- struct rcu_data *rdp;
- unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
-
- for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
- s0 += atomic_long_read(&rdp->exp_workdone0);
- s1 += atomic_long_read(&rdp->exp_workdone1);
- s2 += atomic_long_read(&rdp->exp_workdone2);
- s3 += atomic_long_read(&rdp->exp_workdone3);
- }
- seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n",
- rsp->expedited_sequence, s0, s1, s2, s3,
- atomic_read(&rsp->expedited_need_qs),
- rsp->expedited_sequence / 2);
- return 0;
-}
-
-static int rcuexp_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcuexp, inode->i_private);
-}
-
-static const struct file_operations rcuexp_fops = {
- .owner = THIS_MODULE,
- .open = rcuexp_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#ifdef CONFIG_RCU_BOOST
-
-static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
-{
- seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
- rnp->grplo, rnp->grphi,
- "T."[list_empty(&rnp->blkd_tasks)],
- "N."[!rnp->gp_tasks],
- "E."[!rnp->exp_tasks],
- "B."[!rnp->boost_tasks],
- convert_kthread_status(rnp->boost_kthread_status),
- rnp->n_tasks_boosted, rnp->n_exp_boosts,
- rnp->n_normal_boosts);
- seq_printf(m, "j=%04x bt=%04x\n",
- (int)(jiffies & 0xffff),
- (int)(rnp->boost_time & 0xffff));
- seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
- rnp->n_balk_blkd_tasks,
- rnp->n_balk_exp_gp_tasks,
- rnp->n_balk_boost_tasks,
- rnp->n_balk_notblocked,
- rnp->n_balk_notyet,
- rnp->n_balk_nos);
-}
-
-static int show_rcu_node_boost(struct seq_file *m, void *unused)
-{
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
- print_one_rcu_node_boost(m, rnp);
- return 0;
-}
-
-static int rcu_node_boost_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcu_node_boost, NULL);
-}
-
-static const struct file_operations rcu_node_boost_fops = {
- .owner = THIS_MODULE,
- .open = rcu_node_boost_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
-{
- unsigned long gpnum;
- int level = 0;
- struct rcu_node *rnp;
-
- gpnum = rsp->gpnum;
- seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
- ulong2long(rsp->completed), ulong2long(gpnum),
- rsp->gp_state,
- (long)(rsp->jiffies_force_qs - jiffies),
- (int)(jiffies & 0xffff));
- seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
- rsp->n_force_qs, rsp->n_force_qs_ngp,
- rsp->n_force_qs - rsp->n_force_qs_ngp,
- READ_ONCE(rsp->n_force_qs_lh),
- rsp->orphan_done.len_lazy,
- rsp->orphan_done.len);
- for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
- if (rnp->level != level) {
- seq_puts(m, "\n");
- level = rnp->level;
- }
- seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
- rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
- ".G"[rnp->gp_tasks != NULL],
- ".E"[rnp->exp_tasks != NULL],
- ".T"[!list_empty(&rnp->blkd_tasks)],
- rnp->grplo, rnp->grphi, rnp->grpnum);
- }
- seq_puts(m, "\n");
-}
-
-static int show_rcuhier(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- print_one_rcu_state(m, rsp);
- return 0;
-}
-
-static int rcuhier_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcuhier, inode->i_private);
-}
-
-static const struct file_operations rcuhier_fops = {
- .owner = THIS_MODULE,
- .open = rcuhier_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
-{
- unsigned long flags;
- unsigned long completed;
- unsigned long gpnum;
- unsigned long gpage;
- unsigned long gpmax;
- struct rcu_node *rnp = &rsp->node[0];
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- completed = READ_ONCE(rsp->completed);
- gpnum = READ_ONCE(rsp->gpnum);
- if (completed == gpnum)
- gpage = 0;
- else
- gpage = jiffies - rsp->gp_start;
- gpmax = rsp->gp_max;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
- ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
-}
-
-static int show_rcugp(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- show_one_rcugp(m, rsp);
- return 0;
-}
-
-static int rcugp_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcugp, inode->i_private);
-}
-
-static const struct file_operations rcugp_fops = {
- .owner = THIS_MODULE,
- .open = rcugp_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
-{
- if (!rdp->beenonline)
- return;
- seq_printf(m, "%3d%cnp=%ld ",
- rdp->cpu,
- cpu_is_offline(rdp->cpu) ? '!' : ' ',
- rdp->n_rcu_pending);
- seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
- rdp->n_rp_core_needs_qs,
- rdp->n_rp_report_qs,
- rdp->n_rp_cb_ready,
- rdp->n_rp_cpu_needs_gp);
- seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n",
- rdp->n_rp_gp_completed,
- rdp->n_rp_gp_started,
- rdp->n_rp_nocb_defer_wakeup,
- rdp->n_rp_need_nothing);
-}
-
-static int show_rcu_pending(struct seq_file *m, void *v)
-{
- print_one_rcu_pending(m, (struct rcu_data *)v);
- return 0;
-}
-
-static const struct seq_operations rcu_pending_op = {
- .start = r_start,
- .next = r_next,
- .stop = r_stop,
- .show = show_rcu_pending,
-};
-
-static int rcu_pending_open(struct inode *inode, struct file *file)
-{
- return r_open(inode, file, &rcu_pending_op);
-}
-
-static const struct file_operations rcu_pending_fops = {
- .owner = THIS_MODULE,
- .open = rcu_pending_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = seq_release,
-};
-
-static int show_rcutorture(struct seq_file *m, void *unused)
-{
- seq_printf(m, "rcutorture test sequence: %lu %s\n",
- rcutorture_testseq >> 1,
- (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
- seq_printf(m, "rcutorture update version number: %lu\n",
- rcutorture_vernum);
- return 0;
-}
-
-static int rcutorture_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcutorture, NULL);
-}
-
-static const struct file_operations rcutorture_fops = {
- .owner = THIS_MODULE,
- .open = rcutorture_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutree_trace_init(void)
-{
- struct rcu_state *rsp;
- struct dentry *retval;
- struct dentry *rspdir;
-
- rcudir = debugfs_create_dir("rcu", NULL);
- if (!rcudir)
- goto free_out;
-
- for_each_rcu_flavor(rsp) {
- rspdir = debugfs_create_dir(rsp->name, rcudir);
- if (!rspdir)
- goto free_out;
-
- retval = debugfs_create_file("rcudata", 0444,
- rspdir, rsp, &rcudata_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcuexp", 0444,
- rspdir, rsp, &rcuexp_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcu_pending", 0444,
- rspdir, rsp, &rcu_pending_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcubarrier", 0444,
- rspdir, rsp, &rcubarrier_fops);
- if (!retval)
- goto free_out;
-
-#ifdef CONFIG_RCU_BOOST
- if (rsp == &rcu_preempt_state) {
- retval = debugfs_create_file("rcuboost", 0444,
- rspdir, NULL, &rcu_node_boost_fops);
- if (!retval)
- goto free_out;
- }
-#endif
-
- retval = debugfs_create_file("rcugp", 0444,
- rspdir, rsp, &rcugp_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcuhier", 0444,
- rspdir, rsp, &rcuhier_fops);
- if (!retval)
- goto free_out;
- }
-
- retval = debugfs_create_file("rcutorture", 0444, rcudir,
- NULL, &rcutorture_fops);
- if (!retval)
- goto free_out;
- return 0;
-free_out:
- debugfs_remove_recursive(rcudir);
- return 1;
-}
-device_initcall(rcutree_trace_init);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 273e869ca21d..00e77c470017 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,7 +62,9 @@
#define MODULE_PARAM_PREFIX "rcupdate."
#ifndef CONFIG_TINY_RCU
+extern int rcu_expedited; /* from sysctl */
module_param(rcu_expedited, int, 0);
+extern int rcu_normal; /* from sysctl */
module_param(rcu_normal, int, 0);
static int rcu_normal_after_boot;
module_param(rcu_normal_after_boot, int, 0);
@@ -379,6 +381,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
struct rcu_synchronize *rs_array)
{
int i;
+ int j;
/* Initialize and register callbacks for each flavor specified. */
for (i = 0; i < n; i++) {
@@ -390,7 +393,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
}
init_rcu_head_on_stack(&rs_array[i].head);
init_completion(&rs_array[i].completion);
- (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+ for (j = 0; j < i; j++)
+ if (crcu_array[j] == crcu_array[i])
+ break;
+ if (j == i)
+ (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
}
/* Wait for all callbacks to be invoked. */
@@ -399,7 +406,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
(crcu_array[i] == call_rcu ||
crcu_array[i] == call_rcu_bh))
continue;
- wait_for_completion(&rs_array[i].completion);
+ for (j = 0; j < i; j++)
+ if (crcu_array[j] == crcu_array[i])
+ break;
+ if (j == i)
+ wait_for_completion(&rs_array[i].completion);
destroy_rcu_head_on_stack(&rs_array[i].head);
}
}
@@ -560,15 +571,30 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
DEFINE_SRCU(tasks_rcu_exit_srcu);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
-static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
+#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
+static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
static void rcu_spawn_tasks_kthread(void);
static struct task_struct *rcu_tasks_kthread_ptr;
-/*
- * Post an RCU-tasks callback. First call must be from process context
- * after the scheduler if fully operational.
+/**
+ * call_rcu_tasks() - Queue a callback for invocation after a task-based grace period
+ * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_tasks() assumes
+ * that the read-side critical sections end at a voluntary context
+ * switch (not a preemption!), entry into idle, or transition to usermode
+ * execution. As such, there are no read-side primitives analogous to
+ * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
+ * to determine that all tasks have passed through a safe state, not so
+ * much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
@@ -851,6 +877,23 @@ static void rcu_spawn_tasks_kthread(void)
#endif /* #ifdef CONFIG_TASKS_RCU */
+#ifndef CONFIG_TINY_RCU
+
+/*
+ * Print any non-default Tasks RCU settings.
+ */
+static void __init rcu_tasks_bootup_oddness(void)
+{
+#ifdef CONFIG_TASKS_RCU
+ if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
+ pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
+ else
+ pr_info("\tTasks RCU enabled.\n");
+#endif /* #ifdef CONFIG_TASKS_RCU */
+}
+
+#endif /* #ifndef CONFIG_TINY_RCU */
+
#ifdef CONFIG_PROVE_RCU
/*
@@ -935,3 +978,25 @@ late_initcall(rcu_verify_early_boot_tests);
#else
void rcu_early_boot_tests(void) {}
#endif /* CONFIG_PROVE_RCU */
+
+#ifndef CONFIG_TINY_RCU
+
+/*
+ * Print any significant non-default boot-time settings.
+ */
+void __init rcupdate_announce_bootup_oddness(void)
+{
+ if (rcu_normal)
+ pr_info("\tNo expedited grace period (rcu_normal).\n");
+ else if (rcu_normal_after_boot)
+ pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n");
+ else if (rcu_expedited)
+ pr_info("\tAll grace periods are expedited (rcu_expedited).\n");
+ if (rcu_cpu_stall_suppress)
+ pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n");
+ if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT)
+ pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout);
+ rcu_tasks_bootup_oddness();
+}
+
+#endif /* #ifndef CONFIG_TINY_RCU */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 326d4f88e2b1..5b60f3a8343f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5874,15 +5874,9 @@ int sched_cpu_deactivate(unsigned int cpu)
* users of this state to go away such that all new such users will
* observe it.
*
- * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so wait for both.
- *
* Do sync before park smpboot threads to take care the rcu boost case.
*/
- if (IS_ENABLED(CONFIG_PREEMPT))
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
- else
- synchronize_rcu();
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
if (!sched_smp_initialized)
return 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index ca92bcfeb322..45b4c1ffe14e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -510,7 +510,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return !tsk->ptrace;
}
-static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
+ bool *resched_timer)
{
struct sigqueue *q, *first = NULL;
@@ -532,6 +533,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
still_pending:
list_del_init(&first->list);
copy_siginfo(info, &first->info);
+
+ *resched_timer =
+ (first->flags & SIGQUEUE_PREALLOC) &&
+ (info->si_code == SI_TIMER) &&
+ (info->si_sys_private);
+
__sigqueue_free(first);
} else {
/*
@@ -548,12 +555,12 @@ still_pending:
}
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
- siginfo_t *info)
+ siginfo_t *info, bool *resched_timer)
{
int sig = next_signal(pending, mask);
if (sig)
- collect_signal(sig, pending, info);
+ collect_signal(sig, pending, info, resched_timer);
return sig;
}
@@ -565,15 +572,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
*/
int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
+ bool resched_timer = false;
int signr;
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
- signr = __dequeue_signal(&tsk->pending, mask, info);
+ signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
signr = __dequeue_signal(&tsk->signal->shared_pending,
- mask, info);
+ mask, info, &resched_timer);
#ifdef CONFIG_POSIX_TIMERS
/*
* itimer signal ?
@@ -621,7 +629,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
#ifdef CONFIG_POSIX_TIMERS
- if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
+ if (resched_timer) {
/*
* Release the siglock to ensure proper locking order
* of timer locks outside of siglocks. Note, we leave
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index ece4b177052b..939a158eab11 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1119,7 +1119,7 @@ static ssize_t bin_uuid(struct file *file,
/* Only supports reads */
if (oldval && oldlen) {
char buf[UUID_STRING_LEN + 1];
- uuid_be uuid;
+ uuid_t uuid;
result = kernel_read(file, 0, buf, sizeof(buf) - 1);
if (result < 0)
@@ -1128,7 +1128,7 @@ static ssize_t bin_uuid(struct file *file,
buf[result] = '\0';
result = -EIO;
- if (uuid_be_to_bin(buf, &uuid))
+ if (uuid_parse(buf, &uuid))
goto out;
if (oldlen > 16)
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 4008d9f95dd7..ac09bc29eb08 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL
Note the boot CPU will still be kept outside the range to
handle the timekeeping duty.
-config NO_HZ_FULL_SYSIDLE
- bool "Detect full-system idle state for full dynticks system"
- depends on NO_HZ_FULL
- default n
- help
- At least one CPU must keep the scheduling-clock tick running for
- timekeeping purposes whenever there is a non-idle CPU, where
- "non-idle" also includes dynticks CPUs as long as they are
- running non-idle tasks. Because the underlying adaptive-tick
- support cannot distinguish between all CPUs being idle and
- all CPUs each running a single task in dynticks mode, the
- underlying support simply ensures that there is always a CPU
- handling the scheduling-clock tick, whether or not all CPUs
- are idle. This Kconfig option enables scalable detection of
- the all-CPUs-idle state, thus allowing the scheduling-clock
- tick to be disabled when all CPUs are idle. Note that scalable
- detection of the all-CPUs-idle state means that larger systems
- will be slower to declare the all-CPUs-idle state.
-
- Say Y if you would like to help debug all-CPUs-idle detection.
-
- Say N if you are unsure.
-
-config NO_HZ_FULL_SYSIDLE_SMALL
- int "Number of CPUs above which large-system approach is used"
- depends on NO_HZ_FULL_SYSIDLE
- range 1 NR_CPUS
- default 8
- help
- The full-system idle detection mechanism takes a lazy approach
- on large systems, as is required to attain decent scalability.
- However, on smaller systems, scalability is not anywhere near as
- large a concern as is energy efficiency. The sysidle subsystem
- therefore uses a fast but non-scalable algorithm for small
- systems and a lazier but scalable algorithm for large systems.
- This Kconfig parameter defines the number of CPUs in the largest
- system that will be considered to be "small".
-
- The default value will be fine in most cases. Battery-powered
- systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
- numbers of CPUs, and (3) are suffering from battery-lifetime
- problems due to long sysidle latencies might wish to experiment
- with larger values for this Kconfig parameter. On the other
- hand, they might be even better served by disabling NO_HZ_FULL
- entirely, given that NO_HZ_FULL is intended for HPC and
- real-time workloads that at present do not tend to be run on
- battery-powered systems.
-
- Take the default if you are unsure.
-
config NO_HZ
bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9652bc57fd09..b602c48cb841 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -118,6 +118,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
+/*
+ * tk_clock_read - atomic clocksource read() helper
+ *
+ * This helper is necessary to use in the read paths because, while the
+ * seqlock ensures we don't return a bad value while structures are updated,
+ * it doesn't protect from potential crashes. There is the possibility that
+ * the tkr's clocksource may change between the read reference, and the
+ * clock reference passed to the read function. This can cause crashes if
+ * the wrong clocksource is passed to the wrong read function.
+ * This isn't necessary to use when holding the timekeeper_lock or doing
+ * a read of the fast-timekeeper tkrs (which is protected by its own locking
+ * and update logic).
+ */
+static inline u64 tk_clock_read(struct tk_read_base *tkr)
+{
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ return clock->read(clock);
+}
+
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
@@ -175,7 +195,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
*/
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tkr->read(tkr->clock);
+ now = tk_clock_read(tkr);
last = tkr->cycle_last;
mask = tkr->mask;
max = tkr->clock->max_cycles;
@@ -209,7 +229,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
u64 cycle_now, delta;
/* read clocksource */
- cycle_now = tkr->read(tkr->clock);
+ cycle_now = tk_clock_read(tkr);
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
@@ -238,12 +258,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
++tk->cs_was_changed_seq;
old_clock = tk->tkr_mono.clock;
tk->tkr_mono.clock = clock;
- tk->tkr_mono.read = clock->read;
tk->tkr_mono.mask = clock->mask;
- tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+ tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);
tk->tkr_raw.clock = clock;
- tk->tkr_raw.read = clock->read;
tk->tkr_raw.mask = clock->mask;
tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
@@ -262,7 +280,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
/* Go back from cycles -> shifted ns */
tk->xtime_interval = interval * clock->mult;
tk->xtime_remainder = ntpinterval - tk->xtime_interval;
- tk->raw_interval = (interval * clock->mult) >> clock->shift;
+ tk->raw_interval = interval * clock->mult;
/* if changing clocks, convert xtime_nsec shift units */
if (old_clock) {
@@ -404,7 +422,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
now += timekeeping_delta_to_ns(tkr,
clocksource_delta(
- tkr->read(tkr->clock),
+ tk_clock_read(tkr),
tkr->cycle_last,
tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
@@ -461,6 +479,10 @@ static u64 dummy_clock_read(struct clocksource *cs)
return cycles_at_suspend;
}
+static struct clocksource dummy_clock = {
+ .read = dummy_clock_read,
+};
+
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
@@ -477,13 +499,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
struct tk_read_base *tkr = &tk->tkr_mono;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- cycles_at_suspend = tkr->read(tkr->clock);
- tkr_dummy.read = dummy_clock_read;
+ cycles_at_suspend = tk_clock_read(tkr);
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- tkr_dummy.read = dummy_clock_read;
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
@@ -649,11 +671,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- struct clocksource *clock = tk->tkr_mono.clock;
u64 cycle_now, delta;
u64 nsec;
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
@@ -929,8 +950,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
do {
seq = read_seqcount_begin(&tk_core.seq);
-
- now = tk->tkr_mono.read(tk->tkr_mono.clock);
+ now = tk_clock_read(&tk->tkr_mono);
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
@@ -1108,7 +1128,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
* Check whether the system counter value provided by the
* device driver is on the current timekeeping interval.
*/
- now = tk->tkr_mono.read(tk->tkr_mono.clock);
+ now = tk_clock_read(&tk->tkr_mono);
interval_start = tk->tkr_mono.cycle_last;
if (!cycle_between(interval_start, cycles, now)) {
clock_was_set_seq = tk->clock_was_set_seq;
@@ -1629,7 +1649,7 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
cycle_now > tk->tkr_mono.cycle_last) {
u64 nsec, cyc_delta;
@@ -1976,7 +1996,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
u32 shift, unsigned int *clock_set)
{
u64 interval = tk->cycle_interval << shift;
- u64 raw_nsecs;
+ u64 snsec_per_sec;
/* If the offset is smaller than a shifted interval, do nothing */
if (offset < interval)
@@ -1991,14 +2011,15 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- raw_nsecs = (u64)tk->raw_interval << shift;
- raw_nsecs += tk->raw_time.tv_nsec;
- if (raw_nsecs >= NSEC_PER_SEC) {
- u64 raw_secs = raw_nsecs;
- raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
- tk->raw_time.tv_sec += raw_secs;
+ tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
+ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
+ snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
+ while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
+ tk->tkr_raw.xtime_nsec -= snsec_per_sec;
+ tk->raw_time.tv_sec++;
}
- tk->raw_time.tv_nsec = raw_nsecs;
+ tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift;
+ tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
/* Accumulate error between NTP and clock interval */
tk->ntp_error += tk->ntp_tick << shift;
@@ -2030,7 +2051,7 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 193c5f5e3f79..bc364f86100a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -867,7 +867,7 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
- BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
+ BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
&rpdu);
}
}
@@ -900,7 +900,7 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error,
+ bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
sizeof(r), &r);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e5841dc14b5..b308be30dfb9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4337,9 +4337,6 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
command = strsep(&next, ":");
- if (WARN_ON_ONCE(!tr))
- return -EINVAL;
-
mutex_lock(&ftrace_cmd_mutex);
list_for_each_entry(p, &ftrace_commands, list) {
if (strcmp(p->name, command) == 0) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1122f151466f..091e801145c9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6881,6 +6881,9 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *number;
int ret;
+ if (!tr)
+ return -ENODEV;
+
/* hash funcs only work with set_ftrace_filter */
if (!enable)
return -EINVAL;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index a3bddbfd0874..a0910c0cdf2e 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -654,6 +654,9 @@ ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash,
{
struct ftrace_probe_ops *ops;
+ if (!tr)
+ return -ENODEV;
+
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
@@ -670,6 +673,9 @@ ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash,
{
struct ftrace_probe_ops *ops;
+ if (!tr)
+ return -ENODEV;
+
ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
@@ -682,6 +688,9 @@ ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash,
{
struct ftrace_probe_ops *ops;
+ if (!tr)
+ return -ENODEV;
+
ops = &dump_probe_ops;
/* Only dump once. */
@@ -695,6 +704,9 @@ ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash,
{
struct ftrace_probe_ops *ops;
+ if (!tr)
+ return -ENODEV;
+
ops = &cpudump_probe_ops;
/* Only dump once. */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c129fca6ec99..b53c8d369163 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -707,20 +707,16 @@ static int create_trace_kprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- if (isdigit(argv[1][0])) {
- /* an address specified */
- ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
- if (ret) {
- pr_info("Failed to parse address.\n");
- return ret;
- }
- } else {
+
+ /* try to parse an address. if that fails, try to read the
+ * input as a symbol. */
+ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
/* a symbol specified */
symbol = argv[1];
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret) {
- pr_info("Failed to parse symbol.\n");
+ pr_info("Failed to parse either an address or a symbol.\n");
return ret;
}
if (offset && is_return &&
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 76aa04d4c925..b4a751e8f9d6 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -409,7 +409,9 @@ static const struct file_operations stack_trace_fops = {
static int
stack_trace_filter_open(struct inode *inode, struct file *file)
{
- return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
+ struct ftrace_ops *ops = inode->i_private;
+
+ return ftrace_regex_open(ops, FTRACE_ITER_FILTER,
inode, file);
}
@@ -476,7 +478,7 @@ static __init int stack_trace_init(void)
NULL, &stack_trace_fops);
trace_create_file("stack_trace_filter", 0444, d_tracer,
- NULL, &stack_trace_filter_fops);
+ &trace_ops, &stack_trace_filter_fops);
if (stack_trace_filter_buf[0])
ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);