From be42f00b73a0f50710d16eb7cb4efda0cce062dd Mon Sep 17 00:00:00 2001 From: Zhen Lei <thunder.leizhen@huawei.com> Date: Sat, 19 Nov 2022 17:25:06 +0800 Subject: rcu: Add RCU stall diagnosis information Because RCU CPU stall warnings are driven from the scheduling-clock interrupt handler, a workload consisting of a very large number of short-duration hardware interrupts can result in misleading stall-warning messages. On systems supporting only a single level of interrupts, that is, where interrupts handlers cannot be interrupted, this can produce misleading diagnostics. The stack traces will show the innocent-bystander interrupted task, not the interrupts that are at the very least exacerbating the stall. This situation can be improved by displaying the number of interrupts and the CPU time that they have consumed. Diagnosing other types of stalls can be eased by also providing the count of softirqs and the CPU time that they consumed as well as the number of context switches and the task-level CPU time consumed. Consider the following output given this change: rcu: INFO: rcu_preempt self-detected stall on CPU rcu: 0-....: (1250 ticks this GP) <omitted> rcu: hardirqs softirqs csw/system rcu: number: 624 45 0 rcu: cputime: 69 1 2425 ==> 2500(ms) This output shows that the number of hard and soft interrupts is small, there are no context switches, and the system takes up a lot of time. This indicates that the current task is looping with preemption disabled. The impact on system performance is negligible because snapshot is recorded only once for all continuous RCU stalls. This added debugging information is suppressed by default and can be enabled by building the kernel with CONFIG_RCU_CPU_STALL_CPUTIME=y or by booting with rcupdate.rcu_cpu_stall_cputime=1. Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com> Reviewed-by: Mukesh Ojha <quic_mojha@quicinc.com> Reviewed-by: Frederic Weisbecker <frederic@kernel.org> Signed-off-by: Paul E. McKenney <paulmck@kernel.org> --- kernel/rcu/Kconfig.debug | 13 +++++++++++++ kernel/rcu/rcu.h | 1 + kernel/rcu/tree.c | 18 ++++++++++++++++++ kernel/rcu/tree.h | 19 +++++++++++++++++++ kernel/rcu/tree_stall.h | 31 +++++++++++++++++++++++++++++++ kernel/rcu/update.c | 2 ++ 6 files changed, 84 insertions(+) (limited to 'kernel/rcu') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 232e29fe3e5e..49da904df6aa 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -92,6 +92,19 @@ config RCU_EXP_CPU_STALL_TIMEOUT says to use the RCU_CPU_STALL_TIMEOUT value converted from seconds to milliseconds. +config RCU_CPU_STALL_CPUTIME + bool "Provide additional RCU stall debug information" + depends on RCU_STALL_COMMON + default n + help + Collect statistics during the sampling period, such as the number of + (hard interrupts, soft interrupts, task switches) and the cputime of + (hard interrupts, soft interrupts, kernel tasks) are added to the + RCU stall report. For multiple continuous RCU stalls, all sampling + periods begin at half of the first RCU stall timeout. + The boot option rcupdate.rcu_cpu_stall_cputime has the same function + as this one, but will override this if it exists. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index c5aa934de59b..ff35920e1055 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -224,6 +224,7 @@ extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; extern int rcu_exp_cpu_stall_timeout; +extern int rcu_cpu_stall_cputime; int rcu_jiffies_till_stall_check(void); int rcu_exp_jiffies_till_stall_check(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cf34a961821a..65552e6a6a5d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -925,6 +925,24 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); } + + if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) { + int cpu = rdp->cpu; + struct rcu_snap_record *rsrp; + struct kernel_cpustat *kcsp; + + kcsp = &kcpustat_cpu(cpu); + + rsrp = &rdp->snap_record; + rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); + rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); + rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); + rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu); + rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu); + rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu); + rsrp->jiffies = jiffies; + rsrp->gp_seq = rdp->gp_seq; + } } return 0; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fcb5d696eb17..192536916f9a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -158,6 +158,23 @@ union rcu_noqs { u16 s; /* Set of bits, aggregate OR here. */ }; +/* + * Record the snapshot of the core stats at half of the first RCU stall timeout. + * The member gp_seq is used to ensure that all members are updated only once + * during the sampling period. The snapshot is taken only if this gp_seq is not + * equal to rdp->gp_seq. + */ +struct rcu_snap_record { + unsigned long gp_seq; /* Track rdp->gp_seq counter */ + u64 cputime_irq; /* Accumulated cputime of hard irqs */ + u64 cputime_softirq;/* Accumulated cputime of soft irqs */ + u64 cputime_system; /* Accumulated cputime of kernel tasks */ + unsigned long nr_hardirqs; /* Accumulated number of hard irqs */ + unsigned int nr_softirqs; /* Accumulated number of soft irqs */ + unsigned long long nr_csw; /* Accumulated number of task switches */ + unsigned long jiffies; /* Track jiffies value */ +}; + /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -262,6 +279,8 @@ struct rcu_data { short rcu_onl_gp_flags; /* ->gp_flags at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ + struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */ + /* the first RCU stall timeout */ long lazy_len; /* Length of buffered lazy callbacks. */ int cpu; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 5653560573e2..6de15fb10bc4 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -428,6 +428,35 @@ static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp return j > 2 * HZ; } +static void print_cpu_stat_info(int cpu) +{ + struct rcu_snap_record rsr, *rsrp; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu); + + if (!rcu_cpu_stall_cputime) + return; + + rsrp = &rdp->snap_record; + if (rsrp->gp_seq != rdp->gp_seq) + return; + + rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); + rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); + rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); + + pr_err("\t hardirqs softirqs csw/system\n"); + pr_err("\t number: %8ld %10d %12lld\n", + kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, + kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, + nr_context_switches_cpu(cpu) - rsrp->nr_csw); + pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n", + div_u64(rsr.cputime_irq - rsrp->cputime_irq, NSEC_PER_MSEC), + div_u64(rsr.cputime_softirq - rsrp->cputime_softirq, NSEC_PER_MSEC), + div_u64(rsr.cputime_system - rsrp->cputime_system, NSEC_PER_MSEC), + jiffies_to_msecs(jiffies - rsrp->jiffies)); +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -484,6 +513,8 @@ static void print_cpu_stall_info(int cpu) data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, rcuc_starved ? buf : "", falsepositive ? " (false positive?)" : ""); + + print_cpu_stat_info(cpu); } /* Complain about starvation of grace-period kthread. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f5e6a2f95a2a..8d72cb7caead 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -508,6 +508,8 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; module_param(rcu_exp_cpu_stall_timeout, int, 0644); +int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME); +module_param(rcu_cpu_stall_cputime, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall -- cgit v1.2.3 From 3ab955de929a53245c89deb7ff9992b6eebbe4e8 Mon Sep 17 00:00:00 2001 From: Zhen Lei <thunder.leizhen@huawei.com> Date: Sat, 19 Nov 2022 17:25:08 +0800 Subject: rcu: Align the output of RCU CPU stall warning messages Time stamps are added to the output in kernels built with CONFIG_PRINTK_TIME=y, which causes misaligned output. Therefore, replace pr_cont() with pr_err(), which fixes alignment and gets rid of a couple of despised pr_cont() calls. Before: [ 37.567343] rcu: INFO: rcu_preempt self-detected stall on CPU [ 37.567839] rcu: 0-....: (1500 ticks this GP) idle=*** [ 37.568270] (t=1501 jiffies g=4717 q=28 ncpus=4) [ 37.568668] CPU: 0 PID: 313 Comm: test0 Not tainted 6.1.0-rc4 #8 After: [ 36.762074] rcu: INFO: rcu_preempt self-detected stall on CPU [ 36.762543] rcu: 0-....: (1499 ticks this GP) idle=*** [ 36.763003] rcu: (t=1500 jiffies g=5097 q=27 ncpus=4) [ 36.763522] CPU: 0 PID: 313 Comm: test0 Not tainted 6.1.0-rc4 #9 Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com> Reviewed-by: Frederic Weisbecker <frederic@kernel.org> Signed-off-by: Paul E. McKenney <paulmck@kernel.org> --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6de15fb10bc4..f360894f5599 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -619,7 +619,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", + pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", smp_processor_id(), (long)(jiffies - gps), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); if (ndetected) { @@ -680,7 +680,7 @@ static void print_cpu_stall(unsigned long gps) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", + pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", jiffies - gps, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); -- cgit v1.2.3 From 84ec7c20363f46a4031695eceb0374c1c4d6b9ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@kernel.org> Date: Tue, 6 Dec 2022 09:38:39 -0800 Subject: rcu: Allow up to five minutes expedited RCU CPU stall-warning timeouts The maximum value of RCU CPU stall-warning timeouts has historically been five minutes (300 seconds). However, the recently introduced expedited RCU CPU stall-warning timeout is instead limited to 21 seconds. This causes problems for CI/fuzzing services such as syzkaller by obscuring the issue in question with expedited RCU CPU stall-warning timeout splats. This commit therefore sets the RCU_EXP_CPU_STALL_TIMEOUT Kconfig options upper bound to 300000 milliseconds, which is 300 seconds (AKA 5 minutes). [ paulmck: Apply feedback from Hillf Danton. ] [ paulmck: Apply feedback from Geert Uytterhoeven. ] Reported-by: Dave Chinner <david@fromorbit.com> Reported-by: Dmitry Vyukov <dvyukov@google.com> Tested-by: Dmitry Vyukov <dvyukov@google.com> Signed-off-by: Paul E. McKenney <paulmck@kernel.org> --- kernel/rcu/Kconfig.debug | 2 +- kernel/rcu/tree_stall.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 49da904df6aa..2984de629f74 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -82,7 +82,7 @@ config RCU_CPU_STALL_TIMEOUT config RCU_EXP_CPU_STALL_TIMEOUT int "Expedited RCU CPU stall timeout in milliseconds" depends on RCU_STALL_COMMON - range 0 21000 + range 0 300000 default 0 help If a given expedited RCU grace period extends more than the diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index f360894f5599..b10b8349bb2a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -39,7 +39,7 @@ int rcu_exp_jiffies_till_stall_check(void) // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range. // The minimum clamped value is "2UL", because at least one full // tick has to be guaranteed. - till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ); + till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 300UL * HZ); if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout) WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check)); -- cgit v1.2.3