summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorFeng Tang <feng.tang@intel.com>2024-02-21 07:08:59 +0100
committerThomas Gleixner <tglx@linutronix.de>2024-02-21 12:00:42 +0100
commit2ed08e4bc53298db3f87b528cd804cb0cce066a9 (patch)
treeeae9c67c9c775586b5edac4aad136d9d61813615 /kernel
parenttime/kunit: Use correct format specifier (diff)
downloadlinux-2ed08e4bc53298db3f87b528cd804cb0cce066a9.tar.xz
linux-2ed08e4bc53298db3f87b528cd804cb0cce066a9.zip
clocksource: Scale the watchdog read retries automatically
On a 8-socket server the TSC is wrongly marked as 'unstable' and disabled during boot time on about one out of 120 boot attempts: clocksource: timekeeping watchdog on CPU227: wd-tsc-wd excessive read-back delay of 153560ns vs. limit of 125000ns, wd-wd read-back delay only 11440ns, attempt 3, marking tsc unstable tsc: Marking TSC unstable due to clocksource watchdog TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'. sched_clock: Marking unstable (119294969739, 159204297)<-(125446229205, -5992055152) clocksource: Checking clocksource tsc synchronization from CPU 319 to CPUs 0,99,136,180,210,542,601,896. clocksource: Switched to clocksource hpet The reason is that for platform with a large number of CPUs, there are sporadic big or huge read latencies while reading the watchog/clocksource during boot or when system is under stress work load, and the frequency and maximum value of the latency goes up with the number of online CPUs. The cCurrent code already has logic to detect and filter such high latency case by reading the watchdog twice and checking the two deltas. Due to the randomness of the latency, there is a low probabilty that the first delta (latency) is big, but the second delta is small and looks valid. The watchdog code retries the readouts by default twice, which is not necessarily sufficient for systems with a large number of CPUs. There is a command line parameter 'max_cswd_read_retries' which allows to increase the number of retries, but that's not user friendly as it needs to be tweaked per system. As the number of required retries is proportional to the number of online CPUs, this parameter can be calculated at runtime. Scale and enlarge the number of retries according to the number of online CPUs and remove the command line parameter completely. [ tglx: Massaged change log and comments ] Signed-off-by: Feng Tang <feng.tang@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Jin Wang <jin1.wang@intel.com> Tested-by: Paul E. McKenney <paulmck@kernel.org> Reviewed-by: Waiman Long <longman@redhat.com> Reviewed-by: Paul E. McKenney <paulmck@kernel.org> Link: https://lore.kernel.org/r/20240221060859.1027450-1-feng.tang@intel.com
Diffstat (limited to 'kernel')
-rw-r--r--kernel/time/clocksource-wdtest.c13
-rw-r--r--kernel/time/clocksource.c10
2 files changed, 11 insertions, 12 deletions
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
index df922f49d171..d06185e054ea 100644
--- a/kernel/time/clocksource-wdtest.c
+++ b/kernel/time/clocksource-wdtest.c
@@ -104,8 +104,8 @@ static void wdtest_ktime_clocksource_reset(void)
static int wdtest_func(void *arg)
{
unsigned long j1, j2;
+ int i, max_retries;
char *s;
- int i;
schedule_timeout_uninterruptible(holdoff * HZ);
@@ -139,18 +139,19 @@ static int wdtest_func(void *arg)
WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC));
/* Verify tsc-like stability with various numbers of errors injected. */
- for (i = 0; i <= max_cswd_read_retries + 1; i++) {
- if (i <= 1 && i < max_cswd_read_retries)
+ max_retries = clocksource_get_max_watchdog_retry();
+ for (i = 0; i <= max_retries + 1; i++) {
+ if (i <= 1 && i < max_retries)
s = "";
- else if (i <= max_cswd_read_retries)
+ else if (i <= max_retries)
s = ", expect message";
else
s = ", expect clock skew";
- pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s);
+ pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s);
WRITE_ONCE(wdtest_ktime_read_ndelays, i);
schedule_timeout_uninterruptible(2 * HZ);
WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
- WARN_ON_ONCE((i <= max_cswd_read_retries) !=
+ WARN_ON_ONCE((i <= max_retries) !=
!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
wdtest_ktime_clocksource_reset();
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 4ef06651ad07..e5b260aa0e02 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -210,9 +210,6 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-ulong max_cswd_read_retries = 2;
-module_param(max_cswd_read_retries, ulong, 0644);
-EXPORT_SYMBOL_GPL(max_cswd_read_retries);
static int verify_n_cpus = 8;
module_param(verify_n_cpus, int, 0644);
@@ -224,11 +221,12 @@ enum wd_read_status {
static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
{
- unsigned int nretries;
+ unsigned int nretries, max_retries;
u64 wd_end, wd_end2, wd_delta;
int64_t wd_delay, wd_seq_delay;
- for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
+ max_retries = clocksource_get_max_watchdog_retry();
+ for (nretries = 0; nretries <= max_retries; nretries++) {
local_irq_disable();
*wdnow = watchdog->read(watchdog);
*csnow = cs->read(cs);
@@ -240,7 +238,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
watchdog->shift);
if (wd_delay <= WATCHDOG_MAX_SKEW) {
- if (nretries > 1 || nretries >= max_cswd_read_retries) {
+ if (nretries > 1 || nretries >= max_retries) {
pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
smp_processor_id(), watchdog->name, nretries);
}