summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorDonald Sharp <sharpd@nvidia.com>2021-02-02 18:56:06 +0100
committerDonald Sharp <sharpd@nvidia.com>2021-03-26 16:41:57 +0100
commit039d547f6f1b7fe4b96cc22ac5a6ef8d18d5cf97 (patch)
treef5614f80d048b8a3c4906f06284c45ef6373ec60 /lib
parentMerge pull request #8154 from AnuradhaKaruppiah/evpn-mh-irb-2 (diff)
downloadfrr-039d547f6f1b7fe4b96cc22ac5a6ef8d18d5cf97.tar.xz
frr-039d547f6f1b7fe4b96cc22ac5a6ef8d18d5cf97.zip
lib: Differentiate between real and cpu bound processes
When generating SLOW_THREAD warnings let's differentiate between a cpu bound process and a wall bound process. Effectively a slow thread can now be a process in FRR doing lots of work( cpu bound ) or wall bound ( the cpu is heavy load and a FRR process may be pre-empted and never scheduled ). Signed-off-by: Donald Sharp <sharpd@nvidia.com>
Diffstat (limited to 'lib')
-rw-r--r--lib/lib_errors.c12
-rw-r--r--lib/lib_errors.h3
-rw-r--r--lib/thread.c22
-rw-r--r--lib/vty.c12
4 files changed, 38 insertions, 11 deletions
diff --git a/lib/lib_errors.c b/lib/lib_errors.c
index 6e5088142..17695e660 100644
--- a/lib/lib_errors.c
+++ b/lib/lib_errors.c
@@ -45,9 +45,15 @@ static struct log_ref ferr_lib_warn[] = {
.suggestion = "Gather log data and open an Issue. restart FRR",
},
{
- .code = EC_LIB_SLOW_THREAD,
- .title = "The Event subsystem has detected a slow process",
- .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug, or some combination therof.",
+ .code = EC_LIB_SLOW_THREAD_CPU,
+ .title = "The Event subsystem has detected a slow cpu time process",
+ .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug, or some combination therof. In this case total CPU time was over 5 seconds. Which indicates that FRR is very busy doing some work and should be addressed",
+ .suggestion = "Gather log data and open an Issue",
+ },
+ {
+ .code = EC_LIB_SLOW_THREAD_WALL,
+ .title = "The Event subsystem has detected a slow wall time process",
+ .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug or some combination therof. In this case total WALL time was over 5 seconds. Which indicates that FRR might be having trouble being scheduled or some system call is delaying",
.suggestion = "Gather log data and open an Issue",
},
{
diff --git a/lib/lib_errors.h b/lib/lib_errors.h
index 4730b6aa3..9f0f58d20 100644
--- a/lib/lib_errors.h
+++ b/lib/lib_errors.h
@@ -44,7 +44,8 @@ enum lib_log_refs {
EC_LIB_SNMP,
EC_LIB_STREAM,
EC_LIB_LINUX_NS,
- EC_LIB_SLOW_THREAD,
+ EC_LIB_SLOW_THREAD_CPU,
+ EC_LIB_SLOW_THREAD_WALL,
EC_LIB_NO_THREAD,
EC_LIB_RMAP_RECURSION_LIMIT,
EC_LIB_BACKUP_CONFIG,
diff --git a/lib/thread.c b/lib/thread.c
index 866090341..7a3d50419 100644
--- a/lib/thread.c
+++ b/lib/thread.c
@@ -1850,15 +1850,29 @@ void thread_call(struct thread *thread)
memory_order_seq_cst);
#ifdef CONSUMED_TIME_CHECK
- if (realtime > CONSUMED_TIME_CHECK) {
+ if (cputime > CONSUMED_TIME_CHECK) {
/*
- * We have a CPU Hog on our hands.
+ * We have a CPU Hog on our hands. The time FRR
+ * has spent doing actual work ( not sleeping )
+ * is greater than 5 seconds.
* Whinge about it now, so we're aware this is yet another task
* to fix.
*/
flog_warn(
- EC_LIB_SLOW_THREAD,
- "SLOW THREAD: task %s (%lx) ran for %lums (cpu time %lums)",
+ EC_LIB_SLOW_THREAD_CPU,
+ "CPU HOG: task %s (%lx) ran for %lums (cpu time %lums)",
+ thread->xref->funcname, (unsigned long)thread->func,
+ realtime / 1000, cputime / 1000);
+ } else if (realtime > CONSUMED_TIME_CHECK) {
+ /*
+ * The runtime for a task is greater than 5 seconds, but
+ * the cpu time is under 5 seconds. Let's whine
+ * about this because this could imply some sort of
+ * scheduling issue.
+ */
+ flog_warn(
+ EC_LIB_SLOW_THREAD_WALL,
+ "STARVATION: task %s (%lx) ran for %lums (cpu time %lums)",
thread->xref->funcname, (unsigned long)thread->func,
realtime / 1000, cputime / 1000);
}
diff --git a/lib/vty.c b/lib/vty.c
index d44cc904c..05e495c45 100644
--- a/lib/vty.c
+++ b/lib/vty.c
@@ -512,13 +512,19 @@ static int vty_command(struct vty *vty, char *buf)
#ifdef CONSUMED_TIME_CHECK
GETRUSAGE(&after);
- if ((realtime = thread_consumed_time(&after, &before, &cputime))
- > CONSUMED_TIME_CHECK)
+ realtime = thread_consumed_time(&after, &before, &cputime);
+ if (cputime > CONSUMED_TIME_CHECK) {
/* Warn about CPU hog that must be fixed. */
flog_warn(
- EC_LIB_SLOW_THREAD,
+ EC_LIB_SLOW_THREAD_CPU,
"SLOW COMMAND: command took %lums (cpu time %lums): %s",
realtime / 1000, cputime / 1000, buf);
+ } else if (realtime > CONSUMED_TIME_CHECK) {
+ flog_warn(
+ EC_LIB_SLOW_THREAD_WALL,
+ "SLOW COMMAND: command took %lums (cpu time %lums): %s",
+ realtime / 1000, cputime / 1000, buf);
+ }
}
#endif /* CONSUMED_TIME_CHECK */