diff options
author | Donald Sharp <sharpd@nvidia.com> | 2021-02-02 18:56:06 +0100 |
---|---|---|
committer | Donald Sharp <sharpd@nvidia.com> | 2021-03-26 16:41:57 +0100 |
commit | 039d547f6f1b7fe4b96cc22ac5a6ef8d18d5cf97 (patch) | |
tree | f5614f80d048b8a3c4906f06284c45ef6373ec60 /lib | |
parent | Merge pull request #8154 from AnuradhaKaruppiah/evpn-mh-irb-2 (diff) | |
download | frr-039d547f6f1b7fe4b96cc22ac5a6ef8d18d5cf97.tar.xz frr-039d547f6f1b7fe4b96cc22ac5a6ef8d18d5cf97.zip |
lib: Differentiate between real and cpu bound processes
When generating SLOW_THREAD warnings let's differentiate between
a cpu bound process and a wall bound process. Effectively
a slow thread can now be a process in FRR doing lots of work( cpu bound )
or wall bound ( the cpu is heavy load and a FRR process may be pre-empted
and never scheduled ).
Signed-off-by: Donald Sharp <sharpd@nvidia.com>
Diffstat (limited to 'lib')
-rw-r--r-- | lib/lib_errors.c | 12 | ||||
-rw-r--r-- | lib/lib_errors.h | 3 | ||||
-rw-r--r-- | lib/thread.c | 22 | ||||
-rw-r--r-- | lib/vty.c | 12 |
4 files changed, 38 insertions, 11 deletions
diff --git a/lib/lib_errors.c b/lib/lib_errors.c index 6e5088142..17695e660 100644 --- a/lib/lib_errors.c +++ b/lib/lib_errors.c @@ -45,9 +45,15 @@ static struct log_ref ferr_lib_warn[] = { .suggestion = "Gather log data and open an Issue. restart FRR", }, { - .code = EC_LIB_SLOW_THREAD, - .title = "The Event subsystem has detected a slow process", - .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug, or some combination therof.", + .code = EC_LIB_SLOW_THREAD_CPU, + .title = "The Event subsystem has detected a slow cpu time process", + .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug, or some combination therof. In this case total CPU time was over 5 seconds. Which indicates that FRR is very busy doing some work and should be addressed", + .suggestion = "Gather log data and open an Issue", + }, + { + .code = EC_LIB_SLOW_THREAD_WALL, + .title = "The Event subsystem has detected a slow wall time process", + .description = "The Event subsystem has detected a slow process, this typically indicates that FRR is having trouble completing work in a timely manner. This can be either a misconfiguration, bug or some combination therof. In this case total WALL time was over 5 seconds. Which indicates that FRR might be having trouble being scheduled or some system call is delaying", .suggestion = "Gather log data and open an Issue", }, { diff --git a/lib/lib_errors.h b/lib/lib_errors.h index 4730b6aa3..9f0f58d20 100644 --- a/lib/lib_errors.h +++ b/lib/lib_errors.h @@ -44,7 +44,8 @@ enum lib_log_refs { EC_LIB_SNMP, EC_LIB_STREAM, EC_LIB_LINUX_NS, - EC_LIB_SLOW_THREAD, + EC_LIB_SLOW_THREAD_CPU, + EC_LIB_SLOW_THREAD_WALL, EC_LIB_NO_THREAD, EC_LIB_RMAP_RECURSION_LIMIT, EC_LIB_BACKUP_CONFIG, diff --git a/lib/thread.c b/lib/thread.c index 866090341..7a3d50419 100644 --- a/lib/thread.c +++ b/lib/thread.c @@ -1850,15 +1850,29 @@ void thread_call(struct thread *thread) memory_order_seq_cst); #ifdef CONSUMED_TIME_CHECK - if (realtime > CONSUMED_TIME_CHECK) { + if (cputime > CONSUMED_TIME_CHECK) { /* - * We have a CPU Hog on our hands. + * We have a CPU Hog on our hands. The time FRR + * has spent doing actual work ( not sleeping ) + * is greater than 5 seconds. * Whinge about it now, so we're aware this is yet another task * to fix. */ flog_warn( - EC_LIB_SLOW_THREAD, - "SLOW THREAD: task %s (%lx) ran for %lums (cpu time %lums)", + EC_LIB_SLOW_THREAD_CPU, + "CPU HOG: task %s (%lx) ran for %lums (cpu time %lums)", + thread->xref->funcname, (unsigned long)thread->func, + realtime / 1000, cputime / 1000); + } else if (realtime > CONSUMED_TIME_CHECK) { + /* + * The runtime for a task is greater than 5 seconds, but + * the cpu time is under 5 seconds. Let's whine + * about this because this could imply some sort of + * scheduling issue. + */ + flog_warn( + EC_LIB_SLOW_THREAD_WALL, + "STARVATION: task %s (%lx) ran for %lums (cpu time %lums)", thread->xref->funcname, (unsigned long)thread->func, realtime / 1000, cputime / 1000); } @@ -512,13 +512,19 @@ static int vty_command(struct vty *vty, char *buf) #ifdef CONSUMED_TIME_CHECK GETRUSAGE(&after); - if ((realtime = thread_consumed_time(&after, &before, &cputime)) - > CONSUMED_TIME_CHECK) + realtime = thread_consumed_time(&after, &before, &cputime); + if (cputime > CONSUMED_TIME_CHECK) { /* Warn about CPU hog that must be fixed. */ flog_warn( - EC_LIB_SLOW_THREAD, + EC_LIB_SLOW_THREAD_CPU, "SLOW COMMAND: command took %lums (cpu time %lums): %s", realtime / 1000, cputime / 1000, buf); + } else if (realtime > CONSUMED_TIME_CHECK) { + flog_warn( + EC_LIB_SLOW_THREAD_WALL, + "SLOW COMMAND: command took %lums (cpu time %lums): %s", + realtime / 1000, cputime / 1000, buf); + } } #endif /* CONSUMED_TIME_CHECK */ |