summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJafar Al-Gharaibeh <jafar@atcorp.com>2022-01-21 16:01:42 +0100
committerGitHub <noreply@github.com>2022-01-21 16:01:42 +0100
commitcdaa204effe98908a2396b528fdfe0d8dfe4393c (patch)
tree7943c9b7a689662f315823ea1fa912816994b823
parentMerge pull request #10392 from pguibert6WIND/isis_vrf_change (diff)
parentlib, zebra: Add ability to tell thread system to ignore late timers (diff)
downloadfrr-cdaa204effe98908a2396b528fdfe0d8dfe4393c.tar.xz
frr-cdaa204effe98908a2396b528fdfe0d8dfe4393c.zip
Merge pull request #8011 from donaldsharp/starvation
lib: Figure out if we are being starved for cpu
-rw-r--r--lib/lib_errors.c6
-rw-r--r--lib/lib_errors.h1
-rw-r--r--lib/thread.c20
-rw-r--r--lib/thread.h6
-rw-r--r--lib/workqueue.c5
-rw-r--r--zebra/interface.c8
6 files changed, 43 insertions, 3 deletions
diff --git a/lib/lib_errors.c b/lib/lib_errors.c
index a139b9a14..acc9a05c3 100644
--- a/lib/lib_errors.c
+++ b/lib/lib_errors.c
@@ -57,6 +57,12 @@ static struct log_ref ferr_lib_warn[] = {
.suggestion = "Gather log data and open an Issue",
},
{
+ .code = EC_LIB_STARVE_THREAD,
+ .title = "The Event subsystem has detected a thread starvation issue",
+ .description = "The event subsystem has detected a thread starvation issue. This typically indicates that the system FRR is running on is heavily loaded and this load might be impacting FRR's ability to handle events in a timely fashion",
+ .suggestion = "Gather log data and open an Issue",
+ },
+ {
.code = EC_LIB_NO_THREAD,
.title = "The Event subsystem has detected an internal FD problem",
.description = "The Event subsystem has detected a file descriptor read/write event without an associated handling function. This is a bug, please collect log data and open an issue.",
diff --git a/lib/lib_errors.h b/lib/lib_errors.h
index 9f0f58d20..64ac6c1ce 100644
--- a/lib/lib_errors.h
+++ b/lib/lib_errors.h
@@ -46,6 +46,7 @@ enum lib_log_refs {
EC_LIB_LINUX_NS,
EC_LIB_SLOW_THREAD_CPU,
EC_LIB_SLOW_THREAD_WALL,
+ EC_LIB_STARVE_THREAD,
EC_LIB_NO_THREAD,
EC_LIB_RMAP_RECURSION_LIMIT,
EC_LIB_BACKUP_CONFIG,
diff --git a/lib/thread.c b/lib/thread.c
index 77e34f48f..376f61c24 100644
--- a/lib/thread.c
+++ b/lib/thread.c
@@ -787,6 +787,7 @@ static struct thread *thread_get(struct thread_master *m, uint8_t type,
thread->arg = arg;
thread->yield = THREAD_YIELD_TIME_SLOT; /* default */
thread->ref = NULL;
+ thread->ignore_timer_late = false;
/*
* So if the passed in funcname is not what we have
@@ -1651,12 +1652,31 @@ static void thread_process_io(struct thread_master *m, unsigned int num)
static unsigned int thread_process_timers(struct thread_master *m,
struct timeval *timenow)
{
+ struct timeval prev = *timenow;
+ bool displayed = false;
struct thread *thread;
unsigned int ready = 0;
while ((thread = thread_timer_list_first(&m->timer))) {
if (timercmp(timenow, &thread->u.sands, <))
break;
+ prev = thread->u.sands;
+ prev.tv_sec += 4;
+ /*
+ * If the timer would have popped 4 seconds in the
+ * past then we are in a situation where we are
+ * really getting behind on handling of events.
+ * Let's log it and do the right thing with it.
+ */
+ if (!displayed && !thread->ignore_timer_late &&
+ timercmp(timenow, &prev, >)) {
+ flog_warn(
+ EC_LIB_STARVE_THREAD,
+ "Thread Starvation: %pTHD was scheduled to pop greater than 4s ago",
+ thread);
+ displayed = true;
+ }
+
thread_timer_list_pop(&m->timer);
thread->type = THREAD_READY;
thread_list_add_tail(&m->ready, thread);
diff --git a/lib/thread.h b/lib/thread.h
index 49a70696d..660f8bd28 100644
--- a/lib/thread.h
+++ b/lib/thread.h
@@ -126,6 +126,7 @@ struct thread {
unsigned long yield; /* yield time in microseconds */
const struct xref_threadsched *xref; /* origin location */
pthread_mutex_t mtx; /* mutex for thread.c functions */
+ bool ignore_timer_late;
};
#ifdef _FRR_ATTRIBUTE_PRINTFRR
@@ -285,6 +286,11 @@ extern bool thread_is_scheduled(struct thread *thread);
/* Debug signal mask */
void debug_signals(const sigset_t *sigs);
+static inline void thread_ignore_late_timer(struct thread *thread)
+{
+ thread->ignore_timer_late = true;
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/workqueue.c b/lib/workqueue.c
index 2a8326c05..86afe4082 100644
--- a/lib/workqueue.c
+++ b/lib/workqueue.c
@@ -135,10 +135,11 @@ static int work_queue_schedule(struct work_queue *wq, unsigned int delay)
/* Schedule timer if there's a delay, otherwise just schedule
* as an 'event'
*/
- if (delay > 0)
+ if (delay > 0) {
thread_add_timer_msec(wq->master, work_queue_run, wq,
delay, &wq->thread);
- else
+ thread_ignore_late_timer(wq->thread);
+ } else
thread_add_event(wq->master, work_queue_run, wq, 0,
&wq->thread);
diff --git a/zebra/interface.c b/zebra/interface.c
index 8b5dbabb9..534953d90 100644
--- a/zebra/interface.c
+++ b/zebra/interface.c
@@ -91,9 +91,12 @@ static int if_zebra_speed_update(struct thread *thread)
changed = true;
}
- if (changed || new_speed == UINT32_MAX)
+ if (changed || new_speed == UINT32_MAX) {
thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 5,
&zif->speed_update);
+ thread_ignore_late_timer(zif->speed_update);
+ }
+
return 1;
}
@@ -187,6 +190,8 @@ static int if_zebra_new_hook(struct interface *ifp)
*/
thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 15,
&zebra_if->speed_update);
+ thread_ignore_late_timer(zebra_if->speed_update);
+
return 0;
}
@@ -1074,6 +1079,7 @@ void if_up(struct interface *ifp)
thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 0,
&zif->speed_update);
+ thread_ignore_late_timer(zif->speed_update);
}
/* Interface goes down. We have to manage different behavior of based