diff options
author | Jafar Al-Gharaibeh <jafar@atcorp.com> | 2022-01-21 16:01:42 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-21 16:01:42 +0100 |
commit | cdaa204effe98908a2396b528fdfe0d8dfe4393c (patch) | |
tree | 7943c9b7a689662f315823ea1fa912816994b823 | |
parent | Merge pull request #10392 from pguibert6WIND/isis_vrf_change (diff) | |
parent | lib, zebra: Add ability to tell thread system to ignore late timers (diff) | |
download | frr-cdaa204effe98908a2396b528fdfe0d8dfe4393c.tar.xz frr-cdaa204effe98908a2396b528fdfe0d8dfe4393c.zip |
Merge pull request #8011 from donaldsharp/starvation
lib: Figure out if we are being starved for cpu
-rw-r--r-- | lib/lib_errors.c | 6 | ||||
-rw-r--r-- | lib/lib_errors.h | 1 | ||||
-rw-r--r-- | lib/thread.c | 20 | ||||
-rw-r--r-- | lib/thread.h | 6 | ||||
-rw-r--r-- | lib/workqueue.c | 5 | ||||
-rw-r--r-- | zebra/interface.c | 8 |
6 files changed, 43 insertions, 3 deletions
diff --git a/lib/lib_errors.c b/lib/lib_errors.c index a139b9a14..acc9a05c3 100644 --- a/lib/lib_errors.c +++ b/lib/lib_errors.c @@ -57,6 +57,12 @@ static struct log_ref ferr_lib_warn[] = { .suggestion = "Gather log data and open an Issue", }, { + .code = EC_LIB_STARVE_THREAD, + .title = "The Event subsystem has detected a thread starvation issue", + .description = "The event subsystem has detected a thread starvation issue. This typically indicates that the system FRR is running on is heavily loaded and this load might be impacting FRR's ability to handle events in a timely fashion", + .suggestion = "Gather log data and open an Issue", + }, + { .code = EC_LIB_NO_THREAD, .title = "The Event subsystem has detected an internal FD problem", .description = "The Event subsystem has detected a file descriptor read/write event without an associated handling function. This is a bug, please collect log data and open an issue.", diff --git a/lib/lib_errors.h b/lib/lib_errors.h index 9f0f58d20..64ac6c1ce 100644 --- a/lib/lib_errors.h +++ b/lib/lib_errors.h @@ -46,6 +46,7 @@ enum lib_log_refs { EC_LIB_LINUX_NS, EC_LIB_SLOW_THREAD_CPU, EC_LIB_SLOW_THREAD_WALL, + EC_LIB_STARVE_THREAD, EC_LIB_NO_THREAD, EC_LIB_RMAP_RECURSION_LIMIT, EC_LIB_BACKUP_CONFIG, diff --git a/lib/thread.c b/lib/thread.c index 77e34f48f..376f61c24 100644 --- a/lib/thread.c +++ b/lib/thread.c @@ -787,6 +787,7 @@ static struct thread *thread_get(struct thread_master *m, uint8_t type, thread->arg = arg; thread->yield = THREAD_YIELD_TIME_SLOT; /* default */ thread->ref = NULL; + thread->ignore_timer_late = false; /* * So if the passed in funcname is not what we have @@ -1651,12 +1652,31 @@ static void thread_process_io(struct thread_master *m, unsigned int num) static unsigned int thread_process_timers(struct thread_master *m, struct timeval *timenow) { + struct timeval prev = *timenow; + bool displayed = false; struct thread *thread; unsigned int ready = 0; while ((thread = thread_timer_list_first(&m->timer))) { if (timercmp(timenow, &thread->u.sands, <)) break; + prev = thread->u.sands; + prev.tv_sec += 4; + /* + * If the timer would have popped 4 seconds in the + * past then we are in a situation where we are + * really getting behind on handling of events. + * Let's log it and do the right thing with it. + */ + if (!displayed && !thread->ignore_timer_late && + timercmp(timenow, &prev, >)) { + flog_warn( + EC_LIB_STARVE_THREAD, + "Thread Starvation: %pTHD was scheduled to pop greater than 4s ago", + thread); + displayed = true; + } + thread_timer_list_pop(&m->timer); thread->type = THREAD_READY; thread_list_add_tail(&m->ready, thread); diff --git a/lib/thread.h b/lib/thread.h index 49a70696d..660f8bd28 100644 --- a/lib/thread.h +++ b/lib/thread.h @@ -126,6 +126,7 @@ struct thread { unsigned long yield; /* yield time in microseconds */ const struct xref_threadsched *xref; /* origin location */ pthread_mutex_t mtx; /* mutex for thread.c functions */ + bool ignore_timer_late; }; #ifdef _FRR_ATTRIBUTE_PRINTFRR @@ -285,6 +286,11 @@ extern bool thread_is_scheduled(struct thread *thread); /* Debug signal mask */ void debug_signals(const sigset_t *sigs); +static inline void thread_ignore_late_timer(struct thread *thread) +{ + thread->ignore_timer_late = true; +} + #ifdef __cplusplus } #endif diff --git a/lib/workqueue.c b/lib/workqueue.c index 2a8326c05..86afe4082 100644 --- a/lib/workqueue.c +++ b/lib/workqueue.c @@ -135,10 +135,11 @@ static int work_queue_schedule(struct work_queue *wq, unsigned int delay) /* Schedule timer if there's a delay, otherwise just schedule * as an 'event' */ - if (delay > 0) + if (delay > 0) { thread_add_timer_msec(wq->master, work_queue_run, wq, delay, &wq->thread); - else + thread_ignore_late_timer(wq->thread); + } else thread_add_event(wq->master, work_queue_run, wq, 0, &wq->thread); diff --git a/zebra/interface.c b/zebra/interface.c index 8b5dbabb9..534953d90 100644 --- a/zebra/interface.c +++ b/zebra/interface.c @@ -91,9 +91,12 @@ static int if_zebra_speed_update(struct thread *thread) changed = true; } - if (changed || new_speed == UINT32_MAX) + if (changed || new_speed == UINT32_MAX) { thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 5, &zif->speed_update); + thread_ignore_late_timer(zif->speed_update); + } + return 1; } @@ -187,6 +190,8 @@ static int if_zebra_new_hook(struct interface *ifp) */ thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 15, &zebra_if->speed_update); + thread_ignore_late_timer(zebra_if->speed_update); + return 0; } @@ -1074,6 +1079,7 @@ void if_up(struct interface *ifp) thread_add_timer(zrouter.master, if_zebra_speed_update, ifp, 0, &zif->speed_update); + thread_ignore_late_timer(zif->speed_update); } /* Interface goes down. We have to manage different behavior of based |