diff options
author | Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> | 2021-04-06 07:59:59 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-06 07:59:59 +0200 |
commit | 9d5ae3a1217b188849bf99c19e3d40d0558369a6 (patch) | |
tree | e480ba75b807c750b558b8b18477cc0838277493 | |
parent | test: check if the unit file fuzzer corpora is up to date (diff) | |
parent | oomd: threshold swap kill candidates to usages of more than 5% (diff) | |
download | systemd-9d5ae3a1217b188849bf99c19e3d40d0558369a6.tar.xz systemd-9d5ae3a1217b188849bf99c19e3d40d0558369a6.zip |
Merge pull request #19126 from anitazha/oomdimprovements
systemd-oomd post-test week improvements
-rw-r--r-- | man/oomd.conf.xml | 8 | ||||
-rw-r--r-- | src/oom/oomd-manager.c | 291 | ||||
-rw-r--r-- | src/oom/oomd-manager.h | 16 | ||||
-rw-r--r-- | src/oom/oomd-util.c | 67 | ||||
-rw-r--r-- | src/oom/oomd-util.h | 38 | ||||
-rw-r--r-- | src/oom/oomd.c | 3 | ||||
-rw-r--r-- | src/oom/test-oomd-util.c | 78 |
7 files changed, 278 insertions, 223 deletions
diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml index 6156c98fbd..27914e0699 100644 --- a/man/oomd.conf.xml +++ b/man/oomd.conf.xml @@ -52,9 +52,9 @@ <listitem><para>Sets the limit for swap usage on the system before <command>systemd-oomd</command> will take action. If the fraction of swap used on the system is more than what is defined here, - <command>systemd-oomd</command> will act on eligible descendant control groups, starting from the - ones with the highest swap usage to the lowest swap usage. Which control groups are monitored and - what action gets taken depends on what the unit has configured for + <command>systemd-oomd</command> will act on eligible descendant control groups with swap usage greater + than 5% of total swap, starting from the ones with the highest swap usage. Which + control groups are monitored and what action gets taken depends on what the unit has configured for <varname>ManagedOOMSwap=</varname>. Takes a value specified in percent (when suffixed with "%"), permille ("‰") or permyriad ("‱"), between 0% and 100%, inclusive. Defaults to 90%.</para></listitem> </varlistentry> @@ -81,7 +81,7 @@ <listitem><para>Sets the amount of time a unit's control group needs to have exceeded memory pressure limits before <command>systemd-oomd</command> will take action. Memory pressure limits are defined by <varname>DefaultMemoryPressureLimit=</varname> and <varname>ManagedOOMMemoryPressureLimit=</varname>. - Defaults to 30 seconds when this property is unset or set to 0.</para></listitem> + Must be set to 0, or at least 1 second. Defaults to 30 seconds when unset or 0.</para></listitem> </varlistentry> </variablelist> diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c index c3e84aadde..49dc5eb26e 100644 --- a/src/oom/oomd-manager.c +++ b/src/oom/oomd-manager.c @@ -299,8 +299,7 @@ static int acquire_managed_oom_connect(Manager *m) { return 0; } -static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { - _cleanup_set_free_ Set *targets = NULL; +static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { Manager *m = userdata; usec_t usec_now; int r; @@ -313,7 +312,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo if (r < 0) return log_error_errno(r, "Failed to reset event timer: %m"); - r = sd_event_source_set_time_relative(s, INTERVAL_USEC); + r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC); if (r < 0) return log_error_errno(r, "Failed to set relative time for timer: %m"); @@ -324,97 +323,29 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo return log_error_errno(r, "Failed to acquire varlink connection: %m"); } - /* Update the cgroups used for detection/action */ - r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts); - if (r == -ENOMEM) - return log_oom(); - if (r < 0) - log_debug_errno(r, "Failed to update monitored swap cgroup contexts, ignoring: %m"); - - r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts); - if (r == -ENOMEM) - return log_oom(); - if (r < 0) - log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m"); - - r = update_monitored_cgroup_contexts_candidates( - m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); - if (r == -ENOMEM) - return log_oom(); - if (r < 0) - log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); - + /* We still try to acquire swap information for oomctl even if no units want swap monitoring */ r = oomd_system_context_acquire("/proc/swaps", &m->system_context); - /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM. + /* If there are no units depending on swap actions, the only error we exit on is ENOMEM. * Allow ENOENT in the event that swap is disabled on the system. */ - if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts))) - return log_error_errno(r, "Failed to acquire system context: %m"); - else if (r == -ENOENT) + if (r == -ENOENT) { zero(m->system_context); + return 0; + } else if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts))) + return log_error_errno(r, "Failed to acquire system context: %m"); - if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts)) - m->last_reclaim_at = usec_now; + /* Return early if nothing is requesting swap monitoring */ + if (hashmap_isempty(m->monitored_swap_cgroup_contexts)) + return 0; - /* If we're still recovering from a kill, don't try to kill again yet */ - if (m->post_action_delay_start > 0) { - if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now) - return 0; - else - m->post_action_delay_start = 0; - } - - r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets); - if (r == -ENOMEM) - return log_oom(); - if (r < 0) - log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m"); - else if (r == 1) { - /* Check if there was reclaim activity in the given interval. The concern is the following case: - * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending - * cgroup. Even after this, well-behaved processes will fault in recently resident pages and - * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need - * to kill something (it won't help anyways). */ - if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) { - OomdCGroupContext *t; - - SET_FOREACH(t, targets) { - _cleanup_free_ char *selected = NULL; - char ts[FORMAT_TIMESPAN_MAX]; - - log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity", - t->path, - LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10), - LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit), - format_timespan(ts, sizeof ts, - m->default_mem_pressure_duration_usec, - USEC_PER_SEC)); - - r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected); - if (r == -ENOMEM) - return log_oom(); - if (r < 0) - log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path); - else { - /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */ - m->post_action_delay_start = usec_now; - if (selected) - log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%" - " for > %s with reclaim activity", - selected, t->path, - LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10), - LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit), - format_timespan(ts, sizeof ts, - m->default_mem_pressure_duration_usec, - USEC_PER_SEC)); - return 0; - } - } - } - } + /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the + * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts + * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent + * nodes are the ones that matter). */ if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) { _cleanup_hashmap_free_ Hashmap *candidates = NULL; _cleanup_free_ char *selected = NULL; + uint64_t threshold; log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR, m->system_context.swap_used, m->system_context.swap_total, @@ -426,13 +357,13 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo if (r < 0) log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m"); - r = oomd_kill_by_swap_usage(candidates, m->dry_run, &selected); + threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100; + r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected); if (r == -ENOMEM) return log_oom(); if (r < 0) log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m"); else { - m->post_action_delay_start = usec_now; if (selected) log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than " PERMYRIAD_AS_PERCENT_FORMAT_STR, @@ -445,14 +376,183 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo return 0; } -static int monitor_cgroup_contexts(Manager *m) { +static void clear_candidate_hashmapp(Manager **m) { + if (*m) + hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates); +} + +static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) { + /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we + * update the candidate data (in which case clear_candidates will be NULL). */ + _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata; + _cleanup_set_free_ Set *targets = NULL; + bool in_post_action_delay = false; + Manager *m = userdata; + usec_t usec_now; + int r; + + assert(s); + assert(userdata); + + /* Reset timer */ + r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now); + if (r < 0) + return log_error_errno(r, "Failed to reset event timer: %m"); + + r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC); + if (r < 0) + return log_error_errno(r, "Failed to set relative time for timer: %m"); + + /* Reconnect if our connection dropped */ + if (!m->varlink) { + r = acquire_managed_oom_connect(m); + if (r < 0) + return log_error_errno(r, "Failed to acquire varlink connection: %m"); + } + + /* Return early if nothing is requesting memory pressure monitoring */ + if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts)) + return 0; + + /* Update the cgroups used for detection/action */ + r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m"); + + r = update_monitored_cgroup_contexts_candidates( + m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); + + /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale + * values and go on a kill storm. */ + if (m->mem_pressure_post_action_delay_start > 0) { + if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now) + in_post_action_delay = true; + else + m->mem_pressure_post_action_delay_start = 0; + } + + r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m"); + else if (r == 1 && !in_post_action_delay) { + OomdCGroupContext *t; + SET_FOREACH(t, targets) { + _cleanup_free_ char *selected = NULL; + char ts[FORMAT_TIMESPAN_MAX]; + + /* Check if there was reclaim activity in the given interval. The concern is the following case: + * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending + * cgroup. Even after this, well-behaved processes will fault in recently resident pages and + * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need + * to kill something (it won't help anyways). */ + if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC) + continue; + + log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity", + t->path, + LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10), + LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit), + format_timespan(ts, sizeof ts, + m->default_mem_pressure_duration_usec, + USEC_PER_SEC)); + + r = update_monitored_cgroup_contexts_candidates( + m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); + else + clear_candidates = NULL; + + r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path); + else { + /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */ + m->mem_pressure_post_action_delay_start = usec_now; + if (selected) + log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%" + " for > %s with reclaim activity", + selected, t->path, + LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10), + LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit), + format_timespan(ts, sizeof ts, + m->default_mem_pressure_duration_usec, + USEC_PER_SEC)); + return 0; + } + } + } else { + /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every + * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill + * might happen. + * Candidate cgroup data will continue to get updated during the post-action delay period in case + * pressure continues to be high after a kill. */ + OomdCGroupContext *c; + HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) { + if (c->mem_pressure_limit_hit_start == 0) + continue; + + r = update_monitored_cgroup_contexts_candidates( + m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m"); + else { + clear_candidates = NULL; + break; + } + } + } + + return 0; +} + +static int monitor_swap_contexts(Manager *m) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + int r; + + assert(m); + assert(m->event); + + r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m); + if (r < 0) + return r; + + r = sd_event_source_set_exit_on_failure(s, true); + if (r < 0) + return r; + + r = sd_event_source_set_enabled(s, SD_EVENT_ON); + if (r < 0) + return r; + + (void) sd_event_source_set_description(s, "oomd-swap-timer"); + + m->swap_context_event_source = TAKE_PTR(s); + return 0; +} + +static int monitor_memory_pressure_contexts(Manager *m) { _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; int r; assert(m); assert(m->event); - r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m); + r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m); if (r < 0) return r; @@ -464,9 +564,9 @@ static int monitor_cgroup_contexts(Manager *m) { if (r < 0) return r; - (void) sd_event_source_set_description(s, "oomd-timer"); + (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer"); - m->cgroup_context_event_source = TAKE_PTR(s); + m->mem_pressure_context_event_source = TAKE_PTR(s); return 0; } @@ -474,7 +574,8 @@ Manager* manager_free(Manager *m) { assert(m); varlink_close_unref(m->varlink); - sd_event_source_unref(m->cgroup_context_event_source); + sd_event_source_unref(m->swap_context_event_source); + sd_event_source_unref(m->mem_pressure_context_event_source); sd_event_unref(m->event); bus_verify_polkit_async_registry_free(m->polkit_registry); @@ -596,7 +697,11 @@ int manager_start( if (r < 0) return r; - r = monitor_cgroup_contexts(m); + r = monitor_memory_pressure_contexts(m); + if (r < 0) + return r; + + r = monitor_swap_contexts(m); if (r < 0) return r; diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h index 9c580c8a24..dc170f2bda 100644 --- a/src/oom/oomd-manager.h +++ b/src/oom/oomd-manager.h @@ -7,10 +7,9 @@ #include "varlink.h" /* Polling interval for monitoring stats */ -#define INTERVAL_USEC (1 * USEC_PER_SEC) - -/* Used to weight the averages */ -#define AVERAGE_SIZE_DECAY 4 +#define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */ +/* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */ +#define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC) /* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the * percentage of time all tasks were delayed (i.e. unproductive). @@ -20,6 +19,9 @@ #define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60 #define DEFAULT_SWAP_USED_LIMIT_PERCENT 90 +/* Only tackle candidates with large swap usage. */ +#define THRESHOLD_SWAP_USED_PERCENT 5 + #define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC) #define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC) @@ -44,10 +46,10 @@ struct Manager { OomdSystemContext system_context; - usec_t last_reclaim_at; - usec_t post_action_delay_start; + usec_t mem_pressure_post_action_delay_start; - sd_event_source *cgroup_context_event_source; + sd_event_source *swap_context_event_source; + sd_event_source *mem_pressure_context_event_source; Varlink *varlink; }; diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c index 894d23a83a..5bf81479c9 100644 --- a/src/oom/oomd-util.c +++ b/src/oom/oomd-util.c @@ -82,17 +82,17 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) { if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) { usec_t diff; - if (ctx->last_hit_mem_pressure_limit == 0) - ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC); + if (ctx->mem_pressure_limit_hit_start == 0) + ctx->mem_pressure_limit_hit_start = now(CLOCK_MONOTONIC); - diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit; + diff = now(CLOCK_MONOTONIC) - ctx->mem_pressure_limit_hit_start; if (diff >= duration) { r = set_put(targets, ctx); if (r < 0) return -ENOMEM; } } else - ctx->last_hit_mem_pressure_limit = 0; + ctx->mem_pressure_limit_hit_start = 0; } if (!set_isempty(targets)) { @@ -104,34 +104,21 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) { return 0; } -bool oomd_memory_reclaim(Hashmap *h) { - uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0; - OomdCGroupContext *ctx; - - assert(h); - - /* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values, - * there was reclaim activity. Used along with pressure checks to decide whether to take action. */ +uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) { + uint64_t last_pgscan; - HASHMAP_FOREACH(ctx, h) { - uint64_t sum; + assert(c); - sum = pgscan + ctx->pgscan; - if (sum < pgscan || sum < ctx->pgscan) - pgscan_of++; /* count overflows */ - pgscan = sum; - - sum = last_pgscan + ctx->last_pgscan; - if (sum < last_pgscan || sum < ctx->last_pgscan) - last_pgscan_of++; /* count overflows */ - last_pgscan = sum; + /* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero. + * pgscan is monotonic and in practice should not decrease (except in the recreation case). */ + last_pgscan = c->last_pgscan; + if (c->last_pgscan > c->pgscan) { + log_debug("Last pgscan %"PRIu64" greater than current pgscan %"PRIu64" for %s. Using last pgscan of zero.", + c->last_pgscan, c->pgscan, c->path); + last_pgscan = 0; } - /* overflow counts are the same, return sums comparison */ - if (last_pgscan_of == pgscan_of) - return pgscan > last_pgscan; - - return pgscan_of > last_pgscan_of; + return c->pgscan - last_pgscan; } bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) { @@ -246,7 +233,7 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char return ret; } -int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) { +int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) { _cleanup_free_ OomdCGroupContext **sorted = NULL; int n, r, ret = 0; @@ -257,12 +244,12 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) { if (n < 0) return n; - /* Try to kill cgroups with non-zero swap usage until we either succeed in - * killing or we get to a cgroup with no swap usage. */ + /* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with + * no swap usage. Threshold killing only cgroups with more than threshold swap usage. */ for (int i = 0; i < n; i++) { - /* Skip over cgroups with no resource usage. - * Continue break since there might be "avoid" cgroups at the end. */ - if (sorted[i]->swap_usage == 0) + /* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid" + * cgroups at the end. */ + if (sorted[i]->swap_usage <= threshold_usage) continue; r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); @@ -430,9 +417,13 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) if (old_ctx) { curr_ctx->last_pgscan = old_ctx->pgscan; curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit; - curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit; + curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start; + curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim; } + if (oomd_pgscan_rate(curr_ctx) > 0) + curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC); + r = hashmap_put(new_h, curr_ctx->path, curr_ctx); if (r < 0) return r; @@ -456,7 +447,11 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_ ctx->last_pgscan = old_ctx->pgscan; ctx->mem_pressure_limit = old_ctx->mem_pressure_limit; - ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit; + ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start; + ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim; + + if (oomd_pgscan_rate(ctx) > 0) + ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC); } } diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h index 51423130d1..81fdc5e088 100644 --- a/src/oom/oomd-util.h +++ b/src/oom/oomd-util.h @@ -32,10 +32,10 @@ struct OomdCGroupContext { ManagedOOMPreference preference; - /* These are only used by oomd_pressure_above for acting on high memory pressure. */ + /* These are only used for acting on high memory pressure. */ loadavg_t mem_pressure_limit; - usec_t mem_pressure_duration_usec; - usec_t last_hit_mem_pressure_limit; + usec_t mem_pressure_limit_hit_start; + usec_t last_had_mem_reclaim; }; struct OomdSystemContext { @@ -51,23 +51,22 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free); /* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret` * if any of them have exceeded their supplied memory pressure limits for the `duration` length of time. - * `last_hit_mem_pressure_limit` is updated accordingly for each entry when the limit is exceeded, and when it returns + * `mem_pressure_limit_hit_start` is updated accordingly for the first time the limit is exceeded, and when it returns * below the limit. * Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `duration`. * Returns -ENOMEM for allocation errors. */ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret); -/* Sum up current OomdCGroupContexts' pgscan values and last interval's pgscan values in `h`. Returns true if the - * current sum is higher than the last interval's sum (there was some reclaim activity). */ -bool oomd_memory_reclaim(Hashmap *h); - /* Returns true if the amount of swap free is below the permyriad of swap specified by `threshold_permyriad`. */ bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad); +/* Returns pgscan - last_pgscan, accounting for corner cases. */ +uint64_t oomd_pgscan_rate(const OomdCGroupContext *c); + /* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end * (after the smallest values). */ static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { - uint64_t last1, last2; + uint64_t diff1, diff2; int r; assert(c1); @@ -77,22 +76,9 @@ static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const if (r != 0) return r; - /* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero. */ - last2 = (*c2)->last_pgscan; - if ((*c2)->last_pgscan > (*c2)->pgscan) { - log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.", - (*c2)->last_pgscan, (*c2)->pgscan, (*c2)->path); - last2 = 0; - } - - last1 = (*c1)->last_pgscan; - if ((*c1)->last_pgscan > (*c1)->pgscan) { - log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.", - (*c1)->last_pgscan, (*c1)->pgscan, (*c1)->path); - last1 = 0; - } - - r = CMP((*c2)->pgscan - last2, (*c1)->pgscan - last1); + diff1 = oomd_pgscan_rate(*c1); + diff2 = oomd_pgscan_rate(*c2); + r = CMP(diff2, diff1); if (r != 0) return r; @@ -125,7 +111,7 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run); * everything in `h` is a candidate. * Returns the killed cgroup in ret_selected. */ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected); -int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected); +int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected); int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret); int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret); diff --git a/src/oom/oomd.c b/src/oom/oomd.c index 6e2a5889d1..deb7b094d5 100644 --- a/src/oom/oomd.c +++ b/src/oom/oomd.c @@ -155,6 +155,9 @@ static int run(int argc, char *argv[]) { assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0); + if (arg_mem_pressure_usec > 0 && arg_mem_pressure_usec < 1 * USEC_PER_SEC) + log_error_errno(SYNTHETIC_ERRNO(EINVAL), "DefaultMemoryPressureDurationSec= must be 0 or at least 1s"); + r = manager_new(&m); if (r < 0) return log_error_errno(r, "Failed to create manager: %m"); diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c index 278fc305b3..7ebea29c1a 100644 --- a/src/oom/test-oomd-util.c +++ b/src/oom/test-oomd-util.c @@ -160,9 +160,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == -EEXIST); /* make sure certain values from h1 get updated in h2 */ - c1->pgscan = 5555; + c1->pgscan = UINT64_MAX; c1->mem_pressure_limit = 6789; - c1->last_hit_mem_pressure_limit = 42; + c1->mem_pressure_limit_hit_start = 42; + c1->last_had_mem_reclaim = 888; assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops)); assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0); c1 = hashmap_get(h1, cgroup); @@ -170,9 +171,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { assert_se(c1); assert_se(c2); assert_se(c1 != c2); - assert_se(c2->last_pgscan == 5555); + assert_se(c2->last_pgscan == UINT64_MAX); assert_se(c2->mem_pressure_limit == 6789); - assert_se(c2->last_hit_mem_pressure_limit == 42); + assert_se(c2->mem_pressure_limit_hit_start == 42); + assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */ /* Assert that avoid/omit are not set if the cgroup is not owned by root */ if (test_xattrs) { @@ -189,20 +191,22 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) { char **paths = STRV_MAKE("/0.slice", "/1.slice"); - OomdCGroupContext ctx_old[3] = { + OomdCGroupContext ctx_old[2] = { { .path = paths[0], .mem_pressure_limit = 5, - .last_hit_mem_pressure_limit = 777, + .mem_pressure_limit_hit_start = 777, + .last_had_mem_reclaim = 888, .pgscan = 57 }, { .path = paths[1], .mem_pressure_limit = 6, - .last_hit_mem_pressure_limit = 888, + .mem_pressure_limit_hit_start = 888, + .last_had_mem_reclaim = 888, .pgscan = 42 }, }; - OomdCGroupContext ctx_new[3] = { + OomdCGroupContext ctx_new[2] = { { .path = paths[0], - .pgscan = 100 }, + .pgscan = 57 }, { .path = paths[1], .pgscan = 101 }, }; @@ -221,13 +225,15 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) { assert_se(c_new = hashmap_get(h_new, "/0.slice")); assert_se(c_old->pgscan == c_new->last_pgscan); assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit); - assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit); + assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start); + assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim); assert_se(c_old = hashmap_get(h_old, "/1.slice")); assert_se(c_new = hashmap_get(h_new, "/1.slice")); assert_se(c_old->pgscan == c_new->last_pgscan); assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit); - assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit); + assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start); + assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim); } static void test_oomd_system_context_acquire(void) { @@ -290,7 +296,7 @@ static void test_oomd_pressure_above(void) { assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1); assert_se(set_contains(t1, &ctx[0]) == true); assert_se(c = hashmap_get(h1, "/herp.slice")); - assert_se(c->last_hit_mem_pressure_limit > 0); + assert_se(c->mem_pressure_limit_hit_start > 0); /* Low memory pressure */ assert_se(h2 = hashmap_new(&string_hash_ops)); @@ -298,7 +304,7 @@ static void test_oomd_pressure_above(void) { assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0); assert_se(t2 == NULL); assert_se(c = hashmap_get(h2, "/derp.slice")); - assert_se(c->last_hit_mem_pressure_limit == 0); + assert_se(c->mem_pressure_limit_hit_start == 0); /* High memory pressure w/ multiple cgroups */ assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0); @@ -306,50 +312,9 @@ static void test_oomd_pressure_above(void) { assert_se(set_contains(t3, &ctx[0]) == true); assert_se(set_size(t3) == 1); assert_se(c = hashmap_get(h1, "/herp.slice")); - assert_se(c->last_hit_mem_pressure_limit > 0); + assert_se(c->mem_pressure_limit_hit_start > 0); assert_se(c = hashmap_get(h1, "/derp.slice")); - assert_se(c->last_hit_mem_pressure_limit == 0); -} - -static void test_oomd_memory_reclaim(void) { - _cleanup_hashmap_free_ Hashmap *h1 = NULL; - char **paths = STRV_MAKE("/0.slice", - "/1.slice", - "/2.slice", - "/3.slice", - "/4.slice"); - - OomdCGroupContext ctx[5] = { - { .path = paths[0], - .last_pgscan = 100, - .pgscan = 100 }, - { .path = paths[1], - .last_pgscan = 100, - .pgscan = 100 }, - { .path = paths[2], - .last_pgscan = 77, - .pgscan = 33 }, - { .path = paths[3], - .last_pgscan = UINT64_MAX, - .pgscan = 100 }, - { .path = paths[4], - .last_pgscan = 100, - .pgscan = UINT64_MAX }, - }; - - assert_se(h1 = hashmap_new(&string_hash_ops)); - assert_se(hashmap_put(h1, paths[0], &ctx[0]) >= 0); - assert_se(hashmap_put(h1, paths[1], &ctx[1]) >= 0); - assert_se(oomd_memory_reclaim(h1) == false); - - assert_se(hashmap_put(h1, paths[2], &ctx[2]) >= 0); - assert_se(oomd_memory_reclaim(h1) == false); - - assert_se(hashmap_put(h1, paths[4], &ctx[4]) >= 0); - assert_se(oomd_memory_reclaim(h1) == true); - - assert_se(hashmap_put(h1, paths[3], &ctx[3]) >= 0); - assert_se(oomd_memory_reclaim(h1) == false); + assert_se(c->mem_pressure_limit_hit_start == 0); } static void test_oomd_swap_free_below(void) { @@ -468,7 +433,6 @@ int main(void) { test_oomd_update_cgroup_contexts_between_hashmaps(); test_oomd_system_context_acquire(); test_oomd_pressure_above(); - test_oomd_memory_reclaim(); test_oomd_swap_free_below(); test_oomd_sort_cgroups(); |