From 3a5dda7a17cf3706f79b86293f29db02d61e0d48 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 22 Mar 2011 16:30:09 -0700 Subject: oom: prevent unnecessary oom kills or kernel panics This patch prevents unnecessary oom kills or kernel panics by reverting two commits: 495789a5 (oom: make oom_score to per-process value) cef1d352 (oom: multi threaded process coredump don't make deadlock) First, 495789a5 (oom: make oom_score to per-process value) ignores the fact that all threads in a thread group do not necessarily exit at the same time. It is imperative that select_bad_process() detect threads that are in the exit path, specifically those with PF_EXITING set, to prevent needlessly killing additional tasks. If a process is oom killed and the thread group leader exits, select_bad_process() cannot detect the other threads that are PF_EXITING by iterating over only processes. Thus, it currently chooses another task unnecessarily for oom kill or panics the machine when nothing else is eligible. By iterating over threads instead, it is possible to detect threads that are exiting and nominate them for oom kill so they get access to memory reserves. Second, cef1d352 (oom: multi threaded process coredump don't make deadlock) erroneously avoids making the oom killer a no-op when an eligible thread other than current isfound to be exiting. We want to detect this situation so that we may allow that exiting thread time to exit and free its memory; if it is able to exit on its own, that should free memory so current is no loner oom. If it is not able to exit on its own, the oom killer will nominate it for oom kill which, in this case, only means it will get access to memory reserves. Without this change, it is easy for the oom killer to unnecessarily target tasks when all threads of a victim don't exit before the thread group leader or, in the worst case, panic the machine. Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Andrey Vagin Cc: [2.6.38.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7dcca55ede7c..b5a7b5f46e7a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -292,11 +292,11 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long totalpages, struct mem_cgroup *mem, const nodemask_t *nodemask) { - struct task_struct *p; + struct task_struct *g, *p; struct task_struct *chosen = NULL; *ppoints = 0; - for_each_process(p) { + do_each_thread(g, p) { unsigned int points; if (oom_unkillable_task(p, mem, nodemask)) @@ -324,7 +324,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * the process of exiting and releasing its resources. * Otherwise we could get an easy OOM deadlock. */ - if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { + if ((p->flags & PF_EXITING) && p->mm) { if (p != current) return ERR_PTR(-1UL); @@ -337,7 +337,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, chosen = p; *ppoints = points; } - } + } while_each_thread(g, p); return chosen; } -- cgit v1.2.3 From 30e2b41f20b6238f51e7cffb879c7a0f0073f5fe Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 22 Mar 2011 16:30:11 -0700 Subject: oom: skip zombies when iterating tasklist We shouldn't defer oom killing if a thread has already detached its ->mm and still has TIF_MEMDIE set. Memory needs to be freed, so find kill other threads that pin the same ->mm or find another task to kill. Signed-off-by: Andrey Vagin Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Cc: [2.6.38.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b5a7b5f46e7a..d7f345e47e73 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -299,6 +299,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, do_each_thread(g, p) { unsigned int points; + if (!p->mm) + continue; if (oom_unkillable_task(p, mem, nodemask)) continue; @@ -324,7 +326,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * the process of exiting and releasing its resources. * Otherwise we could get an easy OOM deadlock. */ - if ((p->flags & PF_EXITING) && p->mm) { + if (p->flags & PF_EXITING) { if (p != current) return ERR_PTR(-1UL); -- cgit v1.2.3 From edd45544c6f09550df0a5491aa8a07af24767e73 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 22 Mar 2011 16:30:12 -0700 Subject: oom: avoid deferring oom killer if exiting task is being traced The oom killer naturally defers killing anything if it finds an eligible task that is already exiting and has yet to detach its ->mm. This avoids unnecessarily killing tasks when one is already in the exit path and may free enough memory that the oom killer is no longer needed. This is detected by PF_EXITING since threads that have already detached its ->mm are no longer considered at all. The problem with always deferring when a thread is PF_EXITING, however, is that it may never actually exit when being traced, specifically if another task is tracing it with PTRACE_O_TRACEEXIT. The oom killer does not want to defer in this case since there is no guarantee that thread will ever exit without intervention. This patch will now only defer the oom killer when a thread is PF_EXITING and no ptracer has stopped its progress in the exit path. It also ensures that a child is sacrificed for the chosen parent only if it has a different ->mm as the comment implies: this ensures that the thread group leader is always targeted appropriately. Signed-off-by: David Rientjes Reported-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Andrey Vagin Cc: [2.6.38.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d7f345e47e73..33b58615072c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -31,6 +31,7 @@ #include #include #include +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -316,22 +317,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (test_tsk_thread_flag(p, TIF_MEMDIE)) return ERR_PTR(-1UL); - /* - * This is in the process of releasing memory so wait for it - * to finish before killing some other task by mistake. - * - * However, if p is the current task, we allow the 'kill' to - * go ahead if it is exiting: this will simply set TIF_MEMDIE, - * which will allow it to gain access to memory reserves in - * the process of exiting and releasing its resources. - * Otherwise we could get an easy OOM deadlock. - */ if (p->flags & PF_EXITING) { - if (p != current) - return ERR_PTR(-1UL); - - chosen = p; - *ppoints = 1000; + /* + * If p is the current task and is in the process of + * releasing memory, we allow the "kill" to set + * TIF_MEMDIE, which will allow it to gain access to + * memory reserves. Otherwise, it may stall forever. + * + * The loop isn't broken here, however, in case other + * threads are found to have already been oom killed. + */ + if (p == current) { + chosen = p; + *ppoints = 1000; + } else { + /* + * If this task is not being ptraced on exit, + * then wait for it to finish before killing + * some other task unnecessarily. + */ + if (!(task_ptrace(p->group_leader) & + PT_TRACE_EXIT)) + return ERR_PTR(-1UL); + } } points = oom_badness(p, mem, nodemask, totalpages); @@ -493,6 +501,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; + if (child->mm == p->mm) + continue; /* * oom_badness() returns 0 if the thread is unkillable */ -- cgit v1.2.3 From ddd588b5dd55f14320379961e47683db4e4c1d90 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 22 Mar 2011 16:30:46 -0700 Subject: oom: suppress nodes that are not allowed from meminfo on oom kill The oom killer is extremely verbose for machines with a large number of cpus and/or nodes. This verbosity can often be harmful if it causes other important messages to be scrolled from the kernel log and incurs a signicant time delay, specifically for kernels with CONFIG_NODES_SHIFT > 8. This patch causes only memory information to be displayed for nodes that are allowed by current's cpuset when dumping the VM state. Information for all other nodes is irrelevant to the oom condition; we don't care if there's an abundance of memory elsewhere if we can't access it. This only affects the behavior of dumping memory information when an oom is triggered. Other dumps, such as for sysrq+m, still display the unfiltered form when using the existing show_mem() interface. Additionally, the per-cpu pageset statistics are extremely verbose in oom killer output, so it is now suppressed. This removes nodes_weight(current->mems_allowed) * (1 + nr_cpus) lines from the oom killer output. Callers may use __show_mem(SHOW_MEM_FILTER_NODES) to filter disallowed nodes. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++++++ lib/show_mem.c | 9 +++++++-- mm/oom_kill.c | 2 +- mm/page_alloc.c | 34 +++++++++++++++++++++++++++++++++- 4 files changed, 49 insertions(+), 4 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 581703d86fbd..1f82adc85352 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -859,7 +859,14 @@ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) +/* + * Flags passed to __show_mem() and __show_free_areas() to suppress output in + * various contexts. + */ +#define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */ + extern void show_free_areas(void); +extern void __show_free_areas(unsigned int flags); int shmem_lock(struct file *file, int lock, struct user_struct *user); struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); @@ -1348,6 +1355,7 @@ extern void calculate_zone_inactive_ratio(struct zone *zone); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); +extern void __show_mem(unsigned int flags); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern int after_bootmem; diff --git a/lib/show_mem.c b/lib/show_mem.c index fdc77c82f922..d8d602b58c31 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -9,14 +9,14 @@ #include #include -void show_mem(void) +void __show_mem(unsigned int filter) { pg_data_t *pgdat; unsigned long total = 0, reserved = 0, shared = 0, nonshared = 0, highmem = 0; printk("Mem-Info:\n"); - show_free_areas(); + __show_free_areas(filter); for_each_online_pgdat(pgdat) { unsigned long i, flags; @@ -61,3 +61,8 @@ void show_mem(void) quicklist_total_size()); #endif } + +void show_mem(void) +{ + __show_mem(0); +} diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 33b58615072c..3100bc57036b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -406,7 +406,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(current); dump_stack(); mem_cgroup_print_oom_info(mem, p); - show_mem(); + __show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) dump_tasks(mem, nodemask); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7945247b1e53..36be3ba4bbed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2411,19 +2411,42 @@ void si_meminfo_node(struct sysinfo *val, int nid) } #endif +/* + * Determine whether the zone's node should be displayed or not, depending on + * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas(). + */ +static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) +{ + bool ret = false; + + if (!(flags & SHOW_MEM_FILTER_NODES)) + goto out; + + get_mems_allowed(); + ret = !node_isset(zone->zone_pgdat->node_id, + cpuset_current_mems_allowed); + put_mems_allowed(); +out: + return ret; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. + * Suppresses nodes that are not allowed by current's cpuset if + * SHOW_MEM_FILTER_NODES is passed. */ -void show_free_areas(void) +void __show_free_areas(unsigned int filter) { int cpu; struct zone *zone; for_each_populated_zone(zone) { + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -2465,6 +2488,8 @@ void show_free_areas(void) for_each_populated_zone(zone) { int i; + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s" " free:%lukB" @@ -2532,6 +2557,8 @@ void show_free_areas(void) for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s: ", zone->name); @@ -2551,6 +2578,11 @@ void show_free_areas(void) show_swap_cache_info(); } +void show_free_areas(void) +{ + __show_free_areas(0); +} + static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; -- cgit v1.2.3