From 8fe23e057172223fe2048768a4d87ab7de7477bc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 14 Dec 2009 17:58:33 -0800 Subject: mm: clear node in N_HIGH_MEMORY and stop kswapd when all memory is offlined When memory is hot-removed, its node must be cleared in N_HIGH_MEMORY if there are no present pages left. In such a situation, kswapd must also be stopped since it has nothing left to do. Signed-off-by: David Rientjes Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: Yasunori Goto Cc: Mel Gorman Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Mel Gorman Cc: Randy Dunlap Cc: Nishanth Aravamudan Cc: Andi Kleen Cc: David Rientjes Cc: Adam Litke Cc: Andy Whitcroft Cc: Eric Whitney Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 777af57fd8c8..d0a631a428a0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2173,6 +2173,7 @@ static int kswapd(void *p) order = 0; for ( ; ; ) { unsigned long new_order; + int ret; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; @@ -2184,19 +2185,23 @@ static int kswapd(void *p) */ order = new_order; } else { - if (!freezing(current)) + if (!freezing(current) && !kthread_should_stop()) schedule(); order = pgdat->kswapd_max_order; } finish_wait(&pgdat->kswapd_wait, &wait); - if (!try_to_freeze()) { - /* We can speed up thawing tasks if we don't call - * balance_pgdat after returning from the refrigerator - */ + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + /* + * We can speed up thawing tasks if we don't call balance_pgdat + * after returning from the refrigerator + */ + if (!ret) balance_pgdat(pgdat, order); - } } return 0; } @@ -2451,6 +2456,17 @@ int kswapd_run(int nid) return ret; } +/* + * Called by memory hotplug when all memory in a node is offlined. + */ +void kswapd_stop(int nid) +{ + struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + + if (kswapd) + kthread_stop(kswapd); +} + static int __init kswapd_init(void) { int nid; -- cgit v1.2.3 From 6aceb53be44ed55a2374c20a62e3aef9d3919e8d Mon Sep 17 00:00:00 2001 From: Vincent Li Date: Mon, 14 Dec 2009 17:58:49 -0800 Subject: mm/vmscan: change comment generic_file_write to __generic_file_aio_write Commit 543ade1fc9 ("Streamline generic_file_* interfaces and filemap cleanups") removed generic_file_write() in filemap. Change the comment in vmscan pageout() to __generic_file_aio_write(). Signed-off-by: Vincent Li Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index d0a631a428a0..61d3a9a0d96f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -358,7 +358,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in generic_file_write() against + * If this process is currently in __generic_file_aio_write() against * this page's queue, we can perform writeback even if that * will block. * -- cgit v1.2.3 From f50de2d3811081957156b5d736778799379c29de Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 14 Dec 2009 17:58:53 -0800 Subject: vmscan: have kswapd sleep for a short interval and double check it should be asleep After kswapd balances all zones in a pgdat, it goes to sleep. In the event of no IO congestion, kswapd can go to sleep very shortly after the high watermark was reached. If there are a constant stream of allocations from parallel processes, it can mean that kswapd went to sleep too quickly and the high watermark is not being maintained for sufficient length time. This patch makes kswapd go to sleep as a two-stage process. It first tries to sleep for HZ/10. If it is woken up by another process or the high watermark is no longer met, it's considered a premature sleep and kswapd continues work. Otherwise it goes fully to sleep. This adds more counters to distinguish between fast and slow breaches of watermarks. A "fast" premature sleep is one where the low watermark was hit in a very short time after kswapd going to sleep. A "slow" premature sleep indicates that the high watermark was breached after a very short interval. Signed-off-by: Mel Gorman Cc: Frans Pop Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 1 + mm/vmscan.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- mm/vmstat.c | 2 ++ 3 files changed, 45 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d85889710f9b..fd5be240c0b7 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -40,6 +40,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSCAN_ZONE_RECLAIM_FAILED, #endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, + KSWAPD_PREMATURE_FAST, KSWAPD_PREMATURE_SLOW, PAGEOUTRUN, ALLOCSTALL, PGROTATED, #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, diff --git a/mm/vmscan.c b/mm/vmscan.c index 61d3a9a0d96f..e176bd3936da 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1904,6 +1904,24 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* is kswapd sleeping prematurely? */ +static int sleeping_prematurely(int order, long remaining) +{ + struct zone *zone; + + /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ + if (remaining) + return 1; + + /* If after HZ/10, a zone is below the high mark, it's premature */ + for_each_populated_zone(zone) + if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), + 0, 0)) + return 1; + + return 0; +} + /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). @@ -2185,8 +2203,30 @@ static int kswapd(void *p) */ order = new_order; } else { - if (!freezing(current) && !kthread_should_stop()) - schedule(); + if (!freezing(current) && !kthread_should_stop()) { + long remaining = 0; + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(order, remaining)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a + * premature sleep. If not, then go fully + * to sleep until explicitly woken up + */ + if (!sleeping_prematurely(order, remaining)) + schedule(); + else { + if (remaining) + count_vm_event(KSWAPD_PREMATURE_FAST); + else + count_vm_event(KSWAPD_PREMATURE_SLOW); + } + } order = pgdat->kswapd_max_order; } diff --git a/mm/vmstat.c b/mm/vmstat.c index dad2327e4580..63ab71455c5b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -683,6 +683,8 @@ static const char * const vmstat_text[] = { "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", + "kswapd_slept_prematurely_fast", + "kswapd_slept_prematurely_slow", "pageoutrun", "allocstall", -- cgit v1.2.3 From bb3ab596832b920c703d1aea1ce76d69c0f71fb7 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 14 Dec 2009 17:58:55 -0800 Subject: vmscan: stop kswapd waiting on congestion when the min watermark is not being met If reclaim fails to make sufficient progress, the priority is raised. Once the priority is higher, kswapd starts waiting on congestion. However, if the zone is below the min watermark then kswapd needs to continue working without delay as there is a danger of an increased rate of GFP_ATOMIC allocation failure. This patch changes the conditions under which kswapd waits on congestion by only going to sleep if the min watermarks are being met. [mel@csn.ul.ie: add stats to track how relevant the logic is] [mel@csn.ul.ie: make kswapd only check its own zones and rename the relevant counters] Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 3 ++- mm/vmscan.c | 38 +++++++++++++++++++++++++++++--------- mm/vmstat.c | 5 +++-- 3 files changed, 34 insertions(+), 12 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index fd5be240c0b7..ee03bba9c5df 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -40,7 +40,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSCAN_ZONE_RECLAIM_FAILED, #endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, - KSWAPD_PREMATURE_FAST, KSWAPD_PREMATURE_SLOW, + KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, + KSWAPD_SKIP_CONGESTION_WAIT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, diff --git a/mm/vmscan.c b/mm/vmscan.c index e176bd3936da..cb69f717799f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1905,19 +1905,25 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, #endif /* is kswapd sleeping prematurely? */ -static int sleeping_prematurely(int order, long remaining) +static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) { - struct zone *zone; + int i; /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) return 1; /* If after HZ/10, a zone is below the high mark, it's premature */ - for_each_populated_zone(zone) + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 0, 0)) return 1; + } return 0; } @@ -1979,6 +1985,7 @@ loop_again: for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; + int has_under_min_watermark_zone = 0; /* The swap token gets in the way of swapout... */ if (!priority) @@ -2085,6 +2092,15 @@ loop_again: if (total_scanned > SWAP_CLUSTER_MAX * 2 && total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + + /* + * We are still under min water mark. it mean we have + * GFP_ATOMIC allocation failure risk. Hurry up! + */ + if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), + end_zone, 0)) + has_under_min_watermark_zone = 1; + } if (all_zones_ok) break; /* kswapd: all done */ @@ -2092,8 +2108,12 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ/10); + if (total_scanned && (priority < DEF_PRIORITY - 2)) { + if (has_under_min_watermark_zone) + count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); + else + congestion_wait(BLK_RW_ASYNC, HZ/10); + } /* * We do this so kswapd doesn't build up large priorities for @@ -2207,7 +2227,7 @@ static int kswapd(void *p) long remaining = 0; /* Try to sleep for a short interval */ - if (!sleeping_prematurely(order, remaining)) { + if (!sleeping_prematurely(pgdat, order, remaining)) { remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -2218,13 +2238,13 @@ static int kswapd(void *p) * premature sleep. If not, then go fully * to sleep until explicitly woken up */ - if (!sleeping_prematurely(order, remaining)) + if (!sleeping_prematurely(pgdat, order, remaining)) schedule(); else { if (remaining) - count_vm_event(KSWAPD_PREMATURE_FAST); + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); else - count_vm_event(KSWAPD_PREMATURE_SLOW); + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); } } diff --git a/mm/vmstat.c b/mm/vmstat.c index 63ab71455c5b..6051fbab67ba 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -683,8 +683,9 @@ static const char * const vmstat_text[] = { "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", - "kswapd_slept_prematurely_fast", - "kswapd_slept_prematurely_slow", + "kswapd_low_wmark_hit_quickly", + "kswapd_high_wmark_hit_quickly", + "kswapd_skip_congestion_wait", "pageoutrun", "allocstall", -- cgit v1.2.3 From 22fba33545b731408deab6e96b6e231ee05fd10b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 14 Dec 2009 17:59:10 -0800 Subject: vmscan: separate sc.swap_cluster_max and sc.nr_max_reclaim Currently, sc.scap_cluster_max has double meanings. 1) reclaim batch size as isolate_lru_pages()'s argument 2) reclaim baling out thresolds The two meanings pretty unrelated. Thus, Let's separate it. this patch doesn't change any behavior. Signed-off-by: KOSAKI Motohiro Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index cb69f717799f..7b0d5c784c7e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,6 +55,9 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + /* This context's GFP mask */ gfp_t gfp_mask; @@ -1595,6 +1598,7 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long swap_cluster_max = sc->swap_cluster_max; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int noswap = 0; @@ -1639,8 +1643,7 @@ static void shrink_zone(int priority, struct zone *zone, * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed > swap_cluster_max && - priority < DEF_PRIORITY && !current_is_kswapd()) + if (nr_reclaimed > nr_to_reclaim && priority < DEF_PRIORITY) break; } @@ -1738,6 +1741,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + unsigned long writeback_threshold; delayacct_freepages_start(); @@ -1773,7 +1777,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } } total_scanned += sc->nr_scanned; - if (sc->nr_reclaimed >= sc->swap_cluster_max) { + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { ret = sc->nr_reclaimed; goto out; } @@ -1785,8 +1789,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc->swap_cluster_max + - sc->swap_cluster_max / 2) { + writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; + if (total_scanned > writeback_threshold) { wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } @@ -1832,6 +1836,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, .swappiness = vm_swappiness, @@ -1890,6 +1895,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_unmap = 1, .may_swap = !noswap, .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, @@ -1961,6 +1967,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .may_unmap = 1, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, + /* + * kswapd doesn't want to be bailed out while reclaim. because + * we want to put equal scanning pressure on each zone. + */ + .nr_to_reclaim = ULONG_MAX, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, @@ -2630,7 +2641,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, .swap_cluster_max = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), + SWAP_CLUSTER_MAX), + .nr_to_reclaim = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, .order = order, -- cgit v1.2.3 From 7b51755c3b38483b574d363d5ee587283c3f7999 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 14 Dec 2009 17:59:12 -0800 Subject: vmscan: kill hibernation specific reclaim logic and unify it shrink_all_zone() was introduced by commit d6277db4ab (swsusp: rework memory shrinker) for hibernate performance improvement. and sc.swap_cluster_max was introduced by commit a06fe4d307 (Speed freeing memory for suspend). commit a06fe4d307 said Without the patch: Freed 14600 pages in 1749 jiffies = 32.61 MB/s (Anomolous!) Freed 88563 pages in 14719 jiffies = 23.50 MB/s Freed 205734 pages in 32389 jiffies = 24.81 MB/s With the patch: Freed 68252 pages in 496 jiffies = 537.52 MB/s Freed 116464 pages in 569 jiffies = 798.54 MB/s Freed 209699 pages in 705 jiffies = 1161.89 MB/s At that time, their patch was pretty worth. However, Modern Hardware trend and recent VM improvement broke its worth. From several reason, I think we should remove shrink_all_zones() at all. detail: 1) Old days, shrink_zone()'s slowness was mainly caused by stupid io-throttle at no i/o congestion. but current shrink_zone() is sane, not slow. 2) shrink_all_zone() try to shrink all pages at a time. but it doesn't works fine on numa system. example) System has 4GB memory and each node have 2GB. and hibernate need 1GB. optimal) steal 500MB from each node. shrink_all_zones) steal 1GB from node-0. Oh, Cache balancing logic was broken. ;) Unfortunately, Desktop system moved ahead NUMA at nowadays. (Side note, if hibernate require 2GB, shrink_all_zones() never success on above machine) 3) if the node has several I/O flighting pages, shrink_all_zones() makes pretty bad result. schenario) hibernate need 1GB 1) shrink_all_zones() try to reclaim 1GB from Node-0 2) but it only reclaimed 990MB 3) stupidly, shrink_all_zones() try to reclaim 1GB from Node-1 4) it reclaimed 990MB Oh, well. it reclaimed twice much than required. In the other hand, current shrink_zone() has sane baling out logic. then, it doesn't make overkill reclaim. then, we lost shrink_zones()'s risk. 4) SplitLRU VM always keep active/inactive ratio very carefully. inactive list only shrinking break its assumption. it makes unnecessary OOM risk. it obviously suboptimal. Now, shrink_all_memory() is only the wrapper function of do_try_to_free_pages(). it bring good reviewability and debuggability, and solve above problems. side note: Reclaim logic unificication makes two good side effect. - Fix recursive reclaim bug on shrink_all_memory(). it did forgot to use PF_MEMALLOC. it mean the system be able to stuck into deadlock. - Now, shrink_all_memory() got lockdep awareness. it bring good debuggability. Signed-off-by: KOSAKI Motohiro Reviewed-by: Rik van Riel Acked-by: Rafael J. Wysocki Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 153 +++++++++++------------------------------------------------- 1 file changed, 26 insertions(+), 127 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7b0d5c784c7e..63bd521bb229 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -58,6 +58,8 @@ struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; + unsigned long hibernation_mode; + /* This context's GFP mask */ gfp_t gfp_mask; @@ -1796,7 +1798,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } /* Take a nap, wait for some writeback to complete */ - if (sc->nr_scanned && priority < DEF_PRIORITY - 2) + if (!sc->hibernation_mode && sc->nr_scanned && + priority < DEF_PRIORITY - 2) congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ @@ -2336,148 +2339,44 @@ unsigned long zone_reclaimable_pages(struct zone *zone) #ifdef CONFIG_HIBERNATION /* - * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority. - * - * For pass > 3 we also try to shrink the LRU lists that contain a few pages - */ -static void shrink_all_zones(unsigned long nr_pages, int prio, - int pass, struct scan_control *sc) -{ - struct zone *zone; - unsigned long nr_reclaimed = 0; - struct zone_reclaim_stat *reclaim_stat; - - for_each_populated_zone(zone) { - enum lru_list l; - - if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) - continue; - - for_each_evictable_lru(l) { - enum zone_stat_item ls = NR_LRU_BASE + l; - unsigned long lru_pages = zone_page_state(zone, ls); - - /* For pass = 0, we don't shrink the active list */ - if (pass == 0 && (l == LRU_ACTIVE_ANON || - l == LRU_ACTIVE_FILE)) - continue; - - reclaim_stat = get_reclaim_stat(zone, sc); - reclaim_stat->nr_saved_scan[l] += - (lru_pages >> prio) + 1; - if (reclaim_stat->nr_saved_scan[l] - >= nr_pages || pass > 3) { - unsigned long nr_to_scan; - - reclaim_stat->nr_saved_scan[l] = 0; - nr_to_scan = min(nr_pages, lru_pages); - nr_reclaimed += shrink_list(l, nr_to_scan, zone, - sc, prio); - if (nr_reclaimed >= nr_pages) { - sc->nr_reclaimed += nr_reclaimed; - return; - } - } - } - } - sc->nr_reclaimed += nr_reclaimed; -} - -/* - * Try to free `nr_pages' of memory, system-wide, and return the number of + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. * * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ -unsigned long shrink_all_memory(unsigned long nr_pages) +unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { - unsigned long lru_pages, nr_slab; - int pass; struct reclaim_state reclaim_state; struct scan_control sc = { - .gfp_mask = GFP_KERNEL, - .may_unmap = 0, + .gfp_mask = GFP_HIGHUSER_MOVABLE, + .may_swap = 1, + .may_unmap = 1, .may_writepage = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = nr_to_reclaim, + .hibernation_mode = 1, + .swappiness = vm_swappiness, + .order = 0, .isolate_pages = isolate_pages_global, - .nr_reclaimed = 0, }; + struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + struct task_struct *p = current; + unsigned long nr_reclaimed; - current->reclaim_state = &reclaim_state; - - lru_pages = global_reclaimable_pages(); - nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); - /* If slab caches are huge, it's better to hit them first */ - while (nr_slab >= lru_pages) { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); - if (!reclaim_state.reclaimed_slab) - break; - - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - nr_slab -= reclaim_state.reclaimed_slab; - } - - /* - * We try to shrink LRUs in 5 passes: - * 0 = Reclaim from inactive_list only - * 1 = Reclaim from active list but don't reclaim mapped - * 2 = 2nd pass of type 1 - * 3 = Reclaim mapped (normal reclaim) - * 4 = 2nd pass of type 3 - */ - for (pass = 0; pass < 5; pass++) { - int prio; - - /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) - sc.may_unmap = 1; - - for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; - - sc.nr_scanned = 0; - sc.swap_cluster_max = nr_to_scan; - shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (sc.nr_reclaimed >= nr_pages) - goto out; - - reclaim_state.reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_reclaimable_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ / 10); - } - } - - /* - * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be - * something in slab caches - */ - if (!sc.nr_reclaimed) { - do { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, - global_reclaimable_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - } while (sc.nr_reclaimed < nr_pages && - reclaim_state.reclaimed_slab > 0); - } + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(sc.gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); -out: - current->reclaim_state = NULL; + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; - return sc.nr_reclaimed; + return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ -- cgit v1.2.3 From 4f0ddfdffc8bef3a5eb9154734d68a6053194948 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 14 Dec 2009 17:59:13 -0800 Subject: vmscan: zone_reclaim() don't use insane swap_cluster_max In old days, we didn't have sc.nr_to_reclaim and it brought sc.swap_cluster_max misuse. huge sc.swap_cluster_max might makes unnecessary OOM risk and no performance benefit. Now, we can stop its insane thing. Signed-off-by: KOSAKI Motohiro Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 63bd521bb229..d55d106ad179 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2539,8 +2539,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, - .swap_cluster_max = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), + .swap_cluster_max = SWAP_CLUSTER_MAX, .nr_to_reclaim = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, -- cgit v1.2.3 From ece74b2e7acfb71453f3f39948cc667434550dbb Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 14 Dec 2009 17:59:14 -0800 Subject: vmscan: kill sc.swap_cluster_max Now, All caller of reclaim use swap_cluster_max as SWAP_CLUSTER_MAX. Then, we can remove it perfectly. Signed-off-by: KOSAKI Motohiro Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index d55d106ad179..2b1c74817a1e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -71,12 +71,6 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; - /* This context's SWAP_CLUSTER_MAX. If freeing memory for - * suspend, we effectively ignore SWAP_CLUSTER_MAX. - * In this context, it doesn't matter that we scan the - * whole list at once. */ - int swap_cluster_max; - int swappiness; int all_unreclaimable; @@ -1137,7 +1131,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_anon; unsigned long nr_file; - nr_taken = sc->isolate_pages(sc->swap_cluster_max, + nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, &page_list, &nr_scan, sc->order, mode, zone, sc->mem_cgroup, 0, file); @@ -1572,15 +1566,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * until we collected @swap_cluster_max pages to scan. */ static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan, - unsigned long swap_cluster_max) + unsigned long *nr_saved_scan) { unsigned long nr; *nr_saved_scan += nr_to_scan; nr = *nr_saved_scan; - if (nr >= swap_cluster_max) + if (nr >= SWAP_CLUSTER_MAX) *nr_saved_scan = 0; else nr = 0; @@ -1599,7 +1592,6 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; - unsigned long swap_cluster_max = sc->swap_cluster_max; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int noswap = 0; @@ -1622,15 +1614,15 @@ static void shrink_zone(int priority, struct zone *zone, scan = (scan * percent[file]) / 100; } nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l], - swap_cluster_max); + &reclaim_stat->nr_saved_scan[l]); } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { if (nr[l]) { - nr_to_scan = min(nr[l], swap_cluster_max); + nr_to_scan = min_t(unsigned long, + nr[l], SWAP_CLUSTER_MAX); nr[l] -= nr_to_scan; nr_reclaimed += shrink_list(l, nr_to_scan, @@ -1838,7 +1830,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, @@ -1863,7 +1854,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem, @@ -1897,7 +1887,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, .nr_to_reclaim = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, @@ -1969,7 +1958,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .gfp_mask = GFP_KERNEL, .may_unmap = 1, .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, /* * kswapd doesn't want to be bailed out while reclaim. because * we want to put equal scanning pressure on each zone. @@ -2354,7 +2342,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .may_swap = 1, .may_unmap = 1, .may_writepage = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, .nr_to_reclaim = nr_to_reclaim, .hibernation_mode = 1, .swappiness = vm_swappiness, @@ -2539,7 +2526,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, .nr_to_reclaim = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, -- cgit v1.2.3 From 338fde90930eaa02f6f394daa23d35a410af5852 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 14 Dec 2009 17:59:15 -0800 Subject: vmscan: make consistent of reclaim bale out between do_try_to_free_page and shrink_zone Fix small inconsistent of ">" and ">=". Signed-off-by: KOSAKI Motohiro Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2b1c74817a1e..2ef59d5b16a6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1637,7 +1637,7 @@ static void shrink_zone(int priority, struct zone *zone, * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed > nr_to_reclaim && priority < DEF_PRIORITY) + if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } -- cgit v1.2.3 From b39415b2731d7dec5e612d2d12595da82399eedf Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 14 Dec 2009 17:59:48 -0800 Subject: vmscan: do not evict inactive pages when skipping an active list scan In AIM7 runs, recent kernels start swapping out anonymous pages well before they should. This is due to shrink_list falling through to shrink_inactive_list if !inactive_anon_is_low(zone, sc), when all we really wanted to do is pre-age some anonymous pages to give them extra time to be referenced while on the inactive list. The obvious fix is to make sure that shrink_list does not fall through to scanning/reclaiming inactive pages when we called it to scan one of the active lists. This change should be safe because the loop in shrink_zone ensures that we will still shrink the anon and file inactive lists whenever we should. [kosaki.motohiro@jp.fujitsu.com: inactive_file_is_low() should be inactive_anon_is_low()] Reported-by: Larry Woodman Signed-off-by: Rik van Riel Acked-by: Johannes Weiner Cc: Tomasz Chmielewski Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2ef59d5b16a6..04658189b9a5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1463,20 +1463,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, + int file) +{ + if (file) + return inactive_file_is_low(zone, sc); + else + return inactive_anon_is_low(zone, sc); +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (is_active_lru(lru)) { + if (inactive_list_is_low(zone, sc, file)) + shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } - if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); - return 0; - } return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } -- cgit v1.2.3 From 62c0c2f198c1f2ead05c961e83ef486c45888325 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 14 Dec 2009 17:59:48 -0800 Subject: vmscan: simplify code Simplify the code for shrink_inactive_list(). Signed-off-by: Huang Shijie Reviewed-by: KOSAKI Motohiro Reviewed-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 04658189b9a5..885207a6b6b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1165,10 +1165,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); - reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; - reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; - reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; - reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + reclaim_stat->recent_scanned[0] += nr_anon; + reclaim_stat->recent_scanned[1] += nr_file; spin_unlock_irq(&zone->lru_lock); -- cgit v1.2.3 From de3fab39348dff18c69a0cd04efee9c276a02f51 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 15 Jan 2010 17:01:25 -0800 Subject: vmscan: kswapd: don't retry balance_pgdat() if all zones are unreclaimable Commit f50de2d3 (vmscan: have kswapd sleep for a short interval and double check it should be asleep) can cause kswapd to enter an infinite loop if running on a single-CPU system. If all zones are unreclaimble, sleeping_prematurely return 1 and kswapd will call balance_pgdat() again. but it's totally meaningless, balance_pgdat() doesn't anything against unreclaimable zone! Signed-off-by: KOSAKI Motohiro Cc: Mel Gorman Reported-by: Will Newton Reviewed-by: Minchan Kim Reviewed-by: Rik van Riel Tested-by: Will Newton Reviewed-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 885207a6b6b7..c26986c85ce0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (!populated_zone(zone)) continue; + if (zone_is_all_unreclaimable(zone)) + continue; + if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 0, 0)) return 1; -- cgit v1.2.3