author     Linus Torvalds <torvalds@linux-foundation.org>   2021-09-03 19:08:28 +0200
committer  Linus Torvalds <torvalds@linux-foundation.org>   2021-09-03 19:08:28 +0200
commit     14726903c835101cd8d0a703b609305094350d61 (patch)
tree       5cdcf5d2f06ca14be76efd33a4de0e3b28a70de0 /mm/vmscan.c
parent     Merge tag 'scsi-misc' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi (diff)
parent     mm/madvise: add MADV_WILLNEED to process_madvise() (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
"173 patches.
Subsystems affected by this series: ia64, ocfs2, block, and mm (debug,
pagecache, gup, swap, shmem, memcg, selftests, pagemap, mremap,
bootmem, sparsemem, vmalloc, kasan, pagealloc, memory-failure,
hugetlb, userfaultfd, vmscan, compaction, mempolicy, memblock,
oom-kill, migration, ksm, percpu, vmstat, and madvise)"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (173 commits)
mm/madvise: add MADV_WILLNEED to process_madvise()
mm/vmstat: remove unneeded return value
mm/vmstat: simplify the array size calculation
mm/vmstat: correct some wrong comments
mm/percpu,c: remove obsolete comments of pcpu_chunk_populated()
selftests: vm: add COW time test for KSM pages
selftests: vm: add KSM merging time test
mm: KSM: fix data type
selftests: vm: add KSM merging across nodes test
selftests: vm: add KSM zero page merging test
selftests: vm: add KSM unmerge test
selftests: vm: add KSM merge test
mm/migrate: correct kernel-doc notation
mm: wire up syscall process_mrelease
mm: introduce process_mrelease system call
memblock: make memblock_find_in_range method private
mm/mempolicy.c: use in_task() in mempolicy_slab_node()
mm/mempolicy: unify the create() func for bind/interleave/prefer-many policies
mm/mempolicy: advertise new MPOL_PREFERRED_MANY
mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY
...
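The first patch in the list above extends process_madvise() so that a manager process can ask the kernel to prefetch (MADV_WILLNEED) ranges of another process's address space. Below is a minimal, hypothetical userspace sketch of that call; it is not part of the series, and it assumes a libc that exposes SYS_pidfd_open and SYS_process_madvise plus a caller with the required ptrace-level permission on the target. The pid, address, and length are taken from the command line purely for illustration.

/* Hypothetical illustration of process_madvise(MADV_WILLNEED); not from the series. */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc != 4) {
		fprintf(stderr, "usage: %s <pid> <hex-addr> <len>\n", argv[0]);
		return 1;
	}

	/* A pidfd names the target process for process_madvise(). */
	int pidfd = (int)syscall(SYS_pidfd_open, (pid_t)atol(argv[1]), 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* One remote range; process_madvise() takes an iovec array. */
	struct iovec iov = {
		.iov_base = (void *)strtoul(argv[2], NULL, 0),
		.iov_len  = strtoul(argv[3], NULL, 0),
	};

	/* With this series applied, MADV_WILLNEED is an accepted advice value. */
	long ret = syscall(SYS_process_madvise, pidfd, &iov, 1UL, MADV_WILLNEED, 0U);
	if (ret < 0)
		perror("process_madvise");
	else
		printf("advised %ld bytes\n", ret);

	close(pidfd);
	return ret < 0;
}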
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 210 |
1 file changed, 174 insertions(+), 36 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeae2f6bc532..740d03e6dae2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
+#include <linux/migrate.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
 #include <linux/oom.h>
@@ -121,6 +122,9 @@ struct scan_control {
 	/* The file pages on the current node are dangerously low */
 	unsigned int file_is_tiny:1;

+	/* Always discard instead of demoting to lower tier memory */
+	unsigned int no_demotion:1;
+
 	/* Allocation order */
 	s8 order;

@@ -518,6 +522,48 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker,
 	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
 }

+static bool can_demote(int nid, struct scan_control *sc)
+{
+	if (!numa_demotion_enabled)
+		return false;
+	if (sc) {
+		if (sc->no_demotion)
+			return false;
+		/* It is pointless to do demotion in memcg reclaim */
+		if (cgroup_reclaim(sc))
+			return false;
+	}
+	if (next_demotion_node(nid) == NUMA_NO_NODE)
+		return false;
+
+	return true;
+}
+
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+					  int nid,
+					  struct scan_control *sc)
+{
+	if (memcg == NULL) {
+		/*
+		 * For non-memcg reclaim, is there
+		 * space in any swap device?
+		 */
+		if (get_nr_swap_pages() > 0)
+			return true;
+	} else {
+		/* Is the memcg below its swap limit? */
+		if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+			return true;
+	}
+
+	/*
+	 * The page can not be swapped.
+	 *
+	 * Can it be reclaimed from this node via demotion?
+	 */
+	return can_demote(nid, sc);
+}
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -529,7 +575,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
 		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
-	if (get_nr_swap_pages() > 0)
+	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
 		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
 			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);

@@ -893,6 +939,7 @@ out:
 void drop_slab_node(int nid)
 {
 	unsigned long freed;
+	int shift = 0;

 	do {
 		struct mem_cgroup *memcg = NULL;
@@ -905,7 +952,7 @@ void drop_slab_node(int nid)
 		do {
 			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
 		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
-	} while (freed > 10);
+	} while ((freed >> shift++) > 1);
 }

 void drop_slab(void)
@@ -1052,14 +1099,13 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 static int __remove_mapping(struct address_space *mapping, struct page *page,
 			    bool reclaimed, struct mem_cgroup *target_memcg)
 {
-	unsigned long flags;
 	int refcount;
 	void *shadow = NULL;

 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));

-	xa_lock_irqsave(&mapping->i_pages, flags);
+	xa_lock_irq(&mapping->i_pages);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -1100,7 +1146,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		if (reclaimed && !mapping_exiting(mapping))
 			shadow = workingset_eviction(page, target_memcg);
 		__delete_from_swap_cache(page, swap, shadow);
-		xa_unlock_irqrestore(&mapping->i_pages, flags);
+		xa_unlock_irq(&mapping->i_pages);
 		put_swap_page(page, swap);
 	} else {
 		void (*freepage)(struct page *);
@@ -1126,7 +1172,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		    !mapping_exiting(mapping) && !dax_mapping(mapping))
 			shadow = workingset_eviction(page, target_memcg);
 		__delete_from_page_cache(page, shadow);
-		xa_unlock_irqrestore(&mapping->i_pages, flags);
+		xa_unlock_irq(&mapping->i_pages);

 		if (freepage != NULL)
 			freepage(page);
@@ -1135,7 +1181,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	return 1;

 cannot_free:
-	xa_unlock_irqrestore(&mapping->i_pages, flags);
+	xa_unlock_irq(&mapping->i_pages);
 	return 0;
 }

@@ -1264,6 +1310,54 @@ static void page_check_dirty_writeback(struct page *page,
 		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
 }

+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+	struct migration_target_control mtc = {
+		/*
+		 * Allocate from 'node', or fail quickly and quietly.
+		 * When this happens, 'page' will likely just be discarded
+		 * instead of migrated.
+		 */
+		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+			    __GFP_THISNODE | __GFP_NOWARN |
+			    __GFP_NOMEMALLOC | GFP_NOWAIT,
+		.nid = node
+	};
+
+	return alloc_migration_target(page, (unsigned long)&mtc);
+}
+
+/*
+ * Take pages on @demote_list and attempt to demote them to
+ * another node.  Pages which are not demoted are left on
+ * @demote_pages.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+				     struct pglist_data *pgdat)
+{
+	int target_nid = next_demotion_node(pgdat->node_id);
+	unsigned int nr_succeeded;
+	int err;
+
+	if (list_empty(demote_pages))
+		return 0;
+
+	if (target_nid == NUMA_NO_NODE)
+		return 0;
+
+	/* Demotion ignores all cpuset and mempolicy settings */
+	err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+			    target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+			    &nr_succeeded);
+
+	if (current_is_kswapd())
+		__count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
+	else
+		__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+
+	return nr_succeeded;
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -1275,12 +1369,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
+	LIST_HEAD(demote_pages);
 	unsigned int nr_reclaimed = 0;
 	unsigned int pgactivate = 0;
+	bool do_demote_pass;

 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
+	do_demote_pass = can_demote(pgdat->node_id, sc);

+retry:
 	while (!list_empty(page_list)) {
 		struct address_space *mapping;
 		struct page *page;
@@ -1430,6 +1528,17 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 		}

 		/*
+		 * Before reclaiming the page, try to relocate
+		 * its contents to another node.
+		 */
+		if (do_demote_pass &&
+		    (thp_migration_supported() || !PageTransHuge(page))) {
+			list_add(&page->lru, &demote_pages);
+			unlock_page(page);
+			continue;
+		}
+
+		/*
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 * Lazyfree page could be freed directly
@@ -1624,11 +1733,14 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 			/* follow __remove_mapping for reference */
 			if (!page_ref_freeze(page, 1))
 				goto keep_locked;
-			if (PageDirty(page)) {
-				page_ref_unfreeze(page, 1);
-				goto keep_locked;
-			}
-
+			/*
+			 * The page has only one reference left, which is
+			 * from the isolation. After the caller puts the
+			 * page back on lru and drops the reference, the
+			 * page will be freed anyway. It doesn't matter
+			 * which lru it goes. So we don't bother checking
+			 * PageDirty here.
+			 */
 			count_vm_event(PGLAZYFREED);
 			count_memcg_page_event(page, PGLAZYFREED);
 		} else if (!mapping || !__remove_mapping(mapping, page, true,
@@ -1680,6 +1792,17 @@ keep:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
 	}
+	/* 'page_list' is always empty here */
+
+	/* Migrate pages selected for demotion */
+	nr_reclaimed += demote_page_list(&demote_pages, pgdat);
+	/* Pages that could not be demoted are still in @demote_pages */
+	if (!list_empty(&demote_pages)) {
+		/* Pages which failed to demoted go back on @page_list for retry: */
+		list_splice_init(&demote_pages, page_list);
+		do_demote_pass = false;
+		goto retry;
+	}

 	pgactivate = stat->nr_activate[0] + stat->nr_activate[1];

@@ -1698,7 +1821,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 {
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
-		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
 	};
 	struct reclaim_stat stat;
@@ -2323,10 +2445,10 @@ unsigned long reclaim_pages(struct list_head *page_list)
 	unsigned int noreclaim_flag;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
-		.priority = DEF_PRIORITY,
 		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.no_demotion = 1,
 	};

 	noreclaim_flag = memalloc_noreclaim_save();
@@ -2452,6 +2574,7 @@ enum scan_balance {
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 			   unsigned long *nr)
 {
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	unsigned long anon_cost, file_cost, total_cost;
 	int swappiness = mem_cgroup_swappiness(memcg);
@@ -2462,7 +2585,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	enum lru_list lru;

 	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+	if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2645,6 +2768,21 @@ out:
 	}
 }

+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+			       struct scan_control *sc)
+{
+	/* Aging the anon LRU is valuable if swap is present: */
+	if (total_swap_pages > 0)
+		return true;
+
+	/* Also valuable if anon pages can be demoted: */
+	return can_demote(pgdat->node_id, sc);
+}
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
@@ -2754,7 +2892,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+	if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+	    inactive_is_low(lruvec, LRU_INACTIVE_ANON))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc,
 				   LRU_ACTIVE_ANON);
 }
@@ -2824,7 +2963,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 	 */
 	pages_for_compaction = compact_gap(sc->order);
 	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
-	if (get_nr_swap_pages() > 0)
+	if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
 		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);

 	return inactive_lru_pages > pages_for_compaction;
@@ -2898,6 +3037,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
+	/*
+	 * Flush the memory cgroup stats, so that we read accurate per-memcg
+	 * lruvec stats for heuristics.
+	 */
+	mem_cgroup_flush_stats();
+
 	memset(&sc->nr, 0, sizeof(sc->nr));

 	nr_reclaimed = sc->nr_reclaimed;
@@ -3434,18 +3579,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 	 * blocked waiting on the same lock. Instead, throttle for up to a
 	 * second before continuing.
 	 */
-	if (!(gfp_mask & __GFP_FS)) {
+	if (!(gfp_mask & __GFP_FS))
 		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
 			allow_direct_reclaim(pgdat), HZ);
+	else
+		/* Throttle until kswapd wakes the process */
+		wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+			allow_direct_reclaim(pgdat));

-		goto check_pending;
-	}
-
-	/* Throttle until kswapd wakes the process */
-	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-		allow_direct_reclaim(pgdat));
-
-check_pending:
 	if (fatal_signal_pending(current))
 		return true;

@@ -3583,7 +3724,7 @@ static void age_active_anon(struct pglist_data *pgdat,
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;

-	if (!total_swap_pages)
+	if (!can_age_anon_pages(pgdat, sc))
 		return;

 	lruvec = mem_cgroup_lruvec(NULL, pgdat);
@@ -3812,7 +3953,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)

 	set_task_reclaim_state(current, &sc.reclaim_state);
 	psi_memstall_enter(&pflags);
-	__fs_reclaim_acquire();
+	__fs_reclaim_acquire(_THIS_IP_);

 	count_vm_event(PAGEOUTRUN);

@@ -3938,9 +4079,9 @@ restart:
 			wake_up_all(&pgdat->pfmemalloc_wait);

 		/* Check if kswapd should be suspending */
-		__fs_reclaim_release();
+		__fs_reclaim_release(_THIS_IP_);
 		ret = try_to_freeze();
-		__fs_reclaim_acquire();
+		__fs_reclaim_acquire(_THIS_IP_);
 		if (ret || kthread_should_stop())
 			break;

@@ -3992,7 +4133,7 @@ out:
 	}

 	snapshot_refaults(NULL, pgdat);
-	__fs_reclaim_release();
+	__fs_reclaim_release(_THIS_IP_);
 	psi_memstall_leave(&pflags);
 	set_task_reclaim_state(current, NULL);

@@ -4290,23 +4431,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
  * This kswapd start function will be called by init and node-hot-add.
  * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
  */
-int kswapd_run(int nid)
+void kswapd_run(int nid)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
-	int ret = 0;

 	if (pgdat->kswapd)
-		return 0;
+		return;

 	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
 	if (IS_ERR(pgdat->kswapd)) {
 		/* failure at boot is fatal */
 		BUG_ON(system_state < SYSTEM_RUNNING);
 		pr_err("Failed to start kswapd on node %d\n", nid);
-		ret = PTR_ERR(pgdat->kswapd);
 		pgdat->kswapd = NULL;
 	}
-	return ret;
 }

 /*
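None of the demotion logic added above actually runs until can_demote() sees numa_demotion_enabled set and next_demotion_node() reports a lower memory tier for the node. A separate patch in this same series (not part of this mm/vmscan.c diff) exposes that flag through sysfs; the path used below, /sys/kernel/mm/numa/demotion_enabled, is assumed from that series and should be verified against the running kernel. A minimal sketch that turns the knob on:

/* Hypothetical sketch: enable the demotion knob that can_demote() in this diff tests.
 * The sysfs path is an assumption from the wider series; verify it exists first. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *knob = "/sys/kernel/mm/numa/demotion_enabled";
	int fd = open(knob, O_WRONLY);

	if (fd < 0) {
		perror(knob);	/* kernel without the knob, or insufficient privilege */
		return 1;
	}
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	printf("reclaim-time demotion enabled via %s\n", knob);
	return 0;
}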