summaryrefslogtreecommitdiffstats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-09-03 19:08:28 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2021-09-03 19:08:28 +0200
commit14726903c835101cd8d0a703b609305094350d61 (patch)
tree5cdcf5d2f06ca14be76efd33a4de0e3b28a70de0 /mm/vmscan.c
parentMerge tag 'scsi-misc' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi (diff)
parentmm/madvise: add MADV_WILLNEED to process_madvise() (diff)
downloadlinux-14726903c835101cd8d0a703b609305094350d61.tar.xz
linux-14726903c835101cd8d0a703b609305094350d61.zip
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton: "173 patches. Subsystems affected by this series: ia64, ocfs2, block, and mm (debug, pagecache, gup, swap, shmem, memcg, selftests, pagemap, mremap, bootmem, sparsemem, vmalloc, kasan, pagealloc, memory-failure, hugetlb, userfaultfd, vmscan, compaction, mempolicy, memblock, oom-kill, migration, ksm, percpu, vmstat, and madvise)" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (173 commits) mm/madvise: add MADV_WILLNEED to process_madvise() mm/vmstat: remove unneeded return value mm/vmstat: simplify the array size calculation mm/vmstat: correct some wrong comments mm/percpu,c: remove obsolete comments of pcpu_chunk_populated() selftests: vm: add COW time test for KSM pages selftests: vm: add KSM merging time test mm: KSM: fix data type selftests: vm: add KSM merging across nodes test selftests: vm: add KSM zero page merging test selftests: vm: add KSM unmerge test selftests: vm: add KSM merge test mm/migrate: correct kernel-doc notation mm: wire up syscall process_mrelease mm: introduce process_mrelease system call memblock: make memblock_find_in_range method private mm/mempolicy.c: use in_task() in mempolicy_slab_node() mm/mempolicy: unify the create() func for bind/interleave/prefer-many policies mm/mempolicy: advertise new MPOL_PREFERRED_MANY mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY ...
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c210
1 files changed, 174 insertions, 36 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeae2f6bc532..740d03e6dae2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
+#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
@@ -121,6 +122,9 @@ struct scan_control {
/* The file pages on the current node are dangerously low */
unsigned int file_is_tiny:1;
+ /* Always discard instead of demoting to lower tier memory */
+ unsigned int no_demotion:1;
+
/* Allocation order */
s8 order;
@@ -518,6 +522,48 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker,
return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}
+static bool can_demote(int nid, struct scan_control *sc)
+{
+ if (!numa_demotion_enabled)
+ return false;
+ if (sc) {
+ if (sc->no_demotion)
+ return false;
+ /* It is pointless to do demotion in memcg reclaim */
+ if (cgroup_reclaim(sc))
+ return false;
+ }
+ if (next_demotion_node(nid) == NUMA_NO_NODE)
+ return false;
+
+ return true;
+}
+
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+ int nid,
+ struct scan_control *sc)
+{
+ if (memcg == NULL) {
+ /*
+ * For non-memcg reclaim, is there
+ * space in any swap device?
+ */
+ if (get_nr_swap_pages() > 0)
+ return true;
+ } else {
+ /* Is the memcg below its swap limit? */
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+ return true;
+ }
+
+ /*
+ * The page can not be swapped.
+ *
+ * Can it be reclaimed from this node via demotion?
+ */
+ return can_demote(nid, sc);
+}
+
/*
* This misses isolated pages which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
@@ -529,7 +575,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
+ if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
@@ -893,6 +939,7 @@ out:
void drop_slab_node(int nid)
{
unsigned long freed;
+ int shift = 0;
do {
struct mem_cgroup *memcg = NULL;
@@ -905,7 +952,7 @@ void drop_slab_node(int nid)
do {
freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- } while (freed > 10);
+ } while ((freed >> shift++) > 1);
}
void drop_slab(void)
@@ -1052,14 +1099,13 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed, struct mem_cgroup *target_memcg)
{
- unsigned long flags;
int refcount;
void *shadow = NULL;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xa_lock_irq(&mapping->i_pages);
/*
* The non racy check for a busy page.
*
@@ -1100,7 +1146,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(page, target_memcg);
__delete_from_swap_cache(page, swap, shadow);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
put_swap_page(page, swap);
} else {
void (*freepage)(struct page *);
@@ -1126,7 +1172,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
if (freepage != NULL)
freepage(page);
@@ -1135,7 +1181,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
return 1;
cannot_free:
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
return 0;
}
@@ -1264,6 +1310,54 @@ static void page_check_dirty_writeback(struct page *page,
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+ struct migration_target_control mtc = {
+ /*
+ * Allocate from 'node', or fail quickly and quietly.
+ * When this happens, 'page' will likely just be discarded
+ * instead of migrated.
+ */
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+ __GFP_THISNODE | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
+ .nid = node
+ };
+
+ return alloc_migration_target(page, (unsigned long)&mtc);
+}
+
+/*
+ * Take pages on @demote_list and attempt to demote them to
+ * another node. Pages which are not demoted are left on
+ * @demote_pages.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+ struct pglist_data *pgdat)
+{
+ int target_nid = next_demotion_node(pgdat->node_id);
+ unsigned int nr_succeeded;
+ int err;
+
+ if (list_empty(demote_pages))
+ return 0;
+
+ if (target_nid == NUMA_NO_NODE)
+ return 0;
+
+ /* Demotion ignores all cpuset and mempolicy settings */
+ err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+ target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+ &nr_succeeded);
+
+ if (current_is_kswapd())
+ __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
+ else
+ __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+
+ return nr_succeeded;
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -1275,12 +1369,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
+ LIST_HEAD(demote_pages);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
+ bool do_demote_pass;
memset(stat, 0, sizeof(*stat));
cond_resched();
+ do_demote_pass = can_demote(pgdat->node_id, sc);
+retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
@@ -1430,6 +1528,17 @@ static unsigned int shrink_page_list(struct list_head *page_list,
}
/*
+ * Before reclaiming the page, try to relocate
+ * its contents to another node.
+ */
+ if (do_demote_pass &&
+ (thp_migration_supported() || !PageTransHuge(page))) {
+ list_add(&page->lru, &demote_pages);
+ unlock_page(page);
+ continue;
+ }
+
+ /*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
* Lazyfree page could be freed directly
@@ -1624,11 +1733,14 @@ static unsigned int shrink_page_list(struct list_head *page_list,
/* follow __remove_mapping for reference */
if (!page_ref_freeze(page, 1))
goto keep_locked;
- if (PageDirty(page)) {
- page_ref_unfreeze(page, 1);
- goto keep_locked;
- }
-
+ /*
+ * The page has only one reference left, which is
+ * from the isolation. After the caller puts the
+ * page back on lru and drops the reference, the
+ * page will be freed anyway. It doesn't matter
+ * which lru it goes. So we don't bother checking
+ * PageDirty here.
+ */
count_vm_event(PGLAZYFREED);
count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true,
@@ -1680,6 +1792,17 @@ keep:
list_add(&page->lru, &ret_pages);
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+ /* 'page_list' is always empty here */
+
+ /* Migrate pages selected for demotion */
+ nr_reclaimed += demote_page_list(&demote_pages, pgdat);
+ /* Pages that could not be demoted are still in @demote_pages */
+ if (!list_empty(&demote_pages)) {
+ /* Pages which failed to demoted go back on @page_list for retry: */
+ list_splice_init(&demote_pages, page_list);
+ do_demote_pass = false;
+ goto retry;
+ }
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
@@ -1698,7 +1821,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_unmap = 1,
};
struct reclaim_stat stat;
@@ -2323,10 +2445,10 @@ unsigned long reclaim_pages(struct list_head *page_list)
unsigned int noreclaim_flag;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_writepage = 1,
.may_unmap = 1,
.may_swap = 1,
+ .no_demotion = 1,
};
noreclaim_flag = memalloc_noreclaim_save();
@@ -2452,6 +2574,7 @@ enum scan_balance {
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
@@ -2462,7 +2585,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2645,6 +2768,21 @@ out:
}
}
+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+ struct scan_control *sc)
+{
+ /* Aging the anon LRU is valuable if swap is present: */
+ if (total_swap_pages > 0)
+ return true;
+
+ /* Also valuable if anon pages can be demoted: */
+ return can_demote(pgdat->node_id, sc);
+}
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
@@ -2754,7 +2892,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+ if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2824,7 +2963,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
*/
pages_for_compaction = compact_gap(sc->order);
inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
return inactive_lru_pages > pages_for_compaction;
@@ -2898,6 +3037,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
@@ -3434,18 +3579,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
* blocked waiting on the same lock. Instead, throttle for up to a
* second before continuing.
*/
- if (!(gfp_mask & __GFP_FS)) {
+ if (!(gfp_mask & __GFP_FS))
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
allow_direct_reclaim(pgdat), HZ);
+ else
+ /* Throttle until kswapd wakes the process */
+ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+ allow_direct_reclaim(pgdat));
- goto check_pending;
- }
-
- /* Throttle until kswapd wakes the process */
- wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
- allow_direct_reclaim(pgdat));
-
-check_pending:
if (fatal_signal_pending(current))
return true;
@@ -3583,7 +3724,7 @@ static void age_active_anon(struct pglist_data *pgdat,
struct mem_cgroup *memcg;
struct lruvec *lruvec;
- if (!total_swap_pages)
+ if (!can_age_anon_pages(pgdat, sc))
return;
lruvec = mem_cgroup_lruvec(NULL, pgdat);
@@ -3812,7 +3953,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
- __fs_reclaim_acquire();
+ __fs_reclaim_acquire(_THIS_IP_);
count_vm_event(PAGEOUTRUN);
@@ -3938,9 +4079,9 @@ restart:
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
- __fs_reclaim_release();
+ __fs_reclaim_release(_THIS_IP_);
ret = try_to_freeze();
- __fs_reclaim_acquire();
+ __fs_reclaim_acquire(_THIS_IP_);
if (ret || kthread_should_stop())
break;
@@ -3992,7 +4133,7 @@ out:
}
snapshot_refaults(NULL, pgdat);
- __fs_reclaim_release();
+ __fs_reclaim_release(_THIS_IP_);
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
@@ -4290,23 +4431,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
-int kswapd_run(int nid)
+void kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- int ret = 0;
if (pgdat->kswapd)
- return 0;
+ return;
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
- ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
}
- return ret;
}
/*