summaryrefslogtreecommitdiffstats
path: root/mm/swap.c
diff options
context:
space:
mode:
authorJohannes Weiner <hannes@cmpxchg.org>2020-06-04 01:03:06 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2020-06-04 05:09:49 +0200
commit7cf111bc39f6792abedcdfbc4e6291a5603b0ef0 (patch)
treefb3462f50a495a95a6275100a61b30310d8e1faf /mm/swap.c
parentmm: balance LRU lists based on relative thrashing (diff)
downloadlinux-7cf111bc39f6792abedcdfbc4e6291a5603b0ef0.tar.xz
linux-7cf111bc39f6792abedcdfbc4e6291a5603b0ef0.zip
mm: vmscan: determine anon/file pressure balance at the reclaim root
We split the LRU lists into anon and file, and we rebalance the scan pressure between them when one of them begins thrashing: if the file cache experiences workingset refaults, we increase the pressure on anonymous pages; if the workload is stalled on swapins, we increase the pressure on the file cache instead. With cgroups and their nested LRU lists, we currently don't do this correctly. While recursive cgroup reclaim establishes a relative LRU order among the pages of all involved cgroups, LRU pressure balancing is done on an individual cgroup LRU level. As a result, when one cgroup is thrashing on the filesystem cache while a sibling may have cold anonymous pages, pressure doesn't get equalized between them. This patch moves LRU balancing decision to the root of reclaim - the same level where the LRU order is established. It does this by tracking LRU cost recursively, so that every level of the cgroup tree knows the aggregate LRU cost of all memory within its domain. When the page scanner calculates the scan balance for any given individual cgroup's LRU list, it uses the values from the ancestor cgroup that initiated the reclaim cycle. If one sibling is then thrashing on the cache, it will tip the pressure balance inside its ancestors, and the next hierarchical reclaim iteration will go more after the anon pages in the tree. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Rik van Riel <riel@surriel.com> Link: http://lkml.kernel.org/r/20200520232525.798933-13-hannes@cmpxchg.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swap.c')
-rw-r--r--mm/swap.c32
1 files changed, 27 insertions, 5 deletions
diff --git a/mm/swap.c b/mm/swap.c
index 2dc7d392642f..4dff2123f695 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -282,11 +282,33 @@ void lru_note_cost(struct page *page)
{
struct lruvec *lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
- /* Record new data point */
- if (page_is_file_lru(page))
- lruvec->file_cost++;
- else
- lruvec->anon_cost++;
+ do {
+ unsigned long lrusize;
+
+ /* Record cost event */
+ if (page_is_file_lru(page))
+ lruvec->file_cost++;
+ else
+ lruvec->anon_cost++;
+
+ /*
+ * Decay previous events
+ *
+ * Because workloads change over time (and to avoid
+ * overflow) we keep these statistics as a floating
+ * average, which ends up weighing recent refaults
+ * more than old ones.
+ */
+ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
+ lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE);
+
+ if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
+ lruvec->file_cost /= 2;
+ lruvec->anon_cost /= 2;
+ }
+ } while ((lruvec = parent_lruvec(lruvec)));
}
static void __activate_page(struct page *page, struct lruvec *lruvec,