diff options
author | Alex Shi <alex.shi@linux.alibaba.com> | 2020-12-15 21:34:29 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-15 23:48:04 +0100 |
commit | 6168d0da2b479ce25a4647de194045de1bdd1f1d (patch) | |
tree | d3a8a1d484d5ca32097347ca9b314483b6186fbd /mm/swap.c | |
parent | mm/swap.c: serialize memcg changes in pagevec_lru_move_fn (diff) | |
download | linux-6168d0da2b479ce25a4647de194045de1bdd1f1d.tar.xz linux-6168d0da2b479ce25a4647de194045de1bdd1f1d.zip |
mm/lru: replace pgdat lru_lock with lruvec lock
This patch moves per node lru_lock into lruvec, thus bring a lru_lock for
each of memcg per node. So on a large machine, each of memcg don't have
to suffer from per node pgdat->lru_lock competition. They could go fast
with their self lru_lock.
After move memcg charge before lru inserting, page isolation could
serialize page's memcg, then per memcg lruvec lock is stable and could
replace per node lru lock.
In isolate_migratepages_block(), compact_unlock_should_abort and
lock_page_lruvec_irqsave are open coded to work with compact_control.
Also add a debug func in locking which may give some clues if there are
sth out of hands.
Daniel Jordan's testing show 62% improvement on modified readtwice case on
his 2P * 10 core * 2 HT broadwell box.
https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/
Hugh Dickins helped on the patch polish, thanks!
[alex.shi@linux.alibaba.com: fix comment typo]
Link: https://lkml.kernel.org/r/5b085715-292a-4b43-50b3-d73dc90d1de5@linux.alibaba.com
[alex.shi@linux.alibaba.com: use page_memcg()]
Link: https://lkml.kernel.org/r/5a4c2b72-7ee8-2478-fc0e-85eb83aafec4@linux.alibaba.com
Link: https://lkml.kernel.org/r/1604566549-62481-18-git-send-email-alex.shi@linux.alibaba.com
Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rong Chen <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mika Penttilä <mika.penttila@nextfour.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swap.c')
-rw-r--r-- | mm/swap.c | 116 |
1 files changed, 58 insertions, 58 deletions
diff --git a/mm/swap.c b/mm/swap.c index d952af770053..ba9fc21b24ea 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { static void __page_cache_release(struct page *page) { if (PageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(&pgdat->lru_lock, flags); - lruvec = mem_cgroup_page_lruvec(page, pgdat); + lruvec = lock_page_lruvec_irqsave(page, &flags); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); } @@ -207,32 +205,30 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec)) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); - - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); - } + struct lruvec *new_lruvec; /* block memcg migration during page moving between lru */ if (!TestClearPageLRU(page)) continue; - lruvec = mem_cgroup_page_lruvec(page, pgdat); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); + } + (*move_fn)(page, lruvec); SetPageLRU(page); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } @@ -274,9 +270,15 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { do { unsigned long lrusize; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock_irq(&pgdat->lru_lock); + /* + * Hold lruvec->lru_lock is safe here, since + * 1) The pinned lruvec in reclaim, or + * 2) From a pre-LRU page during refault (which also holds the + * rcu lock, so would be safe even if the page was on the LRU + * and could move simultaneously to a new lruvec). + */ + spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += nr_pages; @@ -300,7 +302,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } @@ -364,13 +366,15 @@ static inline void activate_page_drain(int cpu) static void activate_page(struct page *page) { - pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; page = compound_head(page); - spin_lock_irq(&pgdat->lru_lock); - if (PageLRU(page)) - __activate_page(page, mem_cgroup_page_lruvec(page, pgdat)); - spin_unlock_irq(&pgdat->lru_lock); + if (TestClearPageLRU(page)) { + lruvec = lock_page_lruvec_irq(page); + __activate_page(page, lruvec); + unlock_page_lruvec_irq(lruvec); + SetPageLRU(page); + } } #endif @@ -860,8 +864,7 @@ void release_pages(struct page **pages, int nr) { int i; LIST_HEAD(pages_to_free); - struct pglist_data *locked_pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags; unsigned int lock_batch; @@ -871,11 +874,11 @@ void release_pages(struct page **pages, int nr) /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the - * same pgdat. The lock is held only if pgdat != NULL. + * same lruvec. The lock is held only if lruvec != NULL. */ - if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } page = compound_head(page); @@ -883,10 +886,9 @@ void release_pages(struct page **pages, int nr) continue; if (is_zone_device_page(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, - flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } /* * ZONE_DEVICE pages that return 'false' from @@ -907,27 +909,27 @@ void release_pages(struct page **pages, int nr) continue; if (PageCompound(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { - struct pglist_data *pgdat = page_pgdat(page); + struct lruvec *new_lruvec; - if (pgdat != locked_pgdat) { - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, + new_lruvec = mem_cgroup_page_lruvec(page, + page_pgdat(page)); + if (new_lruvec != lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); lock_batch = 0; - locked_pgdat = pgdat; - spin_lock_irqsave(&locked_pgdat->lru_lock, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); } - lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -937,8 +939,8 @@ void release_pages(struct page **pages, int nr) list_add(&page->lru, &pages_to_free); } - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); @@ -1026,26 +1028,24 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) void __pagevec_lru_add(struct pagevec *pvec) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); + struct lruvec *new_lruvec; - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); } - lruvec = mem_cgroup_page_lruvec(page, pgdat); __pagevec_lru_add_fn(page, lruvec); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } |