thp: reduce usage of huge zero page's atomic counter

The global zero page is used to satisfy an anonymous read fault. If THP(Transparent HugePage) is enabled then the global huge zero page is used. The global huge zero page uses an atomic counter for reference counting and is allocated/freed dynamically according to its counter value. CPU time spent on that counter will greatly increase if there are a lot of processes doing anonymous read faults. This patch proposes a way to reduce the access to the global counter so that the CPU load can be reduced accordingly. To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE. With this flag, the process only need to touch the global counter in two cases: 1 The first time it uses the global huge zero page; 2 The time when mm_user of its mm_struct reaches zero. Note that right now, the huge zero page is eligible to be freed as soon as its last use goes away. With this patch, the page will not be eligible to be freed until the exit of the last process from which it was ever used. And with the use of mm_user, the kthread is not eligible to use huge zero page either. Since no kthread is using huge zero page today, there is no difference after applying this patch. But if that is not desired, I can change it to when mm_count reaches zero. Case used for test on Haswell EP: usemem -n 72 --readonly -j 0x200000 100G Which spawns 72 processes and each will mmap 100G anonymous space and then do read only access to that space sequentially with a step of 2MB. CPU cycles from perf report for base commit: 54.03% usemem [kernel.kallsyms] [k] get_huge_zero_page CPU cycles from perf report for this commit: 0.11% usemem [kernel.kallsyms] [k] mm_get_huge_zero_page Performance(throughput) of the workload for base commit: 1784430792 Performance(throughput) of the workload for this commit: 4726928591 164% increase. Runtime of the workload for base commit: 707592 us Runtime of the workload for this commit: 303970 us 50% drop. Link: http://lkml.kernel.org/r/fe51a88f-446a-4622-1363-ad1282d71385@intel.com Signed-off-by: Aaron Lu <aaron.lu@intel.com> Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Tim Chen <tim.c.chen@linux.intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Jerome Marchand <jmarchan@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Aaron Lu <aaron.lu@intel.com> 2016-10-08 02:00:08 +0200
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-10-08 03:46:28 +0200
commit: 6fcb52a56ff60d240f06296b12827e7f20d45f63 (patch)
tree: 662e3a878be89475dc0bf52a85d5396191e2166d /mm/huge_memory.c
parent: fs/proc/task_mmu.c: make the task_mmu walk_page_range() limit in clear_refs_w... (diff)
download: linux-6fcb52a56ff60d240f06296b12827e7f20d45f63.tar.xz
linux-6fcb52a56ff60d240f06296b12827e7f20d45f63.zip
1 files changed, 25 insertions, 11 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a0b0e562407d..12b9f1a39b63 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
 {
 	struct page *zero_page;
 retry:
@@ -86,7 +86,7 @@ retry:
 	return READ_ONCE(huge_zero_page);
 }
 
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
 {
 	/*
 	 * Counter should never go to zero here. Only shrinker can put
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		return READ_ONCE(huge_zero_page);
+
+	if (!get_huge_zero_page())
+		return NULL;
+
+	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+
+	return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+}
+
 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
 					struct shrink_control *sc)
 {
@@ -644,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 		pgtable = pte_alloc_one(vma->vm_mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(vma->vm_mm);
 		if (unlikely(!zero_page)) {
 			pte_free(vma->vm_mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
@@ -666,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 			}
 		} else
 			spin_unlock(fe->ptl);
-		if (!set) {
+		if (!set)
 			pte_free(vma->vm_mm, pgtable);
-			put_huge_zero_page();
-		}
 		return ret;
 	}
 	gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -823,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(dst_mm);
 		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
 				zero_page);
 		ret = 0;
@@ -1081,7 +1099,6 @@ alloc:
 		update_mmu_cache_pmd(vma, fe->address, fe->pmd);
 		if (!page) {
 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-			put_huge_zero_page();
 		} else {
 			VM_BUG_ON_PAGE(!PageHead(page), page);
 			page_remove_rmap(page, true);
@@ -1542,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	}
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
-	put_huge_zero_page();
 }
 
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1565,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (!vma_is_anonymous(vma)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-		if (is_huge_zero_pmd(_pmd))
-			put_huge_zero_page();
 		if (vma_is_dax(vma))
 			return;
 		page = pmd_page(_pmd);
author	Aaron Lu <aaron.lu@intel.com>	2016-10-08 02:00:08 +0200
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-10-08 03:46:28 +0200
commit	6fcb52a56ff60d240f06296b12827e7f20d45f63 (patch)
tree	662e3a878be89475dc0bf52a85d5396191e2166d /mm/huge_memory.c
parent	fs/proc/task_mmu.c: make the task_mmu walk_page_range() limit in clear_refs_w... (diff)
download	linux-6fcb52a56ff60d240f06296b12827e7f20d45f63.tar.xz linux-6fcb52a56ff60d240f06296b12827e7f20d45f63.zip