mm: enable page walking API to lock vmas during the walk

walk_page_range() and friends often operate under write-locked mmap_lock. With introduction of vma locks, the vmas have to be locked as well during such walks to prevent concurrent page faults in these areas. Add an additional member to mm_walk_ops to indicate locking requirements for the walk. The change ensures that page walks which prevent concurrent page faults by write-locking mmap_lock, operate correctly after introduction of per-vma locks. With per-vma locks page faults can be handled under vma lock without taking mmap_lock at all, so write locking mmap_lock would not stop them. The change ensures vmas are properly locked during such walks. A sample issue this solves is do_mbind() performing queue_pages_range() to queue pages for migration. Without this change a concurrent page can be faulted into the area and be left out of migration. Link: https://lkml.kernel.org/r/20230804152724.3090321-2-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org> Suggested-by: Jann Horn <jannh@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Hugh Dickins <hughd@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Laurent Dufour <ldufour@linux.ibm.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Michel Lespinasse <michel@lespinasse.org> Cc: Peter Xu <peterx@redhat.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
author: Suren Baghdasaryan <surenb@google.com> 2023-08-04 17:27:19 +0200
committer: Andrew Morton <akpm@linux-foundation.org> 2023-08-21 22:07:20 +0200
commit: 49b0638502da097c15d46cd4e871dbaa022caf7c (patch)
tree: a36bae8bb4dd533dbe491ae25c3e506cd4dfb679 /mm/mempolicy.c
parent: smaps: use vm_normal_page_pmd() instead of follow_trans_huge_pmd() (diff)
download: linux-49b0638502da097c15d46cd4e871dbaa022caf7c.tar.xz
linux-49b0638502da097c15d46cd4e871dbaa022caf7c.zip
1 files changed, 14 insertions, 8 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c53f8beeb507..ec2eaceffd74 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -718,6 +718,14 @@ static const struct mm_walk_ops queue_pages_walk_ops = {
 	.hugetlb_entry		= queue_folios_hugetlb,
 	.pmd_entry		= queue_folios_pte_range,
 	.test_walk		= queue_pages_test_walk,
+	.walk_lock		= PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
+	.hugetlb_entry		= queue_folios_hugetlb,
+	.pmd_entry		= queue_folios_pte_range,
+	.test_walk		= queue_pages_test_walk,
+	.walk_lock		= PGWALK_WRLOCK,
 };
 
 /*
@@ -738,7 +746,7 @@ static const struct mm_walk_ops queue_pages_walk_ops = {
 static int
 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		nodemask_t *nodes, unsigned long flags,
-		struct list_head *pagelist)
+		struct list_head *pagelist, bool lock_vma)
 {
 	int err;
 	struct queue_pages qp = {
@@ -749,8 +757,10 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		.end = end,
 		.first = NULL,
 	};
+	const struct mm_walk_ops *ops = lock_vma ?
+			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
 
-	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+	err = walk_page_range(mm, start, end, ops, &qp);
 
 	if (!qp.first)
 		/* whole range in hole */
@@ -1078,7 +1088,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 	vma = find_vma(mm, 0);
 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
 	queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
-			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
@@ -1321,12 +1331,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 	 * Lock the VMAs before scanning for pages to migrate, to ensure we don't
 	 * miss a concurrently inserted page.
 	 */
-	vma_iter_init(&vmi, mm, start);
-	for_each_vma_range(vmi, vma, end)
-		vma_start_write(vma);
-
 	ret = queue_pages_range(mm, start, end, nmask,
-			  flags | MPOL_MF_INVERT, &pagelist);
+			  flags | MPOL_MF_INVERT, &pagelist, true);
 
 	if (ret < 0) {
 		err = ret;
author	Suren Baghdasaryan <surenb@google.com>	2023-08-04 17:27:19 +0200
committer	Andrew Morton <akpm@linux-foundation.org>	2023-08-21 22:07:20 +0200
commit	49b0638502da097c15d46cd4e871dbaa022caf7c (patch)
tree	a36bae8bb4dd533dbe491ae25c3e506cd4dfb679 /mm/mempolicy.c
parent	smaps: use vm_normal_page_pmd() instead of follow_trans_huge_pmd() (diff)
download	linux-49b0638502da097c15d46cd4e871dbaa022caf7c.tar.xz linux-49b0638502da097c15d46cd4e871dbaa022caf7c.zip