diff options
author | Suren Baghdasaryan <surenb@google.com> | 2023-08-04 17:27:19 +0200 |
---|---|---|
committer | Andrew Morton <akpm@linux-foundation.org> | 2023-08-21 22:07:20 +0200 |
commit | 49b0638502da097c15d46cd4e871dbaa022caf7c (patch) | |
tree | a36bae8bb4dd533dbe491ae25c3e506cd4dfb679 /mm/pagewalk.c | |
parent | smaps: use vm_normal_page_pmd() instead of follow_trans_huge_pmd() (diff) | |
download | linux-49b0638502da097c15d46cd4e871dbaa022caf7c.tar.xz linux-49b0638502da097c15d46cd4e871dbaa022caf7c.zip |
mm: enable page walking API to lock vmas during the walk
walk_page_range() and friends often operate under write-locked mmap_lock.
With introduction of vma locks, the vmas have to be locked as well during
such walks to prevent concurrent page faults in these areas. Add an
additional member to mm_walk_ops to indicate locking requirements for the
walk.
The change ensures that page walks which prevent concurrent page faults
by write-locking mmap_lock, operate correctly after introduction of
per-vma locks. With per-vma locks page faults can be handled under vma
lock without taking mmap_lock at all, so write locking mmap_lock would
not stop them. The change ensures vmas are properly locked during such
walks.
A sample issue this solves is do_mbind() performing queue_pages_range()
to queue pages for migration. Without this change a concurrent page
can be faulted into the area and be left out of migration.
Link: https://lkml.kernel.org/r/20230804152724.3090321-2-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Suggested-by: Jann Horn <jannh@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michel Lespinasse <michel@lespinasse.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/pagewalk.c')
-rw-r--r-- | mm/pagewalk.c | 36 |
1 files changed, 33 insertions, 3 deletions
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2022333805d3..9b2d23fbf4d3 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -400,6 +400,33 @@ static int __walk_page_range(unsigned long start, unsigned long end, return err; } +static inline void process_mm_walk_lock(struct mm_struct *mm, + enum page_walk_lock walk_lock) +{ + if (walk_lock == PGWALK_RDLOCK) + mmap_assert_locked(mm); + else + mmap_assert_write_locked(mm); +} + +static inline void process_vma_walk_lock(struct vm_area_struct *vma, + enum page_walk_lock walk_lock) +{ +#ifdef CONFIG_PER_VMA_LOCK + switch (walk_lock) { + case PGWALK_WRLOCK: + vma_start_write(vma); + break; + case PGWALK_WRLOCK_VERIFY: + vma_assert_write_locked(vma); + break; + case PGWALK_RDLOCK: + /* PGWALK_RDLOCK is handled by process_mm_walk_lock */ + break; + } +#endif +} + /** * walk_page_range - walk page table with caller specific callbacks * @mm: mm_struct representing the target process of page table walk @@ -459,7 +486,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, if (!walk.mm) return -EINVAL; - mmap_assert_locked(walk.mm); + process_mm_walk_lock(walk.mm, ops->walk_lock); vma = find_vma(walk.mm, start); do { @@ -474,6 +501,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, if (ops->pte_hole) err = ops->pte_hole(start, next, -1, &walk); } else { /* inside vma */ + process_vma_walk_lock(vma, ops->walk_lock); walk.vma = vma; next = min(end, vma->vm_end); vma = find_vma(mm, vma->vm_end); @@ -549,7 +577,8 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - mmap_assert_locked(walk.mm); + process_mm_walk_lock(walk.mm, ops->walk_lock); + process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(start, end, &walk); } @@ -566,7 +595,8 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, if (!walk.mm) return -EINVAL; - mmap_assert_locked(walk.mm); + process_mm_walk_lock(walk.mm, ops->walk_lock); + process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } |