summaryrefslogtreecommitdiffstats
path: root/mm/gup.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/gup.c')
-rw-r--r--mm/gup.c158
1 files changed, 125 insertions, 33 deletions
diff --git a/mm/gup.c b/mm/gup.c
index e2a39e30756d..5abdaf487460 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -134,7 +134,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
* path.
*/
if (unlikely((flags & FOLL_LONGTERM) &&
- !is_pinnable_page(page)))
+ !is_longterm_pinnable_page(page)))
return NULL;
/*
@@ -478,14 +478,42 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
return -EEXIST;
}
-/*
- * FOLL_FORCE can write to even unwritable pte's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
+static inline bool can_follow_write_pte(pte_t pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
{
- return pte_write(pte) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+ /* If the pte is writable, we can write to the page. */
+ if (pte_write(pte))
+ return true;
+
+ /* Maybe FOLL_FORCE is set to override it? */
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /* But FOLL_FORCE has no effect on shared mappings */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return false;
+
+ /* ... or read-only private ones */
+ if (!(vma->vm_flags & VM_MAYWRITE))
+ return false;
+
+ /* ... or already writable ones that just need to take a write fault */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page ...
+ */
+ if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ return false;
+
+ /* ... and a write-fault isn't required for other reasons. */
+ if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
+ return false;
+ return !userfaultfd_pte_wp(vma, pte);
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -528,12 +556,19 @@ retry:
}
if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
- if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
- pte_unmap_unlock(ptep, ptl);
- return NULL;
- }
page = vm_normal_page(vma, address, pte);
+
+ /*
+ * We only care about anon pages in can_follow_write_pte() and don't
+ * have to worry about pte_devmap() because they are never anon.
+ */
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pte(pte, page, vma, flags)) {
+ page = NULL;
+ goto out;
+ }
+
if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
/*
* Only return device mapping pages in the FOLL_GET or FOLL_PIN
@@ -953,6 +988,25 @@ static int faultin_page(struct vm_area_struct *vma,
}
ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+ if (ret & VM_FAULT_COMPLETED) {
+ /*
+ * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
+ * mmap lock in the page fault handler. Sanity check this.
+ */
+ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
+ if (locked)
+ *locked = 0;
+ /*
+ * We should do the same as VM_FAULT_RETRY, but let's not
+ * return -EBUSY since that's not reflecting the reality of
+ * what has happened - we've just fully completed a page
+ * fault, with the mmap lock released. Use -EAGAIN to show
+ * that we want to take the mmap lock _again_.
+ */
+ return -EAGAIN;
+ }
+
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
@@ -967,17 +1021,6 @@ static int faultin_page(struct vm_area_struct *vma,
return -EBUSY;
}
- /*
- * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
- * necessary, even if maybe_mkwrite decided not to set pte_write. We
- * can thus safely do subsequent page lookups as if they were reads.
- * But only do so when looping for pte_write is futile: in some cases
- * userspace may also be wanting to write to the gotten user page,
- * which a read fault here might prevent (a readonly page might get
- * reCOWed by userspace write).
- */
- if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
- *flags |= FOLL_COW;
return 0;
}
@@ -1179,6 +1222,7 @@ retry:
case 0:
goto retry;
case -EBUSY:
+ case -EAGAIN:
ret = 0;
fallthrough;
case -EFAULT:
@@ -1305,6 +1349,18 @@ retry:
return -EINTR;
ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+ if (ret & VM_FAULT_COMPLETED) {
+ /*
+ * NOTE: it's a pity that we need to retake the lock here
+ * to pair with the unlock() in the callers. Ideally we
+ * could tell the callers so they do not need to unlock.
+ */
+ mmap_read_lock(mm);
+ *unlocked = true;
+ return 0;
+ }
+
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
@@ -1370,7 +1426,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
/* VM_FAULT_RETRY couldn't trigger, bypass */
return ret;
- /* VM_FAULT_RETRY cannot return errors */
+ /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
if (!*locked) {
BUG_ON(ret < 0);
BUG_ON(ret >= nr_pages);
@@ -1674,7 +1730,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
goto finish_or_fault;
if (pages) {
- pages[i] = virt_to_page(start);
+ pages[i] = virt_to_page((void *)start);
if (pages[i])
get_page(pages[i]);
}
@@ -1883,7 +1939,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
unsigned long isolation_error_count = 0, i;
struct folio *prev_folio = NULL;
LIST_HEAD(movable_page_list);
- bool drain_allow = true;
+ bool drain_allow = true, coherent_pages = false;
int ret = 0;
for (i = 0; i < nr_pages; i++) {
@@ -1893,14 +1949,43 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
continue;
prev_folio = folio;
- if (folio_is_pinnable(folio))
+ /*
+ * Device coherent pages are managed by a driver and should not
+ * be pinned indefinitely as it prevents the driver moving the
+ * page. So when trying to pin with FOLL_LONGTERM instead try
+ * to migrate the page out of device memory.
+ */
+ if (folio_is_device_coherent(folio)) {
+ /*
+ * We always want a new GUP lookup with device coherent
+ * pages.
+ */
+ pages[i] = 0;
+ coherent_pages = true;
+
+ /*
+ * Migration will fail if the page is pinned, so convert
+ * the pin on the source page to a normal reference.
+ */
+ if (gup_flags & FOLL_PIN) {
+ get_page(&folio->page);
+ unpin_user_page(&folio->page);
+ }
+
+ ret = migrate_device_coherent_page(&folio->page);
+ if (ret)
+ goto unpin_pages;
+
continue;
+ }
+ if (folio_is_longterm_pinnable(folio))
+ continue;
/*
* Try to move out any movable page before pinning the range.
*/
if (folio_test_hugetlb(folio)) {
- if (!isolate_huge_page(&folio->page,
+ if (isolate_hugetlb(&folio->page,
&movable_page_list))
isolation_error_count++;
continue;
@@ -1921,7 +2006,8 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
folio_nr_pages(folio));
}
- if (!list_empty(&movable_page_list) || isolation_error_count)
+ if (!list_empty(&movable_page_list) || isolation_error_count ||
+ coherent_pages)
goto unpin_pages;
/*
@@ -1931,10 +2017,16 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
return nr_pages;
unpin_pages:
- if (gup_flags & FOLL_PIN) {
- unpin_user_pages(pages, nr_pages);
- } else {
- for (i = 0; i < nr_pages; i++)
+ /*
+ * pages[i] might be NULL if any device coherent pages were found.
+ */
+ for (i = 0; i < nr_pages; i++) {
+ if (!pages[i])
+ continue;
+
+ if (gup_flags & FOLL_PIN)
+ unpin_user_page(pages[i]);
+ else
put_page(pages[i]);
}