diff options
Diffstat (limited to 'mm/gup.c')
-rw-r--r-- | mm/gup.c | 133 |
1 files changed, 102 insertions, 31 deletions
@@ -382,13 +382,22 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, } /* - * FOLL_FORCE can write to even unwritable pte's, but only - * after we've gone through a COW cycle and they are dirty. + * FOLL_FORCE or a forced COW break can write even to unwritable pte's, + * but only after we've gone through a COW cycle and they are dirty. */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_write(pte) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); + return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte)); +} + +/* + * A (separate) COW fault might break the page the other way and + * get_user_pages() would return the page from what is now the wrong + * VM. So we need to force a COW break at GUP time even for reads. + */ +static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags) +{ + return is_cow_mapping(vma->vm_flags) && (flags & (FOLL_GET | FOLL_PIN)); } static struct page *follow_page_pte(struct vm_area_struct *vma, @@ -1066,9 +1075,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, goto out; } if (is_vm_hugetlb_page(vma)) { + if (should_force_cow_break(vma, foll_flags)) + foll_flags |= FOLL_WRITE; i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, - gup_flags, locked); + foll_flags, locked); if (locked && *locked == 0) { /* * We've got a VM_FAULT_RETRY @@ -1082,6 +1093,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } } + + if (should_force_cow_break(vma, foll_flags)) + foll_flags |= FOLL_WRITE; + retry: /* * If we have a pending SIGKILL, don't keep faulting pages and @@ -1168,7 +1183,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma, return true; } -/* +/** * fixup_user_fault() - manually resolve a user page fault * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1176,7 +1191,8 @@ static bool vma_permits_fault(struct vm_area_struct *vma, * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller - * does not allow retry + * does not allow retry. If NULL, the caller must guarantee + * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() @@ -1839,7 +1855,7 @@ static long __get_user_pages_remote(struct task_struct *tsk, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } -/* +/** * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1870,13 +1886,13 @@ static long __get_user_pages_remote(struct task_struct *tsk, * * Must be called with mmap_sem held for read or write. * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given + * get_user_pages_remote walks a process's page tables and takes a reference + * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different + * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page @@ -1888,17 +1904,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. + * get_user_pages_remote is typically used for fewer-copy IO operations, + * to get a handle on the memory by some means other than accesses + * via the user virtual addresses. The pages may be submitted for + * DMA to devices or accessed via their kernel linear mapping (via the + * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. * - * get_user_pages should be phased out in favor of + * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing - * should use get_user_pages because it cannot pass + * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -1937,7 +1953,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, } #endif /* !CONFIG_MMU */ -/* +/** + * get_user_pages() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task * and mm being operated on are the current task's and don't allow @@ -1960,11 +1986,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/* - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * +/** * get_user_pages_locked() is suitable to replace the form: * * down_read(&mm->mmap_sem); @@ -1980,6 +2002,21 @@ EXPORT_SYMBOL(get_user_pages); * get_user_pages_locked(tsk, mm, ..., pages, &locked); * if (locked) * up_read(&mm->mmap_sem); + * + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). + * */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -2674,6 +2711,10 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) * * If the architecture does not support this function, simply return with no * pages pinned. + * + * Careful, careful! COW breaking can go either way, so a non-write + * access can get ambiguous page results. If you call this function without + * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) @@ -2709,6 +2750,12 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * * We do not adopt an rcu_read_lock(.) here as we also want to * block IPIs that come from THPs splitting. + * + * NOTE! We allow read-only gup_fast() here, but you'd better be + * careful about possible COW pages. You'll get _a_ COW page, but + * not necessarily the one you intended to get depending on what + * COW event happens after this. COW may break the page copy in a + * random direction. */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && @@ -2766,10 +2813,17 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; + /* + * The FAST_GUP case requires FOLL_WRITE even for pure reads, + * because get_user_pages() may need to cause an early COW in + * order to avoid confusing the normal COW routines. So only + * targets that are already writable are safe to do by just + * looking at the page tables. + */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { local_irq_disable(); - gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned); + gup_pgd_range(addr, end, gup_flags | FOLL_WRITE, pages, &nr_pinned); local_irq_enable(); ret = nr_pinned; } @@ -2845,9 +2899,9 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for further details. + * see Documentation/core-api/pin_user_pages.rst for further details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ int pin_user_pages_fast(unsigned long start, int nr_pages, @@ -2885,9 +2939,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -2921,9 +2975,9 @@ EXPORT_SYMBOL(pin_user_pages_remote); * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages(unsigned long start, unsigned long nr_pages, @@ -2939,3 +2993,20 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); + +/* + * pin_user_pages_unlocked() is the FOLL_PIN variant of + * get_user_pages_unlocked(). Behavior is the same, except that this one sets + * FOLL_PIN and rejects FOLL_GET. + */ +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) +{ + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); +} +EXPORT_SYMBOL(pin_user_pages_unlocked); |