summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/exec.c81
-rw-r--r--fs/proc/internal.h3
-rw-r--r--fs/proc/page.c11
-rw-r--r--fs/userfaultfd.c171
-rw-r--r--fs/xfs/scrub/xfile.c6
-rw-r--r--fs/xfs/xfs_buf_mem.c2
-rw-r--r--fs/xfs/xfs_log_recover.c2
8 files changed, 33 insertions, 247 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index a46b0cbc4d8f..0e4efec1d92e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -288,6 +288,10 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
+config HUGETLB_PMD_PAGE_TABLE_SHARING
+ def_bool HUGETLB_PAGE
+ depends on ARCH_WANT_HUGE_PMD_SHARE && SPLIT_PMD_PTLOCKS
+
config ARCH_HAS_GIGANTIC_PAGE
bool
diff --git a/fs/exec.c b/fs/exec.c
index dad402d55681..6c53920795c2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -710,80 +710,6 @@ static int copy_strings_kernel(int argc, const char *const *argv,
#ifdef CONFIG_MMU
/*
- * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
- * the binfmt code determines where the new stack should reside, we shift it to
- * its final location. The process proceeds as follows:
- *
- * 1) Use shift to calculate the new vma endpoints.
- * 2) Extend vma to cover both the old and new ranges. This ensures the
- * arguments passed to subsequent functions are consistent.
- * 3) Move vma's page tables to the new range.
- * 4) Free up any cleared pgd range.
- * 5) Shrink the vma to cover only the new range.
- */
-static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long old_start = vma->vm_start;
- unsigned long old_end = vma->vm_end;
- unsigned long length = old_end - old_start;
- unsigned long new_start = old_start - shift;
- unsigned long new_end = old_end - shift;
- VMA_ITERATOR(vmi, mm, new_start);
- struct vm_area_struct *next;
- struct mmu_gather tlb;
-
- BUG_ON(new_start > new_end);
-
- /*
- * ensure there are no vmas between where we want to go
- * and where we are
- */
- if (vma != vma_next(&vmi))
- return -EFAULT;
-
- vma_iter_prev_range(&vmi);
- /*
- * cover the whole range: [new_start, old_end)
- */
- if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL))
- return -ENOMEM;
-
- /*
- * move the page tables downwards, on failure we rely on
- * process cleanup to remove whatever mess we made.
- */
- if (length != move_page_tables(vma, old_start,
- vma, new_start, length, false, true))
- return -ENOMEM;
-
- lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- next = vma_next(&vmi);
- if (new_end > old_start) {
- /*
- * when the old and new regions overlap clear from new_end.
- */
- free_pgd_range(&tlb, new_end, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- } else {
- /*
- * otherwise, clean from old_start; this is done to not touch
- * the address space in [new_end, old_start) some architectures
- * have constraints on va-space that make this illegal (IA64) -
- * for the others its just a little faster.
- */
- free_pgd_range(&tlb, old_start, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- }
- tlb_finish_mmu(&tlb);
-
- vma_prev(&vmi);
- /* Shrink the vma to just the new range */
- return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
-}
-
-/*
* Finalizes the stack vm_area_struct. The flags and permissions are updated,
* the stack is optionally relocated, and some extra space is added.
*/
@@ -876,7 +802,12 @@ int setup_arg_pages(struct linux_binprm *bprm,
/* Move stack pages down in memory. */
if (stack_shift) {
- ret = shift_arg_pages(vma, stack_shift);
+ /*
+ * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
+ * the binfmt code determines where the new stack should reside, we shift it to
+ * its final location.
+ */
+ ret = relocate_vma_down(vma, stack_shift);
if (ret)
goto out_unlock;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9e3f25e4c188..87e4d6282025 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -166,8 +166,7 @@ static inline int folio_precise_page_mapcount(struct folio *folio,
{
int mapcount = atomic_read(&page->_mapcount) + 1;
- /* Handle page_has_type() pages */
- if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
+ if (page_mapcount_is_type(mapcount))
mapcount = 0;
if (folio_test_large(folio))
mapcount += folio_entire_mapcount(folio);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b7a5c84b5819..a55f5acefa97 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -182,7 +182,6 @@ u64 stable_page_flags(const struct page *page)
#endif
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
- u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback);
@@ -207,18 +206,16 @@ u64 stable_page_flags(const struct page *page)
u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison);
#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
-#endif
-
u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
- u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk);
+ u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2);
u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private);
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
-#ifdef CONFIG_ARCH_USES_PG_ARCH_X
+#ifdef CONFIG_ARCH_USES_PG_ARCH_2
u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
+#ifdef CONFIG_ARCH_USES_PG_ARCH_3
u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
#endif
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 27a3e9285fbf..68cdd89c97a3 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -104,21 +104,6 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
}
-static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
- vm_flags_t flags)
-{
- const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
-
- vm_flags_reset(vma, flags);
- /*
- * For shared mappings, we want to enable writenotify while
- * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
- * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
- */
- if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
- vma_set_page_prot(vma);
-}
-
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -386,15 +371,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
unsigned int blocking_state;
/*
- * We don't do userfault handling for the final child pid update.
- *
- * We also don't do userfault handling during
- * coredumping. hugetlbfs has the special
- * hugetlb_follow_page_mask() to skip missing pages in the
- * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
- * the no_page_table() helper in follow_page_mask(), but the
- * shmem_vm_ops->fault method is invoked even during
- * coredumping and it ends up here.
+ * We don't do userfault handling for the final child pid update
+ * and when coredumping (faults triggered by get_dump_page()).
*/
if (current->flags & (PF_EXITING|PF_DUMPCORE))
goto out;
@@ -615,22 +593,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
spin_unlock_irq(&ctx->event_wqh.lock);
if (release_new_ctx) {
- struct vm_area_struct *vma;
- struct mm_struct *mm = release_new_ctx->mm;
- VMA_ITERATOR(vmi, mm, 0);
-
- /* the various vma->vm_userfaultfd_ctx still points to it */
- mmap_write_lock(mm);
- for_each_vma(vmi, vma) {
- if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma,
- vma->vm_flags & ~__VM_UFFD_FLAGS);
- }
- }
- mmap_write_unlock(mm);
-
+ userfaultfd_release_new(release_new_ctx);
userfaultfd_ctx_put(release_new_ctx);
}
@@ -662,9 +625,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
return 0;
if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ userfaultfd_reset_ctx(vma);
return 0;
}
@@ -749,9 +710,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
up_write(&ctx->map_changing_lock);
} else {
/* Drop uffd context if remap feature not enabled */
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ userfaultfd_reset_ctx(vma);
}
}
@@ -870,54 +829,14 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
{
struct userfaultfd_ctx *ctx = file->private_data;
struct mm_struct *mm = ctx->mm;
- struct vm_area_struct *vma, *prev;
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
- unsigned long new_flags;
- VMA_ITERATOR(vmi, mm, 0);
WRITE_ONCE(ctx->released, true);
- if (!mmget_not_zero(mm))
- goto wakeup;
+ userfaultfd_release_all(mm, ctx);
/*
- * Flush page faults out of all CPUs. NOTE: all page faults
- * must be retried without returning VM_FAULT_SIGBUS if
- * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
- * changes while handle_userfault released the mmap_lock. So
- * it's critical that released is set to true (above), before
- * taking the mmap_lock for writing.
- */
- mmap_write_lock(mm);
- prev = NULL;
- for_each_vma(vmi, vma) {
- cond_resched();
- BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
- !!(vma->vm_flags & __VM_UFFD_FLAGS));
- if (vma->vm_userfaultfd_ctx.ctx != ctx) {
- prev = vma;
- continue;
- }
- /* Reset ptes for the whole vma range if wr-protected */
- if (userfaultfd_wp(vma))
- uffd_wp_range(vma, vma->vm_start,
- vma->vm_end - vma->vm_start, false);
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
- vma->vm_end, new_flags,
- NULL_VM_UFFD_CTX);
-
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-
- prev = vma;
- }
- mmap_write_unlock(mm);
- mmput(mm);
-wakeup:
- /*
* After no new page faults can wait on this fault_*wqh, flush
* the last page faults that may have been already waiting on
* the fault_*wqh.
@@ -1293,14 +1212,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
struct mm_struct *mm = ctx->mm;
- struct vm_area_struct *vma, *prev, *cur;
+ struct vm_area_struct *vma, *cur;
int ret;
struct uffdio_register uffdio_register;
struct uffdio_register __user *user_uffdio_register;
- unsigned long vm_flags, new_flags;
+ unsigned long vm_flags;
bool found;
bool basic_ioctls;
- unsigned long start, end, vma_end;
+ unsigned long start, end;
struct vma_iterator vmi;
bool wp_async = userfaultfd_wp_async_ctx(ctx);
@@ -1428,57 +1347,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
} for_each_vma_range(vmi, cur, end);
BUG_ON(!found);
- vma_iter_set(&vmi, start);
- prev = vma_prev(&vmi);
- if (vma->vm_start < start)
- prev = vma;
-
- ret = 0;
- for_each_vma_range(vmi, vma, end) {
- cond_resched();
-
- BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
- BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
- vma->vm_userfaultfd_ctx.ctx != ctx);
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
-
- /*
- * Nothing to do: this vma is already registered into this
- * userfaultfd and with the right tracking mode too.
- */
- if (vma->vm_userfaultfd_ctx.ctx == ctx &&
- (vma->vm_flags & vm_flags) == vm_flags)
- goto skip;
-
- if (vma->vm_start > start)
- start = vma->vm_start;
- vma_end = min(end, vma->vm_end);
-
- new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
- new_flags,
- (struct vm_userfaultfd_ctx){ctx});
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- break;
- }
-
- /*
- * In the vma_merge() successful mprotect-like case 8:
- * the next vma was merged into the current one and
- * the current one has not been updated yet.
- */
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx.ctx = ctx;
-
- if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
- hugetlb_unshare_all_pmds(vma);
-
- skip:
- prev = vma;
- start = vma->vm_end;
- }
+ ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
+ wp_async);
out_unlock:
mmap_write_unlock(mm);
@@ -1519,7 +1389,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
struct vm_area_struct *vma, *prev, *cur;
int ret;
struct uffdio_range uffdio_unregister;
- unsigned long new_flags;
bool found;
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
@@ -1622,27 +1491,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
}
- /* Reset ptes for the whole vma range if wr-protected */
- if (userfaultfd_wp(vma))
- uffd_wp_range(vma, start, vma_end - start, false);
-
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
- new_flags, NULL_VM_UFFD_CTX);
+ vma = userfaultfd_clear_vma(&vmi, prev, vma,
+ start, vma_end);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
break;
}
- /*
- * In the vma_merge() successful mprotect-like case 8:
- * the next vma was merged into the current one and
- * the current one has not been updated yet.
- */
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-
skip:
prev = vma;
start = vma->vm_end;
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 9b5d98fe1f8a..c753c79df203 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -126,7 +126,7 @@ xfile_load(
unsigned int len;
unsigned int offset;
- if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
SGP_READ) < 0)
break;
if (!folio) {
@@ -196,7 +196,7 @@ xfile_store(
unsigned int len;
unsigned int offset;
- if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
SGP_CACHE) < 0)
break;
if (filemap_check_wb_err(inode->i_mapping, 0)) {
@@ -267,7 +267,7 @@ xfile_get_folio(
i_size_write(inode, pos + len);
pflags = memalloc_nofs_save();
- error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
memalloc_nofs_restore(pflags);
if (error)
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 9bb2d24de709..07bebbfb16ee 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -149,7 +149,7 @@ xmbuf_map_page(
return -ENOMEM;
}
- error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE);
if (error)
return error;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1a74fe22672e..ec766b4bc853 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2128,7 +2128,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
+ ptr = kvrealloc(old_ptr, len + old_len, GFP_KERNEL);
if (!ptr)
return -ENOMEM;
memcpy(&ptr[old_len], dp, len);