Diffstat (limited to 'arch/powerpc/mm/book3s64')
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 2
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_native.c | 2
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_pgtable.c | 21
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_tlb.c | 23
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_utils.c | 121
-rw-r--r-- | arch/powerpc/mm/book3s64/internal.h | 16
-rw-r--r-- | arch/powerpc/mm/book3s64/iommu_api.c | 4
-rw-r--r-- | arch/powerpc/mm/book3s64/pgtable.c | 45
-rw-r--r-- | arch/powerpc/mm/book3s64/pkeys.c | 311
-rw-r--r-- | arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 2
-rw-r--r-- | arch/powerpc/mm/book3s64/radix_pgtable.c | 280
-rw-r--r-- | arch/powerpc/mm/book3s64/radix_tlb.c | 86
-rw-r--r-- | arch/powerpc/mm/book3s64/slb.c | 168
-rw-r--r-- | arch/powerpc/mm/book3s64/subpage_prot.c | 22
14 files changed, 655 insertions, 448 deletions
diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c index eefa89c6117b..964467b3a776 100644 --- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c @@ -10,8 +10,6 @@ #include <linux/mm.h> #include <linux/hugetlb.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/machdep.h> diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index d2d8237ea9d5..cf20e5229ce1 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -14,11 +14,11 @@ #include <linux/processor.h> #include <linux/threads.h> #include <linux/smp.h> +#include <linux/pgtable.h> #include <asm/machdep.h> #include <asm/mmu.h> #include <asm/mmu_context.h> -#include <asm/pgtable.h> #include <asm/trace.h> #include <asm/tlb.h> #include <asm/cputable.h> diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 64733b9cb20a..fd9c7f91b092 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -9,8 +9,6 @@ #include <linux/mm_types.h> #include <linux/mm.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/sections.h> #include <asm/mmu.h> #include <asm/tlb.h> @@ -148,6 +146,7 @@ void hash__vmemmap_remove_mapping(unsigned long start, int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -155,7 +154,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); if (slab_is_available()) { pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; pmdp = pmd_alloc(&init_mm, pudp, ea); @@ -236,7 +236,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * to hugepage, we first clear the pmd, then invalidate all * the PTE entries. The assumption here is that any low level * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a + * will wait on mmap_lock. But we could very well be in a * hash_page with local ptep pointer value. Such a hash page * can result in adding new HPTE entries for normal subpages. * That means we could be modifying the page content as we @@ -250,7 +250,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres * Now invalidate the hpte entries in the range * covered by pmd. This make sure we take a * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and + * result in a major fault which takes mmap_lock and * hence wait for collapse to complete. Without this * the __collapse_huge_page_copy can result in copying * the old content. @@ -363,17 +363,6 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, * hash fault look at them. */ memset(pgtable, 0, PTE_FRAG_SIZE); - /* - * Serialize against find_current_mm_pte variants which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. 
- * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_curren_mm_pte to finish. - */ - serialize_against_pte_lookup(mm); return old_pmd; } diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 4a70d8dd39cd..eb0bccaf221e 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -21,7 +21,6 @@ #include <linux/mm.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/bug.h> @@ -176,7 +175,6 @@ void hash__tlb_flush(struct mmu_gather *tlb) * from the hash table (and the TLB). But keeps * the linux PTEs intact. * - * @mm : mm_struct of the target address space (generally init_mm) * @start : starting address * @end : ending address (not included in the flush) * @@ -189,17 +187,14 @@ void hash__tlb_flush(struct mmu_gather *tlb) * Because of that usage pattern, it is implemented for small size rather * than speed. */ -void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, - unsigned long end) +void __flush_hash_table_range(unsigned long start, unsigned long end) { - bool is_thp; int hugepage_shift; unsigned long flags; - start = _ALIGN_DOWN(start, PAGE_SIZE); - end = _ALIGN_UP(end, PAGE_SIZE); + start = ALIGN_DOWN(start, PAGE_SIZE); + end = ALIGN(end, PAGE_SIZE); - BUG_ON(!mm->pgd); /* * Note: Normally, we should only ever use a batch within a @@ -212,21 +207,15 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, - &hugepage_shift); + pte_t *ptep = find_init_mm_pte(start, &hugepage_shift); unsigned long pte; if (ptep == NULL) continue; pte = pte_val(*ptep); - if (is_thp) - trace_hugepage_invalidate(start, pte); if (!(pte & H_PAGE_HASHPTE)) continue; - if (unlikely(is_thp)) - hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); - else - hpte_need_flush(mm, start, ptep, pte, hugepage_shift); + hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift); } arch_leave_lazy_mmu_mode(); local_irq_restore(flags); @@ -238,7 +227,7 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) pte_t *start_pte; unsigned long flags; - addr = _ALIGN_DOWN(addr, PMD_SIZE); + addr = ALIGN_DOWN(addr, PMD_SIZE); /* * Note: Normally, we should only ever use a batch within a * PTE locked section. This violates the rule, but will work diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 8ed2411c3f39..c663e7ba801f 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -35,10 +35,10 @@ #include <linux/pkeys.h> #include <linux/hugetlb.h> #include <linux/cpu.h> +#include <linux/pgtable.h> #include <asm/debugfs.h> #include <asm/processor.h> -#include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/page.h> @@ -66,6 +66,9 @@ #include <mm/mmu_decl.h> +#include "internal.h" + + #ifdef DEBUG #define DBG(fmt...) 
udbg_printf(fmt) #else @@ -593,7 +596,7 @@ static void __init htab_scan_page_sizes(void) } #ifdef CONFIG_HUGETLB_PAGE - if (!hugetlb_disabled) { + if (!hugetlb_disabled && !early_radix_enabled() ) { /* Reserve 16G huge page memory sections for huge pages */ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); } @@ -660,11 +663,10 @@ static void __init htab_init_page_sizes(void) * Pick a size for the linear mapping. Currently, we only * support 16M, 1M and 4K which is the default */ - if (IS_ENABLED(STRICT_KERNEL_RWX) && + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && (unsigned long)_stext % 0x1000000) { if (mmu_psize_defs[MMU_PAGE_16M].shift) - pr_warn("Kernel not 16M aligned, " - "disabling 16M linear map alignment"); + pr_warn("Kernel not 16M aligned, disabling 16M linear map alignment\n"); aligned = false; } @@ -785,7 +787,7 @@ static unsigned long __init htab_get_table_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int resize_hpt_for_hotplug(unsigned long new_mem_size) +static int resize_hpt_for_hotplug(unsigned long new_mem_size) { unsigned target_hpt_shift; @@ -819,6 +821,8 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, return -1; } + resize_hpt_for_hotplug(memblock_phys_mem_size()); + rc = htab_bolt_mapping(start, end, __pa(start), pgprot_val(prot), mmu_linear_psize, mmu_kernel_ssize); @@ -836,6 +840,10 @@ int hash__remove_section_mapping(unsigned long start, unsigned long end) int rc = htab_remove_mapping(start, end, mmu_linear_psize, mmu_kernel_ssize); WARN_ON(rc < 0); + + if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) + pr_warn("Hash collision while resizing HPT\n"); + return rc; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -870,6 +878,9 @@ static void __init htab_initialize(void) printk(KERN_INFO "Using 1TB segments\n"); } + if (stress_slb_enabled) + static_branch_enable(&stress_slb_key); + /* * Calculate the required size of the htab. We want the number of * PTEGs to equal one half the number of real pages. @@ -1105,6 +1116,11 @@ void hash__early_init_mmu_secondary(void) if (cpu_has_feature(CPU_FTR_ARCH_206) && cpu_has_feature(CPU_FTR_HVMODE)) tlbiel_all(); + +#ifdef CONFIG_PPC_MEM_KEYS + if (mmu_has_feature(MMU_FTR_PKEY)) + mtspr(SPRN_UAMOR, default_uamor); +#endif } #endif /* CONFIG_SMP */ @@ -1350,8 +1366,15 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, goto bail; } - /* Add _PAGE_PRESENT to the required access perm */ - access |= _PAGE_PRESENT; + /* + * Add _PAGE_PRESENT to the required access perm. If there are parallel + * updates to the pte that can possibly clear _PAGE_PTE, catch that too. + * + * We can safely use the return pte address in rest of the function + * because we do set H_PAGE_BUSY which prevents further updates to pte + * from generic code. + */ + access |= _PAGE_PRESENT | _PAGE_PTE; /* * Pre-check access permissions (will be re-checked atomically @@ -1539,16 +1562,14 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) } #endif -static void hash_preload(struct mm_struct *mm, unsigned long ea, +static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, bool is_exec, unsigned long trap) { - int hugepage_shift; unsigned long vsid; pgd_t *pgdir; - pte_t *ptep; - unsigned long flags; int rc, ssize, update_flags = 0; unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? 
_PAGE_EXEC : 0); + unsigned long flags; BUG_ON(get_region_id(ea) != USER_REGION_ID); @@ -1568,32 +1589,42 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea, vsid = get_user_vsid(&mm->context, ea, ssize); if (!vsid) return; - /* - * Hash doesn't like irqs. Walking linux page table with irq disabled - * saves us from holding multiple locks. - */ - local_irq_save(flags); - /* - * THP pages use update_mmu_cache_pmd. We don't do - * hash preload there. Hence can ignore THP here - */ - ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); - if (!ptep) - goto out_exit; - - WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take * care of it once we actually try to access the page. * That way we don't have to duplicate all of the logic for segment * page size demotion here + * Called with PTL held, hence can be sure the value won't change in + * between. */ if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) - goto out_exit; + return; #endif /* CONFIG_PPC_64K_PAGES */ + /* + * __hash_page_* must run with interrupts off, as it sets the + * H_PAGE_BUSY bit. It's possible for perf interrupts to hit at any + * time and may take a hash fault reading the user stack, see + * read_user_stack_slow() in the powerpc/perf code. + * + * If that takes a hash fault on the same page as we lock here, it + * will bail out when seeing H_PAGE_BUSY set, and retry the access + * leading to an infinite loop. + * + * Disabling interrupts here does not prevent perf interrupts, but it + * will prevent them taking hash faults (see the NMI test in + * do_hash_page), then read_user_stack's copy_from_user_nofault will + * fail and perf will fall back to read_user_stack_slow(), which + * walks the Linux page tables. + * + * Interrupts must also be off for the duration of the + * mm_is_thread_local test and update, to prevent preempt running the + * mm on another CPU (XXX: this may be racy vs kthread_use_mm). + */ + local_irq_save(flags); + /* Is that local to this CPU ? */ if (mm_is_thread_local(mm)) update_flags |= HPTE_LOCAL_UPDATE; @@ -1616,7 +1647,7 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea, mm_ctx_user_psize(&mm->context), mm_ctx_user_psize(&mm->context), pte_val(*ptep)); -out_exit: + local_irq_restore(flags); } @@ -1638,10 +1669,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, unsigned long trap; bool is_exec; - if (radix_enabled()) { - prefetch((void *)address); + if (radix_enabled()) return; - } /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) @@ -1668,33 +1697,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, return; } - hash_preload(vma->vm_mm, address, is_exec, trap); + hash_preload(vma->vm_mm, ptep, address, is_exec, trap); } -#ifdef CONFIG_PPC_MEM_KEYS -/* - * Return the protection key associated with the given address and the - * mm_struct. 
- */ -u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) -{ - pte_t *ptep; - u16 pkey = 0; - unsigned long flags; - - if (!mm || !mm->pgd) - return 0; - - local_irq_save(flags); - ptep = find_linux_pte(mm->pgd, address, NULL, NULL); - if (ptep) - pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep))); - local_irq_restore(flags); - - return pkey; -} -#endif /* CONFIG_PPC_MEM_KEYS */ - #ifdef CONFIG_PPC_TRANSACTIONAL_MEM static inline void tm_flush_hash_page(int local) { @@ -1736,10 +1741,6 @@ unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, return gslot; } -/* - * WARNING: This is called from hash_low_64.S, if you change this prototype, - * do not forget to update the assembly call site ! - */ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, unsigned long flags) { diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h new file mode 100644 index 000000000000..7eda0d30d765 --- /dev/null +++ b/arch/powerpc/mm/book3s64/internal.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H +#define ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H + +#include <linux/jump_label.h> + +extern bool stress_slb_enabled; + +DECLARE_STATIC_KEY_FALSE(stress_slb_key); + +static inline bool stress_slb(void) +{ + return static_branch_unlikely(&stress_slb_key); +} + +#endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */ diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index fa05bbd1f682..563faa10bb66 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -96,7 +96,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, goto unlock_exit; } - down_read(&mm->mmap_sem); + mmap_read_lock(mm); chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / sizeof(struct vm_area_struct *); chunk = min(chunk, entries); @@ -114,7 +114,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, pinned += ret; break; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (pinned != entries) { if (!ret) ret = -EFAULT; diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index e0bb69c616e4..e18ae50a275c 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -15,6 +15,7 @@ #include <asm/powernv.h> #include <asm/firmware.h> #include <asm/ultravisor.h> +#include <asm/kexec.h> #include <mm/mmu_decl.h> #include <trace/events/thp.h> @@ -109,15 +110,25 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return __pmd(old_pmd); +} + +pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp, int full) +{ + pmd_t pmd; + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp)) || !pmd_present(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - * - * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires - * a special case check in pmd_access_permitted. + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. 
+ * Make sure we flush the tlb in this case. */ - serialize_against_pte_lookup(vma->vm_mm); - return __pmd(old_pmd); + if (!full) + flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + return pmd; } static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) @@ -146,19 +157,6 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) pmdv &= _HPAGE_CHG_MASK; return pmd_set_protbits(__pmd(pmdv), newprot); } - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. - */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - if (radix_enabled()) - prefetch((void *)addr); -} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* For use by kexec */ @@ -168,6 +166,8 @@ void mmu_cleanup_all(void) radix__mmu_cleanup_all(); else if (mmu_hash_ops.hpte_clear_all) mmu_hash_ops.hpte_clear_all(); + + reset_sprs(); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -342,6 +342,9 @@ void pmd_fragment_free(unsigned long *pmd) { struct page *page = virt_to_page(pmd); + if (PageReserved(page)) + return free_reserved_page(page); + BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&page->pt_frag_refcount)) { pgtable_pmd_page_dtor(page); @@ -359,7 +362,7 @@ static inline void pgtable_free(void *table, int index) pmd_fragment_free(table); break; case PUD_INDEX: - kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table); + __pud_free(table); break; #if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) /* 16M hugepd directory at pud level */ diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index 1199fc2bfaec..b1d091a97611 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -10,58 +10,97 @@ #include <asm/mmu.h> #include <asm/setup.h> #include <linux/pkeys.h> -#include <linux/of_device.h> +#include <linux/of_fdt.h> -DEFINE_STATIC_KEY_TRUE(pkey_disabled); -int pkeys_total; /* Total pkeys as per device tree */ -u32 initial_allocation_mask; /* Bits set for the initially allocated keys */ -u32 reserved_allocation_mask; /* Bits set for reserved keys */ -static bool pkey_execute_disable_supported; -static bool pkeys_devtree_defined; /* property exported by device tree */ -static u64 pkey_amr_mask; /* Bits in AMR not to be touched */ -static u64 pkey_iamr_mask; /* Bits in AMR not to be touched */ -static u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */ +int num_pkey; /* Max number of pkeys supported */ +/* + * Keys marked in the reservation list cannot be allocated by userspace + */ +u32 reserved_allocation_mask __ro_after_init; + +/* Bits set for the initially allocated keys */ +static u32 initial_allocation_mask __ro_after_init; + +/* + * Even if we allocate keys with sys_pkey_alloc(), we need to make sure + * other thread still find the access denied using the same keys. + */ +static u64 default_amr = ~0x0UL; +static u64 default_iamr = 0x5555555555555555UL; +u64 default_uamor __ro_after_init; +/* + * Key used to implement PROT_EXEC mmap. Denies READ/WRITE + * We pick key 2 because 0 is special key and 1 is reserved as per ISA. 
+ */ static int execute_only_key = 2; +static bool pkey_execute_disable_supported; + #define AMR_BITS_PER_PKEY 2 #define AMR_RD_BIT 0x1UL #define AMR_WR_BIT 0x2UL #define IAMR_EX_BIT 0x1UL -#define PKEY_REG_BITS (sizeof(u64)*8) +#define PKEY_REG_BITS (sizeof(u64) * 8) #define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY)) -static void scan_pkey_feature(void) +static int __init dt_scan_storage_keys(unsigned long node, + const char *uname, int depth, + void *data) { - u32 vals[2]; - struct device_node *cpu; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int *pkeys_total = (int *) data; - cpu = of_find_node_by_type(NULL, "cpu"); - if (!cpu) - return; + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; - if (of_property_read_u32_array(cpu, - "ibm,processor-storage-keys", vals, 2)) - return; + prop = of_get_flat_dt_prop(node, "ibm,processor-storage-keys", NULL); + if (!prop) + return 0; + *pkeys_total = be32_to_cpu(prop[0]); + return 1; +} + +static int scan_pkey_feature(void) +{ + int ret; + int pkeys_total = 0; /* - * Since any pkey can be used for data or execute, we will just treat - * all keys as equal and track them as one entity. + * Pkey is not supported with Radix translation. */ - pkeys_total = vals[0]; - pkeys_devtree_defined = true; -} + if (early_radix_enabled()) + return 0; -static inline bool pkey_mmu_enabled(void) -{ - if (firmware_has_feature(FW_FEATURE_LPAR)) - return pkeys_total; - else - return cpu_has_feature(CPU_FTR_PKEY); + ret = of_scan_flat_dt(dt_scan_storage_keys, &pkeys_total); + if (ret == 0) { + /* + * Let's assume 32 pkeys on P8/P9 bare metal, if its not defined by device + * tree. We make this exception since some version of skiboot forgot to + * expose this property on power8/9. + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + unsigned long pvr = mfspr(SPRN_PVR); + + if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E || + PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9) + pkeys_total = 32; + } + } + + /* + * Adjust the upper limit, based on the number of bits supported by + * arch-neutral code. + */ + pkeys_total = min_t(int, pkeys_total, + ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + 1)); + return pkeys_total; } -static int pkey_initialize(void) +void __init pkey_early_init_devtree(void) { - int os_reserved, i; + int pkeys_total, i; /* * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral @@ -79,32 +118,21 @@ static int pkey_initialize(void) __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) != (sizeof(u64) * BITS_PER_BYTE)); - /* scan the device tree for pkey feature */ - scan_pkey_feature(); - /* - * Let's assume 32 pkeys on P8 bare metal, if its not defined by device - * tree. We make this exception since skiboot forgot to expose this - * property on power8. + * Only P7 and above supports SPRN_AMR update with MSR[PR] = 1 */ - if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) && - cpu_has_feature(CPU_FTRS_POWER8)) - pkeys_total = 32; + if (!early_cpu_has_feature(CPU_FTR_ARCH_206)) + return; - /* - * Adjust the upper limit, based on the number of bits supported by - * arch-neutral code. 
- */ - pkeys_total = min_t(int, pkeys_total, - ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1)); + /* scan the device tree for pkey feature */ + pkeys_total = scan_pkey_feature(); + if (!pkeys_total) + goto out; - if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total) - static_branch_enable(&pkey_disabled); - else - static_branch_disable(&pkey_disabled); + /* Allow all keys to be modified by default */ + default_uamor = ~0x0UL; - if (static_branch_likely(&pkey_disabled)) - return 0; + cur_cpu_spec->mmu_features |= MMU_FTR_PKEY; /* * The device tree cannot be relied to indicate support for @@ -118,53 +146,86 @@ static int pkey_initialize(void) #ifdef CONFIG_PPC_4K_PAGES /* * The OS can manage only 8 pkeys due to its inability to represent them - * in the Linux 4K PTE. + * in the Linux 4K PTE. Mark all other keys reserved. */ - os_reserved = pkeys_total - 8; + num_pkey = min(8, pkeys_total); #else - os_reserved = 0; + num_pkey = pkeys_total; #endif - /* Bits are in LE format. */ - reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key); - - /* register mask is in BE format */ - pkey_amr_mask = ~0x0ul; - pkey_amr_mask &= ~(0x3ul << pkeyshift(0)); - - pkey_iamr_mask = ~0x0ul; - pkey_iamr_mask &= ~(0x3ul << pkeyshift(0)); - pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key)); - - pkey_uamor_mask = ~0x0ul; - pkey_uamor_mask &= ~(0x3ul << pkeyshift(0)); - pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key)); - - /* mark the rest of the keys as reserved and hence unavailable */ - for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) { - reserved_allocation_mask |= (0x1 << i); - pkey_uamor_mask &= ~(0x3ul << pkeyshift(i)); - } - initial_allocation_mask = reserved_allocation_mask | (0x1 << 0); - if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) { + if (unlikely(num_pkey <= execute_only_key) || !pkey_execute_disable_supported) { /* * Insufficient number of keys to support * execute only key. Mark it unavailable. - * Any AMR, UAMOR, IAMR bit set for - * this key is irrelevant since this key - * can never be allocated. */ execute_only_key = -1; + } else { + /* + * Mark the execute_only_pkey as not available for + * user allocation via pkey_alloc. + */ + reserved_allocation_mask |= (0x1 << execute_only_key); + + /* + * Deny READ/WRITE for execute_only_key. + * Allow execute in IAMR. + */ + default_amr |= (0x3ul << pkeyshift(execute_only_key)); + default_iamr &= ~(0x1ul << pkeyshift(execute_only_key)); + + /* + * Clear the uamor bits for this key. + */ + default_uamor &= ~(0x3ul << pkeyshift(execute_only_key)); } - return 0; -} + /* + * Allow access for only key 0. And prevent any other modification. + */ + default_amr &= ~(0x3ul << pkeyshift(0)); + default_iamr &= ~(0x1ul << pkeyshift(0)); + default_uamor &= ~(0x3ul << pkeyshift(0)); + /* + * key 0 is special in that we want to consider it an allocated + * key which is preallocated. We don't allow changing AMR bits + * w.r.t key 0. But one can pkey_free(key0) + */ + initial_allocation_mask |= (0x1 << 0); + + /* + * key 1 is recommended not to be used. PowerISA(3.0) page 1015, + * programming note. + */ + reserved_allocation_mask |= (0x1 << 1); + default_uamor &= ~(0x3ul << pkeyshift(1)); -arch_initcall(pkey_initialize); + /* + * Prevent the usage of OS reserved keys. Update UAMOR + * for those keys. Also mark the rest of the bits in the + * 32 bit mask as reserved. 
+ */ + for (i = num_pkey; i < 32 ; i++) { + reserved_allocation_mask |= (0x1 << i); + default_uamor &= ~(0x3ul << pkeyshift(i)); + } + /* + * Prevent the allocation of reserved keys too. + */ + initial_allocation_mask |= reserved_allocation_mask; + + pr_info("Enabling pkeys with max key count %d\n", num_pkey); +out: + /* + * Setup uamor on boot cpu + */ + mtspr(SPRN_UAMOR, default_uamor); + + return; +} void pkey_mm_init(struct mm_struct *mm) { - if (static_branch_likely(&pkey_disabled)) + if (!mmu_has_feature(MMU_FTR_PKEY)) return; mm_pkey_allocation_map(mm) = initial_allocation_mask; mm->context.execute_only_pkey = execute_only_key; @@ -196,30 +257,6 @@ static inline void write_iamr(u64 value) mtspr(SPRN_IAMR, value); } -static inline u64 read_uamor(void) -{ - return mfspr(SPRN_UAMOR); -} - -static inline void write_uamor(u64 value) -{ - mtspr(SPRN_UAMOR, value); -} - -static bool is_pkey_enabled(int pkey) -{ - u64 uamor = read_uamor(); - u64 pkey_bits = 0x3ul << pkeyshift(pkey); - u64 uamor_pkey_bits = (uamor & pkey_bits); - - /* - * Both the bits in UAMOR corresponding to the key should be set or - * reset. - */ - WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits)); - return !!(uamor_pkey_bits); -} - static inline void init_amr(int pkey, u8 init_bits) { u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey)); @@ -245,8 +282,18 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, { u64 new_amr_bits = 0x0ul; u64 new_iamr_bits = 0x0ul; + u64 pkey_bits, uamor_pkey_bits; + + /* + * Check whether the key is disabled by UAMOR. + */ + pkey_bits = 0x3ul << pkeyshift(pkey); + uamor_pkey_bits = (default_uamor & pkey_bits); - if (!is_pkey_enabled(pkey)) + /* + * Both the bits in UAMOR corresponding to the key should be set + */ + if (uamor_pkey_bits != pkey_bits) return -EINVAL; if (init_val & PKEY_DISABLE_EXECUTE) { @@ -268,7 +315,7 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, void thread_pkey_regs_save(struct thread_struct *thread) { - if (static_branch_likely(&pkey_disabled)) + if (!mmu_has_feature(MMU_FTR_PKEY)) return; /* @@ -276,38 +323,33 @@ void thread_pkey_regs_save(struct thread_struct *thread) */ thread->amr = read_amr(); thread->iamr = read_iamr(); - thread->uamor = read_uamor(); } void thread_pkey_regs_restore(struct thread_struct *new_thread, struct thread_struct *old_thread) { - if (static_branch_likely(&pkey_disabled)) + if (!mmu_has_feature(MMU_FTR_PKEY)) return; if (old_thread->amr != new_thread->amr) write_amr(new_thread->amr); if (old_thread->iamr != new_thread->iamr) write_iamr(new_thread->iamr); - if (old_thread->uamor != new_thread->uamor) - write_uamor(new_thread->uamor); } void thread_pkey_regs_init(struct thread_struct *thread) { - if (static_branch_likely(&pkey_disabled)) + if (!mmu_has_feature(MMU_FTR_PKEY)) return; - thread->amr = pkey_amr_mask; - thread->iamr = pkey_iamr_mask; - thread->uamor = pkey_uamor_mask; + thread->amr = default_amr; + thread->iamr = default_iamr; - write_uamor(pkey_uamor_mask); - write_amr(pkey_amr_mask); - write_iamr(pkey_iamr_mask); + write_amr(default_amr); + write_iamr(default_iamr); } -int __execute_only_pkey(struct mm_struct *mm) +int execute_only_pkey(struct mm_struct *mm) { return mm->context.execute_only_pkey; } @@ -353,21 +395,20 @@ static bool pkey_access_permitted(int pkey, bool write, bool execute) int pkey_shift; u64 amr; - if (!is_pkey_enabled(pkey)) - return true; - pkey_shift = pkeyshift(pkey); - if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift))) - return true; 
+ if (execute) + return !(read_iamr() & (IAMR_EX_BIT << pkey_shift)); + + amr = read_amr(); + if (write) + return !(amr & (AMR_WR_BIT << pkey_shift)); - amr = read_amr(); /* Delay reading amr until absolutely needed */ - return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) || - (write && !(amr & (AMR_WR_BIT << pkey_shift)))); + return !(amr & (AMR_RD_BIT << pkey_shift)); } bool arch_pte_access_permitted(u64 pte, bool write, bool execute) { - if (static_branch_likely(&pkey_disabled)) + if (!mmu_has_feature(MMU_FTR_PKEY)) return true; return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute); @@ -384,7 +425,7 @@ bool arch_pte_access_permitted(u64 pte, bool write, bool execute) bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign) { - if (static_branch_likely(&pkey_disabled)) + if (!mmu_has_feature(MMU_FTR_PKEY)) return true; /* * Do not enforce our key-permissions on a foreign vma. @@ -397,7 +438,7 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) { - if (static_branch_likely(&pkey_disabled)) + if (!mmu_has_feature(MMU_FTR_PKEY)) return; /* Duplicate the oldmm pkey state in mm: */ diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index cab06331c0c0..cb91071eef52 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -2,8 +2,6 @@ #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/security.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/machdep.h> #include <asm/mman.h> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 8f9edf07063a..d5f0c10d752a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -15,9 +15,8 @@ #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/string_helpers.h> -#include <linux/stop_machine.h> +#include <linux/memory.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/mmu_context.h> #include <asm/dma.h> @@ -35,6 +34,7 @@ unsigned int mmu_pid_bits; unsigned int mmu_base_pid; +unsigned int radix_mem_block_size __ro_after_init; static __ref void *early_alloc_pgtable(unsigned long size, int nid, unsigned long region_start, unsigned long region_end) @@ -57,6 +57,13 @@ static __ref void *early_alloc_pgtable(unsigned long size, int nid, return ptr; } +/* + * When allocating pud or pmd pointers, we allocate a complete page + * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This + * is to ensure that the page obtained from the memblock allocator + * can be completely used as page table page and can be freed + * correctly when the page table entries are removed. 
+ */ static int early_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t flags, unsigned int map_page_size, @@ -65,24 +72,26 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa, { unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; pgdp = pgd_offset_k(ea); - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, - region_start, region_end); - pgd_populate(&init_mm, pgdp, pudp); + p4dp = p4d_offset(pgdp, ea); + if (p4d_none(*p4dp)) { + pudp = early_alloc_pgtable(PAGE_SIZE, nid, + region_start, region_end); + p4d_populate(&init_mm, p4dp, pudp); } - pudp = pud_offset(pgdp, ea); + pudp = pud_offset(p4dp, ea); if (map_page_size == PUD_SIZE) { ptep = (pte_t *)pudp; goto set_the_pte; } if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, - region_start, region_end); + pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start, + region_end); pud_populate(&init_mm, pudp, pmdp); } pmdp = pmd_offset(pudp, ea); @@ -115,6 +124,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, { unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -137,7 +147,8 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, * boot. */ pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; if (map_page_size == PUD_SIZE) { @@ -174,6 +185,7 @@ void radix__change_memory_range(unsigned long start, unsigned long end, { unsigned long idx; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -186,7 +198,8 @@ void radix__change_memory_range(unsigned long start, unsigned long end, for (idx = start; idx < end; idx += PAGE_SIZE) { pgdp = pgd_offset_k(idx); - pudp = pud_alloc(&init_mm, pgdp, idx); + p4dp = p4d_offset(pgdp, idx); + pudp = pud_alloc(&init_mm, p4dp, idx); if (!pudp) continue; if (pud_is_leaf(*pudp)) { @@ -254,6 +267,7 @@ static unsigned long next_boundary(unsigned long addr, unsigned long end) static int __meminit create_physical_mapping(unsigned long start, unsigned long end, + unsigned long max_mapping_size, int nid, pgprot_t _prot) { unsigned long vaddr, addr, mapping_size = 0; @@ -261,12 +275,14 @@ static int __meminit create_physical_mapping(unsigned long start, pgprot_t prot; int psize; - start = _ALIGN_UP(start, PAGE_SIZE); + start = ALIGN(start, PAGE_SIZE); for (addr = start; addr < end; addr += mapping_size) { unsigned long gap, previous_size; int rc; gap = next_boundary(addr, end) - addr; + if (gap > max_mapping_size) + gap = max_mapping_size; previous_size = mapping_size; prev_exec = exec; @@ -317,8 +333,9 @@ static void __init radix_init_pgtable(void) /* We don't support slb for radix */ mmu_slb_size = 0; + /* - * Create the linear mapping, using standard page size for now + * Create the linear mapping */ for_each_memblock(memory, reg) { /* @@ -334,6 +351,7 @@ static void __init radix_init_pgtable(void) WARN_ON(create_physical_mapping(reg->base, reg->base + reg->size, + radix_mem_block_size, -1, PAGE_KERNEL)); } @@ -474,6 +492,57 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, return 1; } +#ifdef CONFIG_MEMORY_HOTPLUG +static int __init probe_memory_block_size(unsigned long node, const char *uname, int + depth, void *data) +{ + unsigned long *mem_block_size = (unsigned long *)data; + const __be64 *prop; + int len; + + if (depth != 1) + return 0; + + if (strcmp(uname, 
"ibm,dynamic-reconfiguration-memory")) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len); + if (!prop || len < sizeof(__be64)) + /* + * Nothing in the device tree + */ + *mem_block_size = MIN_MEMORY_BLOCK_SIZE; + else + *mem_block_size = be64_to_cpup(prop); + return 1; +} + +static unsigned long radix_memory_block_size(void) +{ + unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE; + + /* + * OPAL firmware feature is set by now. Hence we are ok + * to test OPAL feature. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) + mem_block_size = 1UL * 1024 * 1024 * 1024; + else + of_scan_flat_dt(probe_memory_block_size, &mem_block_size); + + return mem_block_size; +} + +#else /* CONFIG_MEMORY_HOTPLUG */ + +static unsigned long radix_memory_block_size(void) +{ + return 1UL * 1024 * 1024 * 1024; +} + +#endif /* CONFIG_MEMORY_HOTPLUG */ + + void __init radix__early_init_devtree(void) { int rc; @@ -482,17 +551,27 @@ void __init radix__early_init_devtree(void) * Try to find the available page sizes in the device-tree */ rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); - if (rc != 0) /* Found */ - goto found; + if (!rc) { + /* + * No page size details found in device tree. + * Let's assume we have page 4k and 64k support + */ + mmu_psize_defs[MMU_PAGE_4K].shift = 12; + mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; + + mmu_psize_defs[MMU_PAGE_64K].shift = 16; + mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; + } + /* - * let's assume we have page 4k and 64k support + * Max mapping size used when mapping pages. We don't use + * ppc_md.memory_block_size() here because this get called + * early and we don't have machine probe called yet. Also + * the pseries implementation only check for ibm,lmb-size. + * All hypervisor supporting radix do expose that device + * tree node. */ - mmu_psize_defs[MMU_PAGE_4K].shift = 12; - mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; - - mmu_psize_defs[MMU_PAGE_64K].shift = 16; - mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; -found: + radix_mem_block_size = radix_memory_block_size(); return; } @@ -514,8 +593,10 @@ void setup_kuep(bool disabled) if (disabled || !early_radix_enabled()) return; - if (smp_processor_id() == boot_cpuid) + if (smp_processor_id() == boot_cpuid) { pr_info("Activating Kernel Userspace Execution Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_KUEP; + } /* * Radix always uses key0 of the IAMR to determine if an access is @@ -539,6 +620,10 @@ void setup_kuap(bool disabled) /* Make sure userspace can't change the AMR */ mtspr(SPRN_UAMOR, 0); + + /* + * Set the default kernel AMR values on all cpus. + */ mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); isync(); } @@ -649,21 +734,6 @@ void radix__mmu_cleanup_all(void) } } -void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* - * We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* - * Radix mode is not limited by RMA / VRMA addressing. 
- */ - ppc64_rma_size = ULONG_MAX; -} - #ifdef CONFIG_MEMORY_HOTPLUG static void free_pte_table(pte_t *pte_start, pmd_t *pmd) { @@ -695,30 +765,19 @@ static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) pud_clear(pud); } -struct change_mapping_params { - pte_t *pte; - unsigned long start; - unsigned long end; - unsigned long aligned_start; - unsigned long aligned_end; -}; - -static int __meminit stop_machine_change_mapping(void *data) +static void free_pud_table(pud_t *pud_start, p4d_t *p4d) { - struct change_mapping_params *params = - (struct change_mapping_params *)data; + pud_t *pud; + int i; - if (!data) - return -1; + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } - spin_unlock(&init_mm.page_table_lock); - pte_clear(&init_mm, params->aligned_start, params->pte); - create_physical_mapping(__pa(params->aligned_start), - __pa(params->start), -1, PAGE_KERNEL); - create_physical_mapping(__pa(params->end), __pa(params->aligned_end), - -1, PAGE_KERNEL); - spin_lock(&init_mm.page_table_lock); - return 0; + pud_free(&init_mm, pud_start); + p4d_clear(p4d); } static void remove_pte_table(pte_t *pte_start, unsigned long addr, @@ -749,53 +808,7 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr, } } -/* - * clear the pte and potentially split the mapping helper - */ -static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end, - unsigned long size, pte_t *pte) -{ - unsigned long mask = ~(size - 1); - unsigned long aligned_start = addr & mask; - unsigned long aligned_end = addr + size; - struct change_mapping_params params; - bool split_region = false; - - if ((end - addr) < size) { - /* - * We're going to clear the PTE, but not flushed - * the mapping, time to remap and flush. The - * effects if visible outside the processor or - * if we are running in code close to the - * mapping we cleared, we are in trouble. 
- */ - if (overlaps_kernel_text(aligned_start, addr) || - overlaps_kernel_text(end, aligned_end)) { - /* - * Hack, just return, don't pte_clear - */ - WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel " - "text, not splitting\n", addr, end); - return; - } - split_region = true; - } - - if (split_region) { - params.pte = pte; - params.start = addr; - params.end = end; - params.aligned_start = addr & ~(size - 1); - params.aligned_end = min_t(unsigned long, aligned_end, - (unsigned long)__va(memblock_end_of_DRAM())); - stop_machine(stop_machine_change_mapping, ¶ms, NULL); - return; - } - - pte_clear(&init_mm, addr, pte); -} - -static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, +static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end) { unsigned long next; @@ -810,7 +823,12 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, continue; if (pmd_is_leaf(*pmd)) { - split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd); + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) { + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + pte_clear(&init_mm, addr, (pte_t *)pmd); continue; } @@ -820,7 +838,7 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, } } -static void remove_pud_table(pud_t *pud_start, unsigned long addr, +static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end) { unsigned long next; @@ -835,7 +853,12 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr, continue; if (pud_is_leaf(*pud)) { - split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud); + if (!IS_ALIGNED(addr, PUD_SIZE) || + !IS_ALIGNED(next, PUD_SIZE)) { + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + pte_clear(&init_mm, addr, (pte_t *)pud); continue; } @@ -850,6 +873,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) unsigned long addr, next; pud_t *pud_base; pgd_t *pgd; + p4d_t *p4d; spin_lock(&init_mm.page_table_lock); @@ -857,16 +881,24 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); - if (!pgd_present(*pgd)) + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) continue; - if (pgd_is_leaf(*pgd)) { - split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); + if (p4d_is_leaf(*p4d)) { + if (!IS_ALIGNED(addr, P4D_SIZE) || + !IS_ALIGNED(next, P4D_SIZE)) { + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + + pte_clear(&init_mm, addr, (pte_t *)pgd); continue; } - pud_base = (pud_t *)pgd_page_vaddr(*pgd); + pud_base = (pud_t *)p4d_page_vaddr(*p4d); remove_pud_table(pud_base, addr, next); + free_pud_table(pud_base, p4d); } spin_unlock(&init_mm.page_table_lock); @@ -882,7 +914,8 @@ int __meminit radix__create_section_mapping(unsigned long start, return -1; } - return create_physical_mapping(__pa(start), __pa(end), nid, prot); + return create_physical_mapping(__pa(start), __pa(end), + radix_mem_block_size, nid, prot); } int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) @@ -962,7 +995,13 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre pmd = *pmdp; pmd_clear(pmdp); - /*FIXME!! Verify whether we need this kick below */ + /* + * pmdp collapse_flush need to ensure that there are no parallel gup + * walk after this call. This is needed so that we can have stable + * page ref count when collapsing a page. 
We don't allow a collapse page + * if we have gup taken on the page. We can ensure that by sending IPI + * because gup walk happens with IRQ disabled. + */ serialize_against_pte_lookup(vma->vm_mm); radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); @@ -1023,17 +1062,6 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); old_pmd = __pmd(old); - /* - * Serialize against find_current_mm_pte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. - */ - serialize_against_pte_lookup(mm); return old_pmd; } diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 758ade2c2b6e..0d233763441f 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -16,6 +16,7 @@ #include <asm/tlbflush.h> #include <asm/trace.h> #include <asm/cputhreads.h> +#include <asm/plpar_wrappers.h> #define RIC_FLUSH_TLB 0 #define RIC_FLUSH_PWC 1 @@ -694,7 +695,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm) goto local; } - if (cputlb_use_tlbie()) { + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, + H_RPTI_PAGE_ALL, 0, -1UL); + } else if (cputlb_use_tlbie()) { if (mm_needs_flush_escalation(mm)) _tlbie_pid(pid, RIC_FLUSH_ALL); else @@ -727,7 +735,16 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) goto local; } } - if (cputlb_use_tlbie()) + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | + H_RPTI_TYPE_PRT; + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, + H_RPTI_PAGE_ALL, 0, -1UL); + } else if (cputlb_use_tlbie()) _tlbie_pid(pid, RIC_FLUSH_ALL); else _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); @@ -760,7 +777,19 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, exit_flush_lazy_tlbs(mm); goto local; } - if (cputlb_use_tlbie()) + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt, pg_sizes, size; + + tgt = H_RPTI_TARGET_CMMU; + pg_sizes = psize_to_rpti_pgsize(psize); + size = 1UL << mmu_psize_to_shift(psize); + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, + pg_sizes, vmaddr, + vmaddr + size); + } else if (cputlb_use_tlbie()) _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); else _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB); @@ -810,7 +839,14 @@ static inline void _tlbiel_kernel_broadcast(void) */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) { - if (cputlb_use_tlbie()) + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt = H_RPTI_TARGET_CMMU | H_RPTI_TARGET_NMMU; + unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | + H_RPTI_TYPE_PRT; + + pseries_rpt_invalidate(0, tgt, type, H_RPTI_PAGE_ALL, + start, end); + } else if (cputlb_use_tlbie()) _tlbie_pid(0, RIC_FLUSH_ALL); else 
_tlbiel_kernel_broadcast(); @@ -864,7 +900,17 @@ is_local: nr_pages > tlb_local_single_page_flush_ceiling); } - if (full) { + if (!mmu_has_feature(MMU_FTR_GTSE) && !local) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize); + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + pg_sizes |= psize_to_rpti_pgsize(MMU_PAGE_2M); + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, pg_sizes, + start, end); + } else if (full) { if (local) { _tlbiel_pid(pid, RIC_FLUSH_TLB); } else { @@ -884,9 +930,7 @@ is_local: if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { hstart = (start + PMD_SIZE - 1) & PMD_MASK; hend = end & PMD_MASK; - if (hstart == hend) - hflush = false; - else + if (hstart < hend) hflush = true; } @@ -1048,7 +1092,17 @@ is_local: nr_pages > tlb_local_single_page_flush_ceiling); } - if (full) { + if (!mmu_has_feature(MMU_FTR_GTSE) && !local) { + unsigned long tgt = H_RPTI_TARGET_CMMU; + unsigned long type = H_RPTI_TYPE_TLB; + unsigned long pg_sizes = psize_to_rpti_pgsize(psize); + + if (also_pwc) + type |= H_RPTI_TYPE_PWC; + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end); + } else if (full) { if (local) { _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); } else { @@ -1113,7 +1167,19 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) exit_flush_lazy_tlbs(mm); goto local; } - if (cputlb_use_tlbie()) + if (!mmu_has_feature(MMU_FTR_GTSE)) { + unsigned long tgt, type, pg_sizes; + + tgt = H_RPTI_TARGET_CMMU; + type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | + H_RPTI_TYPE_PRT; + pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize); + + if (atomic_read(&mm->context.copros) > 0) + tgt |= H_RPTI_TARGET_NMMU; + pseries_rpt_invalidate(pid, tgt, type, pg_sizes, + addr, end); + } else if (cputlb_use_tlbie()) _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); else _tlbiel_va_range_multicast(mm, diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index 716204aee3da..156c38f89511 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -10,7 +10,6 @@ */ #include <asm/asm-prototypes.h> -#include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/paca.h> @@ -21,10 +20,14 @@ #include <linux/compiler.h> #include <linux/context_tracking.h> #include <linux/mm_types.h> +#include <linux/pgtable.h> #include <asm/udbg.h> #include <asm/code-patching.h> +#include "internal.h" + + enum slb_index { LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ KSTACK_INDEX = 1, /* Kernel stack map */ @@ -54,6 +57,17 @@ static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); } +bool stress_slb_enabled __initdata; + +static int __init parse_stress_slb(char *p) +{ + stress_slb_enabled = true; + return 0; +} +early_param("stress_slb", parse_stress_slb); + +__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_slb_key); + static void assert_slb_presence(bool present, unsigned long ea) { #ifdef CONFIG_DEBUG_VM @@ -68,7 +82,7 @@ static void assert_slb_presence(bool present, unsigned long ea) * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware * ignores all other bits from 0-27, so just clear them all. 
*/ - ea &= ~((1UL << 28) - 1); + ea &= ~((1UL << SID_SHIFT) - 1); asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); WARN_ON(present == (tmp == 0)); @@ -153,14 +167,42 @@ void slb_flush_all_realmode(void) asm volatile("slbmte %0,%0; slbia" : : "r" (0)); } +static __always_inline void __slb_flush_and_restore_bolted(bool preserve_kernel_lookaside) +{ + struct slb_shadow *p = get_slb_shadow(); + unsigned long ksp_esid_data, ksp_vsid_data; + u32 ih; + + /* + * SLBIA IH=1 on ISA v2.05 and newer processors may preserve lookaside + * information created with Class=0 entries, which we use for kernel + * SLB entries (the SLB entries themselves are still invalidated). + * + * Older processors will ignore this optimisation. Over-invalidation + * is fine because we never rely on lookaside information existing. + */ + if (preserve_kernel_lookaside) + ih = 1; + else + ih = 0; + + ksp_esid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].esid); + ksp_vsid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); + + asm volatile(PPC_SLBIA(%0)" \n" + "slbmte %1, %2 \n" + :: "i" (ih), + "r" (ksp_vsid_data), + "r" (ksp_esid_data) + : "memory"); +} + /* * This flushes non-bolted entries, it can be run in virtual mode. Must * be called with interrupts disabled. */ void slb_flush_and_restore_bolted(void) { - struct slb_shadow *p = get_slb_shadow(); - BUILD_BUG_ON(SLB_NUM_BOLTED != 2); WARN_ON(!irqs_disabled()); @@ -171,13 +213,10 @@ void slb_flush_and_restore_bolted(void) */ hard_irq_disable(); - asm volatile("isync\n" - "slbia\n" - "slbmte %0, %1\n" - "isync\n" - :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)), - "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid)) - : "memory"); + isync(); + __slb_flush_and_restore_bolted(false); + isync(); + assert_slb_presence(true, get_paca()->kstack); get_paca()->slb_cache_ptr = 0; @@ -400,6 +439,30 @@ void preload_new_slb_context(unsigned long start, unsigned long sp) local_irq_enable(); } +static void slb_cache_slbie_kernel(unsigned int index) +{ + unsigned long slbie_data = get_paca()->slb_cache[index]; + unsigned long ksp = get_paca()->kstack; + + slbie_data <<= SID_SHIFT; + slbie_data |= 0xc000000000000000ULL; + if ((ksp & slb_esid_mask(mmu_kernel_ssize)) == slbie_data) + return; + slbie_data |= mmu_kernel_ssize << SLBIE_SSIZE_SHIFT; + + asm volatile("slbie %0" : : "r" (slbie_data)); +} + +static void slb_cache_slbie_user(unsigned int index) +{ + unsigned long slbie_data = get_paca()->slb_cache[index]; + + slbie_data <<= SID_SHIFT; + slbie_data |= user_segment_size(slbie_data) << SLBIE_SSIZE_SHIFT; + slbie_data |= SLBIE_C; /* user slbs have C=1 */ + + asm volatile("slbie %0" : : "r" (slbie_data)); +} /* Flush all user entries from the segment table of the current processor. */ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) @@ -414,8 +477,14 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * which would update the slb_cache/slb_cache_ptr fields in the PACA. */ hard_irq_disable(); - asm volatile("isync" : : : "memory"); - if (cpu_has_feature(CPU_FTR_ARCH_300)) { + isync(); + if (stress_slb()) { + __slb_flush_and_restore_bolted(false); + isync(); + get_paca()->slb_cache_ptr = 0; + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + + } else if (cpu_has_feature(CPU_FTR_ARCH_300)) { /* * SLBIA IH=3 invalidates all Class=1 SLBEs and their * associated lookaside structures, which matches what @@ -423,47 +492,29 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * cache. 
*/ asm volatile(PPC_SLBIA(3)); + } else { unsigned long offset = get_paca()->slb_cache_ptr; if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { - unsigned long slbie_data = 0; - - for (i = 0; i < offset; i++) { - unsigned long ea; - - ea = (unsigned long) - get_paca()->slb_cache[i] << SID_SHIFT; - /* - * Could assert_slb_presence(true) here, but - * hypervisor or machine check could have come - * in and removed the entry at this point. - */ - - slbie_data = ea; - slbie_data |= user_segment_size(slbie_data) - << SLBIE_SSIZE_SHIFT; - slbie_data |= SLBIE_C; /* user slbs have C=1 */ - asm volatile("slbie %0" : : "r" (slbie_data)); - } + /* + * Could assert_slb_presence(true) here, but + * hypervisor or machine check could have come + * in and removed the entry at this point. + */ + + for (i = 0; i < offset; i++) + slb_cache_slbie_user(i); /* Workaround POWER5 < DD2.1 issue */ if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) - asm volatile("slbie %0" : : "r" (slbie_data)); + slb_cache_slbie_user(0); } else { - struct slb_shadow *p = get_slb_shadow(); - unsigned long ksp_esid_data = - be64_to_cpu(p->save_area[KSTACK_INDEX].esid); - unsigned long ksp_vsid_data = - be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); - - asm volatile(PPC_SLBIA(1) "\n" - "slbmte %0,%1\n" - "isync" - :: "r"(ksp_vsid_data), - "r"(ksp_esid_data)); + /* Flush but retain kernel lookaside information */ + __slb_flush_and_restore_bolted(true); + isync(); get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; } @@ -503,7 +554,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * address accesses by the kernel (user mode won't happen until * rfid, which is safe). */ - asm volatile("isync" : : : "memory"); + isync(); } void slb_set_size(u16 size) @@ -571,6 +622,9 @@ static void slb_cache_update(unsigned long esid_data) if (cpu_has_feature(CPU_FTR_ARCH_300)) return; /* ISAv3.0B and later does not use slb_cache */ + if (stress_slb()) + return; + /* * Now update slb cache entries */ @@ -580,7 +634,7 @@ static void slb_cache_update(unsigned long esid_data) * We have space in slb cache for optimized switch_slb(). * Top 36 bits from esid_data as per ISA */ - local_paca->slb_cache[slb_cache_index++] = esid_data >> 28; + local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT; local_paca->slb_cache_ptr++; } else { /* @@ -671,6 +725,28 @@ static long slb_insert_entry(unsigned long ea, unsigned long context, * accesses user memory before it returns to userspace with rfid. */ assert_slb_presence(false, ea); + if (stress_slb()) { + int slb_cache_index = local_paca->slb_cache_ptr; + + /* + * stress_slb() does not use slb cache, repurpose as a + * cache of inserted (non-bolted) kernel SLB entries. All + * non-bolted kernel entries are flushed on any user fault, + * or if there are already 3 non-boled kernel entries. 
+ */ + BUILD_BUG_ON(SLB_CACHE_ENTRIES < 3); + if (!kernel || slb_cache_index == 3) { + int i; + + for (i = 0; i < slb_cache_index; i++) + slb_cache_slbie_kernel(i); + slb_cache_index = 0; + } + + if (kernel) + local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT; + local_paca->slb_cache_ptr = slb_cache_index; + } asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); barrier(); diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 2ef24a53f4c9..60c6ea16a972 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -11,7 +11,7 @@ #include <linux/hugetlb.h> #include <linux/syscalls.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> #include <linux/uaccess.h> /* @@ -54,15 +54,17 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, int npages) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) return; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (pud_none(*pud)) return; pmd = pmd_offset(pud, addr); @@ -92,7 +94,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) size_t nw; unsigned long next, limit; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); spt = mm_ctx_subpage_prot(&mm->context); if (!spt) @@ -127,7 +129,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) } err_out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -217,13 +219,13 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32))) return -EFAULT; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); spt = mm_ctx_subpage_prot(&mm->context); if (!spt) { /* * Allocate subpage prot table if not already done. - * Do this with mmap_sem held + * Do this with mmap_lock held */ spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); if (!spt) { @@ -267,11 +269,11 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, if (addr + (nw << PAGE_SHIFT) > next) nw = (next - addr) >> PAGE_SHIFT; - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); if (__copy_from_user(spp, map, nw * sizeof(u32))) return -EFAULT; map += nw; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); /* now flush any existing HPTEs for the range */ hpte_flush_range(mm, addr, nw); @@ -280,6 +282,6 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, spt->maxaddr = limit; err = 0; out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return err; } |
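A recurring pattern in the hunks above (hash_pgtable.c, radix_pgtable.c, subpage_prot.c) is the insertion of a p4d step between the pgd and pud levels to match the kernel's generic five-level page-table layout. The sketch below is illustrative only and is not taken from this diff; walk_kernel_pte() is a hypothetical helper name, while pgd_offset_k(), p4d_offset(), pud_offset(), pmd_offset() and pte_offset_kernel() are the standard Linux page-table accessors these hunks switch to.

/*
 * Minimal sketch of the five-level walk adopted by the hunks above.
 * Hypothetical helper, not code from the patch; returns NULL when a
 * level is not populated.
 */
#include <linux/mm.h>
#include <linux/pgtable.h>

static pte_t *walk_kernel_pte(unsigned long ea)
{
	pgd_t *pgdp = pgd_offset_k(ea);
	p4d_t *p4dp = p4d_offset(pgdp, ea);	/* new intermediate level */
	pud_t *pudp;
	pmd_t *pmdp;

	if (p4d_none(*p4dp))
		return NULL;
	pudp = pud_offset(p4dp, ea);		/* was pud_offset(pgdp, ea) */
	if (pud_none(*pudp))
		return NULL;
	pmdp = pmd_offset(pudp, ea);
	if (pmd_none(*pmdp))
		return NULL;
	return pte_offset_kernel(pmdp, ea);
}

On configurations with fewer than five page-table levels the p4d level is folded into the pgd, so the extra p4d_offset() step compiles away to a no-op; that is why the hunks can add it unconditionally.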