diff options
Diffstat (limited to 'arch/powerpc/mm')
27 files changed, 1992 insertions, 786 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index adfee3f1aeb9..f2cea6d5e764 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,10 +13,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o -obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o slb_low.o slb.o $(hash64-y) -obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o -obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o \ - mmu_context_hash$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o +obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o +obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o +obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o ifeq ($(CONFIG_PPC_STD_MMU_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o @@ -33,6 +34,7 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifeq ($(CONFIG_HUGETLB_PAGE),y) obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index a1b2713f6e96..139dec421e57 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -135,7 +135,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS7 = (u64)phys >> 32; /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(flags)) { + if (pte_user(__pte(flags))) { TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); } diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index 47d1b26effc6..6333b273d2d5 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -34,21 +34,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. Since this is 4K insert of 64K page size - * also add _PAGE_COMBO + * also add H_PAGE_COMBO */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* * PP bits. _PAGE_USER is already PP bit 0x2, so we only * need to add in 0x1 if it's a read-only user page @@ -60,22 +60,22 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* * There MIGHT be an HPTE for this pte */ hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K, MMU_PAGE_4K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; hash = hpt_hash(vpn, shift, ssize); @@ -115,9 +115,10 @@ repeat: MMU_PAGE_4K, MMU_PAGE_4K, old_pte); return -1; } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index b2d659cf51c6..16644e1f4e6b 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -23,7 +23,7 @@ bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) unsigned long g_idx; unsigned long ptev = pte_val(rpte.pte); - g_idx = (ptev & _PAGE_COMBO_VALID) >> _PAGE_F_GIX_SHIFT; + g_idx = (ptev & H_PAGE_COMBO_VALID) >> H_PAGE_F_GIX_SHIFT; index = index >> 2; if (g_idx & (0x1 << index)) return true; @@ -37,12 +37,12 @@ static unsigned long mark_subptegroup_valid(unsigned long ptev, unsigned long in { unsigned long g_idx; - if (!(ptev & _PAGE_COMBO)) + if (!(ptev & H_PAGE_COMBO)) return ptev; index = index >> 2; g_idx = 0x1 << index; - return ptev | (g_idx << _PAGE_F_GIX_SHIFT); + return ptev | (g_idx << H_PAGE_F_GIX_SHIFT); } int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, @@ -66,21 +66,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. Since this is 4K insert of 64K page size - * also add _PAGE_COMBO + * also add H_PAGE_COMBO */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* * Handle the subpage protection bits */ @@ -103,21 +103,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, /* *None of the sub 4k page is hashed */ - if (!(old_pte & _PAGE_HASHPTE)) + if (!(old_pte & H_PAGE_HASHPTE)) goto htab_insert_hpte; /* * Check if the pte was already inserted into the hash table * as a 64k HW page, and invalidate the 64k HPTE if so. */ - if (!(old_pte & _PAGE_COMBO)) { + if (!(old_pte & H_PAGE_COMBO)) { flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); /* * clear the old slot details from the old and new pte. * On hash insert failure we use old pte value and we don't * want slot information there if we have a insert failure. */ - old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND); - new_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND); + old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND); + new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND); goto htab_insert_hpte; } /* @@ -143,15 +143,15 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, if (ret == -1) goto htab_insert_hpte; - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } htab_insert_hpte: /* - * handle _PAGE_4K_PFN case + * handle H_PAGE_4K_PFN case */ - if (old_pte & _PAGE_4K_PFN) { + if (old_pte & H_PAGE_4K_PFN) { /* * All the sub 4k page have the same * physical address. @@ -199,20 +199,20 @@ repeat: } /* * Insert slot number & secondary bit in PTE second half, - * clear _PAGE_BUSY and set appropriate HPTE slot bit - * Since we have _PAGE_BUSY set on ptep, we can be sure + * clear H_PAGE_BUSY and set appropriate HPTE slot bit + * Since we have H_PAGE_BUSY set on ptep, we can be sure * nobody is undating hidx. */ hidxp = (unsigned long *)(ptep + PTRS_PER_PTE); rpte.hidx &= ~(0xfUL << (subpg_index << 2)); *hidxp = rpte.hidx | (slot << (subpg_index << 2)); new_pte = mark_subptegroup_valid(new_pte, subpg_index); - new_pte |= _PAGE_HASHPTE; + new_pte |= H_PAGE_HASHPTE; /* * check __real_pte for details on matching smp_rmb() */ smp_wmb(); - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } @@ -220,7 +220,6 @@ int __hash_page_64K(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize) { - unsigned long hpte_group; unsigned long rflags, pa; unsigned long old_pte, new_pte; @@ -235,27 +234,26 @@ int __hash_page_64K(unsigned long ea, unsigned long access, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Check if PTE has the cache-inhibit bit set * If so, bail out and refault as a 4k page */ if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && - unlikely(old_pte & _PAGE_NO_CACHE)) + unlikely(pte_ci(pte))) return 0; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); rflags = htab_convert_pte_flags(new_pte); @@ -264,22 +262,22 @@ int __hash_page_64K(unsigned long ea, unsigned long access, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* * There MIGHT be an HPTE for this pte */ hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K, MMU_PAGE_64K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; hash = hpt_hash(vpn, shift, ssize); @@ -319,9 +317,10 @@ repeat: MMU_PAGE_64K, MMU_PAGE_64K, old_pte); return -1; } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 8eaac81347fd..d873f6507f72 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -221,7 +221,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, return -1; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", @@ -719,6 +719,12 @@ static void native_flush_hash_range(unsigned long number, int local) local_irq_restore(flags); } +static int native_update_partition_table(u64 patb1) +{ + partition_tb->patb1 = cpu_to_be64(patb1); + return 0; +} + void __init hpte_init_native(void) { ppc_md.hpte_invalidate = native_hpte_invalidate; @@ -729,4 +735,7 @@ void __init hpte_init_native(void) ppc_md.hpte_clear_all = native_hpte_clear; ppc_md.flush_hash_range = native_flush_hash_range; ppc_md.hugepage_invalidate = native_hugepage_invalidate; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + ppc_md.update_partition_table = native_update_partition_table; } diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7635b1c6b5da..59268969a0bc 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -167,16 +167,22 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) if ((pteflags & _PAGE_EXEC) == 0) rflags |= HPTE_R_N; /* - * PP bits: + * PPP bits: * Linux uses slb key 0 for kernel and 1 for user. - * kernel areas are mapped with PP=00 - * and there is no kernel RO (_PAGE_KERNEL_RO). - * User area is mapped with PP=0x2 for read/write - * or PP=0x3 for read-only (including writeable but clean pages). + * kernel RW areas are mapped with PPP=0b000 + * User area is mapped with PPP=0b010 for read/write + * or PPP=0b011 for read-only (including writeable but clean pages). */ - if (pteflags & _PAGE_USER) { - rflags |= 0x2; - if (!((pteflags & _PAGE_RW) && (pteflags & _PAGE_DIRTY))) + if (pteflags & _PAGE_PRIVILEGED) { + /* + * Kernel read only mapped with ppp bits 0b110 + */ + if (!(pteflags & _PAGE_WRITE)) + rflags |= (HPTE_R_PP0 | 0x2); + } else { + if (pteflags & _PAGE_RWX) + rflags |= 0x2; + if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) rflags |= 0x1; } /* @@ -186,12 +192,13 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) /* * Add in WIG bits */ - if (pteflags & _PAGE_WRITETHRU) - rflags |= HPTE_R_W; - if (pteflags & _PAGE_NO_CACHE) + + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) rflags |= HPTE_R_I; - if (pteflags & _PAGE_GUARDED) - rflags |= HPTE_R_G; + if ((pteflags & _PAGE_CACHE_CTL ) == _PAGE_NON_IDEMPOTENT) + rflags |= (HPTE_R_I | HPTE_R_G); + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) + rflags |= (HPTE_R_I | HPTE_R_W); return rflags; } @@ -669,6 +676,41 @@ int remove_section_mapping(unsigned long start, unsigned long end) } #endif /* CONFIG_MEMORY_HOTPLUG */ +static void __init hash_init_partition_table(phys_addr_t hash_table, + unsigned long pteg_count) +{ + unsigned long ps_field; + unsigned long htab_size; + unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + + /* + * slb llp encoding for the page size used in VPM real mode. + * We can ignore that for lpid 0 + */ + ps_field = 0; + htab_size = __ilog2(pteg_count) - 11; + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); + partition_tb = __va(memblock_alloc_base(patb_size, patb_size, + MEMBLOCK_ALLOC_ANYWHERE)); + + /* Initialize the Partition Table with no entries */ + memset((void *)partition_tb, 0, patb_size); + partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size); + /* + * FIXME!! This should be done via update_partition table + * For now UPRT is 0 for us. + */ + partition_tb->patb1 = 0; + DBG("Partition table %p\n", partition_tb); + /* + * update partition table control register, + * 64 K size. + */ + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + +} + static void __init htab_initialize(void) { unsigned long table; @@ -737,8 +779,11 @@ static void __init htab_initialize(void) /* Initialize the HPT with no entries */ memset((void *)table, 0, htab_size_bytes); - /* Set SDR1 */ - mtspr(SPRN_SDR1, _SDR1); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); + else + hash_init_partition_table(table, pteg_count); } prot = pgprot_val(PAGE_KERNEL); @@ -823,8 +868,38 @@ static void __init htab_initialize(void) #undef KB #undef MB -void __init early_init_mmu(void) +void __init hash__early_init_mmu(void) { + /* + * initialize page table size + */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + + __pte_index_size = H_PTE_INDEX_SIZE; + __pmd_index_size = H_PMD_INDEX_SIZE; + __pud_index_size = H_PUD_INDEX_SIZE; + __pgd_index_size = H_PGD_INDEX_SIZE; + __pmd_cache_index = H_PMD_CACHE_INDEX; + __pte_table_size = H_PTE_TABLE_SIZE; + __pmd_table_size = H_PMD_TABLE_SIZE; + __pud_table_size = H_PUD_TABLE_SIZE; + __pgd_table_size = H_PGD_TABLE_SIZE; + /* + * 4k use hugepd format, so for hash set then to + * zero + */ + __pmd_val_bits = 0; + __pud_val_bits = 0; + __pgd_val_bits = 0; + + __kernel_virt_start = H_KERN_VIRT_START; + __kernel_virt_size = H_KERN_VIRT_SIZE; + __vmalloc_start = H_VMALLOC_START; + __vmalloc_end = H_VMALLOC_END; + vmemmap = (struct page *)H_VMEMMAP_BASE; + ioremap_bot = IOREMAP_BASE; + /* Initialize the MMU Hash table and create the linear mapping * of memory. Has to be done before SLB initialization as this is * currently where the page size encoding is obtained. @@ -836,12 +911,16 @@ void __init early_init_mmu(void) } #ifdef CONFIG_SMP -void early_init_mmu_secondary(void) +void hash__early_init_mmu_secondary(void) { /* Initialize hash table for that CPU */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) - mtspr(SPRN_SDR1, _SDR1); - + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + mtspr(SPRN_SDR1, _SDR1); + else + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + } /* Initialize SLB */ slb_initialize(); } @@ -920,7 +999,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) * Userspace sets the subpage permissions using the subpage_prot system call. * * Result is 0: full permissions, _PAGE_RW: read-only, - * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. + * _PAGE_RWX: no access. */ static int subpage_protection(struct mm_struct *mm, unsigned long ea) { @@ -946,8 +1025,13 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea) /* extract 2-bit bitfield for this 4k subpage */ spp >>= 30 - 2 * ((ea >> 12) & 0xf); - /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */ - spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0); + /* + * 0 -> full premission + * 1 -> Read only + * 2 -> no access. + * We return the flag that need to be cleared. + */ + spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0); return spp; } @@ -1084,7 +1168,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, /* Pre-check access permissions (will be re-checked atomically * in __hash_page_XX but this pre-check is a fast path */ - if (access & ~pte_val(*ptep)) { + if (!check_pte_access(access, pte_val(*ptep))) { DBG_LOW(" no access !\n"); rc = 1; goto bail; @@ -1122,8 +1206,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif /* Do actual hashing */ #ifdef CONFIG_PPC_64K_PAGES - /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ - if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) { + /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { demote_segment_4k(mm, ea); psize = MMU_PAGE_4K; } @@ -1131,8 +1215,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, /* If this PTE is non-cacheable and we have restrictions on * using non cacheable large pages, then we switch to 4k */ - if (mmu_ci_restrictions && psize == MMU_PAGE_64K && - (pte_val(*ptep) & _PAGE_NO_CACHE)) { + if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { if (user_region) { demote_segment_4k(mm, ea); psize = MMU_PAGE_4K; @@ -1209,7 +1292,7 @@ EXPORT_SYMBOL_GPL(hash_page); int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, unsigned long dsisr) { - unsigned long access = _PAGE_PRESENT; + unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; struct mm_struct *mm = current->mm; @@ -1220,14 +1303,18 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, flags |= HPTE_NOHPTE_UPDATE; if (dsisr & DSISR_ISSTORE) - access |= _PAGE_RW; + access |= _PAGE_WRITE; /* - * We need to set the _PAGE_USER bit if MSR_PR is set or if we are - * accessing a userspace segment (even from the kernel). We assume - * kernel addresses always have the high bit set. + * We set _PAGE_PRIVILEGED only when + * kernel mode access kernel space. + * + * _PAGE_PRIVILEGED is NOT set + * 1) when kernel mode access user space + * 2) user space access kernel space. */ + access |= _PAGE_PRIVILEGED; if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID)) - access |= _PAGE_USER; + access &= ~_PAGE_PRIVILEGED; if (trap == 0x400) access |= _PAGE_EXEC; @@ -1235,6 +1322,30 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, return hash_page_mm(mm, ea, access, trap, flags); } +#ifdef CONFIG_PPC_MM_SLICES +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + int psize = get_slice_psize(mm, ea); + + /* We only prefault standard pages for now */ + if (unlikely(psize != mm->context.user_psize)) + return false; + + /* + * Don't prefault if subpage protection is enabled for the EA. + */ + if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) + return false; + + return true; +} +#else +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + return true; +} +#endif + void hash_preload(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap) { @@ -1247,11 +1358,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, BUG_ON(REGION_ID(ea) != USER_REGION_ID); -#ifdef CONFIG_PPC_MM_SLICES - /* We only prefault standard pages for now */ - if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize)) + if (!should_hash_preload(mm, ea)) return; -#endif DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," " trap=%lx\n", mm, mm->pgd, ea, access, trap); @@ -1282,13 +1390,13 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES - /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on + /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take * care of it once we actually try to access the page. * That way we don't have to duplicate all of the logic for segment * page size demotion here */ - if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) + if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) goto out_exit; #endif /* CONFIG_PPC_64K_PAGES */ @@ -1570,7 +1678,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } #endif /* CONFIG_DEBUG_PAGEALLOC */ -void setup_initial_memory_limit(phys_addr_t first_memblock_base, +void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { /* We don't currently support the first MEMBLOCK not mapping 0 diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index eb2accdd76fd..ba3fc229468a 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -37,20 +37,20 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, old_pmd = pmd_val(pmd); /* If PMD busy, retry the access */ - if (unlikely(old_pmd & _PAGE_BUSY)) + if (unlikely(old_pmd & H_PAGE_BUSY)) return 0; /* If PMD permissions don't match, take page fault */ - if (unlikely(access & ~old_pmd)) + if (unlikely(!check_pte_access(access, old_pmd))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ - new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pmd |= _PAGE_DIRTY; - } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, - old_pmd, new_pmd)); + } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); + rflags = htab_convert_pte_flags(new_pmd); #if 0 @@ -78,7 +78,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * base page size. This is because demote_segment won't flush * hash page table entries. */ - if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) { + if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, ssize, flags); /* @@ -125,7 +125,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, hash = hpt_hash(vpn, shift, ssize); /* insert new entry */ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; - new_pmd |= _PAGE_HASHPTE; + new_pmd |= H_PAGE_HASHPTE; repeat: hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; @@ -169,17 +169,17 @@ repeat: mark_hpte_slot_valid(hpte_slot_array, index, slot); } /* - * Mark the pte with _PAGE_COMBO, if we are trying to hash it with + * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with * base page size 4k. */ if (psize == MMU_PAGE_4K) - new_pmd |= _PAGE_COMBO; + new_pmd |= H_PAGE_COMBO; /* * The hpte valid is stored in the pgtable whose address is in the * second half of the PMD. Order this against clearing of the busy bit in * huge pmd. */ smp_wmb(); - *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); + *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 8555fce902fe..3058560b6121 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -47,18 +47,19 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, do { old_pte = pte_val(*ptep); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; + /* Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + rflags = htab_convert_pte_flags(new_pte); sz = ((1UL) << shift); @@ -68,28 +69,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* There MIGHT be an HPTE for this pte */ unsigned long hash, slot; hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, mmu_psize, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { unsigned long hash = hpt_hash(vpn, shift, ssize); pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; /* clear HPTE slot informations in new PTE */ - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, mmu_psize, ssize); @@ -105,14 +106,14 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, return -1; } - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & - (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } /* * No need to use ldarx/stdcx here */ - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c new file mode 100644 index 000000000000..1e11559e1aac --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -0,0 +1,87 @@ +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/cacheflush.h> +#include <asm/machdep.h> +#include <asm/mman.h> + +void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + unsigned long ap, shift; + struct hstate *hstate = hstate_file(vma->vm_file); + + shift = huge_page_shift(hstate); + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) + ap = mmu_get_ap(MMU_PAGE_2M); + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) + ap = mmu_get_ap(MMU_PAGE_1G); + else { + WARN(1, "Wrong huge page shift\n"); + return ; + } + radix___flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); +} + +void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + unsigned long ap, shift; + struct hstate *hstate = hstate_file(vma->vm_file); + + shift = huge_page_shift(hstate); + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) + ap = mmu_get_ap(MMU_PAGE_2M); + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) + ap = mmu_get_ap(MMU_PAGE_1G); + else { + WARN(1, "Wrong huge page shift\n"); + return ; + } + radix___local_flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); +} + +/* + * A vairant of hugetlb_get_unmapped_area doing topdown search + * FIXME!! should we do as x86 does or non hugetlb area does ? + * ie, use topdown or not based on mmap_is_legacy check ? + */ +unsigned long +radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + /* + * We are always doing an topdown search here. Slice code + * does that too. + */ + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a4a90a869999..5aac1a3f86cd 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -711,6 +711,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); + if (radix_enabled()) + return radix__hugetlb_get_unmapped_area(file, addr, len, + pgoff, flags); return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); } #endif @@ -719,14 +722,14 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { #ifdef CONFIG_PPC_MM_SLICES unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); - - return 1UL << mmu_psize_to_shift(psize); -#else + /* With radix we don't use slice, so derive it from vma*/ + if (!radix_enabled()) + return 1UL << mmu_psize_to_shift(psize); +#endif if (!is_vm_hugetlb_page(vma)) return PAGE_SIZE; return huge_page_size(hstate_vma(vma)); -#endif } static inline bool is_power_of_4(unsigned long x) @@ -825,7 +828,7 @@ static int __init hugetlbpage_init(void) { int psize; - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE)) return -ENODEV; for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { @@ -865,6 +868,9 @@ static int __init hugetlbpage_init(void) HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; else if (mmu_psize_defs[MMU_PAGE_1M].shift) HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; + else if (mmu_psize_defs[MMU_PAGE_2M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift; + return 0; } @@ -1005,9 +1011,9 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, end = pte_end; pte = READ_ONCE(*ptep); - mask = _PAGE_PRESENT | _PAGE_USER; + mask = _PAGE_PRESENT | _PAGE_READ; if (write) - mask |= _PAGE_RW; + mask |= _PAGE_WRITE; if ((pte_val(pte) & mask) != mask) return 0; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index ba655666186d..33709bdb0419 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -66,11 +66,11 @@ #include "mmu_decl.h" #ifdef CONFIG_PPC_STD_MMU_64 -#if PGTABLE_RANGE > USER_VSID_RANGE +#if H_PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif -#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) +#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) #warning TASK_SIZE is smaller than it needs to be. #endif #endif /* CONFIG_PPC_STD_MMU_64 */ @@ -189,75 +189,6 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } -/* On hash-based CPUs, the vmemmap is bolted in the hash table. - * - * On Book3E CPUs, the vmemmap is currently mapped in the top half of - * the vmalloc space using normal page tables, though the size of - * pages encoded in the PTEs can be different - */ - -#ifdef CONFIG_PPC_BOOK3E -static int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - /* Create a PTE encoding without page size */ - unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | - _PAGE_KERNEL_RW; - - /* PTEs only contain page size encodings up to 32M */ - BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); - - /* Encode the size in the PTE */ - flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; - - /* For each PTE for that area, map things. Note that we don't - * increment phys because all PTEs are of the large size and - * thus must have the low bits clear - */ - for (i = 0; i < page_size; i += PAGE_SIZE) - BUG_ON(map_kernel_page(start + i, phys, flags)); - - return 0; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ -} -#endif -#else /* CONFIG_PPC_BOOK3E */ -static int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - int rc = htab_bolt_mapping(start, start + page_size, phys, - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - if (rc < 0) { - int rc2 = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON(rc2 && (rc2 != -ENOENT)); - } - return rc; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ - int rc = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON((rc < 0) && (rc != -ENOENT)); - WARN_ON(rc == -ENOENT); -} -#endif - -#endif /* CONFIG_PPC_BOOK3E */ - struct vmemmap_backing *vmemmap_list; static struct vmemmap_backing *next; static int num_left; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ac79dbde1015..2fd57fa48429 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -68,12 +68,15 @@ pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); +#define TOP_ZONE ZONE_HIGHMEM static inline pte_t *virt_to_kpte(unsigned long vaddr) { return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), vaddr), vaddr); } +#else +#define TOP_ZONE ZONE_NORMAL #endif int page_is_ram(unsigned long pfn) @@ -267,14 +270,9 @@ void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit) */ int dma_pfn_limit_to_zone(u64 pfn_limit) { - enum zone_type top_zone = ZONE_NORMAL; int i; -#ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; -#endif - - for (i = top_zone; i >= 0; i--) { + for (i = TOP_ZONE; i >= 0; i--) { if (max_zone_pfns[i] <= pfn_limit) return i; } @@ -289,7 +287,6 @@ void __init paging_init(void) { unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); - enum zone_type top_zone; #ifdef CONFIG_PPC32 unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); @@ -313,13 +310,9 @@ void __init paging_init(void) (long int)((top_of_ram - total_ram) >> 20)); #ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT); -#else - top_zone = ZONE_NORMAL; #endif - - limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT); + limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT); zone_limits_final = true; free_area_init_nodes(max_zone_pfns); @@ -498,7 +491,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held */ - unsigned long access = 0, trap; + unsigned long access, trap; + + if (radix_enabled()) + return; /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) @@ -511,13 +507,19 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * * We also avoid filling the hash if not coming from a fault */ - if (current->thread.regs == NULL) - return; - trap = TRAP(current->thread.regs); - if (trap == 0x400) - access |= _PAGE_EXEC; - else if (trap != 0x300) + + trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; + switch (trap) { + case 0x300: + access = 0UL; + break; + case 0x400: + access = _PAGE_EXEC; + break; + default: return; + } + hash_preload(vma->vm_mm, address, access, trap); #endif /* CONFIG_PPC_STD_MMU */ #if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 4087705ba90f..2f1e44362198 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -26,6 +26,9 @@ #include <linux/mm.h> #include <linux/random.h> #include <linux/sched.h> +#include <linux/elf-randomize.h> +#include <linux/security.h> +#include <linux/mman.h> /* * Top of mmap area (just below the process stack). @@ -78,6 +81,111 @@ static inline unsigned long mmap_base(unsigned long rnd) return PAGE_ALIGN(TASK_SIZE - gap - rnd); } +#ifdef CONFIG_PPC_RADIX_MMU +/* + * Same function as generic code used only for radix, because we don't need to overload + * the generic one. But we will have to duplicate, because hash select + * HAVE_ARCH_UNMAPPED_AREA + */ +static unsigned long +radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct vm_unmapped_area_info info; + + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = 0; + return vm_unmapped_area(&info); +} + +static unsigned long +radix__arch_get_unmapped_area_topdown(struct file *filp, + const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = mm->mmap_base; + info.align_mask = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } + + return addr; +} + +static void radix__arch_pick_mmap_layout(struct mm_struct *mm, + unsigned long random_factor) +{ + if (mmap_is_legacy()) { + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = radix__arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor); + mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; + } +} +#else +/* dummy */ +extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, + unsigned long random_factor); +#endif /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: @@ -89,6 +197,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); + if (radix_enabled()) + return radix__arch_pick_mmap_layout(mm, random_factor); /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 9ca6fe16cb29..227b2a6c4544 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -58,6 +58,17 @@ again: return index; } EXPORT_SYMBOL_GPL(__init_new_context); +static int radix__init_new_context(struct mm_struct *mm, int index) +{ + unsigned long rts_field; + + /* + * set the process table entry, + */ + rts_field = 3ull << PPC_BITLSHIFT(2); + process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); + return 0; +} int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { @@ -67,13 +78,27 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) if (index < 0) return index; - /* The old code would re-promote on fork, we don't do that - * when using slices as it could cause problem promoting slices - * that have been forced down to 4K - */ - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); - subpage_prot_init_new_context(mm); + if (radix_enabled()) { + radix__init_new_context(mm, index); + } else { + + /* The old code would re-promote on fork, we don't do that + * when using slices as it could cause problem promoting slices + * that have been forced down to 4K + * + * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. This ensures that we + * properly initialize context slice details for newly allocated + * mm's (which will have id == 0) and don't alter context slice + * inherited via fork (which will have id != 0). + * + * We should not be calling init_new_context() on init_mm. Hence a + * check against 0 is ok. + */ + if (mm->context.id == 0) + slice_set_user_psize(mm, mmu_virtual_psize); + subpage_prot_init_new_context(mm); + } mm->context.id = index; #ifdef CONFIG_PPC_ICSWX mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); @@ -144,8 +169,19 @@ void destroy_context(struct mm_struct *mm) mm->context.cop_lockp = NULL; #endif /* CONFIG_PPC_ICSWX */ + if (radix_enabled()) + process_tb[mm->context.id].prtb1 = 0; + else + subpage_prot_free(mm); destroy_pagetable_page(mm); __destroy_context(mm->context.id); - subpage_prot_free(mm); mm->context.id = MMU_NO_CONTEXT; } + +#ifdef CONFIG_PPC_RADIX_MMU +void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + mtspr(SPRN_PID, next->context.id); + asm volatile("isync": : :"memory"); +} +#endif diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 986afbc22c76..7d95bc402dba 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -226,7 +226,8 @@ static void context_check_map(void) static void context_check_map(void) { } #endif -void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { unsigned int i, id, cpu = smp_processor_id(); unsigned long *map; @@ -334,8 +335,7 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) mm->context.active = 0; #ifdef CONFIG_PPC_MM_SLICES - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); + slice_set_user_psize(mm, mmu_virtual_psize); #endif return 0; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index bfb7c0bcabd5..6af65327c993 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -108,11 +108,6 @@ extern unsigned long Hash_size, Hash_mask; #endif /* CONFIG_PPC32 */ -#ifdef CONFIG_PPC64 -extern int map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags); -#endif /* CONFIG_PPC64 */ - extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; extern phys_addr_t __initial_memory_limit_addr; diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c new file mode 100644 index 000000000000..a2298930f990 --- /dev/null +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -0,0 +1,122 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <linux/memblock.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/dma.h> + +#include "mmu_decl.h" + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ +int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, flags)); + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS))); + memset(pt, 0, size); + + return pt; +} + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } else { + pgdp = pgd_offset_k(ea); +#ifndef __PAGETABLE_PUD_FOLDED + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* !__PAGETABLE_PUD_FOLDED */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } + + smp_wmb(); + return 0; +} diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c new file mode 100644 index 000000000000..eb4451144746 --- /dev/null +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -0,0 +1,118 @@ +/* + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> + +#include "mmu_decl.h" +#include <trace/events/thp.h> + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); + /* + * Since we are not supporting SW TLB systems, we don't + * have any thing similar to flush_tlb_page_nohash() + */ + } + return changed; +} + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + assert_spin_locked(&mm->page_table_lock); + WARN_ON(!pmd_trans_huge(pmd)); +#endif + trace_hugepage_set_pmd(addr, pmd_val(pmd)); + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} +/* + * We use this to invalidate a pmdp entry before switching from a + * hugepte to regular pmd entry. + */ +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + /* + * This ensures that generic code that rely on IRQ disabling + * to prevent a parallel THP split work as expected. + */ + kick_all_cpus_sync(); +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); +} + +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pmdv; + + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + return pmd_set_protbits(__pmd(pmdv), pgprot); +} + +pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +{ + return pfn_pmd(page_to_pfn(page), pgprot); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + unsigned long pmdv; + + pmdv = pmd_val(pmd); + pmdv &= _HPAGE_CHG_MASK; + return pmd_set_protbits(__pmd(pmdv), newprot); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a HUGE PMD entry in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux HUGE PMD entry. + */ +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + return; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c new file mode 100644 index 000000000000..c23e286a6b8f --- /dev/null +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -0,0 +1,342 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> + +#include "mmu_decl.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/thp.h> + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On hash-based CPUs, the vmemmap is bolted in the hash table. + * + */ +int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); + if (rc < 0) { + int rc2 = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + return rc; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + int rc = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON((rc < 0) && (rc != -ENOENT)); + WARN_ON(rc == -ENOENT); +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } else { + /* + * If the mm subsystem is not fully up, we cannot create a + * linux page table entry for this mapping. Simply bolt an + * entry in the hardware page table. + * + */ + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, + mmu_io_psize, mmu_kernel_ssize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } + } + + smp_wmb(); + return 0; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + __be64 old_be, tmp; + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + and. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + or %1,%1,%7\n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) + : "cc" ); + + old = be64_to_cpu(old_be); + + trace_hugepage_update(addr, old, clr, set); + if (old & H_PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp, old); + return old; +} + +pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + kick_all_cpus_sync(); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_sem and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. + */ + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + return pmd; +} + +/* + * We want to put the pgtable in pmd and use pgtable for tracking + * the base page size hptes + */ +void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pgtable_t *pgtable_slot; + assert_spin_locked(&mm->page_table_lock); + /* + * we store the pgtable in the second half of PMD + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + *pgtable_slot = pgtable; + /* + * expose the deposited pgtable to other cpus. + * before we set the hugepage PTE at pmd level + * hash fault code looks at the deposted pgtable + * to store hash index values. + */ + smp_wmb(); +} + +pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + pgtable_t *pgtable_slot; + + assert_spin_locked(&mm->page_table_lock); + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Once we withdraw, mark the entry NULL. + */ + *pgtable_slot = NULL; + /* + * We store HPTE information in the deposited PTE fragment. + * zero out the content on withdraw. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return pgtable; +} + +void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); + + /* + * We can't mark the pmd none here, because that will cause a race + * against exit_mmap. We need to continue mark pmd TRANS HUGE, while + * we spilt, but at the same time we wan't rest of the ppc64 code + * not to insert hash pte on this, because we will be modifying + * the deposited pgtable in the caller of this function. Hence + * clear the _PAGE_USER so that we move the fault handling to + * higher level function and that will serialize against ptl. + * We need to flush existing hash pte entries here even though, + * the translation is still valid, because we will withdraw + * pgtable_t after this. + */ + pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); +} + +/* + * A linux hugepage PMD was changed and the corresponding hash table entries + * neesd to be flushed. + */ +void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long old_pmd) +{ + int ssize; + unsigned int psize; + unsigned long vsid; + unsigned long flags = 0; + const struct cpumask *tmp; + + /* get the base page size,vsid and segment size */ +#ifdef CONFIG_DEBUG_VM + psize = get_slice_psize(mm, addr); + BUG_ON(psize == MMU_PAGE_16M); +#endif + if (old_pmd & H_PAGE_COMBO) + psize = MMU_PAGE_4K; + else + psize = MMU_PAGE_64K; + + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + tmp = cpumask_of(smp_processor_id()); + if (cpumask_equal(mm_cpumask(mm), tmp)) + flags |= HPTE_LOCAL_UPDATE; + + return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); +} + +pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + pgtable_t pgtable; + unsigned long old; + pgtable_t *pgtable_slot; + + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * We have pmd == none and we are holding page_table_lock. + * So we can safely go and clear the pgtable hash + * index info. + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Let's zero out old valid and hash index details + * hash fault look at them. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + /* + * Serialize against find_linux_pte_or_hugepte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_linux_pte_or_hugepage to finish. + */ + kick_all_cpus_sync(); + return old_pmd; +} + +int hash__has_transparent_hugepage(void) +{ + + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return 0; + /* + * We support THP only if PMD_SIZE is 16MB. + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) + return 0; + /* + * We need to make sure that we support 16MB hugepage in a segement + * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE + * of 64K. + */ + /* + * If we have 64K HPTE, we will be using that by default + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift && + (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) + return 0; + /* + * Ok we only have 4K HPTE + */ + if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) + return 0; + + return 1; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c new file mode 100644 index 000000000000..18b2c11604fa --- /dev/null +++ b/arch/powerpc/mm/pgtable-radix.c @@ -0,0 +1,526 @@ +/* + * Page table handling routines for radix page table. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/sched.h> +#include <linux/memblock.h> +#include <linux/of_fdt.h> + +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/dma.h> +#include <asm/machdep.h> +#include <asm/mmu.h> +#include <asm/firmware.h> + +#include <trace/events/thp.h> + +static int native_update_partition_table(u64 patb1) +{ + partition_tb->patb1 = cpu_to_be64(patb1); + return 0; +} + +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE)); + memset(pt, 0, size); + + return pt; +} + +int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + /* + * Make sure task size is correct as per the max adddr + */ + BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + if (map_page_size == PMD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + } else { + pgdp = pgd_offset_k(ea); + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } + pudp = pud_offset(pgdp, ea); + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (map_page_size == PMD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + } + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags)); + smp_wmb(); + return 0; +} + +static void __init radix_init_pgtable(void) +{ + int loop_count; + u64 base, end, start_addr; + unsigned long rts_field; + struct memblock_region *reg; + unsigned long linear_page_size; + + /* We don't support slb for radix */ + mmu_slb_size = 0; + /* + * Create the linear mapping, using standard page size for now + */ + loop_count = 0; + for_each_memblock(memory, reg) { + + start_addr = reg->base; + +redo: + if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift) + linear_page_size = PUD_SIZE; + else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift) + linear_page_size = PMD_SIZE; + else + linear_page_size = PAGE_SIZE; + + base = _ALIGN_UP(start_addr, linear_page_size); + end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size); + + pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n", + (unsigned long)base, (unsigned long)end, + linear_page_size); + + while (base < end) { + radix__map_kernel_page((unsigned long)__va(base), + base, PAGE_KERNEL_X, + linear_page_size); + base += linear_page_size; + } + /* + * map the rest using lower page size + */ + if (end < reg->base + reg->size) { + start_addr = end; + loop_count++; + goto redo; + } + } + /* + * Allocate Partition table and process table for the + * host. + */ + BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 23), "Process table size too large."); + process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT); + /* + * Fill in the process table. + * we support 52 bits, hence 52-28 = 24, 11000 + */ + rts_field = 3ull << PPC_BITLSHIFT(2); + process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); + /* + * Fill in the partition table. We are suppose to use effective address + * of process table here. But our linear mapping also enable us to use + * physical address here. + */ + ppc_md.update_partition_table(__pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR); + pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); +} + +static void __init radix_init_partition_table(void) +{ + unsigned long rts_field; + /* + * we support 52 bits, hence 52-28 = 24, 11000 + */ + rts_field = 3ull << PPC_BITLSHIFT(2); + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); + partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT); + partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | + RADIX_PGD_INDEX_SIZE | PATB_HR); + printk("Partition table %p\n", partition_tb); + + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + /* + * update partition table control register, + * 64 K size. + */ + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); +} + +void __init radix_init_native(void) +{ + ppc_md.update_partition_table = native_update_partition_table; +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x15: + idx = MMU_PAGE_2M; + break; + case 0x1e: + idx = MMU_PAGE_1G; + break; + } + return idx; +} + +static int __init radix_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + int shift, idx; + unsigned int ap; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + for (; size >= 4; size -= 4, ++prop) { + + struct mmu_psize_def *def; + + /* top 3 bit is AP encoding */ + shift = be32_to_cpu(prop[0]) & ~(0xe << 28); + ap = be32_to_cpu(prop[0]) >> 29; + pr_info("Page size sift = %d AP=0x%x\n", shift, ap); + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + def = &mmu_psize_defs[idx]; + def->shift = shift; + def->ap = ap; + } + + /* needed ? */ + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 1; +} + +static void __init radix_init_page_sizes(void) +{ + int rc; + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); + if (rc != 0) /* Found */ + goto found; + /* + * let's assume we have page 4k and 64k support + */ + mmu_psize_defs[MMU_PAGE_4K].shift = 12; + mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; + + mmu_psize_defs[MMU_PAGE_64K].shift = 16; + mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; +found: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + if (mmu_psize_defs[MMU_PAGE_2M].shift) { + /* + * map vmemmap using 2M if available + */ + mmu_vmemmap_psize = MMU_PAGE_2M; + } +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + return; +} + +void __init radix__early_init_mmu(void) +{ + unsigned long lpcr; + /* + * setup LPCR UPRT based on mmu_features + */ + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT); + +#ifdef CONFIG_PPC_64K_PAGES + /* PAGE_SIZE mappings */ + mmu_virtual_psize = MMU_PAGE_64K; +#else + mmu_virtual_psize = MMU_PAGE_4K; +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* vmemmap mapping */ + mmu_vmemmap_psize = mmu_virtual_psize; +#endif + /* + * initialize page table size + */ + __pte_index_size = RADIX_PTE_INDEX_SIZE; + __pmd_index_size = RADIX_PMD_INDEX_SIZE; + __pud_index_size = RADIX_PUD_INDEX_SIZE; + __pgd_index_size = RADIX_PGD_INDEX_SIZE; + __pmd_cache_index = RADIX_PMD_INDEX_SIZE; + __pte_table_size = RADIX_PTE_TABLE_SIZE; + __pmd_table_size = RADIX_PMD_TABLE_SIZE; + __pud_table_size = RADIX_PUD_TABLE_SIZE; + __pgd_table_size = RADIX_PGD_TABLE_SIZE; + + __pmd_val_bits = RADIX_PMD_VAL_BITS; + __pud_val_bits = RADIX_PUD_VAL_BITS; + __pgd_val_bits = RADIX_PGD_VAL_BITS; + + __kernel_virt_start = RADIX_KERN_VIRT_START; + __kernel_virt_size = RADIX_KERN_VIRT_SIZE; + __vmalloc_start = RADIX_VMALLOC_START; + __vmalloc_end = RADIX_VMALLOC_END; + vmemmap = (struct page *)RADIX_VMEMMAP_BASE; + ioremap_bot = IOREMAP_BASE; + /* + * For now radix also use the same frag size + */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + + radix_init_page_sizes(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) + radix_init_partition_table(); + + radix_init_pgtable(); +} + +void radix__early_init_mmu_secondary(void) +{ + unsigned long lpcr; + /* + * setup LPCR UPRT based on mmu_features + */ + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT); + /* + * update partition table control register, 64 K size. + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); +} + +void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + /* + * We limit the allocation that depend on ppc64_rma_size + * to first_memblock_size. We also clamp it to 1GB to + * avoid some funky things such as RTAS bugs. + * + * On radix config we really don't have a limitation + * on real mode access. But keeping it as above works + * well enough. + */ + ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); + /* + * Finally limit subsequent allocations. We really don't want + * to limit the memblock allocations to rma_size. FIXME!! should + * we even limit at all ? + */ + memblock_set_current_limit(first_memblock_base + first_memblock_size); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int __meminit radix__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding */ + unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; + + BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size)); + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) +{ + /* FIXME!! intel does more. We should free page tables mapping vmemmap ? */ +} +#endif +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!radix__pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + + old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); + trace_hugepage_update(addr, old, clr, set); + + return old; +} + +pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) + +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); + /* + * khugepaged calls this for normal pmd + */ + pmd = *pmdp; + pmd_clear(pmdp); + /*FIXME!! Verify whether we need this kick below */ + kick_all_cpus_sync(); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} + +/* + * For us pgtable_t is pte_t *. Inorder to save the deposisted + * page table, we consider the allocated page table as a list + * head. On withdraw we need to make sure we zero out the used + * list_head memory area. + */ +void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; +} + +pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pte_t *ptep; + pgtable_t pgtable; + struct list_head *lh; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + *ptep = __pte(0); + ptep++; + *ptep = __pte(0); + return pgtable; +} + + +pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + unsigned long old; + + old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * Serialize against find_linux_pte_or_hugepte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_linux_pte_or_hugepage to finish. + */ + kick_all_cpus_sync(); + return old_pmd; +} + +int radix__has_transparent_hugepage(void) +{ + /* For radix 2M at PMD level means thp */ + if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) + return 1; + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index de37ff445362..88a307504b5a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -38,14 +38,25 @@ static inline int is_exec_fault(void) /* We only try to do i/d cache coherency on stuff that looks like * reasonably "normal" PTEs. We currently require a PTE to be present - * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that * on userspace PTEs */ static inline int pte_looks_normal(pte_t pte) { + +#if defined(CONFIG_PPC_BOOK3S_64) + if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) { + if (pte_ci(pte)) + return 0; + if (pte_user(pte)) + return 1; + } + return 0; +#else return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER); + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER); +#endif } static struct page *maybe_pte_to_page(pte_t pte) @@ -71,6 +82,9 @@ static struct page *maybe_pte_to_page(pte_t pte) static pte_t set_pte_filter(pte_t pte) { + if (radix_enabled()) + return pte; + pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || cpu_has_feature(CPU_FTR_NOEXECUTE))) { @@ -177,8 +191,8 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, * _PAGE_PRESENT, but we can be sure that it is not in hpte. * Hence we can use set_pte_at for them. */ - VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER)); + VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep)); + /* * Add the pte bit when tryint set a pte */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 347106080bb1..e009e0604a8a 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -55,104 +55,63 @@ #include "mmu_decl.h" -#define CREATE_TRACE_POINTS -#include <trace/events/thp.h> - -/* Some sanity checking */ -#if TASK_SIZE_USER64 > PGTABLE_RANGE -#error TASK_SIZE_USER64 exceeds pagetable range -#endif - #ifdef CONFIG_PPC_STD_MMU_64 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) #error TASK_SIZE_USER64 exceeds user VSID range #endif #endif -unsigned long ioremap_bot = IOREMAP_BASE; - -#ifdef CONFIG_PPC_MMU_NOHASH -static __ref void *early_alloc_pgtable(unsigned long size) -{ - void *pt; - - pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS))); - memset(pt, 0, size); - - return pt; -} -#endif /* CONFIG_PPC_MMU_NOHASH */ - +#ifdef CONFIG_PPC_BOOK3S_64 /* - * map_kernel_page currently only called by __ioremap - * map_kernel_page adds an entry to the ioremap page table - * and adds an entry to the HPT, possibly bolting it + * partition table and process table for ISA 3.0 */ -int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); - } else { -#ifdef CONFIG_PPC_MMU_NOHASH - pgdp = pgd_offset_k(ea); -#ifdef PUD_TABLE_SIZE - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); - pgd_populate(&init_mm, pgdp, pudp); - } -#endif /* PUD_TABLE_SIZE */ - pudp = pud_offset(pgdp, ea); - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); -#else /* CONFIG_PPC_MMU_NOHASH */ - /* - * If the mm subsystem is not fully up, we cannot create a - * linux page table entry for this mapping. Simply bolt an - * entry in the hardware page table. - * - */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, - mmu_io_psize, mmu_kernel_ssize)) { - printk(KERN_ERR "Failed to do bolted mapping IO " - "memory at %016lx !\n", pa); - return -ENOMEM; - } -#endif /* !CONFIG_PPC_MMU_NOHASH */ - } - - smp_wmb(); - return 0; -} - +struct prtb_entry *process_tb; +struct patb_entry *partition_tb; +/* + * page table size + */ +unsigned long __pte_index_size; +EXPORT_SYMBOL(__pte_index_size); +unsigned long __pmd_index_size; +EXPORT_SYMBOL(__pmd_index_size); +unsigned long __pud_index_size; +EXPORT_SYMBOL(__pud_index_size); +unsigned long __pgd_index_size; +EXPORT_SYMBOL(__pgd_index_size); +unsigned long __pmd_cache_index; +EXPORT_SYMBOL(__pmd_cache_index); +unsigned long __pte_table_size; +EXPORT_SYMBOL(__pte_table_size); +unsigned long __pmd_table_size; +EXPORT_SYMBOL(__pmd_table_size); +unsigned long __pud_table_size; +EXPORT_SYMBOL(__pud_table_size); +unsigned long __pgd_table_size; +EXPORT_SYMBOL(__pgd_table_size); +unsigned long __pmd_val_bits; +EXPORT_SYMBOL(__pmd_val_bits); +unsigned long __pud_val_bits; +EXPORT_SYMBOL(__pud_val_bits); +unsigned long __pgd_val_bits; +EXPORT_SYMBOL(__pgd_val_bits); +unsigned long __kernel_virt_start; +EXPORT_SYMBOL(__kernel_virt_start); +unsigned long __kernel_virt_size; +EXPORT_SYMBOL(__kernel_virt_size); +unsigned long __vmalloc_start; +EXPORT_SYMBOL(__vmalloc_start); +unsigned long __vmalloc_end; +EXPORT_SYMBOL(__vmalloc_end); +struct page *vmemmap; +EXPORT_SYMBOL(vmemmap); +unsigned long __pte_frag_nr; +EXPORT_SYMBOL(__pte_frag_nr); +unsigned long __pte_frag_size_shift; +EXPORT_SYMBOL(__pte_frag_size_shift); +unsigned long ioremap_bot; +#else /* !CONFIG_PPC_BOOK3S_64 */ +unsigned long ioremap_bot = IOREMAP_BASE; +#endif /** * __ioremap_at - Low level function to establish the page tables @@ -167,12 +126,8 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, if ((flags & _PAGE_PRESENT) == 0) flags |= pgprot_val(PAGE_KERNEL); - /* Non-cacheable page cannot be coherent */ - if (flags & _PAGE_NO_CACHE) - flags &= ~_PAGE_COHERENT; - /* We don't support the 4K PFN hack with ioremap */ - if (flags & _PAGE_4K_PFN) + if (flags & H_PAGE_4K_PFN) return NULL; WARN_ON(pa & ~PAGE_MASK); @@ -253,7 +208,7 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size, void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED; + unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0))); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) @@ -263,7 +218,7 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size) void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - unsigned long flags = _PAGE_NO_CACHE; + unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0))); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) @@ -277,11 +232,20 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, void *caller = __builtin_return_address(0); /* writeable implies dirty for kernel addresses */ - if (flags & _PAGE_RW) + if (flags & _PAGE_WRITE) flags |= _PAGE_DIRTY; - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC); + /* we don't want to let _PAGE_EXEC leak out */ + flags &= ~_PAGE_EXEC; + /* + * Force kernel mapping. + */ +#if defined(CONFIG_PPC_BOOK3S_64) + flags |= _PAGE_PRIVILEGED; +#else + flags &= ~_PAGE_USER; +#endif + #ifdef _PAGE_BAP_SR /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format @@ -411,7 +375,7 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) return (pte_t *)ret; } -pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) +pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) { pte_t *pte; @@ -421,8 +385,9 @@ pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) return __alloc_for_cache(mm, kernel); } +#endif /* CONFIG_PPC_64K_PAGES */ -void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) +void pte_fragment_free(unsigned long *table, int kernel) { struct page *page = virt_to_page(table); if (put_page_testzero(page)) { @@ -433,15 +398,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) } #ifdef CONFIG_SMP -static void page_table_free_rcu(void *table) -{ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } -} - void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { unsigned long pgf = (unsigned long)table; @@ -458,7 +414,7 @@ void __tlb_remove_table(void *_table) if (!shift) /* PTE page needs special handling */ - page_table_free_rcu(table); + pte_fragment_free(table, 0); else { BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); kmem_cache_free(PGT_CACHE(shift), table); @@ -469,385 +425,10 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { if (!shift) { /* PTE page needs special handling */ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } + pte_fragment_free(table, 0); } else { BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); kmem_cache_free(PGT_CACHE(shift), table); } } #endif -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - -/* - * This is called when relaxing access to a hugepage. It's also called in the page - * fault path when we don't hit any of the major fault cases, ie, a minor - * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have - * handled those two for us, we additionally deal with missing execute - * permission here on some processors - */ -int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp, pmd_t entry, int dirty) -{ - int changed; -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&vma->vm_mm->page_table_lock); -#endif - changed = !pmd_same(*(pmdp), entry); - if (changed) { - __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); - /* - * Since we are not supporting SW TLB systems, we don't - * have any thing similar to flush_tlb_page_nohash() - */ - } - return changed; -} - -unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr, - unsigned long set) -{ - - unsigned long old, tmp; - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&mm->page_table_lock); -#endif - -#ifdef PTE_ATOMIC_UPDATES - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - andi. %1,%0,%6\n\ - bne- 1b \n\ - andc %1,%0,%4 \n\ - or %1,%1,%7\n\ - stdcx. %1,0,%3 \n\ - bne- 1b" - : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set) - : "cc" ); -#else - old = pmd_val(*pmdp); - *pmdp = __pmd((old & ~clr) | set); -#endif - trace_hugepage_update(addr, old, clr, set); - if (old & _PAGE_HASHPTE) - hpte_do_hugepage_flush(mm, addr, pmdp, old); - return old; -} - -pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_trans_huge(*pmdp)); - - pmd = *pmdp; - pmd_clear(pmdp); - /* - * Wait for all pending hash_page to finish. This is needed - * in case of subpage collapse. When we collapse normal pages - * to hugepage, we first clear the pmd, then invalidate all - * the PTE entries. The assumption here is that any low level - * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a - * hash_page with local ptep pointer value. Such a hash page - * can result in adding new HPTE entries for normal subpages. - * That means we could be modifying the page content as we - * copy them to a huge page. So wait for parallel hash_page - * to finish before invalidating HPTE entries. We can do this - * by sending an IPI to all the cpus and executing a dummy - * function there. - */ - kick_all_cpus_sync(); - /* - * Now invalidate the hpte entries in the range - * covered by pmd. This make sure we take a - * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and - * hence wait for collapse to complete. Without this - * the __collapse_huge_page_copy can result in copying - * the old content. - */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); - return pmd; -} - -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - -/* - * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. The generic routines only flush if the - * entry was young or dirty which is not good enough. - * - * We should be more intelligent about this but for the moment we override - * these functions and force a tlb flush unconditionally - */ -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - -/* - * We want to put the pgtable in pmd and use pgtable for tracking - * the base page size hptes - */ -void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - pgtable_t *pgtable_slot; - assert_spin_locked(&mm->page_table_lock); - /* - * we store the pgtable in the second half of PMD - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - *pgtable_slot = pgtable; - /* - * expose the deposited pgtable to other cpus. - * before we set the hugepage PTE at pmd level - * hash fault code looks at the deposted pgtable - * to store hash index values. - */ - smp_wmb(); -} - -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - pgtable_t pgtable; - pgtable_t *pgtable_slot; - - assert_spin_locked(&mm->page_table_lock); - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Once we withdraw, mark the entry NULL. - */ - *pgtable_slot = NULL; - /* - * We store HPTE information in the deposited PTE fragment. - * zero out the content on withdraw. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - return pgtable; -} - -void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); - - /* - * We can't mark the pmd none here, because that will cause a race - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while - * we spilt, but at the same time we wan't rest of the ppc64 code - * not to insert hash pte on this, because we will be modifying - * the deposited pgtable in the caller of this function. Hence - * clear the _PAGE_USER so that we move the fault handling to - * higher level function and that will serialize against ptl. - * We need to flush existing hash pte entries here even though, - * the translation is still valid, because we will withdraw - * pgtable_t after this. - */ - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0); -} - - -/* - * set a new huge pmd. We should not be called for updating - * an existing pmd entry. That should go via pmd_hugepage_update. - */ -void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ -#ifdef CONFIG_DEBUG_VM - WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER)); - assert_spin_locked(&mm->page_table_lock); - WARN_ON(!pmd_trans_huge(pmd)); -#endif - trace_hugepage_set_pmd(addr, pmd_val(pmd)); - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); -} - -/* - * We use this to invalidate a pmdp entry before switching from a - * hugepte to regular pmd entry. - */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); - - /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - */ - kick_all_cpus_sync(); -} - -/* - * A linux hugepage PMD was changed and the corresponding hash table entries - * neesd to be flushed. - */ -void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long old_pmd) -{ - int ssize; - unsigned int psize; - unsigned long vsid; - unsigned long flags = 0; - const struct cpumask *tmp; - - /* get the base page size,vsid and segment size */ -#ifdef CONFIG_DEBUG_VM - psize = get_slice_psize(mm, addr); - BUG_ON(psize == MMU_PAGE_16M); -#endif - if (old_pmd & _PAGE_COMBO) - psize = MMU_PAGE_4K; - else - psize = MMU_PAGE_64K; - - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(mm), tmp)) - flags |= HPTE_LOCAL_UPDATE; - - return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); -} - -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) -{ - return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); -} - -pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) -{ - unsigned long pmdv; - - pmdv = (pfn << PTE_RPN_SHIFT) & PTE_RPN_MASK; - return pmd_set_protbits(__pmd(pmdv), pgprot); -} - -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) -{ - return pfn_pmd(page_to_pfn(page), pgprot); -} - -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - unsigned long pmdv; - - pmdv = pmd_val(pmd); - pmdv &= _HPAGE_CHG_MASK; - return pmd_set_protbits(__pmd(pmdv), newprot); -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. - */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - return; -} - -pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t old_pmd; - pgtable_t pgtable; - unsigned long old; - pgtable_t *pgtable_slot; - - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); - old_pmd = __pmd(old); - /* - * We have pmd == none and we are holding page_table_lock. - * So we can safely go and clear the pgtable hash - * index info. - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Let's zero out old valid and hash index details - * hash fault look at them. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - /* - * Serialize against find_linux_pte_or_hugepte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_linux_pte_or_hugepage to finish. - */ - kick_all_cpus_sync(); - return old_pmd; -} - -int has_transparent_hugepage(void) -{ - - BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER, - "hugepages can't be allocated by the buddy allocator"); - - BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2, - "We need more than 2 pages to do deferred thp split"); - - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) - return 0; - /* - * We support THP only if PMD_SIZE is 16MB. - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) - return 0; - /* - * We need to make sure that we support 16MB hugepage in a segement - * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE - * of 64K. - */ - /* - * If we have 64K HPTE, we will be using that by default - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift && - (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) - return 0; - /* - * Ok we only have 4K HPTE - */ - if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) - return 0; - - return 1; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 825b6873391f..48fc28bab544 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -32,7 +32,6 @@ enum slb_index { }; extern void slb_allocate_realmode(unsigned long ea); -extern void slb_allocate_user(unsigned long ea); static void slb_allocate(unsigned long ea) { diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 736d18b3cefd..dfdb90cb4403 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -35,7 +35,7 @@ _GLOBAL(slb_allocate_realmode) * check for bad kernel/user address * (ea & ~REGION_MASK) >= PGTABLE_RANGE */ - rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4) + rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4) bne- 8f srdi r9,r3,60 /* get region */ @@ -91,7 +91,7 @@ slb_miss_kernel_load_vmemmap: * can be demoted from 64K -> 4K dynamically on some machines */ clrldi r11,r10,48 - cmpldi r11,(VMALLOC_SIZE >> 28) - 1 + cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 bgt 5f lhz r11,PACAVMALLOCSLLP(r13) b 6f @@ -179,56 +179,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) li r11,SLB_VSID_USER /* flags don't much matter */ b slb_finish_load -#ifdef __DISABLED__ - -/* void slb_allocate_user(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * No other registers are examined or changed. - * - * It is called with translation enabled in order to be able to walk the - * page tables. This is not currently used. - */ -_GLOBAL(slb_allocate_user) - /* r3 = faulting address */ - srdi r10,r3,28 /* get esid */ - - crset 4*cr7+lt /* set "user" flag for later */ - - /* check if we fit in the range covered by the pagetables*/ - srdi. r9,r3,PGTABLE_EADDR_SIZE - crnot 4*cr0+eq,4*cr0+eq - beqlr - - /* now we need to get to the page tables in order to get the page - * size encoding from the PMD. In the future, we'll be able to deal - * with 1T segments too by getting the encoding from the PGD instead - */ - ld r9,PACAPGDIR(r13) - cmpldi cr0,r9,0 - beqlr - rlwinm r11,r10,8,25,28 - ldx r9,r9,r11 /* get pgd_t */ - cmpldi cr0,r9,0 - beqlr - rlwinm r11,r10,3,17,28 - ldx r9,r9,r11 /* get pmd_t */ - cmpldi cr0,r9,0 - beqlr - - /* build vsid flags */ - andi. r11,r9,SLB_VSID_LLP - ori r11,r11,SLB_VSID_USER - - /* get context to calculate proto-VSID */ - ld r9,PACACONTEXTID(r13) - /* fall through slb_finish_load */ - -#endif /* __DISABLED__ */ - - /* * Finish loading of an SLB entry and return * diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 42954f0b47ac..2b27458902ee 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -37,8 +37,8 @@ #include <asm/hugetlb.h> /* some sanity checks */ -#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE -#error PGTABLE_RANGE exceeds slice_mask high_slices size +#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE +#error H_PGTABLE_RANGE exceeds slice_mask high_slices size #endif static DEFINE_SPINLOCK(slice_convert_lock); @@ -395,6 +395,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Sanity checks */ BUG_ON(mm->task_size == 0); + VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", @@ -568,6 +569,16 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) unsigned char *hpsizes; int index, mask_index; + /* + * Radix doesn't use slice, but can get enabled along with MMU_SLICE + */ + if (radix_enabled()) { +#ifdef CONFIG_PPC_64K_PAGES + return MMU_PAGE_64K; +#else + return MMU_PAGE_4K; +#endif + } if (addr < SLICE_LOW_TOP) { u64 lpsizes; lpsizes = mm->context.low_slices_psize; @@ -605,6 +616,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize); + VM_BUG_ON(radix_enabled()); spin_lock_irqsave(&slice_convert_lock, flags); old_psize = mm->context.user_psize; @@ -649,6 +661,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, { struct slice_mask mask = slice_range_to_mask(start, len); + VM_BUG_ON(radix_enabled()); slice_convert(mm, mask, psize); } @@ -678,6 +691,9 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, struct slice_mask mask, available; unsigned int psize = mm->context.user_psize; + if (radix_enabled()) + return 0; + mask = slice_range_to_mask(addr, len); available = slice_mask_for_size(mm, psize); #ifdef CONFIG_PPC_64K_PAGES diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c new file mode 100644 index 000000000000..0fdaf93a3e09 --- /dev/null +++ b/arch/powerpc/mm/tlb-radix.c @@ -0,0 +1,251 @@ +/* + * TLB flush routines for radix kernels. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/memblock.h> + +#include <asm/tlb.h> +#include <asm/tlbflush.h> + +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); + +static inline void __tlbiel_pid(unsigned long pid, int set) +{ + unsigned long rb,rs,ric,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rb |= set << PPC_BITLSHIFT(51); + rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 2; /* invalidate all the caches */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +/* + * We use 128 set in radix mode and 256 set in hpt mode. + */ +static inline void _tlbiel_pid(unsigned long pid) +{ + int set; + + for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { + __tlbiel_pid(pid, set); + } + return; +} + +static inline void _tlbie_pid(unsigned long pid) +{ + unsigned long rb,rs,ric,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 2; /* invalidate all the caches */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap) +{ + unsigned long rb,rs,ric,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 0; /* no cluster flush yet */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +static inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap) +{ + unsigned long rb,rs,ric,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 0; /* no cluster flush yet */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +/* + * Base TLB flushing operations: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * - local_* variants of page and mm only apply to the current + * processor + */ +void radix__local_flush_tlb_mm(struct mm_struct *mm) +{ + unsigned int pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid); + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_tlb_mm); + +void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid) +{ + unsigned int pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (pid != MMU_NO_CONTEXT) + _tlbiel_va(vmaddr, pid, ap); + preempt_enable(); +} + +void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + /* need the return fix for nohash.c */ + if (vma && is_vm_hugetlb_page(vma)) + return __local_flush_hugetlb_page(vma, vmaddr); +#endif + radix___local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_ap(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(radix__local_flush_tlb_page); + +#ifdef CONFIG_SMP +static int mm_is_core_local(struct mm_struct *mm) +{ + return cpumask_subset(mm_cpumask(mm), + topology_sibling_cpumask(smp_processor_id())); +} + +void radix__flush_tlb_mm(struct mm_struct *mm) +{ + unsigned int pid; + + preempt_disable(); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + + if (!mm_is_core_local(mm)) { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_pid(pid); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } else + _tlbiel_pid(pid); +no_context: + preempt_enable(); +} +EXPORT_SYMBOL(radix__flush_tlb_mm); + +void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid) +{ + unsigned int pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto bail; + if (!mm_is_core_local(mm)) { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_va(vmaddr, pid, ap); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } else + _tlbiel_va(vmaddr, pid, ap); +bail: + preempt_enable(); +} + +void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (vma && is_vm_hugetlb_page(vma)) + return flush_hugetlb_page(vma, vmaddr); +#endif + radix___flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_ap(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(radix__flush_tlb_page); + +#endif /* CONFIG_SMP */ + +void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_pid(0); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); +} +EXPORT_SYMBOL(radix__flush_tlb_kernel_range); + +/* + * Currently, for range flushing, we just do a full mm flush. Because + * we use this in code path where we don' track the page size. + */ +void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ + struct mm_struct *mm = vma->vm_mm; + radix__flush_tlb_mm(mm); +} +EXPORT_SYMBOL(radix__flush_tlb_range); + + +void radix__tlb_flush(struct mmu_gather *tlb) +{ + struct mm_struct *mm = tlb->mm; + radix__flush_tlb_mm(mm); +} diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index f7b80391bee7..4517aa43a8b1 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -155,7 +155,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) batch->index = 0; } -void tlb_flush(struct mmu_gather *tlb) +void hash__tlb_flush(struct mmu_gather *tlb) { struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); @@ -218,7 +218,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, pte = pte_val(*ptep); if (is_thp) trace_hugepage_invalidate(start, pte); - if (!(pte & _PAGE_HASHPTE)) + if (!(pte & H_PAGE_HASHPTE)) continue; if (unlikely(is_thp)) hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); @@ -248,7 +248,7 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) start_pte = pte_offset_map(pmd, addr); for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { unsigned long pteval = pte_val(*pte); - if (pteval & _PAGE_HASHPTE) + if (pteval & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, pte, pteval, 0); addr += PAGE_SIZE; } |