diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-05 21:11:37 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-05 21:11:37 +0200 |
commit | ab182e67ec99ea0c8d7435a32a4a1ed9bb02559a (patch) | |
tree | fa71bef0067a61952561552c6652d922060f5530 /arch/arm64/mm | |
parent | Merge tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/p... (diff) | |
parent | arm64: Fix the DMA mmap and get_sgtable API with DMA_ATTR_FORCE_CONTIGUOUS (diff) | |
download | linux-ab182e67ec99ea0c8d7435a32a4a1ed9bb02559a.tar.xz linux-ab182e67ec99ea0c8d7435a32a4a1ed9bb02559a.zip |
Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
Pull arm64 updates from Catalin Marinas:
- kdump support, including two necessary memblock additions:
memblock_clear_nomap() and memblock_cap_memory_range()
- ARMv8.3 HWCAP bits for JavaScript conversion instructions, complex
numbers and weaker release consistency
- arm64 ACPI platform MSI support
- arm perf updates: ACPI PMU support, L3 cache PMU in some Qualcomm
SoCs, Cortex-A53 L2 cache events and DTLB refills, MAINTAINERS update
for DT perf bindings
- architected timer errata framework (the arch/arm64 changes only)
- support for DMA_ATTR_FORCE_CONTIGUOUS in the arm64 iommu DMA API
- arm64 KVM refactoring to use common system register definitions
- remove support for ASID-tagged VIVT I-cache (no ARMv8 implementation
using it and deprecated in the architecture) together with some
I-cache handling clean-up
- PE/COFF EFI header clean-up/hardening
- define BUG() instruction without CONFIG_BUG
* tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux: (92 commits)
arm64: Fix the DMA mmap and get_sgtable API with DMA_ATTR_FORCE_CONTIGUOUS
arm64: Print DT machine model in setup_machine_fdt()
arm64: pmu: Wire-up Cortex A53 L2 cache events and DTLB refills
arm64: module: split core and init PLT sections
arm64: pmuv3: handle pmuv3+
arm64: Add CNTFRQ_EL0 trap handler
arm64: Silence spurious kbuild warning on menuconfig
arm64: pmuv3: use arm_pmu ACPI framework
arm64: pmuv3: handle !PMUv3 when probing
drivers/perf: arm_pmu: add ACPI framework
arm64: add function to get a cpu's MADT GICC table
drivers/perf: arm_pmu: split out platform device probe logic
drivers/perf: arm_pmu: move irq request/free into probe
drivers/perf: arm_pmu: split cpu-local irq request/free
drivers/perf: arm_pmu: rename irq request/free functions
drivers/perf: arm_pmu: handle no platform_device
drivers/perf: arm_pmu: simplify cpu_pmu_request_irqs()
drivers/perf: arm_pmu: factor out pmu registration
drivers/perf: arm_pmu: fold init into alloc
drivers/perf: arm_pmu: define armpmu_init_fn
...
Diffstat (limited to 'arch/arm64/mm')
-rw-r--r-- | arch/arm64/mm/context.c | 3 | ||||
-rw-r--r-- | arch/arm64/mm/dma-mapping.c | 128 | ||||
-rw-r--r-- | arch/arm64/mm/fault.c | 55 | ||||
-rw-r--r-- | arch/arm64/mm/flush.c | 4 | ||||
-rw-r--r-- | arch/arm64/mm/init.c | 181 | ||||
-rw-r--r-- | arch/arm64/mm/mmu.c | 311 | ||||
-rw-r--r-- | arch/arm64/mm/pageattr.c | 15 |
7 files changed, 528 insertions, 169 deletions
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 68634c630cdd..ab9f5f0fb2c7 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -119,9 +119,6 @@ static void flush_context(unsigned int cpu) /* Queue a TLB invalidate and flush the I-cache if necessary. */ cpumask_setall(&tlb_flush_pending); - - if (icache_is_aivivt()) - __flush_icache_all(); } static bool check_update_reserved_asid(u64 asid, u64 newasid) diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 7f8b37e85a2b..4dac4afc95a5 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -308,24 +308,15 @@ static void __swiotlb_sync_sg_for_device(struct device *dev, sg->length, dir); } -static int __swiotlb_mmap(struct device *dev, - struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) +static int __swiotlb_mmap_pfn(struct vm_area_struct *vma, + unsigned long pfn, size_t size) { int ret = -ENXIO; unsigned long nr_vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long pfn = dma_to_phys(dev, dma_addr) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; - vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot, - is_device_dma_coherent(dev)); - - if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) - return ret; - if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) { ret = remap_pfn_range(vma, vma->vm_start, pfn + off, @@ -336,19 +327,43 @@ static int __swiotlb_mmap(struct device *dev, return ret; } -static int __swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t handle, size_t size, - unsigned long attrs) +static int __swiotlb_mmap(struct device *dev, + struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + int ret; + unsigned long pfn = dma_to_phys(dev, dma_addr) >> PAGE_SHIFT; + + vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot, + is_device_dma_coherent(dev)); + + if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + return __swiotlb_mmap_pfn(vma, pfn, size); +} + +static int __swiotlb_get_sgtable_page(struct sg_table *sgt, + struct page *page, size_t size) { int ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (!ret) - sg_set_page(sgt->sgl, phys_to_page(dma_to_phys(dev, handle)), - PAGE_ALIGN(size), 0); + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); return ret; } +static int __swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t handle, size_t size, + unsigned long attrs) +{ + struct page *page = phys_to_page(dma_to_phys(dev, handle)); + + return __swiotlb_get_sgtable_page(sgt, page, size); +} + static int __swiotlb_dma_supported(struct device *hwdev, u64 mask) { if (swiotlb) @@ -584,20 +599,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, */ gfp |= __GFP_ZERO; - if (gfpflags_allow_blocking(gfp)) { - struct page **pages; - pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent); - - pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot, - handle, flush_page); - if (!pages) - return NULL; - - addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot, - __builtin_return_address(0)); - if (!addr) - iommu_dma_free(dev, pages, iosize, handle); - } else { + if (!gfpflags_allow_blocking(gfp)) { struct page *page; /* * In atomic context we can't remap anything, so we'll only @@ -621,6 +623,45 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, __free_from_pool(addr, size); addr = NULL; } + } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { + pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent); + struct page *page; + + page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, + get_order(size), gfp); + if (!page) + return NULL; + + *handle = iommu_dma_map_page(dev, page, 0, iosize, ioprot); + if (iommu_dma_mapping_error(dev, *handle)) { + dma_release_from_contiguous(dev, page, + size >> PAGE_SHIFT); + return NULL; + } + if (!coherent) + __dma_flush_area(page_to_virt(page), iosize); + + addr = dma_common_contiguous_remap(page, size, VM_USERMAP, + prot, + __builtin_return_address(0)); + if (!addr) { + iommu_dma_unmap_page(dev, *handle, iosize, 0, attrs); + dma_release_from_contiguous(dev, page, + size >> PAGE_SHIFT); + } + } else { + pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent); + struct page **pages; + + pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot, + handle, flush_page); + if (!pages) + return NULL; + + addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot, + __builtin_return_address(0)); + if (!addr) + iommu_dma_free(dev, pages, iosize, handle); } return addr; } @@ -632,7 +673,8 @@ static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr, size = PAGE_ALIGN(size); /* - * @cpu_addr will be one of 3 things depending on how it was allocated: + * @cpu_addr will be one of 4 things depending on how it was allocated: + * - A remapped array of pages for contiguous allocations. * - A remapped array of pages from iommu_dma_alloc(), for all * non-atomic allocations. * - A non-cacheable alias from the atomic pool, for atomic @@ -644,6 +686,12 @@ static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr, if (__in_atomic_pool(cpu_addr, size)) { iommu_dma_unmap_page(dev, handle, iosize, 0, 0); __free_from_pool(cpu_addr, size); + } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { + struct page *page = vmalloc_to_page(cpu_addr); + + iommu_dma_unmap_page(dev, handle, iosize, 0, attrs); + dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT); + dma_common_free_remap(cpu_addr, size, VM_USERMAP); } else if (is_vmalloc_addr(cpu_addr)){ struct vm_struct *area = find_vm_area(cpu_addr); @@ -670,6 +718,15 @@ static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma, if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) return ret; + if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { + /* + * DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped, + * hence in the vmalloc space. + */ + unsigned long pfn = vmalloc_to_pfn(cpu_addr); + return __swiotlb_mmap_pfn(vma, pfn, size); + } + area = find_vm_area(cpu_addr); if (WARN_ON(!area || !area->pages)) return -ENXIO; @@ -684,6 +741,15 @@ static int __iommu_get_sgtable(struct device *dev, struct sg_table *sgt, unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; struct vm_struct *area = find_vm_area(cpu_addr); + if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { + /* + * DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped, + * hence in the vmalloc space. + */ + struct page *page = vmalloc_to_page(cpu_addr); + return __swiotlb_get_sgtable_page(sgt, page, size); + } + if (WARN_ON(!area || !area->pages)) return -ENXIO; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 1b35b8bddbfb..37b95dff0b07 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -174,12 +174,33 @@ static bool is_el1_instruction_abort(unsigned int esr) return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR; } +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs, + unsigned long addr) +{ + unsigned int ec = ESR_ELx_EC(esr); + unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; + + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (fsc_type == ESR_ELx_FSC_PERM) + return true; + + if (addr < USER_DS && system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + + return false; +} + /* * The kernel tried to access some page that wasn't present. */ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int esr, struct pt_regs *regs) { + const char *msg; + /* * Are we prepared to handle this kernel fault? * We are almost certainly not prepared to handle instruction faults. @@ -191,9 +212,20 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, * No handler, we'll have to terminate things with extreme prejudice. */ bust_spinlocks(1); - pr_alert("Unable to handle kernel %s at virtual address %08lx\n", - (addr < PAGE_SIZE) ? "NULL pointer dereference" : - "paging request", addr); + + if (is_permission_fault(esr, regs, addr)) { + if (esr & ESR_ELx_WNR) + msg = "write to read-only memory"; + else + msg = "read from unreadable memory"; + } else if (addr < PAGE_SIZE) { + msg = "NULL pointer dereference"; + } else { + msg = "paging request"; + } + + pr_alert("Unable to handle kernel %s at virtual address %08lx\n", msg, + addr); show_pte(mm, addr); die("Oops", regs, esr); @@ -287,21 +319,6 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) -{ - unsigned int ec = ESR_ELx_EC(esr); - unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - - if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) - return false; - - if (system_uses_ttbr0_pan()) - return fsc_type == ESR_ELx_FSC_FAULT && - (regs->pstate & PSR_PAN_BIT); - else - return fsc_type == ESR_ELx_FSC_PERM; -} - static bool is_el0_instruction_abort(unsigned int esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; @@ -339,7 +356,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (addr < USER_DS && is_permission_fault(esr, regs)) { + if (addr < USER_DS && is_permission_fault(esr, regs, addr)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 554a2558c12e..21a8d828cbf4 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -22,7 +22,7 @@ #include <linux/pagemap.h> #include <asm/cacheflush.h> -#include <asm/cachetype.h> +#include <asm/cache.h> #include <asm/tlbflush.h> void sync_icache_aliases(void *kaddr, unsigned long len) @@ -65,8 +65,6 @@ void __sync_icache_dcache(pte_t pte, unsigned long addr) if (!test_and_set_bit(PG_dcache_clean, &page->flags)) sync_icache_aliases(page_address(page), PAGE_SIZE << compound_order(page)); - else if (icache_is_aivivt()) - __flush_icache_all(); } /* diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index e19e06593e37..5960bef0170d 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -30,6 +30,7 @@ #include <linux/gfp.h> #include <linux/memblock.h> #include <linux/sort.h> +#include <linux/of.h> #include <linux/of_fdt.h> #include <linux/dma-mapping.h> #include <linux/dma-contiguous.h> @@ -37,6 +38,8 @@ #include <linux/swiotlb.h> #include <linux/vmalloc.h> #include <linux/mm.h> +#include <linux/kexec.h> +#include <linux/crash_dump.h> #include <asm/boot.h> #include <asm/fixmap.h> @@ -77,6 +80,142 @@ static int __init early_initrd(char *p) early_param("initrd", early_initrd); #endif +#ifdef CONFIG_KEXEC_CORE +/* + * reserve_crashkernel() - reserves memory for crash kernel + * + * This function reserves memory area given in "crashkernel=" kernel command + * line parameter. The memory reserved is used by dump capture kernel when + * primary kernel is crashing. + */ +static void __init reserve_crashkernel(void) +{ + unsigned long long crash_base, crash_size; + int ret; + + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &crash_size, &crash_base); + /* no crashkernel= or invalid value specified */ + if (ret || !crash_size) + return; + + crash_size = PAGE_ALIGN(crash_size); + + if (crash_base == 0) { + /* Current arm64 boot protocol requires 2MB alignment */ + crash_base = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT, + crash_size, SZ_2M); + if (crash_base == 0) { + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; + } + } else { + /* User specifies base address explicitly. */ + if (!memblock_is_region_memory(crash_base, crash_size)) { + pr_warn("cannot reserve crashkernel: region is not memory\n"); + return; + } + + if (memblock_is_region_reserved(crash_base, crash_size)) { + pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n"); + return; + } + + if (!IS_ALIGNED(crash_base, SZ_2M)) { + pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n"); + return; + } + } + memblock_reserve(crash_base, crash_size); + + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} + +static void __init kexec_reserve_crashkres_pages(void) +{ +#ifdef CONFIG_HIBERNATION + phys_addr_t addr; + struct page *page; + + if (!crashk_res.end) + return; + + /* + * To reduce the size of hibernation image, all the pages are + * marked as Reserved initially. + */ + for (addr = crashk_res.start; addr < (crashk_res.end + 1); + addr += PAGE_SIZE) { + page = phys_to_page(addr); + SetPageReserved(page); + } +#endif +} +#else +static void __init reserve_crashkernel(void) +{ +} + +static void __init kexec_reserve_crashkres_pages(void) +{ +} +#endif /* CONFIG_KEXEC_CORE */ + +#ifdef CONFIG_CRASH_DUMP +static int __init early_init_dt_scan_elfcorehdr(unsigned long node, + const char *uname, int depth, void *data) +{ + const __be32 *reg; + int len; + + if (depth != 1 || strcmp(uname, "chosen") != 0) + return 0; + + reg = of_get_flat_dt_prop(node, "linux,elfcorehdr", &len); + if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells))) + return 1; + + elfcorehdr_addr = dt_mem_next_cell(dt_root_addr_cells, ®); + elfcorehdr_size = dt_mem_next_cell(dt_root_size_cells, ®); + + return 1; +} + +/* + * reserve_elfcorehdr() - reserves memory for elf core header + * + * This function reserves the memory occupied by an elf core header + * described in the device tree. This region contains all the + * information about primary kernel's core image and is used by a dump + * capture kernel to access the system memory on primary kernel. + */ +static void __init reserve_elfcorehdr(void) +{ + of_scan_flat_dt(early_init_dt_scan_elfcorehdr, NULL); + + if (!elfcorehdr_size) + return; + + if (memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) { + pr_warn("elfcorehdr is overlapped\n"); + return; + } + + memblock_reserve(elfcorehdr_addr, elfcorehdr_size); + + pr_info("Reserving %lldKB of memory at 0x%llx for elfcorehdr\n", + elfcorehdr_size >> 10, elfcorehdr_addr); +} +#else +static void __init reserve_elfcorehdr(void) +{ +} +#endif /* CONFIG_CRASH_DUMP */ /* * Return the maximum physical address for ZONE_DMA (DMA_BIT_MASK(32)). It * currently assumes that for memory starting above 4G, 32-bit devices will @@ -188,10 +327,45 @@ static int __init early_mem(char *p) } early_param("mem", early_mem); +static int __init early_init_dt_scan_usablemem(unsigned long node, + const char *uname, int depth, void *data) +{ + struct memblock_region *usablemem = data; + const __be32 *reg; + int len; + + if (depth != 1 || strcmp(uname, "chosen") != 0) + return 0; + + reg = of_get_flat_dt_prop(node, "linux,usable-memory-range", &len); + if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells))) + return 1; + + usablemem->base = dt_mem_next_cell(dt_root_addr_cells, ®); + usablemem->size = dt_mem_next_cell(dt_root_size_cells, ®); + + return 1; +} + +static void __init fdt_enforce_memory_region(void) +{ + struct memblock_region reg = { + .size = 0, + }; + + of_scan_flat_dt(early_init_dt_scan_usablemem, ®); + + if (reg.size) + memblock_cap_memory_range(reg.base, reg.size); +} + void __init arm64_memblock_init(void) { const s64 linear_region_size = -(s64)PAGE_OFFSET; + /* Handle linux,usable-memory-range property */ + fdt_enforce_memory_region(); + /* * Ensure that the linear region takes up exactly half of the kernel * virtual address space. This way, we can distinguish a linear address @@ -297,6 +471,11 @@ void __init arm64_memblock_init(void) arm64_dma_phys_limit = max_zone_dma_phys(); else arm64_dma_phys_limit = PHYS_MASK + 1; + + reserve_crashkernel(); + + reserve_elfcorehdr(); + dma_contiguous_reserve(arm64_dma_phys_limit); memblock_allow_resize(); @@ -416,6 +595,8 @@ void __init mem_init(void) /* this will put all unused low memory onto the freelists */ free_all_bootmem(); + kexec_reserve_crashkres_pages(); + mem_init_print_info(NULL); #define MLK(b, t) b, t, ((t) - (b)) >> 10 diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d28dbcf596b6..0c429ec6fde8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -22,6 +22,8 @@ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> +#include <linux/ioport.h> +#include <linux/kexec.h> #include <linux/libfdt.h> #include <linux/mman.h> #include <linux/nodemask.h> @@ -43,6 +45,9 @@ #include <asm/mmu_context.h> #include <asm/ptdump.h> +#define NO_BLOCK_MAPPINGS BIT(0) +#define NO_CONT_MAPPINGS BIT(1) + u64 idmap_t0sz = TCR_T0SZ(VA_BITS); u64 kimage_voffset __ro_after_init; @@ -103,33 +108,27 @@ static bool pgattr_change_is_safe(u64 old, u64 new) */ static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE; - return old == 0 || new == 0 || ((old ^ new) & ~mask) == 0; + /* creating or taking down mappings is always safe */ + if (old == 0 || new == 0) + return true; + + /* live contiguous mappings may not be manipulated at all */ + if ((old | new) & PTE_CONT) + return false; + + return ((old ^ new) & ~mask) == 0; } -static void alloc_init_pte(pmd_t *pmd, unsigned long addr, - unsigned long end, unsigned long pfn, - pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void)) +static void init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot) { pte_t *pte; - BUG_ON(pmd_sect(*pmd)); - if (pmd_none(*pmd)) { - phys_addr_t pte_phys; - BUG_ON(!pgtable_alloc); - pte_phys = pgtable_alloc(); - pte = pte_set_fixmap(pte_phys); - __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); - pte_clear_fixmap(); - } - BUG_ON(pmd_bad(*pmd)); - pte = pte_set_fixmap_offset(pmd, addr); do { pte_t old_pte = *pte; - set_pte(pte, pfn_pte(pfn, prot)); - pfn++; + set_pte(pte, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we @@ -137,32 +136,51 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), pte_val(*pte))); + phys += PAGE_SIZE; } while (pte++, addr += PAGE_SIZE, addr != end); pte_clear_fixmap(); } -static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, - phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), - bool page_mappings_only) +static void alloc_init_cont_pte(pmd_t *pmd, unsigned long addr, + unsigned long end, phys_addr_t phys, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(void), + int flags) { - pmd_t *pmd; unsigned long next; - /* - * Check for initial section mappings in the pgd/pud and remove them. - */ - BUG_ON(pud_sect(*pud)); - if (pud_none(*pud)) { - phys_addr_t pmd_phys; + BUG_ON(pmd_sect(*pmd)); + if (pmd_none(*pmd)) { + phys_addr_t pte_phys; BUG_ON(!pgtable_alloc); - pmd_phys = pgtable_alloc(); - pmd = pmd_set_fixmap(pmd_phys); - __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); - pmd_clear_fixmap(); + pte_phys = pgtable_alloc(); + __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); } - BUG_ON(pud_bad(*pud)); + BUG_ON(pmd_bad(*pmd)); + + do { + pgprot_t __prot = prot; + + next = pte_cont_addr_end(addr, end); + + /* use a contiguous mapping if the range is suitably aligned */ + if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) && + (flags & NO_CONT_MAPPINGS) == 0) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + init_pte(pmd, addr, next, phys, __prot); + + phys += next - addr; + } while (addr = next, addr != end); +} + +static void init_pmd(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(void), int flags) +{ + unsigned long next; + pmd_t *pmd; pmd = pmd_set_fixmap_offset(pud, addr); do { @@ -172,7 +190,7 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, /* try section mapping first */ if (((addr | next | phys) & ~SECTION_MASK) == 0 && - !page_mappings_only) { + (flags & NO_BLOCK_MAPPINGS) == 0) { pmd_set_huge(pmd, phys, prot); /* @@ -182,8 +200,8 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), pmd_val(*pmd))); } else { - alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), - prot, pgtable_alloc); + alloc_init_cont_pte(pmd, addr, next, phys, prot, + pgtable_alloc, flags); BUG_ON(pmd_val(old_pmd) != 0 && pmd_val(old_pmd) != pmd_val(*pmd)); @@ -194,6 +212,41 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, pmd_clear_fixmap(); } +static void alloc_init_cont_pmd(pud_t *pud, unsigned long addr, + unsigned long end, phys_addr_t phys, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(void), int flags) +{ + unsigned long next; + + /* + * Check for initial section mappings in the pgd/pud. + */ + BUG_ON(pud_sect(*pud)); + if (pud_none(*pud)) { + phys_addr_t pmd_phys; + BUG_ON(!pgtable_alloc); + pmd_phys = pgtable_alloc(); + __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); + } + BUG_ON(pud_bad(*pud)); + + do { + pgprot_t __prot = prot; + + next = pmd_cont_addr_end(addr, end); + + /* use a contiguous mapping if the range is suitably aligned */ + if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) && + (flags & NO_CONT_MAPPINGS) == 0) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + init_pmd(pud, addr, next, phys, __prot, pgtable_alloc, flags); + + phys += next - addr; + } while (addr = next, addr != end); +} + static inline bool use_1G_block(unsigned long addr, unsigned long next, unsigned long phys) { @@ -209,7 +262,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool page_mappings_only) + int flags) { pud_t *pud; unsigned long next; @@ -231,7 +284,8 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, /* * For 4K granule only, attempt to put down a 1GB block */ - if (use_1G_block(addr, next, phys) && !page_mappings_only) { + if (use_1G_block(addr, next, phys) && + (flags & NO_BLOCK_MAPPINGS) == 0) { pud_set_huge(pud, phys, prot); /* @@ -241,8 +295,8 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), pud_val(*pud))); } else { - alloc_init_pmd(pud, addr, next, phys, prot, - pgtable_alloc, page_mappings_only); + alloc_init_cont_pmd(pud, addr, next, phys, prot, + pgtable_alloc, flags); BUG_ON(pud_val(old_pud) != 0 && pud_val(old_pud) != pud_val(*pud)); @@ -257,7 +311,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool page_mappings_only) + int flags) { unsigned long addr, length, end, next; pgd_t *pgd = pgd_offset_raw(pgdir, virt); @@ -277,7 +331,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, do { next = pgd_addr_end(addr, end); alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc, - page_mappings_only); + flags); phys += next - addr; } while (pgd++, addr = next, addr != end); } @@ -306,82 +360,80 @@ static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, false); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, bool page_mappings_only) { + int flags = 0; + BUG_ON(mm == &init_mm); + if (page_mappings_only) + flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; + __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - pgd_pgtable_alloc, page_mappings_only); + pgd_pgtable_alloc, flags); } -static void create_mapping_late(phys_addr_t phys, unsigned long virt, - phys_addr_t size, pgprot_t prot) +static void update_mapping_prot(phys_addr_t phys, unsigned long virt, + phys_addr_t size, pgprot_t prot) { if (virt < VMALLOC_START) { - pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", + pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n", &phys, virt); return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, - NULL, debug_pagealloc_enabled()); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); + + /* flush the TLBs after updating live kernel mappings */ + flush_tlb_kernel_range(virt, virt + size); } -static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) +static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, + phys_addr_t end, pgprot_t prot, int flags) { - phys_addr_t kernel_start = __pa_symbol(_text); - phys_addr_t kernel_end = __pa_symbol(__init_begin); - - /* - * Take care not to create a writable alias for the - * read-only text and rodata sections of the kernel image. - */ - - /* No overlap with the kernel text/rodata */ - if (end < kernel_start || start >= kernel_end) { - __create_pgd_mapping(pgd, start, __phys_to_virt(start), - end - start, PAGE_KERNEL, - early_pgtable_alloc, - debug_pagealloc_enabled()); - return; - } - - /* - * This block overlaps the kernel text/rodata mappings. - * Map the portion(s) which don't overlap. - */ - if (start < kernel_start) - __create_pgd_mapping(pgd, start, - __phys_to_virt(start), - kernel_start - start, PAGE_KERNEL, - early_pgtable_alloc, - debug_pagealloc_enabled()); - if (kernel_end < end) - __create_pgd_mapping(pgd, kernel_end, - __phys_to_virt(kernel_end), - end - kernel_end, PAGE_KERNEL, - early_pgtable_alloc, - debug_pagealloc_enabled()); + __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, + prot, early_pgtable_alloc, flags); +} +void __init mark_linear_text_alias_ro(void) +{ /* - * Map the linear alias of the [_text, __init_begin) interval as - * read-only/non-executable. This makes the contents of the - * region accessible to subsystems such as hibernate, but - * protects it from inadvertent modification or execution. + * Remove the write permissions from the linear alias of .text/.rodata */ - __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), - kernel_end - kernel_start, PAGE_KERNEL_RO, - early_pgtable_alloc, debug_pagealloc_enabled()); + update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), + (unsigned long)__init_begin - (unsigned long)_text, + PAGE_KERNEL_RO); } static void __init map_mem(pgd_t *pgd) { + phys_addr_t kernel_start = __pa_symbol(_text); + phys_addr_t kernel_end = __pa_symbol(__init_begin); struct memblock_region *reg; + int flags = 0; + + if (debug_pagealloc_enabled()) + flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; + + /* + * Take care not to create a writable alias for the + * read-only text and rodata sections of the kernel image. + * So temporarily mark them as NOMAP to skip mappings in + * the following for-loop + */ + memblock_mark_nomap(kernel_start, kernel_end - kernel_start); +#ifdef CONFIG_KEXEC_CORE + if (crashk_res.end) + memblock_mark_nomap(crashk_res.start, + resource_size(&crashk_res)); +#endif /* map all the memory banks */ for_each_memblock(memory, reg) { @@ -393,33 +445,57 @@ static void __init map_mem(pgd_t *pgd) if (memblock_is_nomap(reg)) continue; - __map_memblock(pgd, start, end); + __map_memblock(pgd, start, end, PAGE_KERNEL, flags); + } + + /* + * Map the linear alias of the [_text, __init_begin) interval + * as non-executable now, and remove the write permission in + * mark_linear_text_alias_ro() below (which will be called after + * alternative patching has completed). This makes the contents + * of the region accessible to subsystems such as hibernate, + * but protects it from inadvertent modification or execution. + * Note that contiguous mappings cannot be remapped in this way, + * so we should avoid them here. + */ + __map_memblock(pgd, kernel_start, kernel_end, + PAGE_KERNEL, NO_CONT_MAPPINGS); + memblock_clear_nomap(kernel_start, kernel_end - kernel_start); + +#ifdef CONFIG_KEXEC_CORE + /* + * Use page-level mappings here so that we can shrink the region + * in page granularity and put back unused memory to buddy system + * through /sys/kernel/kexec_crash_size interface. + */ + if (crashk_res.end) { + __map_memblock(pgd, crashk_res.start, crashk_res.end + 1, + PAGE_KERNEL, + NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); + memblock_clear_nomap(crashk_res.start, + resource_size(&crashk_res)); } +#endif } void mark_rodata_ro(void) { unsigned long section_size; - section_size = (unsigned long)_etext - (unsigned long)_text; - create_mapping_late(__pa_symbol(_text), (unsigned long)_text, - section_size, PAGE_KERNEL_ROX); /* * mark .rodata as read only. Use __init_begin rather than __end_rodata * to cover NOTES and EXCEPTION_TABLE. */ section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; - create_mapping_late(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, + update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); - /* flush the TLBs after updating live kernel mappings */ - flush_tlb_all(); - debug_checkwx(); } static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, - pgprot_t prot, struct vm_struct *vma) + pgprot_t prot, struct vm_struct *vma, + int flags) { phys_addr_t pa_start = __pa_symbol(va_start); unsigned long size = va_end - va_start; @@ -428,7 +504,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(size)); __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, - early_pgtable_alloc, debug_pagealloc_enabled()); + early_pgtable_alloc, flags); vma->addr = va_start; vma->phys_addr = pa_start; @@ -439,18 +515,39 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, vm_area_add_early(vma); } +static int __init parse_rodata(char *arg) +{ + return strtobool(arg, &rodata_enabled); +} +early_param("rodata", parse_rodata); + /* * Create fine-grained mappings for the kernel. */ static void __init map_kernel(pgd_t *pgd) { - static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; + static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext, + vmlinux_initdata, vmlinux_data; - map_kernel_segment(pgd, _text, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); - map_kernel_segment(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata); - map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, - &vmlinux_init); - map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); + /* + * External debuggers may need to write directly to the text + * mapping to install SW breakpoints. Allow this (only) when + * explicitly requested with rodata=off. + */ + pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; + + /* + * Only rodata will be remapped with different permissions later on, + * all other segments are allowed to use contiguous mappings. + */ + map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text, 0); + map_kernel_segment(pgd, __start_rodata, __inittext_begin, PAGE_KERNEL, + &vmlinux_rodata, NO_CONT_MAPPINGS); + map_kernel_segment(pgd, __inittext_begin, __inittext_end, text_prot, + &vmlinux_inittext, 0); + map_kernel_segment(pgd, __initdata_begin, __initdata_end, PAGE_KERNEL, + &vmlinux_initdata, 0); + map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data, 0); if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { /* diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 8def55e7249b..3212ee0558f6 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -125,20 +125,23 @@ int set_memory_x(unsigned long addr, int numpages) } EXPORT_SYMBOL_GPL(set_memory_x); -#ifdef CONFIG_DEBUG_PAGEALLOC -void __kernel_map_pages(struct page *page, int numpages, int enable) +int set_memory_valid(unsigned long addr, int numpages, int enable) { - unsigned long addr = (unsigned long) page_address(page); - if (enable) - __change_memory_common(addr, PAGE_SIZE * numpages, + return __change_memory_common(addr, PAGE_SIZE * numpages, __pgprot(PTE_VALID), __pgprot(0)); else - __change_memory_common(addr, PAGE_SIZE * numpages, + return __change_memory_common(addr, PAGE_SIZE * numpages, __pgprot(0), __pgprot(PTE_VALID)); } + +#ifdef CONFIG_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + set_memory_valid((unsigned long)page_address(page), numpages, enable); +} #ifdef CONFIG_HIBERNATION /* * When built with CONFIG_DEBUG_PAGEALLOC and CONFIG_HIBERNATION, this function |