From bb1520d581a3a46e2d6e12bb74604ace33404de5 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 13 Dec 2022 11:35:11 +0100 Subject: s390/mm: start kernel with DAT enabled The setup of the kernel virtual address space is spread throughout the sources, boot stages and config options like this: 1. The available physical memory regions are queried and stored as mem_detect information for later use in the decompressor. 2. Based on the physical memory availability the virtual memory layout is established in the decompressor; 3. If CONFIG_KASAN is disabled the kernel paging setup code populates kernel pgtables and turns DAT mode on. It uses the information stored at step [1]. 4. If CONFIG_KASAN is enabled the kernel early boot kasan setup populates kernel pgtables and turns DAT mode on. It uses the information stored at step [1]. The kasan setup creates early_pg_dir directory and directly overwrites swapper_pg_dir entries to make shadow memory pages available. Move the kernel virtual memory setup to the decompressor and start the kernel with DAT turned on right from the very first istruction. That completely eliminates the boot phase when the kernel runs in DAT-off mode, simplies the overall design and consolidates pgtables setup. The identity mapping is created in the decompressor, while kasan shadow mappings are still created by the early boot kernel code. Share with decompressor the existing kasan memory allocator. It decreases the size of a newly requested memory block from pgalloc_pos and ensures that kernel image is not overwritten. pgalloc_low and pgalloc_pos pointers are made preserved boot variables for that. Use the bootdata infrastructure to setup swapper_pg_dir and invalid_pg_dir directories used by the kernel later. The interim early_pg_dir directory established by the kasan initialization code gets eliminated as result. As the kernel runs in DAT-on mode only the PSW_KERNEL_BITS define gets PSW_MASK_DAT bit by default. Additionally, the setup_lowcore_dat_off() and setup_lowcore_dat_on() routines get merged, since there is no DAT-off mode stage anymore. The memory mappings are created with RW+X protection that allows the early boot code setting up all necessary data and services for the kernel being booted. Just before the paging is enabled the memory protection is changed to RO+X for text, RO+NX for read-only data and RW+NX for kernel data and the identity mapping. Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/boot/Makefile | 2 +- arch/s390/boot/boot.h | 6 +- arch/s390/boot/startup.c | 45 +++++-- arch/s390/boot/vmem.c | 254 ++++++++++++++++++++++++++++++++++++++++ arch/s390/include/asm/kasan.h | 4 - arch/s390/include/asm/pgtable.h | 1 + arch/s390/include/asm/ptrace.h | 2 +- arch/s390/include/asm/setup.h | 3 + arch/s390/kernel/early.c | 5 +- arch/s390/kernel/idle.c | 4 +- arch/s390/kernel/process.c | 4 +- arch/s390/kernel/setup.c | 84 ++++++------- arch/s390/kernel/smp.c | 4 +- arch/s390/kernel/vmlinux.lds.S | 3 + arch/s390/mm/init.c | 35 +----- arch/s390/mm/kasan_init.c | 85 +------------- arch/s390/mm/vmem.c | 96 +++++++++++++-- 17 files changed, 448 insertions(+), 189 deletions(-) create mode 100644 arch/s390/boot/vmem.c (limited to 'arch/s390') diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index d52c3e2e16bc..47a397da0498 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -35,7 +35,7 @@ endif CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char -obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o +obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o vmem.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o obj-y += version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 286441cf3bf0..547614496e7e 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -16,7 +16,7 @@ struct machine_info { struct vmlinux_info { unsigned long default_lma; - void (*entry)(void); + unsigned long entry; unsigned long image_size; /* does not include .bss */ unsigned long bss_size; /* uncompressed image .bss size */ unsigned long bootdata_off; @@ -27,6 +27,9 @@ struct vmlinux_info { unsigned long rela_dyn_start; unsigned long rela_dyn_end; unsigned long amode31_size; + unsigned long init_mm_off; + unsigned long swapper_pg_dir_off; + unsigned long invalid_pg_dir_off; }; void startup_kernel(void); @@ -41,6 +44,7 @@ void print_missing_facilities(void); void sclp_early_setup_buffer(void); void print_pgm_check_info(void); unsigned long get_random_base(unsigned long safe_addr); +void setup_vmem(unsigned long online_end, unsigned long asce_limit); void __printf(1, 2) decompressor_printk(const char *fmt, ...); void error(char *m); diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index da6ee587fe9a..c5d59df9fa62 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "decompressor.h" #include "boot.h" #include "uv.h" @@ -166,9 +167,10 @@ static void setup_ident_map_size(unsigned long max_physmem_end) #endif } -static void setup_kernel_memory_layout(void) +static unsigned long setup_kernel_memory_layout(void) { unsigned long vmemmap_start; + unsigned long asce_limit; unsigned long rte_size; unsigned long pages; unsigned long vmax; @@ -183,10 +185,10 @@ static void setup_kernel_memory_layout(void) vmalloc_size > _REGION2_SIZE || vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN > _REGION2_SIZE) { - vmax = _REGION1_SIZE; + asce_limit = _REGION1_SIZE; rte_size = _REGION2_SIZE; } else { - vmax = _REGION2_SIZE; + asce_limit = _REGION2_SIZE; rte_size = _REGION3_SIZE; } /* @@ -194,7 +196,7 @@ static void setup_kernel_memory_layout(void) * secure storage limit, so that any vmalloc allocation * we do could be used to back secure guest storage. */ - vmax = adjust_to_uv_max(vmax); + vmax = adjust_to_uv_max(asce_limit); #ifdef CONFIG_KASAN /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); @@ -223,6 +225,8 @@ static void setup_kernel_memory_layout(void) /* make sure vmemmap doesn't overlay with vmalloc area */ VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); vmemmap = (struct page *)vmemmap_start; + + return asce_limit; } /* @@ -256,6 +260,9 @@ static void offset_vmlinux_info(unsigned long offset) vmlinux.rela_dyn_start += offset; vmlinux.rela_dyn_end += offset; vmlinux.dynsym_start += offset; + vmlinux.init_mm_off += offset; + vmlinux.swapper_pg_dir_off += offset; + vmlinux.invalid_pg_dir_off += offset; } static unsigned long reserve_amode31(unsigned long safe_addr) @@ -268,7 +275,10 @@ void startup_kernel(void) { unsigned long random_lma; unsigned long safe_addr; + unsigned long asce_limit; + unsigned long online_end; void *img; + psw_t psw; detect_facilities(); @@ -290,7 +300,8 @@ void startup_kernel(void) sanitize_prot_virt_host(); setup_ident_map_size(detect_memory()); setup_vmalloc_size(); - setup_kernel_memory_layout(); + asce_limit = setup_kernel_memory_layout(); + online_end = min(get_mem_detect_end(), ident_map_size); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) { random_lma = get_random_base(safe_addr); @@ -307,9 +318,23 @@ void startup_kernel(void) } else if (__kaslr_offset) memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size); + /* + * The order of the following operations is important: + * + * - handle_relocs() must follow clear_bss_section() to establish static + * memory references to data in .bss to be used by setup_vmem() + * (i.e init_mm.pgd) + * + * - setup_vmem() must follow handle_relocs() to be able using + * static memory references to data in .bss (i.e init_mm.pgd) + * + * - copy_bootdata() must follow setup_vmem() to propagate changes to + * bootdata made by setup_vmem() + */ clear_bss_section(); - copy_bootdata(); handle_relocs(__kaslr_offset); + setup_vmem(online_end, asce_limit); + copy_bootdata(); if (__kaslr_offset) { /* @@ -321,5 +346,11 @@ void startup_kernel(void) if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) memset(img, 0, vmlinux.image_size); } - vmlinux.entry(); + + /* + * Jump to the decompressed kernel entry point and switch DAT mode on. + */ + psw.addr = vmlinux.entry; + psw.mask = PSW_KERNEL_BITS; + __load_psw(psw); } diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c new file mode 100644 index 000000000000..db1469c17289 --- /dev/null +++ b/arch/s390/boot/vmem.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include "decompressor.h" +#include "boot.h" + +#define init_mm (*(struct mm_struct *)vmlinux.init_mm_off) +#define swapper_pg_dir vmlinux.swapper_pg_dir_off +#define invalid_pg_dir vmlinux.invalid_pg_dir_off + +unsigned long __bootdata_preserved(s390_invalid_asce); +unsigned long __bootdata(pgalloc_pos); +unsigned long __bootdata(pgalloc_end); +unsigned long __bootdata(pgalloc_low); + +static void boot_check_oom(void) +{ + if (pgalloc_pos < pgalloc_low) + error("out of memory on boot\n"); +} + +static void pgtable_populate_begin(unsigned long online_end) +{ + unsigned long initrd_end; + unsigned long kernel_end; + + kernel_end = vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size; + pgalloc_low = round_up(kernel_end, PAGE_SIZE); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { + initrd_end = round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); + pgalloc_low = max(pgalloc_low, initrd_end); + } + + pgalloc_end = round_down(online_end, PAGE_SIZE); + pgalloc_pos = pgalloc_end; + + boot_check_oom(); +} + +static void *boot_alloc_pages(unsigned int order) +{ + unsigned long size = PAGE_SIZE << order; + + pgalloc_pos -= size; + pgalloc_pos = round_down(pgalloc_pos, size); + + boot_check_oom(); + + return (void *)pgalloc_pos; +} + +static void *boot_crst_alloc(unsigned long val) +{ + unsigned long *table; + + table = boot_alloc_pages(CRST_ALLOC_ORDER); + if (table) + crst_table_init(table, val); + return table; +} + +static pte_t *boot_pte_alloc(void) +{ + static void *pte_leftover; + pte_t *pte; + + BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); + + if (!pte_leftover) { + pte_leftover = boot_alloc_pages(0); + pte = pte_leftover + _PAGE_TABLE_SIZE; + } else { + pte = pte_leftover; + pte_leftover = NULL; + } + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat2 && + IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE; +} + +static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat1 && + IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE; +} + +static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end) +{ + unsigned long next; + pte_t *pte, entry; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (pte_none(*pte)) { + entry = __pte(__pa(addr)); + entry = set_pte_bit(entry, PAGE_KERNEL_EXEC); + set_pte(pte, entry); + } + } +} + +static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + pmd_t *pmd, entry; + pte_t *pte; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) { + if (can_large_pmd(pmd, addr, next)) { + entry = __pmd(__pa(addr)); + entry = set_pmd_bit(entry, SEGMENT_KERNEL_EXEC); + set_pmd(pmd, entry); + continue; + } + pte = boot_pte_alloc(); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + continue; + } + pgtable_pte_populate(pmd, addr, next); + } +} + +static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) +{ + unsigned long next; + pud_t *pud, entry; + pmd_t *pmd; + + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (pud_none(*pud)) { + if (can_large_pud(pud, addr, next)) { + entry = __pud(__pa(addr)); + entry = set_pud_bit(entry, REGION3_KERNEL_EXEC); + set_pud(pud, entry); + continue; + } + pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY); + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { + continue; + } + pgtable_pmd_populate(pud, addr, next); + } +} + +static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) { + pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY); + p4d_populate(&init_mm, p4d, pud); + } + pgtable_pud_populate(p4d, addr, next); + } +} + +static void pgtable_populate(unsigned long addr, unsigned long end) +{ + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + + pgd = pgd_offset(&init_mm, addr); + for (; addr < end; addr = next, pgd++) { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgd)) { + p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY); + pgd_populate(&init_mm, pgd, p4d); + } + pgtable_p4d_populate(pgd, addr, next); + } +} + +/* + * The pgtables are located in the range [pgalloc_pos, pgalloc_end). + * That range must stay intact and is later reserved in the memblock. + * Therefore pgtable_populate(pgalloc_pos, pgalloc_end) is needed to + * finalize pgalloc_pos pointer. However that call can decrease the + * value of pgalloc_pos pointer itself. Therefore, pgtable_populate() + * needs to be called repeatedly until pgtables are complete and + * pgalloc_pos does not grow left anymore. + */ +static void pgtable_populate_end(void) +{ + unsigned long pgalloc_end_curr = pgalloc_end; + unsigned long pgalloc_pos_prev; + + do { + pgalloc_pos_prev = pgalloc_pos; + pgtable_populate(pgalloc_pos, pgalloc_end_curr); + pgalloc_end_curr = pgalloc_pos_prev; + } while (pgalloc_pos < pgalloc_pos_prev); +} + +void setup_vmem(unsigned long online_end, unsigned long asce_limit) +{ + unsigned long asce_type; + unsigned long asce_bits; + + if (asce_limit == _REGION1_SIZE) { + asce_type = _REGION2_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + asce_type = _REGION3_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } + s390_invalid_asce = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + + crst_table_init((unsigned long *)swapper_pg_dir, asce_type); + crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); + + /* + * To allow prefixing the lowcore must be mapped with 4KB pages. + * To prevent creation of a large page at address 0 first map + * the lowcore and create the identity mapping only afterwards. + * + * No further pgtable_populate() calls are allowed after the value + * of pgalloc_pos finalized with a call to pgtable_populate_end(). + */ + pgtable_populate_begin(online_end); + pgtable_populate(0, sizeof(struct lowcore)); + pgtable_populate(0, online_end); + pgtable_populate_end(); + + S390_lowcore.kernel_asce = swapper_pg_dir | asce_bits; + S390_lowcore.user_asce = s390_invalid_asce; + + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + __ctl_load(S390_lowcore.user_asce, 7, 7); + __ctl_load(S390_lowcore.kernel_asce, 13, 13); + + init_mm.context.asce = S390_lowcore.kernel_asce; +} diff --git a/arch/s390/include/asm/kasan.h b/arch/s390/include/asm/kasan.h index 2768d5db181f..f7244cc16240 100644 --- a/arch/s390/include/asm/kasan.h +++ b/arch/s390/include/asm/kasan.h @@ -14,8 +14,6 @@ #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) extern void kasan_early_init(void); -extern void kasan_copy_shadow_mapping(void); -extern void kasan_free_early_identity(void); /* * Estimate kasan memory requirements, which it will reserve @@ -43,8 +41,6 @@ static inline unsigned long kasan_estimate_memory_needs(unsigned long physmem) } #else static inline void kasan_early_init(void) { } -static inline void kasan_copy_shadow_mapping(void) { } -static inline void kasan_free_early_identity(void) { } static inline unsigned long kasan_estimate_memory_needs(unsigned long physmem) { return 0; } #endif diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0170f95f3b91..0f1eba005f6d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -23,6 +23,7 @@ #include extern pgd_t swapper_pg_dir[]; +extern pgd_t invalid_pg_dir[]; extern void paging_init(void); extern unsigned long s390_invalid_asce; diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 8bae33ab320a..bfb8c3cb8aee 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -26,7 +26,7 @@ #ifndef __ASSEMBLY__ #define PSW_KERNEL_BITS (PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_HOME | \ - PSW_MASK_EA | PSW_MASK_BA) + PSW_MASK_EA | PSW_MASK_BA | PSW_MASK_DAT) #define PSW_USER_BITS (PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | \ PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \ PSW_MASK_PSTATE | PSW_ASC_PRIMARY) diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 77e6506898f5..6792ce28d37a 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -73,6 +73,9 @@ extern unsigned int zlib_dfltcc_support; extern int noexec_disabled; extern unsigned long ident_map_size; +extern unsigned long pgalloc_pos; +extern unsigned long pgalloc_end; +extern unsigned long pgalloc_low; /* The Write Back bit position in the physaddr is given by the SLPC PCI */ extern unsigned long mio_wb_bit_mask; diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 9693c8630e73..9cfd9f4fc927 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -160,9 +161,7 @@ static noinline __init void setup_lowcore_early(void) psw_t psw; psw.addr = (unsigned long)early_pgm_check_handler; - psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA; - if (IS_ENABLED(CONFIG_KASAN)) - psw.mask |= PSW_MASK_DAT; + psw.mask = PSW_KERNEL_BITS; S390_lowcore.program_new_psw = psw; S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; } diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 4bf1ee293f2b..a8aebb5c95cf 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -51,8 +51,8 @@ void arch_cpu_idle(void) unsigned long psw_mask; /* Wait for external, I/O or machine check interrupt. */ - psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; clear_cpu_flag(CIF_NOHZ_DELAY); /* psw_idle() returns with interrupts disabled. */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 3f5d2db0b854..67df64ef4839 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -147,8 +147,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (unlikely(args->fn)) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); - frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | + PSW_MASK_EXT | PSW_MASK_MCHECK; frame->childregs.psw.addr = (unsigned long)__ret_from_fork; frame->childregs.gprs[9] = (unsigned long)args->fn; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 2b6091349daa..1ffaa85cd518 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -149,6 +149,9 @@ int __bootdata(noexec_disabled); unsigned long __bootdata(ident_map_size); struct mem_detect_info __bootdata(mem_detect); struct initrd_data __bootdata(initrd_data); +unsigned long __bootdata(pgalloc_pos); +unsigned long __bootdata(pgalloc_end); +unsigned long __bootdata(pgalloc_low); unsigned long __bootdata_preserved(__kaslr_offset); unsigned long __bootdata(__amode31_base); @@ -411,16 +414,12 @@ void __init arch_call_rest_init(void) call_on_stack_noreturn(rest_init, stack); } -static void __init setup_lowcore_dat_off(void) +static void __init setup_lowcore(void) { - unsigned long int_psw_mask = PSW_KERNEL_BITS; - struct lowcore *abs_lc, *lc; + struct lowcore *lc, *abs_lc; unsigned long mcck_stack; unsigned long flags; - if (IS_ENABLED(CONFIG_KASAN)) - int_psw_mask |= PSW_MASK_DAT; - /* * Setup lowcore for boot cpu */ @@ -430,17 +429,17 @@ static void __init setup_lowcore_dat_off(void) panic("%s: Failed to allocate %zu bytes align=%zx\n", __func__, sizeof(*lc), sizeof(*lc)); - lc->restart_psw.mask = PSW_KERNEL_BITS; - lc->restart_psw.addr = (unsigned long) restart_int_handler; - lc->external_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; + lc->restart_psw.addr = __pa(restart_int_handler); + lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->external_new_psw.addr = (unsigned long) ext_int_handler; - lc->svc_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->svc_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->svc_new_psw.addr = (unsigned long) system_call; - lc->program_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->program_new_psw.addr = (unsigned long) pgm_check_handler; - lc->mcck_new_psw.mask = int_psw_mask; + lc->mcck_new_psw.mask = PSW_KERNEL_BITS; lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler; - lc->io_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->io_new_psw.addr = (unsigned long) io_int_handler; lc->clock_comparator = clock_comparator_max; lc->nodat_stack = ((unsigned long) &init_thread_union) @@ -477,15 +476,7 @@ static void __init setup_lowcore_dat_off(void) lc->restart_fn = (unsigned long) do_restart; lc->restart_data = 0; lc->restart_source = -1U; - - abs_lc = get_abs_lowcore(&flags); - abs_lc->restart_stack = lc->restart_stack; - abs_lc->restart_fn = lc->restart_fn; - abs_lc->restart_data = lc->restart_data; - abs_lc->restart_source = lc->restart_source; - abs_lc->restart_psw = lc->restart_psw; - abs_lc->mcesad = lc->mcesad; - put_abs_lowcore(abs_lc, flags); + __ctl_store(lc->cregs_save_area, 0, 15); mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); if (!mcck_stack) @@ -499,33 +490,26 @@ static void __init setup_lowcore_dat_off(void) lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; + lc->kernel_asce = S390_lowcore.kernel_asce; + lc->user_asce = S390_lowcore.user_asce; + + abs_lc = get_abs_lowcore(&flags); + abs_lc->restart_stack = lc->restart_stack; + abs_lc->restart_fn = lc->restart_fn; + abs_lc->restart_data = lc->restart_data; + abs_lc->restart_source = lc->restart_source; + abs_lc->restart_psw = lc->restart_psw; + abs_lc->restart_flags = RESTART_FLAG_CTLREGS; + memcpy(abs_lc->cregs_save_area, lc->cregs_save_area, sizeof(abs_lc->cregs_save_area)); + abs_lc->program_new_psw = lc->program_new_psw; + abs_lc->mcesad = lc->mcesad; + put_abs_lowcore(abs_lc, flags); set_prefix(__pa(lc)); lowcore_ptr[0] = lc; -} - -static void __init setup_lowcore_dat_on(void) -{ - struct lowcore *abs_lc; - unsigned long flags; - - __ctl_clear_bit(0, 28); - S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.mcck_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT; - __ctl_set_bit(0, 28); - __ctl_store(S390_lowcore.cregs_save_area, 0, 15); if (abs_lowcore_map(0, lowcore_ptr[0], true)) panic("Couldn't setup absolute lowcore"); abs_lowcore_mapped = true; - abs_lc = get_abs_lowcore(&flags); - abs_lc->restart_flags = RESTART_FLAG_CTLREGS; - abs_lc->program_new_psw = S390_lowcore.program_new_psw; - memcpy(abs_lc->cregs_save_area, S390_lowcore.cregs_save_area, - sizeof(abs_lc->cregs_save_area)); - put_abs_lowcore(abs_lc, flags); } static struct resource code_resource = { @@ -649,6 +633,14 @@ static struct notifier_block kdump_mem_nb = { #endif +/* + * Reserve page tables created by decompressor + */ +static void __init reserve_pgtables(void) +{ + memblock_reserve(pgalloc_pos, pgalloc_end - pgalloc_pos); +} + /* * Reserve memory for kdump kernel to be loaded with kexec */ @@ -1004,6 +996,7 @@ void __init setup_arch(char **cmdline_p) setup_control_program_code(); /* Do some memory reservations *before* memory is added to memblock */ + reserve_pgtables(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); @@ -1038,7 +1031,7 @@ void __init setup_arch(char **cmdline_p) #endif setup_resources(); - setup_lowcore_dat_off(); + setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); cpu_init(); @@ -1050,7 +1043,7 @@ void __init setup_arch(char **cmdline_p) static_branch_enable(&cpu_has_bear); /* - * Create kernel page tables and switch to virtual addressing. + * Create kernel page tables. */ paging_init(); memcpy_real_init(); @@ -1058,7 +1051,6 @@ void __init setup_arch(char **cmdline_p) * After paging_init created the kernel page table, the new PSWs * in lowcore can now run with DAT enabled. */ - setup_lowcore_dat_on(); #ifdef CONFIG_CRASH_DUMP smp_save_dump_ipl_cpu(); #endif diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 0031325ce4bc..24f19f10b237 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -327,7 +327,7 @@ static void pcpu_delegate(struct pcpu *pcpu, lc = lowcore_ptr[pcpu - pcpu_devices]; source_cpu = stap(); - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + if (pcpu->address == source_cpu) { call_on_stack(2, stack, void, __pcpu_delegate, pcpu_delegate_fn *, func, void *, data); @@ -488,7 +488,7 @@ void smp_send_stop(void) int cpu; /* Disable all interrupts/machine checks */ - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + __load_psw_mask(PSW_KERNEL_BITS); trace_hardirqs_off(); debug_set_critical(); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 5ea3830af0cc..a965ddc34f43 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -213,6 +213,9 @@ SECTIONS QUAD(__rela_dyn_start) /* rela_dyn_start */ QUAD(__rela_dyn_end) /* rela_dyn_end */ QUAD(_eamode31 - _samode31) /* amode31_size */ + QUAD(init_mm) + QUAD(swapper_pg_dir) + QUAD(invalid_pg_dir) } :NONE /* Debugging sections. */ diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 30ab55f868f6..144447d5cb4c 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -52,9 +52,9 @@ #include pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); -static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); -unsigned long s390_invalid_asce; +unsigned long __bootdata_preserved(s390_invalid_asce); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); @@ -93,37 +93,8 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - psw_t psw; - - s390_invalid_asce = (unsigned long)invalid_pg_dir; - s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); - init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > _REGION2_SIZE) { - asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; - } - init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.kernel_asce = init_mm.context.asce; - S390_lowcore.user_asce = s390_invalid_asce; - crst_table_init((unsigned long *) init_mm.pgd, pgd_type); - vmem_map_init(); - kasan_copy_shadow_mapping(); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.user_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); - kasan_free_early_identity(); + vmem_map_init(); sparse_init(); zone_dma_bits = 31; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index a97b7981358e..801d81c189a7 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include #include #include #include @@ -15,16 +14,11 @@ static unsigned long segment_pos __initdata; static unsigned long segment_low __initdata; -static unsigned long pgalloc_pos __initdata; -static unsigned long pgalloc_low __initdata; -static unsigned long pgalloc_freeable __initdata; static bool has_edat __initdata; static bool has_nx __initdata; #define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) -static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); - static void __init kasan_early_panic(const char *reason) { sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n"); @@ -229,29 +223,6 @@ static void __init kasan_early_pgtable_populate(unsigned long address, } } -static void __init kasan_set_pgd(pgd_t *pgd, unsigned long asce_type) -{ - unsigned long asce_bits; - - asce_bits = asce_type | _ASCE_TABLE_LENGTH; - S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); -} - -static void __init kasan_enable_dat(void) -{ - psw_t psw; - - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); -} - static void __init kasan_early_detect_facilities(void) { if (test_facility(8)) { @@ -272,7 +243,6 @@ void __init kasan_early_init(void) p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); unsigned long untracked_end = MODULES_VADDR; unsigned long shadow_alloc_size; - unsigned long initrd_end; unsigned long memsize; kasan_early_detect_facilities(); @@ -298,36 +268,24 @@ void __init kasan_early_init(void) BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY); /* init kasan zero shadow */ - crst_table_init((unsigned long *)kasan_early_shadow_p4d, - p4d_val(p4d_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pud, - pud_val(pud_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pmd, - pmd_val(pmd_z)); + crst_table_init((unsigned long *)kasan_early_shadow_p4d, p4d_val(p4d_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pud, pud_val(pud_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pmd, pmd_val(pmd_z)); memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT; - pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { - initrd_end = - round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); - pgalloc_low = max(pgalloc_low, initrd_end); - } if (pgalloc_low + shadow_alloc_size > memsize) kasan_early_panic("out of memory during initialisation\n"); if (has_edat) { - segment_pos = round_down(memsize, _SEGMENT_SIZE); + segment_pos = round_down(pgalloc_pos, _SEGMENT_SIZE); segment_low = segment_pos - shadow_alloc_size; + segment_low = round_down(segment_low, _SEGMENT_SIZE); pgalloc_pos = segment_low; - } else { - pgalloc_pos = memsize; } - init_mm.pgd = early_pg_dir; /* * Current memory layout: * +- 0 -------------+ +- shadow start -+ @@ -376,40 +334,7 @@ void __init kasan_early_init(void) POPULATE_ZERO_SHADOW); kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE), POPULATE_ZERO_SHADOW); - /* memory allocated for identity mapping structs will be freed later */ - pgalloc_freeable = pgalloc_pos; - /* populate identity mapping */ - kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE); - kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2); - kasan_enable_dat(); /* enable kasan */ init_task.kasan_depth = 0; - memblock_reserve(pgalloc_pos, memsize - pgalloc_pos); sclp_early_printk("KernelAddressSanitizer initialized\n"); } - -void __init kasan_copy_shadow_mapping(void) -{ - /* - * At this point we are still running on early pages setup early_pg_dir, - * while swapper_pg_dir has just been initialized with identity mapping. - * Carry over shadow memory region from early_pg_dir to swapper_pg_dir. - */ - - pgd_t *pg_dir_src; - pgd_t *pg_dir_dst; - p4d_t *p4_dir_src; - p4d_t *p4_dir_dst; - - pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); - pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START); - p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); - p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); - memcpy(p4_dir_dst, p4_dir_src, - (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); -} - -void __init kasan_free_early_identity(void) -{ - memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); -} diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index ee1a97078527..78d7768f93d7 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -657,6 +658,29 @@ void vmem_unmap_4k_page(unsigned long addr) mutex_unlock(&vmem_mutex); } +static int __init memblock_region_cmp(const void *a, const void *b) +{ + const struct memblock_region *r1 = a; + const struct memblock_region *r2 = b; + + if (r1->base < r2->base) + return -1; + if (r1->base > r2->base) + return 1; + return 0; +} + +static void __init memblock_region_swap(void *a, void *b, int size) +{ + struct memblock_region *r1 = a; + struct memblock_region *r2 = b; + struct memblock_region swap; + + swap = *r1; + *r1 = *r2; + *r2 = swap; +} + /* * map whole physical memory to virtual memory (identity mapping) * we reserve enough space in the vmalloc area for vmemmap to hotplug @@ -664,11 +688,68 @@ void vmem_unmap_4k_page(unsigned long addr) */ void __init vmem_map_init(void) { + struct memblock_region memory_rwx_regions[] = { + { + .base = 0, + .size = sizeof(struct lowcore), + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + { + .base = __pa(_stext), + .size = _etext - _stext, + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + { + .base = __pa(_sinittext), + .size = _einittext - _sinittext, + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + { + .base = __stext_amode31, + .size = __etext_amode31 - __stext_amode31, + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + }; + struct memblock_type memory_rwx = { + .regions = memory_rwx_regions, + .cnt = ARRAY_SIZE(memory_rwx_regions), + .max = ARRAY_SIZE(memory_rwx_regions), + }; phys_addr_t base, end; u64 i; - for_each_mem_range(i, &base, &end) - vmem_add_range(base, end - base); + /* + * Set RW+NX attribute on all memory, except regions enumerated with + * memory_rwx exclude type. These regions need different attributes, + * which are enforced afterwards. + * + * __for_each_mem_range() iterate and exclude types should be sorted. + * The relative location of _stext and _sinittext is hardcoded in the + * linker script. However a location of __stext_amode31 and the kernel + * image itself are chosen dynamically. Thus, sort the exclude type. + */ + sort(&memory_rwx_regions, + ARRAY_SIZE(memory_rwx_regions), sizeof(memory_rwx_regions[0]), + memblock_region_cmp, memblock_region_swap); + __for_each_mem_range(i, &memblock.memory, &memory_rwx, + NUMA_NO_NODE, MEMBLOCK_NONE, &base, &end, NULL) { + __set_memory((unsigned long)__va(base), + (end - base) >> PAGE_SHIFT, + SET_MEMORY_RW | SET_MEMORY_NX); + } + __set_memory((unsigned long)_stext, (unsigned long)(_etext - _stext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); @@ -678,15 +759,14 @@ void __init vmem_map_init(void) __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, + __set_memory(__stext_amode31, + (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - /* lowcore requires 4k mapping for real addresses / prefixing */ - set_memory_4k(0, LC_PAGES); - /* lowcore must be executable for LPSWE */ - if (!static_key_enabled(&cpu_has_bear)) - set_memory_x(0, 1); + if (static_key_enabled(&cpu_has_bear)) + set_memory_nx(0, 1); + set_memory_nx(PAGE_SIZE, 1); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); -- cgit v1.2.3