summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2024-09-10 13:49:53 +0200
committerThomas Gleixner <tglx@linutronix.de>2024-09-10 13:49:53 +0200
commit2f7eedca6cecbfad52d5b52c1550066d9252c947 (patch)
treeffce8be3ceee2c845a951d11b99578ca468639a4 /arch/x86
parenttreewide: Fix wrong singular form of jiffies in comments (diff)
parentLinux 6.11-rc7 (diff)
downloadlinux-2f7eedca6cecbfad52d5b52c1550066d9252c947.tar.xz
linux-2f7eedca6cecbfad52d5b52c1550066d9252c947.zip
Merge branch 'linus' into timers/core
To update with the latest fixes.
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/coco/sev/core.c2
-rw-r--r--arch/x86/coco/tdx/tdx.c1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/events/core.c22
-rw-r--r--arch/x86/events/intel/core.c23
-rw-r--r--arch/x86/events/intel/cstate.c5
-rw-r--r--arch/x86/include/asm/cmdline.h4
-rw-r--r--arch/x86/include/asm/fpu/types.h7
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/page_64.h1
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h4
-rw-r--r--arch/x86/include/asm/qspinlock.h12
-rw-r--r--arch/x86/include/asm/resctrl.h6
-rw-r--r--arch/x86/kernel/acpi/madt_wakeup.c2
-rw-r--r--arch/x86/kernel/apic/apic.c11
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c8
-rw-r--r--arch/x86/kernel/fpu/xstate.c3
-rw-r--r--arch/x86/kernel/fpu/xstate.h4
-rw-r--r--arch/x86/kernel/paravirt.c7
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kvm/Kconfig11
-rw-r--r--arch/x86/kvm/hyperv.h1
-rw-r--r--arch/x86/kvm/lapic.c24
-rw-r--r--arch/x86/kvm/mmu/mmu.c11
-rw-r--r--arch/x86/kvm/mmu/spte.c6
-rw-r--r--arch/x86/kvm/mmu/spte.h2
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c8
-rw-r--r--arch/x86/kvm/svm/sev.c24
-rw-r--r--arch/x86/kvm/svm/svm.c16
-rw-r--r--arch/x86/kvm/x86.c24
-rw-r--r--arch/x86/lib/cmdline.c25
-rw-r--r--arch/x86/lib/getuser.S4
-rw-r--r--arch/x86/mm/init_64.c4
-rw-r--r--arch/x86/mm/kaslr.c32
-rw-r--r--arch/x86/mm/pti.c51
38 files changed, 256 insertions, 126 deletions
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 082d61d85dfc..de1df0cb45da 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -163,7 +163,7 @@ struct sev_config {
*/
use_cas : 1,
- __reserved : 62;
+ __reserved : 61;
};
static struct sev_config sev_cfg __read_mostly;
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 078e2bac2553..da8b66dce0da 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -389,7 +389,6 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val)
.r12 = size,
.r13 = EPT_READ,
.r14 = addr,
- .r15 = *val,
};
if (__tdx_hypercall(&args))
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 83073fa3c989..7093ee21c0d1 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -344,6 +344,7 @@
332 common statx sys_statx
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
+335 common uretprobe sys_uretprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal
@@ -385,7 +386,6 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
-467 common uretprobe sys_uretprobe
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 12f2a0c14d33..be01823b1bb4 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1520,20 +1520,23 @@ static void x86_pmu_start(struct perf_event *event, int flags)
void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ unsigned long *cntr_mask, *fixed_cntr_mask;
+ struct event_constraint *pebs_constraints;
+ struct cpu_hw_events *cpuc;
u64 pebs, debugctl;
- int cpu = smp_processor_id();
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- unsigned long *cntr_mask = hybrid(cpuc->pmu, cntr_mask);
- unsigned long *fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
- struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
- unsigned long flags;
- int idx;
+ int cpu, idx;
+
+ guard(irqsave)();
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_events, cpu);
+ cntr_mask = hybrid(cpuc->pmu, cntr_mask);
+ fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
+ pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
if (!*(u64 *)cntr_mask)
return;
- local_irq_save(flags);
-
if (x86_pmu.version >= 2) {
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
@@ -1577,7 +1580,6 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
cpu, idx, pmc_count);
}
- local_irq_restore(flags);
}
void x86_pmu_stop(struct perf_event *event, int flags)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 0c9c2706d4ec..9e519d8a810a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
return HYBRID_INTEL_CORE;
}
+static inline bool erratum_hsw11(struct perf_event *event)
+{
+ return (event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01);
+}
+
+/*
+ * The HSW11 requires a period larger than 100 which is the same as the BDM11.
+ * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL.
+ *
+ * The message 'interrupt took too long' can be observed on any counter which
+ * was armed with a period < 32 and two events expired in the same NMI.
+ * A minimum period of 32 is enforced for the rest of the events.
+ */
+static void hsw_limit_period(struct perf_event *event, s64 *left)
+{
+ *left = max(*left, erratum_hsw11(event) ? 128 : 32);
+}
+
/*
* Broadwell:
*
@@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
*/
static void bdw_limit_period(struct perf_event *event, s64 *left)
{
- if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
- X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (erratum_hsw11(event)) {
if (*left < 128)
*left = 128;
*left &= ~0x3fULL;
@@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.limit_period = hsw_limit_period;
x86_pmu.lbr_double_abort = true;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index be58cfb012dd..9f116dfc4728 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -64,7 +64,7 @@
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
* KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL,
- * RPL,SPR,MTL,ARL,LNL
+ * RPL,SPR,MTL,ARL,LNL,SRF
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
@@ -693,7 +693,8 @@ static const struct cstate_model srf_cstates __initconst = {
.core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
BIT(PERF_CSTATE_CORE_C6_RES),
- .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES),
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES),
.module_events = BIT(PERF_CSTATE_MODULE_C6_RES),
};
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index 6faaf27e8899..6cbd9ae58b21 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,6 +2,10 @@
#ifndef _ASM_X86_CMDLINE_H
#define _ASM_X86_CMDLINE_H
+#include <asm/setup.h>
+
+extern char builtin_cmdline[COMMAND_LINE_SIZE];
+
int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
int cmdline_find_option(const char *cmdline_ptr, const char *option,
char *buffer, int bufsize);
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index eb17f31b06d2..de16862bf230 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -591,6 +591,13 @@ struct fpu_state_config {
* even without XSAVE support, i.e. legacy features FP + SSE
*/
u64 legacy_features;
+ /*
+ * @independent_features:
+ *
+ * Features that are supported by XSAVES, but not managed as part of
+ * the FPU core, such as LBR
+ */
+ u64 independent_features;
};
/* FPU state configuration information */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 950a03e0181e..4a68cb3eba78 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1305,6 +1305,7 @@ struct kvm_arch {
u8 vm_type;
bool has_private_mem;
bool has_protected_state;
+ bool pre_fault_allowed;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
@@ -2191,6 +2192,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
#define kvm_arch_has_private_mem(kvm) false
#endif
+#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)
+
static inline u16 kvm_read_ldt(void)
{
u16 ldt;
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index af4302d79b59..f3d257c45225 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,6 +17,7 @@ extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
+extern unsigned long physmem_end;
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 9053dfe9fa03..a98e53491a4e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d;
# define VMEMMAP_START __VMEMMAP_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+#ifdef CONFIG_RANDOMIZE_MEMORY
+# define PHYSMEM_END physmem_end
+#endif
+
/*
* End of the region for which vmalloc page tables are pre-allocated.
* For non-KMSAN builds, this is the same as VMALLOC_END.
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index a053c1293975..68da67df304d 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -66,13 +66,15 @@ static inline bool vcpu_is_preempted(long cpu)
#ifdef CONFIG_PARAVIRT
/*
- * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack.
+ * virt_spin_lock_key - disables by default the virt_spin_lock() hijack.
*
- * Native (and PV wanting native due to vCPU pinning) should disable this key.
- * It is done in this backwards fashion to only have a single direction change,
- * which removes ordering between native_pv_spin_init() and HV setup.
+ * Native (and PV wanting native due to vCPU pinning) should keep this key
+ * disabled. Native does not touch the key.
+ *
+ * When in a guest then native_pv_lock_init() enables the key first and
+ * KVM/XEN might conditionally disable it later in the boot process again.
*/
-DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
+DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
/*
* Shortcut for the queued_spin_lock_slowpath() function that allows
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 12dbd2588ca7..8b1b6ce1e51b 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -156,12 +156,6 @@ static inline void resctrl_sched_in(struct task_struct *tsk)
__resctrl_sched_in(tsk);
}
-static inline u32 resctrl_arch_system_num_rmid_idx(void)
-{
- /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
- return boot_cpu_data.x86_cache_max_rmid + 1;
-}
-
static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
{
*rmid = idx;
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index 6cfe762be28b..d5ef6215583b 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -19,7 +19,7 @@
static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
-static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init;
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
static u64 acpi_mp_pgd __ro_after_init;
static u64 acpi_mp_reset_vector_paddr __ro_after_init;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 66fd4b2a37a3..373638691cd4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
+ u32 x2apic_id;
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
-
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
@@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
/*
* Don't reread the APIC ID as it was already done from
* check_x2apic() and the APIC driver still is a x2APIC variant,
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index be5889bded49..1e0fe5f8ab84 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -462,7 +462,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case 0x00 ... 0x2f:
case 0x40 ... 0x4f:
- case 0x70 ... 0x7f:
+ case 0x60 ... 0x7f:
setup_force_cpu_cap(X86_FEATURE_ZEN5);
break;
default:
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index b3fa61d45352..0b69bfbf345d 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -306,7 +306,7 @@ static void freq_invariance_enable(void)
WARN_ON_ONCE(1);
return;
}
- static_branch_enable(&arch_scale_freq_key);
+ static_branch_enable_cpuslocked(&arch_scale_freq_key);
register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}
@@ -323,8 +323,10 @@ static void __init bp_init_freq_invariance(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
- if (intel_set_max_freq_ratio())
+ if (intel_set_max_freq_ratio()) {
+ guard(cpus_read_lock)();
freq_invariance_enable();
+ }
}
static void disable_freq_invariance_workfn(struct work_struct *work)
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 767bf1c71aad..2a2fc14955cd 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -609,7 +609,7 @@ void mtrr_save_state(void)
{
int first_cpu;
- if (!mtrr_enabled())
+ if (!mtrr_enabled() || !mtrr_state.have_fixed)
return;
first_cpu = cpumask_first(cpu_online_mask);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 1930fce9dfe9..8591d53c144b 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -119,6 +119,14 @@ struct rdt_hw_resource rdt_resources_all[] = {
},
};
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
+ return r->num_rmid;
+}
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c5a026fee5e0..1339f8328db5 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -788,6 +788,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
goto out_disable;
}
+ fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
+ XFEATURE_MASK_INDEPENDENT;
+
/*
* Clear XSAVE features that are disabled in the normal CPUID.
*/
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 2ee0b9c53dcc..afb404cd2059 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -62,9 +62,9 @@ static inline u64 xfeatures_mask_supervisor(void)
static inline u64 xfeatures_mask_independent(void)
{
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
- return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
+ return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR;
- return XFEATURE_MASK_INDEPENDENT;
+ return fpu_kernel_cfg.independent_features;
}
/* XSAVE/XRSTOR wrapper functions */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5358d43886ad..fec381533555 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -51,13 +51,12 @@ DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif
-DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
+DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
void __init native_pv_lock_init(void)
{
- if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
- !boot_cpu_has(X86_FEATURE_HYPERVISOR))
- static_branch_disable(&virt_spin_lock_key);
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ static_branch_enable(&virt_spin_lock_key);
}
static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5d34cad9b7b1..6129dc2ba784 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -164,7 +164,7 @@ unsigned long saved_video_mode;
static char __initdata command_line[COMMAND_LINE_SIZE];
#ifdef CONFIG_CMDLINE_BOOL
-static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
bool builtin_cmdline_added __ro_after_init;
#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4287a8071a3a..730c2f34d347 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -19,7 +19,6 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HIGH_RES_TIMERS
depends on X86_LOCAL_APIC
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
@@ -141,11 +140,13 @@ config KVM_AMD_SEV
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
select KVM_GENERIC_PRIVATE_MEM
- select HAVE_KVM_GMEM_PREPARE
- select HAVE_KVM_GMEM_INVALIDATE
+ select HAVE_KVM_ARCH_GMEM_PREPARE
+ select HAVE_KVM_ARCH_GMEM_INVALIDATE
help
- Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
- with Encrypted State (SEV-ES) on AMD processors.
+ Provides support for launching encrypted VMs which use Secure
+ Encrypted Virtualization (SEV), Secure Encrypted Virtualization with
+ Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
+ Secure Nested Paging (SEV-SNP) technologies on AMD processors.
config KVM_SMM
bool "System Management Mode emulation"
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 923e64903da9..913bfc96959c 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -286,7 +286,6 @@ static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
return HV_STATUS_ACCESS_DENIED;
}
static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {}
-static inline void kvm_hv_free_pa_page(struct kvm *kvm) {}
static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector)
{
return false;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a7172ba59ad2..5bb481aefcbc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -351,10 +351,8 @@ static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
* reversing the LDR calculation to get cluster of APICs, i.e. no
* additional work is required.
*/
- if (apic_x2apic_mode(apic)) {
- WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic)));
+ if (apic_x2apic_mode(apic))
return;
- }
if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
&cluster, &mask))) {
@@ -1743,7 +1741,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
s64 min_period = min_timer_period_us * 1000LL;
if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
+ pr_info_once(
"vcpu %i: requested %lld ns "
"lapic timer period limited to %lld ns\n",
apic->vcpu->vcpu_id,
@@ -2966,18 +2964,28 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s, bool set)
{
if (apic_x2apic_mode(vcpu->arch.apic)) {
+ u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic);
u32 *id = (u32 *)(s->regs + APIC_ID);
u32 *ldr = (u32 *)(s->regs + APIC_LDR);
u64 icr;
if (vcpu->kvm->arch.x2apic_format) {
- if (*id != vcpu->vcpu_id)
+ if (*id != x2apic_id)
return -EINVAL;
} else {
+ /*
+ * Ignore the userspace value when setting APIC state.
+ * KVM's model is that the x2APIC ID is readonly, e.g.
+ * KVM only supports delivering interrupts to KVM's
+ * version of the x2APIC ID. However, for backwards
+ * compatibility, don't reject attempts to set a
+ * mismatched ID for userspace that hasn't opted into
+ * x2apic_format.
+ */
if (set)
- *id >>= 24;
+ *id = x2apic_id;
else
- *id <<= 24;
+ *id = x2apic_id << 24;
}
/*
@@ -2986,7 +2994,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
* split to ICR+ICR2 in userspace for backwards compatibility.
*/
if (set) {
- *ldr = kvm_apic_calc_x2apic_ldr(*id);
+ *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
(u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 901be9e420a4..de05a26b0b7d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4335,7 +4335,7 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
if (req_max_level)
max_level = min(max_level, req_max_level);
- return req_max_level;
+ return max_level;
}
static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
@@ -4743,11 +4743,16 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
u64 end;
int r;
+ if (!vcpu->kvm->arch.pre_fault_allowed)
+ return -EOPNOTSUPP;
+
/*
* reload is efficient when called repeatedly, so we can do it on
* every iteration.
*/
- kvm_mmu_reload(vcpu);
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
if (kvm_arch_has_private_mem(vcpu->kvm) &&
kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
@@ -7510,7 +7515,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
if (level == PG_LEVEL_2M)
- return kvm_range_has_memory_attributes(kvm, start, end, attrs);
+ return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
if (hugepage_test_mixed(slot, gfn, level - 1) ||
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index d4527965e48c..8f7eb3ad88fc 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -391,9 +391,9 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
mmio_value = 0;
/*
- * The masked MMIO value must obviously match itself and a removed SPTE
- * must not get a false positive. Removed SPTEs and MMIO SPTEs should
- * never collide as MMIO must set some RWX bits, and removed SPTEs must
+ * The masked MMIO value must obviously match itself and a frozen SPTE
+ * must not get a false positive. Frozen SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and frozen SPTEs must
* not set any RWX bits.
*/
if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index ef793c459b05..2cb816ea2430 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -214,7 +214,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*/
#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
-/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
static inline bool is_frozen_spte(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c7dc49ee7388..3c55955bcaf8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -359,10 +359,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
/*
* Set the SPTE to a nonpresent value that other
* threads will not overwrite. If the SPTE was
- * already marked as removed then another thread
+ * already marked as frozen then another thread
* handling a page fault could overwrite it, so
* set the SPTE until it is set from some other
- * value to the removed SPTE value.
+ * value to the frozen SPTE value.
*/
for (;;) {
old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
@@ -536,8 +536,8 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
u64 *sptep = rcu_dereference(iter->sptep);
/*
- * The caller is responsible for ensuring the old SPTE is not a REMOVED
- * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
+ * The caller is responsible for ensuring the old SPTE is not a FROZEN
+ * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
* and pre-checking before inserting a new SPTE is advantageous as it
* avoids unnecessary work.
*/
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index a16c873b3232..714c517dd4b7 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2276,30 +2276,24 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf
for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
struct sev_data_snp_launch_update fw_args = {0};
- bool assigned;
+ bool assigned = false;
int level;
- if (!kvm_mem_is_private(kvm, gfn)) {
- pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n",
- __func__, gfn);
- ret = -EINVAL;
- goto err;
- }
-
ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
if (ret || assigned) {
pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
__func__, gfn, ret, assigned);
- ret = -EINVAL;
+ ret = ret ? -EINVAL : -EEXIST;
goto err;
}
if (src) {
void *vaddr = kmap_local_pfn(pfn + i);
- ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE);
- if (ret)
+ if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
+ ret = -EFAULT;
goto err;
+ }
kunmap_local(vaddr);
}
@@ -2549,6 +2543,14 @@ static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
data->gctx_paddr = __psp_pa(sev->snp_context);
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+ /*
+ * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
+ * can be given to the guest simply by marking the RMP entry as private.
+ * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
+ */
+ if (!ret)
+ kvm->arch.pre_fault_allowed = true;
+
kfree(id_auth);
e_free_id_block:
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c115d26844f7..5ab2c92c7331 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2876,6 +2876,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CSTAR:
msr_info->data = svm->vmcb01.ptr->save.cstar;
break;
+ case MSR_GS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.gs.base;
+ break;
+ case MSR_FS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.fs.base;
+ break;
case MSR_KERNEL_GS_BASE:
msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
break;
@@ -3101,6 +3107,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_CSTAR:
svm->vmcb01.ptr->save.cstar = data;
break;
+ case MSR_GS_BASE:
+ svm->vmcb01.ptr->save.gs.base = data;
+ break;
+ case MSR_FS_BASE:
+ svm->vmcb01.ptr->save.fs.base = data;
+ break;
case MSR_KERNEL_GS_BASE:
svm->vmcb01.ptr->save.kernel_gs_base = data;
break;
@@ -4949,6 +4961,7 @@ static int svm_vm_init(struct kvm *kvm)
to_kvm_sev_info(kvm)->need_init = true;
kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
+ kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem;
}
if (!pause_filter_count || !pause_filter_thresh)
@@ -5223,6 +5236,9 @@ static __init void svm_set_cpu_caps(void)
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
+
+ /* Don't advertise Bus Lock Detect to guest if SVM support is absent */
+ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
}
static __init int svm_hardware_setup(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index af6c8cf6a37a..c983c8e434b8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -427,8 +427,7 @@ static void kvm_user_return_msr_cpu_online(void)
int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
- unsigned int cpu = smp_processor_id();
- struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
int err;
value = (value & mask) | (msrs->values[slot].host & ~mask);
@@ -450,8 +449,7 @@ EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
static void drop_user_return_notifiers(void)
{
- unsigned int cpu = smp_processor_id();
- struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
if (msrs->registered)
kvm_on_user_return(&msrs->urn);
@@ -4658,7 +4656,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ASYNC_PF_INT:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_KVMCLOCK_CTRL:
- case KVM_CAP_READONLY_MEM:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
case KVM_CAP_DISABLE_QUIRKS:
@@ -4817,6 +4814,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_TYPES:
r = kvm_caps.supported_vm_types;
break;
+ case KVM_CAP_READONLY_MEM:
+ r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1;
+ break;
default:
break;
}
@@ -6042,7 +6042,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
break;
+ kvm_vcpu_srcu_read_lock(vcpu);
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ kvm_vcpu_srcu_read_unlock(vcpu);
break;
}
case KVM_GET_DEBUGREGS: {
@@ -12646,6 +12648,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.vm_type = type;
kvm->arch.has_private_mem =
(type == KVM_X86_SW_PROTECTED_VM);
+ /* Decided by the vendor code for other VM types. */
+ kvm->arch.pre_fault_allowed =
+ type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
ret = kvm_page_track_init(kvm);
if (ret)
@@ -13641,19 +13646,14 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
-#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
-bool kvm_arch_gmem_prepare_needed(struct kvm *kvm)
-{
- return kvm->arch.vm_type == KVM_X86_SNP_VM;
-}
-
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
{
return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order);
}
#endif
-#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
{
kvm_x86_call(gmem_invalidate)(start, end);
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 384da1fdd5c6..c65cd5550454 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -207,18 +207,29 @@ __cmdline_find_option(const char *cmdline, int max_cmdline_size,
int cmdline_find_option_bool(const char *cmdline, const char *option)
{
- if (IS_ENABLED(CONFIG_CMDLINE_BOOL))
- WARN_ON_ONCE(!builtin_cmdline_added);
+ int ret;
- return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+ ret = __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+ if (ret > 0)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added)
+ return __cmdline_find_option_bool(builtin_cmdline, COMMAND_LINE_SIZE, option);
+
+ return ret;
}
int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
int bufsize)
{
- if (IS_ENABLED(CONFIG_CMDLINE_BOOL))
- WARN_ON_ONCE(!builtin_cmdline_added);
+ int ret;
+
+ ret = __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize);
+ if (ret > 0)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added)
+ return __cmdline_find_option(builtin_cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize);
- return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
- buffer, bufsize);
+ return ret;
}
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index a314622aa093..d066aecf8aeb 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -88,12 +88,14 @@ SYM_FUNC_END(__get_user_4)
EXPORT_SYMBOL(__get_user_4)
SYM_FUNC_START(__get_user_8)
+#ifndef CONFIG_X86_64
+ xor %ecx,%ecx
+#endif
check_range size=8
ASM_STAC
#ifdef CONFIG_X86_64
UACCESS movq (%_ASM_AX),%rdx
#else
- xor %ecx,%ecx
UACCESS movl (%_ASM_AX),%edx
UACCESS movl 4(%_ASM_AX),%ecx
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d8dbeac8b206..ff253648706f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params)
{
+ unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
+ if (WARN_ON_ONCE(end > PHYSMEM_END))
+ return -ERANGE;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264866b6..230f1dee4f09 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
+ unsigned long *end;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 0 },
- { &vmalloc_base, 0 },
- { &vmemmap_base, 0 },
+ {
+ .base = &page_offset_base,
+ .end = &physmem_end,
+ },
+ {
+ .base = &vmalloc_base,
+ },
+ {
+ .base = &vmemmap_base,
+ },
};
+/* The end of the possible address space for physical memory */
+unsigned long physmem_end __ro_after_init;
+
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
@@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+ /* Preset the end of the possible address space for physical memory */
+ physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void)
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
+ /* Calculate the end of the region */
+ vaddr += get_padding(&kaslr_regions[i]);
/*
- * Jump the region and add a minimum padding based on
- * randomization alignment.
+ * KASLR trims the maximum possible size of the
+ * direct-map. Update the physmem_end boundary.
+ * No rounding required as the region starts
+ * PUD aligned and size is in units of TB.
*/
- vaddr += get_padding(&kaslr_regions[i]);
+ if (kaslr_regions[i].end)
+ *kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
+
+ /* Add a minimum padding based on randomization alignment. */
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 2e69abf4f852..851ec8f1363a 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -241,7 +241,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
-static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pmd_t *pmd;
@@ -251,10 +251,15 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
if (!pmd)
return NULL;
- /* We can't do anything sensible if we hit a large mapping. */
+ /* Large PMD mapping found */
if (pmd_leaf(*pmd)) {
- WARN_ON(1);
- return NULL;
+ /* Clear the PMD if we hit a large mapping from the first round */
+ if (late_text) {
+ set_pmd(pmd, __pmd(0));
+ } else {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
}
if (pmd_none(*pmd)) {
@@ -283,7 +288,7 @@ static void __init pti_setup_vsyscall(void)
if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
return;
- target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false);
if (WARN_ON(!target_pte))
return;
@@ -301,7 +306,7 @@ enum pti_clone_level {
static void
pti_clone_pgtable(unsigned long start, unsigned long end,
- enum pti_clone_level level)
+ enum pti_clone_level level, bool late_text)
{
unsigned long addr;
@@ -374,14 +379,14 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
*/
*target_pmd = *pmd;
- addr += PMD_SIZE;
+ addr = round_up(addr + 1, PMD_SIZE);
} else if (level == PTI_CLONE_PTE) {
/* Walk the page-table down to the pte level */
pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
- addr += PAGE_SIZE;
+ addr = round_up(addr + 1, PAGE_SIZE);
continue;
}
@@ -390,7 +395,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
return;
/* Allocate PTE in the user page-table */
- target_pte = pti_user_pagetable_walk_pte(addr);
+ target_pte = pti_user_pagetable_walk_pte(addr, late_text);
if (WARN_ON(!target_pte))
return;
@@ -401,7 +406,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
/* Clone the PTE */
*target_pte = *pte;
- addr += PAGE_SIZE;
+ addr = round_up(addr + 1, PAGE_SIZE);
} else {
BUG();
@@ -452,7 +457,7 @@ static void __init pti_clone_user_shared(void)
phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
pte_t *target_pte;
- target_pte = pti_user_pagetable_walk_pte(va);
+ target_pte = pti_user_pagetable_walk_pte(va, false);
if (WARN_ON(!target_pte))
return;
@@ -475,7 +480,7 @@ static void __init pti_clone_user_shared(void)
start = CPU_ENTRY_AREA_BASE;
end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
- pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+ pti_clone_pgtable(start, end, PTI_CLONE_PMD, false);
}
#endif /* CONFIG_X86_64 */
@@ -492,11 +497,11 @@ static void __init pti_setup_espfix64(void)
/*
* Clone the populated PMDs of the entry text and force it RO.
*/
-static void pti_clone_entry_text(void)
+static void pti_clone_entry_text(bool late)
{
pti_clone_pgtable((unsigned long) __entry_text_start,
(unsigned long) __entry_text_end,
- PTI_CLONE_PMD);
+ PTI_LEVEL_KERNEL_IMAGE, late);
}
/*
@@ -571,7 +576,7 @@ static void pti_clone_kernel_text(void)
* pti_set_kernel_image_nonglobal() did to clear the
* global bit.
*/
- pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false);
/*
* pti_clone_pgtable() will set the global bit in any PMDs
@@ -638,8 +643,15 @@ void __init pti_init(void)
/* Undo all global bits from the init pagetables in head_64.S: */
pti_set_kernel_image_nonglobal();
+
/* Replace some of the global bits just for shared entry text: */
- pti_clone_entry_text();
+ /*
+ * This is very early in boot. Device and Late initcalls can do
+ * modprobe before free_initmem() and mark_readonly(). This
+ * pti_clone_entry_text() allows those user-mode-helpers to function,
+ * but notably the text is still RW.
+ */
+ pti_clone_entry_text(false);
pti_setup_espfix64();
pti_setup_vsyscall();
}
@@ -656,10 +668,11 @@ void pti_finalize(void)
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
/*
- * We need to clone everything (again) that maps parts of the
- * kernel image.
+ * This is after free_initmem() (all initcalls are done) and we've done
+ * mark_readonly(). Text is now NX which might've split some PMDs
+ * relative to the early clone.
*/
- pti_clone_entry_text();
+ pti_clone_entry_text(true);
pti_clone_kernel_text();
debug_checkwx_user();