From d8eabc37310a92df40d07c5a8afc53cebf996716 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2019 12:36:50 +0100 Subject: x86/msr-index: Cleanup bit defines Greg pointed out that speculation related bit defines are using (1 << N) format instead of BIT(N). Aside from that, (1 << N) is wrong as it should use at least 1UL. Clean it up. [ Josh Poimboeuf: Fix tools build ] Reported-by: Greg Kroah-Hartman Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/include/asm/msr-index.h | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8e40c2446fd1..e4074556c37b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_MSR_INDEX_H #define _ASM_X86_MSR_INDEX_H +#include <linux/bits.h> + /* * CPU model specific register (MSR) numbers. * @@ -40,14 +42,14 @@ /* Intel MSRs. Some also available on other CPUs */ #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ -#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ +#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */ #define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ -#define SPEC_CTRL_STIBP (1 << SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ +#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ -#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ +#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ -#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ +#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ #define MSR_PPIN_CTL 0x0000004e #define MSR_PPIN 0x0000004f @@ -69,20 +71,20 @@ #define MSR_MTRRcap 0x000000fe #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a -#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ -#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ -#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */ -#define ARCH_CAP_SSB_NO (1 << 4) /* - * Not susceptible to Speculative Store Bypass - * attack, so no Speculative Store Bypass - * control required. - */ +#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ +#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ +#define ARCH_CAP_SSB_NO BIT(4) /* + * Not susceptible to Speculative Store Bypass + * attack, so no Speculative Store Bypass + * control required. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b -#define L1D_FLUSH (1 << 0) /* - * Writeback and invalidate the - * L1 data cache. - */ +#define L1D_FLUSH BIT(0) /* + * Writeback and invalidate the + * L1 data cache. + */ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e -- cgit v1.2.3 From 36ad35131adacc29b328b9c8b6277a8bf0d6fd5d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Feb 2019 10:10:23 +0100 Subject: x86/speculation: Consolidate CPU whitelists The CPU vulnerability whitelists have some overlap and there are more whitelists coming along.
Use the driver_data field in the x86_cpu_id struct to denote the whitelisted vulnerabilities and combine all whitelists into one. Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/kernel/cpu/common.c | 110 +++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..26ec15034f86 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -948,61 +948,72 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } -static const __initconst struct x86_cpu_id cpu_no_speculation[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_TABLET, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL_MID, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_MID, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL, X86_FEATURE_ANY }, - { X86_VENDOR_CENTAUR, 5 }, - { X86_VENDOR_INTEL, 5 }, - { X86_VENDOR_NSC, 5 }, - { X86_VENDOR_ANY, 4 }, +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) + +#define VULNWL(_vendor, _family, _model, _whitelist) \ + { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } + +#define VULNWL_INTEL(model, whitelist) \ + VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) + +#define VULNWL_AMD(family, whitelist) \ + VULNWL(AMD, family, X86_MODEL_ANY, whitelist) + +#define VULNWL_HYGON(family, whitelist) \ + VULNWL(HYGON, family, X86_MODEL_ANY, whitelist) + +static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF), + VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF), + + VULNWL_INTEL(CORE_YONAH, NO_SSB), + + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT, NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_L1TF), + + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF), + + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF), {} }; -static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { - { X86_VENDOR_AMD }, - { X86_VENDOR_HYGON }, - {} -}; - -/* Only list CPUs which speculate but are non susceptible to SSB */ -static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { - { X86_VENDOR_INTEL, 6, 
INTEL_FAM6_ATOM_SILVERMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, - { X86_VENDOR_AMD, 0x12, }, - { X86_VENDOR_AMD, 0x11, }, - { X86_VENDOR_AMD, 0x10, }, - { X86_VENDOR_AMD, 0xf, }, - {} -}; +static bool __init cpu_matches(unsigned long which) +{ + const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); -static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { - /* in addition to cpu_no_speculation */ - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT_MID }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_X }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_PLUS }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, - {} -}; + return m && !!(m->driver_data & which); +} static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = 0; - if (x86_match_cpu(cpu_no_speculation)) + if (cpu_matches(NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); @@ -1011,15 +1022,14 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); - if (!x86_match_cpu(cpu_no_spec_store_bypass) && - !(ia32_cap & ARCH_CAP_SSB_NO) && + if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (x86_match_cpu(cpu_no_meltdown)) + if (cpu_matches(NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ @@ -1028,7 +1038,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - if (x86_match_cpu(cpu_no_l1tf)) + if (cpu_matches(NO_L1TF)) return; setup_force_cpu_bug(X86_BUG_L1TF); -- cgit v1.2.3 From ed5194c2732c8084af9fd159c146ea92bf137128 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 18 Jan 2019 16:50:16 -0800 Subject: x86/speculation/mds: Add basic bug infrastructure for MDS Microarchitectural Data Sampling (MDS) is a class of side channel attacks on internal buffers in Intel CPUs. The variants are: - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126) - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130) - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127) MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a dependent load (store-to-load forwarding) as an optimization. The forward can also happen to a faulting or assisting load operation for a different memory address, which can be exploited under certain conditions. Store buffers are partitioned between Hyper-Threads so cross thread forwarding is not possible. But if a thread enters or exits a sleep state, the store buffer is repartitioned, which can expose data from one thread to the other. MFBDS leaks Fill Buffer Entries.
Fill buffers are used internally to manage L1 miss situations and to hold data which is returned or sent in response to a memory or I/O operation. Fill buffers can forward data to a load operation and also write data to the cache. When the fill buffer is deallocated it can retain the stale data of the preceding operations which can then be forwarded to a faulting or assisting load operation, which can be exploited under certain conditions. Fill buffers are shared between Hyper-Threads so cross thread leakage is possible. MLPDS leaks Load Port Data. Load ports are used to perform load operations from memory or I/O. The received data is then forwarded to the register file or a subsequent operation. In some implementations the Load Port can contain stale data from a previous operation which can be forwarded to faulting or assisting loads under certain conditions, which again can be exploited eventually. Load ports are shared between Hyper-Threads so cross thread leakage is possible. All variants have the same mitigation for the single CPU thread case (SMT off), so the kernel can treat them as one MDS issue. Add the basic infrastructure to detect if the current CPU is affected by MDS. [ tglx: Rewrote changelog ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/include/asm/msr-index.h | 5 +++++ arch/x86/kernel/cpu/common.c | 25 ++++++++++++++++--------- 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6d6122524711..ae3f987b24f1 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -344,6 +344,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ @@ -381,5 +382,6 @@ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ +#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e4074556c37b..e2d30636c03f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -79,6 +79,11 @@ * attack, so no Speculative Store Bypass * control required. */ +#define ARCH_CAP_MDS_NO BIT(5) /* + * Not susceptible to + * Microarchitectural Data + * Sampling (MDS) vulnerabilities.
+ */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 26ec15034f86..e34817bca504 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -952,6 +952,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_MELTDOWN BIT(1) #define NO_SSB BIT(2) #define NO_L1TF BIT(3) +#define NO_MDS BIT(4) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -971,6 +972,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + /* Intel Family 6 */ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), @@ -987,18 +989,20 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(CORE_YONAH, NO_SSB), VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF), - VULNWL_INTEL(ATOM_GOLDMONT, NO_L1TF), - VULNWL_INTEL(ATOM_GOLDMONT_X, NO_L1TF), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_L1TF), - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF), + + /* AMD Family 0xf - 0x12 */ + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS), {} }; @@ -1029,6 +1033,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) + setup_force_cpu_bug(X86_BUG_MDS); + if (cpu_matches(NO_MELTDOWN)) return; -- cgit v1.2.3 From e261f209c3666e842fd645a1e31f001c3a26def9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Mar 2019 20:21:08 +0100 Subject: x86/speculation/mds: Add BUG_MSBDS_ONLY This bug bit is set on CPUs which are only affected by Microarchitectural Store Buffer Data Sampling (MSBDS) and not by any other MDS variant. This is important because the Store Buffers are partitioned between Hyper-Threads so cross thread forwarding is not possible. But if a thread enters or exits a sleep state the store buffer is repartitioned, which can expose data from one thread to the other. This transition can be mitigated. That means that SMT can stay enabled on CPUs which are only affected by MSBDS, provided the CPU is not affected by other SMT-sensitive vulnerabilities, e.g. L1TF. The XEON PHI variants fall into that category. So do the Silvermont/Airmont ATOMs, but for them it's not really relevant as they do not support SMT; mark them anyway for completeness' sake.
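To make the whitelist semantics concrete, here is a stand-alone user-space sketch of the driver_data scheme: each entry carries a bitmask of the vulnerabilities a model is immune to, and a cpu_matches()-style query reduces to one table lookup plus a mask test. The names toy_cpu_id and toy_cpu_matches are illustrative, not kernel API.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define BIT(n)		(1UL << (n))
#define NO_SPECULATION	BIT(0)
#define NO_MELTDOWN	BIT(1)
#define NO_SSB		BIT(2)
#define NO_L1TF		BIT(3)
#define NO_MDS		BIT(4)
#define MSBDS_ONLY	BIT(5)

/* Toy stand-in for struct x86_cpu_id: a model name plus driver_data. */
struct toy_cpu_id {
	const char *model;
	unsigned long driver_data;
};

static const struct toy_cpu_id toy_whitelist[] = {
	{ "ATOM_BONNELL",	NO_SPECULATION },
	{ "ATOM_SILVERMONT",	NO_SSB | NO_L1TF | MSBDS_ONLY },
	{ "XEON_PHI_KNL",	NO_SSB | NO_L1TF | MSBDS_ONLY },
	{ NULL, 0 }
};

/* Mirrors cpu_matches(): one lookup, then a bitmask test. */
static bool toy_cpu_matches(const char *model, unsigned long which)
{
	const struct toy_cpu_id *m;

	for (m = toy_whitelist; m->model; m++) {
		if (!strcmp(m->model, model))
			return !!(m->driver_data & which);
	}
	return false;	/* not whitelisted: assume affected */
}

int main(void)
{
	printf("KNL immune to L1TF? %d\n",
	       toy_cpu_matches("XEON_PHI_KNL", NO_L1TF));
	printf("KNL immune to Meltdown? %d\n",
	       toy_cpu_matches("XEON_PHI_KNL", NO_MELTDOWN));
	return 0;
}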
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/common.c | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ae3f987b24f1..bdcea163850a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -383,5 +383,6 @@ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSBDS variant of BUG_MDS */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e34817bca504..132a63dc5a76 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -953,6 +953,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SSB BIT(2) #define NO_L1TF BIT(3) #define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -979,16 +980,16 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF), - VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF), + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY), VULNWL_INTEL(CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY), VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF), VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF), @@ -1033,8 +1034,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) + if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); + if (cpu_matches(MSBDS_ONLY)) + setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); + } if (cpu_matches(NO_MELTDOWN)) return; -- cgit v1.2.3 From 6c4dbbd14730c43f4ed808a9c42ca41625925c22 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 18 Jan 2019 16:50:23 -0800 Subject: x86/kvm: Expose X86_FEATURE_MD_CLEAR to guests X86_FEATURE_MD_CLEAR is a new CPUID bit which is set when the microcode provides the mechanism to flush various exploitable CPU buffers by invoking the VERW instruction. Hand it through to guests so they can adjust their mitigations. This also requires corresponding qemu changes, which are available separately.
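Per the cpufeatures.h hunk above, MD_CLEAR lives in feature word 18, i.e. CPUID.(EAX=7,ECX=0):EDX bit 10. A guest (or host) can verify what this patch exposes with a few lines of user-space C; this check is an illustrative sketch, not part of the patch:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID leaf 7, subleaf 0: structured extended feature flags. */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 7 not supported");
		return 1;
	}

	/* X86_FEATURE_MD_CLEAR is EDX bit 10 of this leaf. */
	printf("MD_CLEAR %s\n",
	       (edx & (1u << 10)) ? "enumerated" : "not enumerated");
	return 0;
}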
[ tglx: Massaged changelog ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c07958b59f50..39501e7afdb4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -410,7 +410,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP); + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | + F(MD_CLEAR); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.2.3 From 6a9e529272517755904b7afa639f6db59ddb793e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2019 23:13:06 +0100 Subject: x86/speculation/mds: Add mds_clear_cpu_buffers() The Microarchitectural Data Sampling (MDS) vulnerabilities are mitigated by clearing the affected CPU buffers. The mechanism for clearing the buffers uses the unused and obsolete VERW instruction in combination with a microcode update which triggers a CPU buffer clear when VERW is executed. Provide an inline function with the assembly magic. The argument of the VERW instruction must be a memory operand as documented: "MD_CLEAR enumerates that the memory-operand variant of VERW (for example, VERW m16) has been extended to also overwrite buffers affected by MDS. This buffer overwriting functionality is not guaranteed for the register operand variant of VERW." The documentation also recommends using a writable data segment selector: "The buffer overwriting occurs regardless of the result of the VERW permission check, as well as when the selector is null or causes a descriptor load segment violation. However, for lowest latency we recommend using a selector that indicates a valid writable data segment." Add x86 specific documentation about MDS and the internal workings of the mitigation. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/index.rst | 1 + Documentation/x86/conf.py | 10 ++++ Documentation/x86/index.rst | 8 +++ Documentation/x86/mds.rst | 99 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/nospec-branch.h | 25 +++++++++ 5 files changed, 143 insertions(+) create mode 100644 Documentation/x86/conf.py create mode 100644 Documentation/x86/index.rst create mode 100644 Documentation/x86/mds.rst (limited to 'arch/x86') diff --git a/Documentation/index.rst b/Documentation/index.rst index c858c2e66e36..63864826dcd6 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -101,6 +101,7 @@ implementation.
:maxdepth: 2 sh/index + x86/index Filesystem Documentation ------------------------ diff --git a/Documentation/x86/conf.py b/Documentation/x86/conf.py new file mode 100644 index 000000000000..33c5c3142e20 --- /dev/null +++ b/Documentation/x86/conf.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8; mode: python -*- + +project = "X86 architecture specific documentation" + +tags.add("subproject") + +latex_documents = [ + ('index', 'x86.tex', project, + 'The kernel development community', 'manual'), +] diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst new file mode 100644 index 000000000000..ef389dcf1b1d --- /dev/null +++ b/Documentation/x86/index.rst @@ -0,0 +1,8 @@ +========================== +x86 architecture specifics +========================== + +.. toctree:: + :maxdepth: 1 + + mds diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst new file mode 100644 index 000000000000..1096738d50f2 --- /dev/null +++ b/Documentation/x86/mds.rst @@ -0,0 +1,99 @@ +Microarchitectural Data Sampling (MDS) mitigation +================================================= + +.. _mds: + +Overview +-------- + +Microarchitectural Data Sampling (MDS) is a family of side channel attacks +on internal buffers in Intel CPUs. The variants are: + + - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126) + - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130) + - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127) + +MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a +dependent load (store-to-load forwarding) as an optimization. The forward +can also happen to a faulting or assisting load operation for a different +memory address, which can be exploited under certain conditions. Store +buffers are partitioned between Hyper-Threads so cross thread forwarding is +not possible. But if a thread enters or exits a sleep state the store +buffer is repartitioned which can expose data from one thread to the other. + +MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage +L1 miss situations and to hold data which is returned or sent in response +to a memory or I/O operation. Fill buffers can forward data to a load +operation and also write data to the cache. When the fill buffer is +deallocated it can retain the stale data of the preceding operations which +can then be forwarded to a faulting or assisting load operation, which can +be exploited under certain conditions. Fill buffers are shared between +Hyper-Threads so cross thread leakage is possible. + +MLPDS leaks Load Port Data. Load ports are used to perform load operations +from memory or I/O. The received data is then forwarded to the register +file or a subsequent operation. In some implementations the Load Port can +contain stale data from a previous operation which can be forwarded to +faulting or assisting loads under certain conditions, which again can be +exploited eventually. Load ports are shared between Hyper-Threads so cross +thread leakage is possible. + + +Exposure assumptions +-------------------- + +It is assumed that attack code resides in user space or in a guest with one +exception. The rationale behind this assumption is that the code construct +needed for exploiting MDS requires: + + - to control the load to trigger a fault or assist + + - to have a disclosure gadget which exposes the speculatively accessed + data for consumption through a side channel. 
+ - to control the pointer through which the disclosure gadget exposes the + data +The existence of such a construct in the kernel cannot be excluded with +100% certainty, but the complexity involved makes it extremely unlikely. +There is one exception, which is untrusted BPF. The functionality of +untrusted BPF is limited, but it needs to be thoroughly investigated +whether it can be used to create such a construct. + +Mitigation strategy +------------------- + +All variants have the same mitigation strategy at least for the single CPU +thread case (SMT off): Force the CPU to clear the affected buffers. + +This is achieved by using the otherwise unused and obsolete VERW +instruction in combination with a microcode update. The microcode clears +the affected CPU buffers when the VERW instruction is executed. + +For virtualization there are two ways to achieve CPU buffer +clearing. Either the modified VERW instruction or via the L1D Flush +command. The latter is issued when L1TF mitigation is enabled so the extra +VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to +be issued. + +If the VERW instruction with the supplied segment selector argument is +executed on a CPU without the microcode update there is no side effect +other than a small number of pointlessly wasted CPU cycles. + +This does not protect against cross Hyper-Thread attacks except for MSBDS +which is only exploitable cross Hyper-Thread when one of the Hyper-Threads +enters a C-state. + +The kernel provides a function to invoke the buffer clearing: + + mds_clear_cpu_buffers() + +The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state +(idle) transitions. + +According to current knowledge additional mitigations inside the kernel +itself are not required because the necessary gadgets to expose the leaked +data cannot be controlled in a way which allows exploitation from malicious +user space or VM guests. diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index dad12b767ba0..67cb9b2082b1 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -318,6 +318,31 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +#include <asm/segment.h> + /** + * mds_clear_cpu_buffers - Mitigation for MDS vulnerability + * + * This uses the otherwise unused and obsolete VERW instruction in + * combination with microcode which triggers a CPU buffer flush when the + * instruction is executed. + */ +static inline void mds_clear_cpu_buffers(void) +{ + static const u16 ds = __KERNEL_DS; + + /* + * Has to be the memory-operand variant because only that + * guarantees the CPU buffer flush functionality according to + * documentation. The register-operand variant does not. + * Works with any segment selector, but a valid writable + * data segment is the fastest variant. + * + * "cc" clobber is required because VERW modifies ZF. + */ + asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); +} + #endif /* __ASSEMBLY__ */ /* -- cgit v1.2.3 From 04dcbdb8057827b043b3c71aa397c4c63e67d086 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2019 23:42:51 +0100 Subject: x86/speculation/mds: Clear CPU buffers on exit to user Add a static key which controls the invocation of the CPU buffer clear mechanism on exit to user space and add the call into prepare_exit_to_usermode() and do_nmi() right before actually returning.
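Schematically, the resulting gating pattern is the one below, a simplified restatement of the nospec-branch.h hunk further down, using the real jump-label API:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(mds_user_clear);	/* default off, patched at runtime */

static inline void mds_user_clear_cpu_buffers(void)
{
	/*
	 * static_branch_likely() compiles down to a patchable NOP/JMP,
	 * so unaffected or unmitigated systems pay (almost) nothing on
	 * this hot exit path.
	 */
	if (static_branch_likely(&mds_user_clear))
		mds_clear_cpu_buffers();
}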
Add documentation describing which kernel to user space transitions this covers and explaining why some corner cases are not mitigated. Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/x86/mds.rst | 52 ++++++++++++++++++++++++++++++++++++ arch/x86/entry/common.c | 3 +++ arch/x86/include/asm/nospec-branch.h | 13 +++++++++ arch/x86/kernel/cpu/bugs.c | 3 +++ arch/x86/kernel/nmi.c | 4 +++ arch/x86/kernel/traps.c | 8 ++++++ 6 files changed, 83 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 1096738d50f2..54d935bf283b 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -97,3 +97,55 @@ According to current knowledge additional mitigations inside the kernel itself are not required because the necessary gadgets to expose the leaked data cannot be controlled in a way which allows exploitation from malicious user space or VM guests. + +Mitigation points +----------------- + +1. Return to user space +^^^^^^^^^^^^^^^^^^^^^^^ + + When transitioning from kernel to user space the CPU buffers are flushed + on affected CPUs when the mitigation is not disabled on the kernel + command line. The mitigation is enabled through the static key + mds_user_clear. + + The mitigation is invoked in prepare_exit_to_usermode() which covers + most of the kernel to user space transitions. There are a few exceptions + which are not invoking prepare_exit_to_usermode() on return to user + space. These exceptions use the paranoid exit code. + + - Non Maskable Interrupt (NMI): + + Access to sensitive data like keys or credentials in the NMI context is + mostly theoretical: The CPU can do prefetching or execute a + misspeculated code path and thereby fetch data which might end up + leaking through a buffer. + + But for mounting other attacks the kernel stack address of the task is + already valuable information. So in full mitigation mode, the NMI is + mitigated on the return from do_nmi() to provide almost complete + coverage. + + - Double fault (#DF): + + A double fault is usually fatal, but the ESPFIX workaround, which can + be triggered from user space through modify_ldt(2), is a recoverable + double fault. #DF uses the paranoid exit path, so explicit mitigation + in the double fault handler is required. + + - Machine Check Exception (#MC): + + Another corner case is a #MC which hits between the CPU buffer clear + invocation and the actual return to user. As this still is in kernel + space it takes the paranoid exit path which does not clear the CPU + buffers. So the #MC handler repopulates the buffers to some + extent. Machine checks are not reliably controllable and the window is + extremely small so mitigation would just tick a checkbox that this + theoretical corner case is covered. To keep the amount of special + cases small, ignore #MC. + + - Debug Exception (#DB): + + This takes the paranoid exit path only when the INT1 breakpoint is in + kernel space. #DB on a user space address takes the regular exit path, + so no extra mitigation required.
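VERW itself is not privileged, so the memory-operand form used by mds_clear_cpu_buffers() can be exercised from user space as well; the sketch below uses the current %ds selector instead of __KERNEL_DS and is merely illustrative:

#include <stdio.h>

int main(void)
{
	unsigned short sel;

	/* Use our own data segment selector as the memory operand. */
	asm volatile("mov %%ds, %0" : "=r" (sel));

	/*
	 * Memory-operand VERW, as in the kernel helper. Without the
	 * MD_CLEAR microcode this is a harmless, cheap no-op. "cc" is
	 * clobbered because VERW writes ZF.
	 */
	asm volatile("verw %[sel]" : : [sel] "m" (sel) : "cc");

	printf("executed verw on selector %#x\n", sel);
	return 0;
}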
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7bc105f47d21..19f650d729f5 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -31,6 +31,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -212,6 +213,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) #endif user_enter_irqoff(); + + mds_user_clear_cpu_buffers(); } #define SYSCALL_EXIT_WORK_FLAGS \ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 67cb9b2082b1..65b747286d96 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -318,6 +318,8 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +DECLARE_STATIC_KEY_FALSE(mds_user_clear); + #include /** @@ -343,6 +345,17 @@ static inline void mds_clear_cpu_buffers(void) asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); } +/** + * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability + * + * Clear CPU buffers if the corresponding static key is enabled + */ +static inline void mds_user_clear_cpu_buffers(void) +{ + if (static_branch_likely(&mds_user_clear)) + mds_clear_cpu_buffers(); +} + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 01874d54f4fd..dbb45014de1b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -63,6 +63,9 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +/* Control MDS CPU buffer clear before returning to user space */ +DEFINE_STATIC_KEY_FALSE(mds_user_clear); + void __init check_bugs(void) { identify_boot_cpu(); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 18bc9b51ac9b..086cf1d1d71d 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -34,6 +34,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -533,6 +534,9 @@ nmi_restart: write_cr2(this_cpu_read(nmi_cr2)); if (this_cpu_dec_return(nmi_state)) goto nmi_restart; + + if (user_mode(regs)) + mds_user_clear_cpu_buffers(); } NOKPROBE_SYMBOL(do_nmi); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9b7c4ca8f0a7..85fe1870f873 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -366,6 +367,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->ip = (unsigned long)general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; + /* + * This situation can be triggered by userspace via + * modify_ldt(2) and the return does not take the regular + * user space exit, so a CPU buffer clear is required when + * MDS mitigation is enabled. + */ + mds_user_clear_cpu_buffers(); return; } #endif -- cgit v1.2.3 From 650b68a0622f933444a6d66936abb3103029413b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Feb 2019 12:48:14 +0100 Subject: x86/kvm/vmx: Add MDS protection when L1D Flush is not active CPUs which are affected by L1TF and MDS mitigate MDS with the L1D Flush on VMENTER when updated microcode is installed. If a CPU is not affected by L1TF or if the L1D Flush is not in use, then MDS mitigation needs to be invoked explicitly. For these cases, follow the host mitigation state and invoke the MDS mitigation before VMENTER. 
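Restated as a hypothetical stand-alone helper (vmx_l1d_flush_stub() and mds_clear_stub() are placeholders for the real calls in the hunk below), the pre-VMENTER policy is:

static void vmx_l1d_flush_stub(void) { /* stands in for vmx_l1d_flush() */ }
static void mds_clear_stub(void)     { /* stands in for mds_clear_cpu_buffers() */ }

/*
 * Decision table before VMENTER:
 *
 *   vmx_l1d_should_flush   mds_user_clear   action
 *   --------------------   --------------   --------------------------------
 *   enabled                any              L1D flush (clears MDS buffers too)
 *   disabled               enabled          explicit VERW buffer clear
 *   disabled               disabled         nothing
 */
static void pre_vmenter_buffer_clear(int l1d_flush_on, int mds_clear_on)
{
	if (l1d_flush_on)
		vmx_l1d_flush_stub();
	else if (mds_clear_on)
		mds_clear_stub();
}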
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Frederic Weisbecker Reviewed-by: Borislav Petkov Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/kernel/cpu/bugs.c | 1 + arch/x86/kvm/vmx/vmx.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index dbb45014de1b..29ed8e8dfee2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -65,6 +65,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); /* Control MDS CPU buffer clear before returning to user space */ DEFINE_STATIC_KEY_FALSE(mds_user_clear); +EXPORT_SYMBOL_GPL(mds_user_clear); void __init check_bugs(void) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 30a6bcd735ec..544bd24a9c1e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6369,8 +6369,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? (unsigned long)&current_evmcs->host_rsp : 0; + /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); + else if (static_branch_unlikely(&mds_user_clear)) + mds_clear_cpu_buffers(); asm( /* Store host registers */ -- cgit v1.2.3 From 07f07f55a29cb705e221eda7894dd67ab81ef343 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2019 23:04:01 +0100 Subject: x86/speculation/mds: Conditionally clear CPU buffers on idle entry Add a static key which controls the invocation of the CPU buffer clear mechanism on idle entry. This is independent of other MDS mitigations because the idle entry invocation to mitigate the potential leakage due to store buffer repartitioning is only necessary on SMT systems. Add the actual invocations to the different halt/mwait variants which covers all usage sites. mwaitx is not patched as it's not available on Intel CPUs. The buffer clear is only invoked before entering the C-State to prevent stale data from the idling CPU from being spilled to the Hyper-Thread sibling after the store buffer has been repartitioned and all entries are available to the non-idle sibling. When coming out of idle the store buffer is partitioned again so each sibling has half of it available. Now the CPU which returned from idle could be speculatively exposed to the contents of the sibling, but the buffers are flushed either on exit to user space or on VMENTER. When conditional buffer clearing is later implemented on top of this, no action is required either, because before returning to user space the context switch will set the condition flag which causes a flush on the return to user path. Note that the buffer clearing on idle is only sensible on CPUs which are solely affected by MSBDS and not any other variant of MDS because the other MDS variants cannot be mitigated when SMT is enabled, so the buffer clearing on idle would be a window dressing exercise. This intentionally does not handle the case in the acpi/processor_idle driver which uses the legacy IO port interface for C-State transitions for two reasons: - The acpi/processor_idle driver was replaced by the intel_idle driver almost a decade ago. Anything Nehalem upwards supports it and defaults to that new driver. - The legacy IO port interface is likely to be used on older and therefore unaffected CPUs or on systems which do not receive microcode updates anymore, so there is no point in adding that.
Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/x86/mds.rst | 42 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/irqflags.h | 4 ++++ arch/x86/include/asm/mwait.h | 7 ++++++ arch/x86/include/asm/nospec-branch.h | 12 +++++++++++ arch/x86/kernel/cpu/bugs.c | 3 +++ 5 files changed, 68 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 54d935bf283b..87ce8ac9f36e 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -149,3 +149,45 @@ Mitigation points This takes the paranoid exit path only when the INT1 breakpoint is in kernel space. #DB on a user space address takes the regular exit path, so no extra mitigation required. + + +2. C-State transition +^^^^^^^^^^^^^^^^^^^^^ + + When a CPU goes idle and enters a C-State the CPU buffers need to be + cleared on affected CPUs when SMT is active. This addresses the + repartitioning of the store buffer when one of the Hyper-Threads enters + a C-State. + + When SMT is inactive, i.e. either the CPU does not support it or all + sibling threads are offline, CPU buffer clearing is not required. + + The idle clearing is enabled on CPUs which are only affected by MSBDS + and not by any other MDS variant. The other MDS variants cannot be + protected against cross Hyper-Thread attacks because the Fill Buffer and + the Load Ports are shared. So on CPUs affected by other variants, the + idle clearing would be a window dressing exercise and is therefore not + activated. + + The invocation is controlled by the static key mds_idle_clear which is + switched depending on the chosen mitigation mode and the SMT state of + the system. + + The buffer clear is only invoked before entering the C-State to prevent + stale data from the idling CPU from spilling to the Hyper-Thread + sibling after the store buffer has been repartitioned and all entries are + available to the non idle sibling. + + When coming out of idle the store buffer is partitioned again so each + sibling has half of it available. The CPU coming back from idle could then + be speculatively exposed to contents of the sibling. The buffers are + flushed either on exit to user space or on VMENTER so malicious code + in user space or the guest cannot speculatively access them. + + The mitigation is hooked into all variants of halt()/mwait(), but does + not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver + was superseded by the intel_idle driver around 2010, which is + preferred on all affected CPUs which are expected to gain the MD_CLEAR + functionality in microcode. Aside from that, the IO-Port mechanism is a + legacy interface which is only used on older systems which are either + not affected or do not receive microcode updates anymore.
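The shape of the patched idle entry, condensed from the irqflags.h/mwait.h hunks that follow (a schematic kernel-context sketch, not a literal copy):

static inline void idle_halt_with_mds_clear(void)
{
	/*
	 * The buffer clear must be the last thing before the idle
	 * instruction: anything executed afterwards could refill the
	 * store buffer, and the repartitioning happens when this
	 * thread enters the C-state.
	 */
	mds_idle_clear_cpu_buffers();
	asm volatile("sti; hlt" : : : "memory");	/* as in native_safe_halt() */
}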
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 058e40fed167..8a0e56e1dcc9 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -6,6 +6,8 @@ #ifndef __ASSEMBLY__ +#include + /* Provide __cpuidle; we can't safely include */ #define __cpuidle __attribute__((__section__(".cpuidle.text"))) @@ -54,11 +56,13 @@ static inline void native_irq_enable(void) static inline __cpuidle void native_safe_halt(void) { + mds_idle_clear_cpu_buffers(); asm volatile("sti; hlt": : :"memory"); } static inline __cpuidle void native_halt(void) { + mds_idle_clear_cpu_buffers(); asm volatile("hlt": : :"memory"); } diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 39a2fb29378a..eb0f80ce8524 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -6,6 +6,7 @@ #include #include +#include #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf @@ -40,6 +41,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx, static inline void __mwait(unsigned long eax, unsigned long ecx) { + mds_idle_clear_cpu_buffers(); + /* "mwait %eax, %ecx;" */ asm volatile(".byte 0x0f, 0x01, 0xc9;" :: "a" (eax), "c" (ecx)); @@ -74,6 +77,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) static inline void __mwaitx(unsigned long eax, unsigned long ebx, unsigned long ecx) { + /* No MDS buffer clear as this is AMD/HYGON only */ + /* "mwaitx %eax, %ebx, %ecx;" */ asm volatile(".byte 0x0f, 0x01, 0xfb;" :: "a" (eax), "b" (ebx), "c" (ecx)); @@ -81,6 +86,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx, static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { + mds_idle_clear_cpu_buffers(); + trace_hardirqs_on(); /* "mwait %eax, %ecx;" */ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 65b747286d96..4e970390110f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -319,6 +319,7 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); DECLARE_STATIC_KEY_FALSE(mds_user_clear); +DECLARE_STATIC_KEY_FALSE(mds_idle_clear); #include @@ -356,6 +357,17 @@ static inline void mds_user_clear_cpu_buffers(void) mds_clear_cpu_buffers(); } +/** + * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability + * + * Clear CPU buffers if the corresponding static key is enabled + */ +static inline void mds_idle_clear_cpu_buffers(void) +{ + if (static_branch_likely(&mds_idle_clear)) + mds_clear_cpu_buffers(); +} + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 29ed8e8dfee2..916995167301 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -66,6 +66,9 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); /* Control MDS CPU buffer clear before returning to user space */ DEFINE_STATIC_KEY_FALSE(mds_user_clear); EXPORT_SYMBOL_GPL(mds_user_clear); +/* Control MDS CPU buffer clear before idling (halt, mwait) */ +DEFINE_STATIC_KEY_FALSE(mds_idle_clear); +EXPORT_SYMBOL_GPL(mds_idle_clear); void __init check_bugs(void) { -- cgit v1.2.3 From bc1241700acd82ec69fde98c5763ce51086269f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2019 22:04:08 +0100 Subject: x86/speculation/mds: Add mitigation control for MDS Now that the mitigations are in place, add a command line parameter to control the mitigation, a mitigation 
selector function and an SMT update mechanism. This is the minimal, straightforward initial implementation which just provides an always on/off mode. The command line parameter is: mds=[full|off] This is consistent with the existing mitigations for other speculative hardware vulnerabilities. The idle invocation is dynamically updated according to the SMT state of the system, similarly to the dynamic update of the STIBP mitigation. The idle mitigation is limited to CPUs which are only affected by MSBDS and not any other variant, because the other variants cannot be mitigated on SMT enabled systems. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/admin-guide/kernel-parameters.txt | 22 ++++++++ arch/x86/include/asm/processor.h | 5 ++ arch/x86/kernel/cpu/bugs.c | 70 +++++++++++++++++++++++++ 3 files changed, 97 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 858b6c0b9a15..dddb024eb523 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2356,6 +2356,28 @@ Format: <first>,<last> Specifies range of consoles to be captured by the MDA. + mds= [X86,INTEL] + Control mitigation for the Micro-architectural Data + Sampling (MDS) vulnerability. + + Certain CPUs are vulnerable to an exploit against CPU + internal buffers which can forward information to a + disclosure gadget under certain conditions. + + In vulnerable processors, the speculatively + forwarded data can be used in a cache side channel + attack to access data to which the attacker does + not have direct access. + + This parameter controls the MDS mitigation. The + options are: + + full - Enable MDS mitigation on vulnerable CPUs + off - Unconditionally disable MDS mitigation + + Not specifying this option is equivalent to + mds=full. + mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory Amount of memory to be used when the kernel is not able to see the whole system memory or for test. diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 33051436c864..1f0295783325 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -992,4 +992,9 @@ enum l1tf_mitigations { extern enum l1tf_mitigations l1tf_mitigation; +enum mds_mitigations { + MDS_MITIGATION_OFF, + MDS_MITIGATION_FULL, +}; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 916995167301..c7b29d200d27 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -37,6 +37,7 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); +static void __init mds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -108,6 +109,8 @@ void __init check_bugs(void) l1tf_select_mitigation(); + mds_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP.
@@ -213,6 +216,50 @@ static void x86_amd_ssb_disable(void) wrmsrl(MSR_AMD64_LS_CFG, msrval); } +#undef pr_fmt +#define pr_fmt(fmt) "MDS: " fmt + +/* Default mitigation for MDS-affected CPUs */ +static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; + +static const char * const mds_strings[] = { + [MDS_MITIGATION_OFF] = "Vulnerable", + [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers" +}; + +static void __init mds_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_OFF; + return; + } + + if (mds_mitigation == MDS_MITIGATION_FULL) { + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + static_branch_enable(&mds_user_clear); + else + mds_mitigation = MDS_MITIGATION_OFF; + } + pr_info("%s\n", mds_strings[mds_mitigation]); +} + +static int __init mds_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + mds_mitigation = MDS_MITIGATION_OFF; + else if (!strcmp(str, "full")) + mds_mitigation = MDS_MITIGATION_FULL; + + return 0; +} +early_param("mds", mds_cmdline); + #undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt @@ -617,6 +664,26 @@ static void update_indir_branch_cond(void) static_branch_disable(&switch_to_cond_stibp); } +/* Update the static key controlling the MDS CPU buffer clear in idle */ +static void update_mds_branch_idle(void) +{ + /* + * Enable the idle clearing if SMT is active on CPUs which are + * affected only by MSBDS and not any other MDS variant. + * + * The other variants cannot be mitigated when SMT is enabled, so + * clearing the buffers on idle just to prevent the Store Buffer + * repartitioning leak would be a window dressing exercise. + */ + if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) + return; + + if (sched_smt_active()) + static_branch_enable(&mds_idle_clear); + else + static_branch_disable(&mds_idle_clear); +} + void arch_smt_update(void) { /* Enhanced IBRS implies STIBP. No update required. */ @@ -638,6 +705,9 @@ void arch_smt_update(void) break; } + if (mds_mitigation == MDS_MITIGATION_FULL) + update_mds_branch_idle(); + mutex_unlock(&spec_ctrl_mutex); } -- cgit v1.2.3 From 8a4b06d391b0a42a373808979b5028f5c84d9c6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2019 22:51:43 +0100 Subject: x86/speculation/mds: Add sysfs reporting for MDS Add the sysfs reporting file for MDS. It exposes the vulnerability and mitigation state similarly to the existing files for the other speculative hardware vulnerabilities.
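With this in place, the new file can be consumed programmatically; for example, a minimal user-space reader (the path is taken from the ABI entry below):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/mds", "r");

	if (!f) {		/* absent on kernels without this patch */
		perror("mds");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("MDS status: %s", line);	/* e.g. "Mitigation: Clear CPU buffers; SMT vulnerable" */
	fclose(f);
	return 0;
}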
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/ABI/testing/sysfs-devices-system-cpu | 1 + arch/x86/kernel/cpu/bugs.c | 25 ++++++++++++++++++++++ drivers/base/cpu.c | 8 +++++++ include/linux/cpu.h | 2 ++ 4 files changed, 36 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 9605dbd4b5b5..2db5c3407fd6 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -484,6 +484,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/spectre_v2 /sys/devices/system/cpu/vulnerabilities/spec_store_bypass /sys/devices/system/cpu/vulnerabilities/l1tf + /sys/devices/system/cpu/vulnerabilities/mds Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c7b29d200d27..7ab16a6ed064 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1172,6 +1172,22 @@ static ssize_t l1tf_show_state(char *buf) } #endif +static ssize_t mds_show_state(char *buf) +{ + if (!hypervisor_is_type(X86_HYPER_NATIVE)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + mds_strings[mds_mitigation]); + } + + if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + sched_smt_active() ? "mitigated" : "disabled"); + } + + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) @@ -1238,6 +1254,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) return l1tf_show_state(buf); break; + + case X86_BUG_MDS: + return mds_show_state(buf); + default: break; } @@ -1269,4 +1289,9 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b { return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); } + +ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_MDS); +} #endif diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index eb9443d5bae1..2fd6ca1021c2 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -546,11 +546,18 @@ ssize_t __weak cpu_show_l1tf(struct device *dev, return sprintf(buf, "Not affected\n"); } +ssize_t __weak cpu_show_mds(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); +static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -558,6 +565,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_spectre_v2.attr, &dev_attr_spec_store_bypass.attr, &dev_attr_l1tf.attr, + &dev_attr_mds.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 5041357d0297..3c87ad888ed3 100644 --- 
a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -57,6 +57,8 @@ extern ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_mds(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From 22dd8365088b6403630b82423cf906491859b65e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Feb 2019 09:40:40 +0100 Subject: x86/speculation/mds: Add mitigation mode VMWERV In virtualized environments it can happen that the host has the microcode update which utilizes the VERW instruction to clear CPU buffers, but the hypervisor is not yet updated to expose the X86_FEATURE_MD_CLEAR CPUID bit to guests. Introduce an internal mitigation mode VMWERV which enables the invocation of the CPU buffer clearing even if X86_FEATURE_MD_CLEAR is not set. If the system has no updated microcode this results in a pointless execution of the VERW instruction wasting a few CPU cycles. If the microcode is updated, but not exposed to a guest, then the CPU buffers will be cleared. That said: Virtual Machines Will Eventually Receive Vaccine Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/x86/mds.rst | 27 +++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/bugs.c | 18 ++++++++++++------ 3 files changed, 40 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 87ce8ac9f36e..3d6f943f1afb 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -93,11 +93,38 @@ The kernel provides a function to invoke the buffer clearing: The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state (idle) transitions. +As a special quirk to address virtualization scenarios where the host has +the microcode updated, but the hypervisor does not (yet) expose the +MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the +hope that it might actually clear the buffers. The state is reflected +accordingly. + According to current knowledge additional mitigations inside the kernel itself are not required because the necessary gadgets to expose the leaked data cannot be controlled in a way which allows exploitation from malicious user space or VM guests. +Kernel internal mitigation modes +-------------------------------- + + ======= ============================================================ + off Mitigation is disabled. Either the CPU is not affected or + mds=off is supplied on the kernel command line + + full Mitigation is enabled. CPU is affected and MD_CLEAR is + advertised in CPUID. + + vmwerv Mitigation is enabled. CPU is affected and MD_CLEAR is not + advertised in CPUID. That is mainly for virtualization + scenarios where the host has the updated microcode but the + hypervisor does not expose MD_CLEAR in CPUID. It's a best + effort approach without guarantees. + ======= ============================================================ + +If the CPU is affected and mds=off is not supplied on the kernel command +line then the kernel selects the appropriate mitigation mode depending on +the availability of the MD_CLEAR CPUID bit.
+ Mitigation points ----------------- diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1f0295783325..aca1ef8cc79f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -995,6 +995,7 @@ extern enum l1tf_mitigations l1tf_mitigation; enum mds_mitigations { MDS_MITIGATION_OFF, MDS_MITIGATION_FULL, + MDS_MITIGATION_VMWERV, }; #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7ab16a6ed064..95cda38c8785 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -224,7 +224,8 @@ static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL static const char * const mds_strings[] = { [MDS_MITIGATION_OFF] = "Vulnerable", - [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers" + [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", + [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", }; static void __init mds_select_mitigation(void) @@ -235,10 +236,9 @@ static void __init mds_select_mitigation(void) } if (mds_mitigation == MDS_MITIGATION_FULL) { - if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) - static_branch_enable(&mds_user_clear); - else - mds_mitigation = MDS_MITIGATION_OFF; + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + mds_mitigation = MDS_MITIGATION_VMWERV; + static_branch_enable(&mds_user_clear); } pr_info("%s\n", mds_strings[mds_mitigation]); } @@ -705,8 +705,14 @@ void arch_smt_update(void) break; } - if (mds_mitigation == MDS_MITIGATION_FULL) + switch (mds_mitigation) { + case MDS_MITIGATION_FULL: + case MDS_MITIGATION_VMWERV: update_mds_branch_idle(); + break; + case MDS_MITIGATION_OFF: + break; + } mutex_unlock(&spec_ctrl_mutex); } -- cgit v1.2.3 From 65fd4cb65b2dad97feb8330b6690445910b56d6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Feb 2019 11:10:49 +0100 Subject: Documentation: Move L1TF to separate directory Move L1TF to a separate directory so the MDS stuff can be added at the side. Otherwise all hardware vulnerabilities have their own top level entry. Should have done that right away.
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jon Masters --- Documentation/ABI/testing/sysfs-devices-system-cpu | 2 +- Documentation/admin-guide/hw-vuln/index.rst | 12 + Documentation/admin-guide/hw-vuln/l1tf.rst | 614 +++++++++++++++++++++ Documentation/admin-guide/index.rst | 6 +- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/l1tf.rst | 614 --------------------- arch/x86/kernel/cpu/bugs.c | 2 +- arch/x86/kvm/vmx/vmx.c | 4 +- 8 files changed, 633 insertions(+), 623 deletions(-) create mode 100644 Documentation/admin-guide/hw-vuln/index.rst create mode 100644 Documentation/admin-guide/hw-vuln/l1tf.rst delete mode 100644 Documentation/admin-guide/l1tf.rst (limited to 'arch/x86') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 2db5c3407fd6..744c6d764b0c 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -498,7 +498,7 @@ Description: Information about CPU vulnerabilities "Mitigation: $M" CPU is affected and mitigation $M is in effect Details about the l1tf file can be found in - Documentation/admin-guide/l1tf.rst + Documentation/admin-guide/hw-vuln/l1tf.rst What: /sys/devices/system/cpu/smt /sys/devices/system/cpu/smt/active diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst new file mode 100644 index 000000000000..8ce2009f1981 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -0,0 +1,12 @@ +======================== +Hardware vulnerabilities +======================== + +This section describes CPU vulnerabilities and provides an overview of the +possible mitigations along with guidance for selecting mitigations if they +are configurable at compile, boot or run time. + +.. toctree:: + :maxdepth: 1 + + l1tf diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst new file mode 100644 index 000000000000..9af977384168 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/l1tf.rst @@ -0,0 +1,614 @@ +L1TF - L1 Terminal Fault +======================== + +L1 Terminal Fault is a hardware vulnerability which allows unprivileged +speculative access to data which is available in the Level 1 Data Cache +when the page table entry controlling the virtual address, which is used +for the access, has the Present bit cleared or other reserved bits set. + +Affected processors +------------------- + +This vulnerability affects a wide range of Intel processors. The +vulnerability is not present on: + + - Processors from AMD, Centaur and other non Intel vendors + + - Older processor models, where the CPU family is < 6 + + - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft, + Penwell, Pineview, Silvermont, Airmont, Merrifield) + + - The Intel XEON PHI family + + - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the + IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected + by the Meltdown vulnerability either. These CPUs should become + available by end of 2018. + +Whether a processor is affected or not can be read out from the L1TF +vulnerability file in sysfs. See :ref:`l1tf_sys_info`. 
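Where sysfs is unavailable, the ARCH_CAP_RDCL_NO bit mentioned above can be inspected directly. A hedged user-space sketch, not part of the patch: it assumes root privileges, a loaded msr driver, and that IA32_ARCH_CAPABILITIES is MSR 0x10a with RDCL_NO in bit 0.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t cap = 0;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);	/* needs modprobe msr */

	if (fd < 0)
		return 1;
	/* pread() at the MSR index reads that MSR on CPU 0 */
	if (pread(fd, &cap, sizeof(cap), 0x10a) != sizeof(cap)) {
		close(fd);	/* MSR not enumerated by this CPU */
		return 1;
	}
	close(fd);
	printf("RDCL_NO %s\n", (cap & 1) ? "set: not affected" : "clear");
	return 0;
}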
+ +Related CVEs +------------ + +The following CVE entries are related to the L1TF vulnerability: + + ============= ================= ============================== + CVE-2018-3615 L1 Terminal Fault SGX related aspects + CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects + CVE-2018-3646 L1 Terminal Fault Virtualization related aspects + ============= ================= ============================== + +Problem +------- + +If an instruction accesses a virtual address for which the relevant page +table entry (PTE) has the Present bit cleared or other reserved bits set, +then speculative execution ignores the invalid PTE and loads the referenced +data if it is present in the Level 1 Data Cache, as if the page referenced +by the address bits in the PTE was still present and accessible. + +While this is a purely speculative mechanism and the instruction will raise +a page fault when it is retired eventually, the pure act of loading the +data and making it available to other speculative instructions opens up the +opportunity for side channel attacks to unprivileged malicious code, +similar to the Meltdown attack. + +While Meltdown breaks the user space to kernel space protection, L1TF +allows to attack any physical memory address in the system and the attack +works across all protection domains. It allows an attack of SGX and also +works from inside virtual machines because the speculation bypasses the +extended page table (EPT) protection mechanism. + + +Attack scenarios +---------------- + +1. Malicious user space +^^^^^^^^^^^^^^^^^^^^^^^ + + Operating Systems store arbitrary information in the address bits of a + PTE which is marked non present. This allows a malicious user space + application to attack the physical memory to which these PTEs resolve. + In some cases user-space can maliciously influence the information + encoded in the address bits of the PTE, thus making attacks more + deterministic and more practical. + + The Linux kernel contains a mitigation for this attack vector, PTE + inversion, which is permanently enabled and has no performance + impact. The kernel ensures that the address bits of PTEs, which are not + marked present, never point to cacheable physical memory space. + + A system with an up to date kernel is protected against attacks from + malicious user space applications. + +2. Malicious guest in a virtual machine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + The fact that L1TF breaks all domain protections allows malicious guest + OSes, which can control the PTEs directly, and malicious guest user + space applications, which run on an unprotected guest kernel lacking the + PTE inversion mitigation for L1TF, to attack physical host memory. + + A special aspect of L1TF in the context of virtualization is symmetric + multi threading (SMT). The Intel implementation of SMT is called + HyperThreading. The fact that Hyperthreads on the affected processors + share the L1 Data Cache (L1D) is important for this. As the flaw allows + only to attack data which is present in L1D, a malicious guest running + on one Hyperthread can attack the data which is brought into the L1D by + the context which runs on the sibling Hyperthread of the same physical + core. This context can be host OS, host user space or a different guest. + + If the processor does not support Extended Page Tables, the attack is + only possible, when the hypervisor does not sanitize the content of the + effective (shadow) page tables. 
+ + While solutions exist to mitigate these attack vectors fully, these + mitigations are not enabled by default in the Linux kernel because they + can affect performance significantly. The kernel provides several + mechanisms which can be utilized to address the problem depending on the + deployment scenario. The mitigations, their protection scope and impact + are described in the next sections. + + The default mitigations and the rationale for choosing them are explained + at the end of this document. See :ref:`default_mitigations`. + +.. _l1tf_sys_info: + +L1TF system information +----------------------- + +The Linux kernel provides a sysfs interface to enumerate the current L1TF +status of the system: whether the system is vulnerable, and which +mitigations are active. The relevant sysfs file is: + +/sys/devices/system/cpu/vulnerabilities/l1tf + +The possible values in this file are: + + =========================== =============================== + 'Not affected' The processor is not vulnerable + 'Mitigation: PTE Inversion' The host protection is active + =========================== =============================== + +If KVM/VMX is enabled and the processor is vulnerable then the following +information is appended to the 'Mitigation: PTE Inversion' part: + + - SMT status: + + ===================== ================ + 'VMX: SMT vulnerable' SMT is enabled + 'VMX: SMT disabled' SMT is disabled + ===================== ================ + + - L1D Flush mode: + + ================================ ==================================== + 'L1D vulnerable' L1D flushing is disabled + + 'L1D conditional cache flushes' L1D flush is conditionally enabled + + 'L1D cache flushes' L1D flush is unconditionally enabled + ================================ ==================================== + +The resulting grade of protection is discussed in the following sections. + + +Host mitigation mechanism +------------------------- + +The kernel is unconditionally protected against L1TF attacks from malicious +user space running on the host. + + +Guest mitigation mechanisms +--------------------------- + +.. _l1d_flush: + +1. L1D flush on VMENTER +^^^^^^^^^^^^^^^^^^^^^^^ + + To make sure that a guest cannot attack data which is present in the L1D + the hypervisor flushes the L1D before entering the guest. + + Flushing the L1D evicts not only the data which should not be accessed + by a potentially malicious guest, it also flushes the guest + data. Flushing the L1D has a performance impact as the processor has to + bring the flushed guest data back into the L1D. Depending on the + frequency of VMEXIT/VMENTER and the type of computations in the guest + performance degradation in the range of 1% to 50% has been observed. For + scenarios where guest VMEXIT/VMENTER are rare the performance impact is + minimal. Virtio and mechanisms like posted interrupts are designed to + confine the VMEXITs to a bare minimum, but specific configurations and + application scenarios might still suffer from a high VMEXIT rate. + + The kernel provides two L1D flush modes: + - conditional ('cond') + - unconditional ('always') + + The conditional mode avoids L1D flushing after VMEXITs which execute + only audited code paths before the corresponding VMENTER. These code + paths have been verified that they cannot expose secrets or other + interesting data to an attacker, but they can leak information about the + address space layout of the hypervisor. + + Unconditional mode flushes L1D on all VMENTER invocations and provides + maximum protection. 
It has a higher overhead than the conditional + mode. The overhead cannot be quantified correctly as it depends on the + workload scenario and the resulting number of VMEXITs. + + The general recommendation is to enable L1D flush on VMENTER. The kernel + defaults to conditional mode on affected processors. + + **Note**, that L1D flush does not prevent the SMT problem because the + sibling thread will also bring back its data into the L1D which makes it + attackable again. + + L1D flush can be controlled by the administrator via the kernel command + line and sysfs control files. See :ref:`mitigation_control_command_line` + and :ref:`mitigation_control_kvm`. + +.. _guest_confinement: + +2. Guest VCPU confinement to dedicated physical cores +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + To address the SMT problem, it is possible to make a guest or a group of + guests affine to one or more physical cores. The proper mechanism for + that is to utilize exclusive cpusets to ensure that no other guest or + host tasks can run on these cores. + + If only a single guest or related guests run on sibling SMT threads on + the same physical core then they can only attack their own memory and + restricted parts of the host memory. + + Host memory is attackable, when one of the sibling SMT threads runs in + host OS (hypervisor) context and the other in guest context. The amount + of valuable information from the host OS context depends on the context + which the host OS executes, i.e. interrupts, soft interrupts and kernel + threads. The amount of valuable data from these contexts cannot be + declared as non-interesting for an attacker without deep inspection of + the code. + + **Note**, that assigning guests to a fixed set of physical cores affects + the ability of the scheduler to do load balancing and might have + negative effects on CPU utilization depending on the hosting + scenario. Disabling SMT might be a viable alternative for particular + scenarios. + + For further information about confining guests to a single or to a group + of cores consult the cpusets documentation: + + https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt + +.. _interrupt_isolation: + +3. Interrupt affinity +^^^^^^^^^^^^^^^^^^^^^ + + Interrupts can be made affine to logical CPUs. This is not universally + true because there are types of interrupts which are truly per CPU + interrupts, e.g. the local timer interrupt. Aside of that multi queue + devices affine their interrupts to single CPUs or groups of CPUs per + queue without allowing the administrator to control the affinities. + + Moving the interrupts, which can be affinity controlled, away from CPUs + which run untrusted guests, reduces the attack vector space. + + Whether the interrupts which are affine to CPUs, which run untrusted + guests, provide interesting data for an attacker depends on the system + configuration and the scenarios which run on the system. While for some + of the interrupts it can be assumed that they won't expose interesting + information beyond exposing hints about the host OS memory layout, there + is no way to make general assumptions. + + Interrupt affinity can be controlled by the administrator via the + /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is + available at: + + https://www.kernel.org/doc/Documentation/IRQ-affinity.txt + +.. _smt_control: + +4. SMT control +^^^^^^^^^^^^^^ + + To prevent the SMT issues of L1TF it might be necessary to disable SMT + completely.
Disabling SMT can have a significant performance impact, but + the impact depends on the hosting scenario and the type of workloads. + The impact of disabling SMT needs also to be weighted against the impact + of other mitigation solutions like confining guests to dedicated cores. + + The kernel provides a sysfs interface to retrieve the status of SMT and + to control it. It also provides a kernel command line interface to + control SMT. + + The kernel command line interface consists of the following options: + + =========== ========================================================== + nosmt Affects the bring up of the secondary CPUs during boot. The + kernel tries to bring all present CPUs online during the + boot process. "nosmt" makes sure that from each physical + core only one - the so called primary (hyper) thread is + activated. Due to a design flaw of Intel processors related + to Machine Check Exceptions the non primary siblings have + to be brought up at least partially and are then shut down + again. "nosmt" can be undone via the sysfs interface. + + nosmt=force Has the same effect as "nosmt" but it does not allow to + undo the SMT disable via the sysfs interface. + =========== ========================================================== + + The sysfs interface provides two files: + + - /sys/devices/system/cpu/smt/control + - /sys/devices/system/cpu/smt/active + + /sys/devices/system/cpu/smt/control: + + This file allows to read out the SMT control state and provides the + ability to disable or (re)enable SMT. The possible states are: + + ============== =================================================== + on SMT is supported by the CPU and enabled. All + logical CPUs can be onlined and offlined without + restrictions. + + off SMT is supported by the CPU and disabled. Only + the so called primary SMT threads can be onlined + and offlined without restrictions. An attempt to + online a non-primary sibling is rejected + + forceoff Same as 'off' but the state cannot be controlled. + Attempts to write to the control file are rejected. + + notsupported The processor does not support SMT. It's therefore + not affected by the SMT implications of L1TF. + Attempts to write to the control file are rejected. + ============== =================================================== + + The possible states which can be written into this file to control SMT + state are: + + - on + - off + - forceoff + + /sys/devices/system/cpu/smt/active: + + This file reports whether SMT is enabled and active, i.e. if on any + physical core two or more sibling threads are online. + + SMT control is also possible at boot time via the l1tf kernel command + line parameter in combination with L1D flush control. See + :ref:`mitigation_control_command_line`. + +5. Disabling EPT +^^^^^^^^^^^^^^^^ + + Disabling EPT for virtual machines provides full mitigation for L1TF even + with SMT enabled, because the effective page tables for guests are + managed and sanitized by the hypervisor. Though disabling EPT has a + significant performance impact especially when the Meltdown mitigation + KPTI is enabled. + + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. + +There is ongoing research and development for new mitigation mechanisms to +address the performance impact of disabling SMT or EPT. + +.. 
_mitigation_control_command_line: + +Mitigation control on the kernel command line +--------------------------------------------- + +The kernel command line allows to control the L1TF mitigations at boot +time with the option "l1tf=". The valid arguments for this option are: + + ============ ============================================================= + full Provides all available mitigations for the L1TF + vulnerability. Disables SMT and enables all mitigations in + the hypervisors, i.e. unconditional L1D flushing + + SMT control and L1D flush control via the sysfs interface + is still possible after boot. Hypervisors will issue a + warning when the first VM is started in a potentially + insecure configuration, i.e. SMT enabled or L1D flush + disabled. + + full,force Same as 'full', but disables SMT and L1D flush runtime + control. Implies the 'nosmt=force' command line option. + (i.e. sysfs control of SMT is disabled.) + + flush Leaves SMT enabled and enables the default hypervisor + mitigation, i.e. conditional L1D flushing + + SMT control and L1D flush control via the sysfs interface + is still possible after boot. Hypervisors will issue a + warning when the first VM is started in a potentially + insecure configuration, i.e. SMT enabled or L1D flush + disabled. + + flush,nosmt Disables SMT and enables the default hypervisor mitigation, + i.e. conditional L1D flushing. + + SMT control and L1D flush control via the sysfs interface + is still possible after boot. Hypervisors will issue a + warning when the first VM is started in a potentially + insecure configuration, i.e. SMT enabled or L1D flush + disabled. + + flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is + started in a potentially insecure configuration. + + off Disables hypervisor mitigations and doesn't emit any + warnings. + It also drops the swap size and available RAM limit restrictions + on both hypervisor and bare metal. + + ============ ============================================================= + +The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`. + + +.. _mitigation_control_kvm: + +Mitigation control for KVM - module parameter +------------------------------------------------------------- + +The KVM hypervisor mitigation mechanism, flushing the L1D cache when +entering a guest, can be controlled with a module parameter. + +The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the +following arguments: + + ============ ============================================================== + always L1D cache flush on every VMENTER. + + cond Flush L1D on VMENTER only when the code between VMEXIT and + VMENTER can leak host memory which is considered + interesting for an attacker. This still can leak host memory + which allows e.g. to determine the hosts address space layout. + + never Disables the mitigation + ============ ============================================================== + +The parameter can be provided on the kernel command line, as a module +parameter when loading the modules and at runtime modified via the sysfs +file: + +/sys/module/kvm_intel/parameters/vmentry_l1d_flush + +The default is 'cond'. If 'l1tf=full,force' is given on the kernel command +line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush +module parameter is ignored and writes to the sysfs file are rejected. + + +Mitigation selection guide +-------------------------- + +1. 
No virtualization in use +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + The system is protected by the kernel unconditionally and no further + action is required. + +2. Virtualization with trusted guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + If the guest comes from a trusted source and the guest OS kernel is + guaranteed to have the L1TF mitigations in place the system is fully + protected against L1TF and no further action is required. + + To avoid the overhead of the default L1D flushing on VMENTER the + administrator can disable the flushing via the kernel command line and + sysfs control files. See :ref:`mitigation_control_command_line` and + :ref:`mitigation_control_kvm`. + + +3. Virtualization with untrusted guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +3.1. SMT not supported or disabled +"""""""""""""""""""""""""""""""""" + + If SMT is not supported by the processor or disabled in the BIOS or by + the kernel, it's only required to enforce L1D flushing on VMENTER. + + Conditional L1D flushing is the default behaviour and can be tuned. See + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. + +3.2. EPT not supported or disabled +"""""""""""""""""""""""""""""""""" + + If EPT is not supported by the processor or disabled in the hypervisor, + the system is fully protected. SMT can stay enabled and L1D flushing on + VMENTER is not required. + + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. + +3.3. SMT and EPT supported and active +""""""""""""""""""""""""""""""""""""" + + If SMT and EPT are supported and active then various degrees of + mitigations can be employed: + + - L1D flushing on VMENTER: + + L1D flushing on VMENTER is the minimal protection requirement, but it + is only potent in combination with other mitigation methods. + + Conditional L1D flushing is the default behaviour and can be tuned. See + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. + + - Guest confinement: + + Confinement of guests to a single or a group of physical cores which + are not running any other processes, can reduce the attack surface + significantly, but interrupts, soft interrupts and kernel threads can + still expose valuable data to a potential attacker. See + :ref:`guest_confinement`. + + - Interrupt isolation: + + Isolating the guest CPUs from interrupts can reduce the attack surface + further, but still allows a malicious guest to explore a limited amount + of host physical memory. This can at least be used to gain knowledge + about the host address space layout. The interrupts which have a fixed + affinity to the CPUs which run the untrusted guests can depending on + the scenario still trigger soft interrupts and schedule kernel threads + which might expose valuable information. See + :ref:`interrupt_isolation`. + +The above three mitigation methods combined can provide protection to a +certain degree, but the risk of the remaining attack surface has to be +carefully analyzed. For full protection the following methods are +available: + + - Disabling SMT: + + Disabling SMT and enforcing the L1D flushing provides the maximum + amount of protection. This mitigation is not depending on any of the + above mitigation methods. + + SMT control and L1D flushing can be tuned by the command line + parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run + time with the matching sysfs control files. See :ref:`smt_control`, + :ref:`mitigation_control_command_line` and + :ref:`mitigation_control_kvm`. 
+ + - Disabling EPT: + + Disabling EPT provides the maximum amount of protection as well. It is + not depending on any of the above mitigation methods. SMT can stay + enabled and L1D flushing is not required, but the performance impact is + significant. + + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' + parameter. + +3.4. Nested virtual machines +"""""""""""""""""""""""""""" + +When nested virtualization is in use, three operating systems are involved: +the bare metal hypervisor, the nested hypervisor and the nested virtual +machine. VMENTER operations from the nested hypervisor into the nested +guest will always be processed by the bare metal hypervisor. If KVM is the +bare metal hypervisor it will: + + - Flush the L1D cache on every switch from the nested hypervisor to the + nested virtual machine, so that the nested hypervisor's secrets are not + exposed to the nested virtual machine; + + - Flush the L1D cache on every switch from the nested virtual machine to + the nested hypervisor; this is a complex operation, and flushing the L1D + cache avoids that the bare metal hypervisor's secrets are exposed to the + nested virtual machine; + + - Instruct the nested hypervisor to not perform any L1D cache flush. This + is an optimization to avoid double L1D flushing. + + +.. _default_mitigations: + +Default mitigations +------------------- + + The kernel default mitigations for vulnerable processors are: + + - PTE inversion to protect against malicious user space. This is done + unconditionally and cannot be controlled. The swap storage is limited + to ~16TB. + + - L1D conditional flushing on VMENTER when EPT is enabled for + a guest. + + The kernel does not by default enforce the disabling of SMT, which leaves + SMT systems vulnerable when running untrusted guests with EPT enabled. + + The rationale for this choice is: + + - Force disabling SMT can break existing setups, especially with + unattended updates. + + - If regular users run untrusted guests on their machine, then L1TF is + just an add on to other malware which might be embedded in an untrusted + guest, e.g. spam-bots or attacks on the local network. + + There is no technical way to prevent a user from running untrusted code + on their machines blindly. + + - It's technically extremely unlikely and from today's knowledge even + impossible that L1TF can be exploited via the most popular attack + mechanisms like JavaScript because these mechanisms have no way to + control PTEs. If this were possible and no other mitigation were + available, then the default might be different. + + - The administrators of cloud and hosting setups have to carefully + analyze the risk for their scenarios and make the appropriate + mitigation choices, which might even vary across their deployed + machines and also result in other changes of their overall setup. + There is no way for the kernel to provide a sensible default for this + kind of scenario. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 0a491676685e..42247516962a 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -17,14 +17,12 @@ etc. kernel-parameters devices -This section describes CPU vulnerabilities and provides an overview of the -possible mitigations along with guidance for selecting mitigations if they -are configurable at compile, boot or run time. +This section describes CPU vulnerabilities and their mitigations. ..
toctree:: :maxdepth: 1 - l1tf + hw-vuln/index Here is a set of documents aimed at users who are trying to track down problems and bugs in particular. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index dddb024eb523..9afcb240a673 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2114,7 +2114,7 @@ Default is 'flush'. - For details see: Documentation/admin-guide/l1tf.rst + For details see: Documentation/admin-guide/hw-vuln/l1tf.rst l2cr= [PPC] diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst deleted file mode 100644 index 9af977384168..000000000000 --- a/Documentation/admin-guide/l1tf.rst +++ /dev/null @@ -1,614 +0,0 @@ -L1TF - L1 Terminal Fault -======================== - -L1 Terminal Fault is a hardware vulnerability which allows unprivileged -speculative access to data which is available in the Level 1 Data Cache -when the page table entry controlling the virtual address, which is used -for the access, has the Present bit cleared or other reserved bits set. - -Affected processors -------------------- - -This vulnerability affects a wide range of Intel processors. The -vulnerability is not present on: - - - Processors from AMD, Centaur and other non Intel vendors - - - Older processor models, where the CPU family is < 6 - - - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft, - Penwell, Pineview, Silvermont, Airmont, Merrifield) - - - The Intel XEON PHI family - - - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the - IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected - by the Meltdown vulnerability either. These CPUs should become - available by end of 2018. - -Whether a processor is affected or not can be read out from the L1TF -vulnerability file in sysfs. See :ref:`l1tf_sys_info`. - -Related CVEs ------------- - -The following CVE entries are related to the L1TF vulnerability: - - ============= ================= ============================== - CVE-2018-3615 L1 Terminal Fault SGX related aspects - CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects - CVE-2018-3646 L1 Terminal Fault Virtualization related aspects - ============= ================= ============================== - -Problem -------- - -If an instruction accesses a virtual address for which the relevant page -table entry (PTE) has the Present bit cleared or other reserved bits set, -then speculative execution ignores the invalid PTE and loads the referenced -data if it is present in the Level 1 Data Cache, as if the page referenced -by the address bits in the PTE was still present and accessible. - -While this is a purely speculative mechanism and the instruction will raise -a page fault when it is retired eventually, the pure act of loading the -data and making it available to other speculative instructions opens up the -opportunity for side channel attacks to unprivileged malicious code, -similar to the Meltdown attack. - -While Meltdown breaks the user space to kernel space protection, L1TF -allows to attack any physical memory address in the system and the attack -works across all protection domains. It allows an attack of SGX and also -works from inside virtual machines because the speculation bypasses the -extended page table (EPT) protection mechanism. - - -Attack scenarios ----------------- - -1. 
Malicious user space -^^^^^^^^^^^^^^^^^^^^^^^ - - Operating Systems store arbitrary information in the address bits of a - PTE which is marked non present. This allows a malicious user space - application to attack the physical memory to which these PTEs resolve. - In some cases user-space can maliciously influence the information - encoded in the address bits of the PTE, thus making attacks more - deterministic and more practical. - - The Linux kernel contains a mitigation for this attack vector, PTE - inversion, which is permanently enabled and has no performance - impact. The kernel ensures that the address bits of PTEs, which are not - marked present, never point to cacheable physical memory space. - - A system with an up to date kernel is protected against attacks from - malicious user space applications. - -2. Malicious guest in a virtual machine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - The fact that L1TF breaks all domain protections allows malicious guest - OSes, which can control the PTEs directly, and malicious guest user - space applications, which run on an unprotected guest kernel lacking the - PTE inversion mitigation for L1TF, to attack physical host memory. - - A special aspect of L1TF in the context of virtualization is symmetric - multi threading (SMT). The Intel implementation of SMT is called - HyperThreading. The fact that Hyperthreads on the affected processors - share the L1 Data Cache (L1D) is important for this. As the flaw allows - only to attack data which is present in L1D, a malicious guest running - on one Hyperthread can attack the data which is brought into the L1D by - the context which runs on the sibling Hyperthread of the same physical - core. This context can be host OS, host user space or a different guest. - - If the processor does not support Extended Page Tables, the attack is - only possible, when the hypervisor does not sanitize the content of the - effective (shadow) page tables. - - While solutions exist to mitigate these attack vectors fully, these - mitigations are not enabled by default in the Linux kernel because they - can affect performance significantly. The kernel provides several - mechanisms which can be utilized to address the problem depending on the - deployment scenario. The mitigations, their protection scope and impact - are described in the next sections. - - The default mitigations and the rationale for choosing them are explained - at the end of this document. See :ref:`default_mitigations`. - -.. _l1tf_sys_info: - -L1TF system information ------------------------ - -The Linux kernel provides a sysfs interface to enumerate the current L1TF -status of the system: whether the system is vulnerable, and which -mitigations are active. 
The relevant sysfs file is: - -/sys/devices/system/cpu/vulnerabilities/l1tf - -The possible values in this file are: - - =========================== =============================== - 'Not affected' The processor is not vulnerable - 'Mitigation: PTE Inversion' The host protection is active - =========================== =============================== - -If KVM/VMX is enabled and the processor is vulnerable then the following -information is appended to the 'Mitigation: PTE Inversion' part: - - - SMT status: - - ===================== ================ - 'VMX: SMT vulnerable' SMT is enabled - 'VMX: SMT disabled' SMT is disabled - ===================== ================ - - - L1D Flush mode: - - ================================ ==================================== - 'L1D vulnerable' L1D flushing is disabled - - 'L1D conditional cache flushes' L1D flush is conditionally enabled - - 'L1D cache flushes' L1D flush is unconditionally enabled - ================================ ==================================== - -The resulting grade of protection is discussed in the following sections. - - -Host mitigation mechanism -------------------------- - -The kernel is unconditionally protected against L1TF attacks from malicious -user space running on the host. - - -Guest mitigation mechanisms ---------------------------- - -.. _l1d_flush: - -1. L1D flush on VMENTER -^^^^^^^^^^^^^^^^^^^^^^^ - - To make sure that a guest cannot attack data which is present in the L1D - the hypervisor flushes the L1D before entering the guest. - - Flushing the L1D evicts not only the data which should not be accessed - by a potentially malicious guest, it also flushes the guest - data. Flushing the L1D has a performance impact as the processor has to - bring the flushed guest data back into the L1D. Depending on the - frequency of VMEXIT/VMENTER and the type of computations in the guest - performance degradation in the range of 1% to 50% has been observed. For - scenarios where guest VMEXIT/VMENTER are rare the performance impact is - minimal. Virtio and mechanisms like posted interrupts are designed to - confine the VMEXITs to a bare minimum, but specific configurations and - application scenarios might still suffer from a high VMEXIT rate. - - The kernel provides two L1D flush modes: - - conditional ('cond') - - unconditional ('always') - - The conditional mode avoids L1D flushing after VMEXITs which execute - only audited code paths before the corresponding VMENTER. These code - paths have been verified that they cannot expose secrets or other - interesting data to an attacker, but they can leak information about the - address space layout of the hypervisor. - - Unconditional mode flushes L1D on all VMENTER invocations and provides - maximum protection. It has a higher overhead than the conditional - mode. The overhead cannot be quantified correctly as it depends on the - workload scenario and the resulting number of VMEXITs. - - The general recommendation is to enable L1D flush on VMENTER. The kernel - defaults to conditional mode on affected processors. - - **Note**, that L1D flush does not prevent the SMT problem because the - sibling thread will also bring back its data into the L1D which makes it - attackable again. - - L1D flush can be controlled by the administrator via the kernel command - line and sysfs control files. See :ref:`mitigation_control_command_line` - and :ref:`mitigation_control_kvm`. - -.. _guest_confinement: - -2. 
Guest VCPU confinement to dedicated physical cores -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - To address the SMT problem, it is possible to make a guest or a group of - guests affine to one or more physical cores. The proper mechanism for - that is to utilize exclusive cpusets to ensure that no other guest or - host tasks can run on these cores. - - If only a single guest or related guests run on sibling SMT threads on - the same physical core then they can only attack their own memory and - restricted parts of the host memory. - - Host memory is attackable, when one of the sibling SMT threads runs in - host OS (hypervisor) context and the other in guest context. The amount - of valuable information from the host OS context depends on the context - which the host OS executes, i.e. interrupts, soft interrupts and kernel - threads. The amount of valuable data from these contexts cannot be - declared as non-interesting for an attacker without deep inspection of - the code. - - **Note**, that assigning guests to a fixed set of physical cores affects - the ability of the scheduler to do load balancing and might have - negative effects on CPU utilization depending on the hosting - scenario. Disabling SMT might be a viable alternative for particular - scenarios. - - For further information about confining guests to a single or to a group - of cores consult the cpusets documentation: - - https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt - -.. _interrupt_isolation: - -3. Interrupt affinity -^^^^^^^^^^^^^^^^^^^^^ - - Interrupts can be made affine to logical CPUs. This is not universally - true because there are types of interrupts which are truly per CPU - interrupts, e.g. the local timer interrupt. Aside of that multi queue - devices affine their interrupts to single CPUs or groups of CPUs per - queue without allowing the administrator to control the affinities. - - Moving the interrupts, which can be affinity controlled, away from CPUs - which run untrusted guests, reduces the attack vector space. - - Whether the interrupts with are affine to CPUs, which run untrusted - guests, provide interesting data for an attacker depends on the system - configuration and the scenarios which run on the system. While for some - of the interrupts it can be assumed that they won't expose interesting - information beyond exposing hints about the host OS memory layout, there - is no way to make general assumptions. - - Interrupt affinity can be controlled by the administrator via the - /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is - available at: - - https://www.kernel.org/doc/Documentation/IRQ-affinity.txt - -.. _smt_control: - -4. SMT control -^^^^^^^^^^^^^^ - - To prevent the SMT issues of L1TF it might be necessary to disable SMT - completely. Disabling SMT can have a significant performance impact, but - the impact depends on the hosting scenario and the type of workloads. - The impact of disabling SMT needs also to be weighted against the impact - of other mitigation solutions like confining guests to dedicated cores. - - The kernel provides a sysfs interface to retrieve the status of SMT and - to control it. It also provides a kernel command line interface to - control SMT. - - The kernel command line interface consists of the following options: - - =========== ========================================================== - nosmt Affects the bring up of the secondary CPUs during boot. The - kernel tries to bring all present CPUs online during the - boot process. 
"nosmt" makes sure that from each physical - core only one - the so called primary (hyper) thread is - activated. Due to a design flaw of Intel processors related - to Machine Check Exceptions the non primary siblings have - to be brought up at least partially and are then shut down - again. "nosmt" can be undone via the sysfs interface. - - nosmt=force Has the same effect as "nosmt" but it does not allow to - undo the SMT disable via the sysfs interface. - =========== ========================================================== - - The sysfs interface provides two files: - - - /sys/devices/system/cpu/smt/control - - /sys/devices/system/cpu/smt/active - - /sys/devices/system/cpu/smt/control: - - This file allows to read out the SMT control state and provides the - ability to disable or (re)enable SMT. The possible states are: - - ============== =================================================== - on SMT is supported by the CPU and enabled. All - logical CPUs can be onlined and offlined without - restrictions. - - off SMT is supported by the CPU and disabled. Only - the so called primary SMT threads can be onlined - and offlined without restrictions. An attempt to - online a non-primary sibling is rejected - - forceoff Same as 'off' but the state cannot be controlled. - Attempts to write to the control file are rejected. - - notsupported The processor does not support SMT. It's therefore - not affected by the SMT implications of L1TF. - Attempts to write to the control file are rejected. - ============== =================================================== - - The possible states which can be written into this file to control SMT - state are: - - - on - - off - - forceoff - - /sys/devices/system/cpu/smt/active: - - This file reports whether SMT is enabled and active, i.e. if on any - physical core two or more sibling threads are online. - - SMT control is also possible at boot time via the l1tf kernel command - line parameter in combination with L1D flush control. See - :ref:`mitigation_control_command_line`. - -5. Disabling EPT -^^^^^^^^^^^^^^^^ - - Disabling EPT for virtual machines provides full mitigation for L1TF even - with SMT enabled, because the effective page tables for guests are - managed and sanitized by the hypervisor. Though disabling EPT has a - significant performance impact especially when the Meltdown mitigation - KPTI is enabled. - - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. - -There is ongoing research and development for new mitigation mechanisms to -address the performance impact of disabling SMT or EPT. - -.. _mitigation_control_command_line: - -Mitigation control on the kernel command line ---------------------------------------------- - -The kernel command line allows to control the L1TF mitigations at boot -time with the option "l1tf=". The valid arguments for this option are: - - ============ ============================================================= - full Provides all available mitigations for the L1TF - vulnerability. Disables SMT and enables all mitigations in - the hypervisors, i.e. unconditional L1D flushing - - SMT control and L1D flush control via the sysfs interface - is still possible after boot. Hypervisors will issue a - warning when the first VM is started in a potentially - insecure configuration, i.e. SMT enabled or L1D flush - disabled. - - full,force Same as 'full', but disables SMT and L1D flush runtime - control. Implies the 'nosmt=force' command line option. - (i.e. sysfs control of SMT is disabled.) 
- - flush Leaves SMT enabled and enables the default hypervisor - mitigation, i.e. conditional L1D flushing - - SMT control and L1D flush control via the sysfs interface - is still possible after boot. Hypervisors will issue a - warning when the first VM is started in a potentially - insecure configuration, i.e. SMT enabled or L1D flush - disabled. - - flush,nosmt Disables SMT and enables the default hypervisor mitigation, - i.e. conditional L1D flushing. - - SMT control and L1D flush control via the sysfs interface - is still possible after boot. Hypervisors will issue a - warning when the first VM is started in a potentially - insecure configuration, i.e. SMT enabled or L1D flush - disabled. - - flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is - started in a potentially insecure configuration. - - off Disables hypervisor mitigations and doesn't emit any - warnings. - It also drops the swap size and available RAM limit restrictions - on both hypervisor and bare metal. - - ============ ============================================================= - -The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`. - - -.. _mitigation_control_kvm: - -Mitigation control for KVM - module parameter -------------------------------------------------------------- - -The KVM hypervisor mitigation mechanism, flushing the L1D cache when -entering a guest, can be controlled with a module parameter. - -The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the -following arguments: - - ============ ============================================================== - always L1D cache flush on every VMENTER. - - cond Flush L1D on VMENTER only when the code between VMEXIT and - VMENTER can leak host memory which is considered - interesting for an attacker. This still can leak host memory - which allows e.g. to determine the hosts address space layout. - - never Disables the mitigation - ============ ============================================================== - -The parameter can be provided on the kernel command line, as a module -parameter when loading the modules and at runtime modified via the sysfs -file: - -/sys/module/kvm_intel/parameters/vmentry_l1d_flush - -The default is 'cond'. If 'l1tf=full,force' is given on the kernel command -line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush -module parameter is ignored and writes to the sysfs file are rejected. - - -Mitigation selection guide --------------------------- - -1. No virtualization in use -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - The system is protected by the kernel unconditionally and no further - action is required. - -2. Virtualization with trusted guests -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - If the guest comes from a trusted source and the guest OS kernel is - guaranteed to have the L1TF mitigations in place the system is fully - protected against L1TF and no further action is required. - - To avoid the overhead of the default L1D flushing on VMENTER the - administrator can disable the flushing via the kernel command line and - sysfs control files. See :ref:`mitigation_control_command_line` and - :ref:`mitigation_control_kvm`. - - -3. Virtualization with untrusted guests -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -3.1. SMT not supported or disabled -"""""""""""""""""""""""""""""""""" - - If SMT is not supported by the processor or disabled in the BIOS or by - the kernel, it's only required to enforce L1D flushing on VMENTER. 
- - Conditional L1D flushing is the default behaviour and can be tuned. See - :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. - -3.2. EPT not supported or disabled -"""""""""""""""""""""""""""""""""" - - If EPT is not supported by the processor or disabled in the hypervisor, - the system is fully protected. SMT can stay enabled and L1D flushing on - VMENTER is not required. - - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. - -3.3. SMT and EPT supported and active -""""""""""""""""""""""""""""""""""""" - - If SMT and EPT are supported and active then various degrees of - mitigations can be employed: - - - L1D flushing on VMENTER: - - L1D flushing on VMENTER is the minimal protection requirement, but it - is only potent in combination with other mitigation methods. - - Conditional L1D flushing is the default behaviour and can be tuned. See - :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. - - - Guest confinement: - - Confinement of guests to a single or a group of physical cores which - are not running any other processes, can reduce the attack surface - significantly, but interrupts, soft interrupts and kernel threads can - still expose valuable data to a potential attacker. See - :ref:`guest_confinement`. - - - Interrupt isolation: - - Isolating the guest CPUs from interrupts can reduce the attack surface - further, but still allows a malicious guest to explore a limited amount - of host physical memory. This can at least be used to gain knowledge - about the host address space layout. The interrupts which have a fixed - affinity to the CPUs which run the untrusted guests can depending on - the scenario still trigger soft interrupts and schedule kernel threads - which might expose valuable information. See - :ref:`interrupt_isolation`. - -The above three mitigation methods combined can provide protection to a -certain degree, but the risk of the remaining attack surface has to be -carefully analyzed. For full protection the following methods are -available: - - - Disabling SMT: - - Disabling SMT and enforcing the L1D flushing provides the maximum - amount of protection. This mitigation is not depending on any of the - above mitigation methods. - - SMT control and L1D flushing can be tuned by the command line - parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run - time with the matching sysfs control files. See :ref:`smt_control`, - :ref:`mitigation_control_command_line` and - :ref:`mitigation_control_kvm`. - - - Disabling EPT: - - Disabling EPT provides the maximum amount of protection as well. It is - not depending on any of the above mitigation methods. SMT can stay - enabled and L1D flushing is not required, but the performance impact is - significant. - - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' - parameter. - -3.4. Nested virtual machines -"""""""""""""""""""""""""""" - -When nested virtualization is in use, three operating systems are involved: -the bare metal hypervisor, the nested hypervisor and the nested virtual -machine. VMENTER operations from the nested hypervisor into the nested -guest will always be processed by the bare metal hypervisor. 
If KVM is the -bare metal hypervisor it will: - - - Flush the L1D cache on every switch from the nested hypervisor to the - nested virtual machine, so that the nested hypervisor's secrets are not - exposed to the nested virtual machine; - - - Flush the L1D cache on every switch from the nested virtual machine to - the nested hypervisor; this is a complex operation, and flushing the L1D - cache avoids that the bare metal hypervisor's secrets are exposed to the - nested virtual machine; - - - Instruct the nested hypervisor to not perform any L1D cache flush. This - is an optimization to avoid double L1D flushing. - - -.. _default_mitigations: - -Default mitigations -------------------- - - The kernel default mitigations for vulnerable processors are: - - - PTE inversion to protect against malicious user space. This is done - unconditionally and cannot be controlled. The swap storage is limited - to ~16TB. - - - L1D conditional flushing on VMENTER when EPT is enabled for - a guest. - - The kernel does not by default enforce the disabling of SMT, which leaves - SMT systems vulnerable when running untrusted guests with EPT enabled. - - The rationale for this choice is: - - - Force disabling SMT can break existing setups, especially with - unattended updates. - - - If regular users run untrusted guests on their machine, then L1TF is - just an add on to other malware which might be embedded in an untrusted - guest, e.g. spam-bots or attacks on the local network. - - There is no technical way to prevent a user from running untrusted code - on their machines blindly. - - - It's technically extremely unlikely and from today's knowledge even - impossible that L1TF can be exploited via the most popular attack - mechanisms like JavaScript because these mechanisms have no way to - control PTEs. If this would be possible and not other mitigation would - be possible, then the default might be different. - - - The administrators of cloud and hosting setups have to carefully - analyze the risk for their scenarios and make the appropriate - mitigation choices, which might even vary across their deployed - machines and also result in other changes of their overall setup. - There is no way for the kernel to provide a sensible default for this - kind of scenarios. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 95cda38c8785..373ae1dcd301 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1107,7 +1107,7 @@ static void __init l1tf_select_mitigation(void) pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", half_pa); pr_info("However, doing so will make a part of your RAM unusable.\n"); - pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n"); + pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n"); return; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 544bd24a9c1e..b0597507bde7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6801,8 +6801,8 @@ free_partial_vcpu: return ERR_PTR(err); } -#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" -#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. 
See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" static int vmx_vm_init(struct kvm *kvm) { -- cgit v1.2.3 From a07b20004793d8926f78d63eb5980559f7813404 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Nov 2018 17:40:30 +0000 Subject: vfs: syscall: Add open_tree(2) to reference or clone a mount open_tree(dfd, pathname, flags) Returns an O_PATH-opened file descriptor or an error. dfd and pathname specify the location to open, in usual fashion (see e.g. fstatat(2)). flags should be an OR of some of the following: * AT_EMPTY_PATH, AT_NO_AUTOMOUNT, AT_SYMLINK_NOFOLLOW - same meanings as usual * OPEN_TREE_CLOEXEC - make the resulting descriptor close-on-exec * OPEN_TREE_CLONE or OPEN_TREE_CLONE | AT_RECURSIVE - instead of opening the location in question, create a detached mount tree matching the subtree rooted at location specified by dfd/pathname. With AT_RECURSIVE the entire subtree is cloned, without it - only the part within the mount containing the location in question. In other words, the same as mount --rbind or mount --bind would've taken. The detached tree will be dissolved on the final close of the obtained file. Creation of such detached trees requires the same capabilities as doing mount --bind. Signed-off-by: Al Viro Signed-off-by: David Howells cc: linux-api@vger.kernel.org Signed-off-by: Al Viro --- arch/x86/entry/syscalls/syscall_32.tbl | 3 +- arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/file_table.c | 9 +- fs/internal.h | 1 + fs/namespace.c | 157 ++++++++++++++++++++++++++++----- include/linux/fs.h | 7 +- include/linux/syscalls.h | 1 + include/uapi/linux/fcntl.h | 2 + include/uapi/linux/mount.h | 6 ++ 9 files changed, 159 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 1f9607ed087c..ae2294d07ecb 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -398,7 +398,8 @@ 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq -# don't use numbers 387 through 392, add new calls at the end +387 i386 open_tree sys_open_tree __ia32_sys_open_tree +# don't use numbers 388 through 392, add new calls at the end 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 92ee0b4378d4..a6e06c35b5b1 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -343,6 +343,7 @@ 332 common statx __x64_sys_statx 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq +335 common open_tree __x64_sys_open_tree # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/file_table.c b/fs/file_table.c index 155d7514a094..3f9c1b452c1d 100644 ---
a/fs/file_table.c +++ b/fs/file_table.c @@ -255,6 +255,7 @@ static void __fput(struct file *file) struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = file->f_inode; + fmode_t mode = file->f_mode; if (unlikely(!(file->f_mode & FMODE_OPENED))) goto out; @@ -277,18 +278,20 @@ static void __fput(struct file *file) if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && - !(file->f_mode & FMODE_PATH))) { + !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); } fops_put(file->f_op); put_pid(file->f_owner.pid); - if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); - if (file->f_mode & FMODE_WRITER) { + if (mode & FMODE_WRITER) { put_write_access(inode); __mnt_drop_write(mnt); } dput(dentry); + if (unlikely(mode & FMODE_NEED_UNMOUNT)) + dissolve_on_fput(mnt); mntput(mnt); out: file_free(file); diff --git a/fs/internal.h b/fs/internal.h index 6a8b71643af4..f3a027c44758 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -94,6 +94,7 @@ extern int __mnt_want_write_file(struct file *); extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); +extern void dissolve_on_fput(struct vfsmount *); /* * fs_struct.c */ diff --git a/fs/namespace.c b/fs/namespace.c index c9cab307fa77..b804a1a497ee 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -20,6 +20,7 @@ #include /* init_rootfs */ #include /* get_fs_root et.al. */ #include /* fsnotify_vfsmount_delete */ +#include #include #include #include @@ -1832,6 +1833,21 @@ struct vfsmount *collect_mounts(const struct path *path) return &tree->mnt; } +static void free_mnt_ns(struct mnt_namespace *); +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); + +void dissolve_on_fput(struct vfsmount *mnt) +{ + struct mnt_namespace *ns; + namespace_lock(); + lock_mount_hash(); + ns = real_mount(mnt)->mnt_ns; + umount_tree(real_mount(mnt), UMOUNT_CONNECTED); + unlock_mount_hash(); + namespace_unlock(); + free_mnt_ns(ns); +} + void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); @@ -2222,6 +2238,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry) return false; } +static struct mount *__do_loopback(struct path *old_path, int recurse) +{ + struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); + + if (IS_MNT_UNBINDABLE(old)) + return mnt; + + if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) + return mnt; + + if (!recurse && has_locked_children(old, old_path->dentry)) + return mnt; + + if (recurse) + mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); + else + mnt = clone_mnt(old, old_path->dentry, 0); + + if (!IS_ERR(mnt)) + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + + return mnt; +} + /* * do loopback mount. 
*/ @@ -2229,7 +2269,7 @@ static int do_loopback(struct path *path, const char *old_name, int recurse) { struct path old_path; - struct mount *mnt = NULL, *old, *parent; + struct mount *mnt = NULL, *parent; struct mountpoint *mp; int err; if (!old_name || !*old_name) @@ -2243,38 +2283,21 @@ static int do_loopback(struct path *path, const char *old_name, goto out; mp = lock_mount(path); - err = PTR_ERR(mp); - if (IS_ERR(mp)) + if (IS_ERR(mp)) { + err = PTR_ERR(mp); goto out; + } - old = real_mount(old_path.mnt); parent = real_mount(path->mnt); - - err = -EINVAL; - if (IS_MNT_UNBINDABLE(old)) - goto out2; - if (!check_mnt(parent)) goto out2; - if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations) - goto out2; - - if (!recurse && has_locked_children(old, old_path.dentry)) - goto out2; - - if (recurse) - mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); - else - mnt = clone_mnt(old, old_path.dentry, 0); - + mnt = __do_loopback(&old_path, recurse); if (IS_ERR(mnt)) { err = PTR_ERR(mnt); goto out2; } - mnt->mnt.mnt_flags &= ~MNT_LOCKED; - err = graft_tree(mnt, parent, mp); if (err) { lock_mount_hash(); @@ -2288,6 +2311,96 @@ out: return err; } +static struct file *open_detached_copy(struct path *path, bool recursive) +{ + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); + struct mount *mnt, *p; + struct file *file; + + if (IS_ERR(ns)) + return ERR_CAST(ns); + + namespace_lock(); + mnt = __do_loopback(path, recursive); + if (IS_ERR(mnt)) { + namespace_unlock(); + free_mnt_ns(ns); + return ERR_CAST(mnt); + } + + lock_mount_hash(); + for (p = mnt; p; p = next_mnt(p, mnt)) { + p->mnt_ns = ns; + ns->mounts++; + } + ns->root = mnt; + list_add_tail(&ns->list, &mnt->mnt_list); + mntget(&mnt->mnt); + unlock_mount_hash(); + namespace_unlock(); + + mntput(path->mnt); + path->mnt = &mnt->mnt; + file = dentry_open(path, O_PATH, current_cred()); + if (IS_ERR(file)) + dissolve_on_fput(path->mnt); + else + file->f_mode |= FMODE_NEED_UNMOUNT; + return file; +} + +SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags) +{ + struct file *file; + struct path path; + int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + bool detached = flags & OPEN_TREE_CLONE; + int error; + int fd; + + BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); + + if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | + OPEN_TREE_CLOEXEC)) + return -EINVAL; + + if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) + return -EINVAL; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (detached && !may_mount()) + return -EPERM; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (unlikely(error)) { + file = ERR_PTR(error); + } else { + if (detached) + file = open_detached_copy(&path, flags & AT_RECURSIVE); + else + file = dentry_open(&path, O_PATH, current_cred()); + path_put(&path); + } + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + fd_install(fd, file); + return fd; +} + /* * Don't allow locked mount flags to be cleared. 
 * diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b42df09b04c..09b05ec5d059 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -162,10 +162,13 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) /* File is capable of returning -EAGAIN if I/O will block */ -#define FMODE_NOWAIT ((__force fmode_t)0x8000000) +#define FMODE_NOWAIT ((__force fmode_t)0x8000000) + +/* File represents mount that needs unmounting */ +#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000) /* File does not contribute to nr_files count */ -#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) +#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e446806a561f..6c29d586e66b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -985,6 +985,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); +asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index a2f8658f1c55..1d338357df8a 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -91,5 +91,7 @@ #define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ #define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ + #endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 3f9ec42510b0..fd7ae2e7eccf 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -55,4 +55,10 @@ #define MS_MGC_VAL 0xC0ED0000 #define MS_MGC_MSK 0xffff0000 +/* + * open_tree() flags. + */ +#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ +#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ + #endif /* _UAPI_LINUX_MOUNT_H */ -- cgit v1.2.3 From 2db154b3ea8e14b04fee23e3fdfd5e9d17fbc6ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Nov 2018 17:40:30 +0000 Subject: vfs: syscall: Add move_mount(2) to move mounts around Add a move_mount() system call that will move a mount from one place to another and, in the next commit, allow an unattached mount tree to be attached. A hypothetical pairing of the two calls is sketched below. 
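[ Editor's note: neither syscall has a libc wrapper at this point, so userspace would go through syscall(2). A minimal sketch of the intended pairing with open_tree() from the previous commit, assuming the x86_64 numbers from the tables in this series (335 for open_tree, 336 for move_mount) and the uapi constants these patches add; error handling is abbreviated, and the attach step only works once this commit and the next land: ]

	#include <fcntl.h>		/* AT_FDCWD, O_CLOEXEC */
	#include <unistd.h>
	#include <sys/syscall.h>

	#define __NR_open_tree	335		/* x86_64 table, previous commit */
	#define __NR_move_mount	336		/* x86_64 table, this commit */
	#define AT_RECURSIVE	0x8000		/* uapi fcntl.h hunk above */
	#define OPEN_TREE_CLONE	1		/* uapi mount.h hunk above */
	#define OPEN_TREE_CLOEXEC O_CLOEXEC
	#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* uapi mount.h hunk below */

	int main(void)
	{
		/* Take a detached, recursive copy of the tree at /mnt/src... */
		int fd = syscall(__NR_open_tree, AT_FDCWD, "/mnt/src",
				 OPEN_TREE_CLONE | AT_RECURSIVE | OPEN_TREE_CLOEXEC);
		if (fd < 0)
			return 1;
		/* ...and attach it at /mnt/dst; if this fails, the copy simply
		 * dissolves on the final close(fd). */
		if (syscall(__NR_move_mount, fd, "", AT_FDCWD, "/mnt/dst",
			    MOVE_MOUNT_F_EMPTY_PATH) < 0)
			return 1;
		close(fd);
		return 0;
	}
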
The new system call looks like the following: int move_mount(int from_dfd, const char *from_path, int to_dfd, const char *to_path, unsigned int flags); Signed-off-by: David Howells cc: linux-api@vger.kernel.org Signed-off-by: Al Viro --- arch/x86/entry/syscalls/syscall_32.tbl | 3 +- arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/namespace.c | 126 +++++++++++++++++++++++++-------- include/linux/lsm_hooks.h | 6 ++ include/linux/security.h | 7 ++ include/linux/syscalls.h | 3 + include/uapi/linux/mount.h | 11 +++ security/security.c | 5 ++ 8 files changed, 130 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ae2294d07ecb..0db9effb18d9 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -399,7 +399,8 @@ 385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq 387 i386 open_tree sys_open_tree __ia32_sys_open_tree -# don't use numbers 388 through 392, add new calls at the end +388 i386 move_mount sys_move_mount __ia32_sys_move_mount +# don't use numbers 389 through 392, add new calls at the end 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index a6e06c35b5b1..0440f0eefa02 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -344,6 +344,7 @@ 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq 335 common open_tree __x64_sys_open_tree +336 common move_mount __x64_sys_move_mount # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/namespace.c b/fs/namespace.c index b804a1a497ee..dc600f53de9d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2539,72 +2539,81 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -static int do_move_mount(struct path *path, const char *old_name) +static int do_move_mount(struct path *old_path, struct path *new_path) { - struct path old_path, parent_path; + struct path parent_path = {.mnt = NULL, .dentry = NULL}; struct mount *p; struct mount *old; struct mountpoint *mp; int err; - if (!old_name || !*old_name) - return -EINVAL; - err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); - if (err) - return err; - mp = lock_mount(path); - err = PTR_ERR(mp); + mp = lock_mount(new_path); if (IS_ERR(mp)) - goto out; + return PTR_ERR(mp); - old = real_mount(old_path.mnt); - p = real_mount(path->mnt); + old = real_mount(old_path->mnt); + p = real_mount(new_path->mnt); err = -EINVAL; if (!check_mnt(p) || !check_mnt(old)) - goto out1; + goto out; - if (old->mnt.mnt_flags & MNT_LOCKED) - goto out1; + if (!mnt_has_parent(old)) + goto out; - err = -EINVAL; - if (old_path.dentry != old_path.mnt->mnt_root) - goto out1; + if (old->mnt.mnt_flags & MNT_LOCKED) + goto out; - if (!mnt_has_parent(old)) - goto out1; + if (old_path->dentry != old_path->mnt->mnt_root) + goto out; - if (d_is_dir(path->dentry) != - d_is_dir(old_path.dentry)) - goto out1; + if (d_is_dir(new_path->dentry) != + d_is_dir(old_path->dentry)) + goto out; /* * Don't move a mount residing in a shared parent. 
 */ if (IS_MNT_SHARED(old->mnt_parent)) - goto out1; + goto out; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) - goto out1; + goto out; err = -ELOOP; for (; mnt_has_parent(p); p = p->mnt_parent) if (p == old) - goto out1; + goto out; - err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); + err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, + &parent_path); if (err) - goto out1; + goto out; /* if the mount is moved, it should no longer be expired * automatically */ list_del_init(&old->mnt_expire); -out1: - unlock_mount(mp); out: + unlock_mount(mp); if (!err) path_put(&parent_path); + return err; +} + +static int do_move_mount_old(struct path *path, const char *old_name) +{ + struct path old_path; + int err; + + if (!old_name || !*old_name) + return -EINVAL; + + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); + if (err) + return err; + + err = do_move_mount(&old_path, path); path_put(&old_path); return err; } @@ -3050,7 +3059,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&path, flags); else if (flags & MS_MOVE) - retval = do_move_mount(&path, dev_name); + retval = do_move_mount_old(&path, dev_name); else retval = do_new_mount(&path, type_page, sb_flags, mnt_flags, dev_name, data_page); @@ -3278,6 +3287,61 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, return ksys_mount(dev_name, dir_name, type, flags, data); } +/* + * Move a mount from one place to another. + * + * Note the flags value is a combination of MOVE_MOUNT_* flags. + */ +SYSCALL_DEFINE5(move_mount, + int, from_dfd, const char *, from_pathname, + int, to_dfd, const char *, to_pathname, + unsigned int, flags) +{ + struct path from_path, to_path; + unsigned int lflags; + int ret = 0; + + if (!may_mount()) + return -EPERM; + + if (flags & ~MOVE_MOUNT__MASK) + return -EINVAL; + + /* If someone gives a pathname, they aren't permitted to move + * from an fd that requires unmount as we can't get at the flag + * to clear it afterwards. + */ + lflags = 0; + if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + + ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); + if (ret < 0) + return ret; + + lflags = 0; + if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; + if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + + ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); + if (ret < 0) + goto out_from; + + ret = security_move_mount(&from_path, &to_path); + if (ret < 0) + goto out_to; + + ret = do_move_mount(&from_path, &to_path); + +out_to: + path_put(&to_path); +out_from: + path_put(&from_path); + return ret; +} + /* * Return true if path is reachable from root * diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index a9b8ff578b6b..cb33f81cf5a1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -160,6 +160,10 @@ * Parse a string of security data filling in the opts structure * @options string containing all mount options known by the LSM * @opts binary data structure usable by the LSM + * @move_mount: + * Check permission before a mount is moved. 
+ * @from_path indicates the mount that is going to be moved. + * @to_path indicates the mountpoint that will be mounted upon. * @dentry_init_security: * Compute a context for a dentry as the inode is not yet available * since NFSv4 has no label backed by an EA anyway. @@ -1501,6 +1505,7 @@ union security_list_options { unsigned long *set_kern_flags); int (*sb_add_mnt_opt)(const char *option, const char *val, int len, void **mnt_opts); + int (*move_mount)(const struct path *from_path, const struct path *to_path); int (*dentry_init_security)(struct dentry *dentry, int mode, const struct qstr *name, void **ctx, u32 *ctxlen); @@ -1835,6 +1840,7 @@ struct security_hook_heads { struct hlist_head sb_set_mnt_opts; struct hlist_head sb_clone_mnt_opts; struct hlist_head sb_add_mnt_opt; + struct hlist_head move_mount; struct hlist_head dentry_init_security; struct hlist_head dentry_create_files_as; #ifdef CONFIG_SECURITY_PATH diff --git a/include/linux/security.h b/include/linux/security.h index 49f2685324b0..1f2e06afc28f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -250,6 +250,7 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb, unsigned long *set_kern_flags); int security_add_mnt_opt(const char *option, const char *val, int len, void **mnt_opts); +int security_move_mount(const struct path *from_path, const struct path *to_path); int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, void **ctx, u32 *ctxlen); @@ -611,6 +612,12 @@ static inline int security_add_mnt_opt(const char *option, const char *val, return 0; } +static inline int security_move_mount(const struct path *from_path, + const struct path *to_path) +{ + return 0; +} + static inline int security_inode_alloc(struct inode *inode) { return 0; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 6c29d586e66b..84347fc0a1a7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -986,6 +986,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); +asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, + int to_dfd, const char __user *to_path, + unsigned int ms_flags); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index fd7ae2e7eccf..3634e065836c 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -61,4 +61,15 @@ #define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ +/* + * move_mount() flags. 
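+ *
+ * The MOVE_MOUNT_F_* flags govern the lookup of the from path and the
+ * MOVE_MOUNT_T_* flags the lookup of the to path; MOVE_MOUNT__MASK below
+ * covers both groups.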
+ */ +#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */ +#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */ +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ +#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ +#define MOVE_MOUNT__MASK 0x00000077 + #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/security/security.c b/security/security.c index 23cbb1a295a3..5b3d23e427b3 100644 --- a/security/security.c +++ b/security/security.c @@ -866,6 +866,11 @@ int security_add_mnt_opt(const char *option, const char *val, int len, } EXPORT_SYMBOL(security_add_mnt_opt); +int security_move_mount(const struct path *from_path, const struct path *to_path) +{ + return call_int_hook(move_mount, 0, from_path, to_path); +} + int security_inode_alloc(struct inode *inode) { int rc = lsm_inode_alloc(inode); -- cgit v1.2.3 From dadd2299ab61fc2b55b95b7b3a8f674cdd3b69c9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Nov 2018 17:40:31 +0000 Subject: Make anon_inodes unconditional Make the anon_inodes facility unconditional so that it can be used by core VFS code. Signed-off-by: David Howells Signed-off-by: Al Viro --- arch/arm/kvm/Kconfig | 1 - arch/arm64/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/kvm/Kconfig | 1 - drivers/base/Kconfig | 1 - drivers/char/tpm/Kconfig | 1 - drivers/dma-buf/Kconfig | 1 - drivers/gpio/Kconfig | 1 - drivers/iio/Kconfig | 1 - drivers/infiniband/Kconfig | 1 - drivers/vfio/Kconfig | 1 - fs/Makefile | 2 +- fs/notify/fanotify/Kconfig | 1 - fs/notify/inotify/Kconfig | 1 - init/Kconfig | 10 ---------- 18 files changed, 1 insertion(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 3f5320f46de2..f591026347a5 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on MMU && OF select PREEMPT_NOTIFIERS - select ANON_INODES select ARM_GIC select ARM_GIC_V3 select ARM_GIC_V3_ITS diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a3f85624313e..a67121d419a2 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM depends on OF select MMU_NOTIFIER select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 4528bc9c3cb1..eac25aef21e0 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM depends on MIPS_FP_SUPPORT select EXPORT_UASM select PREEMPT_NOTIFIERS - select ANON_INODES select KVM_GENERIC_DIRTYLOG_READ_PROTECT select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index bfdde04e4905..f53997a8ca62 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL select SRCU diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 767453faacfc..1816ee48eadd 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM prompt 
"Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_EVENTFD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c1f9b3cf437c..18f2c954464e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -44,7 +44,6 @@ config X86 # select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ANON_INODES select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 72fa955f4a15..fc042419e670 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,7 +27,6 @@ config KVM depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER - select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select IRQ_BYPASS_MANAGER diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 059700ea3521..03f067da12ee 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -174,7 +174,6 @@ source "drivers/base/regmap/Kconfig" config DMA_SHARED_BUFFER bool default n - select ANON_INODES select IRQ_WORK help This option enables the framework for buffer-sharing between diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index 536e55d3919f..f3e4bc490cf0 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -157,7 +157,6 @@ config TCG_CRB config TCG_VTPM_PROXY tristate "VTPM Proxy Interface" depends on TCG_TPM - select ANON_INODES ---help--- This driver proxies for an emulated TPM (vTPM) running in userspace. A device /dev/vtpmx is provided that creates a device pair diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig index 2e5a0faa2cb1..3fc9c2efc583 100644 --- a/drivers/dma-buf/Kconfig +++ b/drivers/dma-buf/Kconfig @@ -3,7 +3,6 @@ menu "DMABUF options" config SYNC_FILE bool "Explicit Synchronization Framework" default n - select ANON_INODES select DMA_SHARED_BUFFER ---help--- The Sync File Framework adds explicit syncronization via diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3f50526a771f..0f91600c27ae 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -12,7 +12,6 @@ config ARCH_HAVE_CUSTOM_GPIO_H menuconfig GPIOLIB bool "GPIO Support" - select ANON_INODES help This enables GPIO support through the generic GPIO library. You only need to enable this, if you also want to enable diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig index d08aeb41cd07..1dec0fecb6ef 100644 --- a/drivers/iio/Kconfig +++ b/drivers/iio/Kconfig @@ -4,7 +4,6 @@ menuconfig IIO tristate "Industrial I/O support" - select ANON_INODES help The industrial I/O subsystem provides a unified framework for drivers for many different types of embedded sensors using a diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a1fb840de45d..d318bab25860 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -25,7 +25,6 @@ config INFINIBAND_USER_MAD config INFINIBAND_USER_ACCESS tristate "InfiniBand userspace access (verbs and CM)" - select ANON_INODES depends on MMU ---help--- Userspace InfiniBand access support. 
This enables the diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 9de5ed38da83..3798d77d131c 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -22,7 +22,6 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM || ARM64) - select ANON_INODES help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. diff --git a/fs/Makefile b/fs/Makefile index 427fec226fae..35945f8139e6 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o -obj-$(CONFIG_ANON_INODES) += anon_inodes.o +obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 735bfb2e9190..521dc91d2cb5 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -1,7 +1,6 @@ config FANOTIFY bool "Filesystem wide access notification" select FSNOTIFY - select ANON_INODES select EXPORTFS default n ---help--- diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index b981fc0c8379..0161c74e76e2 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -1,6 +1,5 @@ config INOTIFY_USER bool "Inotify support for userspace" - select ANON_INODES select FSNOTIFY default y ---help--- diff --git a/init/Kconfig b/init/Kconfig index 4592bf7997c0..be8f97e37a76 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1171,9 +1171,6 @@ config LD_DEAD_CODE_DATA_ELIMINATION config SYSCTL bool -config ANON_INODES - bool - config HAVE_UID16 bool @@ -1378,14 +1375,12 @@ config HAVE_FUTEX_CMPXCHG config EPOLL bool "Enable eventpoll support" if EXPERT default y - select ANON_INODES help Disabling this option will cause the kernel to be built without support for epoll family of system calls. config SIGNALFD bool "Enable signalfd() system call" if EXPERT - select ANON_INODES default y help Enable the signalfd() system call that allows to receive signals @@ -1395,7 +1390,6 @@ config SIGNALFD config TIMERFD bool "Enable timerfd() system call" if EXPERT - select ANON_INODES default y help Enable the timerfd() system call that allows to receive timer @@ -1405,7 +1399,6 @@ config TIMERFD config EVENTFD bool "Enable eventfd() system call" if EXPERT - select ANON_INODES default y help Enable the eventfd() system call that allows to receive both @@ -1516,7 +1509,6 @@ config KALLSYMS_BASE_RELATIVE # syscall, maps, verifier config BPF_SYSCALL bool "Enable bpf() system call" - select ANON_INODES select BPF select IRQ_WORK default n @@ -1533,7 +1525,6 @@ config BPF_JIT_ALWAYS_ON config USERFAULTFD bool "Enable userfaultfd() system call" - select ANON_INODES depends on MMU help Enable the userfaultfd() system call that allows to intercept and @@ -1600,7 +1591,6 @@ config PERF_EVENTS bool "Kernel performance events and counters" default y if PROFILING depends on HAVE_PERF_EVENTS - select ANON_INODES select IRQ_WORK select SRCU help -- cgit v1.2.3 From 24dcb3d90a1f67fe08c68a004af37df059d74005 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Nov 2018 23:33:31 +0000 Subject: vfs: syscall: Add fsopen() to prepare for superblock creation Provide an fsopen() system call that starts the process of preparing to create a superblock that will then be mountable, using an fd as a context handle. 
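[ Editor's note: no libc wrapper exists for this call at this point; userspace reaches it via syscall(2). A minimal hypothetical wrapper, assuming the x86_64 number 337 assigned by this patch and the FSOPEN_CLOEXEC value from its uapi hunk: ]

	#include <unistd.h>
	#include <sys/syscall.h>

	#define __NR_fsopen	337		/* x86_64 table below */
	#define FSOPEN_CLOEXEC	0x00000001	/* uapi mount.h hunk below */

	static int fsopen(const char *fsname, unsigned int flags)
	{
		return syscall(__NR_fsopen, fsname, flags);
	}
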
fsopen() is given the name of the filesystem that will be used: int mfd = fsopen(const char *fsname, unsigned int flags); where flags can be 0 or FSOPEN_CLOEXEC. For example: sfd = fsopen("ext4", FSOPEN_CLOEXEC); fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD); fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0); fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0); fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0); fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0); fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); fsinfo(sfd, NULL, ...); // query new superblock attributes mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME); move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); sfd = fsopen("afs", 0); fsconfig(sfd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell", 0); fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); mfd = fsmount(sfd, 0, MS_NODEV); move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); If an error is reported at any step, an error message may be available to be read() back (ENODATA will be reported if there isn't an error available) in the form: "e <subsys>:<problem>" "e SELinux:Mount on mountpoint not permitted" Once fsmount() has been called, further fsconfig() calls will incur EBUSY, even if the fsmount() fails; read() is still possible to retrieve error information. The fsopen() syscall creates a mount context and hangs it off the fd that it returns. Netlink is not used because it is optional and would make the core VFS dependent on the networking layer and also potentially add network namespace issues. Note that, for the moment, the caller must have CAP_SYS_ADMIN to use fsopen(). Signed-off-by: David Howells cc: linux-api@vger.kernel.org Signed-off-by: Al Viro --- arch/x86/entry/syscalls/syscall_32.tbl | 3 +- arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/Makefile | 2 +- fs/fs_context.c | 4 ++ fs/fsopen.c | 88 ++++++++++++++++++++++++++++++++++ include/linux/fs_context.h | 16 +++++++ include/linux/syscalls.h | 1 + include/uapi/linux/mount.h | 5 ++ 8 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 fs/fsopen.c (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 0db9effb18d9..37fd1fc5396e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -400,7 +400,8 @@ 386 i386 rseq sys_rseq __ia32_sys_rseq 387 i386 open_tree sys_open_tree __ia32_sys_open_tree 388 i386 move_mount sys_move_mount __ia32_sys_move_mount -# don't use numbers 389 through 392, add new calls at the end +389 i386 fsopen sys_fsopen __ia32_sys_fsopen +# don't use numbers 390 through 392, add new calls at the end 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 0440f0eefa02..511608a21611 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 334 common rseq __x64_sys_rseq 335 common open_tree __x64_sys_open_tree 336 common move_mount __x64_sys_move_mount +337 common fsopen __x64_sys_fsopen # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/Makefile b/fs/Makefile index 35945f8139e6..5a51bc2489ba 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -13,7 +13,7 @@ obj-y := open.o 
read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o fs_context.o fs_parser.o + fs_types.o fs_context.o fs_parser.o fsopen.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/fs_context.c b/fs/fs_context.c index 87e3546b9a52..eb806fae3117 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -271,6 +271,8 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, fc->cred = get_current_cred(); fc->net_ns = get_net(current->nsproxy->net_ns); + mutex_init(&fc->uapi_mutex); + switch (purpose) { case FS_CONTEXT_FOR_MOUNT: fc->user_ns = get_user_ns(fc->cred->user_ns); @@ -353,6 +355,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) if (!fc) return ERR_PTR(-ENOMEM); + mutex_init(&fc->uapi_mutex); + fc->fs_private = NULL; fc->s_fs_info = NULL; fc->source = NULL; diff --git a/fs/fsopen.c b/fs/fsopen.c new file mode 100644 index 000000000000..d256f1ac9ff1 --- /dev/null +++ b/fs/fsopen.c @@ -0,0 +1,88 @@ +/* Filesystem access-by-fd. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mount.h" + +static int fscontext_release(struct inode *inode, struct file *file) +{ + struct fs_context *fc = file->private_data; + + if (fc) { + file->private_data = NULL; + put_fs_context(fc); + } + return 0; +} + +const struct file_operations fscontext_fops = { + .release = fscontext_release, + .llseek = no_llseek, +}; + +/* + * Attach a filesystem context to a file and an fd. + */ +static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags) +{ + int fd; + + fd = anon_inode_getfd("fscontext", &fscontext_fops, fc, + O_RDWR | o_flags); + if (fd < 0) + put_fs_context(fc); + return fd; +} + +/* + * Open a filesystem by name so that it can be configured for mounting. + * + * We are allowed to specify a container in which the filesystem will be + * opened, thereby indicating which namespaces will be used (notably, which + * network namespace will be used for network filesystems). + */ +SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) +{ + struct file_system_type *fs_type; + struct fs_context *fc; + const char *fs_name; + + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + if (flags & ~FSOPEN_CLOEXEC) + return -EINVAL; + + fs_name = strndup_user(_fs_name, PAGE_SIZE); + if (IS_ERR(fs_name)) + return PTR_ERR(fs_name); + + fs_type = get_fs_type(fs_name); + kfree(fs_name); + if (!fs_type) + return -ENODEV; + + fc = fs_context_for_mount(fs_type, 0); + put_filesystem(fs_type); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + fc->phase = FS_CONTEXT_CREATE_PARAMS; + return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? 
O_CLOEXEC : 0); +} diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index eaca452088fa..7ab8b44fab3e 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -15,6 +15,7 @@ #include #include #include +#include struct cred; struct dentry; @@ -34,6 +35,19 @@ enum fs_context_purpose { FS_CONTEXT_FOR_RECONFIGURE, /* Superblock reconfiguration (remount) */ }; +/* + * Userspace usage phase for fsopen/fspick. + */ +enum fs_context_phase { + FS_CONTEXT_CREATE_PARAMS, /* Loading params for sb creation */ + FS_CONTEXT_CREATING, /* A superblock is being created */ + FS_CONTEXT_AWAITING_MOUNT, /* Superblock created, awaiting fsmount() */ + FS_CONTEXT_AWAITING_RECONF, /* Awaiting initialisation for reconfiguration */ + FS_CONTEXT_RECONF_PARAMS, /* Loading params for reconfiguration */ + FS_CONTEXT_RECONFIGURING, /* Reconfiguring the superblock */ + FS_CONTEXT_FAILED, /* Failed to correctly transition a context */ +}; + /* * Type of parameter value. */ @@ -74,6 +88,7 @@ struct fs_parameter { */ struct fs_context { const struct fs_context_operations *ops; + struct mutex uapi_mutex; /* Userspace access mutex */ struct file_system_type *fs_type; void *fs_private; /* The filesystem's context */ struct dentry *root; /* The root and superblock */ @@ -88,6 +103,7 @@ struct fs_context { unsigned int sb_flags_mask; /* Superblock flags that were changed */ unsigned int lsm_flags; /* Information flags from the fs to the LSM */ enum fs_context_purpose purpose:8; + enum fs_context_phase phase:8; /* The phase the context is in */ bool need_free:1; /* Need to call ops->free() */ bool global:1; /* Goes into &init_user_ns */ }; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 84347fc0a1a7..0c9bd5427e8f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -989,6 +989,7 @@ asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, int to_dfd, const char __user *to_path, unsigned int ms_flags); +asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 3634e065836c..7570df43d08f 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -72,4 +72,9 @@ #define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ #define MOVE_MOUNT__MASK 0x00000077 +/* + * fsopen() flags. + */ +#define FSOPEN_CLOEXEC 0x00000001 + #endif /* _UAPI_LINUX_MOUNT_H */ -- cgit v1.2.3 From ecdab150fddb42fe6a739335257949220033b782 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Nov 2018 23:36:09 +0000 Subject: vfs: syscall: Add fsconfig() for configuring and managing a context Add a syscall for configuring a filesystem creation context and triggering actions upon it, to be used in conjunction with fsopen, fspick and fsmount. long fsconfig(int fs_fd, unsigned int cmd, const char *key, const void *value, int aux); Where fs_fd indicates the context, cmd indicates the action to take, key indicates the parameter name for parameter-setting actions and, if needed, value points to a buffer containing the value and aux can give more information for the value. The following command IDs are proposed: (*) FSCONFIG_SET_FLAG: No value is specified. The parameter must be boolean in nature. The key may be prefixed with "no" to invert the setting. 
value must be NULL and aux must be 0. (*) FSCONFIG_SET_STRING: A string value is specified. The parameter may expect a boolean, an integer or a string, or it may take a path. A conversion to an appropriate type will be attempted (which may include looking up as a path). value points to a NUL-terminated string and aux must be 0. (*) FSCONFIG_SET_BINARY: A binary blob is specified. value points to the blob and aux indicates its size. The parameter must be expecting a blob. (*) FSCONFIG_SET_PATH: A non-empty path is specified. The parameter must be expecting a path object. value points to a NUL-terminated string that is the path and aux is a file descriptor at which to start a relative lookup or AT_FDCWD. (*) FSCONFIG_SET_PATH_EMPTY: As fsconfig_set_path, but with AT_EMPTY_PATH implied. (*) FSCONFIG_SET_FD: An open file descriptor is specified. value must be NULL and aux indicates the file descriptor. (*) FSCONFIG_CMD_CREATE: Trigger superblock creation. (*) FSCONFIG_CMD_RECONFIGURE: Trigger superblock reconfiguration. For the "set" command IDs, the idea is that the file_system_type will point to a list of parameters and the types of value that those parameters expect to take. The core code can then do the parse and argument conversion and then give the LSM and FS a cooked option or array of options to use. Source specification is also done the same way, using special keys "source", "source1", "source2", etc. [!] Note that, for the moment, the key and value are just glued back together and handed to the filesystem. Every filesystem that uses options uses match_token() and co. to do this, and this will need to be changed - but not all at once. Example usage: fd = fsopen("ext4", FSOPEN_CLOEXEC); fsconfig(fd, fsconfig_set_path, "source", "/dev/sda1", AT_FDCWD); fsconfig(fd, fsconfig_set_path_empty, "journal_path", "", journal_fd); fsconfig(fd, fsconfig_set_fd, "journal_fd", NULL, journal_fd); fsconfig(fd, fsconfig_set_flag, "user_xattr", NULL, 0); fsconfig(fd, fsconfig_set_flag, "noacl", NULL, 0); fsconfig(fd, fsconfig_set_string, "sb", "1", 0); fsconfig(fd, fsconfig_set_string, "errors", "continue", 0); fsconfig(fd, fsconfig_set_string, "data", "journal", 0); fsconfig(fd, fsconfig_set_string, "context", "unconfined_u:...", 0); fsconfig(fd, fsconfig_cmd_create, NULL, NULL, 0); mfd = fsmount(fd, FSMOUNT_CLOEXEC, MS_NOEXEC); or: fd = fsopen("ext4", FSOPEN_CLOEXEC); fsconfig(fd, fsconfig_set_string, "source", "/dev/sda1", 0); fsconfig(fd, fsconfig_cmd_create, NULL, NULL, 0); mfd = fsmount(fd, FSMOUNT_CLOEXEC, MS_NOEXEC); or: fd = fsopen("afs", FSOPEN_CLOEXEC); fsconfig(fd, fsconfig_set_string, "source", "#grand.central.org:root.cell", 0); fsconfig(fd, fsconfig_cmd_create, NULL, NULL, 0); mfd = fsmount(fd, FSMOUNT_CLOEXEC, MS_NOEXEC); or: fd = fsopen("jffs2", FSOPEN_CLOEXEC); fsconfig(fd, fsconfig_set_string, "source", "mtd0", 0); fsconfig(fd, fsconfig_cmd_create, NULL, NULL, 0); mfd = fsmount(fd, FSMOUNT_CLOEXEC, MS_NOEXEC); Signed-off-by: David Howells cc: linux-api@vger.kernel.org Signed-off-by: Al Viro --- arch/x86/entry/syscalls/syscall_32.tbl | 3 +- arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/fs_context.c | 51 +++++++ fs/fsopen.c | 265 +++++++++++++++++++++++++++++++++ fs/internal.h | 3 + include/linux/syscalls.h | 2 + include/uapi/linux/mount.h | 14 ++ 7 files changed, 338 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 37fd1fc5396e..786728143205 100644 --- 
a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -401,7 +401,8 @@ 387 i386 open_tree sys_open_tree __ia32_sys_open_tree 388 i386 move_mount sys_move_mount __ia32_sys_move_mount 389 i386 fsopen sys_fsopen __ia32_sys_fsopen -# don't use numbers 390 through 392, add new calls at the end +390 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig +# don't use numbers 391 through 392, add new calls at the end 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 511608a21611..7039a809d37d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -346,6 +346,7 @@ 335 common open_tree __x64_sys_open_tree 336 common move_mount __x64_sys_move_mount 337 common fsopen __x64_sys_fsopen +338 common fsconfig __x64_sys_fsconfig # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/fs_context.c b/fs/fs_context.c index dcf3786f90f9..a47ccd5a4a78 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -721,3 +721,54 @@ int parse_monolithic_mount_data(struct fs_context *fc, void *data) return monolithic_mount_data(fc, data); } + +/* + * Clean up a context after performing an action on it and put it into a state + * from where it can be used to reconfigure a superblock. + * + * Note that here we do only the parts that can't fail; the rest is in + * finish_clean_context() below and in between those fs_context is marked + * FS_CONTEXT_AWAITING_RECONF. The reason for splitup is that after + * successful mount or remount we need to report success to userland. + * Trying to do full reinit (for the sake of possible subsequent remount) + * and failing to allocate memory would've put us into a nasty situation. + * So here we only discard the old state and reinitialization is left + * until we actually try to reconfigure. + */ +void vfs_clean_context(struct fs_context *fc) +{ + if (fc->need_free && fc->ops && fc->ops->free) + fc->ops->free(fc); + fc->need_free = false; + fc->fs_private = NULL; + fc->s_fs_info = NULL; + fc->sb_flags = 0; + security_free_mnt_opts(&fc->security); + kfree(fc->subtype); + fc->subtype = NULL; + kfree(fc->source); + fc->source = NULL; + + fc->purpose = FS_CONTEXT_FOR_RECONFIGURE; + fc->phase = FS_CONTEXT_AWAITING_RECONF; +} + +int finish_clean_context(struct fs_context *fc) +{ + int error; + + if (fc->phase != FS_CONTEXT_AWAITING_RECONF) + return 0; + + if (fc->fs_type->init_fs_context) + error = fc->fs_type->init_fs_context(fc); + else + error = legacy_init_fs_context(fc); + if (unlikely(error)) { + fc->phase = FS_CONTEXT_FAILED; + return error; + } + fc->need_free = true; + fc->phase = FS_CONTEXT_RECONF_PARAMS; + return 0; +} diff --git a/fs/fsopen.c b/fs/fsopen.c index 5fce6347de7a..65cc2f68f994 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include "internal.h" #include "mount.h" /* @@ -153,3 +155,266 @@ err_fc: put_fs_context(fc); return ret; } + +/* + * Check the state and apply the configuration. Note that this function is + * allowed to 'steal' the value by setting param->xxx to NULL before returning. 
+ */ +static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, + struct fs_parameter *param) +{ + struct super_block *sb; + int ret; + + ret = finish_clean_context(fc); + if (ret) + return ret; + switch (cmd) { + case FSCONFIG_CMD_CREATE: + if (fc->phase != FS_CONTEXT_CREATE_PARAMS) + return -EBUSY; + fc->phase = FS_CONTEXT_CREATING; + ret = vfs_get_tree(fc); + if (ret) + break; + sb = fc->root->d_sb; + ret = security_sb_kern_mount(sb); + if (unlikely(ret)) { + fc_drop_locked(fc); + break; + } + up_write(&sb->s_umount); + fc->phase = FS_CONTEXT_AWAITING_MOUNT; + return 0; + case FSCONFIG_CMD_RECONFIGURE: + if (fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + fc->phase = FS_CONTEXT_RECONFIGURING; + sb = fc->root->d_sb; + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + ret = -EPERM; + break; + } + down_write(&sb->s_umount); + ret = reconfigure_super(fc); + up_write(&sb->s_umount); + if (ret) + break; + vfs_clean_context(fc); + return 0; + default: + if (fc->phase != FS_CONTEXT_CREATE_PARAMS && + fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + + return vfs_parse_fs_param(fc, param); + } + fc->phase = FS_CONTEXT_FAILED; + return ret; +} + +/** + * sys_fsconfig - Set parameters and trigger actions on a context + * @fd: The filesystem context to act upon + * @cmd: The action to take + * @_key: Where appropriate, the parameter key to set + * @_value: Where appropriate, the parameter value to set + * @aux: Additional information for the value + * + * This system call is used to set parameters on a context, including + * superblock settings, data source and security labelling. + * + * Actions include triggering the creation of a superblock and the + * reconfiguration of the superblock attached to the specified context. + * + * When setting a parameter, @cmd indicates the type of value being proposed + * and @_key indicates the parameter to be altered. + * + * @_value and @aux are used to specify the value, should a value be required: + * + * (*) fsconfig_set_flag: No value is specified. The parameter must be boolean + * in nature. The key may be prefixed with "no" to invert the + * setting. @_value must be NULL and @aux must be 0. + * + * (*) fsconfig_set_string: A string value is specified. The parameter can be + * expecting boolean, integer, string or take a path. A conversion to an + * appropriate type will be attempted (which may include looking up as a + * path). @_value points to a NUL-terminated string and @aux must be 0. + * + * (*) fsconfig_set_binary: A binary blob is specified. @_value points to the + * blob and @aux indicates its size. The parameter must be expecting a + * blob. + * + * (*) fsconfig_set_path: A non-empty path is specified. The parameter must be + * expecting a path object. @_value points to a NUL-terminated string that + * is the path and @aux is a file descriptor at which to start a relative + * lookup or AT_FDCWD. + * + * (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH + * implied. + * + * (*) fsconfig_set_fd: An open file descriptor is specified. @_value must be + * NULL and @aux indicates the file descriptor. 
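+ *
+ * For example (illustrative, mirroring the examples in the commit message):
+ *
+ *	fsconfig(fd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);
+ *	fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);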
+ */ +SYSCALL_DEFINE5(fsconfig, + int, fd, + unsigned int, cmd, + const char __user *, _key, + const void __user *, _value, + int, aux) +{ + struct fs_context *fc; + struct fd f; + int ret; + + struct fs_parameter param = { + .type = fs_value_is_undefined, + }; + + if (fd < 0) + return -EINVAL; + + switch (cmd) { + case FSCONFIG_SET_FLAG: + if (!_key || _value || aux) + return -EINVAL; + break; + case FSCONFIG_SET_STRING: + if (!_key || !_value || aux) + return -EINVAL; + break; + case FSCONFIG_SET_BINARY: + if (!_key || !_value || aux <= 0 || aux > 1024 * 1024) + return -EINVAL; + break; + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + if (!_key || !_value || (aux != AT_FDCWD && aux < 0)) + return -EINVAL; + break; + case FSCONFIG_SET_FD: + if (!_key || _value || aux < 0) + return -EINVAL; + break; + case FSCONFIG_CMD_CREATE: + case FSCONFIG_CMD_RECONFIGURE: + if (_key || _value || aux) + return -EINVAL; + break; + default: + return -EOPNOTSUPP; + } + + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -EINVAL; + if (f.file->f_op != &fscontext_fops) + goto out_f; + + fc = f.file->private_data; + if (fc->ops == &legacy_fs_context_ops) { + switch (cmd) { + case FSCONFIG_SET_BINARY: + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + case FSCONFIG_SET_FD: + ret = -EOPNOTSUPP; + goto out_f; + } + } + + if (_key) { + param.key = strndup_user(_key, 256); + if (IS_ERR(param.key)) { + ret = PTR_ERR(param.key); + goto out_f; + } + } + + switch (cmd) { + case FSCONFIG_SET_FLAG: + param.type = fs_value_is_flag; + break; + case FSCONFIG_SET_STRING: + param.type = fs_value_is_string; + param.string = strndup_user(_value, 256); + if (IS_ERR(param.string)) { + ret = PTR_ERR(param.string); + goto out_key; + } + param.size = strlen(param.string); + break; + case FSCONFIG_SET_BINARY: + param.type = fs_value_is_blob; + param.size = aux; + param.blob = memdup_user_nul(_value, aux); + if (IS_ERR(param.blob)) { + ret = PTR_ERR(param.blob); + goto out_key; + } + break; + case FSCONFIG_SET_PATH: + param.type = fs_value_is_filename; + param.name = getname_flags(_value, 0, NULL); + if (IS_ERR(param.name)) { + ret = PTR_ERR(param.name); + goto out_key; + } + param.dirfd = aux; + param.size = strlen(param.name->name); + break; + case FSCONFIG_SET_PATH_EMPTY: + param.type = fs_value_is_filename_empty; + param.name = getname_flags(_value, LOOKUP_EMPTY, NULL); + if (IS_ERR(param.name)) { + ret = PTR_ERR(param.name); + goto out_key; + } + param.dirfd = aux; + param.size = strlen(param.name->name); + break; + case FSCONFIG_SET_FD: + param.type = fs_value_is_file; + ret = -EBADF; + param.file = fget(aux); + if (!param.file) + goto out_key; + break; + default: + break; + } + + ret = mutex_lock_interruptible(&fc->uapi_mutex); + if (ret == 0) { + ret = vfs_fsconfig_locked(fc, cmd, &param); + mutex_unlock(&fc->uapi_mutex); + } + + /* Clean up our record of any value that we obtained from + * userspace. Note that the value may have been stolen by the LSM or + * filesystem, in which case the value pointer will have been cleared. 
+ */ + switch (cmd) { + case FSCONFIG_SET_STRING: + case FSCONFIG_SET_BINARY: + kfree(param.string); + break; + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + if (param.name) + putname(param.name); + break; + case FSCONFIG_SET_FD: + if (param.file) + fput(param.file); + break; + default: + break; + } +out_key: + kfree(param.key); +out_f: + fdput(f); + return ret; +} diff --git a/fs/internal.h b/fs/internal.h index f3a027c44758..95cf7b0af21f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,8 +55,11 @@ extern void __init chrdev_init(void); /* * fs_context.c */ +extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); extern void fc_drop_locked(struct fs_context *); +extern void vfs_clean_context(struct fs_context *fc); +extern int finish_clean_context(struct fs_context *fc); /* * namei.c diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0c9bd5427e8f..925f9dfc356b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -990,6 +990,8 @@ asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, int to_dfd, const char __user *to_path, unsigned int ms_flags); asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags); +asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key, + const void __user *value, int aux); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 7570df43d08f..4b90ba9d1770 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -77,4 +77,18 @@ */ #define FSOPEN_CLOEXEC 0x00000001 +/* + * The type of fsconfig() call made. + */ +enum fsconfig_command { + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ + FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ + FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ +}; + #endif /* _UAPI_LINUX_MOUNT_H */ -- cgit v1.2.3 From 93766fbd2696c2c4453dd8e1070977e9cd4e6b6d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Nov 2018 23:36:14 +0000 Subject: vfs: syscall: Add fsmount() to create a mount for a superblock Provide a system call by which a filesystem opened with fsopen() and configured by a series of fsconfig() calls can have a detached mount object created for it. This mount object can then be attached to the VFS mount hierarchy using move_mount() by passing the returned file descriptor as the from directory fd. The system call looks like: int mfd = fsmount(int fsfd, unsigned int flags, unsigned int attr_flags); where fsfd is the file descriptor returned by fsopen(). flags can be 0 or FSMOUNT_CLOEXEC. 
attr_flags is a bitwise-OR of the following flags: MOUNT_ATTR_RDONLY Mount read-only MOUNT_ATTR_NOSUID Ignore suid and sgid bits MOUNT_ATTR_NODEV Disallow access to device special files MOUNT_ATTR_NOEXEC Disallow program execution MOUNT_ATTR__ATIME Setting on how atime should be updated MOUNT_ATTR_RELATIME - Update atime relative to mtime/ctime MOUNT_ATTR_NOATIME - Do not update access times MOUNT_ATTR_STRICTATIME - Always perform atime updates MOUNT_ATTR_NODIRATIME Do not update directory access times In the event that fsmount() fails, it may be possible to get an error message by calling read() on fsfd. If no message is available, ENODATA will be reported. Signed-off-by: David Howells cc: linux-api@vger.kernel.org Signed-off-by: Al Viro --- arch/x86/entry/syscalls/syscall_32.tbl | 3 +- arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/namespace.c | 146 ++++++++++++++++++++++++++++++++- include/linux/syscalls.h | 1 + include/uapi/linux/mount.h | 18 ++++ 5 files changed, 165 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 786728143205..5b5c9189c507 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -402,7 +402,8 @@ 388 i386 move_mount sys_move_mount __ia32_sys_move_mount 389 i386 fsopen sys_fsopen __ia32_sys_fsopen 390 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig -# don't use numbers 391 through 392, add new calls at the end +391 i386 fsmount sys_fsmount __ia32_sys_fsmount +# don't use number 392, add new calls at the end 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 7039a809d37d..984ad594bb2b 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -347,6 +347,7 @@ 336 common move_mount __x64_sys_move_mount 337 common fsopen __x64_sys_fsopen 338 common fsconfig __x64_sys_fsconfig +339 common fsmount __x64_sys_fsmount # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/namespace.c b/fs/namespace.c index 1e72d19fa4f8..3357c3d65475 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3334,9 +3334,149 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, } /* - * Move a mount from one place to another. - * In combination with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be - * used to copy a mount subtree. + * Create a kernel mount representation for a new, prepared superblock + * (specified by fs_fd) and attach to an open_tree-like file descriptor. 
+ */ +SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, + unsigned int, attr_flags) +{ + struct mnt_namespace *ns; + struct fs_context *fc; + struct file *file; + struct path newmount; + struct mount *mnt; + struct fd f; + unsigned int mnt_flags = 0; + long ret; + + if (!may_mount()) + return -EPERM; + + if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) + return -EINVAL; + + if (attr_flags & ~(MOUNT_ATTR_RDONLY | + MOUNT_ATTR_NOSUID | + MOUNT_ATTR_NODEV | + MOUNT_ATTR_NOEXEC | + MOUNT_ATTR__ATIME | + MOUNT_ATTR_NODIRATIME)) + return -EINVAL; + + if (attr_flags & MOUNT_ATTR_RDONLY) + mnt_flags |= MNT_READONLY; + if (attr_flags & MOUNT_ATTR_NOSUID) + mnt_flags |= MNT_NOSUID; + if (attr_flags & MOUNT_ATTR_NODEV) + mnt_flags |= MNT_NODEV; + if (attr_flags & MOUNT_ATTR_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (attr_flags & MOUNT_ATTR_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + + switch (attr_flags & MOUNT_ATTR__ATIME) { + case MOUNT_ATTR_STRICTATIME: + break; + case MOUNT_ATTR_NOATIME: + mnt_flags |= MNT_NOATIME; + break; + case MOUNT_ATTR_RELATIME: + mnt_flags |= MNT_RELATIME; + break; + default: + return -EINVAL; + } + + f = fdget(fs_fd); + if (!f.file) + return -EBADF; + + ret = -EINVAL; + if (f.file->f_op != &fscontext_fops) + goto err_fsfd; + + fc = f.file->private_data; + + ret = mutex_lock_interruptible(&fc->uapi_mutex); + if (ret < 0) + goto err_fsfd; + + /* There must be a valid superblock or we can't mount it */ + ret = -EINVAL; + if (!fc->root) + goto err_unlock; + + ret = -EPERM; + if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { + pr_warn("VFS: Mount too revealing\n"); + goto err_unlock; + } + + ret = -EBUSY; + if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) + goto err_unlock; + + ret = -EPERM; + if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock()) + goto err_unlock; + + newmount.mnt = vfs_create_mount(fc); + if (IS_ERR(newmount.mnt)) { + ret = PTR_ERR(newmount.mnt); + goto err_unlock; + } + newmount.dentry = dget(fc->root); + newmount.mnt->mnt_flags = mnt_flags; + + /* We've done the mount bit - now move the file context into more or + * less the same state as if we'd done an fspick(). We don't want to + * do any memory allocation or anything like that at this point as we + * don't want to have to handle any errors incurred. + */ + vfs_clean_context(fc); + + ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); + if (IS_ERR(ns)) { + ret = PTR_ERR(ns); + goto err_path; + } + mnt = real_mount(newmount.mnt); + mnt->mnt_ns = ns; + ns->root = mnt; + ns->mounts = 1; + list_add(&mnt->mnt_list, &ns->list); + + /* Attach to an apparent O_PATH fd with a note that we need to unmount + * it, not just simply put it. + */ + file = dentry_open(&newmount, O_PATH, fc->cred); + if (IS_ERR(file)) { + dissolve_on_fput(newmount.mnt); + ret = PTR_ERR(file); + goto err_path; + } + file->f_mode |= FMODE_NEED_UNMOUNT; + + ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0); + if (ret >= 0) + fd_install(ret, file); + else + fput(file); + +err_path: + path_put(&newmount); +err_unlock: + mutex_unlock(&fc->uapi_mutex); +err_fsfd: + fdput(f); + return ret; +} + +/* + * Move a mount from one place to another. In combination with + * fsopen()/fsmount() this is used to install a new mount and in combination + * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy + * a mount subtree. * * Note the flags value is a combination of MOVE_MOUNT_* flags. 
 */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 925f9dfc356b..0e697f595278 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -992,6 +992,7 @@ asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
 asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
 asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key,
 			     const void __user *value, int aux);
+asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags);
 asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
 				       siginfo_t __user *info,
 				       unsigned int flags);
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 4b90ba9d1770..3888d3b91dc5 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -91,4 +91,22 @@ enum fsconfig_command {
 	FSCONFIG_CMD_RECONFIGURE	= 7,	/* Invoke superblock reconfiguration */
 };
 
+/*
+ * fsmount() flags.
+ */
+#define FSMOUNT_CLOEXEC		0x00000001
+
+/*
+ * Mount attributes.
+ */
+#define MOUNT_ATTR_RDONLY	0x00000001 /* Mount read-only */
+#define MOUNT_ATTR_NOSUID	0x00000002 /* Ignore suid and sgid bits */
+#define MOUNT_ATTR_NODEV	0x00000004 /* Disallow access to device special files */
+#define MOUNT_ATTR_NOEXEC	0x00000008 /* Disallow program execution */
+#define MOUNT_ATTR__ATIME	0x00000070 /* Setting on how atime should be updated */
+#define MOUNT_ATTR_RELATIME	0x00000000 /* - Update atime relative to mtime/ctime. */
+#define MOUNT_ATTR_NOATIME	0x00000010 /* - Do not update access times. */
+#define MOUNT_ATTR_STRICTATIME	0x00000020 /* - Always perform atime updates */
+#define MOUNT_ATTR_NODIRATIME	0x00000080 /* Do not update directory access times */
+
 #endif /* _UAPI_LINUX_MOUNT_H */
-- 
cgit v1.2.3

From cf3cba4a429be43e5527a3f78859b1bfd9ebc5fb Mon Sep 17 00:00:00 2001
From: David Howells
Date: Thu, 1 Nov 2018 23:36:23 +0000
Subject: vfs: syscall: Add fspick() to select a superblock for reconfiguration

Provide an fspick() system call that can be used to pick an existing
mountpoint into an fs_context which can thereafter be used to reconfigure a
superblock (equivalent of the superblock side of -o remount).

This looks like:

	int fd = fspick(AT_FDCWD, "/mnt",
			FSPICK_CLOEXEC | FSPICK_NO_AUTOMOUNT);
	fsconfig(fd, FSCONFIG_SET_FLAG, "intr", NULL, 0);
	fsconfig(fd, FSCONFIG_SET_FLAG, "noac", NULL, 0);
	fsconfig(fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0);

At the point of fspick being called, the file descriptor referring to the
filesystem context is in exactly the same state as the one that was created
by fsopen() after fsmount() has been successfully called.
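Spelled out as a complete program (a hypothetical editor's sketch using the
raw x86-64 syscall numbers from the tables in this series, since there are
no libc wrappers yet), the same reconfiguration might look like the
following; on a configuration error an explanation may be retrievable by
calling read() on the context fd, as with fsopen():

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	/* Values from the uapi headers added by this series. */
	#define FSPICK_CLOEXEC			0x00000001
	#define FSPICK_NO_AUTOMOUNT		0x00000004
	#define FSCONFIG_SET_FLAG		0
	#define FSCONFIG_CMD_RECONFIGURE	7

	int main(void)
	{
		char buf[256];
		ssize_t n;
		int fd = syscall(340 /* fspick */, AT_FDCWD, "/mnt",
				 FSPICK_CLOEXEC | FSPICK_NO_AUTOMOUNT);

		if (fd < 0) {
			perror("fspick");
			return 1;
		}

		if (syscall(338 /* fsconfig */, fd, FSCONFIG_SET_FLAG,
			    "intr", NULL, 0) < 0 ||
		    syscall(338, fd, FSCONFIG_SET_FLAG, "noac", NULL, 0) < 0 ||
		    syscall(338, fd, FSCONFIG_CMD_RECONFIGURE,
			    NULL, NULL, 0) < 0) {
			/* An error message may be queued on the context fd. */
			n = read(fd, buf, sizeof(buf) - 1);
			if (n > 0) {
				buf[n] = '\0';
				fprintf(stderr, "%s\n", buf);
			}
			return 1;
		}
		return 0;
	}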
Signed-off-by: David Howells cc: linux-api@vger.kernel.org Signed-off-by: Al Viro --- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/fsopen.c | 57 ++++++++++++++++++++++++++++++++++ include/linux/syscalls.h | 1 + include/uapi/linux/mount.h | 8 +++++ 5 files changed, 68 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 5b5c9189c507..4cd5f982b1e5 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -403,7 +403,7 @@ 389 i386 fsopen sys_fsopen __ia32_sys_fsopen 390 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig 391 i386 fsmount sys_fsmount __ia32_sys_fsmount -# don't use number 392, add new calls at the end +392 i386 fspick sys_fspick __ia32_sys_fspick 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 984ad594bb2b..64ca0d06259a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -348,6 +348,7 @@ 337 common fsopen __x64_sys_fsopen 338 common fsconfig __x64_sys_fsconfig 339 common fsmount __x64_sys_fsmount +340 common fspick __x64_sys_fspick # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/fsopen.c b/fs/fsopen.c index 65cc2f68f994..3bb9c0c8cbcc 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -156,6 +156,63 @@ err_fc: return ret; } +/* + * Pick a superblock into a context for reconfiguration. + */ +SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags) +{ + struct fs_context *fc; + struct path target; + unsigned int lookup_flags; + int ret; + + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + if ((flags & ~(FSPICK_CLOEXEC | + FSPICK_SYMLINK_NOFOLLOW | + FSPICK_NO_AUTOMOUNT | + FSPICK_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + if (flags & FSPICK_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & FSPICK_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & FSPICK_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + ret = user_path_at(dfd, path, lookup_flags, &target); + if (ret < 0) + goto err; + + ret = -EINVAL; + if (target.mnt->mnt_root != target.dentry) + goto err_path; + + fc = fs_context_for_reconfigure(target.dentry, 0, 0); + if (IS_ERR(fc)) { + ret = PTR_ERR(fc); + goto err_path; + } + + fc->phase = FS_CONTEXT_RECONF_PARAMS; + + ret = fscontext_alloc_log(fc); + if (ret < 0) + goto err_fc; + + path_put(&target); + return fscontext_create_fd(fc, flags & FSPICK_CLOEXEC ? O_CLOEXEC : 0); + +err_fc: + put_fs_context(fc); +err_path: + path_put(&target); +err: + return ret; +} + /* * Check the state and apply the configuration. Note that this function is * allowed to 'steal' the value by setting param->xxx to NULL before returning. 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0e697f595278..e2870fe1be5b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -993,6 +993,7 @@ asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags); asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key, const void __user *value, int aux); asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags); +asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 3888d3b91dc5..96a0240f23fe 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -77,6 +77,14 @@ */ #define FSOPEN_CLOEXEC 0x00000001 +/* + * fspick() flags. + */ +#define FSPICK_CLOEXEC 0x00000001 +#define FSPICK_SYMLINK_NOFOLLOW 0x00000002 +#define FSPICK_NO_AUTOMOUNT 0x00000004 +#define FSPICK_EMPTY_PATH 0x00000008 + /* * The type of fsconfig() call made. */ -- cgit v1.2.3 From 16add411645cff83360086e102daa67b25f1e39a Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:30:18 +0300 Subject: syscall_get_arch: add "struct task_struct *" argument This argument is required to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request: syscall_get_arch() is going to be called from ptrace_request() along with syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions with a tracee as their argument. The primary intent is that the triple (audit_arch, syscall_nr, arg1..arg6) should describe what system call is being called and what its arguments are. Reverts: 5e937a9ae913 ("syscall_get_arch: remove useless function arguments") Reverts: 1002d94d3076 ("syscall.h: fix doc text for syscall_get_arch()") Reviewed-by: Andy Lutomirski # for x86 Reviewed-by: Palmer Dabbelt Acked-by: Paul Moore Acked-by: Paul Burton # MIPS parts Acked-by: Michael Ellerman (powerpc) Acked-by: Kees Cook # seccomp parts Acked-by: Mark Salter # for the c6x bit Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: x86@kernel.org Cc: linux-alpha@vger.kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: uclinux-h8-devel@lists.sourceforge.jp Cc: linux-hexagon@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-mips@vger.kernel.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-riscv@lists.infradead.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linux-arch@vger.kernel.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. 
Levin Signed-off-by: Paul Moore --- arch/alpha/include/asm/syscall.h | 2 +- arch/arc/include/asm/syscall.h | 2 +- arch/arm/include/asm/syscall.h | 2 +- arch/arm64/include/asm/syscall.h | 4 ++-- arch/c6x/include/asm/syscall.h | 2 +- arch/csky/include/asm/syscall.h | 2 +- arch/h8300/include/asm/syscall.h | 2 +- arch/hexagon/include/asm/syscall.h | 2 +- arch/ia64/include/asm/syscall.h | 2 +- arch/m68k/include/asm/syscall.h | 2 +- arch/microblaze/include/asm/syscall.h | 2 +- arch/mips/include/asm/syscall.h | 6 +++--- arch/mips/kernel/ptrace.c | 2 +- arch/nds32/include/asm/syscall.h | 2 +- arch/nios2/include/asm/syscall.h | 2 +- arch/openrisc/include/asm/syscall.h | 2 +- arch/parisc/include/asm/syscall.h | 4 ++-- arch/powerpc/include/asm/syscall.h | 10 ++++++++-- arch/riscv/include/asm/syscall.h | 2 +- arch/s390/include/asm/syscall.h | 4 ++-- arch/sh/include/asm/syscall_32.h | 2 +- arch/sh/include/asm/syscall_64.h | 2 +- arch/sparc/include/asm/syscall.h | 5 +++-- arch/unicore32/include/asm/syscall.h | 2 +- arch/x86/include/asm/syscall.h | 8 +++++--- arch/x86/um/asm/syscall.h | 2 +- arch/xtensa/include/asm/syscall.h | 2 +- include/asm-generic/syscall.h | 5 +++-- kernel/auditsc.c | 4 ++-- kernel/seccomp.c | 4 ++-- 30 files changed, 52 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/syscall.h b/arch/alpha/include/asm/syscall.h index d73a6fcb519c..11c688c1d7ec 100644 --- a/arch/alpha/include/asm/syscall.h +++ b/arch/alpha/include/asm/syscall.h @@ -4,7 +4,7 @@ #include -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_ALPHA; } diff --git a/arch/arc/include/asm/syscall.h b/arch/arc/include/asm/syscall.h index c7fc4c0c3bcb..caf2697ef5b7 100644 --- a/arch/arc/include/asm/syscall.h +++ b/arch/arc/include/asm/syscall.h @@ -70,7 +70,7 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, } static inline int -syscall_get_arch(void) +syscall_get_arch(struct task_struct *task) { return IS_ENABLED(CONFIG_ISA_ARCOMPACT) ? (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index 06dea6bce293..3940ceac0bdc 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -104,7 +104,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->ARM_r0 + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { /* ARM tasks don't change audit architectures on the fly. */ return AUDIT_ARCH_ARM; diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index ad8be16a39c9..1870df03f774 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h @@ -117,9 +117,9 @@ static inline void syscall_set_arguments(struct task_struct *task, * We don't care about endianness (__AUDIT_ARCH_LE bit) here because * AArch64 has the same system calls both on little- and big- endian. 
*/ -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { - if (is_compat_task()) + if (is_compat_thread(task_thread_info(task))) return AUDIT_ARCH_ARM; return AUDIT_ARCH_AARCH64; diff --git a/arch/c6x/include/asm/syscall.h b/arch/c6x/include/asm/syscall.h index 39dbd1ef994c..595057191c9c 100644 --- a/arch/c6x/include/asm/syscall.h +++ b/arch/c6x/include/asm/syscall.h @@ -121,7 +121,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) ? AUDIT_ARCH_C6XBE : AUDIT_ARCH_C6X; diff --git a/arch/csky/include/asm/syscall.h b/arch/csky/include/asm/syscall.h index d637445737b7..150ffb894fa2 100644 --- a/arch/csky/include/asm/syscall.h +++ b/arch/csky/include/asm/syscall.h @@ -70,7 +70,7 @@ syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, } static inline int -syscall_get_arch(void) +syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_CSKY; } diff --git a/arch/h8300/include/asm/syscall.h b/arch/h8300/include/asm/syscall.h index 5135910616e2..d316c3d40d4e 100644 --- a/arch/h8300/include/asm/syscall.h +++ b/arch/h8300/include/asm/syscall.h @@ -49,7 +49,7 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, } static inline int -syscall_get_arch(void) +syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_H8300; } diff --git a/arch/hexagon/include/asm/syscall.h b/arch/hexagon/include/asm/syscall.h index de3917aad3fd..47b0bc3f16be 100644 --- a/arch/hexagon/include/asm/syscall.h +++ b/arch/hexagon/include/asm/syscall.h @@ -46,7 +46,7 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, &(®s->r00)[i], n * sizeof(args[0])); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_HEXAGON; } diff --git a/arch/ia64/include/asm/syscall.h b/arch/ia64/include/asm/syscall.h index 1d0b875fec44..47ab33f5448a 100644 --- a/arch/ia64/include/asm/syscall.h +++ b/arch/ia64/include/asm/syscall.h @@ -81,7 +81,7 @@ static inline void syscall_set_arguments(struct task_struct *task, ia64_syscall_get_set_arguments(task, regs, i, n, args, 1); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_IA64; } diff --git a/arch/m68k/include/asm/syscall.h b/arch/m68k/include/asm/syscall.h index d4d7deda8d50..465ac039be09 100644 --- a/arch/m68k/include/asm/syscall.h +++ b/arch/m68k/include/asm/syscall.h @@ -4,7 +4,7 @@ #include -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_M68K; } diff --git a/arch/microblaze/include/asm/syscall.h b/arch/microblaze/include/asm/syscall.h index 220decd605a4..77a86fafa974 100644 --- a/arch/microblaze/include/asm/syscall.h +++ b/arch/microblaze/include/asm/syscall.h @@ -101,7 +101,7 @@ static inline void syscall_set_arguments(struct task_struct *task, asmlinkage unsigned long do_syscall_trace_enter(struct pt_regs *regs); asmlinkage void do_syscall_trace_leave(struct pt_regs *regs); -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_MICROBLAZE; } diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index 6cf8ffb5367e..6a22c9352ef6 100644 --- a/arch/mips/include/asm/syscall.h +++ 
b/arch/mips/include/asm/syscall.h @@ -140,14 +140,14 @@ extern const unsigned long sys_call_table[]; extern const unsigned long sys32_call_table[]; extern const unsigned long sysn32_call_table[]; -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { int arch = AUDIT_ARCH_MIPS; #ifdef CONFIG_64BIT - if (!test_thread_flag(TIF_32BIT_REGS)) { + if (!test_tsk_thread_flag(task, TIF_32BIT_REGS)) { arch |= __AUDIT_ARCH_64BIT; /* N32 sets only TIF_32BIT_ADDR */ - if (test_thread_flag(TIF_32BIT_ADDR)) + if (test_tsk_thread_flag(task, TIF_32BIT_ADDR)) arch |= __AUDIT_ARCH_CONVENTION_MIPS64_N32; } #endif diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 0057c910bc2f..2ead6ff919b7 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -1418,7 +1418,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) unsigned long args[6]; sd.nr = syscall; - sd.arch = syscall_get_arch(); + sd.arch = syscall_get_arch(current); syscall_get_arguments(current, regs, 0, 6, args); for (i = 0; i < 6; i++) sd.args[i] = args[i]; diff --git a/arch/nds32/include/asm/syscall.h b/arch/nds32/include/asm/syscall.h index cc56a3962f8b..7501e376a6b1 100644 --- a/arch/nds32/include/asm/syscall.h +++ b/arch/nds32/include/asm/syscall.h @@ -188,7 +188,7 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, } static inline int -syscall_get_arch(void) +syscall_get_arch(struct task_struct *task) { return IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) ? AUDIT_ARCH_NDS32BE : AUDIT_ARCH_NDS32; diff --git a/arch/nios2/include/asm/syscall.h b/arch/nios2/include/asm/syscall.h index cf35e210fc4d..f0f6ae208e78 100644 --- a/arch/nios2/include/asm/syscall.h +++ b/arch/nios2/include/asm/syscall.h @@ -136,7 +136,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_NIOS2; } diff --git a/arch/openrisc/include/asm/syscall.h b/arch/openrisc/include/asm/syscall.h index 2db9f1cf0694..46b10c674bd2 100644 --- a/arch/openrisc/include/asm/syscall.h +++ b/arch/openrisc/include/asm/syscall.h @@ -72,7 +72,7 @@ syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, memcpy(®s->gpr[3 + i], args, n * sizeof(args[0])); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_OPENRISC; } diff --git a/arch/parisc/include/asm/syscall.h b/arch/parisc/include/asm/syscall.h index 8bff1a58c97f..c04ffc6ac928 100644 --- a/arch/parisc/include/asm/syscall.h +++ b/arch/parisc/include/asm/syscall.h @@ -62,11 +62,11 @@ static inline void syscall_rollback(struct task_struct *task, /* do nothing */ } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { int arch = AUDIT_ARCH_PARISC; #ifdef CONFIG_64BIT - if (!is_compat_task()) + if (!__is_compat_task(task)) arch = AUDIT_ARCH_PARISC64; #endif return arch; diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 1a0e7a8b1c81..efb50429c9f4 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -99,9 +99,15 @@ static inline void syscall_set_arguments(struct task_struct *task, regs->orig_gpr3 = args[0]; } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { - int arch = is_32bit_task() ? 
AUDIT_ARCH_PPC : AUDIT_ARCH_PPC64; + int arch; + + if (IS_ENABLED(CONFIG_PPC64) && !test_tsk_thread_flag(task, TIF_32BIT)) + arch = AUDIT_ARCH_PPC64; + else + arch = AUDIT_ARCH_PPC; + #ifdef __LITTLE_ENDIAN__ arch |= __AUDIT_ARCH_LE; #endif diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index bba3da6ef157..ca120a36a037 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -100,7 +100,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->a1 + i * sizeof(regs->a1), args, n * sizeof(regs->a0)); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_64BIT return AUDIT_ARCH_RISCV64; diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 96f9a9151fde..5a40ea8b90ea 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -92,10 +92,10 @@ static inline void syscall_set_arguments(struct task_struct *task, regs->orig_gpr2 = args[0]; } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(current, TIF_31BIT)) + if (test_tsk_thread_flag(task, TIF_31BIT)) return AUDIT_ARCH_S390; #endif return AUDIT_ARCH_S390X; diff --git a/arch/sh/include/asm/syscall_32.h b/arch/sh/include/asm/syscall_32.h index 6e118799831c..08de429eccd4 100644 --- a/arch/sh/include/asm/syscall_32.h +++ b/arch/sh/include/asm/syscall_32.h @@ -95,7 +95,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { int arch = AUDIT_ARCH_SH; diff --git a/arch/sh/include/asm/syscall_64.h b/arch/sh/include/asm/syscall_64.h index 43882580c7f9..9b62a2404531 100644 --- a/arch/sh/include/asm/syscall_64.h +++ b/arch/sh/include/asm/syscall_64.h @@ -63,7 +63,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->regs[2 + i], args, n * sizeof(args[0])); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { int arch = AUDIT_ARCH_SH; diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h index 053989e3f6a6..9ffb367c17fd 100644 --- a/arch/sparc/include/asm/syscall.h +++ b/arch/sparc/include/asm/syscall.h @@ -128,10 +128,11 @@ static inline void syscall_set_arguments(struct task_struct *task, regs->u_regs[UREG_I0 + i + j] = args[j]; } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { #if defined(CONFIG_SPARC64) && defined(CONFIG_COMPAT) - return in_compat_syscall() ? AUDIT_ARCH_SPARC : AUDIT_ARCH_SPARC64; + return test_tsk_thread_flag(task, TIF_32BIT) + ? 
AUDIT_ARCH_SPARC : AUDIT_ARCH_SPARC64; #elif defined(CONFIG_SPARC64) return AUDIT_ARCH_SPARC64; #else diff --git a/arch/unicore32/include/asm/syscall.h b/arch/unicore32/include/asm/syscall.h index 3a6b885476b4..607961797fff 100644 --- a/arch/unicore32/include/asm/syscall.h +++ b/arch/unicore32/include/asm/syscall.h @@ -4,7 +4,7 @@ #include -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_UNICORE; } diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d653139857af..435f3f09279c 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -107,7 +107,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->bx + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_I386; } @@ -236,10 +236,12 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ - return in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + return (IS_ENABLED(CONFIG_IA32_EMULATION) && + task->thread_info.status & TS_COMPAT) + ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h index ef898af102d1..56a2f0913e3c 100644 --- a/arch/x86/um/asm/syscall.h +++ b/arch/x86/um/asm/syscall.h @@ -9,7 +9,7 @@ typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_X86_32 return AUDIT_ARCH_I386; diff --git a/arch/xtensa/include/asm/syscall.h b/arch/xtensa/include/asm/syscall.h index a168bf81c7f4..0681ca656809 100644 --- a/arch/xtensa/include/asm/syscall.h +++ b/arch/xtensa/include/asm/syscall.h @@ -14,7 +14,7 @@ #include #include -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_XTENSA; } diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 0c938a4354f6..e0d060b43321 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -144,14 +144,15 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, /** * syscall_get_arch - return the AUDIT_ARCH for the current system call + * @task: task of interest, must be blocked * * Returns the AUDIT_ARCH_* based on the system call convention in use. * - * It's only valid to call this when current is stopped on entry to a system + * It's only valid to call this when @task is stopped on entry to a system * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %TIF_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. 
*/ -int syscall_get_arch(void); +int syscall_get_arch(struct task_struct *task); #endif /* _ASM_SYSCALL_H */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 17b0007fafc2..98a98e6dca05 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1636,7 +1636,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; } - context->arch = syscall_get_arch(); + context->arch = syscall_get_arch(current); context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -2590,7 +2590,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, + signr, syscall_get_arch(current), syscall, in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..36f36ab00f48 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -148,7 +148,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); - sd->arch = syscall_get_arch(); + sd->arch = syscall_get_arch(task); syscall_get_arguments(task, regs, 0, 6, args); sd->args[0] = args[0]; sd->args[1] = args[1]; @@ -591,7 +591,7 @@ static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); info->si_errno = reason; - info->si_arch = syscall_get_arch(); + info->si_arch = syscall_get_arch(current); info->si_syscall = syscall; } -- cgit v1.2.3 From 8b56d3488d87555c77494b1fb90dbea84ff0e9fc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:51 -0700 Subject: crypto: x86/aesni - convert to use skcipher SIMD bulk registration Convert the AES-NI glue code to use simd_register_skciphers_compat() to create SIMD wrappers for all the internal skcipher algorithms at once, rather than wrapping each one individually. This simplifies the code. 
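The resulting pattern, as a minimal sketch (a hypothetical module with made-up
names; the algorithm definitions themselves are elided, and the header paths
are as assumed here), reduces to a pair of bulk calls:

	#include <linux/module.h>
	#include <crypto/internal/simd.h>
	#include <crypto/skcipher.h>

	/* Internal-only algorithms, flagged CRYPTO_ALG_INTERNAL and named
	 * with a "__" prefix, e.g. "__cbc(aes)". */
	static struct skcipher_alg my_skciphers[] = {
		/* ... alg definitions elided ... */
	};
	static struct simd_skcipher_alg *my_simd_skciphers[ARRAY_SIZE(my_skciphers)];

	static int __init my_mod_init(void)
	{
		/* Creates and registers one SIMD wrapper per array entry. */
		return simd_register_skciphers_compat(my_skciphers,
						      ARRAY_SIZE(my_skciphers),
						      my_simd_skciphers);
	}

	static void __exit my_mod_exit(void)
	{
		simd_unregister_skciphers(my_skciphers,
					  ARRAY_SIZE(my_skciphers),
					  my_simd_skciphers);
	}

	module_init(my_mod_init);
	module_exit(my_mod_exit);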
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 43 +++++++------------------------------- 1 file changed, 7 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 1e3d2102033a..451918cc5f73 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1253,23 +1253,9 @@ static const struct x86_cpu_id aesni_cpu_id[] = { }; MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); -static void aesni_free_simds(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) && - aesni_simd_skciphers[i]; i++) - simd_skcipher_free(aesni_simd_skciphers[i]); -} - static int __init aesni_init(void) { - struct simd_skcipher_alg *simd; - const char *basename; - const char *algname; - const char *drvname; int err; - int i; if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; @@ -1304,8 +1290,9 @@ static int __init aesni_init(void) if (err) return err; - err = crypto_register_skciphers(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers)); + err = simd_register_skciphers_compat(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); if (err) goto unregister_algs; @@ -1314,26 +1301,11 @@ static int __init aesni_init(void) if (err) goto unregister_skciphers; - for (i = 0; i < ARRAY_SIZE(aesni_skciphers); i++) { - algname = aesni_skciphers[i].base.cra_name + 2; - drvname = aesni_skciphers[i].base.cra_driver_name + 2; - basename = aesni_skciphers[i].base.cra_driver_name; - simd = simd_skcipher_create_compat(algname, drvname, basename); - err = PTR_ERR(simd); - if (IS_ERR(simd)) - goto unregister_simds; - - aesni_simd_skciphers[i] = simd; - } - return 0; -unregister_simds: - aesni_free_simds(); - crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); unregister_skciphers: - crypto_unregister_skciphers(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers)); + simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); unregister_algs: crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); return err; @@ -1341,10 +1313,9 @@ unregister_algs: static void __exit aesni_exit(void) { - aesni_free_simds(); crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); - crypto_unregister_skciphers(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers)); + simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); } -- cgit v1.2.3 From 149e12252fb38937d3efa4419be1a87884ab1427 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:52 -0700 Subject: crypto: x86/aesni - convert to use AEAD SIMD helpers Convert the AES-NI implementations of "gcm(aes)" and "rfc4106(gcm(aes))" to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. 
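In sketch form (hypothetical names throughout; handler bodies and the context
struct are elided), the convention the AEAD SIMD helpers expect is an
internal-only inner algorithm plus one compat registration call:

	/* Inner algorithm: CRYPTO_ALG_INTERNAL, "__"-prefixed names.  The
	 * compat helper strips the prefix and registers a public
	 * "gcm(aes)" / "my-gcm-aesni" wrapper that defers to cryptd when
	 * the SIMD unit cannot be used directly. */
	static struct aead_alg my_gcm_alg = {
		.setkey		= my_setkey,	/* hypothetical handlers */
		.setauthsize	= my_setauthsize,
		.encrypt	= my_encrypt,
		.decrypt	= my_decrypt,
		.ivsize		= 12,
		.maxauthsize	= 16,
		.base = {
			.cra_name		= "__gcm(aes)",
			.cra_driver_name	= "__my-gcm-aesni",
			.cra_priority		= 400,
			.cra_flags		= CRYPTO_ALG_INTERNAL,
			.cra_blocksize		= 1,
			.cra_ctxsize		= sizeof(struct my_gcm_ctx),
			.cra_module		= THIS_MODULE,
		},
	};
	static struct simd_aead_alg *my_simd_alg;

	static int __init my_mod_init(void)
	{
		return simd_register_aeads_compat(&my_gcm_alg, 1, &my_simd_alg);
	}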
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 161 ++++--------------------------------- 1 file changed, 15 insertions(+), 146 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 451918cc5f73..1466d59384ac 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -643,29 +642,6 @@ static int xts_decrypt(struct skcipher_request *req) aes_ctx(ctx->raw_crypt_ctx)); } -static int rfc4106_init(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void rfc4106_exit(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { @@ -710,15 +686,8 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); } -static int gcmaes_wrapper_set_key(struct crypto_aead *parent, const u8 *key, - unsigned int key_len) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(parent); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, key_len); -} - +/* This is the Integrity Check Value (aka the authentication tag) length and can + * be 8, 12 or 16 bytes long. */ static int common_rfc4106_set_authsize(struct crypto_aead *aead, unsigned int authsize) { @@ -734,17 +703,6 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead, return 0; } -/* This is the Integrity Check Value (aka the authentication tag length and can - * be 8, 12 or 16 bytes long. 
*/ -static int gcmaes_wrapper_set_authsize(struct crypto_aead *parent, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(parent); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, unsigned int authsize) { @@ -964,38 +922,6 @@ static int helper_rfc4106_decrypt(struct aead_request *req) return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, aes_ctx); } - -static int gcmaes_wrapper_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(tfm); - struct cryptd_aead *cryptd_tfm = *ctx; - - tfm = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - tfm = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, tfm); - - return crypto_aead_encrypt(req); -} - -static int gcmaes_wrapper_decrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(tfm); - struct cryptd_aead *cryptd_tfm = *ctx; - - tfm = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - tfm = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, tfm); - - return crypto_aead_decrypt(req); -} #endif static struct crypto_alg aesni_algs[] = { { @@ -1148,31 +1074,7 @@ static int generic_gcmaes_decrypt(struct aead_request *req) aes_ctx); } -static int generic_gcmaes_init(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__driver-generic-gcm-aes-aesni", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - - return 0; -} - -static void generic_gcmaes_exit(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg aesni_aead_algs[] = { { +static struct aead_alg aesni_aeads[] = { { .setkey = common_rfc4106_set_key, .setauthsize = common_rfc4106_set_authsize, .encrypt = helper_rfc4106_encrypt, @@ -1180,32 +1082,15 @@ static struct aead_alg aesni_aead_algs[] = { { .ivsize = GCM_RFC4106_IV_SIZE, .maxauthsize = 16, .base = { - .cra_name = "__gcm-aes-aesni", - .cra_driver_name = "__driver-gcm-aes-aesni", + .cra_name = "__rfc4106(gcm(aes))", + .cra_driver_name = "__rfc4106-gcm-aesni", + .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), .cra_alignmask = AESNI_ALIGN - 1, .cra_module = THIS_MODULE, }, -}, { - .init = rfc4106_init, - .exit = rfc4106_exit, - .setkey = gcmaes_wrapper_set_key, - .setauthsize = gcmaes_wrapper_set_authsize, - .encrypt = gcmaes_wrapper_encrypt, - .decrypt = gcmaes_wrapper_decrypt, - .ivsize = GCM_RFC4106_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "rfc4106(gcm(aes))", - .cra_driver_name = "rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_module = THIS_MODULE, - }, }, { .setkey = generic_gcmaes_set_key, .setauthsize = generic_gcmaes_set_authsize, @@ -1214,38 +1099,21 @@ static struct aead_alg aesni_aead_algs[] = { { .ivsize = GCM_AES_IV_SIZE, .maxauthsize = 16, .base = { - .cra_name = 
"__generic-gcm-aes-aesni", - .cra_driver_name = "__driver-generic-gcm-aes-aesni", - .cra_priority = 0, + .cra_name = "__gcm(aes)", + .cra_driver_name = "__generic-gcm-aesni", + .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), .cra_alignmask = AESNI_ALIGN - 1, .cra_module = THIS_MODULE, }, -}, { - .init = generic_gcmaes_init, - .exit = generic_gcmaes_exit, - .setkey = gcmaes_wrapper_set_key, - .setauthsize = gcmaes_wrapper_set_authsize, - .encrypt = gcmaes_wrapper_encrypt, - .decrypt = gcmaes_wrapper_decrypt, - .ivsize = GCM_AES_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "gcm(aes)", - .cra_driver_name = "generic-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_module = THIS_MODULE, - }, } }; #else -static struct aead_alg aesni_aead_algs[0]; +static struct aead_alg aesni_aeads[0]; #endif +static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; static const struct x86_cpu_id aesni_cpu_id[] = { X86_FEATURE_MATCH(X86_FEATURE_AES), @@ -1296,8 +1164,8 @@ static int __init aesni_init(void) if (err) goto unregister_algs; - err = crypto_register_aeads(aesni_aead_algs, - ARRAY_SIZE(aesni_aead_algs)); + err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); if (err) goto unregister_skciphers; @@ -1313,7 +1181,8 @@ unregister_algs: static void __exit aesni_exit(void) { - crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); + simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); -- cgit v1.2.3 From de272ca72c6152e26b9799d21eb511aac03b6e2d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:53 -0700 Subject: crypto: x86/aegis128 - convert to use AEAD SIMD helpers Convert the x86 implementation of AEGIS-128 to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-glue.c | 157 +++++++--------------------------- crypto/Kconfig | 2 +- 2 files changed, 31 insertions(+), 128 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 3ea71b871813..bdeee1b830be 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -11,8 +11,8 @@ * any later version. 
*/ -#include #include +#include #include #include #include @@ -242,131 +242,35 @@ static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) { } -static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} - -static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - -static int cryptd_aegis128_aesni_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} - -static int cryptd_aegis128_aesni_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} - -static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg crypto_aegis128_aesni_alg[] = { - { - .setkey = crypto_aegis128_aesni_setkey, - .setauthsize = crypto_aegis128_aesni_setauthsize, - .encrypt = crypto_aegis128_aesni_encrypt, - .decrypt = crypto_aegis128_aesni_decrypt, - .init = crypto_aegis128_aesni_init_tfm, - .exit = crypto_aegis128_aesni_exit_tfm, - - .ivsize = AEGIS128_NONCE_SIZE, - .maxauthsize = AEGIS128_MAX_AUTH_SIZE, - .chunksize = AEGIS128_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - - .cra_name = "__aegis128", - .cra_driver_name = "__aegis128-aesni", - - .cra_module = THIS_MODULE, - } - }, { - .setkey = cryptd_aegis128_aesni_setkey, - .setauthsize = cryptd_aegis128_aesni_setauthsize, - .encrypt = cryptd_aegis128_aesni_encrypt, - .decrypt = cryptd_aegis128_aesni_decrypt, - .init = cryptd_aegis128_aesni_init_tfm, - .exit = cryptd_aegis128_aesni_exit_tfm, - - .ivsize = AEGIS128_NONCE_SIZE, - .maxauthsize = AEGIS128_MAX_AUTH_SIZE, - .chunksize = AEGIS128_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_alignmask = 0, - - .cra_priority = 400, - - .cra_name = "aegis128", - .cra_driver_name = "aegis128-aesni", - - .cra_module = THIS_MODULE, - } +static struct aead_alg 
crypto_aegis128_aesni_alg = { + .setkey = crypto_aegis128_aesni_setkey, + .setauthsize = crypto_aegis128_aesni_setauthsize, + .encrypt = crypto_aegis128_aesni_encrypt, + .decrypt = crypto_aegis128_aesni_decrypt, + .init = crypto_aegis128_aesni_init_tfm, + .exit = crypto_aegis128_aesni_exit_tfm, + + .ivsize = AEGIS128_NONCE_SIZE, + .maxauthsize = AEGIS128_MAX_AUTH_SIZE, + .chunksize = AEGIS128_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + .cra_priority = 400, + + .cra_name = "__aegis128", + .cra_driver_name = "__aegis128-aesni", + + .cra_module = THIS_MODULE, } }; +static struct simd_aead_alg *simd_alg; + static int __init crypto_aegis128_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || @@ -374,14 +278,13 @@ static int __init crypto_aegis128_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_aegis128_aesni_alg, - ARRAY_SIZE(crypto_aegis128_aesni_alg)); + return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1, + &simd_alg); } static void __exit crypto_aegis128_aesni_module_exit(void) { - crypto_unregister_aeads(crypto_aegis128_aesni_alg, - ARRAY_SIZE(crypto_aegis128_aesni_alg)); + simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg); } module_init(crypto_aegis128_aesni_module_init); diff --git a/crypto/Kconfig b/crypto/Kconfig index bbab6bf33519..5b2c4cd7923f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -310,7 +310,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2 tristate "AEGIS-128 AEAD algorithm (x86_64 AESNI+SSE2 implementation)" depends on X86 && 64BIT select CRYPTO_AEAD - select CRYPTO_CRYPTD + select CRYPTO_SIMD help AESNI+SSE2 implementation of the AEGSI-128 dedicated AEAD algorithm. -- cgit v1.2.3 From d628132a5e3d0183ac59a3ebbd3a9dff8b20ac7e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:54 -0700 Subject: crypto: x86/aegis128l - convert to use AEAD SIMD helpers Convert the x86 implementation of AEGIS-128L to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128l-aesni-glue.c | 157 +++++++-------------------------- crypto/Kconfig | 2 +- 2 files changed, 31 insertions(+), 128 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c index 1b1b39c66c5e..80d917f7e467 100644 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -11,8 +11,8 @@ * any later version. 
*/ -#include #include +#include #include #include #include @@ -242,131 +242,35 @@ static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) { } -static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} - -static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - -static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} - -static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} - -static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg crypto_aegis128l_aesni_alg[] = { - { - .setkey = crypto_aegis128l_aesni_setkey, - .setauthsize = crypto_aegis128l_aesni_setauthsize, - .encrypt = crypto_aegis128l_aesni_encrypt, - .decrypt = crypto_aegis128l_aesni_decrypt, - .init = crypto_aegis128l_aesni_init_tfm, - .exit = crypto_aegis128l_aesni_exit_tfm, - - .ivsize = AEGIS128L_NONCE_SIZE, - .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, - .chunksize = AEGIS128L_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - - .cra_name = "__aegis128l", - .cra_driver_name = "__aegis128l-aesni", - - .cra_module = THIS_MODULE, - } - }, { - .setkey = cryptd_aegis128l_aesni_setkey, - .setauthsize = cryptd_aegis128l_aesni_setauthsize, - .encrypt = cryptd_aegis128l_aesni_encrypt, - .decrypt = cryptd_aegis128l_aesni_decrypt, - .init = cryptd_aegis128l_aesni_init_tfm, - .exit = cryptd_aegis128l_aesni_exit_tfm, - - .ivsize = AEGIS128L_NONCE_SIZE, - .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, - .chunksize = AEGIS128L_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_alignmask = 0, - - .cra_priority = 400, - - .cra_name = "aegis128l", - .cra_driver_name = "aegis128l-aesni", - - .cra_module = 
THIS_MODULE, - } +static struct aead_alg crypto_aegis128l_aesni_alg = { + .setkey = crypto_aegis128l_aesni_setkey, + .setauthsize = crypto_aegis128l_aesni_setauthsize, + .encrypt = crypto_aegis128l_aesni_encrypt, + .decrypt = crypto_aegis128l_aesni_decrypt, + .init = crypto_aegis128l_aesni_init_tfm, + .exit = crypto_aegis128l_aesni_exit_tfm, + + .ivsize = AEGIS128L_NONCE_SIZE, + .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, + .chunksize = AEGIS128L_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + .cra_priority = 400, + + .cra_name = "__aegis128l", + .cra_driver_name = "__aegis128l-aesni", + + .cra_module = THIS_MODULE, } }; +static struct simd_aead_alg *simd_alg; + static int __init crypto_aegis128l_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || @@ -374,14 +278,13 @@ static int __init crypto_aegis128l_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_aegis128l_aesni_alg, - ARRAY_SIZE(crypto_aegis128l_aesni_alg)); + return simd_register_aeads_compat(&crypto_aegis128l_aesni_alg, 1, + &simd_alg); } static void __exit crypto_aegis128l_aesni_module_exit(void) { - crypto_unregister_aeads(crypto_aegis128l_aesni_alg, - ARRAY_SIZE(crypto_aegis128l_aesni_alg)); + simd_unregister_aeads(&crypto_aegis128l_aesni_alg, 1, &simd_alg); } module_init(crypto_aegis128l_aesni_module_init); diff --git a/crypto/Kconfig b/crypto/Kconfig index 5b2c4cd7923f..ff05a87cf9e0 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -318,7 +318,7 @@ config CRYPTO_AEGIS128L_AESNI_SSE2 tristate "AEGIS-128L AEAD algorithm (x86_64 AESNI+SSE2 implementation)" depends on X86 && 64BIT select CRYPTO_AEAD - select CRYPTO_CRYPTD + select CRYPTO_SIMD help AESNI+SSE2 implementation of the AEGSI-128L dedicated AEAD algorithm. -- cgit v1.2.3 From b6708c2d8fbdeecfaf8380329d4e136f68deef05 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:55 -0700 Subject: crypto: x86/aegis256 - convert to use AEAD SIMD helpers Convert the x86 implementation of AEGIS-256 to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis256-aesni-glue.c | 157 +++++++--------------------------- crypto/Kconfig | 2 +- 2 files changed, 31 insertions(+), 128 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index 6227ca3220a0..716eecb66bd5 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -11,8 +11,8 @@ * any later version. 
*/ -#include #include +#include #include #include #include @@ -242,131 +242,35 @@ static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead) { } -static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} - -static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - -static int cryptd_aegis256_aesni_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} - -static int cryptd_aegis256_aesni_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} - -static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg crypto_aegis256_aesni_alg[] = { - { - .setkey = crypto_aegis256_aesni_setkey, - .setauthsize = crypto_aegis256_aesni_setauthsize, - .encrypt = crypto_aegis256_aesni_encrypt, - .decrypt = crypto_aegis256_aesni_decrypt, - .init = crypto_aegis256_aesni_init_tfm, - .exit = crypto_aegis256_aesni_exit_tfm, - - .ivsize = AEGIS256_NONCE_SIZE, - .maxauthsize = AEGIS256_MAX_AUTH_SIZE, - .chunksize = AEGIS256_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - - .cra_name = "__aegis256", - .cra_driver_name = "__aegis256-aesni", - - .cra_module = THIS_MODULE, - } - }, { - .setkey = cryptd_aegis256_aesni_setkey, - .setauthsize = cryptd_aegis256_aesni_setauthsize, - .encrypt = cryptd_aegis256_aesni_encrypt, - .decrypt = cryptd_aegis256_aesni_decrypt, - .init = cryptd_aegis256_aesni_init_tfm, - .exit = cryptd_aegis256_aesni_exit_tfm, - - .ivsize = AEGIS256_NONCE_SIZE, - .maxauthsize = AEGIS256_MAX_AUTH_SIZE, - .chunksize = AEGIS256_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_alignmask = 0, - - .cra_priority = 400, - - .cra_name = "aegis256", - .cra_driver_name = "aegis256-aesni", - - .cra_module = THIS_MODULE, - } +static struct aead_alg 
crypto_aegis256_aesni_alg = { + .setkey = crypto_aegis256_aesni_setkey, + .setauthsize = crypto_aegis256_aesni_setauthsize, + .encrypt = crypto_aegis256_aesni_encrypt, + .decrypt = crypto_aegis256_aesni_decrypt, + .init = crypto_aegis256_aesni_init_tfm, + .exit = crypto_aegis256_aesni_exit_tfm, + + .ivsize = AEGIS256_NONCE_SIZE, + .maxauthsize = AEGIS256_MAX_AUTH_SIZE, + .chunksize = AEGIS256_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + .cra_priority = 400, + + .cra_name = "__aegis256", + .cra_driver_name = "__aegis256-aesni", + + .cra_module = THIS_MODULE, } }; +static struct simd_aead_alg *simd_alg; + static int __init crypto_aegis256_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || @@ -374,14 +278,13 @@ static int __init crypto_aegis256_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_aegis256_aesni_alg, - ARRAY_SIZE(crypto_aegis256_aesni_alg)); + return simd_register_aeads_compat(&crypto_aegis256_aesni_alg, 1, + &simd_alg); } static void __exit crypto_aegis256_aesni_module_exit(void) { - crypto_unregister_aeads(crypto_aegis256_aesni_alg, - ARRAY_SIZE(crypto_aegis256_aesni_alg)); + simd_unregister_aeads(&crypto_aegis256_aesni_alg, 1, &simd_alg); } module_init(crypto_aegis256_aesni_module_init); diff --git a/crypto/Kconfig b/crypto/Kconfig index ff05a87cf9e0..1b7238e05cf1 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -326,7 +326,7 @@ config CRYPTO_AEGIS256_AESNI_SSE2 tristate "AEGIS-256 AEAD algorithm (x86_64 AESNI+SSE2 implementation)" depends on X86 && 64BIT select CRYPTO_AEAD - select CRYPTO_CRYPTD + select CRYPTO_SIMD help AESNI+SSE2 implementation of the AEGSI-256 dedicated AEAD algorithm. -- cgit v1.2.3 From 477309580dcc5791a76302c53e50d0b8693b13bc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:56 -0700 Subject: crypto: x86/morus640 - convert to use AEAD SIMD helpers Convert the x86 implementation of MORUS-640 to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. 
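The request-modification bug fixed by these conversions is subtle enough to spell out: the deleted cryptd_*_encrypt()/decrypt() helpers called aead_request_set_tfm() on the caller's own aead_request, so the request came back silently rebound to a different tfm. The shared AEAD SIMD wrapper avoids that by cloning the request into its own request context first. A condensed sketch of the wrapper's encrypt path (paraphrasing crypto/simd.c, not the verbatim source):

static int simd_aead_encrypt(struct aead_request *req)
{
	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
	struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm);
	struct aead_request *subreq = aead_request_ctx(req);
	struct crypto_aead *child;

	*subreq = *req;		/* the caller's request is left untouched */

	if (!crypto_simd_usable() ||
	    (in_atomic() && cryptd_aead_queued(ctx->cryptd_tfm)))
		child = &ctx->cryptd_tfm->base;		/* defer to cryptd */
	else
		child = cryptd_aead_child(ctx->cryptd_tfm); /* direct SIMD path */

	aead_request_set_tfm(subreq, child);
	return crypto_aead_encrypt(subreq);
}
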
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/morus640-sse2-glue.c | 12 ++--- arch/x86/crypto/morus640_glue.c | 85 ------------------------------------ crypto/Kconfig | 2 +- include/crypto/morus640_glue.h | 79 ++++++++++----------------------- 4 files changed, 30 insertions(+), 148 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c index 9afaf8f8565a..32da56b3bdad 100644 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -35,7 +36,9 @@ asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src, asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); +MORUS640_DECLARE_ALG(sse2, "morus640-sse2", 400); + +static struct simd_aead_alg *simd_alg; static int __init crypto_morus640_sse2_module_init(void) { @@ -43,14 +46,13 @@ static int __init crypto_morus640_sse2_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_morus640_sse2_algs, - ARRAY_SIZE(crypto_morus640_sse2_algs)); + return simd_register_aeads_compat(&crypto_morus640_sse2_alg, 1, + &simd_alg); } static void __exit crypto_morus640_sse2_module_exit(void) { - crypto_unregister_aeads(crypto_morus640_sse2_algs, - ARRAY_SIZE(crypto_morus640_sse2_algs)); + simd_unregister_aeads(&crypto_morus640_sse2_alg, 1, &simd_alg); } module_init(crypto_morus640_sse2_module_init); diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c index cb3a81732016..1dea33d84426 100644 --- a/arch/x86/crypto/morus640_glue.c +++ b/arch/x86/crypto/morus640_glue.c @@ -11,7 +11,6 @@ * any later version. 
*/ -#include #include #include #include @@ -200,90 +199,6 @@ void crypto_morus640_glue_init_ops(struct crypto_aead *aead, } EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops); -int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey); - -int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize); - -int cryptd_morus640_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt); - -int cryptd_morus640_glue_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt); - -int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - const char *name = crypto_aead_alg(aead)->base.cra_driver_name; - char internal_name[CRYPTO_MAX_ALG_NAME]; - - if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) - >= CRYPTO_MAX_ALG_NAME) - return -ENAMETOOLONG; - - cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm); - -void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek "); MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 1b7238e05cf1..498ec4d98ce1 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -340,7 +340,7 @@ config CRYPTO_MORUS640_GLUE tristate depends on X86 select CRYPTO_AEAD - select CRYPTO_CRYPTD + select CRYPTO_SIMD help Common glue for SIMD optimizations of the MORUS-640 dedicated AEAD algorithm. 
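Where all of the deleted registration boilerplate went: simd_register_aeads_compat() registers the internal "__"-prefixed algorithm and then instantiates the async wrapper under the unprefixed name, so each glue driver keeps a single aead_alg plus one simd_aead_alg pointer. Roughly (a trimmed paraphrase of the helper; the upstream version carries extra sanity checks):

int simd_register_aeads_compat(struct aead_alg *algs, int count,
			       struct simd_aead_alg **simd_algs)
{
	int i, err;

	err = crypto_register_aeads(algs, count);
	if (err)
		return err;

	for (i = 0; i < count; i++) {
		/* e.g. "__morus640-sse2" gains a user-visible "morus640-sse2" */
		struct simd_aead_alg *simd;

		simd = simd_aead_create_compat(algs[i].base.cra_name + 2,
					       algs[i].base.cra_driver_name + 2,
					       algs[i].base.cra_driver_name);
		if (IS_ERR(simd)) {
			simd_unregister_aeads(algs, count, simd_algs);
			return PTR_ERR(simd);
		}
		simd_algs[i] = simd;
	}
	return 0;
}
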
diff --git a/include/crypto/morus640_glue.h b/include/crypto/morus640_glue.h index df8e1103ff94..0ee6266cb26c 100644 --- a/include/crypto/morus640_glue.h +++ b/include/crypto/morus640_glue.h @@ -47,16 +47,7 @@ int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm, int crypto_morus640_glue_encrypt(struct aead_request *req); int crypto_morus640_glue_decrypt(struct aead_request *req); -int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen); -int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize); -int cryptd_morus640_glue_encrypt(struct aead_request *req); -int cryptd_morus640_glue_decrypt(struct aead_request *req); -int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead); -void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead); - -#define MORUS640_DECLARE_ALGS(id, driver_name, priority) \ +#define MORUS640_DECLARE_ALG(id, driver_name, priority) \ static const struct morus640_glue_ops crypto_morus640_##id##_ops = {\ .init = crypto_morus640_##id##_init, \ .ad = crypto_morus640_##id##_ad, \ @@ -77,55 +68,29 @@ void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead); { \ } \ \ - static struct aead_alg crypto_morus640_##id##_algs[] = {\ - { \ - .setkey = crypto_morus640_glue_setkey, \ - .setauthsize = crypto_morus640_glue_setauthsize, \ - .encrypt = crypto_morus640_glue_encrypt, \ - .decrypt = crypto_morus640_glue_decrypt, \ - .init = crypto_morus640_##id##_init_tfm, \ - .exit = crypto_morus640_##id##_exit_tfm, \ - \ - .ivsize = MORUS_NONCE_SIZE, \ - .maxauthsize = MORUS_MAX_AUTH_SIZE, \ - .chunksize = MORUS640_BLOCK_SIZE, \ - \ - .base = { \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ - .cra_blocksize = 1, \ - .cra_ctxsize = sizeof(struct morus640_ctx), \ - .cra_alignmask = 0, \ - \ - .cra_name = "__morus640", \ - .cra_driver_name = "__"driver_name, \ - \ - .cra_module = THIS_MODULE, \ - } \ - }, { \ - .setkey = cryptd_morus640_glue_setkey, \ - .setauthsize = cryptd_morus640_glue_setauthsize, \ - .encrypt = cryptd_morus640_glue_encrypt, \ - .decrypt = cryptd_morus640_glue_decrypt, \ - .init = cryptd_morus640_glue_init_tfm, \ - .exit = cryptd_morus640_glue_exit_tfm, \ + static struct aead_alg crypto_morus640_##id##_alg = {\ + .setkey = crypto_morus640_glue_setkey, \ + .setauthsize = crypto_morus640_glue_setauthsize, \ + .encrypt = crypto_morus640_glue_encrypt, \ + .decrypt = crypto_morus640_glue_decrypt, \ + .init = crypto_morus640_##id##_init_tfm, \ + .exit = crypto_morus640_##id##_exit_tfm, \ + \ + .ivsize = MORUS_NONCE_SIZE, \ + .maxauthsize = MORUS_MAX_AUTH_SIZE, \ + .chunksize = MORUS640_BLOCK_SIZE, \ + \ + .base = { \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = 1, \ + .cra_ctxsize = sizeof(struct morus640_ctx), \ + .cra_alignmask = 0, \ + .cra_priority = priority, \ \ - .ivsize = MORUS_NONCE_SIZE, \ - .maxauthsize = MORUS_MAX_AUTH_SIZE, \ - .chunksize = MORUS640_BLOCK_SIZE, \ + .cra_name = "__morus640", \ + .cra_driver_name = "__"driver_name, \ \ - .base = { \ - .cra_flags = CRYPTO_ALG_ASYNC, \ - .cra_blocksize = 1, \ - .cra_ctxsize = sizeof(struct crypto_aead *), \ - .cra_alignmask = 0, \ - \ - .cra_priority = priority, \ - \ - .cra_name = "morus640", \ - .cra_driver_name = driver_name, \ - \ - .cra_module = THIS_MODULE, \ - } \ + .cra_module = THIS_MODULE, \ } \ } -- cgit v1.2.3 From e151a8d28c2c99c8a9a9bfbe2bd612e692c33efd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:57 -0700 Subject: crypto: x86/morus1280 - convert to use AEAD SIMD helpers Convert the x86 
implementations of MORUS-1280 to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/morus1280-avx2-glue.c | 12 ++--- arch/x86/crypto/morus1280-sse2-glue.c | 12 ++--- arch/x86/crypto/morus1280_glue.c | 85 ----------------------------------- crypto/Kconfig | 2 +- include/crypto/morus1280_glue.h | 79 +++++++++----------------------- 5 files changed, 37 insertions(+), 153 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c index 6634907d6ccd..679627a2a824 100644 --- a/arch/x86/crypto/morus1280-avx2-glue.c +++ b/arch/x86/crypto/morus1280-avx2-glue.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -35,7 +36,9 @@ asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src, asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400); +MORUS1280_DECLARE_ALG(avx2, "morus1280-avx2", 400); + +static struct simd_aead_alg *simd_alg; static int __init crypto_morus1280_avx2_module_init(void) { @@ -44,14 +47,13 @@ static int __init crypto_morus1280_avx2_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_morus1280_avx2_algs, - ARRAY_SIZE(crypto_morus1280_avx2_algs)); + return simd_register_aeads_compat(&crypto_morus1280_avx2_alg, 1, + &simd_alg); } static void __exit crypto_morus1280_avx2_module_exit(void) { - crypto_unregister_aeads(crypto_morus1280_avx2_algs, - ARRAY_SIZE(crypto_morus1280_avx2_algs)); + simd_unregister_aeads(&crypto_morus1280_avx2_alg, 1, &simd_alg); } module_init(crypto_morus1280_avx2_module_init); diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c index f40244eaf14d..c35c0638d0bb 100644 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -35,7 +36,9 @@ asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src, asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); +MORUS1280_DECLARE_ALG(sse2, "morus1280-sse2", 350); + +static struct simd_aead_alg *simd_alg; static int __init crypto_morus1280_sse2_module_init(void) { @@ -43,14 +46,13 @@ static int __init crypto_morus1280_sse2_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_morus1280_sse2_algs, - ARRAY_SIZE(crypto_morus1280_sse2_algs)); + return simd_register_aeads_compat(&crypto_morus1280_sse2_alg, 1, + &simd_alg); } static void __exit crypto_morus1280_sse2_module_exit(void) { - crypto_unregister_aeads(crypto_morus1280_sse2_algs, - ARRAY_SIZE(crypto_morus1280_sse2_algs)); + simd_unregister_aeads(&crypto_morus1280_sse2_alg, 1, &simd_alg); } module_init(crypto_morus1280_sse2_module_init); diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c index 7e600f8bcdad..30fc1bd98ec3 100644 --- a/arch/x86/crypto/morus1280_glue.c +++ b/arch/x86/crypto/morus1280_glue.c @@ -11,7 +11,6 @@ * any later version. 
*/ -#include #include #include #include @@ -205,90 +204,6 @@ void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, } EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops); -int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey); - -int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize); - -int cryptd_morus1280_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt); - -int cryptd_morus1280_glue_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt); - -int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - const char *name = crypto_aead_alg(aead)->base.cra_driver_name; - char internal_name[CRYPTO_MAX_ALG_NAME]; - - if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) - >= CRYPTO_MAX_ALG_NAME) - return -ENAMETOOLONG; - - cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm); - -void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek "); MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 498ec4d98ce1..6ad6d11c990b 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -363,7 +363,7 @@ config CRYPTO_MORUS1280_GLUE tristate depends on X86 select CRYPTO_AEAD - select CRYPTO_CRYPTD + select CRYPTO_SIMD help Common glue for SIMD optimizations of the MORUS-1280 dedicated AEAD algorithm. 
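From the crypto API consumer's side nothing changes: the wrapper registered by this series is an ordinary asynchronous AEAD. A hypothetical synchronous-style caller might look like the sketch below (the "morus1280" lookup, the 16-byte key, and the 16-byte tag are illustrative values only; error handling is trimmed):

#include <crypto/aead.h>
#include <linux/scatterlist.h>

static int morus1280_encrypt_once(struct scatterlist *src,
				  struct scatterlist *dst,
				  unsigned int assoclen,
				  unsigned int cryptlen,
				  const u8 *key, u8 *iv)
{
	struct crypto_aead *tfm;
	struct aead_request *req;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	tfm = crypto_alloc_aead("morus1280", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, 16) ?:
	      crypto_aead_setauthsize(tfm, 16);
	if (err)
		goto out;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out;
	}
	aead_request_set_callback(req, 0, crypto_req_done, &wait);
	aead_request_set_ad(req, assoclen);
	aead_request_set_crypt(req, src, dst, cryptlen, iv);

	/* Runs the SSE2/AVX2 code directly when SIMD is usable and
	 * transparently bounces to cryptd otherwise. */
	err = crypto_wait_req(crypto_aead_encrypt(req), &wait);

	aead_request_free(req);
out:
	crypto_free_aead(tfm);
	return err;
}
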
diff --git a/include/crypto/morus1280_glue.h b/include/crypto/morus1280_glue.h index ad2aa743dd99..5cefddb1991f 100644 --- a/include/crypto/morus1280_glue.h +++ b/include/crypto/morus1280_glue.h @@ -47,16 +47,7 @@ int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm, int crypto_morus1280_glue_encrypt(struct aead_request *req); int crypto_morus1280_glue_decrypt(struct aead_request *req); -int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen); -int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize); -int cryptd_morus1280_glue_encrypt(struct aead_request *req); -int cryptd_morus1280_glue_decrypt(struct aead_request *req); -int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead); -void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead); - -#define MORUS1280_DECLARE_ALGS(id, driver_name, priority) \ +#define MORUS1280_DECLARE_ALG(id, driver_name, priority) \ static const struct morus1280_glue_ops crypto_morus1280_##id##_ops = {\ .init = crypto_morus1280_##id##_init, \ .ad = crypto_morus1280_##id##_ad, \ @@ -77,55 +68,29 @@ void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead); { \ } \ \ - static struct aead_alg crypto_morus1280_##id##_algs[] = {\ - { \ - .setkey = crypto_morus1280_glue_setkey, \ - .setauthsize = crypto_morus1280_glue_setauthsize, \ - .encrypt = crypto_morus1280_glue_encrypt, \ - .decrypt = crypto_morus1280_glue_decrypt, \ - .init = crypto_morus1280_##id##_init_tfm, \ - .exit = crypto_morus1280_##id##_exit_tfm, \ - \ - .ivsize = MORUS_NONCE_SIZE, \ - .maxauthsize = MORUS_MAX_AUTH_SIZE, \ - .chunksize = MORUS1280_BLOCK_SIZE, \ - \ - .base = { \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ - .cra_blocksize = 1, \ - .cra_ctxsize = sizeof(struct morus1280_ctx), \ - .cra_alignmask = 0, \ - \ - .cra_name = "__morus1280", \ - .cra_driver_name = "__"driver_name, \ - \ - .cra_module = THIS_MODULE, \ - } \ - }, { \ - .setkey = cryptd_morus1280_glue_setkey, \ - .setauthsize = cryptd_morus1280_glue_setauthsize, \ - .encrypt = cryptd_morus1280_glue_encrypt, \ - .decrypt = cryptd_morus1280_glue_decrypt, \ - .init = cryptd_morus1280_glue_init_tfm, \ - .exit = cryptd_morus1280_glue_exit_tfm, \ + static struct aead_alg crypto_morus1280_##id##_alg = { \ + .setkey = crypto_morus1280_glue_setkey, \ + .setauthsize = crypto_morus1280_glue_setauthsize, \ + .encrypt = crypto_morus1280_glue_encrypt, \ + .decrypt = crypto_morus1280_glue_decrypt, \ + .init = crypto_morus1280_##id##_init_tfm, \ + .exit = crypto_morus1280_##id##_exit_tfm, \ + \ + .ivsize = MORUS_NONCE_SIZE, \ + .maxauthsize = MORUS_MAX_AUTH_SIZE, \ + .chunksize = MORUS1280_BLOCK_SIZE, \ + \ + .base = { \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = 1, \ + .cra_ctxsize = sizeof(struct morus1280_ctx), \ + .cra_alignmask = 0, \ + .cra_priority = priority, \ \ - .ivsize = MORUS_NONCE_SIZE, \ - .maxauthsize = MORUS_MAX_AUTH_SIZE, \ - .chunksize = MORUS1280_BLOCK_SIZE, \ + .cra_name = "__morus1280", \ + .cra_driver_name = "__"driver_name, \ \ - .base = { \ - .cra_flags = CRYPTO_ALG_ASYNC, \ - .cra_blocksize = 1, \ - .cra_ctxsize = sizeof(struct crypto_aead *), \ - .cra_alignmask = 0, \ - \ - .cra_priority = priority, \ - \ - .cra_name = "morus1280", \ - .cra_driver_name = driver_name, \ - \ - .cra_module = THIS_MODULE, \ - } \ + .cra_module = THIS_MODULE, \ } \ } -- cgit v1.2.3 From f2abe0d72b21671ad19db075262411e3d4a828dd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 Mar 2019 22:12:48 -0700 Subject: crypto: x86 - convert to 
use crypto_simd_usable() Replace all calls to irq_fpu_usable() in the x86 crypto code with crypto_simd_usable(), in order to allow testing the no-SIMD code paths. Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 8 ++++---- arch/x86/crypto/chacha_glue.c | 6 +++--- arch/x86/crypto/crc32-pclmul_glue.c | 5 +++-- arch/x86/crypto/crc32c-intel_glue.c | 7 ++++--- arch/x86/crypto/crct10dif-pclmul_glue.c | 7 ++++--- arch/x86/crypto/ghash-clmulni-intel_glue.c | 9 +++++---- arch/x86/crypto/nhpoly1305-avx2-glue.c | 5 +++-- arch/x86/crypto/nhpoly1305-sse2-glue.c | 5 +++-- arch/x86/crypto/poly1305_glue.c | 4 ++-- arch/x86/crypto/sha1_ssse3_glue.c | 7 ++++--- arch/x86/crypto/sha256_ssse3_glue.c | 7 ++++--- arch/x86/crypto/sha512_ssse3_glue.c | 10 +++++----- 12 files changed, 44 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 1466d59384ac..21c246799aa5 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -30,8 +30,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -332,7 +332,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, return -EINVAL; } - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) err = crypto_aes_expand_key(ctx, in_key, key_len); else { kernel_fpu_begin(); @@ -353,7 +353,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) crypto_aes_encrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); @@ -366,7 +366,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) crypto_aes_decrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 45c1c4143176..4967ad620775 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -12,10 +12,10 @@ #include #include +#include #include #include #include -#include #include #define CHACHA_STATE_ALIGN 16 @@ -170,7 +170,7 @@ static int chacha_simd(struct skcipher_request *req) struct skcipher_walk walk; int err; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_chacha_crypt(req); err = skcipher_walk_virt(&walk, req, true); @@ -193,7 +193,7 @@ static int xchacha_simd(struct skcipher_request *req) u8 real_iv[16]; int err; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_xchacha_crypt(req); err = skcipher_walk_virt(&walk, req, true); diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index c8d9cdacbf10..cb4ab6645106 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -32,10 +32,11 @@ #include #include #include +#include #include #include -#include +#include #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -54,7 +55,7 @@ static u32 __attribute__((pure)) unsigned int iremainder; unsigned int prealign; - if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable()) + if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable()) return crc32_le(crc, p, len); 
if ((long)p & SCALE_F_MASK) { diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 5773e1161072..a58fe217c856 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -29,10 +29,11 @@ #include #include #include +#include #include #include -#include +#include #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -177,7 +178,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -189,7 +190,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index 0e785c0b2354..64f17d2d8b67 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -26,12 +26,13 @@ #include #include #include +#include #include #include #include -#include #include #include +#include asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); @@ -53,7 +54,7 @@ static int chksum_update(struct shash_desc *desc, const u8 *data, { struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - if (length >= 16 && irq_fpu_usable()) { + if (length >= 16 && crypto_simd_usable()) { kernel_fpu_begin(); ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); kernel_fpu_end(); @@ -73,7 +74,7 @@ static int chksum_final(struct shash_desc *desc, u8 *out) static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= 16 && irq_fpu_usable()) { + if (len >= 16 && crypto_simd_usable()) { kernel_fpu_begin(); *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); kernel_fpu_end(); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 3582ae885ee1..4099a0ae17dd 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -19,8 +19,9 @@ #include #include #include -#include +#include #include +#include #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -182,7 +183,7 @@ static int ghash_async_update(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -200,7 +201,7 @@ static int ghash_async_final(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -241,7 +242,7 @@ static int ghash_async_digest(struct ahash_request *req) struct ahash_request *cryptd_req = ahash_request_ctx(req); struct 
cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c index 20d815ea4b6a..f7567cbd35b6 100644 --- a/arch/x86/crypto/nhpoly1305-avx2-glue.c +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -7,9 +7,10 @@ */ #include +#include #include #include -#include +#include asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len, u8 hash[NH_HASH_BYTES]); @@ -24,7 +25,7 @@ static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_avx2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - if (srclen < 64 || !irq_fpu_usable()) + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); do { diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c index ed68d164ce14..a661ede3b5cf 100644 --- a/arch/x86/crypto/nhpoly1305-sse2-glue.c +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -7,9 +7,10 @@ */ #include +#include #include #include -#include +#include asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len, u8 hash[NH_HASH_BYTES]); @@ -24,7 +25,7 @@ static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_sse2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - if (srclen < 64 || !irq_fpu_usable()) + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); do { diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 88cc01506c84..6eb65b237b3c 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -11,11 +11,11 @@ #include #include +#include #include #include #include #include -#include #include struct poly1305_simd_desc_ctx { @@ -126,7 +126,7 @@ static int poly1305_simd_update(struct shash_desc *desc, unsigned int bytes; /* kernel_fpu_begin/end is costly, use fallback for small updates */ - if (srclen <= 288 || !may_use_simd()) + if (srclen <= 288 || !crypto_simd_usable()) return crypto_poly1305_update(desc, src, srclen); kernel_fpu_begin(); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 7391c7de72c7..42f177afc33a 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -22,6 +22,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -29,7 +30,7 @@ #include #include #include -#include +#include typedef void (sha1_transform_fn)(u32 *digest, const char *data, unsigned int rounds); @@ -39,7 +40,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, { struct sha1_state *sctx = shash_desc_ctx(desc); - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) return crypto_sha1_update(desc, data, len); @@ -57,7 +58,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, static int sha1_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha1_transform_fn *sha1_xform) { - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) return crypto_sha1_finup(desc, data, len, out); kernel_fpu_begin(); diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 
773a873d2b28..73867da3cbee 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -30,6 +30,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -37,8 +38,8 @@ #include #include #include -#include #include +#include asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, u64 rounds); @@ -49,7 +50,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data, { struct sha256_state *sctx = shash_desc_ctx(desc); - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) return crypto_sha256_update(desc, data, len); @@ -67,7 +68,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data, static int sha256_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha256_transform_fn *sha256_xform) { - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) return crypto_sha256_finup(desc, data, len, out); kernel_fpu_begin(); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index f1b811b60ba6..458356a3f124 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -28,16 +28,16 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include #include +#include #include #include #include -#include - -#include +#include asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, u64 rounds); @@ -49,7 +49,7 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, { struct sha512_state *sctx = shash_desc_ctx(desc); - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return crypto_sha512_update(desc, data, len); @@ -67,7 +67,7 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha512_transform_fn *sha512_xform) { - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) return crypto_sha512_finup(desc, data, len, out); kernel_fpu_begin(); -- cgit v1.2.3 From 0f0b7e1cc7abf8e1a8b301f2868379d611d05ae2 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 7 Mar 2019 13:09:13 +0100 Subject: x86/tsc: Add option to disable tsc clocksource watchdog Clocksource watchdog has been found responsible for generating latency spikes (in the 10-20 us range) when woken up to check for TSC stability. Add an option to disable it at boot. Signed-off-by: Juri Lelli Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: linux-rt-users@vger.kernel.org Cc: peterz@infradead.org Cc: bristot@redhat.com Cc: williams@redhat.com Link: https://lkml.kernel.org/r/20190307120913.13168-1-juri.lelli@redhat.com --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ arch/x86/kernel/tsc.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b8ee90bb644..c4d830003b21 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4703,6 +4703,10 @@ [x86] unstable: mark the TSC clocksource as unstable, this marks the TSC unconditionally unstable at bootup and avoids any further wobbles once the TSC watchdog notices. + [x86] nowatchdog: disable clocksource watchdog. 
Used + in situations with strict latency requirements (where + interruptions from clocksource watchdog are not + acceptable). turbografx.map[2|3]= [HW,JOY] TurboGraFX parallel port interface diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3fae23834069..aab0c82e0a0d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -283,6 +283,7 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); static int no_sched_irq_time; +static int no_tsc_watchdog; static int __init tsc_setup(char *str) { @@ -292,6 +293,8 @@ static int __init tsc_setup(char *str) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); + if (!strcmp(str, "nowatchdog")) + no_tsc_watchdog = 1; return 1; } @@ -1349,7 +1352,7 @@ static int __init init_tsc_clocksource(void) if (tsc_unstable) goto unreg; - if (tsc_clocksource_reliable) + if (tsc_clocksource_reliable || no_tsc_watchdog) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) -- cgit v1.2.3 From e0ceeae708cebf22c990c3d703a4ca187dc837f5 Mon Sep 17 00:00:00 2001 From: Pu Wen Date: Sat, 23 Mar 2019 23:42:20 +0800 Subject: x86/CPU/hygon: Fix phys_proc_id calculation logic for multi-die processors The Hygon family 18h multi-die processor platform supports 1, 2 or 4-Dies per socket. The topology looks like this: System View (with 1-Die 2-Socket): |------------| ------ ----- SOCKET0 | D0 | | D1 | SOCKET1 ------ ----- System View (with 2-Die 2-socket): -------------------- | -------------|------ | | | | ------------ ------------ SOCKET0 | D1 -- D0 | | D3 -- D2 | SOCKET1 ------------ ------------ System View (with 4-Die 2-Socket) : -------------------- | -------------|------ | | | | ------------ ------------ | D1 -- D0 | | D7 -- D6 | | | \/ | | | | \/ | | SOCKET0 | | /\ | | | | /\ | | SOCKET1 | D2 -- D3 | | D4 -- D5 | ------------ ------------ | | | | ------|------------| | -------------------- Currently phys_proc_id = initial_apicid >> bits calculates the physical processor ID from the initial_apicid by shifting *bits*. However, this does not work for 1-Die and 2-Die 2-socket systems. According to document [1] section 2.1.11.1, the bits is the value of CPUID_Fn80000008_ECX[12:15]. The possible values are 4, 5 or 6 which mean: 4 - 1 die 5 - 2 dies 6 - 3/4 dies. Hygon programs the initial ApicId the same way as AMD. The ApicId is read from CPUID_Fn00000001_EBX (see section 2.1.11.1 of referrence [1]) and the definition is as below (see section 2.1.10.2.1.3 of [1]): ------------------------------------------------- Bit | 6 | 5 4 | 3 | 2 1 0 | |-----------|---------|--------|----------------| IDs | Socket ID | Node ID | CCX ID | Core/Thread ID | ------------------------------------------------- So for 3/4-Die configurations, the bits variable is 6, which is the same as the ApicID definition field. For 1-Die and 2-Die configurations, bits is 4 or 5, which will cause the right shifted result to not be exactly the value of socket ID. However, the socket ID should be obtained from ApicId[6]. To fix the problem and match the ApicID field definition, set the shift bits to 6 for all Hygon family 18h multi-die CPUs. Because AMD doesn't have 2-Socket systems with 1-Die/2-Die processors (see reference [2]), this doesn't need to be changed on the AMD side but only for Hygon. References: [1] https://www.amd.com/system/files/TechDocs/54945_PPR_Family_17h_Models_00h-0Fh.pdf [2] https://www.amd.com/en/products/specifications/processors [bp: heavily massage commit message. 
] Signed-off-by: Pu Wen Signed-off-by: Borislav Petkov Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Thomas Lendacky Cc: Yazen Ghannam Cc: x86-ml Link: https://lkml.kernel.org/r/1553355740-19999-1-git-send-email-puwen@hygon.cn --- arch/x86/kernel/cpu/hygon.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index cf25405444ab..415621ddb8a2 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -19,6 +19,8 @@ #include "cpu.h" +#define APICID_SOCKET_ID_BIT 6 + /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] @@ -87,6 +89,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); + /* Socket ID is ApicId[6] for these processors. */ + c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + cacheinfo_hygon_init_llc_id(c, cpu, node_id); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; -- cgit v1.2.3 From 9308fd4074551f222f30322d1ee8c5aff18e9747 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 22 Mar 2019 20:29:00 +0000 Subject: x86/MCE: Group AMD function prototypes in There are two groups of "ifdef CONFIG_X86_MCE_AMD" function prototypes in . Merge these two groups. No functional change. [ bp: align vertically. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Arnd Bergmann Cc: "clemej@gmail.com" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Pu Wen Cc: Qiuxu Zhuo Cc: "rafal@milecki.pl" Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: x86-ml Link: https://lkml.kernel.org/r/20190322202848.20749-3-Yazen.Ghannam@amd.com --- arch/x86/include/asm/mce.h | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 22d05e3835f0..dc2d4b206ab7 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -210,16 +210,6 @@ static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} #endif -#ifdef CONFIG_X86_MCE_AMD -void mce_amd_feature_init(struct cpuinfo_x86 *c); -int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); -#else -static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } -static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; -#endif - -static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } - int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); @@ -345,12 +335,19 @@ extern bool amd_mce_is_memory_error(struct mce *m); extern int mce_threshold_create_device(unsigned int cpu); extern int mce_threshold_remove_device(unsigned int cpu); -#else +void mce_amd_feature_init(struct cpuinfo_x86 *c); +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); -static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; -static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; -static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; +#else +static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; +static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; +static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; +static inline 
void mce_amd_feature_init(struct cpuinfo_x86 *c) { } +static inline int +umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; #endif +static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } + #endif /* _ASM_X86_MCE_H */ -- cgit v1.2.3 From 766460852cfaeca4042e5f3aeb9616b3689147bc Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Mon, 25 Mar 2019 15:29:22 -0500 Subject: x86/platform/uv: Fix missing checks of kcalloc() return values Handle potential errors returned from kcalloc(). [ bp: rewrite commit message. ] Signed-off-by: Kangjie Lu Signed-off-by: Borislav Petkov Cc: Andrew Banman Cc: Andy Shevchenko Cc: Colin Ian King Cc: Darren Hart Cc: "Gustavo A. R. Silva" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mike Travis Cc: Nicolai Stange Cc: pakki001@umn.edu Cc: platform-driver-x86@vger.kernel.org Cc: Thomas Gleixner Cc: Varsha Rao Cc: x86-ml Link: https://lkml.kernel.org/r/20190325202924.4624-1-kjlu@umn.edu --- arch/x86/platform/uv/tlb_uv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 2c53b0f19329..1297e185b8c8 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -2133,14 +2133,19 @@ static int __init summarize_uvhub_sockets(int nuvhubs, */ static int __init init_per_cpu(int nuvhubs, int base_part_pnode) { - unsigned char *uvhub_mask; struct uvhub_desc *uvhub_descs; + unsigned char *uvhub_mask = NULL; if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub()) timeout_us = calculate_destination_timeout(); uvhub_descs = kcalloc(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL); + if (!uvhub_descs) + goto fail; + uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); + if (!uvhub_mask) + goto fail; if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) goto fail; -- cgit v1.2.3 From f19501aa07f18268ab14f458b51c1c6b7f72a134 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 12 Mar 2019 10:09:38 -0700 Subject: x86/mce: Fix machine_check_poll() tests for error types There has been a lurking "TBD" in the machine check poll routine ever since it was first split out from the machine check handler. The potential issue is that the poll routine may have just begun a read from the STATUS register in a machine check bank when the hardware logs an error in that bank and signals a machine check. That race used to be pretty small back when machine checks were broadcast, but the addition of local machine check means that the poll code could continue running and clear the error from the bank before the local machine check handler on another CPU gets around to reading it. Fix the code to be sure to only process errors that need to be processed in the poll code, leaving other logged errors alone for the machine check handler to find and process. [ bp: Massage a bit and flip the "== 0" check to the usual !(..) test. ] Fixes: b79109c3bbcf ("x86, mce: separate correct machine check poller and fatal exception handler") Fixes: ed7290d0ee8f ("x86, mce: implement new status bits") Reported-by: Ashok Raj Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Cc: Ashok Raj Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: linux-edac Cc: Thomas Gleixner Cc: x86-ml Cc: Yazen Ghannam Link: https://lkml.kernel.org/r/20190312170938.GA23035@agluck-desk --- arch/x86/kernel/cpu/mce/core.c | 44 +++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b7fb541a4873..e558ca77cfe8 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -712,19 +712,49 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) barrier(); m.status = mce_rdmsrl(msr_ops.status(i)); + + /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) continue; /* - * Uncorrected or signalled events are handled by the exception - * handler when it is enabled, so don't process those here. - * - * TBD do the same check for MCI_STATUS_EN here? + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. */ - if (!(flags & MCP_UC) && - (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) - continue; + if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + goto log_it; + + /* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ + if (!mca_cfg.ser) { + if (m.status & MCI_STATUS_UC) + continue; + goto log_it; + } + + /* Log "not enabled" (speculative) errors */ + if (!(m.status & MCI_STATUS_EN)) + goto log_it; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + goto log_it; + + /* + * Skip anything else. Presumption is that our read of this + * bank is racing with a machine check. Leave the log alone + * for do_machine_check() to deal with it. + */ + continue; +log_it: error_seen = true; mce_read_aux(&m, i); -- cgit v1.2.3 From 006c077041dc73b9490fffc4c6af5befe0687110 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jul 2018 16:40:09 -0500 Subject: x86/mce: Handle varying MCA bank counts Linux reads MCG_CAP[Count] to find the number of MCA banks visible to a CPU. Currently, this number is the same for all CPUs and a warning is shown if there is a difference. The number of banks is overwritten with the MCG_CAP[Count] value of each following CPU that boots. According to the Intel SDM and AMD APM, the MCG_CAP[Count] value gives the number of banks that are available to a "processor implementation". The AMD BKDGs/PPRs further clarify that this value is per core. This value has historically been the same for every core in the system, but that is not an architectural requirement. Future AMD systems may have different MCG_CAP[Count] values per core, so the assumption that all CPUs will have the same MCG_CAP[Count] value will no longer be valid. Also, the first CPU to boot will allocate the struct mce_banks[] array using the number of banks based on its MCG_CAP[Count] value. The machine check handler and other functions use the global number of banks to iterate and index into the mce_banks[] array. So it's possible to use an out-of-bounds index on an asymmetric system where a following CPU sees a MCG_CAP[Count] value greater than its predecessors. Thus, allocate the mce_banks[] array to the maximum number of banks. This will avoid the potential out-of-bounds index since the value of mca_cfg.banks is capped to MAX_NR_BANKS. 
Set the value of mca_cfg.banks equal to the max of the previous value and the value for the current CPU. This way mca_cfg.banks will always represent the max number of banks detected on any CPU in the system. This will ensure that all CPUs will access all the banks that are visible to them. A CPU that can access fewer than the max number of banks will find the registers of the extra banks to be read-as-zero. Furthermore, print the resulting number of MCA banks in use. Do this in mcheck_late_init() so that the final value is printed after all CPUs have been initialized. Finally, get bank count from target CPU when doing injection with mce-inject module. [ bp: Remove out-of-bounds example, passify and cleanup commit message. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: linux-edac Cc: Pu Wen Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: x86-ml Link: https://lkml.kernel.org/r/20180727214009.78289-1-Yazen.Ghannam@amd.com --- arch/x86/kernel/cpu/mce/core.c | 22 +++++++--------------- arch/x86/kernel/cpu/mce/inject.c | 14 +++++++------- 2 files changed, 14 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index e558ca77cfe8..c3498732ba28 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1481,13 +1481,12 @@ EXPORT_SYMBOL_GPL(mce_notify_irq); static int __mcheck_cpu_mce_banks_init(void) { int i; - u8 num_banks = mca_cfg.banks; - mce_banks = kcalloc(num_banks, sizeof(struct mce_bank), GFP_KERNEL); + mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL); if (!mce_banks) return -ENOMEM; - for (i = 0; i < num_banks; i++) { + for (i = 0; i < MAX_NR_BANKS; i++) { struct mce_bank *b = &mce_banks[i]; b->ctl = -1ULL; @@ -1501,28 +1500,19 @@ static int __mcheck_cpu_mce_banks_init(void) */ static int __mcheck_cpu_cap_init(void) { - unsigned b; u64 cap; + u8 b; rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (!mca_cfg.banks) - pr_info("CPU supports %d MCE banks\n", b); - - if (b > MAX_NR_BANKS) { - pr_warn("Using only %u machine check banks out of %u\n", - MAX_NR_BANKS, b); + if (WARN_ON_ONCE(b > MAX_NR_BANKS)) b = MAX_NR_BANKS; - } - /* Don't support asymmetric configurations today */ - WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); - mca_cfg.banks = b; + mca_cfg.banks = max(mca_cfg.banks, b); if (!mce_banks) { int err = __mcheck_cpu_mce_banks_init(); - if (err) return err; } @@ -2481,6 +2471,8 @@ EXPORT_SYMBOL_GPL(mcsafe_key); static int __init mcheck_late_init(void) { + pr_info("Using %d MCE banks\n", mca_cfg.banks); + if (mca_cfg.recovery) static_branch_inc(&mcsafe_key); diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 8492ef7d9015..3f82afd0f46f 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -46,8 +46,6 @@ static struct mce i_mce; static struct dentry *dfs_inj; -static u8 n_banks; - #define MAX_FLAG_OPT_SIZE 4 #define NBCFG 0x44 @@ -570,9 +568,15 @@ err: static int inj_bank_set(void *data, u64 val) { struct mce *m = (struct mce *)data; + u8 n_banks; + u64 cap; + + /* Get bank count on target CPU so we can handle non-uniform values. 
*/ + rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + n_banks = cap & MCG_BANKCNT_MASK; if (val >= n_banks) { - pr_err("Non-existent MCE bank: %llu\n", val); + pr_err("MCA bank %llu non-existent on CPU%d\n", val, m->extcpu); return -EINVAL; } @@ -665,10 +669,6 @@ static struct dfs_node { static int __init debugfs_init(void) { unsigned int i; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - n_banks = cap & MCG_BANKCNT_MASK; dfs_inj = debugfs_create_dir("mce-inject", NULL); if (!dfs_inj) -- cgit v1.2.3 From 8db5da0b8618df79eceea99672e205d4a2a6309e Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sun, 27 Jan 2019 19:03:45 -0500 Subject: x86/ima: require signed kernel modules Have the IMA architecture specific policy require signed kernel modules on systems with secure boot mode enabled; and coordinate the different signature verification methods, so only one signature is required. Requiring appended kernel module signatures may be configured, enabled on the boot command line, or with this patch enabled in secure boot mode. This patch defines set_module_sig_enforced(). To coordinate between appended kernel module signatures and IMA signatures, only define an IMA MODULE_CHECK policy rule if CONFIG_MODULE_SIG is not enabled. A custom IMA policy may still define and require an IMA signature. Signed-off-by: Mimi Zohar Reviewed-by: Luis Chamberlain Acked-by: Jessica Yu --- arch/x86/kernel/ima_arch.c | 9 ++++++++- include/linux/module.h | 5 +++++ kernel/module.c | 5 +++++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index e47cd9390ab4..3fb9847f1cad 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -64,12 +64,19 @@ static const char * const sb_arch_rules[] = { "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig", #endif /* CONFIG_KEXEC_VERIFY_SIG */ "measure func=KEXEC_KERNEL_CHECK", +#if !IS_ENABLED(CONFIG_MODULE_SIG) + "appraise func=MODULE_CHECK appraise_type=imasig", +#endif + "measure func=MODULE_CHECK", NULL }; const char * const *arch_get_ima_policy(void) { - if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) + if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) { + if (IS_ENABLED(CONFIG_MODULE_SIG)) + set_module_sig_enforced(); return sb_arch_rules; + } return NULL; } diff --git a/include/linux/module.h b/include/linux/module.h index 5bf5dcd91009..73ee2b10e816 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -676,6 +676,7 @@ static inline bool is_livepatch_module(struct module *mod) #endif /* CONFIG_LIVEPATCH */ bool is_module_sig_enforced(void); +void set_module_sig_enforced(void); #else /* !CONFIG_MODULES... */ @@ -796,6 +797,10 @@ static inline bool is_module_sig_enforced(void) return false; } +static inline void set_module_sig_enforced(void) +{ +} + /* Dereference module function descriptor */ static inline void *dereference_module_function_descriptor(struct module *mod, void *ptr) diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..985caa467aef 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -286,6 +286,11 @@ bool is_module_sig_enforced(void) } EXPORT_SYMBOL(is_module_sig_enforced); +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} + /* Block module loading/unloading? 
*/ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); -- cgit v1.2.3 From 0fca08122eaf5c956a2cbe12775245d747f8b1ac Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 28 Mar 2019 20:34:28 +0100 Subject: efi: Unify DMI setup code over the arm/arm64, ia64 and x86 architectures All architectures (arm/arm64, ia64 and x86) do the same here, so unify the code. Note: We do not need to call dump_stack_set_arch_desc() in case of !dmi_available. Both strings, dmi_ids_string and dump_stack_arch_ desc_str are initialized zero and thus nothing would change. Signed-off-by: Robert Richter Signed-off-by: Ard Biesheuvel Reviewed-by: Jean Delvare Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20190328193429.21373-5-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/ia64/kernel/setup.c | 4 +--- arch/x86/kernel/setup.c | 6 ++---- drivers/firmware/dmi_scan.c | 28 +++++++++++++++------------- drivers/firmware/efi/arm-runtime.c | 7 ++----- include/linux/dmi.h | 8 ++------ 5 files changed, 22 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 583a3746d70b..c9cfa760cd57 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -1058,9 +1058,7 @@ check_bugs (void) static int __init run_dmi_scan(void) { - dmi_scan_machine(); - dmi_memdev_walk(); - dmi_set_dump_stack_arch_desc(); + dmi_setup(); return 0; } core_initcall(run_dmi_scan); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3d872a527cd9..3773905cd2c1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1005,13 +1005,11 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_init(); - dmi_scan_machine(); - dmi_memdev_walk(); - dmi_set_dump_stack_arch_desc(); + dmi_setup(); /* * VMware detection requires dmi to be available, so this - * needs to be done after dmi_scan_machine(), for the boot CPU. + * needs to be done after dmi_setup(), for the boot CPU. */ init_hypervisor_platform(); diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 099d83e4e910..fae2d5c43314 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -416,11 +416,8 @@ static void __init save_mem_devices(const struct dmi_header *dm, void *v) nr++; } -void __init dmi_memdev_walk(void) +static void __init dmi_memdev_walk(void) { - if (!dmi_available) - return; - if (dmi_walk_early(count_mem_devices) == 0 && dmi_memdev_nr) { dmi_memdev = dmi_alloc(sizeof(*dmi_memdev) * dmi_memdev_nr); if (dmi_memdev) @@ -614,7 +611,7 @@ static int __init dmi_smbios3_present(const u8 *buf) return 1; } -void __init dmi_scan_machine(void) +static void __init dmi_scan_machine(void) { char __iomem *p, *q; char buf[32]; @@ -769,15 +766,20 @@ static int __init dmi_init(void) subsys_initcall(dmi_init); /** - * dmi_set_dump_stack_arch_desc - set arch description for dump_stack() + * dmi_setup - scan and setup DMI system information * - * Invoke dump_stack_set_arch_desc() with DMI system information so that - * DMI identifiers are printed out on task dumps. Arch boot code should - * call this function after dmi_scan_machine() if it wants to print out DMI - * identifiers on task dumps. + * Scan the DMI system information. 
This setups DMI identifiers + * (dmi_system_id) for printing it out on task dumps and prepares + * DIMM entry information (dmi_memdev_info) from the SMBIOS table + * for using this when reporting memory errors. */ -void __init dmi_set_dump_stack_arch_desc(void) +void __init dmi_setup(void) { + dmi_scan_machine(); + if (!dmi_available) + return; + + dmi_memdev_walk(); dump_stack_set_arch_desc("%s", dmi_ids_string); } @@ -841,7 +843,7 @@ static bool dmi_is_end_of_table(const struct dmi_system_id *dmi) * returns non zero or we hit the end. Callback function is called for * each successful match. Returns the number of matches. * - * dmi_scan_machine must be called before this function is called. + * dmi_setup must be called before this function is called. */ int dmi_check_system(const struct dmi_system_id *list) { @@ -871,7 +873,7 @@ EXPORT_SYMBOL(dmi_check_system); * Walk the blacklist table until the first match is found. Return the * pointer to the matching entry or NULL if there's no match. * - * dmi_scan_machine must be called before this function is called. + * dmi_setup must be called before this function is called. */ const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list) { diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 4a0dfe4ab829..e2ac5fa5531b 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -162,14 +162,11 @@ void efi_virtmap_unload(void) static int __init arm_dmi_init(void) { /* - * On arm64/ARM, DMI depends on UEFI, and dmi_scan_machine() needs to + * On arm64/ARM, DMI depends on UEFI, and dmi_setup() needs to * be called early because dmi_id_init(), which is an arch_initcall * itself, depends on dmi_scan_machine() having been called already. 
*/ - dmi_scan_machine(); - dmi_memdev_walk(); - if (dmi_available) - dmi_set_dump_stack_arch_desc(); + dmi_setup(); return 0; } core_initcall(arm_dmi_init); diff --git a/include/linux/dmi.h b/include/linux/dmi.h index c46fdb36700b..8de8c4f15163 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -102,9 +102,7 @@ const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list); extern const char * dmi_get_system_info(int field); extern const struct dmi_device * dmi_find_device(int type, const char *name, const struct dmi_device *from); -extern void dmi_scan_machine(void); -extern void dmi_memdev_walk(void); -extern void dmi_set_dump_stack_arch_desc(void); +extern void dmi_setup(void); extern bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp); extern int dmi_get_bios_year(void); extern int dmi_name_in_vendors(const char *str); @@ -122,9 +120,7 @@ static inline int dmi_check_system(const struct dmi_system_id *list) { return 0; static inline const char * dmi_get_system_info(int field) { return NULL; } static inline const struct dmi_device * dmi_find_device(int type, const char *name, const struct dmi_device *from) { return NULL; } -static inline void dmi_scan_machine(void) { return; } -static inline void dmi_memdev_walk(void) { } -static inline void dmi_set_dump_stack_arch_desc(void) { } +static inline void dmi_setup(void) { } static inline bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp) { if (yearp) -- cgit v1.2.3 From a72a19327b92e09dab0eb9fd2bc83466465cbffb Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 29 Mar 2019 00:09:39 +0100 Subject: x86/mm/tlb: Define LOADED_MM_SWITCHING with pointer-sized number sparse complains that LOADED_MM_SWITCHING's definition casts an int to a pointer: arch/x86/mm/tlb.c:409:17: warning: non size-preserving integer to pointer cast Use a pointer-sized integer constant instead. Signed-off-by: Jann Horn Signed-off-by: Borislav Petkov Reviewed-by: Mukesh Ojha Cc: Andy Lutomirski Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sai Praneeth Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190328230939.15711-1-jannh@google.com --- arch/x86/include/asm/tlbflush.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index f4204bf377fc..90926e8dd1f8 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -167,7 +167,7 @@ struct tlb_state { */ struct mm_struct *loaded_mm; -#define LOADED_MM_SWITCHING ((struct mm_struct *)1) +#define LOADED_MM_SWITCHING ((struct mm_struct *)1UL) /* Last user mm for optimizing IBPB */ union { -- cgit v1.2.3 From 39f0584ee695f1d75f4181b209602b9c49e73821 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 12 Mar 2019 19:39:23 +0100 Subject: x86/mce: Remove mce_report_event() Calling this function has been wrong for a while now: * Can't call schedule_work() in #MC context. * mce_notify_irq() either. * None of that noodling is needed anymore - all it needs to do is kick the IRQ work which would self-IPI so that once the #MC handler is done, the work queue will run and process queued MCE records. So remove it. 
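For reference, what remains is the standard irq_work self-IPI pattern; a minimal sketch (mce_irq_work_cb() and mce_schedule_work() exist in mce/core.c, the init wrapper here is illustrative):

	#include <linux/irq_work.h>

	static struct irq_work mce_irq_work;

	static void __init mce_irq_work_init(void)	/* illustrative wrapper */
	{
		init_irq_work(&mce_irq_work, mce_irq_work_cb);
	}

	/* tail of do_machine_check(): */
	if (worst > 0)
		irq_work_queue(&mce_irq_work);	/* self-IPI; the callback runs
						   once the #MC handler returns */
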
Reported-by: Peter Zijlstra Signed-off-by: Borislav Petkov Cc: Tony Luck Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190325172121.7926-1-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index c3498732ba28..58925e7ccb27 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -460,23 +460,6 @@ static void mce_irq_work_cb(struct irq_work *entry) mce_schedule_work(); } -static void mce_report_event(struct pt_regs *regs) -{ - if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { - mce_notify_irq(); - /* - * Triggering the work queue here is just an insurance - * policy in case the syscall exit notify handler - * doesn't run soon enough or ends up running on the - * wrong CPU (can happen when audit sleeps) - */ - mce_schedule_work(); - return; - } - - irq_work_queue(&mce_irq_work); -} - /* * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would @@ -1331,7 +1314,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) - mce_report_event(regs); + irq_work_queue(&mce_irq_work); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); sync_core(); -- cgit v1.2.3 From ae37a8cd9b0ad3416d71e54cfaeb3744178189a8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 Mar 2019 15:54:51 +0100 Subject: x86/cpufeature: Remove __pure attribute to _static_cpu_has() __pure is used to make gcc do Common Subexpression Elimination (CSE) and thus save subsequent invocations of a function which does a complex computation (without side effects). As a simple example: bool a = _static_cpu_has(x); bool b = _static_cpu_has(x); gets turned into bool a = _static_cpu_has(x); bool b = a; However, gcc doesn't do CSE with asm()s when those get inlined - like it is done with _static_cpu_has() - because, for example, the t_yes/t_no labels are different for each inlined function body and thus cannot be detected as equivalent anymore for the CSE heuristic to hit. However, this all is beside the point because it is best to avoid having more than one call to _static_cpu_has(X) in the same function due to the fact that each such call is an alternatives patch site and it is simply pointless. Therefore, drop the __pure attribute as it is not doing anything. Reported-by: Nadav Amit Signed-off-by: Borislav Petkov Cc: Peter Zijlstra Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190307151036.GD26566@zn.tnic --- arch/x86/include/asm/cpufeature.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index ce95b8cbd229..2fb791a1b479 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -159,7 +159,7 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); * These will statically patch the target code for additional * performance.
*/ -static __always_inline __pure bool _static_cpu_has(u16 bit) +static __always_inline bool _static_cpu_has(u16 bit) { asm_volatile_goto("1: jmp 6f\n" "2:\n" -- cgit v1.2.3 From d71eb0ce109a124b0fa714832823b9452f2762cf Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 2 Apr 2019 09:59:33 -0500 Subject: x86/speculation/mds: Add mds=full,nosmt cmdline option Add the mds=full,nosmt cmdline option. This is like mds=full, but with SMT disabled if the CPU is vulnerable. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Tyler Hicks Acked-by: Jiri Kosina --- Documentation/admin-guide/hw-vuln/mds.rst | 3 +++ Documentation/admin-guide/kernel-parameters.txt | 6 ++++-- arch/x86/kernel/cpu/bugs.c | 10 ++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/hw-vuln/mds.rst b/Documentation/admin-guide/hw-vuln/mds.rst index 1de29d28903d..244ab47d1fb3 100644 --- a/Documentation/admin-guide/hw-vuln/mds.rst +++ b/Documentation/admin-guide/hw-vuln/mds.rst @@ -260,6 +260,9 @@ time with the option "mds=". The valid arguments for this option are: It does not automatically disable SMT. + full,nosmt The same as mds=full, with SMT disabled on vulnerable + CPUs. This is the complete mitigation. + off Disables MDS mitigations completely. ============ ============================================================= diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 7325319c2c23..8f04985d3122 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2372,8 +2372,10 @@ This parameter controls the MDS mitigation. The options are: - full - Enable MDS mitigation on vulnerable CPUs - off - Unconditionally disable MDS mitigation + full - Enable MDS mitigation on vulnerable CPUs + full,nosmt - Enable MDS mitigation and disable + SMT on vulnerable CPUs + off - Unconditionally disable MDS mitigation Not specifying this option is equivalent to mds=full. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 373ae1dcd301..9f252082a83b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -221,6 +221,7 @@ static void x86_amd_ssb_disable(void) /* Default mitigation for L1TF-affected CPUs */ static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; +static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { [MDS_MITIGATION_OFF] = "Vulnerable", @@ -238,8 +239,13 @@ static void __init mds_select_mitigation(void) if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; + static_branch_enable(&mds_user_clear); + + if (mds_nosmt && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + cpu_smt_disable(false); } + pr_info("%s\n", mds_strings[mds_mitigation]); } @@ -255,6 +261,10 @@ static int __init mds_cmdline(char *str) mds_mitigation = MDS_MITIGATION_OFF; else if (!strcmp(str, "full")) mds_mitigation = MDS_MITIGATION_FULL; + else if (!strcmp(str, "full,nosmt")) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_nosmt = true; + } return 0; } -- cgit v1.2.3 From 7c3658b20194a5b3209a143f63bc9c643c6a3ae2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 2 Apr 2019 10:00:14 -0500 Subject: x86/speculation: Move arch_smt_update() call to after mitigation decisions arch_smt_update() now has a dependency on both Spectre v2 and MDS mitigations. 
Move its initial call to after all the mitigation decisions have been made. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Tyler Hicks Acked-by: Jiri Kosina --- arch/x86/kernel/cpu/bugs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9f252082a83b..3f934ffef8cf 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -111,6 +111,8 @@ void __init check_bugs(void) mds_select_mitigation(); + arch_smt_update(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -638,9 +640,6 @@ specv2_set_mode: /* Set up IBPB and STIBP depending on the general spectre V2 command */ spectre_v2_user_select_mitigation(cmd); - - /* Enable STIBP if appropriate */ - arch_smt_update(); } static void update_stibp_msr(void * __unused) -- cgit v1.2.3 From 39226ef02bfb43248b7db12a4fdccb39d95318e3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 2 Apr 2019 10:00:51 -0500 Subject: x86/speculation/mds: Add SMT warning message MDS is vulnerable with SMT. Make that clear with a one-time printk whenever SMT first gets enabled. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Tyler Hicks Acked-by: Jiri Kosina --- arch/x86/kernel/cpu/bugs.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3f934ffef8cf..22a14d4b68a2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -673,6 +673,9 @@ static void update_indir_branch_cond(void) static_branch_disable(&switch_to_cond_stibp); } +#undef pr_fmt +#define pr_fmt(fmt) fmt + /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { @@ -693,6 +696,8 @@ static void update_mds_branch_idle(void) static_branch_disable(&mds_idle_clear); } +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" + void arch_smt_update(void) { /* Enhanced IBRS implies STIBP. No update required. */ @@ -717,6 +722,8 @@ void arch_smt_update(void) switch (mds_mitigation) { case MDS_MITIGATION_FULL: case MDS_MITIGATION_VMWERV: + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + pr_warn_once(MDS_MSG_SMT); update_mds_branch_idle(); break; case MDS_MITIGATION_OFF: @@ -1149,6 +1156,7 @@ static int __init l1tf_cmdline(char *str) early_param("l1tf", l1tf_cmdline); #undef pr_fmt +#define pr_fmt(fmt) fmt #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 1a81542abfdae21716a1a32ea8472de7a271dde6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Mar 2019 09:47:18 +0100 Subject: perf/x86/intel: Simplify intel_tfa_commit_scheduling() validate_group() calls x86_schedule_events(.assign=NULL) and therefore will not call intel_tfa_commit_scheduling(). So there is no point in checking cpuc->is_fake, we'll never get there. 
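The relevant control flow in x86_schedule_events() makes this visible; a condensed sketch (see arch/x86/events/core.c):

	/* assign is NULL when called from validate_group() */
	if (!unsched && assign) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.commit_scheduling)
				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
		}
	}

A fake cpuc never reaches the commit callback, so the cpuc->is_fake test was dead code.
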
Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8baa441d8000..dee281d58476 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2015,7 +2015,7 @@ static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int /* * We're going to use PMC3, make sure TFA is set before we touch it. */ - if (cntr == 3 && !cpuc->is_fake) + if (cntr == 3) intel_set_tfa(cpuc, true); } -- cgit v1.2.3 From 21d65555cd878c8d6ff2b2902e6c16b2ca73f33d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Mar 2019 09:57:57 +0100 Subject: perf/x86: Simplify x86_pmu.get_constraints() interface There is a special case for validate_events() where we'll call x86_pmu.get_constraints(.idx=-1). Its purpose, up until recently, seems to be to avoid taking a previous constraint from cpuc->event_constraint[] in intel_get_event_constraints(). (I could not find any other get_event_constraints() implementation using @idx) However, since that cpuc is freshly allocated, that array will in fact be initialized with NULL pointers, achieving the very same effect. Therefore remove this exception. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- arch/x86/events/intel/core.c | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index e2b1447192a8..508bf5355a5a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2031,7 +2031,7 @@ static int validate_event(struct perf_event *event) if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); - c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); + c = x86_pmu.get_event_constraints(fake_cpuc, 0, event); if (!c || !c->weight) ret = -EINVAL; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index dee281d58476..879ab540fd05 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2931,11 +2931,9 @@ static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { - struct event_constraint *c1 = NULL; - struct event_constraint *c2; + struct event_constraint *c1, *c2; - if (idx >= 0) /* fake does < 0 */ - c1 = cpuc->event_constraint[idx]; + c1 = cpuc->event_constraint[idx]; /* * first time only @@ -3410,7 +3408,7 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx, /* * Without TFA we must not use PMC3. */ - if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) { + if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) { c = dyn_constraint(cpuc, c, idx); c->idxmsk64 &= ~(1ULL << 3); c->weight--; -- cgit v1.2.3 From 1f6a1e2d7d7135280125418b25d35aebd6fa0952 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Mar 2019 12:58:52 +0100 Subject: perf/x86: Remove PERF_X86_EVENT_COMMITTED The flag PERF_X86_EVENT_COMMITTED is used to find uncommitted events for which to call put_event_constraint() when scheduling fails.
These are the newly added events to the list, and must, by definition, form the tail of cpuc->event_list[]. By computing the list index of the last successful schedule, iteration can start there and the flag becomes redundant. There are only 3 callers of x86_schedule_events(), notably: - x86_pmu_add() - x86_pmu_commit_txn() - validate_group() For x86_pmu_add(), cpuc->n_events isn't updated until after schedule_events() succeeds, therefore cpuc->n_events points to the desired index. For x86_pmu_commit_txn(), cpuc->n_events is updated, but we can trivially compute the desired value with cpuc->n_txn -- the number of events added in this transaction. For validate_group(), we can make the rule for x86_pmu_add() work by simply setting cpuc->n_events to 0 before calling schedule_events(). Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 28 +++++++++++++--------------- arch/x86/events/perf_event.h | 19 +++++++++---------- 2 files changed, 22 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 508bf5355a5a..9cab5adfa87a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -925,19 +925,23 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (!unsched && assign) { for (i = 0; i < n; i++) { e = cpuc->event_list[i]; - e->hw.flags |= PERF_X86_EVENT_COMMITTED; if (x86_pmu.commit_scheduling) x86_pmu.commit_scheduling(cpuc, i, assign[i]); } } else { - for (i = 0; i < n; i++) { + /* + * Compute the number of events already present; see + * x86_pmu_add(), validate_group() and x86_pmu_commit_txn(). + * For the former two cpuc->n_events hasn't been updated yet, + * while for the latter cpuc->n_txn contains the number of + * events added in the current transaction. + */ + i = cpuc->n_events; + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) + i -= cpuc->n_txn; + + for (; i < n; i++) { e = cpuc->event_list[i]; - /* - * do not put_constraint() on comitted events, - * because they are good to go - */ - if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) - continue; /* * release events that failed scheduling @@ -1371,11 +1375,6 @@ static void x86_pmu_del(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; - /* - * event is descheduled - */ - event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; - /* * If we're called during a txn, we only need to undo x86_pmu.add.
* The events never got scheduled and ->cancel_txn will truncate @@ -2079,8 +2078,7 @@ static int validate_group(struct perf_event *event) if (n < 0) goto out; - fake_cpuc->n_events = n; - + fake_cpuc->n_events = 0; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); out: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index a75955741c50..b4d41829da4f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -55,22 +55,21 @@ struct event_constraint { int overlap; int flags; }; + /* * struct hw_perf_event.flags flags */ #define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ #define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ #define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ -#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0800 /* use large PEBS */ - +#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */ +#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ struct amd_nb { int nb_id; /* NorthBridge id */ -- cgit v1.2.3 From c090cb70c6152fc97ce91242789d7c322109f1a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Mar 2019 13:01:14 +0100 Subject: perf/x86/intel: Optimize intel_get_excl_constraints() Avoid the POPCNT by noting we can decrement the weight for each cleared bit. 
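Schematically, the change trades a recount for bookkeeping; a condensed sketch (counter_unusable() stands in for the INTEL_EXCL_* state tests in the real code):

	/* before: clear bits, then recount the mask with a popcount */
	c->weight = hweight64(c->idxmsk64);

	/* after: carry the old weight and decrement per cleared bit */
	w = c->weight;
	for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
		if (counter_unusable(i)) {
			__clear_bit(i, c->idxmsk);
			w--;
		}
	}
	c->weight = w;
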
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 879ab540fd05..a80015fa8aed 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2838,7 +2838,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; struct intel_excl_states *xlo; int tid = cpuc->excl_thread_id; - int is_excl, i; + int is_excl, i, w; /* * validating a group does not require @@ -2894,36 +2894,40 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * SHARED : sibling counter measuring non-exclusive event * UNUSED : sibling counter unused */ + w = c->weight; for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { /* * exclusive event in sibling counter * our corresponding counter cannot be used * regardless of our event */ - if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) + if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) { __clear_bit(i, c->idxmsk); + w--; + continue; + } /* * if measuring an exclusive event, sibling * measuring non-exclusive, then counter cannot * be used */ - if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) + if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) { __clear_bit(i, c->idxmsk); + w--; + continue; + } } - /* - * recompute actual bit weight for scheduling algorithm - */ - c->weight = hweight64(c->idxmsk64); - /* * if we return an empty mask, then switch * back to static empty constraint to avoid * the cost of freeing later on */ - if (c->weight == 0) + if (!w) c = &emptyconstraint; + c->weight = w; + return c; } -- cgit v1.2.3 From 2c9651c38d1789d179c8d50724d037152ba85e56 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Mar 2019 13:03:00 +0100 Subject: perf/x86: Clear ->event_constraint[] on put The current code unconditionally clears cpuc->event_constraint[i] before calling get_event_constraints(.idx=i). The only site that cares is intel_get_event_constraints() where the c1 load will always be NULL. However, always calling get_event_constraints() on all events is wasteful; most times it will return the exact same result. Therefore retain the logic in intel_get_event_constraints() and change the generic code to only clear the constraint on put.
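In sketch form, the lifetime of a constraint slot becomes (condensed from the diff below):

	/* while scheduling: the slot is left intact across calls */
	c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
	cpuc->event_constraint[i] = c;

	/* only on put, i.e. release or failed scheduling: */
	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(cpuc, e);
	cpuc->event_constraint[i] = NULL;
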
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Stephane Eranian Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 9cab5adfa87a..279530111d8e 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -858,7 +858,6 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.start_scheduling(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - cpuc->event_constraint[i] = NULL; c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); cpuc->event_constraint[i] = c; @@ -948,6 +947,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) */ if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(cpuc, e); + + cpuc->event_constraint[i] = NULL; } } @@ -1411,6 +1412,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; } + cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; perf_event_update_userpage(event); -- cgit v1.2.3 From 109717de57b95d6dd9582c18a74f154d69fea536 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Mar 2019 13:17:51 +0100 Subject: perf/x86: Optimize x86_schedule_events() Now that cpuc->event_constraint[] is retained, we can avoid calling get_event_constraints() over and over again. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 13 +++++++++++-- arch/x86/events/intel/core.c | 3 ++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 279530111d8e..fa88acf0daad 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -858,8 +858,17 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.start_scheduling(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); - cpuc->event_constraint[i] = c; + c = cpuc->event_constraint[i]; + + /* + * Request constraints for new events; or for those events that + * have a dynamic constraint -- for those the constraint can + * change due to external factors (sibling state, allow_tfa). 
+ */ + if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { + c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); + cpuc->event_constraint[i] = c; + } wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a80015fa8aed..62394e1b2390 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2945,7 +2945,8 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, * - dynamic constraint: handled by intel_get_excl_constraints() */ c2 = __intel_get_event_constraints(cpuc, idx, event); - if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { + if (c1) { + WARN_ON_ONCE(!(c1->flags & PERF_X86_EVENT_DYNAMIC)); bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); c1->weight = c2->weight; c2 = c1; -- cgit v1.2.3 From f80deefa41890f9484802d7b67f11daf28055150 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Mar 2019 13:25:02 +0100 Subject: perf/x86: Add sanity checks to x86_schedule_events() By computing the 'committed' index earlier, we can use it to validate the cached constraint state. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index fa88acf0daad..24dab9ad4fd6 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -849,17 +849,34 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) struct event_constraint *c; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct perf_event *e; - int i, wmin, wmax, unsched = 0; + int n0, i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); + /* + * Compute the number of events already present; see x86_pmu_add(), + * validate_group() and x86_pmu_commit_txn(). For the former two + * cpuc->n_events hasn't been updated yet, while for the latter + * cpuc->n_txn contains the number of events added in the current + * transaction. + */ + n0 = cpuc->n_events; + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) + n0 -= cpuc->n_txn; + if (x86_pmu.start_scheduling) x86_pmu.start_scheduling(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = cpuc->event_constraint[i]; + /* + * Previously scheduled events should have a cached constraint, + * while new events should not have one. + */ + WARN_ON_ONCE((c && i >= n0) || (!c && i < n0)); + /* * Request constraints for new events; or for those events that * have a dynamic constraint -- for those the constraint can @@ -937,18 +954,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.commit_scheduling(cpuc, i, assign[i]); } } else { - /* - * Compute the number of events already present; see - * x86_pmu_add(), validate_group() and x86_pmu_commit_txn(). - * For the former two cpuc->n_events hasn't been updated yet, - * while for the latter cpuc->n_txn contains the number of - * events added in the current transaction. 
*/ - i = cpuc->n_events; - if (cpuc->txn_flags & PERF_PMU_TXN_ADD) - i -= cpuc->n_txn; - - for (; i < n; i++) { + for (i = n0; i < n; i++) { e = cpuc->event_list[i]; /* -- cgit v1.2.3 From 6690e86be83ac75832e461c141055b5d601c0a6d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Feb 2019 10:30:52 +0100 Subject: sched/x86: Save [ER]FLAGS on context switch Effectively reverts commit: 2c7577a75837 ("sched/x86_64: Don't save flags on context switch") Specifically because SMAP uses FLAGS.AC which invalidates the claim that the kernel has clean flags. In particular; while preemption from interrupt return is fine (the IRET frame on the exception stack contains FLAGS) it breaks any code that does synchronous scheduling, including preempt_enable(). This has become a significant issue ever since commit: 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses") provided for means of having 'normal' C code between STAC / CLAC, exposing the FLAGS.AC state. So far this hasn't led to trouble, however fix it before it comes apart. Reported-by: Julien Thierry Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@kernel.org Fixes: 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses") Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 2 ++ arch/x86/entry/entry_64.S | 2 ++ arch/x86/include/asm/switch_to.h | 1 + arch/x86/kernel/process_32.c | 7 +++++++ arch/x86/kernel/process_64.c | 8 ++++++++ 5 files changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d309f30cf7af..5fc76b755510 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -650,6 +650,7 @@ ENTRY(__switch_to_asm) pushl %ebx pushl %edi pushl %esi + pushfl /* switch stack */ movl %esp, TASK_threadsp(%eax) @@ -672,6 +673,7 @@ ENTRY(__switch_to_asm) #endif /* restore callee-saved registers */ + popfl popl %esi popl %edi popl %ebx diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1f0efdb7b629..4fe27b67d7e2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -291,6 +291,7 @@ ENTRY(__switch_to_asm) pushq %r13 pushq %r14 pushq %r15 + pushfq /* switch stack */ movq %rsp, TASK_threadsp(%rdi) @@ -313,6 +314,7 @@ ENTRY(__switch_to_asm) #endif /* restore callee-saved registers */ + popfq popq %r15 popq %r14 popq %r13 diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 7cf1a270d891..157149d4129c 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void); * order of the fields must match the code in __switch_to_asm(). */ struct inactive_task_frame { + unsigned long flags; #ifdef CONFIG_X86_64 unsigned long r15; unsigned long r14; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index e471d8e6f0b2..70933193878c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -127,6 +127,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct task_struct *tsk; int err; + /* + * For a new task use the RESET flags value since there is no before. + * All the status flags are zero; DF and all the system flags must also + * be 0, specifically IF must be 0 because we context switch to the new + * task with interrupts disabled.
+ */ + frame->flags = X86_EFLAGS_FIXED; frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6a62f4af9fcf..026a43be9bd1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -392,6 +392,14 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; + + /* + * For a new task use the RESET flags value since there is no before. + * All the status flags are zero; DF and all the system flags must also + * be 0, specifically IF must be 0 because we context switch to the new + * task with interrupts disabled. + */ + frame->flags = X86_EFLAGS_FIXED; frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; -- cgit v1.2.3 From 67a0514afdbb8b2fc70b771b8c77661a9cb9d3a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Feb 2019 12:56:35 +0100 Subject: x86/ia32: Fix ia32_restore_sigcontext() AC leak Objtool spotted that we call native_load_gs_index() with AC set. Re-arrange the code to avoid that. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 321fe5f5d0e9..4d5fcd47ab75 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -61,9 +61,8 @@ } while (0) #define RELOAD_SEG(seg) { \ - unsigned int pre = GET_SEG(seg); \ + unsigned int pre = (seg) | 3; \ unsigned int cur = get_user_seg(seg); \ - pre |= 3; \ if (pre != cur) \ set_user_seg(seg, pre); \ } @@ -72,6 +71,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_32 __user *sc) { unsigned int tmpflags, err = 0; + u16 gs, fs, es, ds; void __user *buf; u32 tmp; @@ -79,16 +79,10 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; get_user_try { - /* - * Reload fs and gs if they have changed in the signal - * handler. This does not handle long fs/gs base changes in - * the handler, but does not clobber them at least in the - * normal case. - */ - RELOAD_SEG(gs); - RELOAD_SEG(fs); - RELOAD_SEG(ds); - RELOAD_SEG(es); + gs = GET_SEG(gs); + fs = GET_SEG(fs); + ds = GET_SEG(ds); + es = GET_SEG(es); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); COPY(ax); @@ -106,6 +100,17 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, buf = compat_ptr(tmp); } get_user_catch(err); + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ + RELOAD_SEG(gs); + RELOAD_SEG(fs); + RELOAD_SEG(ds); + RELOAD_SEG(es); + err |= fpu__restore_sig(buf, 1); force_iret(); -- cgit v1.2.3 From 3693ca81151eacd498675baae56abede577e8b31 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 Mar 2019 15:24:33 +0100 Subject: x86/uaccess: Move copy_user_handle_tail() into asm By writing the function in asm we avoid cross object code flow and objtool no longer gets confused about a 'stray' CLAC. Also; the asm version is actually _simpler_. 
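The calling convention is unchanged: like the rest of the copy_user_* family, the tail handler returns the number of bytes left uncopied, so callers keep the usual idiom (sketch; buffer names are illustrative):

	/* a non-zero return means that many bytes were NOT copied */
	if (copy_from_user(kbuf, ubuf, len))
		return -EFAULT;
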
Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 24 -------------------- arch/x86/include/asm/uaccess_64.h | 3 --- arch/x86/lib/copy_user_64.S | 48 +++++++++++++++++++++++++++++++++++++++ arch/x86/lib/usercopy_64.c | 20 ---------------- 4 files changed, 48 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 6467757bb39f..3ff577c0b102 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -148,30 +148,6 @@ _ASM_PTR (entry); \ .popsection -.macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE_UA(100b, 103b) - _ASM_EXTABLE_UA(101b, 103b) - .endm - #else # define _EXPAND_EXTABLE_HANDLE(x) #x # define _ASM_EXTABLE_HANDLE(from, to, handler) \ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index a9d637bc301d..5cd1caa8bc65 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -207,9 +207,6 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) return __copy_user_flushcache(dst, src, size); } -unsigned long -copy_user_handle_tail(char *to, char *from, unsigned len); - unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index db4e5aa0858b..b2f1822084ae 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,6 +16,30 @@ #include #include +.macro ALIGN_DESTINATION + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE_UA(100b, 103b) + _ASM_EXTABLE_UA(101b, 103b) + .endm + /* * copy_user_generic_unrolled - memory copy with exception handling. * This version is for CPUs like P4 that don't have efficient micro @@ -193,6 +217,30 @@ ENTRY(copy_user_enhanced_fast_string) ENDPROC(copy_user_enhanced_fast_string) EXPORT_SYMBOL(copy_user_enhanced_fast_string) +/* + * Try to copy last bytes and clear the rest if needed. + * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ALIGN; +copy_user_handle_tail: + movl %edx,%ecx +1: rep movsb +2: mov %ecx,%eax + ASM_CLAC + ret + + _ASM_EXTABLE_UA(1b, 2b) +ENDPROC(copy_user_handle_tail) + /* * copy_user_nocache - Uncached memory copy with exception handling * This will force destination out of cache for more performance. 
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index ee42bb0cbeb3..9952a01cad24 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -54,26 +54,6 @@ unsigned long clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(clear_user); -/* - * Try to copy last bytes and clear the rest if needed. - * Since protection fault in copy_from/to_user is not a normal situation, - * it is not necessary to optimize tail handling. - */ -__visible unsigned long -copy_user_handle_tail(char *to, char *from, unsigned len) -{ - for (; len; --len, to++) { - char c; - - if (__get_user_nocheck(c, from++, sizeof(char))) - break; - if (__put_user_nocheck(c, to, sizeof(char))) - break; - } - clac(); - return len; -} - /* * Similar to copy_user_handle_tail, probe for the write fault point, * but reuse __memcpy_mcsafe in case a new read error is encountered. -- cgit v1.2.3 From b69656fa7ea2f75e47d7bd5b9430359fa46488af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Apr 2019 09:39:45 +0200 Subject: x86/uaccess: Fix up the fixup New tooling got confused about this: arch/x86/lib/memcpy_64.o: warning: objtool: .fixup+0x7: return with UACCESS enabled While the code isn't wrong, it is tedious (if at all possible) to figure out what function a particular chunk of .fixup belongs to. This then confuses the objtool uaccess validation. Instead of returning directly from the .fixup, jump back into the right function. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/lib/memcpy_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 3b24dc05251c..9d05572370ed 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -257,6 +257,7 @@ ENTRY(__memcpy_mcsafe) /* Copy successful. Return zero */ .L_done_memcpy_trap: xorl %eax, %eax +.L_done: ret ENDPROC(__memcpy_mcsafe) EXPORT_SYMBOL_GPL(__memcpy_mcsafe) @@ -273,7 +274,7 @@ EXPORT_SYMBOL_GPL(__memcpy_mcsafe) addl %edx, %ecx .E_trailing_bytes: mov %ecx, %eax - ret + jmp .L_done /* * For write fault handling, given the destination is unaligned, -- cgit v1.2.3 From ff05ab2305aaeb21a3002ae95a17e176c198b71b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Mar 2019 14:33:07 +0100 Subject: x86/nospec, objtool: Introduce ANNOTATE_IGNORE_ALTERNATIVE To facilitate other uses of ignoring alternatives, rename ANNOTATE_NOSPEC_IGNORE to ANNOTATE_IGNORE_ALTERNATIVE. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative-asm.h | 11 +++++++++++ arch/x86/include/asm/alternative.h | 10 ++++++++++ arch/x86/include/asm/nospec-branch.h | 28 +++++++++------------------- tools/objtool/check.c | 8 ++++---- 4 files changed, 34 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 31b627b43a8e..464034db299f 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -19,6 +19,17 @@ .endm #endif +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +.macro ANNOTATE_IGNORE_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.ignore_alts + .long .Lannotate_\@ - .
+ .popsection +.endm + /* * Issue one struct alt_instr descriptor entry (need to put it into * the section .altinstructions, see below). This entry contains diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4c74073a19cc..094fbc9c0b1c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -45,6 +45,16 @@ #define LOCK_PREFIX "" #endif +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +#define ANNOTATE_IGNORE_ALTERNATIVE \ + "999:\n\t" \ + ".pushsection .discard.ignore_alts\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" + struct alt_instr { s32 instr_offset; /* original instruction */ s32 repl_offset; /* offset to replacement instruction */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index dad12b767ba0..daf25b60c9e3 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -10,6 +10,15 @@ #include #include +/* + * This should be used immediately before a retpoline alternative. It tells + * objtool where the retpolines are so that it can make sense of the control + * flow by just reading the original instruction(s) and ignoring the + * alternatives. + */ +#define ANNOTATE_NOSPEC_ALTERNATIVE \ + ANNOTATE_IGNORE_ALTERNATIVE + /* * Fill the CPU return stack buffer. * @@ -56,19 +65,6 @@ #ifdef __ASSEMBLY__ -/* - * This should be used immediately before a retpoline alternative. It tells - * objtool where the retpolines are so that it can make sense of the control - * flow by just reading the original instruction(s) and ignoring the - * alternatives. - */ -.macro ANNOTATE_NOSPEC_ALTERNATIVE - .Lannotate_\@: - .pushsection .discard.nospec - .long .Lannotate_\@ - . - .popsection -.endm - /* * This should be used immediately before an indirect jump/call. It tells * objtool the subsequent indirect jump/call is vouched safe for retpoline @@ -152,12 +148,6 @@ #else /* __ASSEMBLY__ */ -#define ANNOTATE_NOSPEC_ALTERNATIVE \ - "999:\n\t" \ - ".pushsection .discard.nospec\n\t" \ - ".long 999b - .\n\t" \ - ".popsection\n\t" - #define ANNOTATE_RETPOLINE_SAFE \ "999:\n\t" \ ".pushsection .discard.retpoline_safe\n\t" \ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 5dde107083c6..110ea3d84772 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -457,13 +457,13 @@ static void add_ignores(struct objtool_file *file) * But it at least allows objtool to understand the control flow *around* the * retpoline. 
*/ -static int add_nospec_ignores(struct objtool_file *file) +static int add_ignore_alternatives(struct objtool_file *file) { struct section *sec; struct rela *rela; struct instruction *insn; - sec = find_section_by_name(file->elf, ".rela.discard.nospec"); + sec = find_section_by_name(file->elf, ".rela.discard.ignore_alts"); if (!sec) return 0; @@ -475,7 +475,7 @@ static int add_nospec_ignores(struct objtool_file *file) insn = find_insn(file, rela->sym->sec, rela->addend); if (!insn) { - WARN("bad .discard.nospec entry"); + WARN("bad .discard.ignore_alts entry"); return -1; } @@ -1239,7 +1239,7 @@ static int decode_sections(struct objtool_file *file) add_ignores(file); - ret = add_nospec_ignores(file); + ret = add_ignore_alternatives(file); if (ret) return ret; -- cgit v1.2.3 From 4fc0f0e9471ec573ab2d44e7f462d996d614536e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Mar 2019 19:09:08 +0100 Subject: x86/uaccess, xen: Suppress SMAP warnings drivers/xen/privcmd.o: warning: objtool: privcmd_ioctl()+0x1414: call to hypercall_page() with UACCESS enabled Some Xen hypercalls allow parameter buffers in user land, so they need to set AC=1. Avoid the warning for those cases. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xen/hypercall.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index de6f0d59a24f..580a909afad4 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -214,6 +214,22 @@ xen_single_call(unsigned int call, return (long)__res; } +static __always_inline void __xen_stac(void) +{ + /* + * Suppress objtool seeing the STAC/CLAC and getting confused about it + * calling random code with AC=1. + */ + asm volatile(ANNOTATE_IGNORE_ALTERNATIVE + ASM_STAC ::: "memory", "flags"); +} + +static __always_inline void __xen_clac(void) +{ + asm volatile(ANNOTATE_IGNORE_ALTERNATIVE + ASM_CLAC ::: "memory", "flags"); +} + static inline long privcmd_call(unsigned int call, unsigned long a1, unsigned long a2, @@ -222,9 +238,9 @@ privcmd_call(unsigned int call, { long res; - stac(); + __xen_stac(); res = xen_single_call(call, a1, a2, a3, a4, a5); - clac(); + __xen_clac(); return res; } @@ -421,9 +437,9 @@ HYPERVISOR_dm_op( domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs) { int ret; - stac(); + __xen_stac(); ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs); - clac(); + __xen_clac(); return ret; } -- cgit v1.2.3 From b7f89bfe52cd523a229d6dd22639225b2e3681dc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Apr 2019 09:39:47 +0200 Subject: x86/uaccess: Always inline user_access_begin() If GCC out-of-lines it, the STAC and CLAC are in different functions and objtool gets upset.
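For context, user_access_begin()/user_access_end() must bracket the unsafe accessors within one function so the STAC and the matching CLAC stay together; the usual pattern is (sketch; the struct, field and label names are illustrative):

	static int set_field(struct ustruct __user *uptr, int val)
	{
		if (!user_access_begin(uptr, sizeof(*uptr)))	/* STAC on success */
			return -EFAULT;
		unsafe_put_user(val, &uptr->field, Efault);
		user_access_end();				/* CLAC */
		return 0;
	Efault:
		user_access_end();
		return -EFAULT;
	}
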
Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1954dd5552a2..ae5783b5fab0 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -705,7 +705,7 @@ extern struct movsl_mask { * checking before using them, but you have to surround them with the * user_access_begin/end() pair. */ -static __must_check inline bool user_access_begin(const void __user *ptr, size_t len) +static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) { if (unlikely(!access_ok(ptr,len))) return 0; -- cgit v1.2.3 From 88e4718275c1bddca6f61f300688b4553dc8584b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Apr 2019 09:39:48 +0200 Subject: x86/uaccess, signal: Fix AC=1 bloat Occasionally GCC is less aggressive with inlining and the following is observed: arch/x86/kernel/signal.o: warning: objtool: restore_sigcontext()+0x3cc: call to force_valid_ss.isra.5() with UACCESS enabled arch/x86/kernel/signal.o: warning: objtool: do_signal()+0x384: call to frame_uc_flags.isra.0() with UACCESS enabled Cure this by moving this code out of the AC=1 region, since it really isn't needed for the user access. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 08dfd4c1a4f9..c8aa58a2bab9 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -132,16 +132,6 @@ static int restore_sigcontext(struct pt_regs *regs, COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#ifdef CONFIG_X86_64 - /* - * Fix up SS if needed for the benefit of old DOSEMU and - * CRIU. - */ - if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && - user_64bit_mode(regs))) - force_valid_ss(regs); -#endif - get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ @@ -150,6 +140,15 @@ static int restore_sigcontext(struct pt_regs *regs, buf = (void __user *)buf_val; } get_user_catch(err); +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); +#endif + err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); force_iret(); @@ -461,6 +460,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *fp = NULL; + unsigned long uc_flags; int err = 0; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); @@ -473,9 +473,11 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext.
*/ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -541,6 +543,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, { #ifdef CONFIG_X86_X32_ABI struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; void __user *restorer; int err = 0; void __user *fpstate = NULL; @@ -555,9 +558,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); -- cgit v1.2.3 From 5f307be18b32aeff7bbad540c0d3897ecedbeb56 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Sep 2018 13:18:15 +0200 Subject: asm-generic/tlb, arch: Provide generic tlb_flush() based on flush_tlb_range() Provide a generic tlb_flush() implementation that relies on flush_tlb_range(). This is a little awkward because flush_tlb_range() assumes a VMA for range invalidation, but we no longer have one. Audit of all flush_tlb_range() implementations shows only vma->vm_mm and vma->vm_flags are used, and of the latter only VM_EXEC (I-TLB invalidates) and VM_HUGETLB (large TLB invalidate) are used. Therefore, track VM_EXEC and VM_HUGETLB in two more bits, and create a 'fake' VMA. This allows architectures that have a reasonably efficient flush_tlb_range() to not require any additional effort. No change in behavior intended. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nick Piggin Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/arm64/include/asm/tlb.h | 1 + arch/powerpc/include/asm/tlb.h | 1 + arch/riscv/include/asm/tlb.h | 1 + arch/x86/include/asm/tlb.h | 1 + include/asm-generic/tlb.h | 95 ++++++++++++++++++++++++++++++++++++------ 5 files changed, 87 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 106fdc951b6e..37603b5616a5 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -27,6 +27,7 @@ static inline void __tlb_remove_table(void *_table) free_page_and_swap_cache((struct page *)_table); } +#define tlb_flush tlb_flush static void tlb_flush(struct mmu_gather *tlb); #include diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index b018e9f9b491..34fba1ce27f7 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -28,6 +28,7 @@ #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry +#define tlb_flush tlb_flush extern void tlb_flush(struct mmu_gather *tlb); /* Get the generic bits... 
*/ diff --git a/arch/riscv/include/asm/tlb.h b/arch/riscv/include/asm/tlb.h index 439dc7072e05..1ad8d093c58b 100644 --- a/arch/riscv/include/asm/tlb.h +++ b/arch/riscv/include/asm/tlb.h @@ -18,6 +18,7 @@ struct mmu_gather; static void tlb_flush(struct mmu_gather *tlb); +#define tlb_flush tlb_flush #include static inline void tlb_flush(struct mmu_gather *tlb) diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 404b8b1d44f5..f23e7aaff4cd 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -6,6 +6,7 @@ #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) +#define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); #include diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index f0aa53db5e60..e6a4c407be6c 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -95,7 +95,7 @@ * flush the entire TLB irrespective of the range. For instance * x86-PAE needs this when changing top-level entries. * - * And requires the architecture to provide and implement tlb_flush(). + * And allows the architecture to provide and implement tlb_flush(): * * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make * use of: @@ -111,7 +111,10 @@ * * - tlb_get_unmap_shift() / tlb_get_unmap_size() * - * returns the smallest TLB entry size unmapped in this range + * returns the smallest TLB entry size unmapped in this range. + * + * If an architecture does not provide tlb_flush() a default implementation + * based on flush_tlb_range() will be used. * * Additionally there are a few opt-in features: * @@ -245,6 +248,12 @@ struct mmu_gather { unsigned int cleared_puds : 1; unsigned int cleared_p4ds : 1; + /* + * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma + */ + unsigned int vma_exec : 1; + unsigned int vma_huge : 1; + unsigned int batch_count; struct mmu_gather_batch *active; @@ -286,8 +295,59 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; + /* + * Do not reset mmu_gather::vma_* fields here, we do not + * call into tlb_start_vma() again to set them if there is an + * intermediate flush. + */ +} + +#ifndef tlb_flush + +#if defined(tlb_start_vma) || defined(tlb_end_vma) +#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma() +#endif + +static inline void tlb_flush(struct mmu_gather *tlb) +{ + if (tlb->fullmm || tlb->need_flush_all) { + flush_tlb_mm(tlb->mm); + } else if (tlb->end) { + struct vm_area_struct vma = { + .vm_mm = tlb->mm, + .vm_flags = (tlb->vma_exec ? VM_EXEC : 0) | + (tlb->vma_huge ? VM_HUGETLB : 0), + }; + + flush_tlb_range(&vma, tlb->start, tlb->end); + } +} + +static inline void +tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + /* + * flush_tlb_range() implementations that look at VM_HUGETLB (tile, + * mips-4k) flush only large pages. + * + * flush_tlb_range() implementations that flush I-TLB also flush D-TLB + * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing + * range. + * + * We rely on tlb_end_vma() to issue a flush, such that when we reset + * these values the batch is empty. 
+ */ + tlb->vma_huge = !!(vma->vm_flags & VM_HUGETLB); + tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); } +#else + +static inline void +tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { } + +#endif + static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { if (!tlb->end) @@ -357,19 +417,30 @@ static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) * the vmas are adjusted to only cover the region to be torn down. */ #ifndef tlb_start_vma -#define tlb_start_vma(tlb, vma) \ -do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) +static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + if (tlb->fullmm) + return; + + tlb_update_vma_flags(tlb, vma); + flush_cache_range(vma, vma->vm_start, vma->vm_end); +} #endif #ifndef tlb_end_vma -#define tlb_end_vma(tlb, vma) \ -do { \ - if (!tlb->fullmm) \ - tlb_flush_mmu_tlbonly(tlb); \ -} while (0) +static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + if (tlb->fullmm) + return; + + /* + * Do a TLB flush and reset the range at VMA boundaries; this avoids + * the ranges growing with the unused space between consecutive VMAs, + * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on + * this. + */ + tlb_flush_mmu_tlbonly(tlb); +} #endif #ifndef __tlb_remove_tlb_entry -- cgit v1.2.3 From 96bc9567cbe112e9320250f01b9c060c882e8619 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Sep 2018 13:24:41 +0200 Subject: asm-generic/tlb, arch: Invert CONFIG_HAVE_RCU_TABLE_INVALIDATE Make issuing a TLB invalidate for page-table pages the normal case. The reason is twofold: - too many invalidates is safer than too few, - most architectures use the linux page-tables natively and would thus require this. Make it an opt-out, instead of an opt-in. No change in behavior intended. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/Kconfig | 2 +- arch/arm64/Kconfig | 1 - arch/powerpc/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/x86/Kconfig | 1 - include/asm-generic/tlb.h | 9 +++++---- mm/mmu_gather.c | 2 +- 7 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index cdc7f3d5d278..04b3e8b94cfe 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -383,7 +383,7 @@ config HAVE_ARCH_JUMP_LABEL_RELATIVE config HAVE_RCU_TABLE_FREE bool -config HAVE_RCU_TABLE_INVALIDATE +config HAVE_RCU_TABLE_NO_INVALIDATE bool config HAVE_MMU_GATHER_PAGE_SIZE diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7e34b9eba5de..78d9fafac983 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -149,7 +149,6 @@ config ARM64 select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RCU_TABLE_FREE - select HAVE_RCU_TABLE_INVALIDATE select HAVE_RSEQ select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a7aa4feabc09..8e1e2abf17eb 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -218,6 +218,7 @@ config PPC select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP + select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 40f8f4f73fe8..db79290ed6d5 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -63,6 +63,7 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP + select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_MEMBLOCK_NODE_MAP select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5ad92419be19..b0f30d86c23f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -183,7 +183,6 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if PARAVIRT - select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_FUNCTION_ARG_ACCESS_API diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 1c861989b704..81799e6a4304 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -135,11 +135,12 @@ * When used, an architecture is expected to provide __tlb_remove_table() * which does the actual freeing of these pages. * - * HAVE_RCU_TABLE_INVALIDATE + * HAVE_RCU_TABLE_NO_INVALIDATE * - * This makes HAVE_RCU_TABLE_FREE call tlb_flush_mmu_tlbonly() before freeing - * the page-table pages. Required if you use HAVE_RCU_TABLE_FREE and your - * architecture uses the Linux page-tables natively. + * This makes HAVE_RCU_TABLE_FREE avoid calling tlb_flush_mmu_tlbonly() before + * freeing the page-table pages. This can be avoided if you use + * HAVE_RCU_TABLE_FREE and your architecture does _NOT_ use the Linux + * page-tables natively. 
* * MMU_GATHER_NO_RANGE * diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 14dfc97155e4..2a5322d52b0a 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -157,7 +157,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { -#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE +#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE /* * Invalidate page-table caches used by hardware walkers. Then we still * need to RCU-sched wait while freeing the pages because software -- cgit v1.2.3 From e74deb11931ff682b59d5b9d387f7115f689698e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Apr 2019 09:39:48 +0200 Subject: x86/uaccess: Introduce user_access_{save,restore}() Introduce common helpers for when we need to safely suspend a uaccess section; for instance to generate a {KA,UB}SAN report. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smap.h | 20 ++++++++++++++++++++ arch/x86/include/asm/uaccess.h | 3 +++ include/linux/uaccess.h | 2 ++ 3 files changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index db333300bd4b..6cfe43171020 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -58,6 +58,23 @@ static __always_inline void stac(void) alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); } +static __always_inline unsigned long smap_save(void) +{ + unsigned long flags; + + asm volatile (ALTERNATIVE("", "pushf; pop %0; " __stringify(__ASM_CLAC), + X86_FEATURE_SMAP) + : "=rm" (flags) : : "memory", "cc"); + + return flags; +} + +static __always_inline void smap_restore(unsigned long flags) +{ + asm volatile (ALTERNATIVE("", "push %0; popf", X86_FEATURE_SMAP) + : : "g" (flags) : "memory", "cc"); +} + /* These macros can be used in asm() statements */ #define ASM_CLAC \ ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) @@ -69,6 +86,9 @@ static __always_inline void stac(void) static inline void clac(void) { } static inline void stac(void) { } +static inline unsigned long smap_save(void) { return 0; } +static inline void smap_restore(unsigned long flags) { } + #define ASM_CLAC #define ASM_STAC diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ae5783b5fab0..5ca7b91faf67 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -715,6 +715,9 @@ static __must_check __always_inline bool user_access_begin(const void __user *pt #define user_access_begin(a,b) user_access_begin(a,b) #define user_access_end() __uaccess_end() +#define user_access_save() smap_save() +#define user_access_restore(x) smap_restore(x) + #define unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 37b226e8df13..2b70130af585 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -268,6 +268,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #define user_access_end() do { } while (0) #define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) #define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) +static inline unsigned long user_access_save(void) { return 0UL; } +static inline void 
user_access_restore(unsigned long flags) { } #endif #ifdef CONFIG_HARDENED_USERCOPY -- cgit v1.2.3 From a936af8ea3580855adcc80daa8e01c0196afeb15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Mar 2019 14:44:44 +0100 Subject: x86/smap: Ditch __stringify() Linus noticed all users of __ASM_STAC/__ASM_CLAC are with __stringify(). Just make them a string. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smap.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 6cfe43171020..f94a7d0ddd49 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -13,13 +13,12 @@ #ifndef _ASM_X86_SMAP_H #define _ASM_X86_SMAP_H -#include #include #include /* "Raw" instruction opcodes */ -#define __ASM_CLAC .byte 0x0f,0x01,0xca -#define __ASM_STAC .byte 0x0f,0x01,0xcb +#define __ASM_CLAC ".byte 0x0f,0x01,0xca" +#define __ASM_STAC ".byte 0x0f,0x01,0xcb" #ifdef __ASSEMBLY__ @@ -28,10 +27,10 @@ #ifdef CONFIG_X86_SMAP #define ASM_CLAC \ - ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP + ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP #define ASM_STAC \ - ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP + ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP #else /* CONFIG_X86_SMAP */ @@ -49,20 +48,20 @@ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); + alternative("", __ASM_CLAC, X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); + alternative("", __ASM_STAC, X86_FEATURE_SMAP); } static __always_inline unsigned long smap_save(void) { unsigned long flags; - asm volatile (ALTERNATIVE("", "pushf; pop %0; " __stringify(__ASM_CLAC), + asm volatile (ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC, X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); @@ -77,9 +76,9 @@ static __always_inline void smap_restore(unsigned long flags) /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __ASM_CLAC, X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP) #else /* CONFIG_X86_SMAP */ -- cgit v1.2.3 From 64604d54d3115fee89598bfb6d8d2252f8a2d114 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Mar 2019 11:35:46 +0100 Subject: sched/x86_64: Don't save flags on context switch Now that we have objtool validating AC=1 state for all x86_64 code, we can once again guarantee clean flags on schedule. 
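To make "objtool validating AC=1 state" concrete: AC=1 regions are the spans between user_access_begin() and user_access_end(), and objtool verifies that no such span contains a call that could reach schedule(), which is what makes it safe to stop saving EFLAGS across __switch_to_asm(). A minimal sketch of the enforced pattern (illustrative only; struct thing and copy_thing_from_user() are made-up names, not from this series):

#include <linux/uaccess.h>
#include <linux/errno.h>

struct thing { int a; long b; };	/* made-up example type */

static int copy_thing_from_user(struct thing *dst, const struct thing __user *src)
{
	if (!user_access_begin(src, sizeof(*src)))
		return -EFAULT;
	/* AC=1 from here on: plain accesses only, no calls that might sleep */
	unsafe_get_user(dst->a, &src->a, Efault);
	unsafe_get_user(dst->b, &src->b, Efault);
	user_access_end();		/* AC=0 again before anything can schedule */
	return 0;
Efault:
	user_access_end();
	return -EFAULT;
}

A stray call inside the AC=1 region would produce a "call to ... with UACCESS enabled" objtool warning like the ones quoted in the AC=1 bloat fix above.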
Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 -- arch/x86/include/asm/switch_to.h | 2 +- arch/x86/kernel/process_64.c | 7 ------- 3 files changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4fe27b67d7e2..1f0efdb7b629 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -291,7 +291,6 @@ ENTRY(__switch_to_asm) pushq %r13 pushq %r14 pushq %r15 - pushfq /* switch stack */ movq %rsp, TASK_threadsp(%rdi) @@ -314,7 +313,6 @@ ENTRY(__switch_to_asm) #endif /* restore callee-saved registers */ - popfq popq %r15 popq %r14 popq %r13 diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 157149d4129c..18a4b6890fa8 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -40,13 +40,13 @@ asmlinkage void ret_from_fork(void); * order of the fields must match the code in __switch_to_asm(). */ struct inactive_task_frame { - unsigned long flags; #ifdef CONFIG_X86_64 unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; #else + unsigned long flags; unsigned long si; unsigned long di; #endif diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 026a43be9bd1..844a28b29967 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -393,13 +393,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; - /* - * For a new task use the RESET flags value since there is no before. - * All the status flags are zero; DF and all the system flags must also - * be 0, specifically IF must be 0 because we context switch to the new - * task with interrupts disabled. - */ - frame->flags = X86_EFLAGS_FIXED; frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; -- cgit v1.2.3 From 89833fab15d6017ba006a45b5af68caa067171a7 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 29 Mar 2019 22:46:51 +0100 Subject: x86/fpu: Fix __user annotations In save_xstate_epilog(), use __user when type-casting userspace pointers. In setup_sigcontext() and x32_setup_rt_frame(), cast the userspace pointers to 'unsigned long __user *' before writing into them. These pointers are originally '__u32 __user *' or '__u64 __user *', causing sparse to complain when a userspace pointer is written into them. The casts are okay because the pointers always point to pointer-sized values. Thanks to Luc Van Oostenryck and Al Viro for explaining this to me. Signed-off-by: Jann Horn Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Mukesh Ojha Cc: Qiaowei Ren Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Will Deacon Cc: x86-ml Link: https://lkml.kernel.org/r/20190329214652.258477-3-jannh@google.com --- arch/x86/kernel/fpu/signal.c | 6 +++--- arch/x86/kernel/signal.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index f6a1d299627c..55b80de13ea5 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -92,13 +92,13 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) return err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + fpu_user_xstate_size)); /* * Read the xfeatures which we copied (directly from the cpu or * from the state in task struct) to the user buffers. */ - err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); + err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); /* * For legacy compatible, we always set FP/SSE bits in the bit @@ -113,7 +113,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) */ xfeatures |= XFEATURE_MASK_FPSSE; - err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); + err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); return err; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 08dfd4c1a4f9..b419e1a1a0ce 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -206,7 +206,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ - put_user_ex(fpstate, &sc->fpstate); + put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate); /* non-iBCS2 extensions.. */ put_user_ex(mask, &sc->oldmask); @@ -569,7 +569,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, restorer = NULL; err |= -EFAULT; } - put_user_ex(restorer, &frame->pretcode); + put_user_ex(restorer, (unsigned long __user *)&frame->pretcode); } put_user_catch(err); err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, -- cgit v1.2.3 From 46ad0840b1584b92b5ff2cc3ed0b011dd6b8e0f1 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 22 Mar 2019 10:30:06 -0400 Subject: locking/rwsem: Remove arch specific rwsem files As the generic rwsem-xadd code is using the appropriate acquire and release versions of the atomic operations, the arch specific rwsem.h files will not be that much faster than the generic code as long as the atomic functions are properly implemented. So we can remove those arch specific rwsem.h and stop building asm/rwsem.h to reduce maintenance effort. Currently, only x86, alpha and ia64 have implemented architecture specific fast paths. I don't have access to alpha and ia64 systems for testing, but they are legacy systems that are not likely to be updated to the latest kernel anyway. 
By using a rwsem microbenchmark, the total locking rates on a 4-socket 56-core 112-thread x86-64 system before and after the patch were as follows (mixed means equal # of read and write locks):

                     Before Patch                After Patch
  # of Threads   wlock   rlock   mixed      wlock   rlock   mixed
  ------------   -----   -----   -----      -----   -----   -----
       1        29,201  30,143  29,458     28,615  30,172  29,201
       2         6,807  13,299   1,171      7,725  15,025   1,804
       4         6,504  12,755   1,520      7,127  14,286   1,345
       8         6,762  13,412     764      6,826  13,652     726
      16         6,693  15,408     662      6,599  15,938     626
      32         6,145  15,286     496      5,549  15,487     511
      64         5,812  15,495      60      5,858  15,572      60

There were some run-to-run variations for the multi-thread tests. For x86-64, using the generic C code fast path seems to be a little bit faster than the assembly version with low lock contention. Looking at the assembly version of the fast paths, there are assembly to/from C code wrappers that save and restore all the callee-clobbered registers (7 registers on x86-64). The assembly generated from the generic C code doesn't need to do that. That may explain the slight performance gain here.

The generic asm rwsem.h can also be merged into kernel/locking/rwsem.h with no code change as no other code other than those under kernel/locking needs to access the internal rwsem macros and functions.

Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linuxppc-dev@lists.ozlabs.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190322143008.21313-2-longman@redhat.com Signed-off-by: Ingo Molnar --- MAINTAINERS | 1 - arch/alpha/include/asm/rwsem.h | 211 ----------------------------------- arch/arm/include/asm/Kbuild | 1 - arch/arm64/include/asm/Kbuild | 1 - arch/hexagon/include/asm/Kbuild | 1 - arch/ia64/include/asm/rwsem.h | 172 ----------------------------- arch/powerpc/include/asm/Kbuild | 1 - arch/s390/include/asm/Kbuild | 1 - arch/sh/include/asm/Kbuild | 1 - arch/sparc/include/asm/Kbuild | 1 - arch/x86/include/asm/rwsem.h | 237 ---------------------------------------- arch/x86/lib/Makefile | 1 - arch/x86/lib/rwsem.S | 156 -------------------------- arch/x86/um/Makefile | 4 +- arch/xtensa/include/asm/Kbuild | 1 - include/asm-generic/rwsem.h | 140 ------------------------ include/linux/rwsem.h | 4 +- kernel/locking/percpu-rwsem.c | 2 + kernel/locking/rwsem.h | 130 ++++++++++++++++++++++ 19 files changed, 134 insertions(+), 932 deletions(-) delete mode 100644 arch/alpha/include/asm/rwsem.h delete mode 100644 arch/ia64/include/asm/rwsem.h delete mode 100644 arch/x86/include/asm/rwsem.h delete mode 100644 arch/x86/lib/rwsem.S delete mode 100644 include/asm-generic/rwsem.h (limited to 'arch/x86') diff --git a/MAINTAINERS b/MAINTAINERS index 43b36dbed48e..8876f5de7738 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9098,7 +9098,6 @@ F: arch/*/include/asm/spinlock*.h F: include/linux/rwlock*.h F: include/linux/mutex*.h F: include/linux/rwsem*.h -F: arch/*/include/asm/rwsem.h F: include/linux/seqlock.h F: lib/locking*.[ch] F: kernel/locking/ diff --git a/arch/alpha/include/asm/rwsem.h
b/arch/alpha/include/asm/rwsem.h deleted file mode 100644 index cf8fc8f9a2ed..000000000000 --- a/arch/alpha/include/asm/rwsem.h +++ /dev/null @@ -1,211 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ALPHA_RWSEM_H -#define _ALPHA_RWSEM_H - -/* - * Written by Ivan Kokshaysky , 2001. - * Based on asm-alpha/semaphore.h and asm-i386/rwsem.h - */ - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#include - -#define RWSEM_UNLOCKED_VALUE 0x0000000000000000L -#define RWSEM_ACTIVE_BIAS 0x0000000000000001L -#define RWSEM_ACTIVE_MASK 0x00000000ffffffffL -#define RWSEM_WAITING_BIAS (-0x0000000100000000L) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -static inline int ___down_read(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter += RWSEM_ACTIVE_READ_BIAS; -#else - long temp; - __asm__ __volatile__( - "1: ldq_l %0,%1\n" - " addq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " mb\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory"); -#endif - return (oldcount < 0); -} - -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(___down_read(sem))) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_killable(struct rw_semaphore *sem) -{ - if (unlikely(___down_read(sem))) - if (IS_ERR(rwsem_down_read_failed_killable(sem))) - return -EINTR; - - return 0; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - long old, new, res; - - res = atomic_long_read(&sem->count); - do { - new = res + RWSEM_ACTIVE_READ_BIAS; - if (new <= 0) - break; - old = res; - res = atomic_long_cmpxchg(&sem->count, old, new); - } while (res != old); - return res >= 0 ? 
1 : 0; -} - -static inline long ___down_write(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter += RWSEM_ACTIVE_WRITE_BIAS; -#else - long temp; - __asm__ __volatile__( - "1: ldq_l %0,%1\n" - " addq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " mb\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); -#endif - return oldcount; -} - -static inline void __down_write(struct rw_semaphore *sem) -{ - if (unlikely(___down_write(sem))) - rwsem_down_write_failed(sem); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - if (unlikely(___down_write(sem))) { - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - } - - return 0; -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long ret = atomic_long_cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - if (ret == RWSEM_UNLOCKED_VALUE) - return 1; - return 0; -} - -static inline void __up_read(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter -= RWSEM_ACTIVE_READ_BIAS; -#else - long temp; - __asm__ __volatile__( - " mb\n" - "1: ldq_l %0,%1\n" - " subq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory"); -#endif - if (unlikely(oldcount < 0)) - if ((int)oldcount - RWSEM_ACTIVE_READ_BIAS == 0) - rwsem_wake(sem); -} - -static inline void __up_write(struct rw_semaphore *sem) -{ - long count; -#ifndef CONFIG_SMP - sem->count.counter -= RWSEM_ACTIVE_WRITE_BIAS; - count = sem->count.counter; -#else - long temp; - __asm__ __volatile__( - " mb\n" - "1: ldq_l %0,%1\n" - " subq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " subq %0,%3,%0\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (count), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); -#endif - if (unlikely(count)) - if ((int)count == 0) - rwsem_wake(sem); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter -= RWSEM_WAITING_BIAS; -#else - long temp; - __asm__ __volatile__( - "1: ldq_l %0,%1\n" - " addq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " mb\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (-RWSEM_WAITING_BIAS), "m" (sem->count) : "memory"); -#endif - if (unlikely(oldcount < 0)) - rwsem_downgrade_wake(sem); -} - -#endif /* __KERNEL__ */ -#endif /* _ALPHA_RWSEM_H */ diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index a8a4eb7f6dae..8fb51b7bf1d5 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -12,7 +12,6 @@ generic-y += mm-arch-hooks.h generic-y += msi.h generic-y += parport.h generic-y += preempt.h -generic-y += rwsem.h generic-y += seccomp.h generic-y += segment.h generic-y += serial.h diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 1e17ea5c372b..60a933b07001 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -16,7 +16,6 
@@ generic-y += mm-arch-hooks.h generic-y += msi.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += rwsem.h generic-y += segment.h generic-y += serial.h generic-y += set_memory.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index d046e8ccdf78..3ff5f297acda 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -27,7 +27,6 @@ generic-y += mm-arch-hooks.h generic-y += pci.h generic-y += percpu.h generic-y += preempt.h -generic-y += rwsem.h generic-y += sections.h generic-y += segment.h generic-y += serial.h diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h deleted file mode 100644 index 917910607e0e..000000000000 --- a/arch/ia64/include/asm/rwsem.h +++ /dev/null @@ -1,172 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * R/W semaphores for ia64 - * - * Copyright (C) 2003 Ken Chen - * Copyright (C) 2003 Asit Mallick - * Copyright (C) 2005 Christoph Lameter - * - * Based on asm-i386/rwsem.h and other architecture implementation. - * - * The MSW of the count is the negated number of active writers and - * waiting lockers, and the LSW is the total number of active locks. - * - * The lock count is initialized to 0 (no active and no waiting lockers). - * - * When a writer subtracts WRITE_BIAS, it'll get 0xffffffff00000001 for - * the case of an uncontended lock. Readers increment by 1 and see a positive - * value when uncontended, negative if there are writers (and maybe) readers - * waiting (in which case it goes to sleep). - */ - -#ifndef _ASM_IA64_RWSEM_H -#define _ASM_IA64_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "Please don't include directly, use instead." -#endif - -#include - -#define RWSEM_UNLOCKED_VALUE __IA64_UL_CONST(0x0000000000000000) -#define RWSEM_ACTIVE_BIAS (1L) -#define RWSEM_ACTIVE_MASK (0xffffffffL) -#define RWSEM_WAITING_BIAS (-0x100000000L) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline int -___down_read (struct rw_semaphore *sem) -{ - long result = ia64_fetchadd8_acq((unsigned long *)&sem->count.counter, 1); - - return (result < 0); -} - -static inline void -__down_read (struct rw_semaphore *sem) -{ - if (___down_read(sem)) - rwsem_down_read_failed(sem); -} - -static inline int -__down_read_killable (struct rw_semaphore *sem) -{ - if (___down_read(sem)) - if (IS_ERR(rwsem_down_read_failed_killable(sem))) - return -EINTR; - - return 0; -} - -/* - * lock for writing - */ -static inline long -___down_write (struct rw_semaphore *sem) -{ - long old, new; - - do { - old = atomic_long_read(&sem->count); - new = old + RWSEM_ACTIVE_WRITE_BIAS; - } while (atomic_long_cmpxchg_acquire(&sem->count, old, new) != old); - - return old; -} - -static inline void -__down_write (struct rw_semaphore *sem) -{ - if (___down_write(sem)) - rwsem_down_write_failed(sem); -} - -static inline int -__down_write_killable (struct rw_semaphore *sem) -{ - if (___down_write(sem)) { - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - } - - return 0; -} - -/* - * unlock after reading - */ -static inline void -__up_read (struct rw_semaphore *sem) -{ - long result = ia64_fetchadd8_rel((unsigned long *)&sem->count.counter, -1); - - if (result < 0 && (--result & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void -__up_write (struct rw_semaphore *sem) -{ - long old, new; - - do { - old = 
atomic_long_read(&sem->count); - new = old - RWSEM_ACTIVE_WRITE_BIAS; - } while (atomic_long_cmpxchg_release(&sem->count, old, new) != old); - - if (new < 0 && (new & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -static inline int -__down_read_trylock (struct rw_semaphore *sem) -{ - long tmp; - while ((tmp = atomic_long_read(&sem->count)) >= 0) { - if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, tmp+1)) { - return 1; - } - } - return 0; -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -static inline int -__down_write_trylock (struct rw_semaphore *sem) -{ - long tmp = atomic_long_cmpxchg_acquire(&sem->count, - RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * downgrade write lock to read lock - */ -static inline void -__downgrade_write (struct rw_semaphore *sem) -{ - long old, new; - - do { - old = atomic_long_read(&sem->count); - new = old - RWSEM_WAITING_BIAS; - } while (atomic_long_cmpxchg_release(&sem->count, old, new) != old); - - if (old < 0) - rwsem_downgrade_wake(sem); -} - -#endif /* _ASM_IA64_RWSEM_H */ diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index a0c132bedfae..36bda391e549 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -8,6 +8,5 @@ generic-y += irq_regs.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += preempt.h -generic-y += rwsem.h generic-y += vtime.h generic-y += msi.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 12d77cb11fe5..d5fadefea33c 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -20,7 +20,6 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h -generic-y += rwsem.h generic-y += trace_clock.h generic-y += unaligned.h generic-y += word-at-a-time.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 7bf2cb680d32..73fff39a0122 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -17,7 +17,6 @@ generic-y += mm-arch-hooks.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h -generic-y += rwsem.h generic-y += serial.h generic-y += sizes.h generic-y += trace_clock.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index a22cfd5c0ee8..2ca3200d3616 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -18,7 +18,6 @@ generic-y += mm-arch-hooks.h generic-y += module.h generic-y += msi.h generic-y += preempt.h -generic-y += rwsem.h generic-y += serial.h generic-y += trace_clock.h generic-y += word-at-a-time.h diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h deleted file mode 100644 index 4c25cf6caefa..000000000000 --- a/arch/x86/include/asm/rwsem.h +++ /dev/null @@ -1,237 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+ - * - * Written by David Howells (dhowells@redhat.com). - * - * Derived from asm-x86/semaphore.h - * - * - * The MSW of the count is the negated number of active writers and waiting - * lockers, and the LSW is the total number of active locks - * - * The lock count is initialized to 0 (no active and no waiting lockers). - * - * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an - * uncontended lock. This can be determined because XADD returns the old value. 
- * Readers increment by 1 and see a positive value when uncontended, negative - * if there are writers (and maybe) readers waiting (in which case it goes to - * sleep). - * - * The value of WAITING_BIAS supports up to 32766 waiting processes. This can - * be extended to 65534 by manually checking the whole MSW rather than relying - * on the S flag. - * - * The value of ACTIVE_BIAS supports up to 65535 active processes. - * - * This should be totally fair - if anything is waiting, a process that wants a - * lock will go to the back of the queue. When the currently active lock is - * released, if there's a writer at the front of the queue, then that and only - * that will be woken up; if there's a bunch of consecutive readers at the - * front, then they'll all be woken up, but no other readers will be. - */ - -#ifndef _ASM_X86_RWSEM_H -#define _ASM_X86_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ -#include - -/* - * The bias values and the counter type limits the number of - * potential readers/writers to 32767 for 32 bits and 2147483647 - * for 64 bits. - */ - -#ifdef CONFIG_X86_64 -# define RWSEM_ACTIVE_MASK 0xffffffffL -#else -# define RWSEM_ACTIVE_MASK 0x0000ffffL -#endif - -#define RWSEM_UNLOCKED_VALUE 0x00000000L -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -#define ____down_read(sem, slow_path) \ -({ \ - struct rw_semaphore* ret; \ - asm volatile("# beginning down_read\n\t" \ - LOCK_PREFIX _ASM_INC "(%[sem])\n\t" \ - /* adds 0x00000001 */ \ - " jns 1f\n" \ - " call " slow_path "\n" \ - "1:\n\t" \ - "# ending down_read\n\t" \ - : "+m" (sem->count), "=a" (ret), \ - ASM_CALL_CONSTRAINT \ - : [sem] "a" (sem) \ - : "memory", "cc"); \ - ret; \ -}) - -static inline void __down_read(struct rw_semaphore *sem) -{ - ____down_read(sem, "call_rwsem_down_read_failed"); -} - -static inline int __down_read_killable(struct rw_semaphore *sem) -{ - if (IS_ERR(____down_read(sem, "call_rwsem_down_read_failed_killable"))) - return -EINTR; - return 0; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -static inline bool __down_read_trylock(struct rw_semaphore *sem) -{ - long result, tmp; - asm volatile("# beginning __down_read_trylock\n\t" - " mov %[count],%[result]\n\t" - "1:\n\t" - " mov %[result],%[tmp]\n\t" - " add %[inc],%[tmp]\n\t" - " jle 2f\n\t" - LOCK_PREFIX " cmpxchg %[tmp],%[count]\n\t" - " jnz 1b\n\t" - "2:\n\t" - "# ending __down_read_trylock\n\t" - : [count] "+m" (sem->count), [result] "=&a" (result), - [tmp] "=&r" (tmp) - : [inc] "i" (RWSEM_ACTIVE_READ_BIAS) - : "memory", "cc"); - return result >= 0; -} - -/* - * lock for writing - */ -#define ____down_write(sem, slow_path) \ -({ \ - long tmp; \ - struct rw_semaphore* ret; \ - \ - asm volatile("# beginning down_write\n\t" \ - LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" \ - /* adds 0xffff0001, returns the old value */ \ - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ - /* was the active mask 0 before? 
*/\ - " jz 1f\n" \ - " call " slow_path "\n" \ - "1:\n" \ - "# ending down_write" \ - : "+m" (sem->count), [tmp] "=d" (tmp), \ - "=a" (ret), ASM_CALL_CONSTRAINT \ - : [sem] "a" (sem), "[tmp]" (RWSEM_ACTIVE_WRITE_BIAS) \ - : "memory", "cc"); \ - ret; \ -}) - -static inline void __down_write(struct rw_semaphore *sem) -{ - ____down_write(sem, "call_rwsem_down_write_failed"); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - if (IS_ERR(____down_write(sem, "call_rwsem_down_write_failed_killable"))) - return -EINTR; - - return 0; -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -static inline bool __down_write_trylock(struct rw_semaphore *sem) -{ - bool result; - long tmp0, tmp1; - asm volatile("# beginning __down_write_trylock\n\t" - " mov %[count],%[tmp0]\n\t" - "1:\n\t" - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" - /* was the active mask 0 before? */ - " jnz 2f\n\t" - " mov %[tmp0],%[tmp1]\n\t" - " add %[inc],%[tmp1]\n\t" - LOCK_PREFIX " cmpxchg %[tmp1],%[count]\n\t" - " jnz 1b\n\t" - "2:\n\t" - CC_SET(e) - "# ending __down_write_trylock\n\t" - : [count] "+m" (sem->count), [tmp0] "=&a" (tmp0), - [tmp1] "=&r" (tmp1), CC_OUT(e) (result) - : [inc] "er" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory"); - return result; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - asm volatile("# beginning __up_read\n\t" - LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" - /* subtracts 1, returns the old value */ - " jns 1f\n\t" - " call call_rwsem_wake\n" /* expects old value in %edx */ - "1:\n" - "# ending __up_read\n" - : "+m" (sem->count), [tmp] "=d" (tmp) - : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_READ_BIAS) - : "memory", "cc"); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - long tmp; - asm volatile("# beginning __up_write\n\t" - LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" - /* subtracts 0xffff0001, returns the old value */ - " jns 1f\n\t" - " call call_rwsem_wake\n" /* expects old value in %edx */ - "1:\n\t" - "# ending __up_write\n" - : "+m" (sem->count), [tmp] "=d" (tmp) - : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX _ASM_ADD "%[inc],(%[sem])\n\t" - /* - * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) - * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) - */ - " jns 1f\n\t" - " call call_rwsem_downgrade_wake\n" - "1:\n\t" - "# ending __downgrade_write\n" - : "+m" (sem->count) - : [sem] "a" (sem), [inc] "er" (-RWSEM_WAITING_BIAS) - : "memory", "cc"); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_X86_RWSEM_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 140e61843a07..986652064b15 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o -lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S deleted file mode 100644 index dc2ab6ea6768..000000000000 --- a/arch/x86/lib/rwsem.S +++ /dev/null @@ -1,156 +0,0 @@ -/* 
- * x86 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise - */ - -#include -#include -#include - -#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) -#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) - -#ifdef CONFIG_X86_32 - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. - * - * %eax contains the semaphore pointer on entry. Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax which is either a return - * value or just gets clobbered. Same is true for %edx so make sure GCC - * reloads it after the slow path, by making it hold a temporary, for - * example see ____down_write(). - */ - -#define save_common_regs \ - pushl %ecx - -#define restore_common_regs \ - popl %ecx - - /* Avoid uglifying the argument copying x86-64 needs to do. */ - .macro movq src, dst - .endm - -#else - -/* - * x86-64 rwsem wrappers - * - * This interfaces the inline asm code to the slow-path - * C routines. We need to save the call-clobbered regs - * that the asm does not mark as clobbered, and move the - * argument from %rax to %rdi. - * - * NOTE! We don't need to save %rax, because the functions - * will always return the semaphore pointer in %rax (which - * is also the input argument to these helpers) - * - * The following can clobber %rdx because the asm clobbers it: - * call_rwsem_down_write_failed - * call_rwsem_wake - * but %rdi, %rsi, %rcx, %r8-r11 always need saving. 
- */ - -#define save_common_regs \ - pushq %rdi; \ - pushq %rsi; \ - pushq %rcx; \ - pushq %r8; \ - pushq %r9; \ - pushq %r10; \ - pushq %r11 - -#define restore_common_regs \ - popq %r11; \ - popq %r10; \ - popq %r9; \ - popq %r8; \ - popq %rcx; \ - popq %rsi; \ - popq %rdi - -#endif - -/* Fix up special calling conventions */ -ENTRY(call_rwsem_down_read_failed) - FRAME_BEGIN - save_common_regs - __ASM_SIZE(push,) %__ASM_REG(dx) - movq %rax,%rdi - call rwsem_down_read_failed - __ASM_SIZE(pop,) %__ASM_REG(dx) - restore_common_regs - FRAME_END - ret -ENDPROC(call_rwsem_down_read_failed) - -ENTRY(call_rwsem_down_read_failed_killable) - FRAME_BEGIN - save_common_regs - __ASM_SIZE(push,) %__ASM_REG(dx) - movq %rax,%rdi - call rwsem_down_read_failed_killable - __ASM_SIZE(pop,) %__ASM_REG(dx) - restore_common_regs - FRAME_END - ret -ENDPROC(call_rwsem_down_read_failed_killable) - -ENTRY(call_rwsem_down_write_failed) - FRAME_BEGIN - save_common_regs - movq %rax,%rdi - call rwsem_down_write_failed - restore_common_regs - FRAME_END - ret -ENDPROC(call_rwsem_down_write_failed) - -ENTRY(call_rwsem_down_write_failed_killable) - FRAME_BEGIN - save_common_regs - movq %rax,%rdi - call rwsem_down_write_failed_killable - restore_common_regs - FRAME_END - ret -ENDPROC(call_rwsem_down_write_failed_killable) - -ENTRY(call_rwsem_wake) - FRAME_BEGIN - /* do nothing if still outstanding active readers */ - __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) - jnz 1f - save_common_regs - movq %rax,%rdi - call rwsem_wake - restore_common_regs -1: FRAME_END - ret -ENDPROC(call_rwsem_wake) - -ENTRY(call_rwsem_downgrade_wake) - FRAME_BEGIN - save_common_regs - __ASM_SIZE(push,) %__ASM_REG(dx) - movq %rax,%rdi - call rwsem_downgrade_wake - __ASM_SIZE(pop,) %__ASM_REG(dx) - restore_common_regs - FRAME_END - ret -ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 2d686ae54681..33c51c064c77 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -21,14 +21,12 @@ obj-y += checksum_32.o syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o -subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o else obj-y += syscalls_64.o vdso/ -subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \ - ../lib/rwsem.o +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o endif diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 3843198e03d4..4148090cafb0 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -25,7 +25,6 @@ generic-y += percpu.h generic-y += preempt.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += rwsem.h generic-y += sections.h generic-y += socket.h generic-y += topology.h diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h deleted file mode 100644 index 93e67a055a4d..000000000000 --- a/include/asm-generic/rwsem.h +++ /dev/null @@ -1,140 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_GENERIC_RWSEM_H -#define _ASM_GENERIC_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "Please don't include directly, use instead." -#endif - -#ifdef __KERNEL__ - -/* - * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. - * Adapted largely from include/asm-i386/rwsem.h - * by Paul Mackerras . 
- */ - -/* - * the semaphore definition - */ -#ifdef CONFIG_64BIT -# define RWSEM_ACTIVE_MASK 0xffffffffL -#else -# define RWSEM_ACTIVE_MASK 0x0000ffffL -#endif - -#define RWSEM_UNLOCKED_VALUE 0x00000000L -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_killable(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { - if (IS_ERR(rwsem_down_read_failed_killable(sem))) - return -EINTR; - } - - return 0; -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - long tmp; - - while ((tmp = atomic_long_read(&sem->count)) >= 0) { - if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) - rwsem_down_write_failed(sem); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - return 0; -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic_long_dec_return_release(&sem->count); - if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count) < 0)) - rwsem_wake(sem); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long tmp; - - /* - * When downgrading from exclusive to shared ownership, - * anything inside the write-locked region cannot leak - * into the read side. In contrast, anything in the - * read-locked region is ok to be re-ordered into the - * write side. As such, rely on RELEASE semantics. 
- */ - tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_GENERIC_RWSEM_H */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 67dbb57508b1..6e56006b2cb6 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -57,15 +57,13 @@ extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); -/* Include the arch specific part */ -#include - /* In all implementations count != 0 means locked */ static inline int rwsem_is_locked(struct rw_semaphore *sem) { return atomic_long_read(&sem->count) != 0; } +#define RWSEM_UNLOCKED_VALUE 0L #define __RWSEM_INIT_COUNT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) #endif diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..f17dad99eec8 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,6 +7,8 @@ #include #include +#include "rwsem.h" + int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index bad2bca0268b..067e265fa5c1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -32,6 +32,26 @@ # define DEBUG_RWSEMS_WARN_ON(c) #endif +/* + * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. + * Adapted largely from include/asm-i386/rwsem.h + * by Paul Mackerras . + */ + +/* + * the semaphore definition + */ +#ifdef CONFIG_64BIT +# define RWSEM_ACTIVE_MASK 0xffffffffL +#else +# define RWSEM_ACTIVE_MASK 0x0000ffffL +#endif + +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) +#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS +#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -132,3 +152,113 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { } #endif + +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM +/* + * lock for reading + */ +static inline void __down_read(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) + rwsem_down_read_failed(sem); +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + if (IS_ERR(rwsem_down_read_failed_killable(sem))) + return -EINTR; + } + + return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + long tmp; + + while ((tmp = atomic_long_read(&sem->count)) >= 0) { + if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, + tmp + RWSEM_ACTIVE_READ_BIAS)) { + return 1; + } + } + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + rwsem_down_write_failed(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + return 0; +} + +static inline int 
__down_write_trylock(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); + return tmp == RWSEM_UNLOCKED_VALUE; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_dec_return_release(&sem->count); + if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) + rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count) < 0)) + rwsem_wake(sem); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * When downgrading from exclusive to shared ownership, + * anything inside the write-locked region cannot leak + * into the read side. In contrast, anything in the + * read-locked region is ok to be re-ordered into the + * write side. As such, rely on RELEASE semantics. + */ + tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); + if (tmp < 0) + rwsem_downgrade_wake(sem); +} + +#endif /* CONFIG_RWSEM_XCHGADD_ALGORITHM */ -- cgit v1.2.3 From 390a0c62c23cb026cd4664a66f6f45fed3a215f6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 22 Mar 2019 10:30:07 -0400 Subject: locking/rwsem: Remove rwsem-spinlock.c & use rwsem-xadd.c for all archs Currently, we have two different implementations of rwsem: 1) CONFIG_RWSEM_GENERIC_SPINLOCK (rwsem-spinlock.c) 2) CONFIG_RWSEM_XCHGADD_ALGORITHM (rwsem-xadd.c) As we are going to use a single generic implementation (rwsem-xadd.c) and no architecture-specific code will be needed, there is no point in keeping two different implementations of rwsem. In most cases, the performance of rwsem-spinlock.c will be worse. It also doesn't get all the performance tuning and optimizations that had been implemented in rwsem-xadd.c over the years. For simplification, we are going to remove rwsem-spinlock.c and make all architectures use a single implementation of rwsem - rwsem-xadd.c. All references to RWSEM_GENERIC_SPINLOCK and RWSEM_XCHGADD_ALGORITHM in the code are removed. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Paul E.
McKenney Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linuxppc-dev@lists.ozlabs.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190322143008.21313-3-longman@redhat.com Signed-off-by: Ingo Molnar --- arch/alpha/Kconfig | 7 - arch/arc/Kconfig | 3 - arch/arm/Kconfig | 4 - arch/arm64/Kconfig | 3 - arch/c6x/Kconfig | 3 - arch/csky/Kconfig | 3 - arch/h8300/Kconfig | 3 - arch/hexagon/Kconfig | 6 - arch/ia64/Kconfig | 4 - arch/m68k/Kconfig | 7 - arch/microblaze/Kconfig | 6 - arch/mips/Kconfig | 7 - arch/nds32/Kconfig | 3 - arch/nios2/Kconfig | 3 - arch/openrisc/Kconfig | 6 - arch/parisc/Kconfig | 6 - arch/powerpc/Kconfig | 7 - arch/riscv/Kconfig | 3 - arch/s390/Kconfig | 6 - arch/sh/Kconfig | 6 - arch/sparc/Kconfig | 8 - arch/unicore32/Kconfig | 6 - arch/x86/Kconfig | 3 - arch/x86/um/Kconfig | 6 - arch/xtensa/Kconfig | 3 - include/linux/rwsem-spinlock.h | 47 ------ include/linux/rwsem.h | 5 - kernel/Kconfig.locks | 2 +- kernel/locking/Makefile | 4 +- kernel/locking/rwsem-spinlock.c | 339 ---------------------------------------- kernel/locking/rwsem.h | 3 - 31 files changed, 2 insertions(+), 520 deletions(-) delete mode 100644 include/linux/rwsem-spinlock.h delete mode 100644 kernel/locking/rwsem-spinlock.c (limited to 'arch/x86') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 584a6e114853..27c871227eee 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -49,13 +49,6 @@ config MMU bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool default n diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index c781e45d1d99..23e063df5d2c 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -63,9 +63,6 @@ config SCHED_OMIT_FRAME_POINTER config GENERIC_CSUM def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config ARCH_DISCONTIGMEM_ENABLE def_bool n diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 850b4805e2d1..c8b03e1c6c2a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -178,10 +178,6 @@ config TRACE_IRQFLAGS_SUPPORT bool default !CPU_V7M -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7e34b9eba5de..c62b9db2b5e8 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -237,9 +237,6 @@ config LOCKDEP_SUPPORT config TRACE_IRQFLAGS_SUPPORT def_bool y -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_BUG def_bool y depends on BUG diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index e5cd3c5f8399..ed92b5840c0a 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -27,9 +27,6 @@ config MMU config FPU def_bool n -config RWSEM_GENERIC_SPINLOCK - def_bool y - config GENERIC_CALIBRATE_DELAY def_bool y diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 725a115759c9..6555d1781132 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -92,9 +92,6 @@ config GENERIC_HWEIGHT config MMU def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config STACKTRACE_SUPPORT def_bool y diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index c071da34e081..61c01db6c292 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -27,9 +27,6 @@ config H8300 config 
CPU_BIG_ENDIAN def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config GENERIC_HWEIGHT def_bool y diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index ac441680dcc0..3e54a53208d5 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -65,12 +65,6 @@ config GENERIC_CSUM config GENERIC_IRQ_PROBE def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool n - -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_HWEIGHT def_bool y diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 8d7396bd1790..73a26f04644e 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -83,10 +83,6 @@ config STACKTRACE_SUPPORT config GENERIC_LOCKBREAK def_bool n -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config HUGETLB_PAGE_SIZE_VARIABLE bool depends on HUGETLB_PAGE diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index b54206408f91..f5661f48019c 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -32,13 +32,6 @@ config M68K config CPU_BIG_ENDIAN def_bool y -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index a51b965b3b82..a7a9a9d59f65 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -58,15 +58,9 @@ config CPU_LITTLE_ENDIAN endchoice -config RWSEM_GENERIC_SPINLOCK - def_bool y - config ZONE_DMA def_bool y -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 def_bool n diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 4a5f5b0ee9a9..b9c48b27162d 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1037,13 +1037,6 @@ source "arch/mips/paravirt/Kconfig" endmenu -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_HWEIGHT bool default y diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index addb7f5f5264..55559ca0efe4 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -60,9 +60,6 @@ config GENERIC_LOCKBREAK def_bool y depends on PREEMPT -config RWSEM_GENERIC_SPINLOCK - def_bool y - config TRACE_IRQFLAGS_SUPPORT def_bool y diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 4ef15a61b7bc..56685fd45ed0 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -40,9 +40,6 @@ config NO_IOPORT_MAP config FPU def_bool n -config RWSEM_GENERIC_SPINLOCK - def_bool y - config TRACE_IRQFLAGS_SUPPORT def_bool n diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index a5e361fbb75a..683511b8c9df 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -43,12 +43,6 @@ config CPU_BIG_ENDIAN config MMU def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - def_bool n - config GENERIC_HWEIGHT def_bool y diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index c8e621296092..f1ed8ddfe486 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -75,12 +75,6 @@ config GENERIC_LOCKBREAK default y depends on SMP && PREEMPT -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 bool default n diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d0be82c3061..e5dd6aafaf68 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -103,13 +103,6 @@ config LOCKDEP_SUPPORT bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_LOCKBREAK bool default y diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index eb56c82d8aa1..0582260fb6c2 
100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -69,9 +69,6 @@ config STACKTRACE_SUPPORT config TRACE_IRQFLAGS_SUPPORT def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config GENERIC_BUG def_bool y depends on BUG diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b6e3d0653002..c9300b437195 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -14,12 +14,6 @@ config LOCKDEP_SUPPORT config STACKTRACE_SUPPORT def_bool y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config ARCH_HAS_ILOG2_U32 def_bool n diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b1c91ea9a958..0be08d586d40 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -90,12 +90,6 @@ config ARCH_DEFCONFIG default "arch/sh/configs/shx3_defconfig" if SUPERH32 default "arch/sh/configs/cayman_defconfig" if SUPERH64 -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_BUG def_bool y depends on BUG && SUPERH32 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 40f8f4f73fe8..16b620237816 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -191,14 +191,6 @@ config NR_CPUS source "kernel/Kconfig.hz" -config RWSEM_GENERIC_SPINLOCK - bool - default y if SPARC32 - -config RWSEM_XCHGADD_ALGORITHM - bool - default y if SPARC64 - config GENERIC_HWEIGHT bool default y diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 817d82608712..0d5869c9bf03 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -38,12 +38,6 @@ config STACKTRACE_SUPPORT config LOCKDEP_SUPPORT def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5ad92419be19..84b184e016cd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -268,9 +268,6 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_CALIBRATE_DELAY def_bool y diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index a9e80e44178c..a8985e1f7432 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -32,12 +32,6 @@ config ARCH_DEFCONFIG default "arch/um/configs/i386_defconfig" if X86_32 default "arch/um/configs/x86_64_defconfig" if X86_64 -config RWSEM_XCHGADD_ALGORITHM - def_bool 64BIT - -config RWSEM_GENERIC_SPINLOCK - def_bool !RWSEM_XCHGADD_ALGORITHM - config 3_LEVEL_PGTABLES bool "Three-level pagetables" if !64BIT default 64BIT diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 4b9aafe766c5..35c8d91e6106 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -46,9 +46,6 @@ config XTENSA with reasonable minimum requirements. The Xtensa Linux project has a home page at . -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_HWEIGHT def_bool y diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h deleted file mode 100644 index e47568363e5e..000000000000 --- a/include/linux/rwsem-spinlock.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* rwsem-spinlock.h: fallback C implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). 
- * - Derived partially from ideas by Andrea Arcangeli - * - Derived also from comments by Linus - */ - -#ifndef _LINUX_RWSEM_SPINLOCK_H -#define _LINUX_RWSEM_SPINLOCK_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include linux/rwsem-spinlock.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ -/* - * the rw-semaphore definition - * - if count is 0 then there are no active readers or writers - * - if count is +ve then that is the number of active readers - * - if count is -1 then there is one active writer - * - if wait_list is not empty, then there are processes waiting for the semaphore - */ -struct rw_semaphore { - __s32 count; - raw_spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -#define RWSEM_UNLOCKED_VALUE 0x00000000 - -extern void __down_read(struct rw_semaphore *sem); -extern int __must_check __down_read_killable(struct rw_semaphore *sem); -extern int __down_read_trylock(struct rw_semaphore *sem); -extern void __down_write(struct rw_semaphore *sem); -extern int __must_check __down_write_killable(struct rw_semaphore *sem); -extern int __down_write_trylock(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); -extern void __up_write(struct rw_semaphore *sem); -extern void __downgrade_write(struct rw_semaphore *sem); -extern int rwsem_is_locked(struct rw_semaphore *sem); - -#endif /* __KERNEL__ */ -#endif /* _LINUX_RWSEM_SPINLOCK_H */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 6e56006b2cb6..0fc41062c649 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -22,10 +22,6 @@ struct rw_semaphore; -#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK -#include /* use a generic implementation */ -#define __RWSEM_INIT_COUNT(name) .count = RWSEM_UNLOCKED_VALUE -#else /* All arch specific implementations share the same struct */ struct rw_semaphore { atomic_long_t count; @@ -65,7 +61,6 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) #define RWSEM_UNLOCKED_VALUE 0L #define __RWSEM_INIT_COUNT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) -#endif /* Common initializer macros and functions */ diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index fbba478ae522..e335953fa704 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -229,7 +229,7 @@ config MUTEX_SPIN_ON_OWNER config RWSEM_SPIN_ON_OWNER def_bool y - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW config LOCK_SPIN_ON_OWNER def_bool y diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 392c7f23af76..1af83e9ce57d 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@ # and is generally not a function of system call inputs. 
KCOV_INSTRUMENT := n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -25,8 +25,6 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index a7ffb2a96ede..000000000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null @@ -1,339 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli - * - Derived also from comments by Linus - */ -#include -#include -#include -#include - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->count != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. */ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). 
- */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->count += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -int __sched __down_read_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(current); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait to be given the lock */ - for (;;) { - if (!waiter.task) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - out: - return 0; - -out_nolock: - /* - * We didn't take the lock, so that there is a writer, which - * is owner or the first waiter of the sem. If it's a waiter, - * it will be woken by current owner. Not need to wake anybody. - */ - list_del(&waiter.list); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - return -EINTR; -} - -void __sched __down_read(struct rw_semaphore *sem) -{ - __down_read_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_read_killable(struct rw_semaphore *sem) -{ - return __down_read_common(sem, TASK_KILLABLE); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -int __sched __down_write_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. 
- */ - if (sem->count == 0) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->count = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; - -out_nolock: - list_del(&waiter.list); - if (!list_empty(&sem->wait_list) && sem->count >= 0) - __rwsem_do_wake(sem, 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return -EINTR; -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_write_killable(struct rw_semaphore *sem) -{ - return __down_write_common(sem, TASK_KILLABLE); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count == 0) { - /* got the lock */ - sem->count = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->count == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 067e265fa5c1..45ee00236e03 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -153,7 +153,6 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) } #endif -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM /* * lock for reading */ @@ -260,5 +259,3 @@ static inline void __downgrade_write(struct rw_semaphore *sem) if (tmp < 0) rwsem_downgrade_wake(sem); } - -#endif /* CONFIG_RWSEM_XCHGADD_ALGORITHM */ -- cgit v1.2.3 From a6cbfbe6677efb5ca47bb7958c2718236c25126e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 29 Mar 2019 22:46:52 +0100 Subject: x86/uaccess: Fix implicit cast of __user pointer The first two arguments of __user_atomic_cmpxchg_inatomic() are: - @uval is a kernel pointer into which the old value should be stored - @ptr is the user pointer on which the cmpxchg should operate This means that casting @uval to __typeof__(ptr) is wrong. Since @uval is only used once inside the macro, just get rid of __uval and use (uval) directly. Signed-off-by: Jann Horn Signed-off-by: Borislav Petkov Reviewed-by: Mukesh Ojha Cc: Andrew Morton Cc: Andy Lutomirski Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Mike Rapoport Cc: Qiaowei Ren Cc: Thomas Gleixner Cc: Will Deacon Cc: x86-ml Link: https://lkml.kernel.org/r/20190329214652.258477-4-jannh@google.com --- arch/x86/include/asm/uaccess.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1954dd5552a2..a21f2a2f17bf 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -585,7 +585,6 @@ extern void __cmpxchg_wrong_size(void) #define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size) \ ({ \ int __ret = 0; \ - __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ __uaccess_begin_nospec(); \ @@ -661,7 +660,7 @@ extern void __cmpxchg_wrong_size(void) __cmpxchg_wrong_size(); \ } \ __uaccess_end(); \ - *__uval = __old; \ + *(uval) = __old; \ __ret; \ }) -- cgit v1.2.3 From 60574d1e05b094d222162260dd9cac49f4d0996a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 11 Mar 2019 14:55:57 -0600 Subject: acpi: Create subtable parsing infrastructure Parsing entries in an ACPI table had assumed a generic header structure. There is no standard ACPI header, though, so less common layouts with different field sizes required custom parsers to go through their subtable entry list. Create the infrastructure for adding different table types so parsing the entries array may be more reused for all ACPI system tables and the common code doesn't need to be duplicated. Reviewed-by: Rafael J. Wysocki Acked-by: Jonathan Cameron Tested-by: Jonathan Cameron Signed-off-by: Keith Busch Tested-by: Brice Goglin Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/acpi_numa.c | 2 +- arch/arm64/kernel/smp.c | 4 +- arch/ia64/kernel/acpi.c | 14 +++--- arch/x86/kernel/acpi/boot.c | 36 +++++++------- drivers/acpi/numa.c | 16 +++---- drivers/acpi/scan.c | 4 +- drivers/acpi/tables.c | 67 +++++++++++++++++++++++---- drivers/irqchip/irq-gic-v2m.c | 2 +- drivers/irqchip/irq-gic-v3-its-pci-msi.c | 2 +- drivers/irqchip/irq-gic-v3-its-platform-msi.c | 2 +- drivers/irqchip/irq-gic-v3-its.c | 6 +-- drivers/irqchip/irq-gic-v3.c | 10 ++-- drivers/irqchip/irq-gic.c | 4 +- drivers/mailbox/pcc.c | 2 +- include/linux/acpi.h | 5 +- 15 files changed, 113 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index eac1d0cc595c..7ff800045434 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -45,7 +45,7 @@ static inline int get_cpu_for_acpi_id(u32 uid) return -EINVAL; } -static int __init acpi_parse_gicc_pxm(struct acpi_subtable_header *header, +static int __init acpi_parse_gicc_pxm(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_srat_gicc_affinity *pa; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 824de7038967..bb4b3f07761a 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -586,7 +586,7 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) } static int __init -acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header, +acpi_parse_gic_cpu_interface(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *processor; @@ -595,7 +595,7 @@ acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header, if (BAD_MADT_GICC_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + 
acpi_table_print_madt_entry(&header->common); acpi_map_gic_cpu_interface(processor); diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 41eb281709da..1435e7a1a8cd 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -177,7 +177,7 @@ struct acpi_table_madt *acpi_madt __initdata; static u8 has_8259; static int __init -acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, +acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic_override *lapic; @@ -195,7 +195,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, } static int __init -acpi_parse_lsapic(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_lsapic(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_sapic *lsapic; @@ -216,7 +216,7 @@ acpi_parse_lsapic(struct acpi_subtable_header * header, const unsigned long end) } static int __init -acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic_nmi *lacpi_nmi; @@ -230,7 +230,7 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e } static int __init -acpi_parse_iosapic(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_iosapic(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_io_sapic *iosapic; @@ -245,7 +245,7 @@ acpi_parse_iosapic(struct acpi_subtable_header * header, const unsigned long end static unsigned int __initdata acpi_madt_rev; static int __init -acpi_parse_plat_int_src(struct acpi_subtable_header * header, +acpi_parse_plat_int_src(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_interrupt_source *plintsrc; @@ -329,7 +329,7 @@ unsigned int get_cpei_target_cpu(void) } static int __init -acpi_parse_int_src_ovr(struct acpi_subtable_header * header, +acpi_parse_int_src_ovr(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_interrupt_override *p; @@ -350,7 +350,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, } static int __init -acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_nmi_source *nmi_src; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8dcbf6890714..9fc92e4539d8 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -197,7 +197,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) } static int __init -acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) +acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; #ifdef CONFIG_X86_X2APIC @@ -210,7 +210,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); #ifdef CONFIG_X86_X2APIC apic_id = processor->local_apic_id; @@ -242,7 +242,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) } static int __init -acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned 
long end) { struct acpi_madt_local_apic *processor = NULL; @@ -251,7 +251,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); /* Ignore invalid ID */ if (processor->id == 0xff) @@ -272,7 +272,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) } static int __init -acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) +acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_sapic *processor = NULL; @@ -281,7 +281,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ processor->processor_id, /* ACPI ID */ @@ -291,7 +291,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) } static int __init -acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, +acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL; @@ -301,7 +301,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); acpi_lapic_addr = lapic_addr_ovr->address; @@ -309,7 +309,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, } static int __init -acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, +acpi_parse_x2apic_nmi(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL; @@ -319,7 +319,7 @@ acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, if (BAD_MADT_ENTRY(x2apic_nmi, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); if (x2apic_nmi->lint != 1) printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); @@ -328,7 +328,7 @@ acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, } static int __init -acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic_nmi *lapic_nmi = NULL; @@ -337,7 +337,7 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e if (BAD_MADT_ENTRY(lapic_nmi, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); if (lapic_nmi->lint != 1) printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); @@ -449,7 +449,7 @@ static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, } static int __init -acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_ioapic(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; struct ioapic_domain_cfg cfg = { @@ -462,7 +462,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) if (BAD_MADT_ENTRY(ioapic, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */ if 
(ioapic->global_irq_base < nr_legacy_irqs()) @@ -508,7 +508,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, } static int __init -acpi_parse_int_src_ovr(struct acpi_subtable_header * header, +acpi_parse_int_src_ovr(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_interrupt_override *intsrc = NULL; @@ -518,7 +518,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(intsrc, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { acpi_sci_ioapic_setup(intsrc->source_irq, @@ -550,7 +550,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, } static int __init -acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_nmi_source *nmi_src = NULL; @@ -559,7 +559,7 @@ acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end if (BAD_MADT_ENTRY(nmi_src, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); /* TBD: Support nimsrc entries? */ diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 867f6e3f2b4f..30995834ad70 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -339,7 +339,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } static int __init -acpi_parse_x2apic_affinity(struct acpi_subtable_header *header, +acpi_parse_x2apic_affinity(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_srat_x2apic_cpu_affinity *processor_affinity; @@ -348,7 +348,7 @@ acpi_parse_x2apic_affinity(struct acpi_subtable_header *header, if (!processor_affinity) return -EINVAL; - acpi_table_print_srat_entry(header); + acpi_table_print_srat_entry(&header->common); /* let architecture-dependent part to do it */ acpi_numa_x2apic_affinity_init(processor_affinity); @@ -357,7 +357,7 @@ acpi_parse_x2apic_affinity(struct acpi_subtable_header *header, } static int __init -acpi_parse_processor_affinity(struct acpi_subtable_header *header, +acpi_parse_processor_affinity(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_srat_cpu_affinity *processor_affinity; @@ -366,7 +366,7 @@ acpi_parse_processor_affinity(struct acpi_subtable_header *header, if (!processor_affinity) return -EINVAL; - acpi_table_print_srat_entry(header); + acpi_table_print_srat_entry(&header->common); /* let architecture-dependent part to do it */ acpi_numa_processor_affinity_init(processor_affinity); @@ -375,7 +375,7 @@ acpi_parse_processor_affinity(struct acpi_subtable_header *header, } static int __init -acpi_parse_gicc_affinity(struct acpi_subtable_header *header, +acpi_parse_gicc_affinity(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_srat_gicc_affinity *processor_affinity; @@ -384,7 +384,7 @@ acpi_parse_gicc_affinity(struct acpi_subtable_header *header, if (!processor_affinity) return -EINVAL; - acpi_table_print_srat_entry(header); + acpi_table_print_srat_entry(&header->common); /* let architecture-dependent part to do it */ acpi_numa_gicc_affinity_init(processor_affinity); @@ -395,7 +395,7 @@ acpi_parse_gicc_affinity(struct acpi_subtable_header *header, static int __initdata parsed_numa_memblks; static int __init -acpi_parse_memory_affinity(struct acpi_subtable_header * header, +acpi_parse_memory_affinity(union 
acpi_subtable_headers * header, const unsigned long end) { struct acpi_srat_mem_affinity *memory_affinity; @@ -404,7 +404,7 @@ acpi_parse_memory_affinity(struct acpi_subtable_header * header, if (!memory_affinity) return -EINVAL; - acpi_table_print_srat_entry(header); + acpi_table_print_srat_entry(&header->common); /* let architecture-dependent part to do it */ if (!acpi_numa_memory_affinity_init(memory_affinity)) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 446c959a8f08..f7771a3b4a3e 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2241,10 +2241,10 @@ static struct acpi_probe_entry *ape; static int acpi_probe_count; static DEFINE_MUTEX(acpi_probe_mutex); -static int __init acpi_match_madt(struct acpi_subtable_header *header, +static int __init acpi_match_madt(union acpi_subtable_headers *header, const unsigned long end) { - if (!ape->subtable_valid || ape->subtable_valid(header, ape)) + if (!ape->subtable_valid || ape->subtable_valid(&header->common, ape)) if (!ape->probe_subtbl(header, end)) acpi_probe_count++; diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 8fccbe49612a..7553774a22b7 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -49,6 +49,15 @@ static struct acpi_table_desc initial_tables[ACPI_MAX_TABLES] __initdata; static int acpi_apic_instance __initdata; +enum acpi_subtable_type { + ACPI_SUBTABLE_COMMON, +}; + +struct acpi_subtable_entry { + union acpi_subtable_headers *hdr; + enum acpi_subtable_type type; +}; + /* * Disable table checksum verification for the early stage due to the size * limitation of the current x86 early mapping implementation. @@ -217,6 +226,42 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) } } +static unsigned long __init +acpi_get_entry_type(struct acpi_subtable_entry *entry) +{ + switch (entry->type) { + case ACPI_SUBTABLE_COMMON: + return entry->hdr->common.type; + } + return 0; +} + +static unsigned long __init +acpi_get_entry_length(struct acpi_subtable_entry *entry) +{ + switch (entry->type) { + case ACPI_SUBTABLE_COMMON: + return entry->hdr->common.length; + } + return 0; +} + +static unsigned long __init +acpi_get_subtable_header_length(struct acpi_subtable_entry *entry) +{ + switch (entry->type) { + case ACPI_SUBTABLE_COMMON: + return sizeof(entry->hdr->common); + } + return 0; +} + +static enum acpi_subtable_type __init +acpi_get_subtable_type(char *id) +{ + return ACPI_SUBTABLE_COMMON; +} + /** * acpi_parse_entries_array - for each proc_num find a suitable subtable * @@ -246,8 +291,8 @@ acpi_parse_entries_array(char *id, unsigned long table_size, struct acpi_subtable_proc *proc, int proc_num, unsigned int max_entries) { - struct acpi_subtable_header *entry; - unsigned long table_end; + struct acpi_subtable_entry entry; + unsigned long table_end, subtable_len, entry_len; int count = 0; int errs = 0; int i; @@ -270,19 +315,20 @@ acpi_parse_entries_array(char *id, unsigned long table_size, /* Parse all entries looking for a match. 
*/ - entry = (struct acpi_subtable_header *) + entry.type = acpi_get_subtable_type(id); + entry.hdr = (union acpi_subtable_headers *) ((unsigned long)table_header + table_size); + subtable_len = acpi_get_subtable_header_length(&entry); - while (((unsigned long)entry) + sizeof(struct acpi_subtable_header) < - table_end) { + while (((unsigned long)entry.hdr) + subtable_len < table_end) { if (max_entries && count >= max_entries) break; for (i = 0; i < proc_num; i++) { - if (entry->type != proc[i].id) + if (acpi_get_entry_type(&entry) != proc[i].id) continue; if (!proc[i].handler || - (!errs && proc[i].handler(entry, table_end))) { + (!errs && proc[i].handler(entry.hdr, table_end))) { errs++; continue; } @@ -297,13 +343,14 @@ acpi_parse_entries_array(char *id, unsigned long table_size, * If entry->length is 0, break from this loop to avoid * infinite loop. */ - if (entry->length == 0) { + entry_len = acpi_get_entry_length(&entry); + if (entry_len == 0) { pr_err("[%4.4s:0x%02x] Invalid zero length\n", id, proc->id); return -EINVAL; } - entry = (struct acpi_subtable_header *) - ((unsigned long)entry + entry->length); + entry.hdr = (union acpi_subtable_headers *) + ((unsigned long)entry.hdr + entry_len); } if (max_entries && count > max_entries) { diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index f5fe0100f9ff..de14e06fd9ec 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -446,7 +446,7 @@ static struct fwnode_handle *gicv2m_get_fwnode(struct device *dev) } static int __init -acpi_parse_madt_msi(struct acpi_subtable_header *header, +acpi_parse_madt_msi(union acpi_subtable_headers *header, const unsigned long end) { int ret; diff --git a/drivers/irqchip/irq-gic-v3-its-pci-msi.c b/drivers/irqchip/irq-gic-v3-its-pci-msi.c index 8d6d009d1d58..c81d5b81da56 100644 --- a/drivers/irqchip/irq-gic-v3-its-pci-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c @@ -159,7 +159,7 @@ static int __init its_pci_of_msi_init(void) #ifdef CONFIG_ACPI static int __init -its_pci_msi_parse_madt(struct acpi_subtable_header *header, +its_pci_msi_parse_madt(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_translator *its_entry; diff --git a/drivers/irqchip/irq-gic-v3-its-platform-msi.c b/drivers/irqchip/irq-gic-v3-its-platform-msi.c index 7b8e87b493fe..9cdcda5bb3bd 100644 --- a/drivers/irqchip/irq-gic-v3-its-platform-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-platform-msi.c @@ -117,7 +117,7 @@ static int __init its_pmsi_init_one(struct fwnode_handle *fwnode, #ifdef CONFIG_ACPI static int __init -its_pmsi_parse_madt(struct acpi_subtable_header *header, +its_pmsi_parse_madt(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_translator *its_entry; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 7577755bdcf4..128ac893d7e4 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3830,13 +3830,13 @@ static int __init acpi_get_its_numa_node(u32 its_id) return NUMA_NO_NODE; } -static int __init gic_acpi_match_srat_its(struct acpi_subtable_header *header, +static int __init gic_acpi_match_srat_its(union acpi_subtable_headers *header, const unsigned long end) { return 0; } -static int __init gic_acpi_parse_srat_its(struct acpi_subtable_header *header, +static int __init gic_acpi_parse_srat_its(union acpi_subtable_headers *header, const unsigned long end) { int node; @@ -3903,7 +3903,7 @@ static int __init 
acpi_get_its_numa_node(u32 its_id) { return NUMA_NO_NODE; } static void __init acpi_its_srat_maps_free(void) { } #endif -static int __init gic_acpi_parse_madt_its(struct acpi_subtable_header *header, +static int __init gic_acpi_parse_madt_its(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_translator *its_entry; diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 15e55d327505..f44cd89cfc40 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1593,7 +1593,7 @@ gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base) } static int __init -gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, +gic_acpi_parse_madt_redist(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_redistributor *redist = @@ -1611,7 +1611,7 @@ gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, } static int __init -gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header, +gic_acpi_parse_madt_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = @@ -1653,14 +1653,14 @@ static int __init gic_acpi_collect_gicr_base(void) return -ENODEV; } -static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header, +static int __init gic_acpi_match_gicr(union acpi_subtable_headers *header, const unsigned long end) { /* Subtable presence means that redist exists, that's it */ return 0; } -static int __init gic_acpi_match_gicc(struct acpi_subtable_header *header, +static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = @@ -1726,7 +1726,7 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, return true; } -static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header, +static int __init gic_acpi_parse_virt_madt_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index fd3110c171ba..c6dbe5018972 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1495,7 +1495,7 @@ static struct } acpi_data __initdata; static int __init -gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, +gic_acpi_parse_madt_cpu(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *processor; @@ -1527,7 +1527,7 @@ gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, } /* The things you have to do to just *count* something... */ -static int __init acpi_dummy_func(struct acpi_subtable_header *header, +static int __init acpi_dummy_func(union acpi_subtable_headers *header, const unsigned long end) { return 0; diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c index 256f18b67e8a..08a0a3517138 100644 --- a/drivers/mailbox/pcc.c +++ b/drivers/mailbox/pcc.c @@ -382,7 +382,7 @@ static const struct mbox_chan_ops pcc_chan_ops = { * * This gets called for each entry in the PCC table. 
*/ -static int parse_pcc_subspace(struct acpi_subtable_header *header, +static int parse_pcc_subspace(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_pcct_subspace *ss = (struct acpi_pcct_subspace *) header; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d5dcebd7aad3..9494d42bf507 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -141,10 +141,13 @@ enum acpi_address_range_id { /* Table Handlers */ +union acpi_subtable_headers { + struct acpi_subtable_header common; +}; typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table); -typedef int (*acpi_tbl_entry_handler)(struct acpi_subtable_header *header, +typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header, const unsigned long end); /* Debugger support */ -- cgit v1.2.3 From 7c21383f3429dd70da39c0c7f1efa12377a47ab6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 4 Apr 2019 14:40:27 -0700 Subject: x86/build: Keep local relocations with ld.lld The LLVM linker (ld.lld) defaults to removing local relocations, which causes KASLR boot failures. ld.bfd and ld.gold already handle this correctly. This adds the explicit instruction "--discard-none" during the link phase. There is no change in output for ld.bfd and ld.gold, but ld.lld now produces an image with all the needed relocations. Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Nick Desaulniers Cc: Thomas Gleixner Cc: clang-built-linux@googlegroups.com Cc: x86-ml Link: https://lkml.kernel.org/r/20190404214027.GA7324@beast Link: https://github.com/ClangBuiltLinux/linux/issues/404 --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a587805c6687..56e748a7679f 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -47,7 +47,7 @@ export REALMODE_CFLAGS export BITS ifdef CONFIG_X86_NEED_RELOCS - LDFLAGS_vmlinux := --emit-relocs + LDFLAGS_vmlinux := --emit-relocs --discard-none endif # -- cgit v1.2.3 From 4df4309587e18a3c91e68138638dcb9d2a968906 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 3 Apr 2019 13:42:30 -0500 Subject: x86/kexec/crash: Use struct_size() in vzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; struct boo entry[]; }; instance = vzalloc(sizeof(struct foo) + count * sizeof(struct boo)); Instead of leaving these open-coded and prone to type mistakes, use the new struct_size() helper: instance = vzalloc(struct_size(instance, entry, count)); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Thomas Gleixner Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20190403184230.GA5295@embeddedor --- arch/x86/kernel/crash.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 17ffc869cab8..a96ca8584803 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -204,8 +204,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) * another range split. So add extra two slots here. 
*/ nr_ranges += 2; - cmem = vzalloc(sizeof(struct crash_mem) + - sizeof(struct crash_mem_range) * nr_ranges); + cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return NULL; -- cgit v1.2.3 From b5b447b6b4e899e0978b95cb42d272978f8c7b4d Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 11 Mar 2019 22:47:51 +0000 Subject: x86/entry: Remove unneeded need_resched() loop Since the enabling and disabling of IRQs within preempt_schedule_irq() is contained in a need_resched() loop, there is no need for the outer architecture-specific loop. Signed-off-by: Valentin Schneider Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20190311224752.8337-14-valentin.schneider@arm.com --- arch/x86/entry/entry_32.S | 3 +-- arch/x86/entry/entry_64.S | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d309f30cf7af..b1856fe9decf 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -766,13 +766,12 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) -.Lneed_resched: cmpl $0, PER_CPU_VAR(__preempt_count) jnz restore_all_kernel testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all_kernel call preempt_schedule_irq - jmp .Lneed_resched + jmp restore_all_kernel END(resume_kernel) #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1f0efdb7b629..e7e270603fe7 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -645,10 +645,9 @@ retint_kernel: /* Check if we need preemption */ btl $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f -0: cmpl $0, PER_CPU_VAR(__preempt_count) + cmpl $0, PER_CPU_VAR(__preempt_count) jnz 1f call preempt_schedule_irq - jmp 0b 1: #endif /* -- cgit v1.2.3 From 0925dda5962e9b55e4d38a72eba93858f24bac41 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 8 Mar 2019 10:56:15 +0800 Subject: x86/mm/KASLR: Use only one PUD entry for real mode trampoline The current code builds an identity mapping for the real mode trampoline by borrowing page tables from the direct mapping section if KASLR is enabled. It copies present entries of the first PUD table in 4-level paging mode, or the first P4D table in 5-level paging mode. However, there's only a very small area under the low 1 MB reserved for the real mode trampoline in reserve_real_mode(), so it makes no sense to build up a really large mapping for it. Reduce it to one PUD (1GB) entry. This matches the randomization granularity in 4-level paging mode and allows changing the randomization granularity in 5-level paging mode from 512GB to 1GB later. [ tglx: Massaged changelog and comments ] Signed-off-by: Baoquan He Signed-off-by: Thomas Gleixner Acked-by: Kirill A.
Shutemov Cc: dave.hansen@linux.intel.com Cc: luto@kernel.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: hpa@zytor.com Cc: keescook@chromium.org Cc: thgarnie@google.com Link: https://lkml.kernel.org/r/20190308025616.21440-2-bhe@redhat.com --- arch/x86/mm/kaslr.c | 84 +++++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 3f452ffed7e9..97813751340d 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -147,74 +147,64 @@ void __init kernel_randomize_memory(void) static void __meminit init_trampoline_pud(void) { - unsigned long paddr, paddr_next; + pud_t *pud_page_tramp, *pud, *pud_tramp; + p4d_t *p4d_page_tramp, *p4d, *p4d_tramp; + unsigned long paddr, vaddr; pgd_t *pgd; - pud_t *pud_page, *pud_page_tramp; - int i; pud_page_tramp = alloc_low_page(); + /* + * There are two mappings for the low 1MB area, the direct mapping + * and the 1:1 mapping for the real mode trampoline: + * + * Direct mapping: virt_addr = phys_addr + PAGE_OFFSET + * 1:1 mapping: virt_addr = phys_addr + */ paddr = 0; - pgd = pgd_offset_k((unsigned long)__va(paddr)); - pud_page = (pud_t *) pgd_page_vaddr(*pgd); - - for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) { - pud_t *pud, *pud_tramp; - unsigned long vaddr = (unsigned long)__va(paddr); + vaddr = (unsigned long)__va(paddr); + pgd = pgd_offset_k(vaddr); - pud_tramp = pud_page_tramp + pud_index(paddr); - pud = pud_page + pud_index(vaddr); - paddr_next = (paddr & PUD_MASK) + PUD_SIZE; - - *pud_tramp = *pud; - } + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); -} - -static void __meminit init_trampoline_p4d(void) -{ - unsigned long paddr, paddr_next; - pgd_t *pgd; - p4d_t *p4d_page, *p4d_page_tramp; - int i; + pud_tramp = pud_page_tramp + pud_index(paddr); + *pud_tramp = *pud; - p4d_page_tramp = alloc_low_page(); - - paddr = 0; - pgd = pgd_offset_k((unsigned long)__va(paddr)); - p4d_page = (p4d_t *) pgd_page_vaddr(*pgd); - - for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) { - p4d_t *p4d, *p4d_tramp; - unsigned long vaddr = (unsigned long)__va(paddr); + if (pgtable_l5_enabled()) { + p4d_page_tramp = alloc_low_page(); p4d_tramp = p4d_page_tramp + p4d_index(paddr); - p4d = p4d_page + p4d_index(vaddr); - paddr_next = (paddr & P4D_MASK) + P4D_SIZE; - *p4d_tramp = *p4d; - } + set_p4d(p4d_tramp, + __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + } else { + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + } } /* - * Create PGD aligned trampoline table to allow real mode initialization - * of additional CPUs. Consume only 1 low memory page. + * The real mode trampoline, which is required for bootstrapping CPUs + * occupies only a small area under the low 1MB. See reserve_real_mode() + * for details. + * + * If KASLR is disabled the first PGD entry of the direct mapping is copied + * to map the real mode trampoline. + * + * If KASLR is enabled, copy only the PUD which covers the low 1MB + * area. This limits the randomization granularity to 1GB for both 4-level + * and 5-level paging. 
*/ void __meminit init_trampoline(void) { - if (!kaslr_memory_enabled()) { init_trampoline_default(); return; } - if (pgtable_l5_enabled()) - init_trampoline_p4d(); - else - init_trampoline_pud(); + init_trampoline_pud(); } -- cgit v1.2.3 From b569c18434987163a05f05a12cdf6a9975c55ff3 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 8 Mar 2019 10:56:16 +0800 Subject: x86/mm/KASLR: Reduce randomization granularity for 5-level paging to 1GB The current randomization granularity of 5-level paging is 512 GB. The mapping of the real mode trampoline has been reduced to one PUD entry, so there is no restriction anymore. Reduce the granularity to 1GB for 5-level paging mode, which allows better randomization. [ tglx: Massaged changelog ] Signed-off-by: Baoquan He Signed-off-by: Thomas Gleixner Acked-by: Kirill A. Shutemov Cc: dave.hansen@linux.intel.com Cc: luto@kernel.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: hpa@zytor.com Cc: keescook@chromium.org Cc: thgarnie@google.com Link: https://lkml.kernel.org/r/20190308025616.21440-3-bhe@redhat.com --- arch/x86/mm/kaslr.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 97813751340d..f6ba2791eeb5 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void) */ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); prandom_bytes_state(&rand_state, &rand, sizeof(rand)); - if (pgtable_l5_enabled()) - entropy = (rand % (entropy + 1)) & P4D_MASK; - else - entropy = (rand % (entropy + 1)) & PUD_MASK; + entropy = (rand % (entropy + 1)) & PUD_MASK; vaddr += entropy; *kaslr_regions[i].base = vaddr; @@ -137,10 +134,7 @@ void __init kernel_randomize_memory(void) * randomization alignment. */ vaddr += get_padding(&kaslr_regions[i]); - if (pgtable_l5_enabled()) - vaddr = round_up(vaddr + 1, P4D_SIZE); - else - vaddr = round_up(vaddr + 1, PUD_SIZE); + vaddr = round_up(vaddr + 1, PUD_SIZE); remain_entropy -= entropy; } } -- cgit v1.2.3 From 5861381d486601430cccf64849bd0a226154bc0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Mar 2019 23:18:01 +0100 Subject: PM / arch: x86: Rework the MSR_IA32_ENERGY_PERF_BIAS handling The current handling of MSR_IA32_ENERGY_PERF_BIAS in the kernel is problematic, because it may cause changes made by user space to that MSR (with the help of the x86_energy_perf_policy tool, for example) to be lost every time a CPU goes offline and then back online as well as during system-wide power management transitions into sleep states and back into the working state. The first problem is that if the current EPB value for a CPU going online is 0 ('performance'), the kernel will change it to 6 ('normal') regardless of whether or not this is the first bring-up of that CPU. That also happens during system-wide resume from sleep states (including, but not limited to, hibernation). However, the EPB may have been adjusted by user space this way and the kernel should not blindly override that setting. The second problem is that if the platform firmware resets the EPB values for any CPUs during system-wide resume from a sleep state, the kernel will not restore their previous EPB values that may have been set by user space before the preceding system-wide suspend transition. Again, that behavior may at least be confusing from the user space perspective.
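Schematically, the rework described next boils down to a save/restore pair around CPU hotplug. A minimal sketch, assuming the standard rdmsrl()/wrmsrl() helpers and a hypothetical per-CPU variable (the names saved_epb, epb_offline and epb_online are illustrative only; the actual intel_epb.c code in this patch differs in detail):

	#include <linux/percpu.h>	/* DEFINE_PER_CPU(), this_cpu_read()/write() */
	#include <asm/msr.h>		/* rdmsrl(), wrmsrl(), MSR_IA32_ENERGY_PERF_BIAS */

	static DEFINE_PER_CPU(u64, saved_epb);

	/* Assumed to run on the CPU going down (a CPU hotplug AP callback). */
	static int epb_offline(unsigned int cpu)
	{
		u64 epb;

		rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
		this_cpu_write(saved_epb, epb);		/* preserve user space's setting */
		return 0;
	}

	/* Assumed to run on the CPU coming up: write the saved value back. */
	static int epb_online(unsigned int cpu)
	{
		wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, this_cpu_read(saved_epb));
		return 0;
	}

If registered with cpuhp_setup_state() at an AP state, both callbacks execute on the affected CPU, so the plain rdmsrl()/wrmsrl() pair operates on the right MSR.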
In order to address these issues, rework the handling of MSR_IA32_ENERGY_PERF_BIAS so that the EPB value is saved on CPU offline and restored on CPU online as well as (for the boot CPU) during the syscore stages of system-wide suspend and resume transitions, respectively. However, retain the policy by which the EPB is set to 6 ('normal') on the first bring-up of each CPU if its initial value is 0, based on the observation that 0 may mean 'not initialized' just as well as 'performance' in that case. While at it, move the MSR_IA32_ENERGY_PERF_BIAS handling code into a separate file and document it in Documentation/admin-guide. Fixes: abe48b108247 (x86, intel, power: Initialize MSR_IA32_ENERGY_PERF_BIAS) Fixes: b51ef52df71c (x86/cpu: Restore MSR_IA32_ENERGY_PERF_BIAS after resume) Reported-by: Thomas Renninger Signed-off-by: Rafael J. Wysocki Reviewed-by: Hannes Reinecke Acked-by: Borislav Petkov Acked-by: Thomas Gleixner --- Documentation/admin-guide/pm/intel_epb.rst | 6 ++ Documentation/admin-guide/pm/working-state.rst | 1 + arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/common.c | 17 ---- arch/x86/kernel/cpu/cpu.h | 1 - arch/x86/kernel/cpu/intel.c | 34 ------- arch/x86/kernel/cpu/intel_epb.c | 131 +++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 8 files changed, 140 insertions(+), 53 deletions(-) create mode 100644 Documentation/admin-guide/pm/intel_epb.rst create mode 100644 arch/x86/kernel/cpu/intel_epb.c (limited to 'arch/x86') diff --git a/Documentation/admin-guide/pm/intel_epb.rst b/Documentation/admin-guide/pm/intel_epb.rst new file mode 100644 index 000000000000..e9cfa7ec5420 --- /dev/null +++ b/Documentation/admin-guide/pm/intel_epb.rst @@ -0,0 +1,6 @@ +====================================== +Intel Performance and Energy Bias Hint +====================================== + +.. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c + :doc: overview diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index b6cef9b5e961..beb004d3632b 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -8,3 +8,4 @@ Working-State Power Management cpuidle cpufreq intel_pstate + intel_epb diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cfd24f9f7614..1796d2bdcaaa 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o intel_epb.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..5e37dfa4d9df 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1864,23 +1864,6 @@ void cpu_init(void) } #endif -static void bsp_resume(void) -{ - if (this_cpu->c_bsp_resume) - this_cpu->c_bsp_resume(&boot_cpu_data); -} - -static struct syscore_ops cpu_syscore_ops = { - .resume = bsp_resume, -}; - -static int __init init_cpu_syscore(void) -{ - register_syscore_ops(&cpu_syscore_ops); - return 0; -} -core_initcall(init_cpu_syscore); - /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. 
Caller holds microcode_mutex and CPU diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 5eb946b9a9f3..c0e2407abdd6 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -14,7 +14,6 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); void (*c_detect_tlb)(struct cpuinfo_x86 *); - void (*c_bsp_resume)(struct cpuinfo_x86 *); int c_x86_vendor; #ifdef CONFIG_X86_32 /* Optional vendor specific routine to obtain the cache size. */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fc3c07fe7df5..f17c1a714779 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -596,36 +596,6 @@ detect_keyid_bits: c->x86_phys_bits -= keyid_bits; } -static void init_intel_energy_perf(struct cpuinfo_x86 *c) -{ - u64 epb; - - /* - * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized. - * (x86_energy_perf_policy(8) is available to change it at run-time.) - */ - if (!cpu_has(c, X86_FEATURE_EPB)) - return; - - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) - return; - - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); - epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); -} - -static void intel_bsp_resume(struct cpuinfo_x86 *c) -{ - /* - * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume, - * so reinitialize it properly like during bootup: - */ - init_intel_energy_perf(c); -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -763,8 +733,6 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_TME)) detect_tme(c); - init_intel_energy_perf(c); - init_intel_misc_features(c); } @@ -1023,9 +991,7 @@ static const struct cpu_dev intel_cpu_dev = { .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, - .c_bsp_resume = intel_bsp_resume, .c_x86_vendor = X86_VENDOR_INTEL, }; cpu_dev_register(intel_cpu_dev); - diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c new file mode 100644 index 000000000000..8d53cc88bd22 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Performance and Energy Bias Hint support. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Rafael J. Wysocki + */ + +#include +#include +#include + +#include +#include + +/** + * DOC: overview + * + * The Performance and Energy Bias Hint (EPB) allows software to specify its + * preference with respect to the power-performance tradeoffs present in the + * processor. Generally, the EPB is expected to be set by user space through + * the generic MSR interface (with the help of the x86_energy_perf_policy tool), + * but there are two reasons for the kernel to touch it. + * + * First, there are systems where the platform firmware resets the EPB during + * system-wide transitions from sleep states back into the working state + * effectively causing the previous EPB updates by user space to be lost. + * Thus the kernel needs to save the current EPB values for all CPUs during + * system-wide transitions to sleep states and restore them on the way back to + * the working state. 
That can be achieved by saving EPB for secondary CPUs + * when they are taken offline during transitions into system sleep states and + * for the boot CPU in a syscore suspend operation, so that it can be restored + * for the boot CPU in a syscore resume operation and for the other CPUs when + * they are brought back online. However, CPUs that are already offline when + * a system-wide PM transition is started are not taken offline again, but their + * EPB values may still be reset by the platform firmware during the transition, + * so in fact it is necessary to save the EPB of any CPU taken offline and to + * restore it when the given CPU goes back online at all times. + * + * Second, on many systems the initial EPB value coming from the platform + * firmware is 0 ('performance') and at least on some of them that is because + * the platform firmware does not initialize EPB at all with the assumption that + * the OS will do that anyway. That sometimes is problematic, as it may cause + * the system battery to drain too fast, for example, so it is better to adjust + * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the + * kernel changes it to 6 ('normal'). + */ + +static DEFINE_PER_CPU(u8, saved_epb); + +#define EPB_MASK 0x0fULL +#define EPB_SAVED 0x10ULL + +static int intel_epb_save(void) +{ + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + /* + * Ensure that saved_epb will always be nonzero after this write even if + * the EPB value read from the MSR is 0. + */ + this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED); + + return 0; +} + +static void intel_epb_restore(void) +{ + u64 val = this_cpu_read(saved_epb); + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if (val) { + val &= EPB_MASK; + } else { + /* + * Because intel_epb_save() has not run for the current CPU yet, + * it is going online for the first time, so if its EPB value is + * 0 ('performance') at this point, assume that it has not been + * initialized by the platform firmware and set it to 6 + * ('normal'). 
+ */ + val = epb & EPB_MASK; + if (val == ENERGY_PERF_BIAS_PERFORMANCE) { + val = ENERGY_PERF_BIAS_NORMAL; + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + } + } + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); +} + +static struct syscore_ops intel_epb_syscore_ops = { + .suspend = intel_epb_save, + .resume = intel_epb_restore, +}; + +static int intel_epb_online(unsigned int cpu) +{ + intel_epb_restore(); + return 0; +} + +static int intel_epb_offline(unsigned int cpu) +{ + return intel_epb_save(); +} + +static __init int intel_epb_init(void) +{ + int ret; + + if (!boot_cpu_has(X86_FEATURE_EPB)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, + "x86/intel/epb:online", intel_epb_online, + intel_epb_offline); + if (ret < 0) + goto err_out_online; + + register_syscore_ops(&intel_epb_syscore_ops); + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); + return ret; +} +subsys_initcall(intel_epb_init); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e78281d07b70..dbfdd0fadbef 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -147,6 +147,7 @@ enum cpuhp_state { CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, + CPUHP_AP_X86_INTEL_EPB_ONLINE, CPUHP_AP_PERF_ONLINE, CPUHP_AP_PERF_X86_ONLINE, CPUHP_AP_PERF_X86_UNCORE_ONLINE, -- cgit v1.2.3 From b9c273babce791cf228fc466577f55056a699f9c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Mar 2019 23:20:17 +0100 Subject: PM / arch: x86: MSR_IA32_ENERGY_PERF_BIAS sysfs interface The Performance and Energy Bias Hint (EPB) is expected to be set by user space through the generic MSR interface, but that interface is not particularly nice and there are security concerns regarding it, so it is not always available. For this reason, add a sysfs interface for reading and updating the EPB, in the form of a new attribute, energy_perf_bias, located under /sys/devices/system/cpu/cpu#/power/ for online CPUs that support the EPB feature. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hannes Reinecke Acked-by: Borislav Petkov --- Documentation/ABI/testing/sysfs-devices-system-cpu | 18 +++++ Documentation/admin-guide/pm/intel_epb.rst | 27 +++++++ arch/x86/kernel/cpu/intel_epb.c | 93 +++++++++++++++++++++- 3 files changed, 134 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 9605dbd4b5b5..7f4af7da3fbc 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -518,3 +518,21 @@ Description: Control Symetric Multi Threading (SMT) If control status is "forceoff" or "notsupported" writes are rejected. + +What: /sys/devices/system/cpu/cpu#/power/energy_perf_bias +Date: March 2019 +Contact: linux-pm@vger.kernel.org +Description: Intel Energy and Performance Bias Hint (EPB) + + EPB for the given CPU in a sliding scale 0 - 15, where a value + of 0 corresponds to a hint preference for highest performance + and a value of 15 corresponds to the maximum energy savings. + + In order to change the EPB value for the CPU, write either + a number in the 0 - 15 sliding scale above, or one of the + strings: "performance", "balance-performance", "normal", + "balance-power", "power" (that represent values reflected by + their meaning), to this attribute. 
+ + This attribute is present for all online CPUs supporting the + Intel EPB feature. diff --git a/Documentation/admin-guide/pm/intel_epb.rst b/Documentation/admin-guide/pm/intel_epb.rst index e9cfa7ec5420..d100849edfc4 100644 --- a/Documentation/admin-guide/pm/intel_epb.rst +++ b/Documentation/admin-guide/pm/intel_epb.rst @@ -4,3 +4,30 @@ Intel Performance and Energy Bias Hint .. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c :doc: overview + +Intel Performance and Energy Bias Attribute in ``sysfs`` +======================================================== + +The Intel Performance and Energy Bias Hint (EPB) value for a given (logical) CPU +can be checked or updated through a ``sysfs`` attribute (file) under +:file:`/sys/devices/system/cpu/cpu/power/`, where the CPU number ```` +is allocated at the system initialization time: + +``energy_perf_bias`` + Shows the current EPB value for the CPU in a sliding scale 0 - 15, where + a value of 0 corresponds to a hint preference for highest performance + and a value of 15 corresponds to the maximum energy savings. + + In order to update the EPB value for the CPU, this attribute can be + written to, either with a number in the 0 - 15 sliding scale above, or + with one of the strings: "performance", "balance-performance", "normal", + "balance-power", "power" that represent values reflected by their + meaning. + + This attribute is present for all online CPUs supporting the EPB + feature. + +Note that while the EPB interface to the processor is defined at the logical CPU +level, the physical register backing it may be shared by multiple CPUs (for +example, SMT siblings or cores in one package). For this reason, updating the +EPB value for one CPU may cause the EPB values for other CPUs to change. diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index 8d53cc88bd22..f4dd73396f28 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -9,8 +9,12 @@ */ #include +#include +#include #include +#include #include +#include #include #include @@ -20,9 +24,9 @@ * * The Performance and Energy Bias Hint (EPB) allows software to specify its * preference with respect to the power-performance tradeoffs present in the - * processor. Generally, the EPB is expected to be set by user space through - * the generic MSR interface (with the help of the x86_energy_perf_policy tool), - * but there are two reasons for the kernel to touch it. + * processor. Generally, the EPB is expected to be set by user space (directly + * via sysfs or with the help of the x86_energy_perf_policy tool), but there are + * two reasons for the kernel to update it. 
* * First, there are systems where the platform firmware resets the EPB during * system-wide transitions from sleep states back into the working state @@ -52,6 +56,7 @@ static DEFINE_PER_CPU(u8, saved_epb); #define EPB_MASK 0x0fULL #define EPB_SAVED 0x10ULL +#define MAX_EPB EPB_MASK static int intel_epb_save(void) { @@ -97,15 +102,95 @@ static struct syscore_ops intel_epb_syscore_ops = { .resume = intel_epb_restore, }; +static const char * const energy_perf_strings[] = { + "performance", + "balance-performance", + "normal", + "balance-power", + "power" +}; +static const u8 energ_perf_values[] = { + ENERGY_PERF_BIAS_PERFORMANCE, + ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + ENERGY_PERF_BIAS_NORMAL, + ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + ENERGY_PERF_BIAS_POWERSAVE +}; + +static ssize_t energy_perf_bias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + unsigned int cpu = dev->id; + u64 epb; + int ret; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + return sprintf(buf, "%llu\n", epb); +} + +static ssize_t energy_perf_bias_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int cpu = dev->id; + u64 epb, val; + int ret; + + ret = __sysfs_match_string(energy_perf_strings, + ARRAY_SIZE(energy_perf_strings), buf); + if (ret >= 0) + val = energ_perf_values[ret]; + else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) + return -EINVAL; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + (epb & ~EPB_MASK) | val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(energy_perf_bias); + +static struct attribute *intel_epb_attrs[] = { + &dev_attr_energy_perf_bias.attr, + NULL +}; + +static const struct attribute_group intel_epb_attr_group = { + .name = power_group_name, + .attrs = intel_epb_attrs +}; + static int intel_epb_online(unsigned int cpu) { + struct device *cpu_dev = get_cpu_device(cpu); + intel_epb_restore(); + if (!cpuhp_tasks_frozen) + sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); + return 0; } static int intel_epb_offline(unsigned int cpu) { - return intel_epb_save(); + struct device *cpu_dev = get_cpu_device(cpu); + + if (!cpuhp_tasks_frozen) + sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + intel_epb_save(); + return 0; } static __init int intel_epb_init(void) -- cgit v1.2.3 From dec3d0b1071a0f3194e66a83d26ecf4aa8c5910e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 31 Mar 2019 13:04:13 -0700 Subject: crypto: x86/crct10dif-pcl - fix use via crypto_shash_digest() The ->digest() method of crct10dif-pclmul reads the current CRC value from the shash_desc context. But this value is uninitialized, causing crypto_shash_digest() to compute the wrong result. Fix it. Probably this wasn't noticed before because lib/crc-t10dif.c only uses crypto_shash_update(), not crypto_shash_digest(). Likewise, crypto_shash_digest() is not yet tested by the crypto self-tests because those only test the ahash API which only uses shash init/update/final. 
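[ Editor's note: a minimal kernel-style sketch, not part of the patch, of the one-shot call path that hit the bug; exact shash API details vary slightly across kernel versions. With the old ->digest() the result depended on uninitialized stack memory, since nothing seeded the CRC before __chksum_finup() read it: ]

#include <crypto/hash.h>
#include <linux/err.h>

/* Compute a CRC-T10DIF in one shot; this is the path the fix makes correct. */
static u16 t10dif_oneshot(const u8 *data, unsigned int len)
{
	struct crypto_shash *tfm = crypto_alloc_shash("crct10dif", 0, 0);
	u8 out[2] = { 0 };

	if (IS_ERR(tfm))
		return 0;
	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		/* No init()/update() here, so ->digest() must start from CRC 0. */
		crypto_shash_digest(desc, data, len, out);
	}
	crypto_free_shash(tfm);
	return out[0] | (out[1] << 8);
}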
Fixes: 0b95a7f85718 ("crypto: crct10dif - Glue code to cast accelerated CRCT10DIF assembly as a crypto transform") Cc: # v3.11+ Cc: Tim Chen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/crct10dif-pclmul_glue.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index 64f17d2d8b67..3c81e15b0873 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -71,15 +71,14 @@ static int chksum_final(struct shash_desc *desc, u8 *out) return 0; } -static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, - u8 *out) +static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out) { if (len >= 16 && crypto_simd_usable()) { kernel_fpu_begin(); - *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); + *(__u16 *)out = crc_t10dif_pcl(crc, data, len); kernel_fpu_end(); } else - *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + *(__u16 *)out = crc_t10dif_generic(crc, data, len); return 0; } @@ -88,15 +87,13 @@ static int chksum_finup(struct shash_desc *desc, const u8 *data, { struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - return __chksum_finup(&ctx->crc, data, len, out); + return __chksum_finup(ctx->crc, data, len, out); } static int chksum_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out) { - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(&ctx->crc, data, length, out); + return __chksum_finup(0, data, length, out); } static struct shash_alg alg = { -- cgit v1.2.3 From 678cce4019d746da6c680c48ba9e6d417803e127 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 31 Mar 2019 13:04:11 -0700 Subject: crypto: x86/poly1305 - fix overflow during partial reduction The x86_64 implementation of Poly1305 produces the wrong result on some inputs because poly1305_4block_avx2() incorrectly assumes that when partially reducing the accumulator, the bits carried from limb 'd4' to limb 'h0' fit in a 32-bit integer. This is true for poly1305-generic which processes only one block at a time. However, it's not true for the AVX2 implementation, which processes 4 blocks at a time and therefore can produce intermediate limbs about 4x larger. Fix it by making the relevant calculations use 64-bit arithmetic rather than 32-bit. Note that most of the carries already used 64-bit arithmetic, but the d4 -> h0 carry was different for some reason. To be safe I also made the same change to the corresponding SSE2 code, though that only operates on 1 or 2 blocks at a time. I don't think it's really needed for poly1305_block_sse2(), but it doesn't hurt because it's already x86_64 code. It *might* be needed for poly1305_2block_sse2(), but overflows aren't easy to reproduce there. This bug was originally detected by my patches that improve testmgr to fuzz algorithms against their generic implementation. But also add a test vector which reproduces it directly (in the AVX2 case). Fixes: b1ccc8f4b631 ("crypto: poly1305 - Add a four block AVX2 variant for x86_64") Fixes: c70f4abef07a ("crypto: poly1305 - Add a SSE2 SIMD variant for x86_64") Cc: # v4.3+ Cc: Martin Willi Cc: Jason A. 
Donenfeld Signed-off-by: Eric Biggers Reviewed-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/poly1305-avx2-x86_64.S | 14 +++++++---- arch/x86/crypto/poly1305-sse2-x86_64.S | 22 ++++++++++------- crypto/testmgr.h | 44 +++++++++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S index 3b6e70d085da..8457cdd47f75 100644 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2) vpaddq t2,t1,t1 vmovq t1x,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. + # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S index e6add74d78a5..6f0be7a86964 100644 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx @@ -524,6 +524,12 @@ ENTRY(poly1305_2block_sse2) paddq t2,t1 movq t1,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. 
+ # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -562,16 +568,16 @@ ENTRY(poly1305_2block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/crypto/testmgr.h b/crypto/testmgr.h index f267633cf13a..d18a37629f05 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -5634,7 +5634,49 @@ static const struct hash_testvec poly1305_tv_template[] = { .psize = 80, .digest = "\x13\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00", - }, + }, { /* Regression test for overflow in AVX2 implementation */ + .plaintext = "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff", + .psize = 300, + .digest = "\xfb\x5e\x96\xd8\x61\xd5\xc7\xc8" + "\x78\xe5\x87\xcc\x2d\x5a\x22\xe1", + } }; /* NHPoly1305 test vectors from https://github.com/google/adiantum */ -- cgit v1.2.3 From bfdd5a67c8cb02c147c6b012543e84cb1f5759ba Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 29 Mar 2019 19:35:24 +0100 Subject: x86/asm: Clarify static_cpu_has()'s intended use Clarify when one should use static_cpu_has() and when one should use boot_cpu_has(). Requested-by: Nadav Amit Signed-off-by: Borislav Petkov Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190330112022.28888-2-bp@alien8.de --- arch/x86/include/asm/cpufeature.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 2fb791a1b479..6d6d5cc4302b 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -155,9 +155,12 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #else /* - * Static testing of CPU features. Used the same as boot_cpu_has(). - * These will statically patch the target code for additional - * performance. + * Static testing of CPU features. Used the same as boot_cpu_has(). It + * statically patches the target code for additional performance. 
Use + * static_cpu_has() only in fast paths, where every cycle counts. Which + * means that the boot_cpu_has() variant is already fast enough for the + * majority of cases and you should stick to using it as it is generally + * only two instructions: a RIP-relative MOV and a TEST. */ static __always_inline bool _static_cpu_has(u16 bit) { -- cgit v1.2.3 From 67e87d43b794a8886b5d075b3e0fdd0c615a595f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 29 Mar 2019 19:52:59 +0100 Subject: x86: Convert some slow-path static_cpu_has() callers to boot_cpu_has() Using static_cpu_has() is pointless on those paths, convert them to the boot_cpu_has() variant. No functional changes. Reported-by: Nadav Amit Signed-off-by: Borislav Petkov Reviewed-by: Rik van Riel Reviewed-by: Juergen Gross # for paravirt Cc: Aubrey Li Cc: Dave Hansen Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Joerg Roedel Cc: "Kirill A. Shutemov" Cc: Konrad Rzeszutek Wilk Cc: Thomas Lendacky Cc: linux-edac@vger.kernel.org Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Tony Luck Cc: virtualization@lists.linux-foundation.org Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190330112022.28888-3-bp@alien8.de --- arch/x86/include/asm/fpu/internal.h | 7 +++---- arch/x86/kernel/apic/apic_numachip.c | 2 +- arch/x86/kernel/cpu/aperfmperf.c | 6 +++--- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/mce/inject.c | 2 +- arch/x86/kernel/cpu/proc.c | 10 +++++----- arch/x86/kernel/ldt.c | 14 +++++++------- arch/x86/kernel/paravirt.c | 2 +- arch/x86/kernel/process.c | 4 ++-- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/vm86_32.c | 2 +- 11 files changed, 26 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index fb04a3ded7dd..745a19d34f23 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -253,7 +253,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has(X86_FEATURE_XSAVES)) + if (boot_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XSAVES, xstate, lmask, hmask, err); else XSTATE_OP(XSAVE, xstate, lmask, hmask, err); @@ -275,7 +275,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has(X86_FEATURE_XSAVES)) + if (boot_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); @@ -497,8 +497,7 @@ static inline void fpregs_activate(struct fpu *fpu) * - switch_fpu_finish() restores the new state as * necessary. 
*/ -static inline void -switch_fpu_prepare(struct fpu *old_fpu, int cpu) +static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) { if (!copy_fpregs_to_fpstate(old_fpu)) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 78778b54f904..a5464b8b6c46 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -175,7 +175,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { + if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 804c49493938..64d5aec24203 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -83,7 +83,7 @@ unsigned int aperfmperf_get_khz(int cpu) if (!cpu_khz) return 0; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; aperfmperf_snapshot_cpu(cpu, ktime_get(), true); @@ -99,7 +99,7 @@ void arch_freq_prepare_all(void) if (!cpu_khz) return; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return; for_each_online_cpu(cpu) @@ -115,7 +115,7 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!cpu_khz) return 0; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..95a5faf3a6a0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1668,7 +1668,7 @@ static void setup_getcpu(int cpu) unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; - if (static_cpu_has(X86_FEATURE_RDTSCP)) + if (boot_cpu_has(X86_FEATURE_RDTSCP)) write_rdtscp_aux(cpudata); /* Store CPU and node number in limit. */ diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 8492ef7d9015..3da9a8823e47 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -528,7 +528,7 @@ static void do_inject(void) * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for * Fam10h and later BKDGs. */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && + if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 2c8522a39ed5..cb2e49810d68 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -35,11 +35,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", - static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", - static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", - static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", - static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? 
"yes" : "no", c->cpuid_level); } #else diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6135ae8ce036..b2463fcb20a8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -113,7 +113,7 @@ static void do_sanity_check(struct mm_struct *mm, * tables. */ WARN_ON(!had_kernel_mapping); - if (static_cpu_has(X86_FEATURE_PTI)) + if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(!had_user_mapping); } else { /* @@ -121,7 +121,7 @@ static void do_sanity_check(struct mm_struct *mm, * Sync the pgd to the usermode tables. */ WARN_ON(had_kernel_mapping); - if (static_cpu_has(X86_FEATURE_PTI)) + if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(had_user_mapping); } } @@ -156,7 +156,7 @@ static void map_ldt_struct_to_user(struct mm_struct *mm) k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); - if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pmd(u_pmd, *k_pmd); } @@ -181,7 +181,7 @@ static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); - if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pgd(kernel_to_user_pgdp(pgd), *pgd); } @@ -208,7 +208,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) spinlock_t *ptl; int i, nr_pages; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return 0; /* @@ -271,7 +271,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) return; /* LDT map/unmap is only required for PTI */ - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); @@ -311,7 +311,7 @@ static void free_ldt_pgtables(struct mm_struct *mm) unsigned long start = LDT_BASE_ADDR; unsigned long end = LDT_END_ADDR; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; tlb_gather_mmu(&tlb, mm, start, end); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c0e0101133f3..7bbaa6baf37f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -121,7 +121,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_disable(&virt_spin_lock_key); } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58ac7be52c7a..16a7113e91c5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -236,7 +236,7 @@ static int get_cpuid_mode(void) static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) { - if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; if (cpuid_enabled) @@ -666,7 +666,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR)) + if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) return 0; return 1; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 725624b6c0c0..d62ebbc5ec78 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -108,7 +108,7 @@ void __noreturn machine_real_restart(unsigned int type) write_cr3(real_mode_header->trampoline_pgd); /* Exiting long mode will fail if CR4.PCIDE is set. 
*/ - if (static_cpu_has(X86_FEATURE_PCID)) + if (boot_cpu_has(X86_FEATURE_PCID)) cr4_clear_bits(X86_CR4_PCIDE); #endif diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index a092b6b40c6b..6a38717d179c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -369,7 +369,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) preempt_disable(); tsk->thread.sp0 += 16; - if (static_cpu_has(X86_FEATURE_SEP)) { + if (boot_cpu_has(X86_FEATURE_SEP)) { tsk->thread.sysenter_cs = 0; refresh_sysenter_cs(&tsk->thread); } -- cgit v1.2.3 From 28e3ace70c3d2ea47a62dffe046011d1b74ee839 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 29 Mar 2019 20:00:38 +0100 Subject: x86/mm: Convert some slow-path static_cpu_has() callers to boot_cpu_has() Using static_cpu_has() is pointless on those paths, convert them to the boot_cpu_has() variant. No functional changes. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190330112022.28888-5-bp@alien8.de --- arch/x86/mm/dump_pagetables.c | 4 ++-- arch/x86/mm/pgtable.c | 4 ++-- arch/x86/mm/pti.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ee8f8ab46941..7b71ac15b235 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -577,7 +577,7 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) { #ifdef CONFIG_PAGE_TABLE_ISOLATION - if (user && static_cpu_has(X86_FEATURE_PTI)) + if (user && boot_cpu_has(X86_FEATURE_PTI)) pgd = kernel_to_user_pgdp(pgd); #endif ptdump_walk_pgd_level_core(m, pgd, false, false); @@ -590,7 +590,7 @@ void ptdump_walk_user_pgd_level_checkwx(void) pgd_t *pgd = INIT_PGD; if (!(__supported_pte_mask & _PAGE_NX) || - !static_cpu_has(X86_FEATURE_PTI)) + !boot_cpu_has(X86_FEATURE_PTI)) return; pr_info("x86/mm: Checking user space page tables\n"); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7bd01709a091..3dbf440d4114 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ static void pgd_dtor(pgd_t *pgd) * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. */ -#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ +#define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? 
\ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS @@ -292,7 +292,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) #ifdef CONFIG_PAGE_TABLE_ISOLATION - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 4fee5c3003ed..8c9a54ebda60 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -626,7 +626,7 @@ void pti_set_kernel_image_nonglobal(void) */ void __init pti_init(void) { - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; pr_info("enabled\n"); -- cgit v1.2.3 From fdcd06a8ab775cbe716ff893372bed580e4c8a1c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Feb 2019 12:49:41 +0000 Subject: arch: Use asm-generic header for asm/mmiowb.h Hook up asm-generic/mmiowb.h to Kbuild for all architectures so that we can subsequently include asm/mmiowb.h from core code. Cc: Masahiro Yamada Cc: Arnd Bergmann Acked-by: Linus Torvalds Signed-off-by: Will Deacon --- arch/alpha/include/asm/Kbuild | 1 + arch/arc/include/asm/Kbuild | 1 + arch/arm/include/asm/Kbuild | 1 + arch/arm64/include/asm/Kbuild | 1 + arch/c6x/include/asm/Kbuild | 1 + arch/csky/include/asm/Kbuild | 1 + arch/h8300/include/asm/Kbuild | 1 + arch/hexagon/include/asm/Kbuild | 1 + arch/ia64/include/asm/Kbuild | 1 + arch/m68k/include/asm/Kbuild | 1 + arch/microblaze/include/asm/Kbuild | 1 + arch/mips/include/asm/Kbuild | 1 + arch/nds32/include/asm/Kbuild | 1 + arch/nios2/include/asm/Kbuild | 1 + arch/openrisc/include/asm/Kbuild | 1 + arch/parisc/include/asm/Kbuild | 1 + arch/powerpc/include/asm/Kbuild | 1 + arch/riscv/include/asm/Kbuild | 1 + arch/s390/include/asm/Kbuild | 1 + arch/sh/include/asm/Kbuild | 1 + arch/sparc/include/asm/Kbuild | 1 + arch/um/include/asm/Kbuild | 1 + arch/unicore32/include/asm/Kbuild | 1 + arch/x86/include/asm/Kbuild | 1 + arch/xtensa/include/asm/Kbuild | 1 + 25 files changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 70b783333965..89e87bbc987f 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += irq_work.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += preempt.h generic-y += sections.h generic-y += trace_clock.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index decc306a3b52..393d4f5e1450 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += msi.h generic-y += parport.h generic-y += percpu.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index a8a4eb7f6dae..a3fc0a230a68 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += kdebug.h generic-y += local.h generic-y += local64.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += msi.h generic-y += parport.h generic-y += preempt.h diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 1e17ea5c372b..3dae4fd028cf 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -13,6 +13,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += msi.h generic-y 
+= qrwlock.h generic-y += qspinlock.h diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index 249c9f6f26dc..6b168d32fbff 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -23,6 +23,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += mmu.h generic-y += mmu_context.h generic-y += pci.h diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 2a0abe8f2a35..95f4e550db8a 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -28,6 +28,7 @@ generic-y += linkage.h generic-y += local.h generic-y += local64.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += mutex.h generic-y += pci.h diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index e3dead402e5f..123d8f54be4a 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -29,6 +29,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += mmu.h generic-y += mmu_context.h generic-y += module.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index d046e8ccdf78..d53704d561e6 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -24,6 +24,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += pci.h generic-y += percpu.h generic-y += preempt.h diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index 11f191689c9e..cabfe0280c33 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += irq_work.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += preempt.h generic-y += trace_clock.h generic-y += vtime.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 2c359d9e80f6..0ddae4a74adb 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -18,6 +18,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += percpu.h generic-y += preempt.h generic-y += sections.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 1a8285c3f693..17a8d0a62038 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -23,6 +23,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 87b86cdf126a..bf39c2253ec8 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -12,6 +12,7 @@ generic-y += irq_work.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += msi.h generic-y += parport.h generic-y += percpu.h diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild index 64ceff7ab99b..688b6ed26227 100644 --- a/arch/nds32/include/asm/Kbuild +++ b/arch/nds32/include/asm/Kbuild @@ -31,6 +31,7 @@ generic-y += limits.h generic-y += local.h generic-y += local64.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += parport.h 
generic-y += pci.h generic-y += percpu.h diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index 88a667d12aaa..d7ef3512504a 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -27,6 +27,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 22aa97136c01..1919cc5e0f11 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -24,6 +24,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 9bcd0c903dbb..b8c7db777144 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += percpu.h generic-y += preempt.h generic-y += seccomp.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index a0c132bedfae..74b6605ca55f 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -7,6 +7,7 @@ generic-y += export.h generic-y += irq_regs.h generic-y += local64.h generic-y += mcs_spinlock.h +generic-y += mmiowb.h generic-y += preempt.h generic-y += rwsem.h generic-y += vtime.h diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index cccd12cf27d4..221cd2ec78a4 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -21,6 +21,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += local64.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += mutex.h generic-y += percpu.h generic-y += preempt.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 12d77cb11fe5..bdc4f06a04c5 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -20,6 +20,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += rwsem.h generic-y += trace_clock.h generic-y += unaligned.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 7bf2cb680d32..162c9054561f 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -14,6 +14,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index a22cfd5c0ee8..468440db6657 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -15,6 +15,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += msi.h generic-y += preempt.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 00bcbe2326d9..b506ad06aefc 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += irq_work.h generic-y += kdebug.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += param.h generic-y 
+= pci.h generic-y += percpu.h diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index d77d953c04c1..b301a0b3c0b2 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += parport.h generic-y += percpu.h diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index a0ab9ab61c75..eebd05942e6c 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += early_ioremap.h generic-y += export.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 3843198e03d4..794e461785e1 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -20,6 +20,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += param.h generic-y += percpu.h generic-y += preempt.h -- cgit v1.2.3 From 08f1f3a72f4cea136686585b81a251baa3539f12 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Feb 2019 13:03:18 +0000 Subject: x86/io: Remove useless definition of mmiowb() x86 maps mmiowb() to barrier(), but this is superfluous because a compiler barrier is already implied by spin_unlock(). Since x86 also includes asm-generic/io.h in its asm/io.h file, remove the definition entirely and pick up the dummy definition from core code. Acked-by: Linus Torvalds Reviewed-by: Thomas Gleixner Signed-off-by: Will Deacon --- arch/x86/include/asm/io.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 686247db3106..a06a9f8294ea 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -90,8 +90,6 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_writew __writew #define __raw_writel __writel -#define mmiowb() barrier() - #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", u64, "=r", :"memory") -- cgit v1.2.3 From e43e2657fe77a37b13643e2469670ecdb0ba5e10 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Dec 2018 14:32:02 +0100 Subject: x86/dma: Remove the x86_dma_fallback_dev hack Now that we removed support for the NULL device argument in the DMA API, there is no need to cater for that in the x86 code. 
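[ Editor's note: a hedged sketch of the caller-visible effect, not part of the patch; 'pdev' is a hypothetical PCI device owned by the caller. Before this change, the x86 mapping paths removed below quietly substituted x86_dma_fallback_dev for a NULL device; now every caller must pass a real, DMA-capable device: ]

#include <linux/dma-mapping.h>
#include <linux/pci.h>

static dma_addr_t map_tx_buffer(struct pci_dev *pdev, void *buf, size_t len)
{
	/*
	 * The device argument must be non-NULL with a valid dma_mask;
	 * dma_map_single(NULL, ...) no longer has a fallback on x86.
	 */
	return dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
}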
Signed-off-by: Christoph Hellwig --- arch/x86/include/asm/dma-mapping.h | 10 ---------- arch/x86/kernel/amd_gart_64.c | 6 ------ arch/x86/kernel/pci-dma.c | 20 -------------------- kernel/dma/mapping.c | 7 ------- 4 files changed, 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index ce4d176b3d13..6b15a24930e0 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -13,14 +13,7 @@ #include #include -#ifdef CONFIG_ISA -# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) -#else -# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) -#endif - extern int iommu_merge; -extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; extern const struct dma_map_ops *dma_ops; @@ -30,7 +23,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return dma_ops; } -bool arch_dma_alloc_attrs(struct device **dev); -#define arch_dma_alloc_attrs arch_dma_alloc_attrs - #endif diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 2c0aa34af69c..bf7f13ea3c64 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -233,9 +233,6 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page, unsigned long bus; phys_addr_t paddr = page_to_phys(page) + offset; - if (!dev) - dev = &x86_dma_fallback_dev; - if (!need_iommu(dev, paddr, size)) return paddr; @@ -392,9 +389,6 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (nents == 0) return 0; - if (!dev) - dev = &x86_dma_fallback_dev; - out = 0; start = 0; start_sg = sg; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d460998ae828..dcd272dbd0a9 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -51,14 +51,6 @@ int iommu_pass_through __read_mostly; extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; -/* Dummy device used for NULL arguments (normally ISA). */ -struct device x86_dma_fallback_dev = { - .init_name = "fallback device", - .coherent_dma_mask = ISA_DMA_BIT_MASK, - .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, -}; -EXPORT_SYMBOL(x86_dma_fallback_dev); - void __init pci_iommu_alloc(void) { struct iommu_table_entry *p; @@ -77,18 +69,6 @@ void __init pci_iommu_alloc(void) } } -bool arch_dma_alloc_attrs(struct device **dev) -{ - if (!*dev) - *dev = &x86_dma_fallback_dev; - - if (!is_device_dma_capable(*dev)) - return false; - return true; - -} -EXPORT_SYMBOL(arch_dma_alloc_attrs); - /* * See for the iommu kernel * parameter documentation. 
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index c000906348c9..685a53f2a793 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -238,10 +238,6 @@ u64 dma_get_required_mask(struct device *dev) } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev) (true) -#endif - void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { @@ -256,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (!arch_dma_alloc_attrs(&dev)) - return NULL; - if (dma_is_direct(ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) -- cgit v1.2.3 From 5599fb69355d7a558f32206dac7539e945a1f604 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Mon, 8 Apr 2019 13:42:24 -0700 Subject: ACPICA: Rename nameseg compare macro for clarity ACPICA commit 92ec0935f27e217dff0b176fca02c2ec3d782bb5 ACPI_COMPARE_NAME changed to ACPI_COMPARE_NAMESEG This clarifies (1) this is a compare on 4-byte namesegs, not a generic compare. Improves understanding of the code. Link: https://github.com/acpica/acpica/commit/92ec0935 Signed-off-by: Bob Moore Signed-off-by: Erik Schmauss Signed-off-by: Rafael J. Wysocki --- arch/x86/boot/compressed/acpi.c | 2 +- drivers/acpi/acpica/dbexec.c | 2 +- drivers/acpi/acpica/dsinit.c | 2 +- drivers/acpi/acpica/nsinit.c | 4 +-- drivers/acpi/acpica/nsparse.c | 2 +- drivers/acpi/acpica/nsrepair.c | 2 +- drivers/acpi/acpica/nsrepair2.c | 2 +- drivers/acpi/acpica/nsxfname.c | 4 +-- drivers/acpi/acpica/rsxface.c | 8 +++--- drivers/acpi/acpica/tbdata.c | 3 ++- drivers/acpi/acpica/tbinstal.c | 2 +- drivers/acpi/acpica/tbprint.c | 6 ++--- drivers/acpi/acpica/tbutils.c | 6 ++--- drivers/acpi/acpica/tbxface.c | 4 +-- drivers/acpi/acpica/tbxfload.c | 15 ++++++----- drivers/acpi/acpica/utmisc.c | 8 +++--- drivers/acpi/acpica/utpredef.c | 4 +-- drivers/acpi/acpica/utstring.c | 2 +- drivers/acpi/scan.c | 2 +- drivers/acpi/sysfs.c | 8 +++--- drivers/acpi/tables.c | 4 +-- include/acpi/actypes.h | 4 +-- .../acpi/os_specific/service_layers/oslinuxtbl.c | 30 +++++++++++----------- tools/power/acpi/tools/acpidump/apdump.c | 4 +-- 24 files changed, 66 insertions(+), 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 0ef4ad55b29b..ad84239e595e 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -276,7 +276,7 @@ static unsigned long get_acpi_srat_table(void) if (acpi_table) { header = (struct acpi_table_header *)acpi_table; - if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_SRAT)) + if (ACPI_COMPARE_NAMESEG(header->signature, ACPI_SIG_SRAT)) return acpi_table; } entry += size; diff --git a/drivers/acpi/acpica/dbexec.c b/drivers/acpi/acpica/dbexec.c index bb43305cb215..4027eaab18a4 100644 --- a/drivers/acpi/acpica/dbexec.c +++ b/drivers/acpi/acpica/dbexec.c @@ -453,7 +453,7 @@ acpi_db_execute(char *name, char **args, acpi_object_type *types, u32 flags) /* Dump a _PLD buffer if present */ - if (ACPI_COMPARE_NAME + if (ACPI_COMPARE_NAMESEG ((ACPI_CAST_PTR (struct acpi_namespace_node, acpi_gbl_db_method_info.method)->name.ascii), diff --git a/drivers/acpi/acpica/dsinit.c b/drivers/acpi/acpica/dsinit.c index a4a24ffe5fae..4ebd23700bbc 100644 --- a/drivers/acpi/acpica/dsinit.c +++ b/drivers/acpi/acpica/dsinit.c @@ 
-200,7 +200,7 @@ acpi_ds_initialize_objects(u32 table_index, /* DSDT is always the first AML table */ - if (ACPI_COMPARE_NAME(table->signature, ACPI_SIG_DSDT)) { + if (ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_DSDT)) { ACPI_DEBUG_PRINT_RAW((ACPI_DB_INIT, "\nInitializing Namespace objects:\n")); } diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c index 19fb8dda870f..53e5d00d3a5e 100644 --- a/drivers/acpi/acpica/nsinit.c +++ b/drivers/acpi/acpica/nsinit.c @@ -478,7 +478,7 @@ acpi_ns_find_ini_methods(acpi_handle obj_handle, /* We are only looking for methods named _INI */ - if (!ACPI_COMPARE_NAME(node->name.ascii, METHOD_NAME__INI)) { + if (!ACPI_COMPARE_NAMESEG(node->name.ascii, METHOD_NAME__INI)) { return (AE_OK); } @@ -641,7 +641,7 @@ acpi_ns_init_one_device(acpi_handle obj_handle, * Note: We know there is an _INI within this subtree, but it may not be * under this particular device, it may be lower in the branch. */ - if (!ACPI_COMPARE_NAME(device_node->name.ascii, "_SB_") || + if (!ACPI_COMPARE_NAMESEG(device_node->name.ascii, "_SB_") || device_node->parent != acpi_gbl_root_node) { ACPI_DEBUG_EXEC(acpi_ut_display_init_pathname (ACPI_TYPE_METHOD, device_node, diff --git a/drivers/acpi/acpica/nsparse.c b/drivers/acpi/acpica/nsparse.c index c0b4f7bedfab..f16cf5e4742c 100644 --- a/drivers/acpi/acpica/nsparse.c +++ b/drivers/acpi/acpica/nsparse.c @@ -203,7 +203,7 @@ acpi_ns_one_complete_parse(u32 pass_number, /* Found OSDT table, enable the namespace override feature */ - if (ACPI_COMPARE_NAME(table->signature, ACPI_SIG_OSDT) && + if (ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_OSDT) && pass_number == ACPI_IMODE_LOAD_PASS1) { walk_state->namespace_override = TRUE; } diff --git a/drivers/acpi/acpica/nsrepair.c b/drivers/acpi/acpica/nsrepair.c index 0aacfa48e20d..be86fea8e4d4 100644 --- a/drivers/acpi/acpica/nsrepair.c +++ b/drivers/acpi/acpica/nsrepair.c @@ -316,7 +316,7 @@ static const struct acpi_simple_repair_info *acpi_ns_match_simple_repair(struct this_name = acpi_object_repair_info; while (this_name->object_converter) { - if (ACPI_COMPARE_NAME(node->name.ascii, this_name->name)) { + if (ACPI_COMPARE_NAMESEG(node->name.ascii, this_name->name)) { /* Check if we can actually repair this name/type combination */ diff --git a/drivers/acpi/acpica/nsrepair2.c b/drivers/acpi/acpica/nsrepair2.c index d5804a6d1d65..4c285e918465 100644 --- a/drivers/acpi/acpica/nsrepair2.c +++ b/drivers/acpi/acpica/nsrepair2.c @@ -188,7 +188,7 @@ static const struct acpi_repair_info *acpi_ns_match_complex_repair(struct this_name = acpi_ns_repairable_names; while (this_name->repair_function) { - if (ACPI_COMPARE_NAME(node->name.ascii, this_name->name)) { + if (ACPI_COMPARE_NAMESEG(node->name.ascii, this_name->name)) { return (this_name); } diff --git a/drivers/acpi/acpica/nsxfname.c b/drivers/acpi/acpica/nsxfname.c index de2d3135d6a9..55b4a5b3331f 100644 --- a/drivers/acpi/acpica/nsxfname.c +++ b/drivers/acpi/acpica/nsxfname.c @@ -495,8 +495,8 @@ acpi_status acpi_install_method(u8 *buffer) /* Table must be a DSDT or SSDT */ - if (!ACPI_COMPARE_NAME(table->signature, ACPI_SIG_DSDT) && - !ACPI_COMPARE_NAME(table->signature, ACPI_SIG_SSDT)) { + if (!ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_DSDT) && + !ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_SSDT)) { return (AE_BAD_HEADER); } diff --git a/drivers/acpi/acpica/rsxface.c b/drivers/acpi/acpica/rsxface.c index 1d6f136e4068..c62be3d91712 100644 --- a/drivers/acpi/acpica/rsxface.c +++ b/drivers/acpi/acpica/rsxface.c @@ -603,10 
+603,10 @@ acpi_walk_resources(acpi_handle device_handle, /* Parameter validation */ if (!device_handle || !user_function || !name || - (!ACPI_COMPARE_NAME(name, METHOD_NAME__CRS) && - !ACPI_COMPARE_NAME(name, METHOD_NAME__PRS) && - !ACPI_COMPARE_NAME(name, METHOD_NAME__AEI) && - !ACPI_COMPARE_NAME(name, METHOD_NAME__DMA))) { + (!ACPI_COMPARE_NAMESEG(name, METHOD_NAME__CRS) && + !ACPI_COMPARE_NAMESEG(name, METHOD_NAME__PRS) && + !ACPI_COMPARE_NAMESEG(name, METHOD_NAME__AEI) && + !ACPI_COMPARE_NAMESEG(name, METHOD_NAME__DMA))) { return_ACPI_STATUS(AE_BAD_PARAMETER); } diff --git a/drivers/acpi/acpica/tbdata.c b/drivers/acpi/acpica/tbdata.c index 0cecd0039acf..933f81316ad2 100644 --- a/drivers/acpi/acpica/tbdata.c +++ b/drivers/acpi/acpica/tbdata.c @@ -480,7 +480,8 @@ acpi_tb_verify_temp_table(struct acpi_table_desc *table_desc, /* If a particular signature is expected (DSDT/FACS), it must match */ - if (signature && !ACPI_COMPARE_NAME(&table_desc->signature, signature)) { + if (signature && + !ACPI_COMPARE_NAMESEG(&table_desc->signature, signature)) { ACPI_BIOS_ERROR((AE_INFO, "Invalid signature 0x%X for ACPI table, expected [%s]", table_desc->signature.integer, signature)); diff --git a/drivers/acpi/acpica/tbinstal.c b/drivers/acpi/acpica/tbinstal.c index be6642bf6366..ef1ffd36ab3f 100644 --- a/drivers/acpi/acpica/tbinstal.c +++ b/drivers/acpi/acpica/tbinstal.c @@ -120,7 +120,7 @@ acpi_tb_install_standard_table(acpi_physical_address address, */ if (!reload && acpi_gbl_disable_ssdt_table_install && - ACPI_COMPARE_NAME(&new_table_desc.signature, ACPI_SIG_SSDT)) { + ACPI_COMPARE_NAMESEG(&new_table_desc.signature, ACPI_SIG_SSDT)) { ACPI_INFO(("Ignoring installation of %4.4s at %8.8X%8.8X", new_table_desc.signature.ascii, ACPI_FORMAT_UINT64(address))); diff --git a/drivers/acpi/acpica/tbprint.c b/drivers/acpi/acpica/tbprint.c index 9b5df95d881b..0bc18fd5cd71 100644 --- a/drivers/acpi/acpica/tbprint.c +++ b/drivers/acpi/acpica/tbprint.c @@ -94,7 +94,7 @@ acpi_tb_print_table_header(acpi_physical_address address, { struct acpi_table_header local_header; - if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_FACS)) { + if (ACPI_COMPARE_NAMESEG(header->signature, ACPI_SIG_FACS)) { /* FACS only has signature and length fields */ @@ -158,8 +158,8 @@ acpi_status acpi_tb_verify_checksum(struct acpi_table_header *table, u32 length) * They are the odd tables, have no standard ACPI header and no checksum */ - if (ACPI_COMPARE_NAME(table->signature, ACPI_SIG_S3PT) || - ACPI_COMPARE_NAME(table->signature, ACPI_SIG_FACS)) { + if (ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_S3PT) || + ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_FACS)) { return (AE_OK); } diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c index 2469e01310e2..c5f0b8ec70cc 100644 --- a/drivers/acpi/acpica/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -332,9 +332,9 @@ acpi_tb_parse_root_table(acpi_physical_address rsdp_address) &table_index); if (ACPI_SUCCESS(status) && - ACPI_COMPARE_NAME(&acpi_gbl_root_table_list. - tables[table_index].signature, - ACPI_SIG_FADT)) { + ACPI_COMPARE_NAMESEG(&acpi_gbl_root_table_list. 
+ tables[table_index].signature, + ACPI_SIG_FADT)) { acpi_gbl_fadt_index = table_index; acpi_tb_parse_fadt(); } diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c index 36592888f0e7..1640685bf4ae 100644 --- a/drivers/acpi/acpica/tbxface.c +++ b/drivers/acpi/acpica/tbxface.c @@ -230,7 +230,7 @@ acpi_get_table_header(char *signature, for (i = 0, j = 0; i < acpi_gbl_root_table_list.current_table_count; i++) { - if (!ACPI_COMPARE_NAME + if (!ACPI_COMPARE_NAMESEG (&(acpi_gbl_root_table_list.tables[i].signature), signature)) { continue; @@ -323,7 +323,7 @@ acpi_get_table(char *signature, i++) { table_desc = &acpi_gbl_root_table_list.tables[i]; - if (!ACPI_COMPARE_NAME(&table_desc->signature, signature)) { + if (!ACPI_COMPARE_NAMESEG(&table_desc->signature, signature)) { continue; } diff --git a/drivers/acpi/acpica/tbxfload.c b/drivers/acpi/acpica/tbxfload.c index 1a2592cc3245..4f30f06a6f78 100644 --- a/drivers/acpi/acpica/tbxfload.c +++ b/drivers/acpi/acpica/tbxfload.c @@ -118,7 +118,7 @@ acpi_status acpi_tb_load_namespace(void) table = &acpi_gbl_root_table_list.tables[acpi_gbl_dsdt_index]; if (!acpi_gbl_root_table_list.current_table_count || - !ACPI_COMPARE_NAME(table->signature.ascii, ACPI_SIG_DSDT) || + !ACPI_COMPARE_NAMESEG(table->signature.ascii, ACPI_SIG_DSDT) || ACPI_FAILURE(acpi_tb_validate_table(table))) { status = AE_NO_ACPI_TABLES; goto unlock_and_exit; @@ -170,11 +170,12 @@ acpi_status acpi_tb_load_namespace(void) table = &acpi_gbl_root_table_list.tables[i]; if (!table->address || - (!ACPI_COMPARE_NAME(table->signature.ascii, ACPI_SIG_SSDT) - && !ACPI_COMPARE_NAME(table->signature.ascii, - ACPI_SIG_PSDT) - && !ACPI_COMPARE_NAME(table->signature.ascii, - ACPI_SIG_OSDT)) + (!ACPI_COMPARE_NAMESEG + (table->signature.ascii, ACPI_SIG_SSDT) + && !ACPI_COMPARE_NAMESEG(table->signature.ascii, + ACPI_SIG_PSDT) + && !ACPI_COMPARE_NAMESEG(table->signature.ascii, + ACPI_SIG_OSDT)) || ACPI_FAILURE(acpi_tb_validate_table(table))) { continue; } @@ -364,7 +365,7 @@ acpi_status acpi_unload_parent_table(acpi_handle object) * only these types can contain AML and thus are the only types * that can create namespace objects. 
*/ - if (ACPI_COMPARE_NAME + if (ACPI_COMPARE_NAMESEG (acpi_gbl_root_table_list.tables[i].signature.ascii, ACPI_SIG_DSDT)) { status = AE_TYPE; diff --git a/drivers/acpi/acpica/utmisc.c b/drivers/acpi/acpica/utmisc.c index afaadc73196b..8638efacdbf4 100644 --- a/drivers/acpi/acpica/utmisc.c +++ b/drivers/acpi/acpica/utmisc.c @@ -59,10 +59,10 @@ u8 acpi_ut_is_aml_table(struct acpi_table_header *table) /* These are the only tables that contain executable AML */ - if (ACPI_COMPARE_NAME(table->signature, ACPI_SIG_DSDT) || - ACPI_COMPARE_NAME(table->signature, ACPI_SIG_PSDT) || - ACPI_COMPARE_NAME(table->signature, ACPI_SIG_SSDT) || - ACPI_COMPARE_NAME(table->signature, ACPI_SIG_OSDT) || + if (ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_DSDT) || + ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_PSDT) || + ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_SSDT) || + ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_OSDT) || ACPI_IS_OEM_SIG(table->signature)) { return (TRUE); } diff --git a/drivers/acpi/acpica/utpredef.c b/drivers/acpi/acpica/utpredef.c index a9f08f43c685..1b0f68f5ed8c 100644 --- a/drivers/acpi/acpica/utpredef.c +++ b/drivers/acpi/acpica/utpredef.c @@ -84,7 +84,7 @@ const union acpi_predefined_info *acpi_ut_match_predefined_method(char *name) this_name = acpi_gbl_predefined_methods; while (this_name->info.name[0]) { - if (ACPI_COMPARE_NAME(name, this_name->info.name)) { + if (ACPI_COMPARE_NAMESEG(name, this_name->info.name)) { return (this_name); } @@ -201,7 +201,7 @@ const union acpi_predefined_info *acpi_ut_match_resource_name(char *name) this_name = acpi_gbl_resource_names; while (this_name->info.name[0]) { - if (ACPI_COMPARE_NAME(name, this_name->info.name)) { + if (ACPI_COMPARE_NAMESEG(name, this_name->info.name)) { return (this_name); } diff --git a/drivers/acpi/acpica/utstring.c b/drivers/acpi/acpica/utstring.c index 89dc79798d38..927797355da9 100644 --- a/drivers/acpi/acpica/utstring.c +++ b/drivers/acpi/acpica/utstring.c @@ -141,7 +141,7 @@ void acpi_ut_repair_name(char *name) * Special case for the root node. This can happen if we get an * error during the execution of module-level code. 
*/ - if (ACPI_COMPARE_NAME(name, ACPI_ROOT_PATHNAME)) { + if (ACPI_COMPARE_NAMESEG(name, ACPI_ROOT_PATHNAME)) { return; } diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 446c959a8f08..3fb331fb6e82 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2260,7 +2260,7 @@ int __init __acpi_probe_device_table(struct acpi_probe_entry *ap_head, int nr) mutex_lock(&acpi_probe_mutex); for (ape = ap_head; nr; ape++, nr--) { - if (ACPI_COMPARE_NAME(ACPI_SIG_MADT, ape->id)) { + if (ACPI_COMPARE_NAMESEG(ACPI_SIG_MADT, ape->id)) { acpi_probe_count = 0; acpi_table_parse_madt(ape->type, acpi_match_madt, 0); count += acpi_probe_count; diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index fa76f5e41b5c..262d6ee4735d 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -368,10 +368,10 @@ static int acpi_table_attr_init(struct kobject *tables_obj, char instance_str[ACPI_INST_SIZE]; sysfs_attr_init(&table_attr->attr.attr); - ACPI_MOVE_NAME(table_attr->name, table_header->signature); + ACPI_COPY_NAMESEG(table_attr->name, table_header->signature); list_for_each_entry(attr, &acpi_table_attr_list, node) { - if (ACPI_COMPARE_NAME(table_attr->name, attr->name)) + if (ACPI_COMPARE_NAMESEG(table_attr->name, attr->name)) if (table_attr->instance < attr->instance) table_attr->instance = attr->instance; } @@ -382,7 +382,7 @@ static int acpi_table_attr_init(struct kobject *tables_obj, return -ERANGE; } - ACPI_MOVE_NAME(table_attr->filename, table_header->signature); + ACPI_COPY_NAMESEG(table_attr->filename, table_header->signature); table_attr->filename[ACPI_NAME_SIZE] = '\0'; if (table_attr->instance > 1 || (table_attr->instance == 1 && !acpi_get_table @@ -484,7 +484,7 @@ static int acpi_table_data_init(struct acpi_table_header *th) int i; for (i = 0; i < NUM_ACPI_DATA_OBJS; i++) { - if (ACPI_COMPARE_NAME(th->signature, acpi_data_objs[i].name)) { + if (ACPI_COMPARE_NAMESEG(th->signature, acpi_data_objs[i].name)) { data_attr = kzalloc(sizeof(*data_attr), GFP_KERNEL); if (!data_attr) return -ENOMEM; diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 8fccbe49612a..1ffd7b9eefc5 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -670,8 +670,8 @@ static void __init acpi_table_initrd_scan(void) table_length = table->length; /* Skip RSDT/XSDT which should only be used for override */ - if (ACPI_COMPARE_NAME(table->signature, ACPI_SIG_RSDT) || - ACPI_COMPARE_NAME(table->signature, ACPI_SIG_XSDT)) { + if (ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_RSDT) || + ACPI_COMPARE_NAMESEG(table->signature, ACPI_SIG_XSDT)) { acpi_os_unmap_memory(table, ACPI_HEADER_SIZE); goto next_table; } diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index bfe54189f242..cb51172a47a3 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -515,10 +515,10 @@ typedef u64 acpi_integer; /* Optimizations for 4-character (32-bit) acpi_name manipulation */ #ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED -#define ACPI_COMPARE_NAME(a,b) (*ACPI_CAST_PTR (u32, (a)) == *ACPI_CAST_PTR (u32, (b))) +#define ACPI_COMPARE_NAMESEG(a,b) (*ACPI_CAST_PTR (u32, (a)) == *ACPI_CAST_PTR (u32, (b))) #define ACPI_COPY_NAMESEG(dest,src) (*ACPI_CAST_PTR (u32, (dest)) = *ACPI_CAST_PTR (u32, (src))) #else -#define ACPI_COMPARE_NAME(a,b) (!strncmp (ACPI_CAST_PTR (char, (a)), ACPI_CAST_PTR (char, (b)), ACPI_NAME_SIZE)) +#define ACPI_COMPARE_NAMESEG(a,b) (!strncmp (ACPI_CAST_PTR (char, (a)), ACPI_CAST_PTR (char, (b)), ACPI_NAME_SIZE)) #define ACPI_COPY_NAMESEG(dest,src) (strncpy (ACPI_CAST_PTR 
(char, (dest)), ACPI_CAST_PTR (char, (src)), ACPI_NAME_SIZE)) #endif diff --git a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c index 587b701a7edb..2686d858225a 100644 --- a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c +++ b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c @@ -293,7 +293,7 @@ static acpi_status osl_add_table_to_list(char *signature, u32 instance) } else { next = gbl_table_list_head; while (1) { - if (ACPI_COMPARE_NAME(next->signature, signature)) { + if (ACPI_COMPARE_NAMESEG(next->signature, signature)) { if (next->instance == instance) { found = TRUE; } @@ -782,11 +782,11 @@ osl_get_bios_table(char *signature, /* Handle special tables whose addresses are not in RSDT/XSDT */ - if (ACPI_COMPARE_NAME(signature, ACPI_RSDP_NAME) || - ACPI_COMPARE_NAME(signature, ACPI_SIG_RSDT) || - ACPI_COMPARE_NAME(signature, ACPI_SIG_XSDT) || - ACPI_COMPARE_NAME(signature, ACPI_SIG_DSDT) || - ACPI_COMPARE_NAME(signature, ACPI_SIG_FACS)) { + if (ACPI_COMPARE_NAMESEG(signature, ACPI_RSDP_NAME) || + ACPI_COMPARE_NAMESEG(signature, ACPI_SIG_RSDT) || + ACPI_COMPARE_NAMESEG(signature, ACPI_SIG_XSDT) || + ACPI_COMPARE_NAMESEG(signature, ACPI_SIG_DSDT) || + ACPI_COMPARE_NAMESEG(signature, ACPI_SIG_FACS)) { find_next_instance: @@ -797,7 +797,7 @@ find_next_instance: * careful about the FADT length and validate table addresses. * Note: The 64-bit addresses have priority. */ - if (ACPI_COMPARE_NAME(signature, ACPI_SIG_DSDT)) { + if (ACPI_COMPARE_NAMESEG(signature, ACPI_SIG_DSDT)) { if (current_instance < 2) { if ((gbl_fadt->header.length >= MIN_FADT_FOR_XDSDT) && gbl_fadt->Xdsdt @@ -815,7 +815,7 @@ find_next_instance: dsdt; } } - } else if (ACPI_COMPARE_NAME(signature, ACPI_SIG_FACS)) { + } else if (ACPI_COMPARE_NAMESEG(signature, ACPI_SIG_FACS)) { if (current_instance < 2) { if ((gbl_fadt->header.length >= MIN_FADT_FOR_XFACS) && gbl_fadt->Xfacs @@ -833,7 +833,7 @@ find_next_instance: facs; } } - } else if (ACPI_COMPARE_NAME(signature, ACPI_SIG_XSDT)) { + } else if (ACPI_COMPARE_NAMESEG(signature, ACPI_SIG_XSDT)) { if (!gbl_revision) { return (AE_BAD_SIGNATURE); } @@ -842,7 +842,7 @@ find_next_instance: (acpi_physical_address)gbl_rsdp. xsdt_physical_address; } - } else if (ACPI_COMPARE_NAME(signature, ACPI_SIG_RSDT)) { + } else if (ACPI_COMPARE_NAMESEG(signature, ACPI_SIG_RSDT)) { if (current_instance == 0) { table_address = (acpi_physical_address)gbl_rsdp. @@ -931,7 +931,7 @@ find_next_instance: /* Does this table match the requested signature? 
*/ - if (!ACPI_COMPARE_NAME (mapped_table->signature, signature)) { osl_unmap_table(mapped_table); mapped_table = NULL; @@ -1086,8 +1086,8 @@ osl_map_table(acpi_size address, return (AE_BAD_SIGNATURE); } } else - if (!ACPI_COMPARE_NAME(signature, mapped_table->signature)) - { + if (!ACPI_COMPARE_NAMESEG + (signature, mapped_table->signature)) { acpi_os_unmap_memory(mapped_table, sizeof(struct acpi_table_header)); return (AE_BAD_SIGNATURE); @@ -1236,7 +1236,7 @@ osl_read_table_from_file(char *filename, status = AE_BAD_SIGNATURE; goto exit; } - } else if (!ACPI_COMPARE_NAME(signature, header.signature)) { + } else if (!ACPI_COMPARE_NAMESEG(signature, header.signature)) { fprintf(stderr, "Incorrect signature: Expecting %4.4s, found %4.4s\n", signature, header.signature); @@ -1329,7 +1329,7 @@ osl_get_customized_table(char *pathname, /* Ignore meaningless files */ - if (!ACPI_COMPARE_NAME(filename, signature)) { + if (!ACPI_COMPARE_NAMESEG(filename, signature)) { continue; } diff --git a/tools/power/acpi/tools/acpidump/apdump.c b/tools/power/acpi/tools/acpidump/apdump.c index e256c2ac5ddc..9dfcc0329d34 100644 --- a/tools/power/acpi/tools/acpidump/apdump.c +++ b/tools/power/acpi/tools/acpidump/apdump.c @@ -310,9 +310,9 @@ int ap_dump_table_by_name(char *signature) /* To be friendly, handle tables whose signatures do not match the name */ - if (ACPI_COMPARE_NAME(local_signature, "FADT")) { + if (ACPI_COMPARE_NAMESEG(local_signature, "FADT")) { strcpy(local_signature, ACPI_SIG_FADT); - } else if (ACPI_COMPARE_NAME(local_signature, "MADT")) { + } else if (ACPI_COMPARE_NAMESEG(local_signature, "MADT")) { strcpy(local_signature, ACPI_SIG_MADT); } -- cgit v1.2.3 From d75f773c86a2b8b7278e2c33343b46a4024bc002 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 25 Mar 2019 21:32:28 +0200 Subject: treewide: Switch printk users from %pf and %pF to %ps and %pS, respectively %pF and %pf are functionally equivalent to %pS and %ps conversion specifiers. The former are deprecated; switch the current users to the preferred variants. The changes have been produced by the following command: git grep -l '%p[fF]' | grep -v '^\(tools\|Documentation\)/' | \ while read i; do perl -i -pe 's/%pf/%ps/g; s/%pF/%pS/g;' $i; done The result was then verified. Link: http://lkml.kernel.org/r/20190325193229.23390-1-sakari.ailus@linux.intel.com Cc: Andy Shevchenko Cc: linux-arm-kernel@lists.infradead.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: xen-devel@lists.xenproject.org Cc: linux-acpi@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-nvdimm@lists.01.org Cc: linux-pci@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: linux-btrfs@vger.kernel.org Cc: linux-f2fs-devel@lists.sourceforge.net Cc: linux-mm@kvack.org Cc: ceph-devel@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Sakari Ailus Acked-by: David Sterba (for btrfs) Acked-by: Mike Rapoport (for mm/memblock.c) Acked-by: Bjorn Helgaas (for drivers/pci) Acked-by: Rafael J.
Wysocki Signed-off-by: Petr Mladek --- arch/alpha/kernel/pci_iommu.c | 20 ++++++++++---------- arch/arm/mach-imx/pm-imx6.c | 2 +- arch/arm/mm/alignment.c | 2 +- arch/arm/nwfpe/fpmodule.c | 2 +- arch/microblaze/mm/pgtable.c | 2 +- arch/sparc/kernel/ds.c | 2 +- arch/um/kernel/sysrq.c | 2 +- arch/x86/include/asm/trace/exceptions.h | 2 +- arch/x86/kernel/irq_64.c | 2 +- arch/x86/mm/extable.c | 4 ++-- arch/x86/xen/multicalls.c | 2 +- drivers/acpi/device_pm.c | 2 +- drivers/base/power/main.c | 6 +++--- drivers/base/syscore.c | 12 ++++++------ drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/floppy.c | 10 +++++----- drivers/cpufreq/cpufreq.c | 2 +- drivers/mmc/core/quirks.h | 2 +- drivers/nvdimm/bus.c | 2 +- drivers/nvdimm/dimm_devs.c | 2 +- drivers/pci/pci-driver.c | 14 +++++++------- drivers/pci/quirks.c | 4 ++-- drivers/pnp/quirks.c | 2 +- drivers/scsi/esp_scsi.c | 2 +- fs/btrfs/tests/free-space-tree-tests.c | 4 ++-- fs/f2fs/f2fs.h | 2 +- fs/pstore/inode.c | 2 +- include/trace/events/btrfs.h | 2 +- include/trace/events/cpuhp.h | 4 ++-- include/trace/events/preemptirq.h | 2 +- include/trace/events/rcu.h | 4 ++-- include/trace/events/sunrpc.h | 2 +- include/trace/events/vmscan.h | 4 ++-- include/trace/events/workqueue.h | 4 ++-- include/trace/events/xen.h | 2 +- init/main.c | 6 +++--- kernel/async.c | 4 ++-- kernel/events/uprobes.c | 2 +- kernel/fail_function.c | 2 +- kernel/irq/debugfs.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/manage.c | 2 +- kernel/irq/spurious.c | 4 ++-- kernel/rcu/tree.c | 2 +- kernel/stop_machine.c | 2 +- kernel/time/sched_clock.c | 2 +- kernel/time/timer.c | 2 +- kernel/workqueue.c | 12 ++++++------ lib/error-inject.c | 2 +- lib/percpu-refcount.c | 4 ++-- mm/memblock.c | 14 +++++++------- mm/memory.c | 2 +- mm/vmscan.c | 2 +- net/ceph/osd_client.c | 2 +- net/core/net-procfs.c | 2 +- net/core/netpoll.c | 4 ++-- 56 files changed, 106 insertions(+), 106 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index aa0f50d0f823..4a2ae862b19e 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -237,7 +237,7 @@ static int pci_dac_dma_supported(struct pci_dev *dev, u64 mask) ok = 0; /* If both conditions above are met, we are fine. */ - DBGA("pci_dac_dma_supported %s from %pf\n", + DBGA("pci_dac_dma_supported %s from %ps\n", ok ? 
"yes" : "no", __builtin_return_address(0)); return ok; @@ -269,7 +269,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, && paddr + size <= __direct_map_size) { ret = paddr + __direct_map_base; - DBGA2("pci_map_single: [%p,%zx] -> direct %llx from %pf\n", + DBGA2("pci_map_single: [%p,%zx] -> direct %llx from %ps\n", cpu_addr, size, ret, __builtin_return_address(0)); return ret; @@ -280,7 +280,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, if (dac_allowed) { ret = paddr + alpha_mv.pci_dac_offset; - DBGA2("pci_map_single: [%p,%zx] -> DAC %llx from %pf\n", + DBGA2("pci_map_single: [%p,%zx] -> DAC %llx from %ps\n", cpu_addr, size, ret, __builtin_return_address(0)); return ret; @@ -317,7 +317,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, ret = arena->dma_base + dma_ofs * PAGE_SIZE; ret += (unsigned long)cpu_addr & ~PAGE_MASK; - DBGA2("pci_map_single: [%p,%zx] np %ld -> sg %llx from %pf\n", + DBGA2("pci_map_single: [%p,%zx] np %ld -> sg %llx from %ps\n", cpu_addr, size, npages, ret, __builtin_return_address(0)); return ret; @@ -384,14 +384,14 @@ static void alpha_pci_unmap_page(struct device *dev, dma_addr_t dma_addr, && dma_addr < __direct_map_base + __direct_map_size) { /* Nothing to do. */ - DBGA2("pci_unmap_single: direct [%llx,%zx] from %pf\n", + DBGA2("pci_unmap_single: direct [%llx,%zx] from %ps\n", dma_addr, size, __builtin_return_address(0)); return; } if (dma_addr > 0xffffffff) { - DBGA2("pci64_unmap_single: DAC [%llx,%zx] from %pf\n", + DBGA2("pci64_unmap_single: DAC [%llx,%zx] from %ps\n", dma_addr, size, __builtin_return_address(0)); return; } @@ -423,7 +423,7 @@ static void alpha_pci_unmap_page(struct device *dev, dma_addr_t dma_addr, spin_unlock_irqrestore(&arena->lock, flags); - DBGA2("pci_unmap_single: sg [%llx,%zx] np %ld from %pf\n", + DBGA2("pci_unmap_single: sg [%llx,%zx] np %ld from %ps\n", dma_addr, size, npages, __builtin_return_address(0)); } @@ -446,7 +446,7 @@ try_again: cpu_addr = (void *)__get_free_pages(gfp | __GFP_ZERO, order); if (! cpu_addr) { printk(KERN_INFO "pci_alloc_consistent: " - "get_free_pages failed from %pf\n", + "get_free_pages failed from %ps\n", __builtin_return_address(0)); /* ??? Really atomic allocation? Otherwise we could play with vmalloc and sg if we can't find contiguous memory. 
*/ @@ -465,7 +465,7 @@ try_again: goto try_again; } - DBGA2("pci_alloc_consistent: %zx -> [%p,%llx] from %pf\n", + DBGA2("pci_alloc_consistent: %zx -> [%p,%llx] from %ps\n", size, cpu_addr, *dma_addrp, __builtin_return_address(0)); return cpu_addr; @@ -485,7 +485,7 @@ static void alpha_pci_free_coherent(struct device *dev, size_t size, pci_unmap_single(pdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL); free_pages((unsigned long)cpu_addr, get_order(size)); - DBGA2("pci_free_consistent: [%llx,%zx] from %pf\n", + DBGA2("pci_free_consistent: [%llx,%zx] from %ps\n", dma_addr, size, __builtin_return_address(0)); } diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c index 87f45b926c78..e67e0b2d4ce0 100644 --- a/arch/arm/mach-imx/pm-imx6.c +++ b/arch/arm/mach-imx/pm-imx6.c @@ -631,7 +631,7 @@ static void imx6_pm_stby_poweroff(void) static int imx6_pm_stby_poweroff_probe(void) { if (pm_power_off) { - pr_warn("%s: pm_power_off already claimed %p %pf!\n", + pr_warn("%s: pm_power_off already claimed %p %ps!\n", __func__, pm_power_off, pm_power_off); return -EBUSY; } diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index b54f8f8def36..e376883ab35b 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -133,7 +133,7 @@ static const char *usermode_action[] = { static int alignment_proc_show(struct seq_file *m, void *v) { seq_printf(m, "User:\t\t%lu\n", ai_user); - seq_printf(m, "System:\t\t%lu (%pF)\n", ai_sys, ai_sys_last_pc); + seq_printf(m, "System:\t\t%lu (%pS)\n", ai_sys, ai_sys_last_pc); seq_printf(m, "Skipped:\t%lu\n", ai_skipped); seq_printf(m, "Half:\t\t%lu\n", ai_half); seq_printf(m, "Word:\t\t%lu\n", ai_word); diff --git a/arch/arm/nwfpe/fpmodule.c b/arch/arm/nwfpe/fpmodule.c index 1365e8650843..ee34c76e6624 100644 --- a/arch/arm/nwfpe/fpmodule.c +++ b/arch/arm/nwfpe/fpmodule.c @@ -147,7 +147,7 @@ void float_raise(signed char flags) #ifdef CONFIG_DEBUG_USER if (flags & debug) printk(KERN_DEBUG - "NWFPE: %s[%d] takes exception %08x at %pf from %08lx\n", + "NWFPE: %s[%d] takes exception %08x at %ps from %08lx\n", current->comm, current->pid, flags, __builtin_return_address(0), GET_USERREG()->ARM_pc); #endif diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index c2ce1e42b888..8fe54fda31dc 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -75,7 +75,7 @@ static void __iomem *__ioremap(phys_addr_t addr, unsigned long size, p >= memory_start && p < virt_to_phys(high_memory) && !(p >= __virt_to_phys((phys_addr_t)__bss_stop) && p < __virt_to_phys((phys_addr_t)__bss_stop))) { - pr_warn("__ioremap(): phys addr "PTE_FMT" is RAM lr %pf\n", + pr_warn("__ioremap(): phys addr "PTE_FMT" is RAM lr %ps\n", (unsigned long)p, __builtin_return_address(0)); return NULL; } diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c index f87265afb175..cad08ccce625 100644 --- a/arch/sparc/kernel/ds.c +++ b/arch/sparc/kernel/ds.c @@ -876,7 +876,7 @@ void ldom_power_off(void) static void ds_conn_reset(struct ds_info *dp) { - printk(KERN_ERR "ds-%llu: ds_conn_reset() from %pf\n", + printk(KERN_ERR "ds-%llu: ds_conn_reset() from %ps\n", dp->id, __builtin_return_address(0)); } diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 6b995e870d55..05585eef11d9 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -20,7 +20,7 @@ static void _print_addr(void *data, unsigned long address, int reliable) { - pr_info(" [<%08lx>] %s%pF\n", address, reliable ? "" : "? 
", + pr_info(" [<%08lx>] %s%pS\n", address, reliable ? "" : "? ", (void *)address); } diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index e0e6d7f21399..6b1e87194809 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -30,7 +30,7 @@ DECLARE_EVENT_CLASS(x86_exceptions, __entry->error_code = error_code; ), - TP_printk("address=%pf ip=%pf error_code=0x%lx", + TP_printk("address=%ps ip=%ps error_code=0x%lx", (void *)__entry->address, (void *)__entry->ip, __entry->error_code) ); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 0469cd078db1..4dff56658427 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -58,7 +58,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pS)\n", current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, estack_top, estack_bottom, (void *)regs->ip); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 3c4568f8fb28..b0a2de8d2f9e 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -145,7 +145,7 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup unsigned long error_code, unsigned long fault_addr) { - if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n", + if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) show_stack_regs(regs); @@ -162,7 +162,7 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup unsigned long error_code, unsigned long fault_addr) { - if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n", + if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, regs->ip, (void *)regs->ip)) show_stack_regs(regs); diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 0766a08bdf45..07054572297f 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -105,7 +105,7 @@ void xen_mc_flush(void) for (i = 0; i < b->mcidx; i++) { if (b->entries[i].result < 0) { #if MC_DEBUG - pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pF\n", + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pS\n", i + 1, b->debug[i].op, b->debug[i].args[0], diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 824ae985ad93..1aa0d014dc34 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -414,7 +414,7 @@ static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used) if (adev->wakeup.flags.notifier_present) { pm_wakeup_ws_event(adev->wakeup.ws, 0, acpi_s2idle_wakeup()); if (adev->wakeup.context.func) { - acpi_handle_debug(handle, "Running %pF for %s\n", + acpi_handle_debug(handle, "Running %pS for %s\n", adev->wakeup.context.func, dev_name(adev->wakeup.context.dev)); adev->wakeup.context.func(&adev->wakeup.context); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 5a8149829ab3..c88f56b9ae5b 100644 --- 
a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -205,7 +205,7 @@ static ktime_t initcall_debug_start(struct device *dev, void *cb) if (!pm_print_times_enabled) return 0; - dev_info(dev, "calling %pF @ %i, parent: %s\n", cb, + dev_info(dev, "calling %pS @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); @@ -223,7 +223,7 @@ static void initcall_debug_report(struct device *dev, ktime_t calltime, rettime = ktime_get(); nsecs = (s64) ktime_to_ns(ktime_sub(rettime, calltime)); - dev_info(dev, "%pF returned %d after %Ld usecs\n", cb, error, + dev_info(dev, "%pS returned %d after %Ld usecs\n", cb, error, (unsigned long long)nsecs >> 10); } @@ -2062,7 +2062,7 @@ EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, void *fn, int ret) { if (ret) - printk(KERN_ERR "%s(): %pF returns %d\n", function, fn, ret); + printk(KERN_ERR "%s(): %pS returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c index 6e076f359dcc..0d346a307140 100644 --- a/drivers/base/syscore.c +++ b/drivers/base/syscore.c @@ -62,19 +62,19 @@ int syscore_suspend(void) list_for_each_entry_reverse(ops, &syscore_ops_list, node) if (ops->suspend) { if (initcall_debug) - pr_info("PM: Calling %pF\n", ops->suspend); + pr_info("PM: Calling %pS\n", ops->suspend); ret = ops->suspend(); if (ret) goto err_out; WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", ops->suspend); + "Interrupts enabled after %pS\n", ops->suspend); } trace_suspend_resume(TPS("syscore_suspend"), 0, false); return 0; err_out: - pr_err("PM: System core suspend callback %pF failed.\n", ops->suspend); + pr_err("PM: System core suspend callback %pS failed.\n", ops->suspend); list_for_each_entry_continue(ops, &syscore_ops_list, node) if (ops->resume) @@ -100,10 +100,10 @@ void syscore_resume(void) list_for_each_entry(ops, &syscore_ops_list, node) if (ops->resume) { if (initcall_debug) - pr_info("PM: Calling %pF\n", ops->resume); + pr_info("PM: Calling %pS\n", ops->resume); ops->resume(); WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", ops->resume); + "Interrupts enabled after %pS\n", ops->resume); } trace_suspend_resume(TPS("syscore_resume"), 0, false); } @@ -122,7 +122,7 @@ void syscore_shutdown(void) list_for_each_entry_reverse(ops, &syscore_ops_list, node) if (ops->shutdown) { if (initcall_debug) - pr_info("PM: Calling %pF\n", ops->shutdown); + pr_info("PM: Calling %pS\n", ops->shutdown); ops->shutdown(); } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c7ad88d91a09..3e5fd97a3b4d 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -6116,7 +6116,7 @@ int drbd_ack_receiver(struct drbd_thread *thi) err = cmd->fn(connection, &pi); if (err) { - drbd_err(connection, "%pf failed\n", cmd->fn); + drbd_err(connection, "%ps failed\n", cmd->fn); goto reconnect; } diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 95f608d1a098..49f89db0766f 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -1693,7 +1693,7 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) /* we don't even know which FDC is the culprit */ pr_info("DOR0=%x\n", fdc_state[0].dor); pr_info("floppy interrupt on bizarre fdc %d\n", fdc); - pr_info("handler=%pf\n", handler); + pr_info("handler=%ps\n", handler); is_alive(__func__, "bizarre fdc"); return IRQ_NONE; } @@ -1752,7 +1752,7 @@ 
static void reset_interrupt(void) debugt(__func__, ""); result(); /* get the status ready for set_fdc */ if (FDCS->reset) { - pr_info("reset set in interrupt, calling %pf\n", cont->error); + pr_info("reset set in interrupt, calling %ps\n", cont->error); cont->error(); /* a reset just after a reset. BAD! */ } cont->redo(); @@ -1793,7 +1793,7 @@ static void show_floppy(void) pr_info("\n"); pr_info("floppy driver state\n"); pr_info("-------------------\n"); - pr_info("now=%lu last interrupt=%lu diff=%lu last called handler=%pf\n", + pr_info("now=%lu last interrupt=%lu diff=%lu last called handler=%ps\n", jiffies, interruptjiffies, jiffies - interruptjiffies, lasthandler); @@ -1812,9 +1812,9 @@ static void show_floppy(void) pr_info("status=%x\n", fd_inb(FD_STATUS)); pr_info("fdc_busy=%lu\n", fdc_busy); if (do_floppy) - pr_info("do_floppy=%pf\n", do_floppy); + pr_info("do_floppy=%ps\n", do_floppy); if (work_pending(&floppy_work)) - pr_info("floppy_work.func=%pf\n", floppy_work.func); + pr_info("floppy_work.func=%ps\n", floppy_work.func); if (delayed_work_pending(&fd_timer)) pr_info("delayed work.function=%p expires=%ld\n", fd_timer.work.func, diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0e626b00053b..d87ebc2d2e68 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -432,7 +432,7 @@ static void cpufreq_list_transition_notifiers(void) mutex_lock(&cpufreq_transition_notifier_list.mutex); for (nb = cpufreq_transition_notifier_list.head; nb; nb = nb->next) - pr_info("%pF\n", nb->notifier_call); + pr_info("%pS\n", nb->notifier_call); mutex_unlock(&cpufreq_transition_notifier_list.mutex); } diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h index dd2f73af8f2c..2d2d9ea8be4f 100644 --- a/drivers/mmc/core/quirks.h +++ b/drivers/mmc/core/quirks.h @@ -159,7 +159,7 @@ static inline void mmc_fixup_device(struct mmc_card *card, (f->ext_csd_rev == EXT_CSD_REV_ANY || f->ext_csd_rev == card->ext_csd.rev) && rev >= f->rev_start && rev <= f->rev_end) { - dev_dbg(&card->dev, "calling %pf\n", f->vendor_fixup); + dev_dbg(&card->dev, "calling %ps\n", f->vendor_fixup); f->vendor_fixup(card, f->data); } } diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 7bbff0af29b2..7ff684159f29 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -581,7 +581,7 @@ int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner, struct device_driver *drv = &nd_drv->drv; if (!nd_drv->type) { - pr_debug("driver type bitmask not set (%pf)\n", + pr_debug("driver type bitmask not set (%ps)\n", __builtin_return_address(0)); return -EINVAL; } diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index efe412a6b5b9..06f5087547ea 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -53,7 +53,7 @@ static int validate_dimm(struct nvdimm_drvdata *ndd) rc = nvdimm_check_config_data(ndd->dev); if (rc) - dev_dbg(ndd->dev, "%pf: %s error: %d\n", + dev_dbg(ndd->dev, "%ps: %s error: %d\n", __builtin_return_address(0), __func__, rc); return rc; } diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 79b1610a8beb..11a877461584 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -578,7 +578,7 @@ static int pci_legacy_suspend(struct device *dev, pm_message_t state) if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: Device state not saved by %pF\n", + "PCI PM: Device state not 
saved by %pS\n", drv->suspend); } } @@ -605,7 +605,7 @@ static int pci_legacy_suspend_late(struct device *dev, pm_message_t state) if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: Device state not saved by %pF\n", + "PCI PM: Device state not saved by %pS\n", drv->suspend_late); goto Fixup; } @@ -773,7 +773,7 @@ static int pci_pm_suspend(struct device *dev) if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: State of device not saved by %pF\n", + "PCI PM: State of device not saved by %pS\n", pm->suspend); } } @@ -821,7 +821,7 @@ static int pci_pm_suspend_noirq(struct device *dev) if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: State of device not saved by %pF\n", + "PCI PM: State of device not saved by %pS\n", pm->suspend_noirq); goto Fixup; } @@ -1260,11 +1260,11 @@ static int pci_pm_runtime_suspend(struct device *dev) * log level. */ if (error == -EBUSY || error == -EAGAIN) { - dev_dbg(dev, "can't suspend now (%pf returned %d)\n", + dev_dbg(dev, "can't suspend now (%ps returned %d)\n", pm->runtime_suspend, error); return error; } else if (error) { - dev_err(dev, "can't suspend (%pf returned %d)\n", + dev_err(dev, "can't suspend (%ps returned %d)\n", pm->runtime_suspend, error); return error; } @@ -1276,7 +1276,7 @@ static int pci_pm_runtime_suspend(struct device *dev) && !pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: State of device not saved by %pF\n", + "PCI PM: State of device not saved by %pS\n", pm->runtime_suspend); return 0; } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index e2a879e93d86..56cd4c4a170c 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -36,7 +36,7 @@ static ktime_t fixup_debug_start(struct pci_dev *dev, void (*fn)(struct pci_dev *dev)) { if (initcall_debug) - pci_info(dev, "calling %pF @ %i\n", fn, task_pid_nr(current)); + pci_info(dev, "calling %pS @ %i\n", fn, task_pid_nr(current)); return ktime_get(); } @@ -51,7 +51,7 @@ static void fixup_debug_report(struct pci_dev *dev, ktime_t calltime, delta = ktime_sub(rettime, calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; if (initcall_debug || duration > 10000) - pci_info(dev, "%pF took %lld usecs\n", fn, duration); + pci_info(dev, "%pS took %lld usecs\n", fn, duration); } static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, diff --git a/drivers/pnp/quirks.c b/drivers/pnp/quirks.c index 803666ae3635..de99f371d362 100644 --- a/drivers/pnp/quirks.c +++ b/drivers/pnp/quirks.c @@ -458,7 +458,7 @@ void pnp_fixup_device(struct pnp_dev *dev) for (f = pnp_fixups; *f->id; f++) { if (!compare_pnp_id(dev->id, f->id)) continue; - pnp_dbg(&dev->dev, "%s: calling %pF\n", f->id, + pnp_dbg(&dev->dev, "%s: calling %pS\n", f->id, f->quirk_function); f->quirk_function(dev); } diff --git a/drivers/scsi/esp_scsi.c b/drivers/scsi/esp_scsi.c index 465df475f753..76fd02ccbf49 100644 --- a/drivers/scsi/esp_scsi.c +++ b/drivers/scsi/esp_scsi.c @@ -1031,7 +1031,7 @@ static int esp_check_spur_intr(struct esp *esp) static void esp_schedule_reset(struct esp *esp) { - esp_log_reset("esp_schedule_reset() from %pf\n", + esp_log_reset("esp_schedule_reset() from 
%ps\n", __builtin_return_address(0)); esp->flags |= ESP_FLAG_RESETTING; esp_event(esp, ESP_EVENT_RESET); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 89346da890cf..f7a969b986eb 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -539,7 +539,7 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize, ret = run_test(test_func, 0, sectorsize, nodesize, alignment); if (ret) { test_err( - "%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", + "%ps failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } @@ -547,7 +547,7 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize, ret = run_test(test_func, 1, sectorsize, nodesize, alignment); if (ret) { test_err( - "%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", + "%ps failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8f23ee6e8eb9..02d0cd199828 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1337,7 +1337,7 @@ struct f2fs_private_dio { #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(type) \ - printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \ + printk_ratelimited("%sF2FS-fs : inject %s in %s of %pS\n", \ KERN_INFO, f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index c60ee46f3e39..29e94e0b6d73 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -115,7 +115,7 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v) rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off); - seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n", + seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %ps <- %pS\n", pstore_ftrace_decode_cpu(rec), pstore_ftrace_read_timestamp(rec), rec->ip, rec->parent_ip, (void *)rec->ip, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index ab1cc33adbac..b9b7465be5eb 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1345,7 +1345,7 @@ DECLARE_EVENT_CLASS(btrfs__work, __entry->normal_work = &work->normal_work; ), - TP_printk_btrfs("work=%p (normal_work=%p) wq=%p func=%pf ordered_func=%p " + TP_printk_btrfs("work=%p (normal_work=%p) wq=%p func=%ps ordered_func=%p " "ordered_free=%p", __entry->work, __entry->normal_work, __entry->wq, __entry->func, __entry->ordered_func, __entry->ordered_free) diff --git a/include/trace/events/cpuhp.h b/include/trace/events/cpuhp.h index fe1d6e8cd99d..ad16f77310c6 100644 --- a/include/trace/events/cpuhp.h +++ b/include/trace/events/cpuhp.h @@ -30,7 +30,7 @@ TRACE_EVENT(cpuhp_enter, __entry->fun = fun; ), - TP_printk("cpu: %04u target: %3d step: %3d (%pf)", + TP_printk("cpu: %04u target: %3d step: %3d (%ps)", __entry->cpu, __entry->target, __entry->idx, __entry->fun) ); @@ -58,7 +58,7 @@ TRACE_EVENT(cpuhp_multi_enter, __entry->fun = fun; ), - TP_printk("cpu: %04u target: %3d step: %3d (%pf)", + TP_printk("cpu: %04u target: %3d step: %3d (%ps)", __entry->cpu, __entry->target, __entry->idx, __entry->fun) ); diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index 9a0d4ceeb166..95fba0471e5b 100644 --- a/include/trace/events/preemptirq.h +++ 
b/include/trace/events/preemptirq.h @@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(preemptirq_template, __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext); ), - TP_printk("caller=%pF parent=%pF", + TP_printk("caller=%pS parent=%pS", (void *)((unsigned long)(_stext) + __entry->caller_offs), (void *)((unsigned long)(_stext) + __entry->parent_offs)) ); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index f0c4d10e614b..80339fd14c1c 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -491,7 +491,7 @@ TRACE_EVENT(rcu_callback, __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%pf %ld/%ld", + TP_printk("%s rhp=%p func=%ps %ld/%ld", __entry->rcuname, __entry->rhp, __entry->func, __entry->qlen_lazy, __entry->qlen) ); @@ -587,7 +587,7 @@ TRACE_EVENT(rcu_invoke_callback, __entry->func = rhp->func; ), - TP_printk("%s rhp=%p func=%pf", + TP_printk("%s rhp=%p func=%ps", __entry->rcuname, __entry->rhp, __entry->func) ); diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 0d5d0d91f861..7b8b987d332e 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(rpc_task_running, __entry->flags = task->tk_flags; ), - TP_printk("task:%u@%d flags=%4.4x state=%4.4lx status=%d action=%pf", + TP_printk("task:%u@%d flags=%4.4x state=%4.4lx status=%d action=%ps", __entry->task_id, __entry->client_id, __entry->flags, __entry->runstate, diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index a1cb91342231..252327dbfa51 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -226,7 +226,7 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->priority = priority; ), - TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", + TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, @@ -265,7 +265,7 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->total_scan = total_scan; ), - TP_printk("%pF %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + TP_printk("%pS %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", __entry->shrink, __entry->shr, __entry->nid, diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index 9a761bc6a251..e172549283be 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -60,7 +60,7 @@ TRACE_EVENT(workqueue_queue_work, __entry->cpu = pwq->pool->cpu; ), - TP_printk("work struct=%p function=%pf workqueue=%p req_cpu=%u cpu=%u", + TP_printk("work struct=%p function=%ps workqueue=%p req_cpu=%u cpu=%u", __entry->work, __entry->function, __entry->workqueue, __entry->req_cpu, __entry->cpu) ); @@ -102,7 +102,7 @@ TRACE_EVENT(workqueue_execute_start, __entry->function = work->func; ), - TP_printk("work struct %p: function %pf", __entry->work, __entry->function) + TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /** diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index fdcf88bcf0ea..9a0e8af21310 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -73,7 +73,7 @@ TRACE_EVENT(xen_mc_callback, __entry->fn = fn; __entry->data = data; ), - TP_printk("callback %pf, data %p", + TP_printk("callback %ps, data %p", __entry->fn, __entry->data) ); diff --git 
a/init/main.c b/init/main.c index c86a1c8f19f4..3192c6cc5382 100644 --- a/init/main.c +++ b/init/main.c @@ -826,7 +826,7 @@ trace_initcall_start_cb(void *data, initcall_t fn) { ktime_t *calltime = (ktime_t *)data; - printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); + printk(KERN_DEBUG "calling %pS @ %i\n", fn, task_pid_nr(current)); *calltime = ktime_get(); } @@ -840,7 +840,7 @@ trace_initcall_finish_cb(void *data, initcall_t fn, int ret) rettime = ktime_get(); delta = ktime_sub(rettime, *calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; - printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", + printk(KERN_DEBUG "initcall %pS returned %d after %lld usecs\n", fn, ret, duration); } @@ -897,7 +897,7 @@ int __init_or_module do_one_initcall(initcall_t fn) strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); local_irq_enable(); } - WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); + WARN(msgbuf[0], "initcall %pS returned with %s\n", fn, msgbuf); add_latent_entropy(); return ret; diff --git a/kernel/async.c b/kernel/async.c index f6bd0d9885e1..12c332e4e13e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index affa830a198c..48abc0f18eae 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2028,7 +2028,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (uc->handler) { rc = uc->handler(uc, regs); WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + "bad rc=0x%x from %ps()\n", rc, uc->handler); } if (uc->ret_handler) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..feb80712b913 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v) { struct fei_attr *attr = list_entry(v, struct fei_attr, list); - seq_printf(m, "%pf\n", attr->kp.addr); + seq_printf(m, "%ps\n", attr->kp.addr); return 0; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 516c00a5e867..c1eccd4f6520 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); - seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "handler: %ps\n", desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6df5ddfdb0f8..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, 
unsigned int *flags res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", irq, action->handler)) local_irq_disable(); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ec34a2a6638..ec43ab2fdfda 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -778,7 +778,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) ret = 0; break; default: - pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) */ raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %pf", + printk(KERN_CONT " threaded [<%p>] %ps", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..8eee921b384d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2870,7 +2870,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); return; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..7231fb5953fc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,7 +513,7 @@ repeat: } preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); + "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; } } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..1002cf61700a 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); - pr_debug("Registered %pF as sched_clock source\n", read); + pr_debug("Registered %pS as sched_clock source\n", read); } void __init generic_sched_clock_init(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..6502c3ed317e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1328,7 +1328,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list lock_map_release(&lockdep_map); if (count != preempt_count()) { - WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. 
That gives us a decent diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7abbeed13421..8ea9e4fb8cc6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2277,7 +2277,7 @@ __acquires(&pool->lock) if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" - " last function: %pf\n", + " last function: %ps\n", current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); @@ -2596,11 +2596,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, worker = current_wq_worker(); WARN_ONCE(current->flags & PF_MEMALLOC, - "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), - "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } @@ -4582,7 +4582,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); @@ -4607,7 +4607,7 @@ static void pr_cont_work(bool comma, struct work_struct *work) pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { - pr_cont("%s %pf", comma ? "," : "", work->func); + pr_cont("%s %ps", comma ? "," : "", work->func); } } @@ -4639,7 +4639,7 @@ static void show_pwq(struct pool_workqueue *pwq) if (worker->current_pwq != pwq) continue; - pr_cont("%s %d%s:%pf", comma ? "," : "", + pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), worker == pwq->wq->rescuer ? 
"(RESCUER)" : "", worker->current_func); diff --git a/lib/error-inject.c b/lib/error-inject.c index c0d4600f4896..aa63751c916f 100644 --- a/lib/error-inject.c +++ b/lib/error-inject.c @@ -189,7 +189,7 @@ static int ei_seq_show(struct seq_file *m, void *v) { struct ei_entry *ent = list_entry(v, struct ei_entry, list); - seq_printf(m, "%pf\t%s\n", (void *)ent->start_addr, + seq_printf(m, "%ps\t%s\n", (void *)ent->start_addr, error_type_string(ent->etype)); return 0; } diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 9877682e49c7..da54318d3b55 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -151,7 +151,7 @@ static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count); WARN_ONCE(atomic_long_read(&ref->count) <= 0, - "percpu ref (%pf) <= 0 (%ld) after switching to atomic", + "percpu ref (%ps) <= 0 (%ld) after switching to atomic", ref->release, atomic_long_read(&ref->count)); /* @ref is viewed as dead on all CPUs, send out switch confirmation */ @@ -333,7 +333,7 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, - "%s called more than once on %pf!", __func__, ref->release); + "%s called more than once on %ps!", __func__, ref->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; __percpu_ref_switch_mode(ref, confirm_kill); diff --git a/mm/memblock.c b/mm/memblock.c index 470601115892..9db5e51babd2 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -701,7 +701,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg("memblock_add: [%pa-%pa] %pF\n", + memblock_dbg("memblock_add: [%pa-%pa] %pS\n", &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); @@ -820,7 +820,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg(" memblock_free: [%pa-%pa] %pF\n", + memblock_dbg(" memblock_free: [%pa-%pa] %pS\n", &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); @@ -831,7 +831,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n", + memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n", &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); @@ -1466,7 +1466,7 @@ void * __init memblock_alloc_try_nid_raw( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); @@ -1502,7 +1502,7 @@ void * __init memblock_alloc_try_nid_nopanic( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); @@ -1538,7 +1538,7 @@ void * __init memblock_alloc_try_nid( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, @@ -1567,7 
+1567,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) phys_addr_t cursor, end; end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] %pF\n", + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); cursor = PFN_UP(base); diff --git a/mm/memory.c b/mm/memory.c index 47fe250307c7..3541a15067f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -519,7 +519,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_page(page, "bad pte"); pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", + pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, diff --git a/mm/vmscan.c b/mm/vmscan.c index a5ad0b35ab8e..90648187f622 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -493,7 +493,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan += delta; if (total_scan < 0) { - pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", + pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; next_deferred = nr; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index fa9530dd876e..6f739de28918 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2398,7 +2398,7 @@ static void finish_request(struct ceph_osd_request *req) static void __complete_request(struct ceph_osd_request *req) { - dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, + dout("%s req %p tid %llu cb %ps result %d\n", __func__, req, req->r_tid, req->r_callback, req->r_result); if (req->r_callback) diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 63881f72ef71..36347933ec3a 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -258,7 +258,7 @@ static int ptype_seq_show(struct seq_file *seq, void *v) else seq_printf(seq, "%04x", ntohs(pt->type)); - seq_printf(seq, " %-8s %pf\n", + seq_printf(seq, " %-8s %ps\n", pt->dev ? pt->dev->name : "", pt->func); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 361aabffb8c0..bf5446192d6a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -149,7 +149,7 @@ static void poll_one_napi(struct napi_struct *napi) * indicate that we are clearing the Tx path only. */ work = napi->poll(napi, 0); - WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll); + WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll); trace_napi_poll(napi, work, 0); clear_bit(NAPI_STATE_NPSVC, &napi->state); @@ -346,7 +346,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, } WARN_ONCE(!irqs_disabled(), - "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", + "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n", dev->name, dev->netdev_ops->ndo_start_xmit); } -- cgit v1.2.3 From 39ea9baffda91df8bfee9b45610242a3191ea1ec Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:30 +0200 Subject: x86/fpu: Remove fpu->initialized usage in __fpu__restore_sig() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a preparation for the removal of the ->initialized member in the fpu struct. 
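[ ed: For reference, the field this series is built around, roughly as declared in arch/x86/include/asm/fpu/types.h before the series -- a trimmed sketch, not the complete struct:

	struct fpu {
		/* CPU the in-register FPU state was last loaded on */
		unsigned int		last_cpu;

		/*
		 * @initialized: whether this context holds a valid,
		 * restorable FPU state for the task.
		 */
		unsigned char		initialized;

		/* in-memory image of the FPU registers */
		union fpregs_state	state;
	};

The patches below remove each read and write of ->initialized until the field itself can be deleted at the end of the series. ]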
__fpu__restore_sig() deactivates the FPU via fpu__drop() and then manually sets ->initialized, followed by fpu__restore(). The result is that it is possible to manipulate fpu->state, and the state of the registers won't be saved/restored on a context switch, which would otherwise overwrite fpu->state: fpu__drop(fpu): ... fpu->initialized = 0; preempt_enable(); <--- context switch Don't access fpu->state while its content is being read from user space and examined/sanitized. Use a temporary kmalloc() buffer for the preparation of the FPU registers and, once the state is considered okay, load it. Should something go wrong, return with an error and without altering the original FPU registers. The removal of fpu__initialize() is a nop because fpu->initialized is already set for the user task. [ bp: Massage a bit. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Acked-by: Borislav Petkov Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-2-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/signal.h | 2 +- arch/x86/kernel/fpu/regset.c | 5 ++--- arch/x86/kernel/fpu/signal.c | 40 +++++++++++++++------------------------ 3 files changed, 18 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 44bbc39a57b3..7fb516b6893a 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -22,7 +22,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, extern void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk); -extern void convert_to_fxsr(struct task_struct *tsk, +extern void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env); unsigned long diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index bc02f5144b95..5dbc099178a8 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -269,11 +269,10 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) memcpy(&to[i], &from[i], sizeof(to[0])); } -void convert_to_fxsr(struct task_struct *tsk, +void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env) { - struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -350,7 +349,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); if (!ret) - convert_to_fxsr(target, &env); + convert_to_fxsr(&target->thread.fpu.state.fxsave, &env); /* * update the header bit in the xsave header, indicating the diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 55b80de13ea5..7296a9bb78e7 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -207,11 +207,11 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) } static inline void -sanitize_restored_xstate(struct task_struct *tsk, +sanitize_restored_xstate(union fpregs_state *state, struct user_i387_ia32_struct *ia32_env, u64 xfeatures, int fx_only) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &state->xsave; struct
xstate_header *header = &xsave->header; if (use_xsave()) { @@ -238,7 +238,7 @@ sanitize_restored_xstate(struct task_struct *tsk, */ xsave->i387.mxcsr &= mxcsr_feature_mask; - convert_to_fxsr(tsk, ia32_env); + convert_to_fxsr(&state->fxsave, ia32_env); } } @@ -284,8 +284,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(buf, size)) return -EACCES; - fpu__initialize(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, 0, sizeof(struct user_i387_ia32_struct), @@ -315,40 +313,32 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * header. Validate and sanitize the copied state. */ struct user_i387_ia32_struct env; + union fpregs_state *state; int err = 0; + void *tmp; - /* - * Drop the current fpu which clears fpu->initialized. This ensures - * that any context-switch during the copy of the new state, - * avoids the intermediate state from getting restored/saved. - * Thus avoiding the new restored state from getting corrupted. - * We will be ready to restore/save the state only after - * fpu->initialized is again set. - */ - fpu__drop(fpu); + tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + state = PTR_ALIGN(tmp, 64); if (using_compacted_format()) { - err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); + err = copy_user_to_xstate(&state->xsave, buf_fx); } else { - err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + err = __copy_from_user(&state->xsave, buf_fx, state_size); if (!err && state_size > offsetof(struct xregs_state, header)) - err = validate_xstate_header(&fpu->state.xsave.header); + err = validate_xstate_header(&state->xsave.header); } if (err || __copy_from_user(&env, buf, sizeof(env))) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); err = -1; } else { - sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); + sanitize_restored_xstate(state, &env, xfeatures, fx_only); + copy_kernel_to_fpregs(state); } - local_bh_disable(); - fpu->initialized = 1; - fpu__restore(fpu); - local_bh_enable(); - + kfree(tmp); return err; } else { /* -- cgit v1.2.3 From 6dd677a044e606fd343e31c2108b13d74aec1ca5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:31 +0200 Subject: x86/fpu: Remove fpu__restore() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users of fpu__restore() so it is time to remove it. The comment regarding fpu__restore() and TS bit is stale since commit b3b0870ef3ffe ("i387: do not preload FPU state at task switch time") and has no meaning since. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: Babu Moger Cc: "Chang S. Bae" Cc: Dmitry Safonov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: Joerg Roedel Cc: Jonathan Corbet Cc: kvm ML Cc: linux-doc@vger.kernel.org Cc: Nicolai Stange Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-3-bigeasy@linutronix.de --- Documentation/preempt-locking.txt | 1 - arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/fpu/core.c | 24 ------------------------ arch/x86/kernel/process_32.c | 4 +--- arch/x86/kernel/process_64.c | 4 +--- 5 files changed, 2 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/preempt-locking.txt b/Documentation/preempt-locking.txt index 509f5a422d57..dce336134e54 100644 --- a/Documentation/preempt-locking.txt +++ b/Documentation/preempt-locking.txt @@ -52,7 +52,6 @@ preemption must be disabled around such regions. Note, some FPU functions are already explicitly preempt safe. For example, kernel_fpu_begin and kernel_fpu_end will disable and enable preemption. -However, fpu__restore() must be called with preemption disabled. RULE #3: Lock acquire and release must be performed by same task diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index fb04a3ded7dd..75a1d5f712ee 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -28,7 +28,6 @@ extern void fpu__initialize(struct fpu *fpu); extern void fpu__prepare_read(struct fpu *fpu); extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); -extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2e5003fef51a..1d3ae7988f7f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -303,30 +303,6 @@ void fpu__prepare_write(struct fpu *fpu) } } -/* - * 'fpu__restore()' is called to copy FPU registers from - * the FPU fpstate to the live hw registers and to activate - * access to the hardware registers, so that FPU instructions - * can be used afterwards. - * - * Must be called with kernel preemption disabled (for example - * with local interrupts disabled, as it is in the case of - * do_device_not_available()). - */ -void fpu__restore(struct fpu *fpu) -{ - fpu__initialize(fpu); - - /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ - kernel_fpu_disable(); - trace_x86_fpu_before_restore(fpu); - fpregs_activate(fpu); - copy_kernel_to_fpregs(&fpu->state); - trace_x86_fpu_after_restore(fpu); - kernel_fpu_enable(); -} -EXPORT_SYMBOL_GPL(fpu__restore); - /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index e471d8e6f0b2..7888a41a03cd 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -267,9 +267,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before fpu__restore(), so the TS bit is up - * to date. + * the GDT and LDT are properly updated. 
*/ arch_end_context_switch(next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6a62f4af9fcf..e1983b3a16c4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -538,9 +538,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Leave lazy mode, flushing any hypercalls made here. This * must be done after loading TLS entries in the GDT but before - loading segments that might reference them, and and it must - be done before fpu__restore(), so the TS bit is up to - date. + loading segments that might reference them. */ arch_end_context_switch(next_p); -- cgit v1.2.3 From 60e528d6ce3f60a058bbb64f8acb2a07f84b172a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:32 +0200 Subject: x86/fpu: Remove preempt_disable() in fpu__clear() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The preempt_disable() section was introduced in commit a10b6a16cdad8 ("x86/fpu: Make the fpu state change in fpu__clear() scheduler-atomic") and it was said to be temporary. fpu__initialize() initializes the FPU struct to its initial value and then sets ->initialized to 1. The last part is the important one. The content of the state does not matter because it gets set via copy_init_fpstate_to_fpregs(). A preemption here has little meaning because the registers will always be set to the same content after copy_init_fpstate_to_fpregs(). A softirq with a kernel_fpu_begin() could also force the FPU's registers to be saved after fpu__initialize() without changing the outcome here. Remove the preempt_disable() section in fpu__clear(); preemption here does not hurt. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Nicolai Stange Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-4-bigeasy@linutronix.de --- arch/x86/kernel/fpu/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1d3ae7988f7f..1940319268ae 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -366,11 +366,9 @@ void fpu__clear(struct fpu *fpu) * Make sure fpstate is cleared and initialized. */ if (static_cpu_has(X86_FEATURE_FPU)) { - preempt_disable(); fpu__initialize(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); - preempt_enable(); } } -- cgit v1.2.3 From 88f5260a3bf9bfb276b5b4aac2e81587e425a1d7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:33 +0200 Subject: x86/fpu: Always init the state in fpu__clear() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fpu__clear() only initializes the state if the CPU has FPU support. This initialization is also required for FPU-less systems and takes place in math_emulate(). Since fpu__initialize() only performs the initialization if ->initialized is zero, it does not matter that it is invoked each time an opcode is emulated. It makes the removal of ->initialized easier if the struct is also initialized in the FPU-less case at the same time. Move fpu__initialize() before the FPU feature check so it is also performed in the FPU-less case. [ bp: Massage a bit.
] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: Bill Metzenthen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Nicolai Stange Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-5-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/fpu/core.c | 5 ++--- arch/x86/math-emu/fpu_entry.c | 3 --- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 75a1d5f712ee..70ecb7c032cb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -24,7 +24,6 @@ /* * High level FPU state handling functions: */ -extern void fpu__initialize(struct fpu *fpu); extern void fpu__prepare_read(struct fpu *fpu); extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1940319268ae..e43296854e37 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -223,7 +223,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Activate the current task's in-memory FPU context, * if it has not been used before: */ -void fpu__initialize(struct fpu *fpu) +static void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); @@ -236,7 +236,6 @@ void fpu__initialize(struct fpu *fpu) fpu->initialized = 1; } } -EXPORT_SYMBOL_GPL(fpu__initialize); /* * This function must be called before we read a task's fpstate. @@ -365,8 +364,8 @@ void fpu__clear(struct fpu *fpu) /* * Make sure fpstate is cleared and initialized. */ + fpu__initialize(fpu); if (static_cpu_has(X86_FEATURE_FPU)) { - fpu__initialize(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); } diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 9e2ba7e667f6..a873da6b46d6 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -113,9 +113,6 @@ void math_emulate(struct math_emu_info *info) unsigned long code_base = 0; unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ struct desc_struct code_descriptor; - struct fpu *fpu = &current->thread.fpu; - - fpu__initialize(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { -- cgit v1.2.3 From fbcc9e0c37ba3186c41b5eb1aee2f7f3b711bc1e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:34 +0200 Subject: x86/fpu: Remove fpu->initialized usage in copy_fpstate_to_sigframe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With lazy-FPU support, the variable (now named ->initialized) was set to true if the CPU's FPU registers were holding a valid state of the FPU registers for the active process. If it was set to false then the FPU state was saved in fpu->state and the FPU was deactivated. With lazy-FPU gone, ->initialized is always true for user threads, and kernel threads never call this function, so ->initialized is always true in copy_fpstate_to_sigframe(). The using_compacted_format() check is also a leftover from the lazy-FPU time. In the ->initialized == false case copy_to_user() would copy the compacted buffer while userland would expect the non-compacted format instead.
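[ ed: A note on "compacted" for readers joining here: with XSAVES the kernel's xsave buffer stores only the state components enabled in the header's xcomp_bv field, packed back-to-back, whereas the signal frame must use the standard XSAVE layout with fixed architectural offsets. A compacted buffer therefore cannot be copied into the user frame verbatim; sketched with the helpers visible in the surrounding diffs:

	/*
	 * Sketch, not part of the patch: with a compacted kernel
	 * buffer, a component's offset depends on which other
	 * components are enabled, so the only safe way to produce
	 * the standard layout is to XSAVE the live registers
	 * straight into the frame.
	 */
	if (using_compacted_format())
		err = copy_fpregs_to_sigframe(buf_fx);

which is why the old code below forces the save-live-registers path whenever using_compacted_format() is true. ]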
So, in order to save the FPU state in the non-compacted form, it issues XSAVE to save the *current* FPU state. If the FPU is not enabled, the attempt raises the FPU trap, the trap restores the FPU contents and re-enables the FPU, and XSAVE is invoked again and succeeds. *This* no longer works since commit bef8b6da9522 ("x86/fpu: Handle #NM without FPU emulation as an error"). Remove the check for ->initialized because it is always true, and remove the false condition. Update the comment to reflect that the state is always live. [ bp: Massage. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-6-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 7296a9bb78e7..c1a5999affa0 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -144,9 +144,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * If the fpu, extended register state is live, save the state directly - * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, - * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * Save the state directly to the user frame pointed by the aligned pointer + * 'buf_fx'. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -157,7 +156,6 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct fpu *fpu = &current->thread.fpu; - struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -172,29 +170,12 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpu->initialized || using_compacted_format()) { - /* Save the live register state to the user directly. */ - if (copy_fpregs_to_sigframe(buf_fx)) - return -1; - /* Update the thread's fxstate to save the fsave header. */ - if (ia32_fxstate) - copy_fxregs_to_kernel(fpu); - } else { - /* - * It is a *bug* if kernel uses compacted-format for xsave - * area and we copy it out directly to a signal frame. It - * should have been handled above by saving the registers - * directly. - */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) { - WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n"); - return -1; - } - - fpstate_sanitize_xstate(fpu); - if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) - return -1; - } + /* Save the live registers state to the user frame directly. */ + if (copy_fpregs_to_sigframe(buf_fx)) + return -1; + /* Update the thread's fxstate to save the fsave header. */ + if (ia32_fxstate) + copy_fxregs_to_kernel(fpu); /* Save the fsave header for the 32-bit frames.
*/ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) -- cgit v1.2.3 From 547571b5abe61bb33c6005d8981e86e3c61fedcc Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 27 Mar 2019 09:15:19 -0600 Subject: x86/asm: Modernize sync_bitops.h Add missing instruction suffixes and use rmwcc.h just as was (more or less) recently done for bitops.h, see: 22636f8c9511: x86/asm: Add instruction suffixes to bitops 288e4521f0f6: x86/asm: 'Simplify' GEN_*_RMWcc() macros No change in functionality intended. Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5C9B93870200007800222289@prv1-mh.provo.novell.com [ Cleaned up the changelog a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sync_bitops.h | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 2fe745356fb1..6d8d6bc183b7 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -14,6 +14,8 @@ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ +#include <asm/rmwcc.h> + #define ADDR (*(volatile long *)addr) /** @@ -29,7 +31,7 @@ */ static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; bts %1,%0" + asm volatile("lock; " __ASM_SIZE(bts) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -47,7 +49,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr) */ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btr %1,%0" + asm volatile("lock; " __ASM_SIZE(btr) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -64,7 +66,7 @@ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) */ static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btc %1,%0" + asm volatile("lock; " __ASM_SIZE(btc) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -78,14 +80,9 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier.
*/ -static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) +static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; - - asm volatile("lock; bts %2,%1\n\tsetc %0" - : "=qm" (oldbit), "+m" (ADDR) - : "Ir" (nr) : "memory"); - return oldbit; + return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(bts), *addr, c, "Ir", nr); } /** @@ -98,12 +95,7 @@ static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; - - asm volatile("lock; btr %2,%1\n\tsetc %0" - : "=qm" (oldbit), "+m" (ADDR) - : "Ir" (nr) : "memory"); - return oldbit; + return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btr), *addr, c, "Ir", nr); } /** @@ -116,12 +108,7 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; - - asm volatile("lock; btc %2,%1\n\tsetc %0" - : "=qm" (oldbit), "+m" (ADDR) - : "Ir" (nr) : "memory"); - return oldbit; + return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btc), *addr, c, "Ir", nr); } #define sync_test_bit(nr, addr) test_bit(nr, addr) -- cgit v1.2.3 From a5881bea88616e3aacf521dbdbe0e323257aaba1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Apr 2019 10:02:19 +0200 Subject: x86/Kconfig: Remove the unused X86_DMA_REMAP KConfig symbol Signed-off-by: Christoph Hellwig Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20190410080220.21705-2-hch@lst.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c1f9b3cf437c..7f93e013e6db 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -704,7 +704,6 @@ config STA2X11 depends on X86_32_NON_STANDARD && PCI select ARCH_HAS_PHYS_TO_DMA select X86_DEV_DMA_OPS - select X86_DMA_REMAP select SWIOTLB select MFD_STA2X11 select GPIOLIB @@ -2886,10 +2885,6 @@ config X86_DEV_DMA_OPS bool depends on X86_64 || STA2X11 -config X86_DMA_REMAP - bool - depends on STA2X11 - config HAVE_GENERIC_GUP def_bool y -- cgit v1.2.3 From fb346fd9fc081c3d978c3f3d26d39334527a2662 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:17 -0400 Subject: locking/lock_events: Make lock_events available for all archs & other locks The QUEUED_LOCK_STAT option to report queued spinlocks event counts was previously allowed only on x86 architecture. To make the locking event counting code more useful, it is now renamed to a more generic LOCK_EVENT_COUNTS config option. This new option will be available to all the architectures that use qspinlock at the moment. Other locking code can now start to use the generic locking event counting code by including lock_events.h and put the new locking event names into the lock_events_list.h header file. My experience with lock event counting is that it gives valuable insight on how the locking code works and what can be done to make it better. I would like to extend this benefit to other locking code like mutex and rwsem in the near future. The PV qspinlock specific code will stay in qspinlock_stat.h. The locking event counters will now reside in the /lock_event_counts directory. 
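[ ed: A minimal sketch of how another locking primitive could hook into the now-generic counters, per the stated intent -- the event and function names here are hypothetical, not part of this patch:

	/* kernel/locking/lock_events_list.h: declare the counter */
	LOCK_EVENT(foo_slowpath)	/* # of foo lock slowpath calls */

	/* in the lock implementation */
	#include "lock_events.h"

	static void foo_lock_slowpath(void)
	{
		/* compiles to nothing when CONFIG_LOCK_EVENT_COUNTS=n */
		lockevent_inc(foo_slowpath);
	}

With debugfs mounted in the usual place, the summed per-cpu count then shows up as /sys/kernel/debug/lock_event_counts/foo_slowpath. ]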
Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-9-longman@redhat.com Signed-off-by: Ingo Molnar --- arch/Kconfig | 10 +++ arch/x86/Kconfig | 8 --- kernel/locking/Makefile | 1 + kernel/locking/lock_events.c | 153 ++++++++++++++++++++++++++++++++++++++++ kernel/locking/lock_events.h | 10 ++- kernel/locking/qspinlock_stat.h | 141 ++++-------------------------------- 6 files changed, 183 insertions(+), 140 deletions(-) create mode 100644 kernel/locking/lock_events.c (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 33687dddd86a..28c0f1ad80d7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -901,6 +901,16 @@ config HAVE_ARCH_PREL32_RELOCATIONS config ARCH_USE_MEMREMAP_PROT bool +config LOCK_EVENT_COUNTS + bool "Locking event counts collection" + depends on DEBUG_FS + depends on QUEUED_SPINLOCKS + ---help--- + Enable light-weight counting of various locking related events + in the system with minimal performance impact. This reduces + the chance of application behavior change because of timing + differences. The counts are reported via debugfs. + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 84b184e016cd..7d160f58a8f6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -780,14 +780,6 @@ config PARAVIRT_SPINLOCKS If you are unsure how to answer this question, answer Y. -config QUEUED_LOCK_STAT - bool "Paravirt queued spinlock statistics" - depends on PARAVIRT_SPINLOCKS && DEBUG_FS - ---help--- - Enable the collection of statistical data on the slowpath - behavior of paravirtualized queued spinlocks and report - them on debugfs. - source "arch/x86/xen/Kconfig" config KVM_GUEST diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 1af83e9ce57d..6fe2f333aecb 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c new file mode 100644 index 000000000000..71c36d1fb834 --- /dev/null +++ b/kernel/locking/lock_events.c @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Authors: Waiman Long + */ + +/* + * Collect locking event counts + */ +#include +#include +#include +#include + +#include "lock_events.h" + +#undef LOCK_EVENT +#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, + +#define LOCK_EVENTS_DIR "lock_event_counts" + +/* + * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different + * types of locks will be reported under the /lock_event_counts/ + * directory. See lock_events_list.h for the list of available locking + * events. + * + * Writing to the special ".reset_counts" file will reset all the above + * locking event counts. This is a very slow operation and so should not + * be done frequently. + * + * These event counts are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counts usable even in a production + * environment. + */ +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + + [LOCKEVENT_reset_cnts] = ".reset_counts", +}; + +/* + * Per-cpu counts + */ +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * The lockevent_read() function can be overridden. + */ +ssize_t __weak lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, id, len; + u64 sum = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + id = (long)file_inode(file)->i_private; + + if (id >= lockevent_num) + return -EBADF; + + for_each_possible_cpu(cpu) + sum += per_cpu(lockevents[id], cpu); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When idx = reset_cnts, reset all the counts. + */ +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(lockevents, cpu); + + for (i = 0 ; i < lockevent_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_lockevent = { + .read = lockevent_read, + .write = lockevent_write, + .llseek = default_llseek, +}; + +/* + * Initialize debugfs for the locking event counts. + */ +static int __init init_lockevent_counts(void) +{ + struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); + int i; + + if (!d_counts) + goto out; + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. 
+ */ + for (i = 0; i < lockevent_num; i++) + if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent)) + goto fail_undo; + + if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + d_counts, (void *)(long)LOCKEVENT_reset_cnts, + &fops_lockevent)) + goto fail_undo; + + return 0; +fail_undo: + debugfs_remove_recursive(d_counts); +out: + pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR); + return -ENOMEM; +} +fs_initcall(init_lockevent_counts); diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 4009e07b474a..feb1acc54611 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -13,6 +13,9 @@ * Authors: Waiman Long */ +#ifndef __LOCKING_LOCK_EVENTS_H +#define __LOCKING_LOCK_EVENTS_H + enum lock_events { #include "lock_events_list.h" @@ -21,7 +24,7 @@ enum lock_events { LOCKEVENT_reset_cnts = lockevent_num, }; -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS /* * Per-cpu counters */ @@ -46,10 +49,11 @@ static inline void __lockevent_add(enum lock_events event, int inc) #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#else /* CONFIG_LOCK_EVENT_COUNTS */ #define lockevent_inc(ev) #define lockevent_add(ev, c) #define lockevent_cond_inc(ev, c) -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ +#endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 1db5b375fcf4..54152670ff24 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -9,76 +9,29 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Authors: Waiman Long + * Authors: Waiman Long */ -/* - * When queued spinlock statistical counters are enabled, the following - * debugfs files will be created for reporting the counter values: - * - * /qlockstat/ - * pv_hash_hops - average # of hops per hashing operation - * pv_kick_unlock - # of vCPU kicks issued at unlock time - * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake - * pv_latency_kick - average latency (ns) of vCPU kick operation - * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs - * pv_wait_again - # of wait's after a queue head vCPU kick - * pv_wait_early - # of early vCPU wait's - * pv_wait_head - # of vCPU wait's at the queue head - * pv_wait_node - # of vCPU wait's at a non-head queue node - * lock_pending - # of locking operations via pending code - * lock_slowpath - # of locking operations via MCS lock queue - * lock_use_node2 - # of locking operations that use 2nd per-CPU node - * lock_use_node3 - # of locking operations that use 3rd per-CPU node - * lock_use_node4 - # of locking operations that use 4th per-CPU node - * lock_no_node - # of locking operations without using per-CPU node - * - * Subtracting lock_use_node[234] from lock_slowpath will give you - * lock_use_node1. - * - * Writing to the special ".reset_counts" file will reset all the above - * counter values. - * - * These statistical counters are implemented as per-cpu variables which are - * summed and computed whenever the corresponding debugfs files are read. This - * minimizes added overhead making the counters usable even in a production - * environment. 
- * - * There may be slight difference between pv_kick_wake and pv_kick_unlock. - */ #include "lock_events.h" -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS +#ifdef CONFIG_PARAVIRT_SPINLOCKS /* - * Collect pvqspinlock statistics + * Collect pvqspinlock locking event counts */ -#include #include #include #include #define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev] -#undef LOCK_EVENT -#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, - -static const char * const lockevent_names[lockevent_num + 1] = { - -#include "lock_events_list.h" - - [LOCKEVENT_reset_cnts] = ".reset_counts", -}; - /* - * Per-cpu counters + * PV specific per-cpu counter */ -DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); static DEFINE_PER_CPU(u64, pv_kick_time); /* - * Function to read and return the qlock statistical counter values + * Function to read and return the PV qspinlock counts. * * The following counters are handled specially: * 1. pv_latency_kick @@ -88,8 +41,8 @@ static DEFINE_PER_CPU(u64, pv_kick_time); * 3. pv_hash_hops * Average hops/hash = pv_hash_hops/pv_kick_unlock */ -static ssize_t lockevent_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[64]; int cpu, id, len; @@ -149,78 +102,6 @@ static ssize_t lockevent_read(struct file *file, char __user *user_buf, return simple_read_from_buffer(user_buf, count, ppos, buf, len); } -/* - * Function to handle write request - * - * When id = .reset_cnts, reset all the counter values. - */ -static ssize_t lockevent_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - int cpu; - - /* - * Get the counter ID stored in file->f_inode->i_private - */ - if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) - return count; - - for_each_possible_cpu(cpu) { - int i; - unsigned long *ptr = per_cpu_ptr(lockevents, cpu); - - for (i = 0 ; i < lockevent_num; i++) - WRITE_ONCE(ptr[i], 0); - } - return count; -} - -/* - * Debugfs data structures - */ -static const struct file_operations fops_lockevent = { - .read = lockevent_read, - .write = lockevent_write, - .llseek = default_llseek, -}; - -/* - * Initialize debugfs for the qspinlock statistical counters - */ -static int __init init_qspinlock_stat(void) -{ - struct dentry *d_counts = debugfs_create_dir("qlockstat", NULL); - int i; - - if (!d_counts) - goto out; - - /* - * Create the debugfs files - * - * As reading from and writing to the stat files can be slow, only - * root is allowed to do the read/write to limit impact to system - * performance. 
- */ - for (i = 0; i < lockevent_num; i++) - if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, - (void *)(long)i, &fops_lockevent)) - goto fail_undo; - - if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, - d_counts, (void *)(long)LOCKEVENT_reset_cnts, - &fops_lockevent)) - goto fail_undo; - - return 0; -fail_undo: - debugfs_remove_recursive(d_counts); -out: - pr_warn("Could not create 'qlockstat' debugfs entries\n"); - return -ENOMEM; -} -fs_initcall(init_qspinlock_stat); - /* * PV hash hop count */ @@ -260,8 +141,10 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_LOCK_EVENT_COUNTS */ static inline void lockevent_pv_hop(int hopcnt) { } -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ -- cgit v1.2.3 From 39388e80f9b0c3788bfb6efe3054bdce0c3ead45 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:35 +0200 Subject: x86/fpu: Don't save fxregs for ia32 frames in copy_fpstate_to_sigframe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 72a671ced66db ("x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels") the 32bit and 64bit path of the signal delivery code were merged. The 32bit version: int save_i387_xstate_ia32(void __user *buf) … if (cpu_has_xsave) return save_i387_xsave(fp); if (cpu_has_fxsr) return save_i387_fxsave(fp); The 64bit version: int save_i387_xstate(void __user *buf) … if (user_has_fpu()) { if (use_xsave()) err = xsave_user(buf); else err = fxsave_user(buf); if (unlikely(err)) { __clear_user(buf, xstate_size); return err; The merge: int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) … if (user_has_fpu()) { /* Save the live register state to the user directly. */ if (save_user_xstate(buf_fx)) return -1; /* Update the thread's fxstate to save the fsave header. */ if (ia32_fxstate) fpu_fxsave(&tsk->thread.fpu); I don't think that we needed to save the FPU registers to ->thread.fpu because the registers were stored in buf_fx. Today the state will be restored from buf_fx after the signal was handled (I assume that this was also the case with lazy-FPU). Since commit 66463db4fc560 ("x86, fpu: shift drop_init_fpu() from save_xstate_sig() to handle_signal()") it is ensured that the signal handler starts with clear/fresh set of FPU registers which means that the previous store is futile. Remove the copy_fxregs_to_kernel() call because task's FPU state is cleared later in handle_signal() via fpu__clear(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-7-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c1a5999affa0..34989d2a8893 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -155,7 +155,6 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct fpu *fpu = ¤t->thread.fpu; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -173,9 +172,6 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) /* Save the live registers state to the user frame directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; - /* Update the thread's fxstate to save the fsave header. */ - if (ia32_fxstate) - copy_fxregs_to_kernel(fpu); /* Save the fsave header for the 32-bit frames. */ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) -- cgit v1.2.3 From 2722146eb78451b30e4717a267a3a2b44e4ad317 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:36 +0200 Subject: x86/fpu: Remove fpu->initialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct fpu.initialized member is always set to one for user tasks and zero for kernel tasks. This avoids saving/restoring the FPU registers for kernel threads. The ->initialized = 0 case for user tasks has been removed in previous changes, for instance, by doing an explicit unconditional init at fork() time for FPU-less systems which was otherwise delayed until the emulated opcode. The context switch code (switch_fpu_prepare() + switch_fpu_finish()) can't unconditionally save/restore registers for kernel threads. Not only would it slow down the switch but also load a zeroed xcomp_bv for XSAVES. For kernel_fpu_begin() (+end) the situation is similar: EFI with runtime services uses this before alternatives_patched is true. Which means that this function is used too early and it wasn't the case before. For those two cases, use current->mm to distinguish between user and kernel thread. For kernel_fpu_begin() skip save/restore of the FPU registers. During the context switch into a kernel thread don't do anything. There is no reason to save the FPU state of a kernel thread. The reordering in __switch_to() is important because the current() pointer needs to be valid before switch_fpu_finish() is invoked so ->mm is seen of the new task instead the old one. N.B.: fpu__save() doesn't need to check ->mm because it is called by user tasks only. [ bp: Massage. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: Babu Moger Cc: "Chang S. Bae" Cc: Dmitry Safonov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: Joerg Roedel Cc: kvm ML Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nicolai Stange Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Will Deacon Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-8-bigeasy@linutronix.de --- arch/x86/ia32/ia32_signal.c | 17 ++++----- arch/x86/include/asm/fpu/internal.h | 18 +++++----- arch/x86/include/asm/fpu/types.h | 9 ----- arch/x86/include/asm/trace/fpu.h | 5 +-- arch/x86/kernel/fpu/core.c | 70 +++++++++++-------------------------- arch/x86/kernel/fpu/init.c | 2 -- arch/x86/kernel/fpu/regset.c | 19 +++------- arch/x86/kernel/fpu/xstate.c | 2 -- arch/x86/kernel/process_32.c | 4 +-- arch/x86/kernel/process_64.c | 4 +-- arch/x86/kernel/signal.c | 17 ++++----- arch/x86/mm/pkeys.c | 7 +--- 12 files changed, 53 insertions(+), 121 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 321fe5f5d0e9..6eeb3249f22f 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -216,8 +216,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { - struct fpu *fpu = ¤t->thread.fpu; - unsigned long sp; + unsigned long sp, fx_aligned, math_size; /* Default to using normal stack */ sp = regs->sp; @@ -231,15 +230,11 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (fpu->initialized) { - unsigned long fx_aligned, math_size; - - sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); - *fpstate = (struct _fpstate_32 __user *) sp; - if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, - math_size) < 0) - return (void __user *) -1L; - } + sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); + *fpstate = (struct _fpstate_32 __user *) sp; + if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, + math_size) < 0) + return (void __user *) -1L; sp -= frame_size; /* Align the stack pointer according to the i386 ABI, diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 70ecb7c032cb..04042eacc852 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -494,11 +494,14 @@ static inline void fpregs_activate(struct fpu *fpu) * * - switch_fpu_finish() restores the new state as * necessary. + * + * The FPU context is only stored/restored for a user task and + * ->mm is used to distinguish between kernel and user threads. */ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) { + if (static_cpu_has(X86_FEATURE_FPU) && current->mm) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else @@ -506,8 +509,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) /* But leave fpu_fpregs_owner_ctx! 
*/ trace_x86_fpu_regs_deactivated(old_fpu); - } else - old_fpu->last_cpu = -1; + } } /* @@ -520,12 +522,12 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - bool preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->initialized; + if (static_cpu_has(X86_FEATURE_FPU)) { + if (!fpregs_state_valid(new_fpu, cpu)) { + if (current->mm) + copy_kernel_to_fpregs(&new_fpu->state); + } - if (preload) { - if (!fpregs_state_valid(new_fpu, cpu)) - copy_kernel_to_fpregs(&new_fpu->state); fpregs_activate(new_fpu); } } diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 2e32e178e064..f098f6cab94b 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -293,15 +293,6 @@ struct fpu { */ unsigned int last_cpu; - /* - * @initialized: - * - * This flag indicates whether this context is initialized: if the task - * is not running then we can restore from this context, if the task - * is running then we should save into this context. - */ - unsigned char initialized; - /* * @avx512_timestamp: * diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 069c04be1507..bd65f6ba950f 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -13,22 +13,19 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, initialized) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; - __entry->initialized = fpu->initialized; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->initialized, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e43296854e37..97e27de2b7c0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -101,7 +101,7 @@ static void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->initialized) { + if (current->mm) { /* * Ignore return value -- we don't care if reg state * is clobbered. 
@@ -116,7 +116,7 @@ static void __kernel_fpu_end(void) { struct fpu *fpu = &current->thread.fpu; - if (fpu->initialized) + if (current->mm) copy_kernel_to_fpregs(&fpu->state); kernel_fpu_enable(); @@ -147,11 +147,10 @@ void fpu__save(struct fpu *fpu) preempt_disable(); trace_x86_fpu_before_save(fpu); - if (fpu->initialized) { - if (!copy_fpregs_to_fpstate(fpu)) { - copy_kernel_to_fpregs(&fpu->state); - } - } + + if (!copy_fpregs_to_fpstate(fpu)) + copy_kernel_to_fpregs(&fpu->state); + trace_x86_fpu_after_save(fpu); preempt_enable(); } @@ -190,7 +189,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { dst_fpu->last_cpu = -1; - if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU)) + if (!static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != &current->thread.fpu); @@ -227,14 +226,10 @@ static void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); - if (!fpu->initialized) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); + fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); - trace_x86_fpu_activate_state(fpu); - /* Safe to do for the current task: */ - fpu->initialized = 1; - } + trace_x86_fpu_activate_state(fpu); } /* @@ -247,32 +242,20 @@ static void fpu__initialize(struct fpu *fpu) * * - or it's called for stopped tasks (ptrace), in which case the * registers were already saved by the context-switch code when - * the task scheduled out - we only have to initialize the registers - * if they've never been initialized. + * the task scheduled out. * * If the task has used the FPU before then save it. */ void fpu__prepare_read(struct fpu *fpu) { - if (fpu == &current->thread.fpu) { + if (fpu == &current->thread.fpu) fpu__save(fpu); - } else { - if (!fpu->initialized) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); - /* Safe to do for current and for stopped child tasks: */ - fpu->initialized = 1; - } - } } /* * This function must be called before we write a task's fpstate. * - * If the task has used the FPU before then invalidate any cached FPU registers. - * If the task has not used the FPU before then initialize its fpstate. + * Invalidate any cached FPU registers.
* * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will @@ -289,17 +272,8 @@ void fpu__prepare_write(struct fpu *fpu) */ WARN_ON_FPU(fpu == &current->thread.fpu); - if (fpu->initialized) { - /* Invalidate any cached state: */ - __fpu_invalidate_fpregs_state(fpu); - } else { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); - /* Safe to do for stopped child tasks: */ - fpu->initialized = 1; - } + /* Invalidate any cached state: */ + __fpu_invalidate_fpregs_state(fpu); } /* @@ -316,17 +290,13 @@ void fpu__drop(struct fpu *fpu) preempt_disable(); if (fpu == &current->thread.fpu) { - if (fpu->initialized) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - fpregs_deactivate(fpu); - } + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); } - fpu->initialized = 0; - trace_x86_fpu_dropped(fpu); preempt_enable(); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6abd83572b01..20d8fa7124c7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -239,8 +239,6 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - - WARN_ON_FPU(current->thread.fpu.initialized); } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 5dbc099178a8..d652b939ccfb 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -15,16 +15,12 @@ */ int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { - struct fpu *target_fpu = &target->thread.fpu; - - return target_fpu->initialized ? regset->n : 0; + return regset->n; } int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { - struct fpu *target_fpu = &target->thread.fpu; - - if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized) + if (boot_cpu_has(X86_FEATURE_FXSR)) return regset->n; else return 0; @@ -370,16 +366,9 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) { struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; - int fpvalid; - - fpvalid = fpu->initialized; - if (fpvalid) - fpvalid = !fpregs_get(tsk, NULL, - 0, sizeof(struct user_i387_ia32_struct), - ufpu, NULL); - return fpvalid; + return !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), + ufpu, NULL); } EXPORT_SYMBOL(dump_fpu); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d7432c2b1051..8bfcc5b9e04b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -892,8 +892,6 @@ const void *get_xsave_field_ptr(int xsave_state) { struct fpu *fpu = &current->thread.fpu; - if (!fpu->initialized) - return NULL; /* * fpu__save() takes the CPU's xstate registers * and saves them off to the 'fpu memory buffer.
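With the ->initialized flag gone, the remaining discriminator between tasks that carry user FPU state and those that do not is the mm pointer, as the hunks above and below show. A minimal sketch of that invariant (an illustrative helper, not something this patch adds; the patch open-codes the current->mm test):

	static inline bool fpu_tracks_user_state(void)
	{
		/* kernel threads have no mm and thus no user FPU state */
		return current->mm != NULL;
	}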
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7888a41a03cd..77d9eb43ccac 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -288,10 +288,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_fpu, cpu); - this_cpu_write(current_task, next_p); + switch_fpu_finish(next_fpu, cpu); + /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e1983b3a16c4..ffea7c557963 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -566,14 +566,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) x86_fsgsbase_load(prev, next); - switch_fpu_finish(next_fpu, cpu); - /* * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + switch_fpu_finish(next_fpu, cpu); + /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b419e1a1a0ce..87b327c6cb10 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -246,7 +246,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long sp = regs->sp; unsigned long buf_fx = 0; int onsigstack = on_sig_stack(sp); - struct fpu *fpu = &current->thread.fpu; + int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -265,11 +265,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = (unsigned long) ka->sa.sa_restorer; } - if (fpu->initialized) { - sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), - &buf_fx, &math_size); - *fpstate = (void __user *)sp; - } + sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), + &buf_fx, &math_size); + *fpstate = (void __user *)sp; sp = align_sigframe(sp - frame_size); @@ -281,8 +279,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (fpu->initialized && - copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) + ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); + if (ret < 0) return (void __user *)-1L; return (void __user *)sp; @@ -763,8 +761,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (fpu->initialized) - fpu__clear(fpu); + fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); } diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 047a77f6a10c..05bb9a44eb1c 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -39,17 +39,12 @@ int __execute_only_pkey(struct mm_struct *mm) * dance to set PKRU if we do not need to. Check it * first and assume that if the execute-only pkey is * write-disabled that we do not have to set it - * ourselves. We need preempt off so that nobody - * can make fpregs inactive. + * ourselves.
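+ *
+ * (Sketch of the fast path, not in the original comment: once the
+ * execute-only pkey has been set up, its access-disable bit is already
+ * present in PKRU, so the common case needs no PKRU write at all.)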
*/ - preempt_disable(); if (!need_to_set_mm_pkey && - current->thread.fpu.initialized && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { - preempt_enable(); return execute_only_pkey; } - preempt_enable(); /* * Set up PKRU so that it denies access for everything -- cgit v1.2.3 From 0169f53e0d97bb675075506810494bd86b8c934e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:37 +0200 Subject: x86/fpu: Remove user_fpu_begin() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit user_fpu_begin() sets fpu_fpregs_owner_ctx to the task's fpu struct. This is always the case since there is no lazy FPU anymore. fpu_fpregs_owner_ctx is used during context switch to decide if it needs to load the saved registers or if the currently loaded registers are valid. It could be skipped during a taskA -> kernel thread -> taskA switch because the switch to the kernel thread would not alter the CPU's FPU state. Since this field is always updated during context switch and never invalidated, setting it manually (in user context) makes no difference. A kernel thread within a kernel_fpu_begin() block could set fpu_fpregs_owner_ctx to NULL but a kernel thread does not use user_fpu_begin(). This is a leftover from the lazy-FPU time. Remove user_fpu_begin(), it does not change fpu_fpregs_owner_ctx's content. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Nicolai Stange Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-9-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 17 ----------------- arch/x86/kernel/fpu/core.c | 4 +--- arch/x86/kernel/fpu/signal.c | 1 - 3 files changed, 1 insertion(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 04042eacc852..54f70cae2f15 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -532,23 +532,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) } } -/* - * Needs to be preemption-safe. - * - * NOTE! user_fpu_begin() must be used only immediately before restoring - * the save state. It does not do any saving/restoring on its own. In - * lazy FPU mode, it is just an optimization to avoid a #NM exception, - * the task can lose the FPU right after preempt_enable(). - */ -static inline void user_fpu_begin(void) -{ - struct fpu *fpu = &current->thread.fpu; - - preempt_disable(); - fpregs_activate(fpu); - preempt_enable(); -} - /* * MXCSR and XCR definitions: */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 97e27de2b7c0..739ca3ae2bdc 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -335,10 +335,8 @@ void fpu__clear(struct fpu *fpu) * Make sure fpstate is cleared and initialized.
*/ fpu__initialize(fpu); - if (static_cpu_has(X86_FEATURE_FPU)) { - user_fpu_begin(); + if (static_cpu_has(X86_FEATURE_FPU)) copy_init_fpstate_to_fpregs(); - } } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 34989d2a8893..155f4552413e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -322,7 +322,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * For 64-bit frames and 32-bit fsave frames, restore the user * state to the registers directly (with exceptions handled). */ - user_fpu_begin(); if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { fpu__clear(fpu); return -1; -- cgit v1.2.3 From 4ee91519e1dccc175665fe24bb20a47c6053575c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Apr 2019 18:41:38 +0200 Subject: x86/fpu: Add an __fpregs_load_activate() internal helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper function that ensures the floating point registers for the current task are active. Use with preemption disabled. While at it, add fpregs_lock/unlock() helpers too, to be used in later patches. [ bp: Add a comment about its intended usage. ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-10-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/api.h | 11 +++++++++++ arch/x86/include/asm/fpu/internal.h | 22 ++++++++++++++-------- 2 files changed, 25 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index b56d504af654..73e684160f35 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -10,6 +10,7 @@ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H +#include /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It @@ -22,6 +23,16 @@ extern void kernel_fpu_begin(void); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); +static inline void fpregs_lock(void) +{ + preempt_disable(); +} + +static inline void fpregs_unlock(void) +{ + preempt_enable(); +} + /* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. * diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 54f70cae2f15..3e0c2c496f2d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -484,6 +484,18 @@ static inline void fpregs_activate(struct fpu *fpu) trace_x86_fpu_regs_activated(fpu); } +/* + * Internal helper, do not use directly. Use switch_fpu_return() instead. + */ +static inline void __fpregs_load_activate(struct fpu *fpu, int cpu) +{ + if (!fpregs_state_valid(fpu, cpu)) { + if (current->mm) + copy_kernel_to_fpregs(&fpu->state); + fpregs_activate(fpu); + } +} + /* * FPU state switching for scheduling. 
* @@ -522,14 +534,8 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU)) { - if (!fpregs_state_valid(new_fpu, cpu)) { - if (current->mm) - copy_kernel_to_fpregs(&new_fpu->state); - } - - fpregs_activate(new_fpu); - } + if (static_cpu_has(X86_FEATURE_FPU)) + __fpregs_load_activate(new_fpu, cpu); } /* -- cgit v1.2.3 From 07baeb04f37c952981d63359ff840118ce8f5434 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:39 +0200 Subject: x86/fpu: Make __raw_xsave_addr() use a feature number instead of mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most users of __raw_xsave_addr() use a feature number, shift it to a mask and then __raw_xsave_addr() shifts it back to the feature number. Make __raw_xsave_addr() use the feature number as an argument. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-11-bigeasy@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 8bfcc5b9e04b..4f7f3c5d0d0c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -805,20 +805,18 @@ void fpu__resume_cpu(void) } /* - * Given an xstate feature mask, calculate where in the xsave + * Given an xstate feature nr, calculate where in the xsave * buffer the state is. Callers should ensure that the buffer * is valid. */ -static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) +static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { - int feature_nr = fls64(xstate_feature_mask) - 1; - - if (!xfeature_enabled(feature_nr)) { + if (!xfeature_enabled(xfeature_nr)) { WARN_ON_FPU(1); return NULL; } - return (void *)xsave + xstate_comp_offsets[feature_nr]; + return (void *)xsave + xstate_comp_offsets[xfeature_nr]; } /* * Given the xsave area and a state inside, this function returns the @@ -840,6 +838,7 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask */ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) { + int xfeature_nr; /* * Do we even *have* xsave state? 
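 *
 * (Aside for the mask/number split used throughout this series: a
 * feature mask and a feature number are related by mask == BIT_ULL(nr)
 * and nr == fls64(mask) - 1, e.g.
 * XFEATURE_MASK_PKRU == BIT_ULL(XFEATURE_PKRU).)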
*/ @@ -867,7 +866,8 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) if (!(xsave->header.xfeatures & xstate_feature)) return NULL; - return __raw_xsave_addr(xsave, xstate_feature); + xfeature_nr = fls64(xstate_feature) - 1; + return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_GPL(get_xsave_addr); @@ -1014,7 +1014,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, 1 << i); + void *src = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1100,7 +1100,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, 1 << i); + void *src = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1157,7 +1157,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) u64 mask = ((u64)1 << i); if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, 1 << i); + void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1211,7 +1211,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) u64 mask = ((u64)1 << i); if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, 1 << i); + void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; -- cgit v1.2.3 From abd16d68d65229e5acafdadc32704239131bf2ea Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:40 +0200 Subject: x86/fpu: Use a feature number instead of mask in two more helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After changing the argument of __raw_xsave_addr() from a mask to a number, Dave suggested checking if it makes sense to do the same for get_xsave_addr(). As it turns out it does. Only get_xsave_addr() needs the mask to check if the requested feature is part of what is supported/saved and then uses the number again. The shift operation is cheaper compared to fls64() (find last bit set). Also, the feature number uses less opcode space compared to the mask. :) Make the get_xsave_addr() argument an xfeature number instead of a mask and fix up its callers. Furthermore, use xfeature_nr and xfeature_mask consistently. This results in the following changes to the kvm code: feature -> xfeature_mask index -> xfeature_nr Suggested-by: Dave Hansen Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "Eric W. Biederman" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A.
Donenfeld" Cc: kvm ML Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: "Radim Krčmář" Cc: Rasmus Villemoes Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Siarhei Liakh Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-12-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/xstate.c | 22 ++++++++++------------ arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/x86.c | 28 ++++++++++++++-------------- arch/x86/mm/mpx.c | 6 +++--- 5 files changed, 30 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 48581988d78c..fbe41f808e5d 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -46,8 +46,8 @@ extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); void fpu__xstate_clear_all_cpu_caps(void); -void *get_xsave_addr(struct xregs_state *xsave, int xstate); -const void *get_xsave_field_ptr(int xstate_field); +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); +const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4f7f3c5d0d0c..9c459fd1d38e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -830,15 +830,14 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) * * Inputs: * xstate: the thread's storage area for all FPU data - * xstate_feature: state which is defined in xsave.h (e.g. - * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...) + * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. */ -void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { - int xfeature_nr; /* * Do we even *have* xsave state? */ @@ -850,11 +849,11 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) * have not enabled. Remember that pcntxt_mask is * what we write to the XCR0 register. */ - WARN_ONCE(!(xfeatures_mask & xstate_feature), + WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to - * have requested that 'xstate_feature' be saved. + * have requested that 'xfeature_nr' be saved. * If it did not, we might be seeing and old value * of the field in the buffer. * @@ -863,10 +862,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) * or because the "init optimization" caused it * to not be saved. */ - if (!(xsave->header.xfeatures & xstate_feature)) + if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) return NULL; - xfeature_nr = fls64(xstate_feature) - 1; return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_GPL(get_xsave_addr); @@ -882,13 +880,13 @@ EXPORT_SYMBOL_GPL(get_xsave_addr); * Note that this only works on the current task. * * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, - * XFEATURE_MASK_SSE, etc...) + * @xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) 
* Output: * address of the state in the xsave area or NULL if the state * is not present or is in its 'init state'. */ -const void *get_xsave_field_ptr(int xsave_state) +const void *get_xsave_field_ptr(int xfeature_nr) { struct fpu *fpu = &current->thread.fpu; @@ -898,7 +896,7 @@ const void *get_xsave_field_ptr(int xsave_state) */ fpu__save(fpu); - return get_xsave_addr(&fpu->state.xsave, xsave_state); + return get_xsave_addr(&fpu->state.xsave, xfeature_nr); } #ifdef CONFIG_ARCH_HAS_PKEYS diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d26f9e9c3d83..8b6d03e55d2f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -456,7 +456,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) * which is all zeros which indicates MPX was not * responsible for the exception. */ - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) goto exit_trap; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 099b851dabaf..8022e7769b3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3674,15 +3674,15 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; - int index = fls64(feature) - 1; - void *src = get_xsave_addr(xsave, feature); + u64 xfeature_mask = valid & -valid; + int xfeature_nr = fls64(xfeature_mask) - 1; + void *src = get_xsave_addr(xsave, xfeature_nr); if (src) { u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, index, + cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); - if (feature == XFEATURE_MASK_PKRU) + if (xfeature_nr == XFEATURE_PKRU) memcpy(dest + offset, &vcpu->arch.pkru, sizeof(vcpu->arch.pkru)); else @@ -3690,7 +3690,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) } - valid -= feature; + valid -= xfeature_mask; } } @@ -3717,22 +3717,22 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; - int index = fls64(feature) - 1; - void *dest = get_xsave_addr(xsave, feature); + u64 xfeature_mask = valid & -valid; + int xfeature_nr = fls64(xfeature_mask) - 1; + void *dest = get_xsave_addr(xsave, xfeature_nr); if (dest) { u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, index, + cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); - if (feature == XFEATURE_MASK_PKRU) + if (xfeature_nr == XFEATURE_PKRU) memcpy(&vcpu->arch.pkru, src + offset, sizeof(vcpu->arch.pkru)); else memcpy(dest, src + offset, size); } - valid -= feature; + valid -= xfeature_mask; } } @@ -8850,11 +8850,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (init_event) kvm_put_guest_fpu(vcpu); mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_MASK_BNDREGS); + XFEATURE_BNDREGS); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_MASK_BNDCSR); + XFEATURE_BNDCSR); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); if (init_event) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index c805db6236b4..59726aaf4671 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -142,7 +142,7 @@ int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs) goto err_out; } /* get bndregs field from current task's xsave area */ - bndregs =
get_xsave_field_ptr(XFEATURE_MASK_BNDREGS); + bndregs = get_xsave_field_ptr(XFEATURE_BNDREGS); if (!bndregs) { err = -EINVAL; goto err_out; } @@ -190,7 +190,7 @@ static __user void *mpx_get_bounds_dir(void) * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. */ - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; @@ -376,7 +376,7 @@ static int do_mpx_bt_fault(void) const struct mpx_bndcsr *bndcsr; struct mm_struct *mm = current->mm; - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) return -EINVAL; /* -- cgit v1.2.3 From 7e94a7b659eefedda82cde97229a26f319fb1182 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Apr 2019 13:11:28 +0200 Subject: x86/microcode/intel: Refactor Intel microcode blob loading Change generic_load_microcode() to use the iov_iter API instead of a clumsy open-coded version which has to pay attention to __user data or kernel data, depending on the loading method. This allows avoiding explicit casts between user and kernel pointers. Because the iov_iter API makes it hard to read the same location twice, as a side effect, also fix a double-read of the microcode header (which could e.g. lead to out-of-bounds reads in microcode_sanity_check()). Not that it matters much, only root is allowed to load microcode anyway... [ bp: Massage a bit, sort function-local variables. ] Signed-off-by: Jann Horn Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: x86-ml Link: https://lkml.kernel.org/r/20190404111128.131157-1-jannh@google.com --- arch/x86/kernel/cpu/microcode/intel.c | 71 ++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 16936a24795c..a44bdbe7c55e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -861,32 +862,33 @@ out: return ret; } -static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; - int new_rev = uci->cpu_sig.rev; - unsigned int leftover = size; unsigned int curr_mc_size = 0, new_mc_size = 0; - unsigned int csig, cpf; enum ucode_state ret = UCODE_OK; + int new_rev = uci->cpu_sig.rev; + u8 *new_mc = NULL, *mc = NULL; + unsigned int csig, cpf; - while (leftover) { + while (iov_iter_count(iter)) { struct microcode_header_intel mc_header; - unsigned int mc_size; + unsigned int mc_size, data_size; + u8 *data; - if (leftover < sizeof(mc_header)) { - pr_err("error! Truncated header in microcode data file\n"); + if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) { + pr_err("error! Truncated or inaccessible header in microcode data file\n"); break; } - if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) - break; - mc_size = get_totalsize(&mc_header); - if (!mc_size || mc_size > leftover) { - pr_err("error! Bad data in microcode data file\n"); + if (mc_size < sizeof(mc_header)) { + pr_err("error!
Bad data in microcode data file (totalsize too small)\n"); + break; + } + data_size = mc_size - sizeof(mc_header); + if (data_size > iov_iter_count(iter)) { + pr_err("error! Bad data in microcode data file (truncated file?)\n"); break; } @@ -899,7 +901,9 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, curr_mc_size = mc_size; } - if (get_ucode_data(mc, ucode_ptr, mc_size) || + memcpy(mc, &mc_header, sizeof(mc_header)); + data = mc + sizeof(mc_header); + if (!copy_from_iter_full(data, data_size, iter) || microcode_sanity_check(mc, 1) < 0) { break; } @@ -914,14 +918,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, mc = NULL; /* trigger new vmalloc */ ret = UCODE_NEW; } - - ucode_ptr += mc_size; - leftover -= mc_size; } vfree(mc); - if (leftover) { + if (iov_iter_count(iter)) { vfree(new_mc); return UCODE_ERROR; } @@ -945,12 +946,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, return ret; } -static int get_ucode_fw(void *to, const void *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -977,10 +972,12 @@ static bool is_blacklisted(unsigned int cpu) static enum ucode_state request_microcode_fw(int cpu, struct device *device, bool refresh_fw) { - char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; + struct iov_iter iter; enum ucode_state ret; + struct kvec kvec; + char name[30]; if (is_blacklisted(cpu)) return UCODE_NFOUND; @@ -993,26 +990,30 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return UCODE_NFOUND; } - ret = generic_load_microcode(cpu, (void *)firmware->data, - firmware->size, &get_ucode_fw); + kvec.iov_base = (void *)firmware->data; + kvec.iov_len = firmware->size; + iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size); + ret = generic_load_microcode(cpu, &iter); release_firmware(firmware); return ret; } -static int get_ucode_user(void *to, const void *from, size_t n) -{ - return copy_from_user(to, from, n); -} - static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { + struct iov_iter iter; + struct iovec iov; + if (is_blacklisted(cpu)) return UCODE_NFOUND; - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); + iov.iov_base = (void __user *)buf; + iov.iov_len = size; + iov_iter_init(&iter, WRITE, &iov, 1, size); + + return generic_load_microcode(cpu, &iter); } static struct microcode_ops microcode_intel_ops = { -- cgit v1.2.3 From 3d45ad9260c35c597706847e196aae8d966a574f Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 3 Apr 2019 22:12:17 -0400 Subject: x86/ima: add missing include As reported by 0-DAY kernel test infrastructure: arch/x86//kernel/ima_arch.c: In function 'arch_get_ima_policy': >> arch/x86//kernel/ima_arch.c:78:4: error: implicit declaration of function 'set_module_sig_enforced' [-Werror=implicit-function-declaration] Signed-off-by: Mimi Zohar --- arch/x86/kernel/ima_arch.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 3fb9847f1cad..85de790583f9 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -3,6 +3,7 @@ * Copyright (C) 2018 IBM Corporation */ #include +#include #include extern struct boot_params boot_params; -- cgit v1.2.3 From 24613a04ad1c0588c10f4b5403ca60a73d164051 Mon Sep 17 00:00:00 2001 From: Borislav 
Petkov Date: Thu, 4 Apr 2019 22:14:07 +0200 Subject: x86/microcode: Fix the ancient deprecated microcode loading method Commit 2613f36ed965 ("x86/microcode: Attempt late loading only when new microcode is present") added the new define UCODE_NEW to denote that an update should happen only when newer microcode (than installed on the system) has been found. But it missed adjusting that for the old /dev/cpu/microcode loading interface. Fix it. Fixes: 2613f36ed965 ("x86/microcode: Attempt late loading only when new microcode is present") Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Jann Horn Link: https://lkml.kernel.org/r/20190405133010.24249-3-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5260185cbf7b..8a4a7823451a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -418,8 +418,9 @@ static int do_microcode_update(const void __user *buf, size_t size) if (ustate == UCODE_ERROR) { error = -1; break; - } else if (ustate == UCODE_OK) + } else if (ustate == UCODE_NEW) { apply_microcode_on_target(cpu); + } } return error; -- cgit v1.2.3 From c02f48e070bde326f55bd94544ca82291f7396e3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 5 Apr 2019 06:28:11 +0200 Subject: x86/microcode: Deprecate MICROCODE_OLD_INTERFACE This is the ancient loading interface which requires special tools to be used. The bigger problem with it is that it is as inadequate for proper loading of microcode as the late microcode loading interface is because it happens just as late. iucode_tool's manpage already warns people that it is deprecated. Deprecate it and turn it off by default along with a big fat warning in the Kconfig help. It will go away sometime in the future. Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190405133010.24249-4-bp@alien8.de --- arch/x86/Kconfig | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5ad92419be19..5a0a752f3ddd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1330,8 +1330,16 @@ config MICROCODE_AMD processors will be enabled. config MICROCODE_OLD_INTERFACE - def_bool y + bool "Ancient loading interface (DEPRECATED)" + default n depends on MICROCODE + ---help--- + DO NOT USE THIS! This is the ancient /dev/cpu/microcode interface + which was used by userspace tools like iucode_tool and microcode.ctl. + It is inadequate because it runs too late to be able to properly + load microcode on a machine and it needs special tools. Instead, you + should've switched to the early loading method with the initrd or + builtin microcode by now: Documentation/x86/microcode.txt config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" -- cgit v1.2.3 From c806e88734b9e9aea260bf2261c129aa23968fca Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:41 +0200 Subject: x86/pkeys: Provide *pkru() helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dave Hansen asked for __read_pkru() and __write_pkru() to be symmetrical. As part of the series __write_pkru() will read back the value and only write it if it is different. 
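(A rough sketch of the layering this establishes, using only names from the patch; illustrative, not authoritative:

	u32 val = rdpkru();	/* thin wrapper around the RDPKRU opcode */

	wrpkru(val);		/* thin wrapper around the WRPKRU opcode */
	__write_pkru(val);	/* calls wrpkru(); the follow-up patch below
				 * additionally skips the write when the
				 * value is unchanged */
)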
In order to make both functions symmetrical, move the function containing only the opcode asm into a function called like the instruction itself. __write_pkru() will just invoke wrpkru() but in a follow-up patch will also read back the value. [ bp: Convert asm opcode wrapper names to rd/wrpkru(). ] Suggested-by: Dave Hansen Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andi Kleen Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: Joerg Roedel Cc: Juergen Gross Cc: "Kirill A. Shutemov" Cc: kvm ML Cc: Michal Hocko Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-13-bigeasy@linutronix.de --- arch/x86/include/asm/pgtable.h | 2 +- arch/x86/include/asm/special_insns.h | 12 +++++++++--- arch/x86/kvm/vmx/vmx.c | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2779ace16d23..e8875ca75623 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -127,7 +127,7 @@ static inline int pte_dirty(pte_t pte) static inline u32 read_pkru(void) { if (boot_cpu_has(X86_FEATURE_OSPKE)) - return __read_pkru(); + return rdpkru(); return 0; } diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 43c029cdc3fe..34897e2b52c9 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -92,7 +92,7 @@ static inline void native_write_cr8(unsigned long val) #endif #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -static inline u32 __read_pkru(void) +static inline u32 rdpkru(void) { u32 ecx = 0; u32 edx, pkru; @@ -107,7 +107,7 @@ static inline u32 __read_pkru(void) return pkru; } -static inline void __write_pkru(u32 pkru) +static inline void wrpkru(u32 pkru) { u32 ecx = 0, edx = 0; @@ -118,8 +118,14 @@ static inline void __write_pkru(u32 pkru) asm volatile(".byte 0x0f,0x01,0xef\n\t" : : "a" (pkru), "c"(ecx), "d"(edx)); } + +static inline void __write_pkru(u32 pkru) +{ + wrpkru(pkru); +} + #else -static inline u32 __read_pkru(void) +static inline u32 rdpkru(void) { return 0; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ab432a930ae8..dd1e1eea4f0b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6501,7 +6501,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { - vcpu->arch.pkru = __read_pkru(); + vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vmx->host_pkru); } -- cgit v1.2.3 From 577ff465f5a6a5a0866d75a033844810baca20a0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:42 +0200 Subject: x86/fpu: Only write PKRU if it is different from current MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Dave Hansen, WRPKRU is more expensive than RDPKRU. It has a higher cycle cost and it's also practically a (light) speculation barrier. As an optimisation read the current PKRU value and only write the new one if it is different. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. 
Donenfeld" Cc: Juergen Gross Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-14-bigeasy@linutronix.de --- arch/x86/include/asm/special_insns.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 34897e2b52c9..0a3c4cab39db 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -121,6 +121,13 @@ static inline void wrpkru(u32 pkru) static inline void __write_pkru(u32 pkru) { + /* + * WRPKRU is relatively expensive compared to RDPKRU. + * Avoid WRPKRU when it would not change the value. + */ + if (pkru == rdpkru()) + return; + wrpkru(pkru); } -- cgit v1.2.3 From 0556cbdc2fbcb3068e5b924a8b3d5386ae0dd27d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:43 +0200 Subject: x86/pkeys: Don't check if PKRU is zero before writing it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit write_pkru() checks if the current value is the same as the expected value. So instead of just checking if the current and new value is zero (and skipping the write in such a case) we can benefit from that. Remove the zero check of PKRU; __write_pkru() provides such a check now. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-15-bigeasy@linutronix.de --- arch/x86/mm/pkeys.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 05bb9a44eb1c..50f65fc1b9a3 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -142,13 +142,6 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | void copy_init_pkru_to_fpregs(void) { u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); - /* - * Any write to PKRU takes it out of the XSAVE 'init - * state' which increases context switch cost. Avoid - * writing 0 when PKRU was already 0. - */ - if (!init_pkru_value_snapshot && !read_pkru()) - return; /* * Override the PKRU state that came from 'init_fpstate' * with the baseline from the process. -- cgit v1.2.3 From 0cecca9d03c964abbd2b7927d0670eb70db4ebf2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Apr 2019 18:41:44 +0200 Subject: x86/fpu: Eager switch PKRU state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While most of a task's FPU state is only needed in user space, the protection keys need to be in place immediately after a context switch. The reason is that any access to userspace memory while running in kernel mode also needs to abide by the memory permissions specified in the protection keys. The "eager switch" is a preparation for loading the FPU state on return to userland. Instead of decoupling PKRU state from xstate, update PKRU within xstate on write operations by the kernel. For user tasks the PKRU should always be read from the xsave area and it should not change anything because the PKRU value was loaded as part of FPU restore. For kernel threads the default "init_pkru_value" will be written.
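(To make the kernel-mode requirement concrete, a self-contained userspace sketch, illustrative only and not part of the patch: even a kernel-initiated write to user memory, here via read(2), is policed by the thread's PKRU.)

	#define _GNU_SOURCE
	#include <errno.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		int pkey = pkey_alloc(0, PKEY_DISABLE_ACCESS);

		if (buf == MAP_FAILED || pkey < 0)
			return 1;	/* no pkey support on this CPU/kernel */
		pkey_mprotect(buf, 4096, PROT_READ | PROT_WRITE, pkey);

		/* read(2) makes the kernel write into buf; PKRU blocks that too */
		if (read(STDIN_FILENO, buf, 4096) < 0 && errno == EFAULT)
			printf("kernel-side access blocked by the pkey\n");
		return 0;
	}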
Before this commit, the kernel thread would end up with a random value which it inherited from the previous user task. [ bigeasy: save pkru to xstate, no cache, don't use __raw_xsave_addr() ] [ bp: update commit message, sort headers properly in asm/fpu/xstate.h ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andi Kleen Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: Joerg Roedel Cc: Juergen Gross Cc: "Kirill A. Shutemov" Cc: kvm ML Cc: Michal Hocko Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-16-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 24 ++++++++++++++++++++++-- arch/x86/include/asm/fpu/xstate.h | 4 +++- arch/x86/include/asm/pgtable.h | 6 ++++++ arch/x86/mm/pkeys.c | 1 - 4 files changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3e0c2c496f2d..6eb4a0b1ad0e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -534,8 +535,27 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU)) - __fpregs_load_activate(new_fpu, cpu); + u32 pkru_val = init_pkru_value; + struct pkru_state *pk; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return; + + __fpregs_load_activate(new_fpu, cpu); + + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + /* + * PKRU state is switched eagerly because it needs to be valid before we + * return to userland e.g. for a copy_to_user() operation. + */ + if (current->mm) { + pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); + if (pk) + pkru_val = pk->pkru; + } + __write_pkru(pkru_val); } /* diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index fbe41f808e5d..7e42b285c856 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -2,9 +2,11 @@ #ifndef __ASM_X86_XSAVE_H #define __ASM_X86_XSAVE_H +#include #include + #include -#include +#include /* Bit 63 of XCR0 is reserved for future expansion */ #define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e8875ca75623..9beb371b1adf 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1355,6 +1355,12 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #define PKRU_WD_BIT 0x2 #define PKRU_BITS_PER_PKEY 2 +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +extern u32 init_pkru_value; +#else +#define init_pkru_value 0 +#endif + static inline bool __pkru_allows_read(u32 pkru, u16 pkey) { int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 50f65fc1b9a3..2ecbf4155f98 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -126,7 +126,6 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey * in the process's lifetime will not accidentally get access * to data which is pkey-protected later on. 
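 *
 * (Illustrative aside, not in the original comment: PKRU_AD_KEY(pkey)
 * sets the access-disable bit for 'pkey', so the default value below
 * denies access through every key except key 0, which covers ordinary,
 * non-pkey-tagged mappings.)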
*/ -static u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | -- cgit v1.2.3 From 383c252545edcc708128e2028a2318b05c45ede4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:45 +0200 Subject: x86/entry: Add TIF_NEED_FPU_LOAD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add TIF_NEED_FPU_LOAD. This flag is used for loading the FPU registers before returning to userland. It must not be set on systems without a FPU. If this flag is cleared, the CPU's FPU registers hold the latest, up-to-date content of the current task's (current()) FPU registers. The in-memory copy (union fpregs_state) is not valid. If this flag is set, then all of CPU's FPU registers may hold a random value (except for PKRU) and it is required to load the content of the FPU registers on return to userland. Introduce it now as a preparatory change before adding the main feature. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: Konrad Rzeszutek Wilk Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: Tim Chen Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-17-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 8 ++++++++ arch/x86/include/asm/thread_info.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 6eb4a0b1ad0e..da75d7b3e37d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -508,6 +508,14 @@ static inline void __fpregs_load_activate(struct fpu *fpu, int cpu) * - switch_fpu_finish() restores the new state as * necessary. * + * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers + * are saved in the current thread's FPU register state. + * + * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not + * hold current()'s FPU registers. It is required to load the + * registers before returning to userland or using the content + * otherwise. + * * The FPU context is only stored/restored for a user task and * ->mm is used to distinguish between kernel and user threads. 
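 *
 * (Hypothetical usage sketch, not part of this patch: a path that needs
 * the registers to hold current()'s state could do
 *
 *	fpregs_lock();
 *	if (test_thread_flag(TIF_NEED_FPU_LOAD))
 *		__fpregs_load_activate(&current->thread.fpu,
 *				       smp_processor_id());
 *	...use the FPU registers...
 *	fpregs_unlock();
 *
 * Later patches in the series wire this up for the return-to-userland
 * path.)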
*/ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e0eccbcb8447..f9453536f9bb 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -88,6 +88,7 @@ struct thread_info { #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ +#define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ @@ -117,6 +118,7 @@ struct thread_info { #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) +#define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) -- cgit v1.2.3 From 69277c98f5eef0d9839699b7825923c3985f665f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Apr 2019 18:41:46 +0200 Subject: x86/fpu: Always store the registers in copy_fpstate_to_sigframe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit copy_fpstate_to_sigframe() stores the registers directly to user space. This is okay because the FPU registers are valid and saving them directly avoids saving them into kernel memory and making a copy. However, this cannot be done anymore if the FPU registers are going to be restored on the return to userland. It is possible that the FPU registers will be invalidated in the middle of the save operation and this should be done with disabled preemption / BH. Save the FPU registers to the task's FPU struct and copy them to the user memory later on. Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-18-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 155f4552413e..8f23f5237218 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -144,8 +144,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Save the state directly to the user frame pointed by the aligned pointer - * 'buf_fx'. + * Save the state to task's fpu->state and then copy it to the user frame + * pointed to by the aligned pointer 'buf_fx'. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. 
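 *
 * (Layout sketch for that 32-bit-with-fxstate case, added here for
 * illustration:
 *	buf    -> legacy fsave header (struct user_i387_ia32_struct)
 *	buf_fx -> aligned fxsave/xsave image)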
@@ -155,6 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { + struct fpu *fpu = &current->thread.fpu; + struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -169,9 +171,16 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - /* Save the live registers state to the user frame directly. */ - if (copy_fpregs_to_sigframe(buf_fx)) - return -1; + copy_fpregs_to_fpstate(fpu); + + if (using_compacted_format()) { + if (copy_xstate_to_user(buf_fx, xsave, 0, size)) + return -1; + } else { + fpstate_sanitize_xstate(fpu); + if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) + return -1; + } /* Save the fsave header for the 32-bit frames. */ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) -- cgit v1.2.3 From a352a3b7b7920212ee4c45a41500c66826318e92 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Apr 2019 18:41:47 +0200 Subject: x86/fpu: Prepare copy_fpstate_to_sigframe() for TIF_NEED_FPU_LOAD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FPU registers only need to be saved if TIF_NEED_FPU_LOAD is not set. Otherwise this has already been done and can be skipped. [ bp: Massage a bit. ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-19-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 8f23f5237218..9b9dfdc96285 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,7 +171,17 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - copy_fpregs_to_fpstate(fpu); + /* + * If we do not need to load the FPU registers at return to userspace + * then the CPU has the current state and we need to save it. Otherwise, + * it has already been done and we can skip it. + */ + fpregs_lock(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + copy_fpregs_to_fpstate(fpu); + set_thread_flag(TIF_NEED_FPU_LOAD); + } + fpregs_unlock(); if (using_compacted_format()) { if (copy_xstate_to_user(buf_fx, xsave, 0, size)) -- cgit v1.2.3 From 0d714dba162620fd8b9f5b3104a487e041353c4d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:48 +0200 Subject: x86/fpu: Update xstate's PKRU value on write_pkru() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During the context switch the xstate is loaded which also includes the PKRU value. If xstate is restored on return to userland it is required that the PKRU value in xstate is the same as the one in the CPU. Save the PKRU in xstate during modification.
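(A sketch of the lost update this prevents; illustrative ordering, not code from the patch:

	wrpkru(new_pkru);	/* CPU PKRU now holds the new value */
	/* ...the task is preempted and scheduled back in... */
	/* On return to userland, XRSTOR reloads the xstate image; if
	 * pk->pkru still holds the old value, the WRPKRU above is
	 * silently undone. */

Hence both the in-memory copy and the register are updated under fpregs_lock() below.)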
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andi Kleen Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: Joerg Roedel Cc: Juergen Gross Cc: "Kirill A. Shutemov" Cc: kvm ML Cc: Michal Hocko Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-20-bigeasy@linutronix.de --- arch/x86/include/asm/pgtable.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9beb371b1adf..5cfbbb6d458d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -23,6 +23,8 @@ #ifndef __ASSEMBLY__ #include +#include +#include extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@ -133,8 +135,23 @@ static inline u32 read_pkru(void) static inline void write_pkru(u32 pkru) { - if (boot_cpu_has(X86_FEATURE_OSPKE)) - __write_pkru(pkru); + struct pkru_state *pk; + + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return; + + pk = get_xsave_addr(&current->thread.fpu.state.xsave, XFEATURE_PKRU); + + /* + * The PKRU value in xstate needs to be in sync with the value that is + * written to the CPU. The FPU restore on return to userland would + * otherwise load the previous value again. + */ + fpregs_lock(); + if (pk) + pk->pkru = pkru; + __write_pkru(pkru); + fpregs_unlock(); } static inline int pte_young(pte_t pte) -- cgit v1.2.3 From e0d3602f933367881bddfff310a744e6e61c284c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:49 +0200 Subject: x86/fpu: Inline copy_user_to_fpregs_zeroing() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start refactoring __fpu__restore_sig() by inlining copy_user_to_fpregs_zeroing(). The original function remains and will be used to restore from userland memory if possible. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-21-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 9b9dfdc96285..c2ff43fbbd07 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -337,11 +337,29 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) kfree(tmp); return err; } else { + int ret; + /* * For 64-bit frames and 32-bit fsave frames, restore the user * state to the registers directly (with exceptions handled).
*/ - if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { + if (use_xsave()) { + if ((unsigned long)buf_fx % 64 || fx_only) { + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + ret = copy_user_to_fxregs(buf_fx); + } else { + u64 init_bv = xfeatures_mask & ~xfeatures; + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + ret = copy_user_to_xregs(buf_fx, xfeatures); + } + } else if (use_fxsr()) { + ret = copy_user_to_fxregs(buf_fx); + } else + ret = copy_user_to_fregs(buf_fx); + + if (ret) { fpu__clear(fpu); return -1; } -- cgit v1.2.3 From 926b21f37b072ae4c117052de45a975c6d468fec Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:50 +0200 Subject: x86/fpu: Restore from kernel memory on the 64-bit path too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 64-bit case (both 64-bit and 32-bit frames) loads the new state from user memory. However, doing this is not desired if the FPU state is going to be restored on return to userland: it would be required to disable preemption in order to avoid a context switch which would set TIF_NEED_FPU_LOAD. If this happens before the restore operation then the loaded registers would become volatile. Furthermore, disabling preemption while accessing user memory requires to disable the pagefault handler. An error during FXRSTOR would then mean that either a page fault occurred (and it would have to be retried with enabled page fault handler) or a #GP occurred because the xstate is bogus (after all, the signal handler can modify it). In order to avoid that mess, copy the FPU state from userland, validate it and then load it. The copy_kernel_…() helpers are basically just like the old helpers except that they operate on kernel memory and the fault handler just sets the error value and the caller handles it. copy_user_to_fpregs_zeroing() and its helpers remain and will be used later for a fastpath optimisation. [ bp: Clarify commit message. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-22-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 43 +++++++++++++++++++++++++ arch/x86/kernel/fpu/signal.c | 62 +++++++++++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index da75d7b3e37d..2cf04fbcba5d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -121,6 +121,21 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); err; \ }) +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + #define kernel_insn(insn, output, input...) 
\ asm volatile("1:" #insn "\n\t" \ "2:\n" \ @@ -149,6 +164,14 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } +static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) @@ -162,6 +185,11 @@ static inline void copy_kernel_to_fregs(struct fregs_state *fx) kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } +static inline int copy_kernel_to_fregs_err(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + static inline int copy_user_to_fregs(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -361,6 +389,21 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) return err; } +/* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. + */ +static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + /* * These must be called with preempt disabled. Returns * 'true' if the FPU state is still intact and we can diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c2ff43fbbd07..9ea1eaa4c9b1 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -234,7 +234,8 @@ sanitize_restored_xstate(union fpregs_state *state, */ xsave->i387.mxcsr &= mxcsr_feature_mask; - convert_to_fxsr(&state->fxsave, ia32_env); + if (ia32_env) + convert_to_fxsr(&state->fxsave, ia32_env); } } @@ -337,28 +338,63 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) kfree(tmp); return err; } else { + union fpregs_state *state; + void *tmp; int ret; + tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + state = PTR_ALIGN(tmp, 64); + /* * For 64-bit frames and 32-bit fsave frames, restore the user * state to the registers directly (with exceptions handled). 
*/ - if (use_xsave()) { - if ((unsigned long)buf_fx % 64 || fx_only) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_user_to_fxregs(buf_fx); + if ((unsigned long)buf_fx % 64) + fx_only = 1; + + if (use_xsave() && !fx_only) { + u64 init_bv = xfeatures_mask & ~xfeatures; + + if (using_compacted_format()) { + ret = copy_user_to_xstate(&state->xsave, buf_fx); } else { - u64 init_bv = xfeatures_mask & ~xfeatures; - if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_user_to_xregs(buf_fx, xfeatures); + ret = __copy_from_user(&state->xsave, buf_fx, state_size); + + if (!ret && state_size > offsetof(struct xregs_state, header)) + ret = validate_xstate_header(&state->xsave.header); } + if (ret) + goto err_out; + + sanitize_restored_xstate(state, NULL, xfeatures, fx_only); + + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures); + } else if (use_fxsr()) { - ret = copy_user_to_fxregs(buf_fx); - } else - ret = copy_user_to_fregs(buf_fx); + ret = __copy_from_user(&state->fxsave, buf_fx, state_size); + if (ret) + goto err_out; + if (use_xsave()) { + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + } + state->fxsave.mxcsr &= mxcsr_feature_mask; + + ret = copy_kernel_to_fxregs_err(&state->fxsave); + } else { + ret = __copy_from_user(&state->fsave, buf_fx, state_size); + if (ret) + goto err_out; + ret = copy_kernel_to_fregs_err(&state->fsave); + } + +err_out: + kfree(tmp); if (ret) { fpu__clear(fpu); return -1; -- cgit v1.2.3 From c2ff9e9a3d9d6c019394a22989a228d02970a8b1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:51 +0200 Subject: x86/fpu: Merge the two code paths in __fpu__restore_sig() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ia32_fxstate case (32bit with fxsr) and the other (64bit frames or 32bit frames without fxsr) restore both from kernel memory and sanitize the content. The !ia32_fxstate version restores missing xstates from "init state" while the ia32_fxstate doesn't and skips it. Merge the two code paths and keep the !ia32_fxstate one. Copy only the user_i387_ia32_struct data structure in the ia32_fxstate. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-23-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 139 +++++++++++++++++-------------------------- 1 file changed, 54 insertions(+), 85 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 9ea1eaa4c9b1..b13e86b29426 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -263,12 +263,17 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) { + struct user_i387_ia32_struct *envp = NULL; + int state_size = fpu_kernel_xstate_size; int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - int state_size = fpu_kernel_xstate_size; + struct user_i387_ia32_struct env; + union fpregs_state *state; u64 xfeatures = 0; int fx_only = 0; + int ret = 0; + void *tmp; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -303,105 +308,69 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } } + tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + state = PTR_ALIGN(tmp, 64); + + if ((unsigned long)buf_fx % 64) + fx_only = 1; + + /* + * For 32-bit frames with fxstate, copy the fxstate so it can be + * reconstructed later. + */ if (ia32_fxstate) { - /* - * For 32-bit frames with fxstate, copy the user state to the - * thread's fpu state, reconstruct fxstate from the fsave - * header. Validate and sanitize the copied state. - */ - struct user_i387_ia32_struct env; - union fpregs_state *state; - int err = 0; - void *tmp; + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + goto err_out; + envp = &env; + } - tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - state = PTR_ALIGN(tmp, 64); + if (use_xsave() && !fx_only) { + u64 init_bv = xfeatures_mask & ~xfeatures; if (using_compacted_format()) { - err = copy_user_to_xstate(&state->xsave, buf_fx); + ret = copy_user_to_xstate(&state->xsave, buf_fx); } else { - err = __copy_from_user(&state->xsave, buf_fx, state_size); + ret = __copy_from_user(&state->xsave, buf_fx, state_size); - if (!err && state_size > offsetof(struct xregs_state, header)) - err = validate_xstate_header(&state->xsave.header); + if (!ret && state_size > offsetof(struct xregs_state, header)) + ret = validate_xstate_header(&state->xsave.header); } + if (ret) + goto err_out; - if (err || __copy_from_user(&env, buf, sizeof(env))) { - err = -1; - } else { - sanitize_restored_xstate(state, &env, xfeatures, fx_only); - copy_kernel_to_fpregs(state); - } - - kfree(tmp); - return err; - } else { - union fpregs_state *state; - void *tmp; - int ret; - - tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - state = PTR_ALIGN(tmp, 64); - - /* - * For 64-bit frames and 32-bit fsave frames, restore the user - * state to the registers directly (with exceptions handled). 
- */ - if ((unsigned long)buf_fx % 64) - fx_only = 1; - - if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask & ~xfeatures; - - if (using_compacted_format()) { - ret = copy_user_to_xstate(&state->xsave, buf_fx); - } else { - ret = __copy_from_user(&state->xsave, buf_fx, state_size); - - if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&state->xsave.header); - } - if (ret) - goto err_out; - - sanitize_restored_xstate(state, NULL, xfeatures, fx_only); - - if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures); + sanitize_restored_xstate(state, envp, xfeatures, fx_only); - } else if (use_fxsr()) { - ret = __copy_from_user(&state->fxsave, buf_fx, state_size); - if (ret) - goto err_out; + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures); - if (use_xsave()) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - } - state->fxsave.mxcsr &= mxcsr_feature_mask; + } else if (use_fxsr()) { + ret = __copy_from_user(&state->fxsave, buf_fx, state_size); + if (ret) + goto err_out; - ret = copy_kernel_to_fxregs_err(&state->fxsave); + sanitize_restored_xstate(state, envp, xfeatures, fx_only); + if (use_xsave()) { + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } -err_out: - kfree(tmp); - if (ret) { - fpu__clear(fpu); - return -1; - } + ret = copy_kernel_to_fxregs_err(&state->fxsave); + } else { + ret = __copy_from_user(&state->fsave, buf_fx, state_size); + if (ret) + goto err_out; + ret = copy_kernel_to_fregs_err(&state->fsave); } - return 0; +err_out: + kfree(tmp); + if (ret) + fpu__clear(fpu); + return ret; } static inline int xstate_sigframe_size(void) -- cgit v1.2.3 From 5f409e20b794565e2d60ad333e79334630a6c798 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Apr 2019 18:41:52 +0200 Subject: x86/fpu: Defer FPU state load until return to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defer loading of FPU state until return to userspace. This gives the kernel the potential to skip loading FPU state for tasks that stay in kernel mode, or for tasks that end up with repeated invocations of kernel_fpu_begin() & kernel_fpu_end(). The fpregs_lock/unlock() section ensures that the registers remain unchanged. Otherwise a context switch or a bottom half could save the registers to its FPU context and the processor's FPU registers would become random if modified at the same time. KVM swaps the host/guest registers on the entry/exit path. This flow has been kept as is. First it ensures that the registers are loaded and then saves the current (host) state before it loads the guest's registers. The swap is done at the very end with interrupts disabled so it should not change anymore before the guest is entered. The read/save version seems to be cheaper compared to memcpy() in a micro benchmark. Each thread gets TIF_NEED_FPU_LOAD set as part of fork() / fpu__copy(). For kernel threads, this flag never gets cleared, which avoids saving / restoring the FPU state for kernel threads and during in-kernel usage of the FPU registers.
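Condensed, the contract this patch establishes looks as follows (an editor's sketch of the flow, using the real helper names from the diffs below; the actual hook lives in prepare_exit_to_usermode()):

/*
 * Sketch of the deferred-load flow: a context switch only saves the
 * registers and sets TIF_NEED_FPU_LOAD; the registers are reloaded
 * at most once, on the way back to userspace.
 */
static void sketch_exit_to_usermode_fpu(void)
{
	fpregs_assert_state_consistent();	/* CONFIG_X86_DEBUG_FPU-only sanity check */
	if (test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_return();		/* load fpu->state and clear the flag */
}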
[ bp: Correct and update commit message and fix checkpatch warnings. s/register/registers/ where it is used in plural. minor comment corrections. remove unused trace_x86_fpu_activate_state() TP. ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: Babu Moger Cc: "Chang S. Bae" Cc: Dmitry Safonov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: kvm ML Cc: Nicolai Stange Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Tim Chen Cc: Waiman Long Cc: x86-ml Cc: Yi Wang Link: https://lkml.kernel.org/r/20190403164156.19645-24-bigeasy@linutronix.de --- arch/x86/entry/common.c | 10 +++- arch/x86/include/asm/fpu/api.h | 22 +++++++- arch/x86/include/asm/fpu/internal.h | 27 +++++---- arch/x86/include/asm/trace/fpu.h | 10 ++-- arch/x86/kernel/fpu/core.c | 106 +++++++++++++++++++++++++++--------- arch/x86/kernel/fpu/signal.c | 49 ++++++++++------- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 5 +- arch/x86/kernel/process_64.c | 5 +- arch/x86/kvm/x86.c | 20 +++++-- 10 files changed, 184 insertions(+), 72 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7bc105f47d21..51beb8d29123 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -25,12 +25,13 @@ #include #include #include +#include #include #include #include -#include #include +#include #define CREATE_TRACE_POINTS #include @@ -196,6 +197,13 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) exit_to_usermode_loop(regs, cached_flags); + /* Reload ti->flags; we may have rescheduled above. */ + cached_flags = READ_ONCE(ti->flags); + + fpregs_assert_state_consistent(); + if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + #ifdef CONFIG_COMPAT /* * Compat syscalls set TS_COMPAT. Make sure we clear it before diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 73e684160f35..b774c52e5411 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -10,7 +10,7 @@ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H -#include +#include /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It @@ -22,17 +22,37 @@ extern void kernel_fpu_begin(void); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); +extern void fpregs_mark_activate(void); +/* + * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. + * A context switch will (and softirq might) save CPU's FPU registers to + * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * a random state. + */ static inline void fpregs_lock(void) { preempt_disable(); + local_bh_disable(); } static inline void fpregs_unlock(void) { + local_bh_enable(); preempt_enable(); } +#ifdef CONFIG_X86_DEBUG_FPU +extern void fpregs_assert_state_consistent(void); +#else +static inline void fpregs_assert_state_consistent(void) { } +#endif + +/* + * Load the task FPU state before returning to userspace. + */ +extern void switch_fpu_return(void); + /* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. 
* diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2cf04fbcba5d..0c8a5093647a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -30,7 +30,7 @@ extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); -extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); +extern int fpu__copy(struct task_struct *dst, struct task_struct *src); extern void fpu__clear(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); @@ -531,13 +531,20 @@ static inline void fpregs_activate(struct fpu *fpu) /* * Internal helper, do not use directly. Use switch_fpu_return() instead. */ -static inline void __fpregs_load_activate(struct fpu *fpu, int cpu) +static inline void __fpregs_load_activate(void) { + struct fpu *fpu = ¤t->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->mm == NULL)) + return; + if (!fpregs_state_valid(fpu, cpu)) { - if (current->mm) - copy_kernel_to_fpregs(&fpu->state); + copy_kernel_to_fpregs(&fpu->state); fpregs_activate(fpu); + fpu->last_cpu = cpu; } + clear_thread_flag(TIF_NEED_FPU_LOAD); } /* @@ -548,8 +555,8 @@ static inline void __fpregs_load_activate(struct fpu *fpu, int cpu) * - switch_fpu_prepare() saves the old state. * This is done within the context of the old process. * - * - switch_fpu_finish() restores the new state as - * necessary. + * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state + * will get loaded on return to userspace, or when the kernel needs it. * * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers * are saved in the current thread's FPU register state. @@ -581,10 +588,10 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ /* - * Set up the userspace FPU context for the new task, if the task - * has used the FPU. + * Load PKRU from the FPU context if available. Delay loading of the + * complete FPU state until the return to userland. 
*/ -static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) +static inline void switch_fpu_finish(struct fpu *new_fpu) { u32 pkru_val = init_pkru_value; struct pkru_state *pk; @@ -592,7 +599,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) if (!static_cpu_has(X86_FEATURE_FPU)) return; - __fpregs_load_activate(new_fpu, cpu); + set_thread_flag(TIF_NEED_FPU_LOAD); if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index bd65f6ba950f..879b77792f94 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -13,19 +13,22 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) + __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; + __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, + __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) @@ -61,11 +64,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 739ca3ae2bdc..ce243f76bdb7 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -102,23 +102,20 @@ static void __kernel_fpu_begin(void) kernel_fpu_disable(); if (current->mm) { - /* - * Ignore return value -- we don't care if reg state - * is clobbered. - */ - copy_fpregs_to_fpstate(fpu); - } else { - __cpu_invalidate_fpregs_state(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + set_thread_flag(TIF_NEED_FPU_LOAD); + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ + copy_fpregs_to_fpstate(fpu); + } } + __cpu_invalidate_fpregs_state(); } static void __kernel_fpu_end(void) { - struct fpu *fpu = ¤t->thread.fpu; - - if (current->mm) - copy_kernel_to_fpregs(&fpu->state); - kernel_fpu_enable(); } @@ -145,14 +142,17 @@ void fpu__save(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - preempt_disable(); + fpregs_lock(); trace_x86_fpu_before_save(fpu); - if (!copy_fpregs_to_fpstate(fpu)) - copy_kernel_to_fpregs(&fpu->state); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + if (!copy_fpregs_to_fpstate(fpu)) { + copy_kernel_to_fpregs(&fpu->state); + } + } trace_x86_fpu_after_save(fpu); - preempt_enable(); + fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu__save); @@ -185,8 +185,11 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) +int fpu__copy(struct task_struct *dst, struct task_struct *src) { + struct fpu *dst_fpu = &dst->thread.fpu; + struct fpu *src_fpu = &src->thread.fpu; + dst_fpu->last_cpu = -1; if (!static_cpu_has(X86_FEATURE_FPU)) @@ -201,16 +204,23 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* - * Save current FPU registers directly into the child - * FPU context, without any memory-to-memory copying. + * If the FPU registers are not current just memcpy() the state. 
+ * Otherwise save current FPU registers directly into the child's FPU + * context, without any memory-to-memory copying. * * ( The function 'fails' in the FNSAVE case, which destroys - * register contents so we have to copy them back. ) + * register contents so we have to load them back. ) */ - if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); - copy_kernel_to_fpregs(&src_fpu->state); - } + fpregs_lock(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); + + else if (!copy_fpregs_to_fpstate(dst_fpu)) + copy_kernel_to_fpregs(&dst_fpu->state); + + fpregs_unlock(); + + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -226,10 +236,9 @@ static void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); + set_thread_flag(TIF_NEED_FPU_LOAD); fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); } /* @@ -308,6 +317,8 @@ void fpu__drop(struct fpu *fpu) */ static inline void copy_init_fpstate_to_fpregs(void) { + fpregs_lock(); + if (use_xsave()) copy_kernel_to_xregs(&init_fpstate.xsave, -1); else if (static_cpu_has(X86_FEATURE_FXSR)) @@ -317,6 +328,9 @@ static inline void copy_init_fpstate_to_fpregs(void) if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); + + fpregs_mark_activate(); + fpregs_unlock(); } /* @@ -339,6 +353,46 @@ void fpu__clear(struct fpu *fpu) copy_init_fpstate_to_fpregs(); } +/* + * Load FPU context before returning to userspace. + */ +void switch_fpu_return(void) +{ + if (!static_cpu_has(X86_FEATURE_FPU)) + return; + + __fpregs_load_activate(); +} +EXPORT_SYMBOL_GPL(switch_fpu_return); + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * If current FPU state according to its tracking (loaded FPU context on this + * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is + * loaded on return to userland. + */ +void fpregs_assert_state_consistent(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + return; + + WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); +} +EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); +#endif + +void fpregs_mark_activate(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + fpregs_activate(fpu); + fpu->last_cpu = smp_processor_id(); + clear_thread_flag(TIF_NEED_FPU_LOAD); +} +EXPORT_SYMBOL_GPL(fpregs_mark_activate); + /* * x87 math exception handling: */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index b13e86b29426..6df1f15e0cd5 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -269,11 +269,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; - union fpregs_state *state; u64 xfeatures = 0; int fx_only = 0; int ret = 0; - void *tmp; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -308,14 +306,18 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } } - tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - state = PTR_ALIGN(tmp, 64); + /* + * The current state of the FPU registers does not matter. 
By setting + * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate + * is not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). + */ + set_thread_flag(TIF_NEED_FPU_LOAD); + __fpu_invalidate_fpregs_state(fpu); if ((unsigned long)buf_fx % 64) fx_only = 1; - /* * For 32-bit frames with fxstate, copy the fxstate so it can be * reconstructed later. @@ -331,43 +333,52 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) u64 init_bv = xfeatures_mask & ~xfeatures; if (using_compacted_format()) { - ret = copy_user_to_xstate(&state->xsave, buf_fx); + ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); } else { - ret = __copy_from_user(&state->xsave, buf_fx, state_size); + ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&state->xsave.header); + ret = validate_xstate_header(&fpu->state.xsave.header); } if (ret) goto err_out; - sanitize_restored_xstate(state, envp, xfeatures, fx_only); + sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + fpregs_lock(); if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures); + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures); } else if (use_fxsr()) { - ret = __copy_from_user(&state->fxsave, buf_fx, state_size); - if (ret) + ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); + if (ret) { + ret = -EFAULT; goto err_out; + } + + sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); - sanitize_restored_xstate(state, envp, xfeatures, fx_only); + fpregs_lock(); if (use_xsave()) { u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } - ret = copy_kernel_to_fxregs_err(&state->fxsave); + ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); } else { - ret = __copy_from_user(&state->fsave, buf_fx, state_size); + ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); if (ret) goto err_out; - ret = copy_kernel_to_fregs_err(&state->fsave); + + fpregs_lock(); + ret = copy_kernel_to_fregs_err(&fpu->state.fsave); } + if (!ret) + fpregs_mark_activate(); + fpregs_unlock(); err_out: - kfree(tmp); if (ret) fpu__clear(fpu); return ret; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58ac7be52c7a..b9b8e6eb74f2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -101,7 +101,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.vm86 = NULL; #endif - return fpu__copy(&dst->thread.fpu, &src->thread.fpu); + return fpu__copy(dst, src); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 77d9eb43ccac..1bc47f3a4885 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -234,7 +234,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - switch_fpu_prepare(prev_fpu, cpu); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_fpu, cpu); /* * Save away %gs. 
No need to save %fs, as it was saved on the @@ -290,7 +291,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); - switch_fpu_finish(next_fpu, cpu); + switch_fpu_finish(next_fpu); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ffea7c557963..37b2ecef041e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -520,7 +520,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); - switch_fpu_prepare(prev_fpu, cpu); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -572,7 +573,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_fpu, cpu); + switch_fpu_finish(next_fpu); /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8022e7769b3a..e340c3c0cba3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7877,6 +7877,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) wait_lapic_expire(vcpu); guest_enter_irqoff(); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); @@ -8137,22 +8141,30 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); + fpregs_lock(); + copy_fpregs_to_fpstate(¤t->thread.fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); - preempt_enable(); + + fpregs_mark_activate(); + fpregs_unlock(); + trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); + fpregs_lock(); + copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); copy_kernel_to_fpregs(¤t->thread.fpu.state); - preempt_enable(); + + fpregs_mark_activate(); + fpregs_unlock(); + ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } -- cgit v1.2.3 From 1d731e731c4cd7cbd3b1aa295f0932e7610da82f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:53 +0200 Subject: x86/fpu: Add a fastpath to __fpu__restore_sig() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commits refactor the restoration of the FPU registers so that they can be loaded from in-kernel memory. This overhead can be avoided if the load can be performed without a pagefault. Attempt to restore FPU registers by invoking copy_user_to_fpregs_zeroing(). If it fails try the slowpath which can handle pagefaults. [ bp: Add a comment over the fastpath to be able to find one's way around the function. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-25-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 6df1f15e0cd5..a1bd7be70206 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -242,10 +242,10 @@ sanitize_restored_xstate(union fpregs_state *state, /* * Restore the extended state if present. Otherwise, restore the FP/SSE state. */ -static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { if (use_xsave()) { - if ((unsigned long)buf % 64 || fx_only) { + if (fx_only) { u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_fxregs(buf); @@ -327,8 +327,27 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (ret) goto err_out; envp = &env; + } else { + /* + * Attempt to restore the FPU registers directly from user + * memory. For that to succeed, the user access cannot cause + * page faults. If it does, fall back to the slow path below, + * going through the kernel buffer with the enabled pagefault + * handler. + */ + fpregs_lock(); + pagefault_disable(); + ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only); + pagefault_enable(); + if (!ret) { + fpregs_mark_activate(); + fpregs_unlock(); + return 0; + } + fpregs_unlock(); } + if (use_xsave() && !fx_only) { u64 init_bv = xfeatures_mask & ~xfeatures; -- cgit v1.2.3 From da2f32fb8dc7cbd9433cb2e990693734b30a2465 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:54 +0200 Subject: x86/fpu: Add a fastpath to copy_fpstate_to_sigframe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Try to save the FPU registers directly to the userland stack frame if the CPU holds the FPU registers for the current task. This has to be done with the pagefault disabled because we can't fault (while the FPU registers are locked) and therefore the operation might fail. If it fails try the slowpath which can handle faults. [ bp: Massage a bit. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-26-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a1bd7be70206..3c3167576216 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -144,8 +144,10 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Save the state to task's fpu->state and then copy it to the user frame - * pointed to by the aligned pointer 'buf_fx'. + * Try to save it directly to the user frame with disabled page fault handler. 
+ * If this fails then do the slow path where the FPU state is first saved to + * task's fpu->state and then copy it to the user frame pointed to by the + * aligned pointer 'buf_fx'. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -159,6 +161,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); + int ret = -EFAULT; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -173,23 +176,30 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) /* * If we do not need to load the FPU registers at return to userspace - * then the CPU has the current state and we need to save it. Otherwise, - * it has already been done and we can skip it. + * then the CPU has the current state. Try to save it directly to + * userland's stack frame if it does not cause a pagefault. If it does, + * try the slowpath. */ fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - copy_fpregs_to_fpstate(fpu); + pagefault_disable(); + ret = copy_fpregs_to_sigframe(buf_fx); + pagefault_enable(); + if (ret) + copy_fpregs_to_fpstate(fpu); set_thread_flag(TIF_NEED_FPU_LOAD); } fpregs_unlock(); - if (using_compacted_format()) { - if (copy_xstate_to_user(buf_fx, xsave, 0, size)) - return -1; - } else { - fpstate_sanitize_xstate(fpu); - if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) - return -1; + if (ret) { + if (using_compacted_format()) { + if (copy_xstate_to_user(buf_fx, xsave, 0, size)) + return -1; + } else { + fpstate_sanitize_xstate(fpu); + if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) + return -1; + } } /* Save the fsave header for the 32-bit frames. */ -- cgit v1.2.3 From 06b251dff78704c7d122bd109384d970a7dbe94d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 12 Apr 2019 20:16:15 +0200 Subject: x86/fpu: Restore regs in copy_fpstate_to_sigframe() in order to use the fastpath MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a task is scheduled out and receives a signal then it won't be able to take the fastpath because the registers aren't available. The slowpath is more expensive compared to XRSTOR + XSAVE which usually succeeds. Here are some clock_gettime() numbers from a bigger box with AVX512 during bootup: - __fpregs_load_activate() takes 140ns - 350ns. If it was the most recent FPU context on the CPU then the optimisation in __fpregs_load_activate() will skip the load (which was disabled during the test). - copy_fpregs_to_sigframe() takes 200ns - 450ns if it succeeds. On a pagefault it is 1.8us - 3us usually in the 2.6us area. - The slowpath takes 1.5us - 6us. Usually in the 2.6us area. My testcases (including lat_sig) take the fastpath without __fpregs_load_activate(). I expect this to be the majority. Since the slowpath is in the >1us area it makes sense to load the registers and attempt to save them directly. The direct save may fail but should only happen on the first invocation or after fork() while the page is read-only. [ bp: Massage a bit. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-27-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 3c3167576216..7026f1c4e5e3 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -175,20 +175,21 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) (struct _fpstate_32 __user *) buf) ? -1 : 1; /* - * If we do not need to load the FPU registers at return to userspace - * then the CPU has the current state. Try to save it directly to - * userland's stack frame if it does not cause a pagefault. If it does, - * try the slowpath. + * Load the FPU registers if they are not valid for the current task. + * With a valid FPU state we can attempt to save the state directly to + * userland's stack frame which will likely succeed. If it does not, do + * the slowpath. */ fpregs_lock(); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - pagefault_disable(); - ret = copy_fpregs_to_sigframe(buf_fx); - pagefault_enable(); - if (ret) - copy_fpregs_to_fpstate(fpu); - set_thread_flag(TIF_NEED_FPU_LOAD); - } + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __fpregs_load_activate(); + + pagefault_disable(); + ret = copy_fpregs_to_sigframe(buf_fx); + pagefault_enable(); + if (ret && !test_thread_flag(TIF_NEED_FPU_LOAD)) + copy_fpregs_to_fpstate(fpu); + set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); if (ret) { -- cgit v1.2.3 From a5eff7259790d5314eff10563d6e59d358cce482 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:56 +0200 Subject: x86/pkeys: Add PKRU value to init_fpstate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The task's initial PKRU value is set partly for fpu__clear()/ copy_init_pkru_to_fpregs(). It is not part of init_fpstate.xsave and instead it is set explicitly. If the user removes the PKRU state from XSAVE in the signal handler then __fpu__restore_sig() will restore the missing bits from `init_fpstate' and initialize the PKRU value to 0. Add the `init_pkru_value' to `init_fpstate' so it is set to the init value in such a case. In theory copy_init_pkru_to_fpregs() could be removed because restoring the PKRU at return-to-userland should be enough. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andi Kleen Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. 
Donenfeld" Cc: Konrad Rzeszutek Wilk Cc: kvm ML Cc: Paolo Bonzini Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-28-bigeasy@linutronix.de --- arch/x86/kernel/cpu/common.c | 5 +++++ arch/x86/mm/pkeys.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..352fa19e6311 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -372,6 +372,8 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { + struct pkru_state *pk; + /* check the boot processor, plus compile options for PKU: */ if (!cpu_feature_enabled(X86_FEATURE_PKU)) return; @@ -382,6 +384,9 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) return; cr4_set_bits(X86_CR4_PKE); + pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); + if (pk) + pk->pkru = init_pkru_value; /* * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 2ecbf4155f98..1dcfc91c8f0c 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -18,6 +18,7 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ +#include /* init_fpstate */ int __execute_only_pkey(struct mm_struct *mm) { @@ -161,6 +162,7 @@ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, static ssize_t init_pkru_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { + struct pkru_state *pk; char buf[32]; ssize_t len; u32 new_init_pkru; @@ -183,6 +185,10 @@ static ssize_t init_pkru_write_file(struct file *file, return -EINVAL; WRITE_ONCE(init_pkru_value, new_init_pkru); + pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); + if (!pk) + return -EINVAL; + pk->pkru = new_init_pkru; return count; } -- cgit v1.2.3 From 40fba00ffa431c8597ca785ea1cfa4d9f6503390 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 10 Apr 2019 03:53:49 +0800 Subject: x86/resctrl: Do not repeat rdtgroup mode initialization When cache allocation is supported and the user creates a new resctrl resource group, the allocations of the new resource group are initialized to all regions that it can possibly use. At this time these regions are all that are shareable by other resource groups as well as regions that are not currently used. The new resource group's mode is also initialized to reflect this initialization and set to "shareable". The new resource group's mode is currently repeatedly initialized within the loop that configures the hardware with the resource group's default allocations. Move the initialization of the resource group's mode outside the hardware configuration loop. The resource group's mode is now initialized only once as the final step to reflect that its configured allocations are "shareable". Fixes: 95f0b77efa57 ("x86/intel_rdt: Initialize new resource group with sane defaults") Signed-off-by: Xiaochen Shen Signed-off-by: Borislav Petkov Reviewed-by: Fenghua Yu Acked-by: Reinette Chatre Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: pei.p.jia@intel.com Cc: Thomas Gleixner Cc: Tony Luck Cc: x86-ml Link: https://lkml.kernel.org/r/1554839629-5448-1-git-send-email-xiaochen.shen@intel.com --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 54b9eef3eea9..85212a32b54d 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2610,9 +2610,10 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) rdt_last_cmd_puts("Failed to initialize allocations\n"); return ret; } - rdtgrp->mode = RDT_MODE_SHAREABLE; } + rdtgrp->mode = RDT_MODE_SHAREABLE; + return 0; } -- cgit v1.2.3 From c5c27a0a583844c69a433039e4fd6396ba23551b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2019 12:27:56 +0200 Subject: x86/stacktrace: Remove the pointless ULONG_MAX marker Terminating the last trace entry with ULONG_MAX is a completely pointless exercise and none of the consumers can rely on it because it's inconsistently implemented across architectures. In fact quite some of the callers remove the entry and adjust stack_trace.nr_entries afterwards. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Link: https://lkml.kernel.org/r/20190410103643.750954603@linutronix.de --- arch/x86/kernel/stacktrace.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 5c2d71a1dc06..b2f706f1e0b7 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -46,9 +46,6 @@ static void noinline __save_stack_trace(struct stack_trace *trace, if (!addr || save_stack_address(trace, addr, nosched)) break; } - - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } /* @@ -97,7 +94,7 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (regs) { /* Success path for user tasks */ if (user_mode(regs)) - goto success; + return 0; /* * Kernel mode registers on the stack indicate an @@ -132,10 +129,6 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (!(task->flags & (PF_KTHREAD | PF_IDLE))) return -EINVAL; -success: - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; - return 0; } @@ -221,9 +214,6 @@ void save_stack_trace_user(struct stack_trace *trace) /* * Trace user stack if we are not a kernel thread */ - if (current->mm) { + if (current->mm) __save_stack_trace_user(trace); - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } -- cgit v1.2.3 From 2f5fb19341883bb6e37da351bc3700489d8506a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 19:51:06 +0200 Subject: x86/speculation: Prevent deadlock on ssb_state::lock Mikhail reported a lockdep splat related to the AMD specific ssb_state lock: CPU0 CPU1 lock(&st->lock); local_irq_disable(); lock(&(&sighand->siglock)->rlock); lock(&st->lock); lock(&(&sighand->siglock)->rlock); *** DEADLOCK *** The connection between sighand->siglock and st->lock comes through seccomp, which takes st->lock while holding sighand->siglock. Make sure interrupts are disabled when __speculation_ctrl_update() is invoked via prctl() -> speculation_ctrl_update(). 
Add a lockdep assert to catch future offenders. Fixes: 1f50ddb4f418 ("x86/speculation: Handle HT correctly on AMD") Reported-by: Mikhail Gavrilov Signed-off-by: Thomas Gleixner Tested-by: Mikhail Gavrilov Cc: Thomas Lendacky Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1904141948200.4917@nanos.tec.linutronix.de --- arch/x86/kernel/process.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58ac7be52c7a..957eae13b370 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -426,6 +426,8 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, u64 msr = x86_spec_ctrl_base; bool updmsr = false; + lockdep_assert_irqs_disabled(); + /* * If TIF_SSBD is different, select the proper mitigation * method. Note that if SSBD mitigation is disabled or permanentely @@ -477,10 +479,12 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) void speculation_ctrl_update(unsigned long tif) { + unsigned long flags; + /* Forced update. Make sure all relevant TIF flags are different */ - preempt_disable(); + local_irq_save(flags); __speculation_ctrl_update(~tif, tif); - preempt_enable(); + local_irq_restore(flags); } /* Called from seccomp/prctl update */ -- cgit v1.2.3 From cfd32acf7875d9dd83f82e1940098e88abeea439 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Apr 2019 19:55:41 -0700 Subject: KVM: x86/mmu: Fix an inverted list_empty() check when zapping sptes A recently introduced helper for handling zap vs. remote flush incorrectly bails early, effectively leaking defunct shadow pages. Manifests as a slab BUG when exiting KVM due to the shadow pages being alive when their associated cache is destroyed. ========================================================================== BUG kvm_mmu_page_header: Objects remaining in kvm_mmu_page_header on ... -------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Slab 0x00000000fc436387 objects=26 used=23 fp=0x00000000d023caee ... CPU: 6 PID: 4315 Comm: rmmod Tainted: G B 5.1.0-rc2+ #19 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack+0x46/0x5b slab_err+0xad/0xd0 ? on_each_cpu_mask+0x3c/0x50 ? ksm_migrate_page+0x60/0x60 ? on_each_cpu_cond_mask+0x7c/0xa0 ? __kmalloc+0x1ca/0x1e0 __kmem_cache_shutdown+0x13a/0x310 shutdown_cache+0xf/0x130 kmem_cache_destroy+0x1d5/0x200 kvm_mmu_module_exit+0xa/0x30 [kvm] kvm_arch_exit+0x45/0x60 [kvm] kvm_exit+0x6f/0x80 [kvm] vmx_exit+0x1a/0x50 [kvm_intel] __x64_sys_delete_module+0x153/0x1f0 ? 
exit_to_usermode_loop+0x88/0xc0 do_syscall_64+0x4f/0x100 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: a21136345cb6f ("KVM: x86/mmu: Split remote_flush+zap case out of kvm_mmu_flush_or_zap()") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eee455a8a612..85f753728953 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2238,7 +2238,7 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, struct list_head *invalid_list, bool remote_flush) { - if (!remote_flush && !list_empty(invalid_list)) + if (!remote_flush && list_empty(invalid_list)) return false; if (!list_empty(invalid_list)) -- cgit v1.2.3 From 6a03469a1edc94da52b65478f1e00837add869a3 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 15 Apr 2019 09:49:56 -0700 Subject: x86/build/lto: Fix truncated .bss with -fdata-sections With CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y, we compile the kernel with -fdata-sections, which also splits the .bss section. The new section, with a new .bss.* name, which pattern gets missed by the main x86 linker script which only expects the '.bss' name. This results in the discarding of the second part and a too small, truncated .bss section and an unhappy, non-working kernel. Use the common BSS_MAIN macro in the linker script to properly capture and merge all the generated BSS sections. Signed-off-by: Sami Tolvanen Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Cc: Borislav Petkov Cc: Kees Cook Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20190415164956.124067-1-samitolvanen@google.com [ Extended the changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bad8c51fee6e..a5127b2c195f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -362,7 +362,7 @@ SECTIONS .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; *(.bss..page_aligned) - *(.bss) + *(BSS_MAIN) BSS_DECRYPTED . = ALIGN(PAGE_SIZE); __bss_stop = .; -- cgit v1.2.3 From 510bb96fe5b3480b4b22d815786377e54cb701e7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Apr 2019 10:46:07 +0200 Subject: x86/mm: Prevent bogus warnings with "noexec=off" Xose Vazquez Perez reported boot warnings when NX is disabled on the kernel command line. __early_set_fixmap() triggers this warning: attempted to set unsupported pgprot: 8000000000000163 bits: 8000000000000000 supported: 7fffffffffffffff WARNING: CPU: 0 PID: 0 at arch/x86/include/asm/pgtable.h:537 __early_set_fixmap+0xa2/0xff because it uses __default_kernel_pte_mask to mask out unsupported bits. Use __supported_pte_mask instead. Disabling NX on the command line also triggers the NX warning in the page table mapping check: WARNING: CPU: 1 PID: 1 at arch/x86/mm/dump_pagetables.c:262 note_page+0x2ae/0x650 .... Make the warning depend on NX set in __supported_pte_mask. Reported-by: Xose Vazquez Perez Tested-by: Xose Vazquez Perez Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1904151037530.1729@nanos.tec.linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 3 ++- arch/x86/mm/ioremap.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ee8f8ab46941..c0309ea9abee 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -259,7 +259,8 @@ static void note_wx(struct pg_state *st) #endif /* Account the WX pages */ st->wx_pages += npages; - WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n", + WARN_ONCE(__supported_pte_mask & _PAGE_NX, + "x86/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0029604af8a4..dd73d5d74393 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -825,7 +825,7 @@ void __init __early_set_fixmap(enum fixed_addresses idx, pte = early_ioremap_pte(addr); /* Sanitize 'prot' against any unsupported bits: */ - pgprot_val(flags) &= __default_kernel_pte_mask; + pgprot_val(flags) &= __supported_pte_mask; if (pgprot_val(flags)) set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); -- cgit v1.2.3 From 0082517fa4bce073e7cf542633439f26538a14cc Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Fri, 12 Apr 2019 16:01:53 +0800 Subject: x86/reboot, efi: Use EFI reboot for Acer TravelMate X514-51T Upon reboot, the Acer TravelMate X514-51T laptop appears to complete the shutdown process, but then it hangs in BIOS POST with a black screen. The problem is intermittent - at some points it has appeared related to Secure Boot settings or different kernel builds, but ultimately we have not been able to identify the exact conditions that trigger the issue to come and go. Besides, the EFI mode cannot be disabled in the BIOS of this model. However, after extensive testing, we observe that using the EFI reboot method reliably avoids the issue in all cases. So add a boot time quirk to use EFI reboot on such systems. Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=203119 Signed-off-by: Jian-Hong Pan Signed-off-by: Daniel Drake Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux@endlessm.com Link: http://lkml.kernel.org/r/20190412080152.3718-1-jian-hong@endlessm.com [ Fix !CONFIG_EFI build failure, clarify the code and the changelog a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 21 +++++++++++++++++++++ include/linux/efi.h | 7 ++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 725624b6c0c0..8fd3cedd9acc 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -81,6 +81,19 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +/* + * Some machines don't handle the default ACPI reboot method and + * require the EFI reboot method: + */ +static int __init set_efi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) { + reboot_type = BOOT_EFI; + pr_info("%s series board detected. 
Selecting EFI-method for reboot.\n", d->ident); + } + return 0; +} + void __noreturn machine_real_restart(unsigned int type) { local_irq_disable(); @@ -166,6 +179,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, + { /* Handle reboot issue on Acer TravelMate X514-51T */ + .callback = set_efi_reboot, + .ident = "Acer TravelMate X514-51T", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"), + }, + }, /* Apple */ { /* Handle problems with rebooting on Apple MacBook5 */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 54357a258b35..6ebc2098cfe1 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1611,7 +1611,12 @@ efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, struct screen_info *si, efi_guid_t *proto, unsigned long size); -bool efi_runtime_disabled(void); +#ifdef CONFIG_EFI +extern bool efi_runtime_disabled(void); +#else +static inline bool efi_runtime_disabled(void) { return true; } +#endif + extern void efi_call_virt_check_flags(unsigned long flags, const char *call); extern unsigned long efi_call_virt_save_flags(void); -- cgit v1.2.3 From 780e0106d468a2962b16b52fdf42898f2639e0a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Apr 2019 10:03:35 +0200 Subject: x86/mm/tlb: Revert "x86/mm: Align TLB invalidation info" Revert the following commit: 515ab7c41306: ("x86/mm: Align TLB invalidation info") I found out (the hard way) that under some .config options (notably L1_CACHE_SHIFT=7) and compiler combinations this on-stack alignment leads to a 320 byte stack usage, which then triggers a KASAN stack warning elsewhere. Using 320 bytes of stack space for a 40 byte structure is ludicrous and clearly not right. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Acked-by: Nadav Amit Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 515ab7c41306 ("x86/mm: Align TLB invalidation info") Link: http://lkml.kernel.org/r/20190416080335.GM7905@worktop.programming.kicks-ass.net [ Minor changelog edits. ] Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index bc4bc7b2f075..487b8474c01c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -728,7 +728,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, { int cpu; - struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { + struct flush_tlb_info info = { .mm = mm, .stride_shift = stride_shift, .freed_tables = freed_tables, -- cgit v1.2.3 From 690908104e39d37947f89d76388c876ce4ec5fda Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 15 Apr 2019 15:16:17 +0200 Subject: KVM: nVMX: allow tests to use bad virtual-APIC page address As mentioned in the comment, there are some special cases where we can simply clear the TPR shadow bit from the CPU-based execution controls in the vmcs02. Handle them so that we can remove some XFAILs from vmx.flat. 
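To make the new handling concrete, the three possible outcomes can be sketched as follows (an illustration only, not the literal diff; the condition names cr8_load_exiting, cr8_store_exiting and virtualize_apic_accesses stand in for the corresponding nested_cpu_has() checks in the patch below):

    if (!is_error_page(page)) {
            /* Normal case: point vmcs02 at the translated page. */
            vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
    } else if (cr8_load_exiting && cr8_store_exiting && !virtualize_apic_accesses) {
            /* The TPR shadow can never be consulted: drop the control. */
            vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_TPR_SHADOW);
    } else {
            /* No sane fallback: the VM entry will fail, dump state. */
            dump_vmcs();
    }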
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 25 ++++++++++++++++--------- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 2 ++ 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7ec9bb1dd723..a22af5a85540 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2873,20 +2873,27 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) /* * If translation failed, VM entry will fail because * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. - * Failing the vm entry is _not_ what the processor - * does but it's basically the only possibility we - * have. We could still enter the guest if CR8 load - * exits are enabled, CR8 store exits are enabled, and - * virtualize APIC access is disabled; in this case - * the processor would never use the TPR shadow and we - * could simply clear the bit from the execution - * control. But such a configuration is useless, so - * let's keep the code simple. */ if (!is_error_page(page)) { vmx->nested.virtual_apic_page = page; hpa = page_to_phys(vmx->nested.virtual_apic_page); vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && + nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* + * The processor will never use the TPR shadow, simply + * clear the bit from the execution control. Such a + * configuration is useless, but it happens in tests. + * For any other configuration, failing the vm entry is + * _not_ what the processor does but it's basically the + * only possibility we have. + */ + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_TPR_SHADOW); + } else { + printk("bad virtual-APIC page address\n"); + dump_vmcs(); } } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ab432a930ae8..7a8f75fc6b7e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5603,7 +5603,7 @@ static void vmx_dump_dtsel(char *name, uint32_t limit) vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } -static void dump_vmcs(void) +void dump_vmcs(void) { u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a1e00d0a2482..f879529906b4 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -517,4 +517,6 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); } +void dump_vmcs(void); + #endif /* __KVM_X86_VMX_H */ -- cgit v1.2.3 From 9d5dcc93a6ddfc78124f006ccd3637ce070ef2fc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Apr 2019 12:44:58 -0700 Subject: perf/x86: Fix incorrect PEBS_REGS PEBS_REGS is used as the mask of registers supported for large PEBS. However, the mask cannot correctly filter sample_regs_user/sample_regs_intr. (1ULL << PERF_REG_X86_*) should be used instead of the bare PERF_REG_X86_* value, which is only the register index. Rename PEBS_REGS to PEBS_GP_REGS, because the mask is only for general purpose registers. 
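To see why the old definition could not work: the PERF_REG_X86_* enumerators are plain indices (AX is 0, BX is 1, CX is 2, and so on), so OR-ing them together collapses into a small integer rather than a bitmap with one bit per register; the OR of indices 0-9 and 16-23 is just 0x1f. A minimal sketch of the difference (illustrative macro names, not from the patch):

    /* Indices OR'd together: 0 | 1 | 2 == 0x3, and AX contributes nothing. */
    #define BROKEN_MASK  (PERF_REG_X86_AX | PERF_REG_X86_BX | PERF_REG_X86_CX)

    /* One bit per register: 0x1 | 0x2 | 0x4 == 0x7, a real bitmap. */
    #define CORRECT_MASK ((1ULL << PERF_REG_X86_AX) | \
                          (1ULL << PERF_REG_X86_BX) | \
                          (1ULL << PERF_REG_X86_CX))

Since sample_regs_user/sample_regs_intr are bitmaps of (1ULL << PERF_REG_X86_*) values, masking them with the index form misclassifies almost every register.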
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Fixes: 2fe1bc1f501d ("perf/x86: Enable free running PEBS for REGS_USER/INTR") Link: https://lkml.kernel.org/r/20190402194509.2832-2-kan.liang@linux.intel.com [ Renamed it to PEBS_GP_REGS - as 'GPRS' is used elsewhere ;-) ] Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 2 +- arch/x86/events/perf_event.h | 38 +++++++++++++++++++------------------- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f61dcbef20ff..f9451566cd9b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3131,7 +3131,7 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) flags &= ~PERF_SAMPLE_TIME; if (!event->attr.exclude_kernel) flags &= ~PERF_SAMPLE_REGS_USER; - if (event->attr.sample_regs_user & ~PEBS_REGS) + if (event->attr.sample_regs_user & ~PEBS_GP_REGS) flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index a75955741c50..1e98a42b560a 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -96,25 +96,25 @@ struct amd_nb { PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ PERF_SAMPLE_PERIOD) -#define PEBS_REGS \ - (PERF_REG_X86_AX | \ - PERF_REG_X86_BX | \ - PERF_REG_X86_CX | \ - PERF_REG_X86_DX | \ - PERF_REG_X86_DI | \ - PERF_REG_X86_SI | \ - PERF_REG_X86_SP | \ - PERF_REG_X86_BP | \ - PERF_REG_X86_IP | \ - PERF_REG_X86_FLAGS | \ - PERF_REG_X86_R8 | \ - PERF_REG_X86_R9 | \ - PERF_REG_X86_R10 | \ - PERF_REG_X86_R11 | \ - PERF_REG_X86_R12 | \ - PERF_REG_X86_R13 | \ - PERF_REG_X86_R14 | \ - PERF_REG_X86_R15) +#define PEBS_GP_REGS \ + ((1ULL << PERF_REG_X86_AX) | \ + (1ULL << PERF_REG_X86_BX) | \ + (1ULL << PERF_REG_X86_CX) | \ + (1ULL << PERF_REG_X86_DX) | \ + (1ULL << PERF_REG_X86_DI) | \ + (1ULL << PERF_REG_X86_SI) | \ + (1ULL << PERF_REG_X86_SP) | \ + (1ULL << PERF_REG_X86_BP) | \ + (1ULL << PERF_REG_X86_IP) | \ + (1ULL << PERF_REG_X86_FLAGS) | \ + (1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) /* * Per register state. -- cgit v1.2.3 From f447e4eb3ad1e60d173ca997fcb2ef2a66f12574 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 8 Apr 2019 10:32:52 -0700 Subject: perf/x86/intel: Force resched when TFA sysctl is modified This patch provides a guarantee to the sysadmin that, when TFA is disabled, no PMU event is using PMC3 by the time the echo command returns. Vice versa, when TFA is enabled, the PMU can use PMC3 immediately (to eliminate possible multiplexing). 
$ perf stat -a -I 1000 --no-merge -e branches,branches,branches,branches 1.000123979 125,768,725,208 branches 1.000562520 125,631,000,456 branches 1.000942898 125,487,114,291 branches 1.001333316 125,323,363,620 branches 2.004721306 125,514,968,546 branches 2.005114560 125,511,110,861 branches 2.005482722 125,510,132,724 branches 2.005851245 125,508,967,086 branches 3.006323475 125,166,570,648 branches 3.006709247 125,165,650,056 branches 3.007086605 125,164,639,142 branches 3.007459298 125,164,402,912 branches 4.007922698 125,045,577,140 branches 4.008310775 125,046,804,324 branches 4.008670814 125,048,265,111 branches 4.009039251 125,048,677,611 branches 5.009503373 125,122,240,217 branches 5.009897067 125,122,450,517 branches Then, on another connection, the sysadmin does: $ echo 1 >/sys/devices/cpu/allow_tsx_force_abort Then perf stat adjusts the events immediately: 5.010286029 125,121,393,483 branches 5.010646308 125,120,556,786 branches 6.011113588 124,963,351,832 branches 6.011510331 124,964,267,566 branches 6.011889913 124,964,829,130 branches 6.012262996 124,965,841,156 branches 7.012708299 124,419,832,234 branches [79.69%] 7.012847908 124,416,363,853 branches [79.73%] 7.013225462 124,400,723,712 branches [79.73%] 7.013598191 124,376,154,434 branches [79.70%] 8.014089834 124,250,862,693 branches [74.98%] 8.014481363 124,267,539,139 branches [74.94%] 8.014856006 124,259,519,786 branches [74.98%] 8.014980848 124,225,457,969 branches [75.04%] 9.015464576 124,204,235,423 branches [75.03%] 9.015858587 124,204,988,490 branches [75.04%] 9.016243680 124,220,092,486 branches [74.99%] 9.016620104 124,231,260,146 branches [74.94%] And vice versa if the sysadmin does: $ echo 0 >/sys/devices/cpu/allow_tsx_force_abort Events are again spread over the 4 counters: 10.017096277 124,276,230,565 branches [74.96%] 10.017237209 124,228,062,171 branches [75.03%] 10.017478637 124,178,780,626 branches [75.03%] 10.017853402 124,198,316,177 branches [75.03%] 11.018334423 124,602,418,933 branches [85.40%] 11.018722584 124,602,921,320 branches [85.42%] 11.019095621 124,603,956,093 branches [85.42%] 11.019467742 124,595,273,783 branches [85.42%] 12.019945736 125,110,114,864 branches 12.020330764 125,109,334,472 branches 12.020688740 125,109,818,865 branches 12.021054020 125,108,594,014 branches 13.021516774 125,109,164,018 branches 13.021903640 125,108,794,510 branches 13.022270770 125,107,756,978 branches 13.022630819 125,109,380,471 branches 14.023114989 125,133,140,817 branches 14.023501880 125,133,785,858 branches 14.023868339 125,133,852,700 branches Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Cc: nelson.dsouza@intel.com Cc: tonyj@suse.com Link: https://lkml.kernel.org/r/20190408173252.37932-3-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 4 ++++ arch/x86/events/intel/core.c | 50 ++++++++++++++++++++++++++++++++++++++++++-- arch/x86/events/perf_event.h | 1 + 3 files changed, 53 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 87b50f4be201..fdd106267fd2 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -661,6 +661,10 @@ static inline int is_x86_event(struct perf_event *event) return event->pmu == &pmu; } +struct pmu *x86_get_pmu(void) +{ + return &pmu; +} /* * Event scheduler state: * diff --git 
a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1bb59c4c59f2..8265b5026a19 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4156,6 +4156,50 @@ done: return count; } +static void update_tfa_sched(void *ignored) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * check if PMC3 is used + * and if so force schedule out for all event types all contexts + */ + if (test_bit(3, cpuc->active_mask)) + perf_pmu_resched(x86_get_pmu()); +} + +static ssize_t show_sysctl_tfa(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", allow_tsx_force_abort); +} + +static ssize_t set_sysctl_tfa(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool val; + ssize_t ret; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + /* no change */ + if (val == allow_tsx_force_abort) + return count; + + allow_tsx_force_abort = val; + + get_online_cpus(); + on_each_cpu(update_tfa_sched, NULL, 1); + put_online_cpus(); + + return count; +} + + static DEVICE_ATTR_RW(freeze_on_smi); static ssize_t branches_show(struct device *cdev, @@ -4188,7 +4232,9 @@ static struct attribute *intel_pmu_caps_attrs[] = { NULL }; -static DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort); +static DEVICE_ATTR(allow_tsx_force_abort, 0644, + show_sysctl_tfa, + set_sysctl_tfa); static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, @@ -4697,7 +4743,7 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = tfa_get_event_constraints; x86_pmu.enable_all = intel_tfa_pmu_enable_all; x86_pmu.commit_scheduling = intel_tfa_commit_scheduling; - intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr.attr; + intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr; } pr_cont("Skylake events, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e544d83ea4b4..9e474a5f3b86 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -713,6 +713,7 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ .event_str_ht = ht, \ } +struct pmu *x86_get_pmu(void); extern struct x86_pmu x86_pmu __read_mostly; static inline bool x86_pmu_has_lbr_callstack(void) -- cgit v1.2.3 From 878068ea270ea82767ff1d26c91583263c81fba0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Apr 2019 12:44:59 -0700 Subject: perf/x86: Support outputting XMM registers Starting from Icelake, XMM registers can be collected in the PEBS record, but the current code only outputs the pt_regs. Add a new struct x86_perf_regs for both pt_regs and xmm_regs. The xmm_regs will be used later to keep a pointer to the PEBS record which has the XMM information. XMM registers are 128 bit. To simplify the code, they are handled like two different registers, which means setting two bits in the register bitmap. This also allows sampling only the lower 64 bits of an XMM register. The index of XMM registers starts at 32. There are 16 XMM registers, so all the reserved space for regs is used. Remove REG_RESERVED. Add PERF_REG_X86_XMM_MAX, which stands for the max number of all x86 regs including both GPRs and XMM. Add REG_NOSUPPORT for 32bit to exclude unsupported registers. Previous platforms cannot collect XMM information in the PEBS record. Add pebs_no_xmm_regs to indicate the unsupported platforms. The common code still validates the supported registers. However, it cannot check model-specific registers, e.g. XMM. 
Add an extra check in x86_pmu_hw_config() to reject invalid configurations of regs_user and regs_intr: regs_user never supports XMM collection, and regs_intr only supports XMM collection when sampling a PEBS event on Icelake and later platforms. Originally-by: Andi Kleen Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-3-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 15 +++++++++++++++ arch/x86/events/intel/ds.c | 4 +++- arch/x86/events/perf_event.h | 21 ++++++++++++++++++++- arch/x86/include/asm/perf_event.h | 5 +++++ arch/x86/include/uapi/asm/perf_regs.h | 23 ++++++++++++++++++++++- arch/x86/kernel/perf_regs.c | 27 ++++++++++++++++++++------- 6 files changed, 85 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index fdd106267fd2..de1a924a4914 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -560,6 +560,21 @@ int x86_pmu_hw_config(struct perf_event *event) return -EINVAL; } + /* sample_regs_user never support XMM registers */ + if (unlikely(event->attr.sample_regs_user & PEBS_XMM_REGS)) + return -EINVAL; + /* + * Besides the general purpose registers, XMM registers may + * be collected in PEBS on some platforms, e.g. Icelake + */ + if (unlikely(event->attr.sample_regs_intr & PEBS_XMM_REGS)) { + if (x86_pmu.pebs_no_xmm_regs) + return -EINVAL; + + if (!event->attr.precise_ip) + return -EINVAL; + } + return x86_setup_perfctr(event); } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 10c99ce1fead..f57e6cb7fd99 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1628,8 +1628,10 @@ void __init intel_ds_init(void) x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; - if (x86_pmu.version <= 4) + if (x86_pmu.version <= 4) { x86_pmu.pebs_no_isolation = 1; + x86_pmu.pebs_no_xmm_regs = 1; + } if (x86_pmu.pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; int format = x86_pmu.intel_cap.pebs_format; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9e474a5f3b86..7abfadb4f202 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -115,6 +115,24 @@ struct amd_nb { (1ULL << PERF_REG_X86_R14) | \ (1ULL << PERF_REG_X86_R15)) +#define PEBS_XMM_REGS \ + ((1ULL << PERF_REG_X86_XMM0) | \ + (1ULL << PERF_REG_X86_XMM1) | \ + (1ULL << PERF_REG_X86_XMM2) | \ + (1ULL << PERF_REG_X86_XMM3) | \ + (1ULL << PERF_REG_X86_XMM4) | \ + (1ULL << PERF_REG_X86_XMM5) | \ + (1ULL << PERF_REG_X86_XMM6) | \ + (1ULL << PERF_REG_X86_XMM7) | \ + (1ULL << PERF_REG_X86_XMM8) | \ + (1ULL << PERF_REG_X86_XMM9) | \ + (1ULL << PERF_REG_X86_XMM10) | \ + (1ULL << PERF_REG_X86_XMM11) | \ + (1ULL << PERF_REG_X86_XMM12) | \ + (1ULL << PERF_REG_X86_XMM13) | \ + (1ULL << PERF_REG_X86_XMM14) | \ + (1ULL << PERF_REG_X86_XMM15)) + /* * Per register state. 
*/ @@ -612,7 +630,8 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1, pebs_no_tlb :1, - pebs_no_isolation :1; + pebs_no_isolation :1, + pebs_no_xmm_regs :1; int pebs_record_size; int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8bdf74902293..d9f5bbe44b3c 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -248,6 +248,11 @@ extern void perf_events_lapic_init(void); #define PERF_EFLAGS_VM (1UL << 5) struct pt_regs; +struct x86_perf_regs { + struct pt_regs regs; + u64 *xmm_regs; +}; + extern unsigned long perf_instruction_pointer(struct pt_regs *regs); extern unsigned long perf_misc_flags(struct pt_regs *regs); #define perf_misc_flags(regs) perf_misc_flags(regs) diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h index f3329cabce5c..ac67bbea10ca 100644 --- a/arch/x86/include/uapi/asm/perf_regs.h +++ b/arch/x86/include/uapi/asm/perf_regs.h @@ -27,8 +27,29 @@ enum perf_event_x86_regs { PERF_REG_X86_R13, PERF_REG_X86_R14, PERF_REG_X86_R15, - + /* These are the limits for the GPRs. */ PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1, PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1, + + /* These all need two bits set because they are 128bit */ + PERF_REG_X86_XMM0 = 32, + PERF_REG_X86_XMM1 = 34, + PERF_REG_X86_XMM2 = 36, + PERF_REG_X86_XMM3 = 38, + PERF_REG_X86_XMM4 = 40, + PERF_REG_X86_XMM5 = 42, + PERF_REG_X86_XMM6 = 44, + PERF_REG_X86_XMM7 = 46, + PERF_REG_X86_XMM8 = 48, + PERF_REG_X86_XMM9 = 50, + PERF_REG_X86_XMM10 = 52, + PERF_REG_X86_XMM11 = 54, + PERF_REG_X86_XMM12 = 56, + PERF_REG_X86_XMM13 = 58, + PERF_REG_X86_XMM14 = 60, + PERF_REG_X86_XMM15 = 62, + + /* These include both GPRs and XMMX registers */ + PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2, }; #endif /* _ASM_X86_PERF_REGS_H */ diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index c06c4c16c6b6..07c30ee17425 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -59,18 +59,34 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { u64 perf_reg_value(struct pt_regs *regs, int idx) { + struct x86_perf_regs *perf_regs; + + if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) { + perf_regs = container_of(regs, struct x86_perf_regs, regs); + if (!perf_regs->xmm_regs) + return 0; + return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0]; + } + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) return 0; return regs_get_register(regs, pt_regs_offset[idx]); } -#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL)) - #ifdef CONFIG_X86_32 +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) + int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) + if (!mask || (mask & REG_NOSUPPORT)) return -EINVAL; return 0; @@ -96,10 +112,7 @@ void perf_get_regs_user(struct perf_regs *regs_user, int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) - return -EINVAL; - - if (mask & REG_NOSUPPORT) + if (!mask || (mask & REG_NOSUPPORT)) return -EINVAL; return 0; -- cgit v1.2.3 From 48f38aa4cc5a48bc0fe85c5c4b1ab171fbb539b6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 2 Apr 2019 12:45:00 -0700 Subject: perf/x86/intel: Extract memory code PEBS parser for reuse Extract 
some code related to memory profiling from the PEBS record parser into separate functions. It can be reused by the upcoming adaptive PEBS parser. No functional changes. Rename intel_hsw_weight to intel_get_tsx_weight, and intel_hsw_transaction to intel_get_tsx_transaction, because the input is no longer the HSW PEBS format. Signed-off-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-4-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 63 +++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index f57e6cb7fd99..2f80b7e282ff 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1125,34 +1125,50 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } -static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) +static inline u64 intel_get_tsx_weight(u64 tsx_tuning) { - if (pebs->tsx_tuning) { - union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; + if (tsx_tuning) { + union hsw_tsx_tuning tsx = { .value = tsx_tuning }; return tsx.cycles_last_block; } return 0; } -static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) +static inline u64 intel_get_tsx_transaction(u64 tsx_tuning, u64 ax) { - u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + u64 txn = (tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; /* For RTM XABORTs also log the abort code from AX */ - if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) - txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; + if ((txn & PERF_TXN_TRANSACTION) && (ax & 1)) + txn |= ((ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; return txn; } +#define PERF_X86_EVENT_PEBS_HSW_PREC \ + (PERF_X86_EVENT_PEBS_ST_HSW | \ + PERF_X86_EVENT_PEBS_LD_HSW | \ + PERF_X86_EVENT_PEBS_NA_HSW) + +static u64 get_data_src(struct perf_event *event, u64 aux) +{ + u64 val = PERF_MEM_NA; + int fl = event->hw.flags; + bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); + + if (fl & PERF_X86_EVENT_PEBS_LDLAT) + val = load_latency_data(aux); + else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) + val = precise_datala_hsw(event, aux); + else if (fst) + val = precise_store_data(aux); + return val; +} + static void setup_pebs_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, struct pt_regs *regs) { -#define PERF_X86_EVENT_PEBS_HSW_PREC \ - (PERF_X86_EVENT_PEBS_ST_HSW | \ - PERF_X86_EVENT_PEBS_LD_HSW | \ - PERF_X86_EVENT_PEBS_NA_HSW) /* * We cast to the biggest pebs_record but are careful not to * unconditionally access the 'extra' entries. 
@@ -1160,17 +1176,13 @@ static void setup_pebs_sample_data(struct perf_event *event, struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct pebs_record_skl *pebs = __pebs; u64 sample_type; - int fll, fst, dsrc; - int fl = event->hw.flags; + int fll; if (pebs == NULL) return; sample_type = event->attr.sample_type; - dsrc = sample_type & PERF_SAMPLE_DATA_SRC; - - fll = fl & PERF_X86_EVENT_PEBS_LDLAT; - fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); + fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT; perf_sample_data_init(data, 0, event->hw.last_period); @@ -1185,16 +1197,8 @@ static void setup_pebs_sample_data(struct perf_event *event, /* * data.data_src encodes the data source */ - if (dsrc) { - u64 val = PERF_MEM_NA; - if (fll) - val = load_latency_data(pebs->dse); - else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) - val = precise_datala_hsw(event, pebs->dse); - else if (fst) - val = precise_store_data(pebs->dse); - data->data_src.val = val; - } + if (sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = get_data_src(event, pebs->dse); /* * We must however always use iregs for the unwinder to stay sane; the @@ -1281,10 +1285,11 @@ static void setup_pebs_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_hsw_weight(pebs); + data->weight = intel_get_tsx_weight(pebs->tsx_tuning); if (sample_type & PERF_SAMPLE_TRANSACTION) - data->txn = intel_hsw_transaction(pebs); + data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, + pebs->ax); } /* -- cgit v1.2.3 From 477f00f9617009a9a3a9271885231573b728ca4f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Apr 2019 12:45:01 -0700 Subject: perf/x86/intel/ds: Extract code of event update in short period The drain_pebs() could be called twice in a short period for auto-reload event in pmu::read(). The intel_pmu_save_and_restart_reload() should be called to update the event->count. This case should also be handled on Icelake. Extract the code for later reuse. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-5-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 2f80b7e282ff..9ac73860645d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1491,6 +1491,25 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) __intel_pmu_pebs_event(event, iregs, at, top, 0, n); } +static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) +{ + struct perf_event *event; + int bit; + + /* + * The drain_pebs() could be called twice in a short period + * for auto-reload event in pmu::read(). There are no + * overflows have happened in between. + * It needs to call intel_pmu_save_and_restart_reload() to + * update the event->count for this case. 
+ */ + for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) { + event = cpuc->events[bit]; + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + intel_pmu_save_and_restart_reload(event, 0); + } +} + static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1518,19 +1537,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) } if (unlikely(base >= top)) { - /* - * The drain_pebs() could be called twice in a short period - * for auto-reload event in pmu::read(). There are no - * overflows have happened in between. - * It needs to call intel_pmu_save_and_restart_reload() to - * update the event->count for this case. - */ - for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, - size) { - event = cpuc->events[bit]; - if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) - intel_pmu_save_and_restart_reload(event, 0); - } + intel_pmu_pebs_event_update_no_drain(cpuc, size); return; } -- cgit v1.2.3 From c22497f5838c237e3094a4dfb99d1c5de6353239 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Apr 2019 12:45:02 -0700 Subject: perf/x86/intel: Support adaptive PEBS v4 Adaptive PEBS is a new way to report PEBS sampling information. Instead of a fixed size record for all PEBS events, it allows configuring the PEBS record to include only the information needed. Events can then opt in to use such an extended record, or stay with a basic record which only contains the IP. The major new feature is support for LBRs in the PEBS record. Besides normal LBR, this allows (much faster) large PEBS, while still supporting callstacks through callstack LBR. So essentially a lot of profiling can now be done without frequent interrupts, dropping the overhead significantly. The main requirement still is to use a period, and not use frequency mode, because frequency mode requires reevaluating the frequency on each overflow. The floating point state (XMM) is also supported, which allows efficient profiling of FP function arguments. Introduce a specific drain function to handle the variable-length records. Use a new callback to parse the new record format, and also handle the STATUS field now being at a different offset. Add code to set up the configuration register. Since there is only a single register, all events either get the full superset of all events, or only the basic record. Originally-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-6-kan.liang@linux.intel.com [ Renamed GPRS => GP. 
] Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 7 + arch/x86/events/intel/ds.c | 380 +++++++++++++++++++++++++++++++++++--- arch/x86/events/intel/lbr.c | 22 +++ arch/x86/events/perf_event.h | 11 +- arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/perf_event.h | 43 +++++ 6 files changed, 438 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8265b5026a19..bdc366d709aa 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2145,6 +2145,11 @@ static void intel_pmu_enable_fixed(struct perf_event *event) bits <<= (idx * 4); mask = 0xfULL << (idx * 4); + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { + bits |= ICL_FIXED_0_ADAPTIVE << (idx * 4); + mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4); + } + rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; ctrl_val |= bits; @@ -3510,6 +3515,8 @@ static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) { + cpuc->pebs_record_size = x86_pmu.pebs_record_size; + if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { cpuc->shared_regs = allocate_shared_regs(cpu); if (!cpuc->shared_regs) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9ac73860645d..6436452d6342 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -906,17 +906,87 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) if (cpuc->n_pebs == cpuc->n_large_pebs) { threshold = ds->pebs_absolute_maximum - - reserved * x86_pmu.pebs_record_size; + reserved * cpuc->pebs_record_size; } else { - threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + threshold = ds->pebs_buffer_base + cpuc->pebs_record_size; } ds->pebs_interrupt_threshold = threshold; } +static void adaptive_pebs_record_size_update(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 pebs_data_cfg = cpuc->pebs_data_cfg; + int sz = sizeof(struct pebs_basic); + + if (pebs_data_cfg & PEBS_DATACFG_MEMINFO) + sz += sizeof(struct pebs_meminfo); + if (pebs_data_cfg & PEBS_DATACFG_GP) + sz += sizeof(struct pebs_gprs); + if (pebs_data_cfg & PEBS_DATACFG_XMMS) + sz += sizeof(struct pebs_xmm); + if (pebs_data_cfg & PEBS_DATACFG_LBRS) + sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + + cpuc->pebs_record_size = sz; +} + +#define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ + PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ + PERF_SAMPLE_TRANSACTION) + +static u64 pebs_update_adaptive_cfg(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + u64 sample_type = attr->sample_type; + u64 pebs_data_cfg = 0; + bool gprs, tsx_weight; + + if (!(sample_type & ~(PERF_SAMPLE_IP|PERF_SAMPLE_TIME)) && + attr->precise_ip > 1) + return pebs_data_cfg; + + if (sample_type & PERF_PEBS_MEMINFO_TYPE) + pebs_data_cfg |= PEBS_DATACFG_MEMINFO; + + /* + * We need GPRs when: + * + user requested them + * + precise_ip < 2 for the non event IP + * + For RTM TSX weight we need GPRs for the abort code. 
+ */ + gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && + (attr->sample_regs_intr & PEBS_GP_REGS); + + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + ((attr->config & INTEL_ARCH_EVENT_MASK) == + x86_pmu.rtm_abort_event); + + if (gprs || (attr->precise_ip < 2) || tsx_weight) + pebs_data_cfg |= PEBS_DATACFG_GP; + + if ((sample_type & PERF_SAMPLE_REGS_INTR) && + (attr->sample_regs_intr & PEBS_XMM_REGS)) + pebs_data_cfg |= PEBS_DATACFG_XMMS; + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + /* + * For now always log all LBRs. Could configure this + * later. + */ + pebs_data_cfg |= PEBS_DATACFG_LBRS | + ((x86_pmu.lbr_nr-1) << PEBS_DATACFG_LBR_SHIFT); + } + + return pebs_data_cfg; +} + static void -pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu) +pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, + struct perf_event *event, bool add) { + struct pmu *pmu = event->ctx->pmu; /* * Make sure we get updated with the first PEBS * event. It will trigger also during removal, but @@ -933,6 +1003,29 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu) update = true; } + /* + * The PEBS record doesn't shrink on pmu::del(). Doing so would require + * iterating all remaining PEBS events to reconstruct the config. + */ + if (x86_pmu.intel_cap.pebs_baseline && add) { + u64 pebs_data_cfg; + + /* Clear pebs_data_cfg and pebs_record_size for first PEBS. */ + if (cpuc->n_pebs == 1) { + cpuc->pebs_data_cfg = 0; + cpuc->pebs_record_size = sizeof(struct pebs_basic); + } + + pebs_data_cfg = pebs_update_adaptive_cfg(event); + + /* Update pebs_record_size if new event requires more data. */ + if (pebs_data_cfg & ~cpuc->pebs_data_cfg) { + cpuc->pebs_data_cfg |= pebs_data_cfg; + adaptive_pebs_record_size_update(); + update = true; + } + } + if (update) pebs_update_threshold(cpuc); } @@ -947,7 +1040,7 @@ void intel_pmu_pebs_add(struct perf_event *event) if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs++; - pebs_update_state(needed_cb, cpuc, event->ctx->pmu); + pebs_update_state(needed_cb, cpuc, event, true); } void intel_pmu_pebs_enable(struct perf_event *event) @@ -965,6 +1058,14 @@ void intel_pmu_pebs_enable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled |= 1ULL << 63; + if (x86_pmu.intel_cap.pebs_baseline) { + hwc->config |= ICL_EVENTSEL_ADAPTIVE; + if (cpuc->pebs_data_cfg != cpuc->active_pebs_data_cfg) { + wrmsrl(MSR_PEBS_DATA_CFG, cpuc->pebs_data_cfg); + cpuc->active_pebs_data_cfg = cpuc->pebs_data_cfg; + } + } + /* * Use auto-reload if possible to save a MSR write in the PMI. * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD. 
@@ -991,7 +1092,7 @@ void intel_pmu_pebs_del(struct perf_event *event) if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs--; - pebs_update_state(needed_cb, cpuc, event->ctx->pmu); + pebs_update_state(needed_cb, cpuc, event, false); } void intel_pmu_pebs_disable(struct perf_event *event) @@ -1144,6 +1245,13 @@ static inline u64 intel_get_tsx_transaction(u64 tsx_tuning, u64 ax) return txn; } +static inline u64 get_pebs_status(void *n) +{ + if (x86_pmu.intel_cap.pebs_format < 4) + return ((struct pebs_record_nhm *)n)->status; + return ((struct pebs_basic *)n)->applicable_counters; +} + #define PERF_X86_EVENT_PEBS_HSW_PREC \ (PERF_X86_EVENT_PEBS_ST_HSW | \ PERF_X86_EVENT_PEBS_LD_HSW | \ @@ -1164,7 +1272,7 @@ static u64 get_data_src(struct perf_event *event, u64 aux) return val; } -static void setup_pebs_sample_data(struct perf_event *event, +static void setup_pebs_fixed_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, struct pt_regs *regs) @@ -1306,6 +1414,140 @@ static void setup_pebs_sample_data(struct perf_event *event, data->br_stack = &cpuc->lbr_stack; } +static void adaptive_pebs_save_regs(struct pt_regs *regs, + struct pebs_gprs *gprs) +{ + regs->ax = gprs->ax; + regs->bx = gprs->bx; + regs->cx = gprs->cx; + regs->dx = gprs->dx; + regs->si = gprs->si; + regs->di = gprs->di; + regs->bp = gprs->bp; + regs->sp = gprs->sp; +#ifndef CONFIG_X86_32 + regs->r8 = gprs->r8; + regs->r9 = gprs->r9; + regs->r10 = gprs->r10; + regs->r11 = gprs->r11; + regs->r12 = gprs->r12; + regs->r13 = gprs->r13; + regs->r14 = gprs->r14; + regs->r15 = gprs->r15; +#endif +} + +/* + * With adaptive PEBS the layout depends on what fields are configured. + */ + +static void setup_pebs_adaptive_sample_data(struct perf_event *event, + struct pt_regs *iregs, void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct pebs_basic *basic = __pebs; + void *next_record = basic + 1; + u64 sample_type; + u64 format_size; + struct pebs_meminfo *meminfo = NULL; + struct pebs_gprs *gprs = NULL; + struct x86_perf_regs *perf_regs; + + if (basic == NULL) + return; + + perf_regs = container_of(regs, struct x86_perf_regs, regs); + perf_regs->xmm_regs = NULL; + + sample_type = event->attr.sample_type; + format_size = basic->format_size; + perf_sample_data_init(data, 0, event->hw.last_period); + data->period = event->hw.last_period; + + if (event->attr.use_clockid == 0) + data->time = native_sched_clock_from_tsc(basic->tsc); + + /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happened between the record and + * PMI. + */ + if (sample_type & PERF_SAMPLE_CALLCHAIN) + data->callchain = perf_callchain(event, iregs); + + *regs = *iregs; + /* The ip in basic is EventingIP */ + set_linear_ip(regs, basic->ip); + regs->flags = PERF_EFLAGS_EXACT; + + /* + * The record for MEMINFO is in front of GP + * But PERF_SAMPLE_TRANSACTION needs gprs->ax. + * Save the pointer here but process later. 
+ */ + if (format_size & PEBS_DATACFG_MEMINFO) { + meminfo = next_record; + next_record = meminfo + 1; + } + + if (format_size & PEBS_DATACFG_GP) { + gprs = next_record; + next_record = gprs + 1; + + if (event->attr.precise_ip < 2) { + set_linear_ip(regs, gprs->ip); + regs->flags &= ~PERF_EFLAGS_EXACT; + } + + if (sample_type & PERF_SAMPLE_REGS_INTR) + adaptive_pebs_save_regs(regs, gprs); + } + + if (format_size & PEBS_DATACFG_MEMINFO) { + if (sample_type & PERF_SAMPLE_WEIGHT) + data->weight = meminfo->latency ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + + if (sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = get_data_src(event, meminfo->aux); + + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) + data->addr = meminfo->address; + + if (sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, + gprs ? gprs->ax : 0); + } + + if (format_size & PEBS_DATACFG_XMMS) { + struct pebs_xmm *xmm = next_record; + + next_record = xmm + 1; + perf_regs->xmm_regs = xmm->xmm; + } + + if (format_size & PEBS_DATACFG_LBRS) { + struct pebs_lbr *lbr = next_record; + int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) + & 0xff) + 1; + next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + + if (has_branch_stack(event)) { + intel_pmu_store_pebs_lbrs(lbr); + data->br_stack = &cpuc->lbr_stack; + } + } + + WARN_ONCE(next_record != __pebs + (format_size >> 48), + "PEBS record size %llu, expected %llu, config %llx\n", + format_size >> 48, + (u64)(next_record - __pebs), + basic->format_size); +} + static inline void * get_next_pebs_record_by_bit(void *base, void *top, int bit) { @@ -1323,19 +1565,19 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) if (base == NULL) return NULL; - for (at = base; at < top; at += x86_pmu.pebs_record_size) { - struct pebs_record_nhm *p = at; + for (at = base; at < top; at += cpuc->pebs_record_size) { + unsigned long status = get_pebs_status(at); - if (test_bit(bit, (unsigned long *)&p->status)) { + if (test_bit(bit, (unsigned long *)&status)) { /* PEBS v3 has accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) return at; - if (p->status == (1 << bit)) + if (status == (1 << bit)) return at; /* clear non-PEBS bit and re-check */ - pebs_status = p->status & cpuc->pebs_enabled; + pebs_status = status & cpuc->pebs_enabled; pebs_status &= PEBS_COUNTER_MASK; if (pebs_status == (1 << bit)) return at; @@ -1415,11 +1657,18 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *base, void *top, - int bit, int count) + int bit, int count, + void (*setup_sample)(struct perf_event *, + struct pt_regs *, + void *, + struct perf_sample_data *, + struct pt_regs *)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct perf_sample_data data; - struct pt_regs regs; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; void *at = get_next_pebs_record_by_bit(base, top, bit); if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { @@ -1434,20 +1683,20 @@ static void __intel_pmu_pebs_event(struct perf_event *event, return; while (count > 1) { - setup_pebs_sample_data(event, iregs, at, &data, ®s); - perf_event_output(event, &data, ®s); - at += x86_pmu.pebs_record_size; + setup_sample(event, iregs, at, &data, regs); + perf_event_output(event, &data, regs); + at += cpuc->pebs_record_size; at = get_next_pebs_record_by_bit(at, top, 
bit); count--; } - setup_pebs_sample_data(event, iregs, at, &data, ®s); + setup_sample(event, iregs, at, &data, regs); /* * All but the last records are processed. * The last one is left to be able to call the overflow handler. */ - if (perf_event_overflow(event, &data, ®s)) { + if (perf_event_overflow(event, &data, regs)) { x86_pmu_stop(event, 0); return; } @@ -1488,7 +1737,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) return; } - __intel_pmu_pebs_event(event, iregs, at, top, 0, n); + __intel_pmu_pebs_event(event, iregs, at, top, 0, n, + setup_pebs_fixed_sample_data); } static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) @@ -1550,8 +1800,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) /* PEBS v3 has more accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { - for_each_set_bit(bit, (unsigned long *)&pebs_status, - size) + for_each_set_bit(bit, (unsigned long *)&pebs_status, size) counts[bit]++; continue; @@ -1590,8 +1839,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * If collision happened, the record will be dropped. */ if (p->status != (1ULL << bit)) { - for_each_set_bit(i, (unsigned long *)&pebs_status, - x86_pmu.max_pebs_events) + for_each_set_bit(i, (unsigned long *)&pebs_status, size) error[i]++; continue; } @@ -1599,7 +1847,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) counts[bit]++; } - for (bit = 0; bit < size; bit++) { + for_each_set_bit(bit, (unsigned long *)&mask, size) { if ((counts[bit] == 0) && (error[bit] == 0)) continue; @@ -1620,11 +1868,66 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) if (counts[bit]) { __intel_pmu_pebs_event(event, iregs, base, - top, bit, counts[bit]); + top, bit, counts[bit], + setup_pebs_fixed_sample_data); } } } +static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs) +{ + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event; + void *base, *at, *top; + int bit, size; + u64 mask; + + if (!x86_pmu.pebs_active) + return; + + base = (struct pebs_basic *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_basic *)(unsigned long)ds->pebs_index; + + ds->pebs_index = ds->pebs_buffer_base; + + mask = ((1ULL << x86_pmu.max_pebs_events) - 1) | + (((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); + size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; + + if (unlikely(base >= top)) { + intel_pmu_pebs_event_update_no_drain(cpuc, size); + return; + } + + for (at = base; at < top; at += cpuc->pebs_record_size) { + u64 pebs_status; + + pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; + pebs_status &= mask; + + for_each_set_bit(bit, (unsigned long *)&pebs_status, size) + counts[bit]++; + } + + for_each_set_bit(bit, (unsigned long *)&mask, size) { + if (counts[bit] == 0) + continue; + + event = cpuc->events[bit]; + if (WARN_ON_ONCE(!event)) + continue; + + if (WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + __intel_pmu_pebs_event(event, iregs, base, + top, bit, counts[bit], + setup_pebs_adaptive_sample_data); + } +} + /* * BTS, PEBS probe and setup */ @@ -1646,8 +1949,12 @@ void __init intel_ds_init(void) } if (x86_pmu.pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? 
'+' : '-'; + char *pebs_qual = ""; int format = x86_pmu.intel_cap.pebs_format; + if (format < 4) + x86_pmu.intel_cap.pebs_baseline = 0; + switch (format) { case 0: pr_cont("PEBS fmt0%c, ", pebs_type); @@ -1683,6 +1990,29 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; + case 4: + x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl; + x86_pmu.pebs_record_size = sizeof(struct pebs_basic); + if (x86_pmu.intel_cap.pebs_baseline) { + x86_pmu.large_pebs_flags |= + PERF_SAMPLE_BRANCH_STACK | + PERF_SAMPLE_TIME; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + pebs_qual = "-baseline"; + } else { + /* Only basic record supported */ + x86_pmu.pebs_no_xmm_regs = 1; + x86_pmu.large_pebs_flags &= + ~(PERF_SAMPLE_ADDR | + PERF_SAMPLE_TIME | + PERF_SAMPLE_DATA_SRC | + PERF_SAMPLE_TRANSACTION | + PERF_SAMPLE_REGS_USER | + PERF_SAMPLE_REGS_INTR); + } + pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); + break; + default: pr_cont("no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 580c1b91c454..07b7175fc378 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1080,6 +1080,28 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } +void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + cpuc->lbr_stack.nr = x86_pmu.lbr_nr; + for (i = 0; i < x86_pmu.lbr_nr; i++) { + u64 info = lbr->lbr[i].info; + struct perf_branch_entry *e = &cpuc->lbr_entries[i]; + + e->from = lbr->lbr[i].from; + e->to = lbr->lbr[i].to; + e->mispred = !!(info & LBR_INFO_MISPRED); + e->predicted = !(info & LBR_INFO_MISPRED); + e->in_tx = !!(info & LBR_INFO_IN_TX); + e->abort = !!(info & LBR_INFO_ABORT); + e->cycles = info & LBR_INFO_CYCLES; + e->reserved = 0; + } + intel_pmu_lbr_filter(cpuc); +} + /* * Map interface branch filters onto LBR filters */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7abfadb4f202..2059c143946f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -224,6 +224,11 @@ struct cpu_hw_events { int n_pebs; int n_large_pebs; + /* Current super set of events hardware configuration */ + u64 pebs_data_cfg; + u64 active_pebs_data_cfg; + int pebs_record_size; + /* * Intel LBR bits */ @@ -490,6 +495,7 @@ union perf_capabilities { * values > 32bit. 
*/ u64 full_width_write:1; + u64 pebs_baseline:1; }; u64 capabilities; }; @@ -634,11 +640,12 @@ struct x86_pmu { pebs_no_xmm_regs :1; int pebs_record_size; int pebs_buffer_size; + int max_pebs_events; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); - int max_pebs_events; unsigned long large_pebs_flags; + u64 rtm_abort_event; /* * Intel LBR @@ -978,6 +985,8 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); +void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); + void intel_ds_init(void); void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ca5bc0eacb95..1378518cf63f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -116,6 +116,7 @@ #define LBR_INFO_CYCLES 0xffff #define MSR_IA32_PEBS_ENABLE 0x000003f1 +#define MSR_PEBS_DATA_CFG 0x000003f2 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index d9f5bbe44b3c..997a6587d7cf 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -32,6 +32,8 @@ #define HSW_IN_TX (1ULL << 32) #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) +#define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) +#define ICL_FIXED_0_ADAPTIVE (1ULL << 32) #define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36) #define AMD64_EVENTSEL_GUESTONLY (1ULL << 40) @@ -87,6 +89,12 @@ #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 #define ARCH_PERFMON_EVENTS_COUNT 7 +#define PEBS_DATACFG_MEMINFO BIT_ULL(0) +#define PEBS_DATACFG_GP BIT_ULL(1) +#define PEBS_DATACFG_XMMS BIT_ULL(2) +#define PEBS_DATACFG_LBRS BIT_ULL(3) +#define PEBS_DATACFG_LBR_SHIFT 24 + /* * Intel "Architectural Performance Monitoring" CPUID * detection/enumeration details: @@ -176,6 +184,41 @@ struct x86_pmu_capability { #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) +/* + * Adaptive PEBS v4 + */ + +struct pebs_basic { + u64 format_size; + u64 ip; + u64 applicable_counters; + u64 tsc; +}; + +struct pebs_meminfo { + u64 address; + u64 aux; + u64 latency; + u64 tsx_tuning; +}; + +struct pebs_gprs { + u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di; + u64 r8, r9, r10, r11, r12, r13, r14, r15; +}; + +struct pebs_xmm { + u64 xmm[16*2]; /* two entries for each register */ +}; + +struct pebs_lbr_entry { + u64 from, to, info; +}; + +struct pebs_lbr { + struct pebs_lbr_entry lbr[0]; /* Variable length */ +}; + /* * IBS cpuid feature detection */ -- cgit v1.2.3 From d3617b98b04583df222f34992e65712862a77bf1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 2 Apr 2019 12:45:03 -0700 Subject: perf/x86/lbr: Avoid reading the LBRs when adaptive PEBS handles them With adaptive PEBS the CPU can directly supply the LBR information, so we don't need to read it again. But the LBRs still need to be enabled. Add a special count to the cpuc that distinguishes these two cases, and avoid reading the LBRs unnecessarily when PEBS is active. 
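The bookkeeping is easiest to see with an example (hypothetical counts, purely illustrative): with two precise events whose LBR data arrives via adaptive PEBS and no other LBR users, lbr_users == lbr_pebs_users == 2, so the PMI path can skip the expensive LBR MSR reads entirely; add one conventional LBR event and the counts diverge (3 vs 2), so the reads happen again. The gate added to intel_pmu_lbr_read() below amounts to:

    /* Sketch of the new read-side check (simplified from the diff below): */
    if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users)
            return; /* nothing to read, or adaptive PEBS already captured the LBRs */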
Signed-off-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-7-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/lbr.c | 13 ++++++++++++- arch/x86/events/perf_event.h | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 07b7175fc378..6f814a27416b 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -488,6 +488,8 @@ void intel_pmu_lbr_add(struct perf_event *event) * be 'new'. Conversely, a new event can get installed through the * context switch path for the first time. */ + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) + cpuc->lbr_pebs_users++; perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); @@ -507,8 +509,11 @@ void intel_pmu_lbr_del(struct perf_event *event) task_ctx->lbr_callstack_users--; } + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) + cpuc->lbr_pebs_users--; cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); + WARN_ON_ONCE(cpuc->lbr_pebs_users < 0); perf_sched_cb_dec(event->ctx->pmu); } @@ -658,7 +663,13 @@ void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (!cpuc->lbr_users) + /* + * Don't read when all LBRs users are using adaptive PEBS. + * + * This could be smarter and actually check the event, + * but this simple approach seems to work for now. + */ + if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users) return; if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2059c143946f..dced91582147 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -233,6 +233,7 @@ struct cpu_hw_events { * Intel LBR bits */ int lbr_users; + int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; -- cgit v1.2.3 From 63b79f6ebc464afb730bc45762c820795e276da1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Apr 2019 12:45:04 -0700 Subject: perf/x86: Support constraint ranges Icelake extended the general counters to 8, even when SMT is enabled. However only a (large) subset of the events can be used on all 8 counters. The events that can or cannot be used on all counters are organized in ranges. A lot of scheduler constraints are required to handle all this. To avoid blowing up the tables add event code ranges to the constraint tables, and a new inline function to match them. 
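The single-comparison matcher relies on unsigned wraparound; a worked example (values for illustration, the helper itself is in the diff below):

    /*
     * constraint_match() tests: ((ecode & c->cmask) - c->code) <= (u64)c->size
     *
     * For the range 0x48..0x54: .code = 0x48, .size = 0x54 - 0x48 = 0x0c
     *     ecode 0x50: 0x50 - 0x48 = 0x08 <= 0x0c -> match
     *     ecode 0x55: 0x55 - 0x48 = 0x0d >  0x0c -> no match
     *     ecode 0x47: 0x47 - 0x48 wraps to ~0ULL -> no match
     *
     * Codes below the range are rejected for free by the unsigned
     * subtraction, and a plain single-code constraint has .size = 0,
     * which degrades to the old exact-match comparison.
     */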
Originally-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) # developer hat on Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) # maintainer hat on Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-8-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/ds.c | 2 +- arch/x86/events/perf_event.h | 43 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 39 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bdc366d709aa..d4b52896f173 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2693,7 +2693,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) { + if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 6436452d6342..4429bfa92fbc 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -858,7 +858,7 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) if (x86_pmu.pebs_constraints) { for_each_event_constraint(c, x86_pmu.pebs_constraints) { - if ((event->hw.config & c->cmask) == c->code) { + if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index dced91582147..0ff0c5ae8c29 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -49,13 +49,19 @@ struct event_constraint { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; u64 idxmsk64; }; - u64 code; - u64 cmask; - int weight; - int overlap; - int flags; + u64 code; + u64 cmask; + int weight; + int overlap; + int flags; + unsigned int size; }; +static inline bool constraint_match(struct event_constraint *c, u64 ecode) +{ + return ((ecode & c->cmask) - c->code) <= (u64)c->size; +} + /* * struct hw_perf_event.flags flags */ @@ -280,18 +286,29 @@ struct cpu_hw_events { void *kfree_on_online[X86_PERF_KFREE_MAX]; }; -#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ +#define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) { \ { .idxmsk64 = (n) }, \ .code = (c), \ + .size = (e) - (c), \ .cmask = (m), \ .weight = (w), \ .overlap = (o), \ .flags = f, \ } +#define __EVENT_CONSTRAINT(c, n, m, w, o, f) \ + __EVENT_CONSTRAINT_RANGE(c, c, n, m, w, o, f) + #define EVENT_CONSTRAINT(c, n, m) \ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) +/* + * The constraint_match() function only works for 'simple' event codes + * and not for extended (AMD64_EVENTSEL_EVENT) events codes. 
+ */ +#define EVENT_CONSTRAINT_RANGE(c, e, n, m) \ + __EVENT_CONSTRAINT_RANGE(c, e, n, m, HWEIGHT(n), 0, 0) + #define INTEL_EXCLEVT_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ 0, PERF_X86_EVENT_EXCL) @@ -326,6 +343,12 @@ struct cpu_hw_events { #define INTEL_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) +/* + * Constraint on a range of Event codes + */ +#define INTEL_EVENT_CONSTRAINT_RANGE(c, e, n) \ + EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT) + /* * Constraint on the Event code + UMask + fixed-mask * @@ -373,6 +396,9 @@ struct cpu_hw_events { #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) +#define INTEL_FLAGS_EVENT_CONSTRAINT_RANGE(c, e, n) \ + EVENT_CONSTRAINT_RANGE(c, e, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + /* Check only flags, but allow all event/umask */ #define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS) @@ -389,6 +415,11 @@ struct cpu_hw_events { ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(code, end, n) \ + __EVENT_CONSTRAINT_RANGE(code, end, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) + #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ __EVENT_CONSTRAINT(code, n, \ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ -- cgit v1.2.3 From 6017608936c1825ff5d7325270484042f597edff Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Apr 2019 12:45:05 -0700 Subject: perf/x86/intel: Add Icelake support Add Icelake core PMU perf code, including constraint tables and the main enable code. Icelake expanded the generic counters to always 8, even with HT on, but a range of events cannot be scheduled on the extra 4 counters. Add new constraint ranges to describe this to the scheduler. The number of constraints that need to be checked is larger now than with earlier CPUs. At some point we may need a new data structure to look them up more efficiently than with linear search. So far it still seems to be acceptable, however. Icelake added a new fixed counter SLOTS. Full support for it is added later in the patch series. The cache events table is identical to Skylake. Compared to a PEBS instruction event on a generic counter, fixed counter 0 has less skid, so instruction:ppp is always forced onto fixed counter 0.
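A note on the hex masks in the constraint tables that follow: the second argument of these macros is a counter-index bitmask, with general-purpose counters in the low bits and fixed counters starting at bit 32 (INTEL_PMC_IDX_FIXED). A standalone decoder of that convention (illustrative, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define INTEL_PMC_IDX_FIXED 32  /* matches the asm/perf_event.h define */

static void decode_idxmsk(uint64_t msk)
{
    for (int i = 0; i < 64; i++) {
        if (!(msk & (1ULL << i)))
            continue;
        if (i < INTEL_PMC_IDX_FIXED)
            printf("GP%d ", i);
        else
            printf("FIXED%d ", i - INTEL_PMC_IDX_FIXED);
    }
    printf("\n");
}

int main(void)
{
    decode_idxmsk(0xf);             /* GP0-GP3: the "old" four counters     */
    decode_idxmsk(0xff);            /* GP0-GP7: all eight Icelake counters  */
    decode_idxmsk(0x100000000ULL);  /* FIXED0: e.g. INST_RETIRED.PREC_DIST  */
    return 0;
}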
Originally-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-9-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 111 ++++++++++++++++++++++++++++++++++++++ arch/x86/events/intel/ds.c | 25 ++++++++- arch/x86/events/perf_event.h | 2 + arch/x86/include/asm/intel_ds.h | 2 +- arch/x86/include/asm/perf_event.h | 2 +- 5 files changed, 138 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d4b52896f173..156c6ae53d3c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -239,6 +239,35 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct event_constraint intel_icl_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x1c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf), + INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */ + INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */ + INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ + INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */ + INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_icl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff9fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -3374,6 +3403,9 @@ static struct event_constraint counter0_constraint = static struct event_constraint counter2_constraint = EVENT_CONSTRAINT(0, 0x4, 0); +static struct event_constraint fixed0_constraint = + FIXED_EVENT_CONSTRAINT(0x00c0, 0); + static struct event_constraint * hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -3392,6 +3424,21 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return c; } +static struct event_constraint * +icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + /* + * Fixed counter 0 has less skid. 
+ * Force instruction:ppp in Fixed counter 0 + */ + if ((event->attr.precise_ip == 3) && + constraint_match(&fixed0_constraint, event->hw.config)) + return &fixed0_constraint; + + return hsw_get_event_constraints(cpuc, idx, event); +} + static struct event_constraint * glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -4124,6 +4171,42 @@ static struct attribute *hsw_tsx_events_attrs[] = { NULL }; +EVENT_ATTR_STR(tx-capacity-read, tx_capacity_read, "event=0x54,umask=0x80"); +EVENT_ATTR_STR(tx-capacity-write, tx_capacity_write, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(el-capacity-read, el_capacity_read, "event=0x54,umask=0x80"); +EVENT_ATTR_STR(el-capacity-write, el_capacity_write, "event=0x54,umask=0x2"); + +static struct attribute *icl_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + NULL, +}; + +static struct attribute *icl_tsx_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_capacity_read), + EVENT_PTR(tx_capacity_write), + EVENT_PTR(tx_conflict), + EVENT_PTR(el_start), + EVENT_PTR(el_abort), + EVENT_PTR(el_commit), + EVENT_PTR(el_capacity_read), + EVENT_PTR(el_capacity_write), + EVENT_PTR(el_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + NULL, +}; + +static __init struct attribute **get_icl_events_attrs(void) +{ + return boot_cpu_has(X86_FEATURE_RTM) ? + merge_attr(icl_events_attrs, icl_tsx_events_attrs) : + icl_events_attrs; +} + static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -4757,6 +4840,34 @@ __init int intel_pmu_init(void) name = "skylake"; break; + case INTEL_FAM6_ICELAKE_MOBILE: + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_icl_event_constraints; + x86_pmu.pebs_constraints = intel_icl_pebs_event_constraints; + x86_pmu.extra_regs = intel_icl_extra_regs; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = icl_get_event_constraints; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? 
+ hsw_format_attr : nhm_format_attr; + extra_attr = merge_attr(extra_attr, skl_format_attr); + x86_pmu.cpu_events = get_icl_events_attrs(); + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); + x86_pmu.lbr_pt_coexist = true; + intel_pmu_pebs_data_source_skl(false); + pr_cont("Icelake events, "); + name = "icelake"; + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4429bfa92fbc..7a9f5dac5abe 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -849,6 +849,26 @@ struct event_constraint intel_skl_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_icl_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x400000000ULL), /* SLOTS */ + + INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), /* MEM_INST_RETIRED.LOAD */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), /* MEM_INST_RETIRED.STORE */ + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */ + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */ + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. + */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -1053,7 +1073,7 @@ void intel_pmu_pebs_enable(struct perf_event *event) cpuc->pebs_enabled |= 1ULL << hwc->idx; - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled |= 1ULL << 63; @@ -1105,7 +1125,8 @@ void intel_pmu_pebs_disable(struct perf_event *event) cpuc->pebs_enabled &= ~(1ULL << hwc->idx); - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && + (x86_pmu.version < 5)) cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 0ff0c5ae8c29..07fc84bb85c1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -999,6 +999,8 @@ extern struct event_constraint intel_bdw_pebs_event_constraints[]; extern struct event_constraint intel_skl_pebs_event_constraints[]; +extern struct event_constraint intel_icl_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index ae26df1c2789..8380c3ddd4b2 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -8,7 +8,7 @@ /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS 8 -#define MAX_FIXED_PEBS_EVENTS 3 +#define MAX_FIXED_PEBS_EVENTS 4 /* * A debug store configuration. 
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 997a6587d7cf..04768a3a5454 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -7,7 +7,7 @@ */ #define INTEL_PMC_MAX_GENERIC 32 -#define INTEL_PMC_MAX_FIXED 3 +#define INTEL_PMC_MAX_FIXED 4 #define INTEL_PMC_IDX_FIXED 32 #define X86_PMC_IDX_MAX 64 -- cgit v1.2.3 From f08c47d1f86c6dc666c7e659d94bf6d4492aa9d7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Apr 2019 12:45:06 -0700 Subject: perf/x86/intel/cstate: Add Icelake support Icelake uses the same C-state residency events as Sandy Bridge. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-10-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cstate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 94a4b7fc75d0..dd5658ec31d5 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -578,6 +578,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_MOBILE, snb_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); -- cgit v1.2.3 From b3377c3acb9e54cf86efcfe25f2e792bca599ed4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Apr 2019 12:45:07 -0700 Subject: perf/x86/intel/rapl: Add Icelake support Icelake supports the same RAPL counters as Skylake. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-11-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 94dc564146ca..37ebf6fc5415 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -775,6 +775,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init), + + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, skl_rapl_init), {}, }; -- cgit v1.2.3 From cf50d79a8cfe5adae37fec026220b009559bbeed Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Apr 2019 12:45:08 -0700 Subject: perf/x86/msr: Add Icelake support Icelake is the same as the existing Skylake parts.
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-12-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/msr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index a878e6286e4a..f3f4c2263501 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -89,6 +89,7 @@ static bool test_intel(int idx) case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_KABYLAKE_MOBILE: case INTEL_FAM6_KABYLAKE_DESKTOP: + case INTEL_FAM6_ICELAKE_MOBILE: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; -- cgit v1.2.3 From 6e394376ee89233508fa21d006546357f8efee31 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Apr 2019 12:45:09 -0700 Subject: perf/x86/intel/uncore: Add Intel Icelake uncore support Add Intel Icelake uncore support: - The init code is based on Skylake - Add new PCI id for IMC - New MSR address for CBOX - Get CBOX# from CNL_UNC_CBO_CONFIG MSR directly - Create a new PMU for fixed clocktick counter Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-13-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 6 +++ arch/x86/events/intel/uncore.h | 1 + arch/x86/events/intel/uncore_snb.c | 91 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 9fe64c01a2e5..fc40a1473058 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1367,6 +1367,11 @@ static const struct intel_uncore_init_fun skx_uncore_init __initconst = { .pci_init = skx_uncore_pci_init, }; +static const struct intel_uncore_init_fun icl_uncore_init __initconst = { + .cpu_init = icl_uncore_cpu_init, + .pci_init = skl_uncore_pci_init, +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), @@ -1393,6 +1398,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, icl_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 853a49a8ccf6..79eb2e21e4f0 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -512,6 +512,7 @@ int skl_uncore_pci_init(void); void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); +void icl_uncore_cpu_init(void); int snb_pci2phy_map_init(int devid); /* uncore_snbep.c */ diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 13493f43b247..f8431819b3e1 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ 
-34,6 +34,8 @@ #define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC 0x3e33 #define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC 0x3eca #define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC 0x3e32 +#define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 +#define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -93,6 +95,12 @@ #define SKL_UNC_PERF_GLOBAL_CTL 0xe01 #define SKL_UNC_GLOBAL_CTL_CORE_ALL ((1 << 5) - 1) +/* ICL Cbo register */ +#define ICL_UNC_CBO_CONFIG 0x396 +#define ICL_UNC_NUM_CBO_MASK 0xf +#define ICL_UNC_CBO_0_PER_CTR0 0x702 +#define ICL_UNC_CBO_MSR_OFFSET 0x8 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); @@ -280,6 +288,70 @@ void skl_uncore_cpu_init(void) snb_uncore_arb.ops = &skl_uncore_msr_ops; } +static struct intel_uncore_type icl_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .perf_ctr_bits = 44, + .perf_ctr = ICL_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = ICL_UNC_CBO_MSR_OFFSET, + .ops = &skl_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct uncore_event_desc icl_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff"), + { /* end: all zeroes */ }, +}; + +static struct attribute *icl_uncore_clock_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group icl_uncore_clock_format_group = { + .name = "format", + .attrs = icl_uncore_clock_formats_attr, +}; + +static struct intel_uncore_type icl_uncore_clockbox = { + .name = "clock", + .num_counters = 1, + .num_boxes = 1, + .fixed_ctr_bits = 48, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_CTL_EV_SEL_MASK, + .format_group = &icl_uncore_clock_format_group, + .ops = &skl_uncore_msr_ops, + .event_descs = icl_uncore_events, +}; + +static struct intel_uncore_type *icl_msr_uncores[] = { + &icl_uncore_cbox, + &snb_uncore_arb, + &icl_uncore_clockbox, + NULL, +}; + +static int icl_get_cbox_num(void) +{ + u64 num_boxes; + + rdmsrl(ICL_UNC_CBO_CONFIG, num_boxes); + + return num_boxes & ICL_UNC_NUM_CBO_MASK; +} + +void icl_uncore_cpu_init(void) +{ + uncore_msr_uncores = icl_msr_uncores; + icl_uncore_cbox.num_boxes = icl_get_cbox_num(); + snb_uncore_arb.ops = &skl_uncore_msr_ops; +} + enum { SNB_PCI_UNCORE_IMC, }; @@ -668,6 +740,18 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { { /* end: all zeroes */ }, }; +static const struct pci_device_id icl_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + static struct pci_driver snb_uncore_pci_driver = { .name = "snb_uncore", .id_table = snb_uncore_pci_ids, @@ -693,6 +777,11 @@ static struct pci_driver skl_uncore_pci_driver = { .id_table = skl_uncore_pci_ids, }; +static struct pci_driver icl_uncore_pci_driver = { + .name = "icl_uncore", + .id_table = icl_uncore_pci_ids, +}; + struct imc_uncore_pci_dev { __u32 pci_id; struct pci_driver *driver; @@ -732,6 +821,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Server */ 
IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Server */ IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Server */ + IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ + IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ { /* end marker */ } }; -- cgit v1.2.3 From 6daeb8737f8a93c6d3a3ae57e23dd3dbe8b239da Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 10 Apr 2019 11:57:09 -0700 Subject: perf/x86/intel: Add Tremont core PMU support Add perf core PMU support for Intel Tremont CPU. The init code is based on Goldmont plus. The generic purpose counter 0 and fixed counter 0 have less skid. Force :ppp events on generic purpose counter 0. Force instruction:ppp on generic purpose counter 0 and fixed counter 0. Updates LLC cache event table and OFFCORE_RESPONSE mask. Adaptive PEBS, which is already enabled on ICL, is also supported on Tremont. No extra code required. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: https://lkml.kernel.org/r/1554922629-126287-3-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 91 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 156c6ae53d3c..4b4dac089635 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1856,6 +1856,45 @@ static __initconst const u64 glp_hw_cache_extra_regs }, }; +#define TNT_LOCAL_DRAM BIT_ULL(26) +#define TNT_DEMAND_READ GLM_DEMAND_DATA_RD +#define TNT_DEMAND_WRITE GLM_DEMAND_RFO +#define TNT_LLC_ACCESS GLM_ANY_RESPONSE +#define TNT_SNP_ANY (SNB_SNP_NOT_NEEDED|SNB_SNP_MISS| \ + SNB_NO_FWD|SNB_SNP_FWD|SNB_HITM) +#define TNT_LLC_MISS (TNT_SNP_ANY|SNB_NON_DRAM|TNT_LOCAL_DRAM) + +static __initconst const u64 tnt_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TNT_DEMAND_READ| + TNT_LLC_ACCESS, + [C(RESULT_MISS)] = TNT_DEMAND_READ| + TNT_LLC_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TNT_DEMAND_WRITE| + TNT_LLC_ACCESS, + [C(RESULT_MISS)] = TNT_DEMAND_WRITE| + TNT_LLC_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, +}; + +static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xffffff9fffull, RSP_1), + EVENT_EXTRA_END +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -3406,6 +3445,9 @@ static struct event_constraint counter2_constraint = static struct event_constraint fixed0_constraint = FIXED_EVENT_CONSTRAINT(0x00c0, 0); +static struct event_constraint fixed0_counter0_constraint = + INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000001ULL); + static struct event_constraint * hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -3454,6 +3496,29 @@ glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, 
return c; } +static struct event_constraint * +tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + /* + * :ppp means to do reduced skid PEBS, + * which is available on PMC0 and fixed counter 0. + */ + if (event->attr.precise_ip == 3) { + /* Force instruction:ppp on PMC0 and Fixed counter 0 */ + if (constraint_match(&fixed0_constraint, event->hw.config)) + return &fixed0_counter0_constraint; + + return &counter0_constraint; + } + + c = intel_get_event_constraints(cpuc, idx, event); + + return c; +} + static bool allow_tsx_force_abort = true; static struct event_constraint * @@ -4585,6 +4650,32 @@ __init int intel_pmu_init(void) name = "goldmont_plus"; break; + case INTEL_FAM6_ATOM_TREMONT_X: + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.extra_regs = intel_tnt_extra_regs; + /* + * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS + * for precise cycles. + */ + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.get_event_constraints = tnt_get_event_constraints; + extra_attr = slm_format_attr; + pr_cont("Tremont events, "); + name = "Tremont"; + break; + case INTEL_FAM6_WESTMERE: case INTEL_FAM6_WESTMERE_EP: case INTEL_FAM6_WESTMERE_EX: -- cgit v1.2.3 From ba696429d290690db967e5f49463df4b2c1314a4 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 3 Apr 2019 19:03:09 +0200 Subject: x86/hyper-v: Implement EOI assist Hyper-V TLFS suggests an optimization to avoid imminent VMExit on EOI: "The OS performs an EOI by atomically writing zero to the EOI Assist field of the virtual VP assist page and checking whether the "No EOI required" field was previously zero. If it was, the OS must write to the HV_X64_APIC_EOI MSR thereby triggering an intercept into the hypervisor." Implement the optimization in Linux. Tested-by: Long Li Signed-off-by: Vitaly Kuznetsov Cc: Borislav Petkov Cc: Haiyang Zhang Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Michael Kelley (EOSG) Cc: Peter Zijlstra Cc: Sasha Levin Cc: Simon Xiao Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: linux-hyperv@vger.kernel.org Link: http://lkml.kernel.org/r/20190403170309.4107-1-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/hv_apic.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 8eb6fbee8e13..5c056b8aebef 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -86,6 +86,11 @@ static void hv_apic_write(u32 reg, u32 val) static void hv_apic_eoi_write(u32 reg, u32 val) { + struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()]; + + if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1)) + return; + wrmsr(HV_X64_MSR_EOI, val, 0); } -- cgit v1.2.3 From a943245adc9ae31942af752e879fbbc182166573 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 16 Apr 2019 11:57:51 +0100 Subject: x86/Kconfig: Fix spelling mistake "effectivness" -> "effectiveness" The Kconfig text contains a spelling mistake, fix it. 
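As a minimal model of the EOI-assist protocol implemented in the Hyper-V commit above: it amounts to an atomic exchange on the assist-page field followed by a conditional MSR write. In this user-space sketch the atomic variable and the MSR stub are illustrative stand-ins, not the kernel implementation:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t apic_assist;   /* models the VP assist page field */

static void wrmsr_eoi(uint32_t val)    /* stand-in for the HV_X64_MSR_EOI write */
{
    printf("EOI MSR write (%u): intercept into the hypervisor\n", val);
}

static void hv_apic_eoi_write(uint32_t val)
{
    /* atomically clear the field; bit 0 is "No EOI required" */
    if (atomic_exchange(&apic_assist, 0) & 0x1)
        return;                        /* hypervisor already handled the EOI */
    wrmsr_eoi(val);
}

int main(void)
{
    atomic_store(&apic_assist, 1);     /* hypervisor set "No EOI required" */
    hv_apic_eoi_write(0);              /* skipped: no VMExit               */
    hv_apic_eoi_write(0);              /* field now 0: falls through       */
    return 0;
}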
Signed-off-by: Colin Ian King Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20190416105751.18899-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c1f9b3cf437c..01dbb05bc498 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1499,7 +1499,7 @@ config X86_CPA_STATISTICS depends on DEBUG_FS ---help--- Expose statistics about the Change Page Attribute mechanims, which - helps to determine the effectivness of preserving large and huge + helps to determine the effectiveness of preserving large and huge page mappings when mapping protections are changed. config ARCH_HAS_MEM_ENCRYPT -- cgit v1.2.3 From 2b27924bb1d48e3775f432b70bdad5e6dd4e7798 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 15 Apr 2019 15:57:19 +0200 Subject: KVM: nVMX: always use early vmcs check when EPT is disabled The remaining failures of vmx.flat when EPT is disabled are caused by incorrectly reflecting VMfails to the L1 hypervisor. What happens is that nested_vmx_restore_host_state corrupts the guest CR3, reloading it with the host's shadow CR3 instead, because it blindly loads GUEST_CR3 from the vmcs01. For simplicity let's just always use hardware VMCS checks when EPT is disabled. This way, nested_vmx_restore_host_state is not reached at all (or at least shouldn't be reached). Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/vmx.h | 1 + arch/x86/kvm/vmx/nested.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index f0b0c90dd398..d213ec5c3766 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -146,6 +146,7 @@ #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 +#define VMX_ABORT_VMCS_CORRUPTED 3 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a22af5a85540..6401eb7ef19c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3796,8 +3796,18 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); nested_ept_uninit_mmu_context(vcpu); - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + /* + * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3 + * points to shadow pages! Fortunately we only get here after a WARN_ON + * if EPT is disabled, so a VMabort is perfectly fine. + */ + if (enable_ept) { + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + } else { + nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED); + } /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs @@ -5745,6 +5755,14 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; + /* + * Without EPT it is not possible to restore L1's CR3 and PDPTR on + * VMfail, because they are not available in vmcs01. Just always + * use hardware checks. 
+ */ + if (!enable_ept) + nested_early_check = 1; + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { -- cgit v1.2.3 From bc8a3d8925a8fa09fa550e0da115d95851ce33c6 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 8 Apr 2019 11:07:30 -0700 Subject: kvm: mmu: Fix overflow on kvm mmu page limit calculation KVM bases its memory usage limits on the total number of guest pages across all memslots. However, those limits, and the calculations to produce them, use 32 bit unsigned integers. This can result in overflow if a VM has more guest pages than can be represented by a u32. As a result of this overflow, KVM can use a low limit on the number of MMU pages it will allocate. This makes KVM unable to map all of guest memory at once, prompting spurious faults. Tested: Ran all kvm-unit-tests on an Intel Haswell machine. This patch introduced no new failures. Signed-off-by: Ben Gardon Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 12 ++++++------ arch/x86/kvm/mmu.c | 13 ++++++------- arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 159b5988292f..9b7b731a0032 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -126,7 +126,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) } #define KVM_PERMILLE_MMU_PAGES 20 -#define KVM_MIN_ALLOC_MMU_PAGES 64 +#define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 @@ -844,9 +844,9 @@ enum kvm_irqchip_mode { }; struct kvm_arch { - unsigned int n_used_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_max_mmu_pages; + unsigned long n_used_mmu_pages; + unsigned long n_requested_mmu_pages; + unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* @@ -1256,8 +1256,8 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); -unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); +unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); bool pdptrs_changed(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 85f753728953..e10962dfc203 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2007,7 +2007,7 @@ static int is_empty_shadow_page(u64 *spt) * aggregate version in order to make the slab shrinker * faster */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr) { kvm->arch.n_used_mmu_pages += nr; percpu_counter_add(&kvm_total_used_mmu_pages, nr); @@ -2763,7 +2763,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get dead lock */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) +void kvm_mmu_change_mmu_pages(struct kvm *kvm,
unsigned long goal_nr_mmu_pages) { LIST_HEAD(invalid_list); @@ -6031,10 +6031,10 @@ out: /* * Calculate mmu pages needed for kvm. */ -unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) +unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) { - unsigned int nr_mmu_pages; - unsigned int nr_pages = 0; + unsigned long nr_mmu_pages; + unsigned long nr_pages = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; @@ -6047,8 +6047,7 @@ unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) } nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; - nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES); return nr_mmu_pages; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bbdc60f2fae8..54c2a377795b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -64,7 +64,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); -static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) +static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) { if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) return kvm->arch.n_max_mmu_pages - diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 099b851dabaf..455f156f56ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4270,7 +4270,7 @@ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, - u32 kvm_nr_mmu_pages) + unsigned long kvm_nr_mmu_pages) { if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; @@ -4284,7 +4284,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) +static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { return kvm->arch.n_max_mmu_pages; } -- cgit v1.2.3 From 4a58038b9e420276157785afa0a0bbb4b9bc2265 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Wed, 20 Mar 2019 08:12:28 +0000 Subject: Revert "svm: Fix AVIC incomplete IPI emulation" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit bb218fbcfaaa3b115d4cd7a43c0ca164f3a96e57. As Oren Twaig pointed out in the old discussion (https://patchwork.kernel.org/patch/8292231/), the change could potentially cause an extra IPI to be sent to the destination vcpu, because the AVIC hardware already set the IRR bit before the incomplete IPI #VMEXIT with id=1 (target vcpu is not running), and writing to ICR and ICR2 will also set the IRR. If something triggers the destination vcpu to get scheduled before the emulation finishes, this could result in an additional IPI. Also, the issue mentioned in commit bb218fbcfaaa was misdiagnosed.
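The u32 overflow addressed in the kvm/mmu page-limit fix above is easy to demonstrate in isolation. A tiny standalone program (illustrative constants only, not kernel code) using a 1 TiB guest, i.e. 2^28 4-KiB pages:

#include <stdint.h>
#include <stdio.h>

#define KVM_PERMILLE_MMU_PAGES 20  /* 2% of guest pages, as in KVM */

int main(void)
{
    uint64_t nr_pages = (1ULL << 40) / 4096;  /* 1 TiB guest */

    /* old: nr_pages * 20 wraps around in 32-bit arithmetic */
    uint32_t broken = (uint32_t)nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
    /* new: the same computation done in a 64-bit type */
    uint64_t fixed = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;

    /* prints 1073741 vs 5368709: the broken limit is about 5x too small */
    printf("u32 limit: %u pages, u64 limit: %llu pages\n",
           broken, (unsigned long long)fixed);
    return 0;
}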
Cc: Radim Krčmář Cc: Paolo Bonzini Reported-by: Oren Twaig Signed-off-by: Suravee Suthikulpanit Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e0a791c3d4fc..d7b14c902052 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4517,14 +4517,25 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) kvm_lapic_reg_write(apic, APIC_ICR, icrl); break; case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { + int i; + struct kvm_vcpu *vcpu; + struct kvm *kvm = svm->vcpu.kvm; struct kvm_lapic *apic = svm->vcpu.arch.apic; /* - * Update ICR high and low, then emulate sending IPI, - * which is handled when writing APIC_ICR. + * At this point, we expect that the AVIC HW has already + * set the appropriate IRR bits on the valid target + * vcpus. So, we just need to kick the appropriate vcpu. */ - kvm_lapic_reg_write(apic, APIC_ICR2, icrh); - kvm_lapic_reg_write(apic, APIC_ICR, icrl); + kvm_for_each_vcpu(i, vcpu, kvm) { + bool m = kvm_apic_match_dest(vcpu, apic, + icrl & KVM_APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & KVM_APIC_DEST_MASK); + + if (m && !avic_vcpu_is_running(vcpu)) + kvm_vcpu_wake_up(vcpu); + } break; } case AVIC_IPI_FAILURE_INVALID_TARGET: -- cgit v1.2.3 From e44e3eacccfd2294a1ce279f68452b1635d7fa82 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Tue, 26 Mar 2019 03:57:37 +0000 Subject: svm/avic: Fix invalidate logical APIC id entry Only clear the valid bit when invalidating a logical APIC id entry. The current logic clears the valid bit, but also sets the rest of the bits (including reserved bits) to 1. Fixes: 98d90582be2e ('svm: Fix AVIC DFR and LDR handling') Signed-off-by: Suravee Suthikulpanit Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d7b14c902052..933f19d840fe 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -262,6 +262,7 @@ struct amd_svm_iommu_ir { }; #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) @@ -4607,7 +4608,7 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); if (entry) - WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK); + clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 99c221796a810055974b54c02e8f53297e48d146 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 3 Apr 2019 16:06:42 +0200 Subject: KVM: x86: svm: make sure NMI is injected after nmi_singlestep I noticed that the apic test from kvm-unit-tests always hangs on my EPYC 7401P: the hanging test, nmi-after-sti, tries to deliver 30000 NMIs, and tracing shows that we're sometimes able to deliver a few but never all. When we're trying to inject an NMI we may fail to do so immediately for various reasons; however, we still need to inject it, so enable_nmi_window() arms nmi_singlestep mode. #DB occurs as expected, but we're not checking for pending NMIs before entering the guest, and unless there's a different event to process, the NMI will never get delivered.
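The failure mode is easy to model. Below is a highly simplified toy (not the actual KVM control flow) showing why the NMI stays pending unless an event check is requested when nmi_singlestep is disarmed:

#include <stdbool.h>
#include <stdio.h>

struct vcpu {
    bool req_event;       /* models KVM_REQ_EVENT */
    bool nmi_pending;
    bool nmi_singlestep;
};

/* entry path: pending events are only re-evaluated on an event request */
static void vcpu_enter_guest(struct vcpu *v)
{
    if (v->req_event) {
        v->req_event = false;
        if (v->nmi_pending) {
            v->nmi_pending = false;
            printf("NMI injected\n");
            return;
        }
    }
    if (v->nmi_pending)
        printf("NMI still pending, never checked\n");
}

static void db_interception(struct vcpu *v, bool with_fix)
{
    if (v->nmi_singlestep) {
        v->nmi_singlestep = false;
        if (with_fix)
            v->req_event = true;  /* the one-line fix below */
    }
}

int main(void)
{
    struct vcpu v = { .nmi_pending = true, .nmi_singlestep = true };

    db_interception(&v, true);  /* pass false to see the hang */
    vcpu_enter_guest(&v);
    return 0;
}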
Make a KVM_REQ_EVENT request on the vCPU from db_interception() to make sure pending NMIs are checked and possibly injected. Signed-off-by: Vitaly Kuznetsov Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 933f19d840fe..c6815aef2cac 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2693,6 +2693,7 @@ static int npf_interception(struct vcpu_svm *svm) static int db_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; + struct kvm_vcpu *vcpu = &svm->vcpu; if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && @@ -2703,6 +2704,8 @@ static int db_interception(struct vcpu_svm *svm) if (svm->nmi_singlestep) { disable_nmi_singlestep(svm); + /* Make sure we check for pending NMIs upon entry */ + kvm_make_request(KVM_REQ_EVENT, vcpu); } if (svm->vcpu.guest_debug & -- cgit v1.2.3 From 1811d979c71621aafc7b879477202d286f7e863b Mon Sep 17 00:00:00 2001 From: WANG Chao Date: Fri, 12 Apr 2019 15:55:39 +0800 Subject: x86/kvm: move kvm_load/put_guest_xcr0 into atomic context The guest xcr0 could leak into the host when an MCE happens in guest mode, because do_machine_check() could schedule out at a few places. For example: kvm_load_guest_xcr0 ... kvm_x86_ops->run(vcpu) { vmx_vcpu_run vmx_complete_atomic_exit kvm_machine_check do_machine_check do_memory_failure memory_failure lock_page In this case, host_xcr0 is 0x2ff, guest vcpu xcr0 is 0xff. After scheduling out, the host CPU has the guest xcr0 loaded (0xff). In __switch_to { switch_fpu_finish copy_kernel_to_fpregs XRSTORS If any bit i in XSTATE_BV[i] == 1 and xcr0[i] == 0, XRSTORS will generate #GP (in this case, bit 9). Then ex_handler_fprestore kicks in and tries to reinitialize the FPU by restoring the init FPU state. Same story as the last #GP, except we get a DOUBLE FAULT this time.
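What the move buys can be stated as an invariant: the guest xcr0 is only ever live inside the IRQ-disabled (or GIF-cleared) window around VM entry, so nothing that can schedule ever observes it. A toy assertion of that invariant (simplified model, not the KVM code):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool atomic_section;     /* models IRQs off / GIF clear      */
static bool guest_xcr0_loaded;  /* models vcpu->guest_xcr0_loaded   */

static void load_guest_xcr0(void) { guest_xcr0_loaded = true; }
static void put_guest_xcr0(void)  { guest_xcr0_loaded = false; }

/* anything that may schedule must never see the guest xcr0 loaded */
static void maybe_schedule(void)
{
    if (!atomic_section)
        assert(!guest_xcr0_loaded);
}

int main(void)
{
    atomic_section = true;      /* clgi() / local_irq_disable()     */
    load_guest_xcr0();
    /* guest runs; #MC work is deferred while we are atomic */
    put_guest_xcr0();
    atomic_section = false;     /* stgi() / local_irq_enable()      */
    maybe_schedule();           /* safe: host xcr0 already restored */
    puts("invariant holds");
    return 0;
}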
Cc: stable@vger.kernel.org Signed-off-by: WANG Chao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 4 ++++ arch/x86/kvm/x86.c | 10 ++++------ arch/x86/kvm/x86.h | 2 ++ 4 files changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c6815aef2cac..675cecb3fa9c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5636,6 +5636,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.cr2 = vcpu->arch.cr2; clgi(); + kvm_load_guest_xcr0(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -5781,6 +5782,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); + kvm_put_guest_xcr0(vcpu); stgi(); /* Any pending NMI will happen here */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7a8f75fc6b7e..88060a621db2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6410,6 +6410,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + kvm_load_guest_xcr0(vcpu); + if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && vcpu->arch.pkru != vmx->host_pkru) @@ -6506,6 +6508,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) __write_pkru(vmx->host_pkru); } + kvm_put_guest_xcr0(vcpu); + vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 455f156f56ed..f05891b8df7c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -800,7 +800,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); -static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) { if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && !vcpu->guest_xcr0_loaded) { @@ -810,8 +810,9 @@ static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) vcpu->guest_xcr0_loaded = 1; } } +EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); -static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) { if (vcpu->guest_xcr0_loaded) { if (vcpu->arch.xcr0 != host_xcr0) @@ -819,6 +820,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) vcpu->guest_xcr0_loaded = 0; } } +EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0); static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { @@ -7865,8 +7867,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } - kvm_load_guest_xcr0(vcpu); - if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_x86_ops->request_immediate_exit(vcpu); @@ -7919,8 +7919,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_put_guest_xcr0(vcpu); - kvm_before_interrupt(vcpu); kvm_x86_ops->handle_external_intr(vcpu); kvm_after_interrupt(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 28406aa1136d..aedc5d0d4989 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -347,4 +347,6 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) __this_cpu_write(current_vcpu, NULL); } +void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); +void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); #endif -- cgit v1.2.3 From 672ff6cff80ca43bf3258410d2b887036969df5f Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Mon, 25 Mar 2019 21:10:17 +0200 Subject: KVM: x86: Raise #GP when guest vCPU do not support PMU 
Before this change, reading a VMware pseudo PMC would succeed even when the PMU is not supported by the guest. This can easily be seen by running the kvm-unit-test vmware_backdoors with the "-cpu host,-pmu" option. Reviewed-by: Mihai Carabas Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 58ead7db71a3..e39741997893 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -281,9 +281,13 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) { bool fast_mode = idx & (1u << 31); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; u64 ctr_val; + if (!pmu->version) + return 1; + if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); -- cgit v1.2.3 From e51bfdb68725dc052d16241ace40ea3140f938aa Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Mon, 25 Mar 2019 21:09:17 +0200 Subject: KVM: nVMX: Expose RDPMC-exiting only when guest supports PMU The issue was discovered when running kvm-unit-tests on KVM running as L1 on top of Hyper-V. When the vmx_instruction_intercept unit-test attempts to run RDPMC to test RDPMC-exiting, it is intercepted by L1 KVM, whose EXIT_REASON_RDPMC handler raises #GP because the vCPU exposed by Hyper-V doesn't support a PMU, instead of the exit being reflected to the unit-test with EXIT_REASON_RDPMC as it expects. The reason the vmx_instruction_intercept unit-test attempts to run RDPMC even though Hyper-V doesn't support a PMU is that L1 exposes RDPMC-exiting support to L2, which it is reasonable to assume is supported only when the CPU supports a PMU to begin with. The above issue can easily be simulated by modifying the vmx_instruction_intercept config in x86/unittests.cfg to run QEMU with "-cpu host,+vmx,-pmu" and running the unit-test. To handle the issue, change KVM to expose RDPMC-exiting only when the guest supports a PMU.
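For reference, guest PMU presence boils down to CPUID leaf 0xA: bits 7:0 of EAX hold the architectural PMU version, and 0 means no PMU, which is what the patch below keys RDPMC-exiting on. A small host-side probe of the same leaf (GCC/Clang on x86; illustrative, not the KVM code):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
        puts("CPUID leaf 0xA not available");
        return 0;
    }
    /* EAX[7:0] = version ID (0 == no architectural PMU),
     * EAX[15:8] = number of general-purpose counters */
    printf("PMU version %u, %u GP counters\n",
           eax & 0xff, (eax >> 8) & 0xff);
    return 0;
}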
Reported-by: Saar Amar Reviewed-by: Mihai Carabas Reviewed-by: Jim Mattson Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 88060a621db2..5866e9e9f1e0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6856,6 +6856,30 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) } } +static bool guest_cpuid_has_pmu(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + union cpuid10_eax eax; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return false; + + eax.full = entry->eax; + return (eax.split.version_id > 0); +} + +static void nested_vmx_procbased_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool pmu_enabled = guest_cpuid_has_pmu(vcpu); + + if (pmu_enabled) + vmx->nested.msrs.procbased_ctls_high |= CPU_BASED_RDPMC_EXITING; + else + vmx->nested.msrs.procbased_ctls_high &= ~CPU_BASED_RDPMC_EXITING; +} + static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6944,6 +6968,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); nested_vmx_entry_exit_ctls_update(vcpu); + nested_vmx_procbased_ctls_update(vcpu); } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && -- cgit v1.2.3 From ed19321fb6571214f410b30322e4ad6e6b7c3915 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 2 Apr 2019 08:03:09 -0700 Subject: KVM: x86: Load SMRAM in a single shot when leaving SMM RSM emulation is currently broken on VMX when the interrupted guest has CR4.VMXE=1. Rather than dance around the issue of HF_SMM_MASK being set when loading SMSTATE into architectural state, ideally RSM emulation itself would be reworked to clear HF_SMM_MASK prior to loading non-SMM architectural state. Ostensibly, the only motivation for having HF_SMM_MASK set throughout the loading of state from the SMRAM save state area is so that the memory accesses from GET_SMSTATE() are tagged with role.smm. Load all of the SMRAM save state area from guest memory at the beginning of RSM emulation, and load state from the buffer instead of reading guest memory one-by-one. This paves the way for clearing HF_SMM_MASK prior to loading state, and also aligns RSM with the enter_smm() behavior, which fills a buffer and writes SMRAM save state in a single go. 
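One detail worth spelling out before the diff: the 512-byte buffer is read from smbase + 0xfe00, while the architectural save-state offsets are quoted relative to smbase + 0x8000 and thus fall in the 0x7e00..0x7fff range, hence the '- 0x7e00' in GET_SMSTATE. A standalone illustration of the offset math (the CR0 value is an arbitrary sample, not real state):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define GET_SMSTATE(type, buf, offset) \
    (*(type *)((buf) + (offset) - 0x7e00))

int main(void)
{
    char buf[512] = { 0 };      /* models the SMRAM state buffer */
    uint64_t cr0 = 0x80050033;  /* arbitrary sample value        */

    /* the 64-bit save state keeps CR0 at offset 0x7f58 */
    memcpy(buf + (0x7f58 - 0x7e00), &cr0, sizeof(cr0));

    printf("CR0 = %#llx\n",
           (unsigned long long)GET_SMSTATE(uint64_t, buf, 0x7f58));
    return 0;
}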
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 3 +- arch/x86/include/asm/kvm_host.h | 5 +- arch/x86/kvm/emulate.c | 149 ++++++++++++++++++------------------- arch/x86/kvm/svm.c | 20 ++--- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 5 +- 6 files changed, 92 insertions(+), 92 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 93c4bf598fb0..ec489c432850 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -226,7 +226,8 @@ struct x86_emulate_ops { unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); - int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase); + int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, + const char *smstate); }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9b7b731a0032..a9d03af34030 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1182,7 +1182,7 @@ struct kvm_x86_ops { int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); + int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); int (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); @@ -1592,4 +1592,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val +#define GET_SMSTATE(type, buf, offset) \ + (*(type *)((buf) + (offset) - 0x7e00)) + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c338984c850d..ae0d289b50fe 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2339,16 +2339,6 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) return edx & bit(X86_FEATURE_LM); } -#define GET_SMSTATE(type, smbase, offset) \ - ({ \ - type __val; \ - int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ - sizeof(__val)); \ - if (r != X86EMUL_CONTINUE) \ - return X86EMUL_UNHANDLEABLE; \ - __val; \ - }) - static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) { desc->g = (flags >> 23) & 1; @@ -2361,27 +2351,29 @@ static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) desc->type = (flags >> 8) & 15; } -static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate, + int n) { struct desc_struct desc; int offset; u16 selector; - selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4); + selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4); if (n < 3) offset = 0x7f84 + n * 12; else offset = 0x7f2c + (n - 3) * 12; - set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset)); ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); return X86EMUL_CONTINUE; } -static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate, + int n) { struct desc_struct desc; int 
offset; @@ -2390,11 +2382,11 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) offset = 0x7e00 + n * 16; - selector = GET_SMSTATE(u16, smbase, offset); - rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); - base3 = GET_SMSTATE(u32, smbase, offset + 12); + selector = GET_SMSTATE(u16, smstate, offset); + rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); + base3 = GET_SMSTATE(u32, smstate, offset + 12); ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); return X86EMUL_CONTINUE; @@ -2445,7 +2437,8 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, + const char *smstate) { struct desc_struct desc; struct desc_ptr dt; @@ -2453,53 +2446,54 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) u32 val, cr0, cr3, cr4; int i; - cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); - cr3 = GET_SMSTATE(u32, smbase, 0x7ff8); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; - ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); + cr0 = GET_SMSTATE(u32, smstate, 0x7ffc); + cr3 = GET_SMSTATE(u32, smstate, 0x7ff8); + ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); for (i = 0; i < 8; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4); + *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); - val = GET_SMSTATE(u32, smbase, 0x7fcc); + val = GET_SMSTATE(u32, smstate, 0x7fcc); ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); - val = GET_SMSTATE(u32, smbase, 0x7fc8); + val = GET_SMSTATE(u32, smstate, 0x7fc8); ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - selector = GET_SMSTATE(u32, smbase, 0x7fc4); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c)); + selector = GET_SMSTATE(u32, smstate, 0x7fc4); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c)); ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); - selector = GET_SMSTATE(u32, smbase, 0x7fc0); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78)); + selector = GET_SMSTATE(u32, smstate, 0x7fc0); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78)); ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); - dt.address = GET_SMSTATE(u32, smbase, 0x7f74); - dt.size = GET_SMSTATE(u32, smbase, 0x7f70); + dt.address = GET_SMSTATE(u32, smstate, 0x7f74); + dt.size = GET_SMSTATE(u32, smstate, 0x7f70); ctxt->ops->set_gdt(ctxt, &dt); - dt.address = GET_SMSTATE(u32, smbase, 0x7f58); - dt.size = GET_SMSTATE(u32, smbase, 0x7f54); + dt.address = GET_SMSTATE(u32, smstate, 0x7f58); + dt.size = 
GET_SMSTATE(u32, smstate, 0x7f54); ctxt->ops->set_idt(ctxt, &dt); for (i = 0; i < 6; i++) { - int r = rsm_load_seg_32(ctxt, smbase, i); + int r = rsm_load_seg_32(ctxt, smstate, i); if (r != X86EMUL_CONTINUE) return r; } - cr4 = GET_SMSTATE(u32, smbase, 0x7f14); + cr4 = GET_SMSTATE(u32, smstate, 0x7f14); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8)); return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); } -static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, + const char *smstate) { struct desc_struct desc; struct desc_ptr dt; @@ -2509,43 +2503,43 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) int i, r; for (i = 0; i < 16; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); + *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); - ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); + ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; - val = GET_SMSTATE(u32, smbase, 0x7f68); + val = GET_SMSTATE(u32, smstate, 0x7f68); ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); - val = GET_SMSTATE(u32, smbase, 0x7f60); + val = GET_SMSTATE(u32, smstate, 0x7f60); ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - cr0 = GET_SMSTATE(u64, smbase, 0x7f58); - cr3 = GET_SMSTATE(u64, smbase, 0x7f50); - cr4 = GET_SMSTATE(u64, smbase, 0x7f48); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); - val = GET_SMSTATE(u64, smbase, 0x7ed0); + cr0 = GET_SMSTATE(u64, smstate, 0x7f58); + cr3 = GET_SMSTATE(u64, smstate, 0x7f50); + cr4 = GET_SMSTATE(u64, smstate, 0x7f48); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); + val = GET_SMSTATE(u64, smstate, 0x7ed0); ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); - selector = GET_SMSTATE(u32, smbase, 0x7e90); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98)); - base3 = GET_SMSTATE(u32, smbase, 0x7e9c); + selector = GET_SMSTATE(u32, smstate, 0x7e90); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98)); + base3 = GET_SMSTATE(u32, smstate, 0x7e9c); ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); - dt.size = GET_SMSTATE(u32, smbase, 0x7e84); - dt.address = GET_SMSTATE(u64, smbase, 0x7e88); + dt.size = GET_SMSTATE(u32, smstate, 0x7e84); + dt.address = GET_SMSTATE(u64, smstate, 0x7e88); ctxt->ops->set_idt(ctxt, &dt); - selector = GET_SMSTATE(u32, smbase, 0x7e70); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78)); - base3 = GET_SMSTATE(u32, smbase, 0x7e7c); + selector = GET_SMSTATE(u32, smstate, 0x7e70); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78)); + base3 = GET_SMSTATE(u32, smstate, 0x7e7c); ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); - dt.size = GET_SMSTATE(u32, smbase, 0x7e64); - 
dt.address = GET_SMSTATE(u64, smbase, 0x7e68); + dt.size = GET_SMSTATE(u32, smstate, 0x7e64); + dt.address = GET_SMSTATE(u64, smstate, 0x7e68); ctxt->ops->set_gdt(ctxt, &dt); r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); @@ -2553,7 +2547,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) return r; for (i = 0; i < 6; i++) { - r = rsm_load_seg_64(ctxt, smbase, i); + r = rsm_load_seg_64(ctxt, smstate, i); if (r != X86EMUL_CONTINUE) return r; } @@ -2564,12 +2558,19 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) static int em_rsm(struct x86_emulate_ctxt *ctxt) { unsigned long cr0, cr4, efer; + char buf[512]; u64 smbase; int ret; if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); + smbase = ctxt->ops->get_smbase(ctxt); + + ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf)); + if (ret != X86EMUL_CONTINUE) + return X86EMUL_UNHANDLEABLE; + /* * Get back to real mode, to prepare a safe state in which to load * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU @@ -2605,20 +2606,18 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) efer = 0; ctxt->ops->set_msr(ctxt, MSR_EFER, efer); - smbase = ctxt->ops->get_smbase(ctxt); - /* * Give pre_leave_smm() a chance to make ISA-specific changes to the * vCPU state (e.g. enter guest mode) before loading state from the SMM * state-save area. */ - if (ctxt->ops->pre_leave_smm(ctxt, smbase)) + if (ctxt->ops->pre_leave_smm(ctxt, buf)) return X86EMUL_UNHANDLEABLE; if (emulator_has_longmode(ctxt)) - ret = rsm_load_state_64(ctxt, smbase + 0x8000); + ret = rsm_load_state_64(ctxt, buf); else - ret = rsm_load_state_32(ctxt, smbase + 0x8000); + ret = rsm_load_state_32(ctxt, buf); if (ret != X86EMUL_CONTINUE) { /* FIXME: should triple fault */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 675cecb3fa9c..6b1cd73e4053 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6232,27 +6232,23 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *nested_vmcb; struct page *page; - struct { - u64 guest; - u64 vmcb; - } svm_state_save; + u64 guest; + u64 vmcb; int ret; - ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save, - sizeof(svm_state_save)); - if (ret) - return ret; + guest = GET_SMSTATE(u64, smstate, 0x7ed8); + vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); - if (svm_state_save.guest) { + if (guest) { vcpu->arch.hflags &= ~HF_SMM_MASK; - nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page); + nested_vmcb = nested_svm_map(svm, vmcb, &page); if (nested_vmcb) - enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page); + enter_svm_guest_mode(svm, vmcb, nested_vmcb, page); else ret = 1; vcpu->arch.hflags |= HF_SMM_MASK; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5866e9e9f1e0..14ea25eadde8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7398,7 +7398,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f05891b8df7c..6ee1f9e5d3fb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c 
@@ -5963,9 +5963,10 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); } -static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase) +static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, + const char *smstate) { - return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase); + return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate); } static const struct x86_emulate_ops emulate_ops = { -- cgit v1.2.3 From c5833c7a43a66bfe2f36439cb2f1281a588668af Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 2 Apr 2019 08:03:10 -0700 Subject: KVM: x86: Open code kvm_set_hflags Prepare for clearing HF_SMM_MASK prior to loading state from the SMRAM save state map, i.e. kvm_smm_changed() needs to be called after state has been loaded and so cannot be done automatically when setting hflags from RSM. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 3 +++ arch/x86/kvm/x86.c | 33 +++++++++++++++------------------ 3 files changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index ec489c432850..feab24cac610 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -228,6 +228,7 @@ struct x86_emulate_ops { void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); + void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ae0d289b50fe..a6b282853253 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2629,6 +2629,9 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); + + ctxt->ops->post_leave_smm(ctxt); + return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6ee1f9e5d3fb..472bbbbe153a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3530,7 +3530,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, memset(&events->reserved, 0, sizeof(events->reserved)); } -static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags); +static void kvm_smm_changed(struct kvm_vcpu *vcpu); static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) @@ -3590,12 +3590,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { - u32 hflags = vcpu->arch.hflags; - if (events->smi.smm) - hflags |= HF_SMM_MASK; - else - hflags &= ~HF_SMM_MASK; - kvm_set_hflags(vcpu, hflags); + if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { + if (events->smi.smm) + vcpu->arch.hflags |= HF_SMM_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_MASK; + kvm_smm_changed(vcpu); + } vcpu->arch.smi_pending = events->smi.pending; @@ -5960,7 +5961,7 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) { - kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); + emul_to_vcpu(ctxt)->arch.hflags = emul_flags; } static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, @@ -5969,6 +5970,11 @@ static int 
emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate); } +static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) +{ + kvm_smm_changed(emul_to_vcpu(ctxt)); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -6009,6 +6015,7 @@ static const struct x86_emulate_ops emulate_ops = { .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, .pre_leave_smm = emulator_pre_leave_smm, + .post_leave_smm = emulator_post_leave_smm, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6250,16 +6257,6 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) -{ - unsigned changed = vcpu->arch.hflags ^ emul_flags; - - vcpu->arch.hflags = emul_flags; - - if (changed & HF_SMM_MASK) - kvm_smm_changed(vcpu); -} - static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { -- cgit v1.2.3 From 9ec19493fb86d6d5fbf9286b94ff21e56ef66376 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 2 Apr 2019 08:03:11 -0700 Subject: KVM: x86: clear SMM flags before loading state while leaving SMM RSM emulation is currently broken on VMX when the interrupted guest has CR4.VMXE=1. Stop dancing around the issue of HF_SMM_MASK being set when loading SMSTATE into architectural state, e.g. by toggling it for problematic flows, and simply clear HF_SMM_MASK prior to loading architectural state (from SMRAM save state area). Reported-by: Jon Doron Cc: Jim Mattson Cc: Liran Alon Cc: Vitaly Kuznetsov Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM") Signed-off-by: Sean Christopherson Tested-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 12 ++++++------ arch/x86/kvm/svm.c | 12 ++++-------- arch/x86/kvm/vmx/vmx.c | 2 -- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a6b282853253..f526acee2eed 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2571,6 +2571,12 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) if (ret != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + ctxt->ops->set_nmi_mask(ctxt, false); + + ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & + ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); + /* * Get back to real mode, to prepare a safe state in which to load * CR0/CR3/CR4/EFER. 
It's all a bit more complicated if the vCPU @@ -2624,12 +2630,6 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; } - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) - ctxt->ops->set_nmi_mask(ctxt, false); - - ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & - ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); - ctxt->ops->post_leave_smm(ctxt); return X86EMUL_CONTINUE; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6b1cd73e4053..406b558abfef 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6239,21 +6239,17 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) struct page *page; u64 guest; u64 vmcb; - int ret; guest = GET_SMSTATE(u64, smstate, 0x7ed8); vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); if (guest) { - vcpu->arch.hflags &= ~HF_SMM_MASK; nested_vmcb = nested_svm_map(svm, vmcb, &page); - if (nested_vmcb) - enter_svm_guest_mode(svm, vmcb, nested_vmcb, page); - else - ret = 1; - vcpu->arch.hflags |= HF_SMM_MASK; + if (!nested_vmcb) + return 1; + enter_svm_guest_mode(svm, vmcb, nested_vmcb, page); } - return ret; + return 0; } static int enable_smi_window(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 14ea25eadde8..b4e7d645275a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7409,9 +7409,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) } if (vmx->nested.smm.guest_mode) { - vcpu->arch.hflags &= ~HF_SMM_MASK; ret = nested_vmx_enter_non_root_mode(vcpu, false); - vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; -- cgit v1.2.3 From 8f4dc2e77cdfaf7e644ef29693fa229db29ee1de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 2 Apr 2019 08:10:47 -0700 Subject: KVM: x86: Don't clear EFER during SMM transitions for 32-bit vCPU Neither AMD nor Intel CPUs have an EFER field in the legacy SMRAM save state area, i.e. don't save/restore EFER across SMM transitions. KVM somewhat models this, e.g. doesn't clear EFER on entry to SMM if the guest doesn't support long mode. But during RSM, KVM unconditionally clears EFER so that it can get back to pure 32-bit mode in order to start loading CRs with their actual non-SMM values. Clear EFER only when it will be written when loading the non-SMM state so as to preserve bits that can theoretically be set on 32-bit vCPUs, e.g. KVM always emulates EFER_SCE. And because CR4.PAE is cleared only to play nice with EFER, wrap that code in the long mode check as well. Note, this may result in a compiler warning about cr4 being consumed uninitialized. Re-read CR4 even though it's technically unnecessary, as doing so allows for more readable code and RSM emulation is not a performance critical path. Fixes: 660a5d517aaab ("KVM: x86: save/load state on SMM switch") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f526acee2eed..f3284827c432 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2582,15 +2582,13 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU * supports long mode. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); if (emulator_has_longmode(ctxt)) { struct desc_struct cs_desc; /* Zero CR4.PCIDE before CR0.PG. 
*/ - if (cr4 & X86_CR4_PCIDE) { + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PCIDE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - cr4 &= ~X86_CR4_PCIDE; - } /* A 32-bit code segment is required to clear EFER.LMA. */ memset(&cs_desc, 0, sizeof(cs_desc)); @@ -2604,13 +2602,16 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ - if (cr4 & X86_CR4_PAE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + if (emulator_has_longmode(ctxt)) { + /* Clear CR4.PAE before clearing EFER.LME. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PCIDE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); - /* And finally go back to 32-bit mode. */ - efer = 0; - ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + /* And finally go back to 32-bit mode. */ + efer = 0; + ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + } /* * Give pre_leave_smm() a chance to make ISA-specific changes to the -- cgit v1.2.3 From b68f3cc7d978943fcf85148165b00594c38db776 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 2 Apr 2019 08:10:48 -0700 Subject: KVM: x86: Always use 32-bit SMRAM save state for 32-bit kernels Invoking the 64-bit variation on a 32-bit kernel will crash the guest, trigger a WARN, and/or lead to a buffer overrun in the host, e.g. rsm_load_state_64() writes r8-r15 unconditionally, but enum kvm_reg and thus x86_emulate_ctxt._regs only define r8-r15 for CONFIG_X86_64. KVM allows userspace to report long mode support via CPUID, even though the guest is all but guaranteed to crash if it actually tries to enable long mode. But, a pure 32-bit guest that is ignorant of long mode will happily plod along. SMM complicates things as 64-bit CPUs use a different SMRAM save state area. KVM handles this correctly for 64-bit kernels, e.g. uses the legacy save state map if userspace has hidden long mode from the guest, but doesn't fare well when userspace reports long mode support on a 32-bit host kernel (32-bit KVM doesn't support 64-bit guests). Since the alternative is to crash the guest, e.g. by not loading state or explicitly requesting shutdown, unconditionally use the legacy SMRAM save state map for 32-bit KVM. If a guest has managed to get far enough to handle SMIs when running under a weird/buggy userspace hypervisor, then don't deliberately crash the guest since there are no downsides (from KVM's perspective) to allowing it to continue running. 
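A minimal userspace sketch of the compile-time gating this patch applies, with __x86_64__ standing in for CONFIG_X86_64 and all names illustrative rather than KVM's: the 64-bit loader does not even exist in a 32-bit build, so the dispatch can only ever pick the legacy map.

#include <stdio.h>

#ifdef __x86_64__
static int load_state_64(const char *buf)
{
	(void)buf;	/* would parse the 64-bit save state map */
	return 64;
}
#endif

static int load_state_32(const char *buf)
{
	(void)buf;	/* would parse the legacy save state map */
	return 32;
}

static int load_smram(const char *buf, int guest_has_long_mode)
{
#ifdef __x86_64__
	if (guest_has_long_mode)
		return load_state_64(buf);
#endif
	/* A 32-bit build uses the legacy map unconditionally. */
	return load_state_32(buf);
}

int main(void)
{
	char buf[512] = { 0 };	/* mirrors the 512-byte save state buffer */

	printf("loaded via %d-bit map\n", load_smram(buf, 1));
	return 0;
}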
Fixes: 660a5d517aaab ("KVM: x86: save/load state on SMM switch") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 10 ++++++++++ arch/x86/kvm/x86.c | 10 ++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f3284827c432..d0d5dd44b4f4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2331,12 +2331,16 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) { +#ifdef CONFIG_X86_64 u32 eax, ebx, ecx, edx; eax = 0x80000001; ecx = 0; ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); return edx & bit(X86_FEATURE_LM); +#else + return false; +#endif } static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) @@ -2372,6 +2376,7 @@ static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate, return X86EMUL_CONTINUE; } +#ifdef CONFIG_X86_64 static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate, int n) { @@ -2391,6 +2396,7 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate, ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); return X86EMUL_CONTINUE; } +#endif static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, u64 cr0, u64 cr3, u64 cr4) @@ -2492,6 +2498,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); } +#ifdef CONFIG_X86_64 static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, const char *smstate) { @@ -2554,6 +2561,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } +#endif static int em_rsm(struct x86_emulate_ctxt *ctxt) { @@ -2621,9 +2629,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->pre_leave_smm(ctxt, buf)) return X86EMUL_UNHANDLEABLE; +#ifdef CONFIG_X86_64 if (emulator_has_longmode(ctxt)) ret = rsm_load_state_64(ctxt, buf); else +#endif ret = rsm_load_state_32(ctxt, buf); if (ret != X86EMUL_CONTINUE) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 472bbbbe153a..f10fef561573 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7441,9 +7441,9 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); } +#ifdef CONFIG_X86_64 static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) { -#ifdef CONFIG_X86_64 struct desc_ptr dt; struct kvm_segment seg; unsigned long val; @@ -7493,10 +7493,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) for (i = 0; i < 6; i++) enter_smm_save_seg_64(vcpu, buf, i); -#else - WARN_ON_ONCE(1); -#endif } +#endif static void enter_smm(struct kvm_vcpu *vcpu) { @@ -7507,9 +7505,11 @@ static void enter_smm(struct kvm_vcpu *vcpu) trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); memset(buf, 0, 512); +#ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) enter_smm_save_state_64(vcpu, buf); else +#endif enter_smm_save_state_32(vcpu, buf); /* @@ -7567,8 +7567,10 @@ static void enter_smm(struct kvm_vcpu *vcpu) kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); +#ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) kvm_x86_ops->set_efer(vcpu, 0); +#endif kvm_update_cpuid(vcpu); kvm_mmu_reset_context(vcpu); -- cgit v1.2.3 From be43c440eb5d0ccfdb0d67d5a4c9d579ff988b75 Mon Sep 17 00:00:00 2001 From: Hariprasad 
Kelam Date: Sat, 6 Apr 2019 15:06:58 +0530 Subject: KVM: x86: fix warning Using plain integer as NULL pointer Change the argument passed from 0 to NULL, which resolves the sparse warning below: arch/x86/kvm/x86.c:3096:61: warning: Using plain integer as NULL pointer Signed-off-by: Hariprasad Kelam Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f10fef561573..a0d1fc80ac5a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3095,7 +3095,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_NESTED_STATE: r = kvm_x86_ops->get_nested_state ? - kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0; + kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; break; default: break; -- cgit v1.2.3 From 1d487e9bf8ba66a7174c56a0029c54b1eca8f99c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Apr 2019 11:16:47 +0200 Subject: KVM: fix spectrev1 gadgets These were found with smatch, and then generalized when applicable. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 +++- include/linux/kvm_host.h | 10 ++++++---- virt/kvm/irqchip.c | 5 +++-- virt/kvm/kvm_main.c | 6 ++++-- 4 files changed, 16 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 991fdf7fc17f..9bf70cf84564 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -138,6 +138,7 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, if (offset <= max_apic_id) { u8 cluster_size = min(max_apic_id - offset + 1, 16U); + offset = array_index_nospec(offset, map->max_apic_id + 1); *cluster = &map->phys_map[offset]; *mask = dest_id & (0xffff >> (16 - cluster_size)); } else { @@ -901,7 +902,8 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, if (irq->dest_id > map->max_apic_id) { *bitmap = 0; } else { - *dst = &map->phys_map[irq->dest_id]; + u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1); + *dst = &map->phys_map[dest_id]; *bitmap = 1; } return true; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9d55c63db09b..640a03642766 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -513,10 +514,10 @@ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { - /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu, in case - * the caller has read kvm->online_vcpus before (as is the case - * for kvm_for_each_vcpu, for example). - */ + int num_vcpus = atomic_read(&kvm->online_vcpus); + i = array_index_nospec(i, num_vcpus); + + /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. 
*/ smp_rmb(); return kvm->vcpus[i]; } @@ -600,6 +601,7 @@ void kvm_put_kvm(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { + as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM); return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 3547b0d8c91e..79e59e4fa3dc 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -144,18 +144,19 @@ static int setup_routing_entry(struct kvm *kvm, { struct kvm_kernel_irq_routing_entry *ei; int r; + u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES); /* * Do not allow GSI to be mapped to the same irqchip more than once. * Allow only one to one mapping between GSI and non-irqchip routing. */ - hlist_for_each_entry(ei, &rt->map[ue->gsi], link) + hlist_for_each_entry(ei, &rt->map[gsi], link) if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return -EINVAL; - e->gsi = ue->gsi; + e->gsi = gsi; e->type = ue->type; r = kvm_set_routing_entry(kvm, e, ue); if (r) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 55fe8e20d8fd..dc8edc97ba85 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2977,12 +2977,14 @@ static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_device_ops *ops = NULL; struct kvm_device *dev; bool test = cd->flags & KVM_CREATE_DEVICE_TEST; + int type; int ret; if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) return -ENODEV; - ops = kvm_device_ops_table[cd->type]; + type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); + ops = kvm_device_ops_table[type]; if (ops == NULL) return -ENODEV; @@ -2997,7 +2999,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm, dev->kvm = kvm; mutex_lock(&kvm->lock); - ret = ops->create(dev, cd->type); + ret = ops->create(dev, type); if (ret < 0) { mutex_unlock(&kvm->lock); kfree(dev); -- cgit v1.2.3 From 7a223e06b1a411cef6c4cd7a9b9a33c8d225b10e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 27 Mar 2019 15:12:20 +0100 Subject: KVM: x86: avoid misreporting level-triggered irqs as edge-triggered in tracing In __apic_accept_irq() interface trig_mode is int and actually on some code paths it is set above u8: kvm_apic_set_irq() extracts it from 'struct kvm_lapic_irq' where trig_mode is u16. This is done on purpose as e.g. kvm_set_msi_irq() sets it to (1 << 15) & e->msi.data kvm_apic_local_deliver sets it to reg & (1 << 15). Fix the immediate issue by making 'tm' into u16. We may also want to adjust __apic_accept_irq() interface and use proper sizes for vector, level, trig_mode but this is not urgent. 
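The truncation is easy to reproduce outside the kernel. A standalone sketch (kvm_set_msi_irq() sets trig_mode to (1 << 15) & e->msi.data, and kvm_apic_local_deliver() to reg & (1 << 15), so bit 15 is the only interesting bit; everything else here is illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int trig_mode = 1 << 15;	/* level-triggered, as set by kvm_set_msi_irq() */
	uint8_t  tm8  = trig_mode;	/* old trace field width: truncates to 0 */
	uint16_t tm16 = trig_mode;	/* new trace field width: keeps bit 15 */

	printf("u8:  %u (misreported as edge-triggered)\n", tm8);
	printf("u16: %u (correctly level-triggered)\n", tm16);
	return 0;
}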
Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 6432d08c7de7..4d47a2631d1f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -438,13 +438,13 @@ TRACE_EVENT(kvm_apic_ipi, ); TRACE_EVENT(kvm_apic_accept_irq, - TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec), + TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec), TP_ARGS(apicid, dm, tm, vec), TP_STRUCT__entry( __field( __u32, apicid ) __field( __u16, dm ) - __field( __u8, tm ) + __field( __u16, tm ) __field( __u8, vec ) ), -- cgit v1.2.3 From 9d609649bb29a882e7e6bf502adb45dcac178fe5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 15 Apr 2019 15:14:32 +0200 Subject: KVM: vmx: print more APICv fields in dump_vmcs The SVI, RVI, virtual-APIC page address and APIC-access page address fields were left out of dump_vmcs. Add them. KERN_CONT technically isn't SMP safe, but it's okay to use it here since the whole of dump_vmcs() is a single huge multi-line piece of output that isn't SMP-safe. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b4e7d645275a..92bd511076f6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5723,8 +5723,16 @@ void dump_vmcs(void) if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) pr_err("TSC Multiplier = 0x%016llx\n", vmcs_read64(TSC_MULTIPLIER)); - if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) - pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + u16 status = vmcs_read16(GUEST_INTR_STATUS); + pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); + } + pr_err(KERN_CONT "TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); + pr_err(KERN_CONT "virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); + } if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) -- cgit v1.2.3 From f16cb57be82b20c6a00a6bb1750cb77c08e56c1a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 10 Apr 2019 11:38:30 +0200 Subject: KVM: x86: clear VM_EXIT_SAVE_IA32_PAT This is not needed, PAT writes always take an MSR vmexit. 
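For the dump_vmcs patch above, the SVI/RVI decode is a plain byte split of the 16-bit guest interrupt status; a standalone sketch with a made-up example value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t status = 0x30a0;	/* made-up GUEST_INTR_STATUS value */
	uint8_t svi = status >> 8;	/* servicing virtual interrupt, high byte */
	uint8_t rvi = status & 0xff;	/* requesting virtual interrupt, low byte */

	printf("SVI|RVI = %02x|%02x\n", svi, rvi);	/* prints SVI|RVI = 30|a0 */
	return 0;
}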
Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 92bd511076f6..a8a1e533d7fb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2288,7 +2288,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_CLEAR_BNDCFGS | -- cgit v1.2.3 From 674ea351cdeb01d2740edce31db7f2d79ce6095d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 10 Apr 2019 11:41:40 +0200 Subject: KVM: x86: optimize check for valid PAT value This check will soon be done on every nested vmentry and vmexit, "parallelize" it using bitwise operations. Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 10 +--------- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.h | 10 ++++++++++ 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index e9ea2d45ae66..9f72cc427158 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -48,11 +48,6 @@ static bool msr_mtrr_valid(unsigned msr) return false; } -static bool valid_pat_type(unsigned t) -{ - return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ -} - static bool valid_mtrr_type(unsigned t) { return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ @@ -67,10 +62,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) return false; if (msr == MSR_IA32_CR_PAT) { - for (i = 0; i < 8; i++) - if (!valid_pat_type((data >> (i * 8)) & 0xff)) - return false; - return true; + return kvm_pat_valid(data); } else if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a8a1e533d7fb..c40fb667002c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1891,7 +1891,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + if (!kvm_pat_valid(data)) return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index aedc5d0d4989..eda954441213 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -347,6 +347,16 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) __this_cpu_write(current_vcpu, NULL); } + +static inline bool kvm_pat_valid(u64 data) +{ + if (data & 0xF8F8F8F8F8F8F8F8ull) + return false; + /* 0, 1, 4, 5, 6, 7 are valid values. */ + return (data | ((data & 0x0202020202020202ull) << 1)) == data; +} + void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); + #endif -- cgit v1.2.3 From f6b0db1fdafad5fdbe10a4a3559da42d286435e6 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Mon, 8 Apr 2019 17:35:11 -0400 Subject: kvm: nVMX: Check "load IA32_PAT" VM-exit control on vmentry According to section "Checks on Host Control Registers and MSRs" in Intel SDM vol 3C, the following check is performed on vmentry: If the "load IA32_PAT" VM-exit control is 1, the value of the field for the IA32_PAT MSR must be one that could be written by WRMSR without fault at CPL 0. 
Specifically, each of the 8 bytes in the field must have one of the values 0 (UC), 1 (WC), 4 (WT), 5 (WP), 6 (WB), or 7 (UC-). Signed-off-by: Krish Sadhukhan Reviewed-by: Karl Heubaum Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6401eb7ef19c..37cd5ebac4fb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2606,6 +2606,10 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) return -EINVAL; + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && + !kvm_pat_valid(vmcs12->host_ia32_pat)) + return -EINVAL; + /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, -- cgit v1.2.3 From de2bc2bfdf419ad9078736d88cae72beae972a59 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Mon, 8 Apr 2019 17:35:12 -0400 Subject: kvm: nVMX: Check "load IA32_PAT" VM-entry control on vmentry According to section "Checking and Loading Guest State" in Intel SDM vol 3C, the following check is performed on vmentry: If the "load IA32_PAT" VM-entry control is 1, the value of the field for the IA32_PAT MSR must be one that could be written by WRMSR without fault at CPL 0. Specifically, each of the 8 bytes in the field must have one of the values 0 (UC), 1 (WC), 4 (WT), 5 (WP), 6 (WB), or 7 (UC-). Signed-off-by: Krish Sadhukhan Reviewed-by: Karl Heubaum Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 37cd5ebac4fb..8dc6a43cfdf3 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2696,6 +2696,10 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && + !kvm_pat_valid(vmcs12->guest_ia32_pat)) + return 1; + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; -- cgit v1.2.3 From 9c3e922ba316a5d3d8cbe41e0db97888fca5c359 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Apr 2019 12:18:05 -0700 Subject: KVM: nVMX: Move guest non-reg state checks to VM-Exit path Per Intel's SDM, volume 3, section Checking and Loading Guest State: Because the checking and the loading occur concurrently, a failure may be discovered only after some state has been loaded. For this reason, the logical processor responds to such failures by loading state from the host-state area, as it would for a VM exit. In other words, a failed non-register state consistency check results in a VM-Exit, not VM-Fail. Moving the non-reg state checks also paves the way for renaming nested_vmx_check_vmentry_postreqs() to align with the SDM, i.e. nested_vmx_check_vmentry_guest_state(). 
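The branch-free PAT check from "KVM: x86: optimize check for valid PAT value" above, which both of these vmentry checks rely on, can be sanity-checked in userspace against the byte-by-byte loop it replaced. A sketch, exhaustive over a single byte, which suffices because both tests treat each of the 8 bytes independently:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool pat_valid_bitwise(uint64_t data)
{
	if (data & 0xF8F8F8F8F8F8F8F8ull)	/* any byte >= 8 is invalid */
		return false;
	/* 0, 1, 4, 5, 6, 7 are valid; 2 and 3 (bit 1 set, bit 2 clear) are not. */
	return (data | ((data & 0x0202020202020202ull) << 1)) == data;
}

static bool pat_valid_loop(uint64_t data)
{
	for (int i = 0; i < 8; i++) {
		uint8_t t = (data >> (i * 8)) & 0xff;
		if (!(t < 8 && ((1 << t) & 0xf3)))	/* 0, 1, 4, 5, 6, 7 */
			return false;
	}
	return true;
}

int main(void)
{
	for (uint64_t b = 0; b < 256; b++)
		assert(pat_valid_bitwise(b) == pat_valid_loop(b));
	printf("bitwise and loop PAT checks agree\n");
	return 0;
}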
Fixes: 26539bd0e446a ("KVM: nVMX: check vmcs12 for valid activity state") Signed-off-by: Sean Christopherson Reviewed-by: Krish Sadhukhan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8dc6a43cfdf3..a02d2e3ded33 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2628,18 +2628,6 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, return 0; } -/* - * Checks related to Guest Non-register State - */ -static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) -{ - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) - return -EINVAL; - - return 0; -} - static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -2651,9 +2639,6 @@ static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, if (nested_check_host_control_regs(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - if (nested_check_guest_non_reg_state(vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - return 0; } @@ -2684,6 +2669,18 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, return r; } +/* + * Checks related to Guest Non-register State + */ +static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) +{ + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 *exit_qual) @@ -2729,6 +2726,9 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) return 1; + if (nested_check_guest_non_reg_state(vmcs12)) + return 1; + return 0; } -- cgit v1.2.3 From 5478ba349f3f71f8d306cddaed33fa0527cc7b16 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Apr 2019 12:18:06 -0700 Subject: KVM: nVMX: Rename and split top-level consistency checks to match SDM Rename the top-level consistency check functions to (loosely) align with the SDM. Historically, KVM has used the terms "prereq" and "postreq" to differentiate between consistency checks that lead to VM-Fail and those that lead to VM-Exit. The terms are vague and potentially misleading, e.g. "postreq" might be interpreted as occurring after VM-Entry. Note, while the SDM lumps controls and host state into a single section, "Checks on VMX Controls and Host-State Area", split them into separate top-level functions as the two categories of checks result in different VM instruction errors. This split will allow for additional cleanup. Note #2, "vmentry" is intentionally dropped from the new function names to avoid confusion with nested_check_vm_entry_controls(), and to keep the length of the function names somewhat manageable. 
Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Reviewed-by: Krish Sadhukhan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a02d2e3ded33..97cc17effce2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2589,6 +2589,17 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_check_vm_execution_controls(vcpu, vmcs12) || + nested_check_vm_exit_controls(vcpu, vmcs12) || + nested_check_vm_entry_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + return 0; +} + /* * Checks related to Host Control Registers and MSRs */ @@ -2628,14 +2639,9 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, return 0; } -static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { - if (nested_check_vm_execution_controls(vcpu, vmcs12) || - nested_check_vm_exit_controls(vcpu, vmcs12) || - nested_check_vm_entry_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_check_host_control_regs(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; @@ -2681,9 +2687,9 @@ static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) return 0; } -static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, - u32 *exit_qual) +static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + u32 *exit_qual) { bool ia32e; @@ -3008,7 +3014,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) return -1; } - if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) goto vmentry_fail_vmexit; } @@ -3153,7 +3159,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12); + ret = nested_vmx_check_controls(vcpu, vmcs12); + if (ret) + return nested_vmx_failValid(vcpu, ret); + + ret = nested_vmx_check_host_state(vcpu, vmcs12); if (ret) return nested_vmx_failValid(vcpu, ret); @@ -5488,8 +5498,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; } - if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) || - nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_controls(vcpu, vmcs12) || + nested_vmx_check_host_state(vcpu, vmcs12) || + nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) return -EINVAL; vmx->nested.dirty_vmcs12 = true; -- cgit v1.2.3 From 98d9e858fa966bd7132cc21d65e4c89a97f4fe2d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 12 Apr 2019 10:19:57 +0200 Subject: KVM: nVMX: Return -EINVAL when signaling failure in pre-VM-Entry helpers Convert all top-level nested VM-Enter consistency check functions to return 0/-EINVAL instead of failure codes, since now they can only ever return one failure code. This also does not give the false impression that failure information is always consumed and/or relevant, e.g. vmx_set_nested_state() only cares whether or not the checks were successful. 
nested_check_host_control_regs() can also now be inlined into its caller, nested_vmx_check_host_state, since the two have effectively become the same function. Based on a patch by Sean Christopherson. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 97cc17effce2..56c22e5c96c3 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2595,16 +2595,13 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, if (nested_check_vm_execution_controls(vcpu, vmcs12) || nested_check_vm_exit_controls(vcpu, vmcs12) || nested_check_vm_entry_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return -EINVAL; return 0; } -/* - * Checks related to Host Control Registers and MSRs - */ -static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { bool ia32e; @@ -2639,15 +2636,6 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, return 0; } -static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (nested_check_host_control_regs(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - - return 0; -} - static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -3159,13 +3147,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - ret = nested_vmx_check_controls(vcpu, vmcs12); - if (ret) - return nested_vmx_failValid(vcpu, ret); + if (nested_vmx_check_controls(vcpu, vmcs12)) + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - ret = nested_vmx_check_host_state(vcpu, vmcs12); - if (ret) - return nested_vmx_failValid(vcpu, ret); + if (nested_vmx_check_host_state(vcpu, vmcs12)) + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); /* * We're finally done with prerequisite checking, and can start with -- cgit v1.2.3 From c80add0f487e28c48e855245189dc28bd3d5c250 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Apr 2019 12:18:09 -0700 Subject: KVM: nVMX: Return -EINVAL when signaling failure in VM-Entry helpers Most, but not all, helpers that are related to emulating consistency checks for nested VM-Entry return -EINVAL when a check fails. Convert the holdouts to have consistency throughout and to make it clear that the functions are signaling pass/fail as opposed to "resume guest" vs. "exit to userspace". Opportunistically fix bad indentation in nested_vmx_check_guest_state(). 
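A sketch of the convention the series converges on, with illustrative names and the SDM's VM-instruction error numbers standing in for KVM's definitions: helpers report pass/fail as 0/-EINVAL, and the caller alone decides which VM instruction error to signal.

#include <errno.h>
#include <stdio.h>

#define VMXERR_ENTRY_INVALID_CONTROL_FIELD	7	/* per the SDM's error table */
#define VMXERR_ENTRY_INVALID_HOST_STATE_FIELD	8

static int check_controls(int controls_ok)
{
	return controls_ok ? 0 : -EINVAL;	/* pass/fail only, no error code */
}

static int check_host_state(int host_ok)
{
	return host_ok ? 0 : -EINVAL;
}

static int vmentry(int controls_ok, int host_ok)
{
	/* The error-to-signal choice lives at the call site. */
	if (check_controls(controls_ok))
		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
	if (check_host_state(host_ok))
		return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
	return 0;
}

int main(void)
{
	printf("%d %d %d\n", vmentry(1, 1), vmentry(0, 1), vmentry(1, 0));	/* 0 7 8 */
	return 0;
}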
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 56c22e5c96c3..b46136b099b8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -930,7 +930,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { if (!nested_cr3_valid(vcpu, cr3)) { *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; + return -EINVAL; } /* @@ -941,7 +941,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne !nested_ept) { if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { *entry_failure_code = ENTRY_FAIL_PDPTE; - return 1; + return -EINVAL; } } } @@ -2373,13 +2373,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, */ if (vmx->emulation_required) { *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; + return -EINVAL; } /* Shadow page tables on either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) - return 1; + return -EINVAL; if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; @@ -2685,15 +2685,15 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) - return 1; + return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && !kvm_pat_valid(vmcs12->guest_ia32_pat)) - return 1; + return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; - return 1; + return -EINVAL; } /* @@ -2712,16 +2712,16 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || ((vmcs12->guest_cr0 & X86_CR0_PG) && ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) - return 1; + return -EINVAL; } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || - (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) - return 1; + (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || + (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + return -EINVAL; if (nested_check_guest_non_reg_state(vmcs12)) - return 1; + return -EINVAL; return 0; } -- cgit v1.2.3 From 11988499e62b310f3bf6f6d0a807a06d3f9ccc96 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 2 Apr 2019 08:19:15 -0700 Subject: KVM: x86: Skip EFER vs. guest CPUID checks for host-initiated writes KVM allows userspace to violate consistency checks related to the guest's CPUID model to some degree. Generally speaking, userspace has carte blanche when it comes to guest state so long as jamming invalid state won't negatively affect the host. Currently this seems to be a non-issue as most of the interesting EFER checks are missing, e.g. NX and LME, but those will be added shortly. Proactively exempt userspace from the CPUID checks so as not to break userspace. Note, the efer_reserved_bits check still applies to userspace writes as that mask reflects the host's capabilities, e.g. KVM shouldn't allow a guest to run with NX=1 if it has been disabled in the host. 
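A sketch of the policy described above, using illustrative names (set_msr() here is not KVM's set_efer(), and the NX check anticipates the following patch): the reserved-bit mask reflects host capabilities and is always enforced, while CPUID-consistency checks constrain only guest-initiated writes.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EFER_NX	(1ull << 11)	/* EFER.NXE */

static bool guest_cpuid_has_nx;	/* stand-in for guest_cpuid_has(vcpu, X86_FEATURE_NX) */

static int set_msr(uint64_t val, uint64_t reserved_bits, bool host_initiated)
{
	if (val & reserved_bits)	/* host capability: enforced for everyone */
		return 1;
	if (!host_initiated && (val & EFER_NX) && !guest_cpuid_has_nx)
		return 1;		/* guest CPUID consistency: guest writes only */
	return 0;
}

int main(void)
{
	/* Userspace may stuff EFER.NX ahead of a matching CPUID update ... */
	printf("host write:  %d\n", set_msr(EFER_NX, 0, true));	/* 0, accepted */
	/* ... but the guest itself is rejected (a #GP in the real flow). */
	printf("guest write: %d\n", set_msr(EFER_NX, 0, false));	/* 1, rejected */
	return 0;
}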
Fixes: d80174745ba39 ("KVM: SVM: Only allow setting of EFER_SVME when CPUID SVM is set") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a0d1fc80ac5a..5d7dcd06d08a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1258,31 +1258,42 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return 0; } -bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) +static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & efer_reserved_bits) - return false; - if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) - return false; + return false; if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - return false; + return false; return true; + +} +bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (efer & efer_reserved_bits) + return false; + + return __kvm_valid_efer(vcpu, efer); } EXPORT_SYMBOL_GPL(kvm_valid_efer); -static int set_efer(struct kvm_vcpu *vcpu, u64 efer) +static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; + u64 efer = msr_info->data; - if (!kvm_valid_efer(vcpu, efer)) - return 1; + if (efer & efer_reserved_bits) + return false; - if (is_paging(vcpu) - && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) - return 1; + if (!msr_info->host_initiated) { + if (!__kvm_valid_efer(vcpu, efer)) + return 1; + + if (is_paging(vcpu) && + (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) + return 1; + } efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; @@ -2452,7 +2463,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.arch_capabilities = data; break; case MSR_EFER: - return set_efer(vcpu, data); + return set_efer(vcpu, msr_info); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ -- cgit v1.2.3 From 0a62956312e9dcd0ce5c59be4f0a8d8292a62402 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 2 Apr 2019 08:19:16 -0700 Subject: KVM: x86: Inject #GP if guest attempts to set unsupported EFER bits EFER.LME and EFER.NX are considered reserved if their respective feature bits are not advertised to the guest. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5d7dcd06d08a..38440316a806 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1266,6 +1266,13 @@ static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) return false; + if (efer & (EFER_LME | EFER_LMA) && + !guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return false; + + if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) + return false; + return true; } -- cgit v1.2.3 From c110ae578ca0a10064dfbda3d786d6a733b9fe69 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 28 Mar 2019 17:24:03 +0100 Subject: kvm: move KVM_CAP_NR_MEMSLOTS to common code All architectures except MIPS were defining it in the same way, and memory slots are handled entirely by common code so there is no point in keeping the definition per-architecture. 
Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 5 ++--- arch/powerpc/kvm/powerpc.c | 3 --- arch/s390/kvm/kvm-s390.c | 3 --- arch/x86/kvm/x86.c | 3 --- virt/kvm/arm/arm.c | 3 --- virt/kvm/kvm_main.c | 2 ++ 6 files changed, 4 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 67068c47c591..b62ad0d94234 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1117,9 +1117,8 @@ struct kvm_userspace_memory_region { This ioctl allows the user to create, modify or delete a guest physical memory slot. Bits 0-15 of "slot" specify the slot id and this value should be less than the maximum number of user memory slots supported per -VM. The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS, -if this capability is supported by the architecture. Slots may not -overlap in guest physical address space. +VM. The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS. +Slots may not overlap in guest physical address space. If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot" specifies the address space which is being modified. They must be diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 8885377ec3e0..92910b7c5bcc 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -644,9 +644,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else r = num_online_cpus(); break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4638303ba6a8..28f35d2b06cb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -513,9 +513,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else if (sclp.has_esca && sclp.has_64bscao) r = KVM_S390_ESCA_CPU_SLOTS; break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; case KVM_CAP_S390_COW: r = MACHINE_HAS_ESOP; break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 38440316a806..6c27d224f744 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3093,9 +3093,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 99c37384ba7b..be4ec5f3ba5f 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -224,9 +224,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; - break; case KVM_CAP_MSI_DEVID: if (!kvm) r = -EINVAL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dc8edc97ba85..684b67252cd5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3063,6 +3063,8 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif case KVM_CAP_MAX_VCPU_ID: return KVM_MAX_VCPU_ID; + case KVM_CAP_NR_MEMSLOTS: + return KVM_USER_MEM_SLOTS; default: break; } -- cgit v1.2.3 From 7dbcf2b0b770eeb803a416ee8dcbef78e6389d40 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:38 +0200 Subject: x86/irq/64: Limit IST stack overflow check to #DB stack Commit 37fe6a42b343 ("x86: Check stack overflow in detail") added a broad check for the full exception stack area, i.e. 
it considers the full exception stack area as valid. That's wrong in two aspects: 1) It does not check the individual areas one by one 2) #DF, NMI and #MCE are not enabling interrupts which means that a regular device interrupt cannot happen in their context. In fact if a device interrupt hits one of those IST stacks that's a bug because some code path enabled interrupts while handling the exception. Limit the check to the #DB stack and consider all other IST stacks as 'overflow' or invalid. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Mitsuo Hayasaka Cc: Nicolai Stange Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160143.682135110@linutronix.de --- arch/x86/kernel/irq_64.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 0469cd078db1..b50ac9c7397b 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,9 +26,18 @@ int sysctl_panic_on_stackoverflow; /* * Probabilistic stack overflow check: * - * Only check the stack in process context, because everything else - * runs on the big interrupt stacks. Checking reliably is too expensive, - * so we just check from interrupts. + * Regular device interrupts can enter on the following stacks: + * + * - User stack + * + * - Kernel task stack + * + * - Interrupt stack if a device driver reenables interrupts + * which should only happen in really old drivers. + * + * - Debug IST stack + * + * All other contexts are invalid. */ static inline void stack_overflow_check(struct pt_regs *regs) { @@ -53,8 +62,8 @@ static inline void stack_overflow_check(struct pt_regs *regs) return; oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; - estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; + estack_bottom = (u64)oist->ist[DEBUG_STACK]; + estack_top = estack_bottom - DEBUG_STKSZ + STACK_TOP_MARGIN; if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; -- cgit v1.2.3 From fa33215422fd415a07ec2a00e9f1acdaf0fa8e94 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2019 17:59:39 +0200 Subject: x86/dumpstack: Fix off-by-one errors in stack identification The get_stack_info() function is off-by-one when checking whether an address is on a IRQ stack or a IST stack. This prevents an overflowed IRQ or IST stack from being dumped properly. [ tglx: Do the same for 32-bit ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160143.785651055@linutronix.de --- arch/x86/kernel/dumpstack_32.c | 4 ++-- arch/x86/kernel/dumpstack_64.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index cd53f3030e40..d305440ebe9c 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -41,7 +41,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. 
*/ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -66,7 +66,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5cdb9e84da57..90f0fa88cbb3 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -65,7 +65,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) begin = end - (exception_stack_sizes[k] / sizeof(long)); regs = (struct pt_regs *)end - 1; - if (stack <= begin || stack >= end) + if (stack < begin || stack >= end) continue; info->type = STACK_TYPE_EXCEPTION + k; @@ -88,7 +88,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; -- cgit v1.2.3 From 4f44b8f0b33b7111216f0fad353315f796b81617 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2019 17:59:40 +0200 Subject: x86/irq/64: Remove a hardcoded irq_stack_union access stack_overflow_check() is using both irq_stack_ptr and irq_stack_union to find the IRQ stack. That's going to break when vmapped irq stacks are introduced. Change it to just use irq_stack_ptr. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160143.872549191@linutronix.de --- arch/x86/kernel/irq_64.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index b50ac9c7397b..f6dcc8fea5c0 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -55,9 +55,8 @@ static inline void stack_overflow_check(struct pt_regs *regs) regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) + - STACK_TOP_MARGIN; irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); + irq_stack_top = irq_stack_bottom - IRQ_STACK_SIZE + STACK_TOP_MARGIN; if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) return; -- cgit v1.2.3 From df835e7083bee33e98635aca26b39b63ebc6cca7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:41 +0200 Subject: x86/irq/64: Sanitize the top/bottom confusion On x86, stacks go top to bottom, but the stack overflow check uses it the other way round, which is just confusing. Clean it up and sanitize the warning string a bit. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160143.961241397@linutronix.de --- arch/x86/kernel/irq_64.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f6dcc8fea5c0..cf200466d5c8 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -42,7 +42,7 @@ int sysctl_panic_on_stackoverflow; static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW -#define STACK_TOP_MARGIN 128 +#define STACK_MARGIN 128 struct orig_ist *oist; u64 irq_stack_top, irq_stack_bottom; u64 estack_top, estack_bottom; @@ -51,25 +51,25 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode(regs)) return; - if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && + if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); - irq_stack_top = irq_stack_bottom - IRQ_STACK_SIZE + STACK_TOP_MARGIN; - if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) + irq_stack_top = (u64)__this_cpu_read(irq_stack_ptr); + irq_stack_bottom = irq_stack_top - IRQ_STACK_SIZE + STACK_MARGIN; + if (regs->sp >= irq_stack_bottom && regs->sp <= irq_stack_top) return; oist = this_cpu_ptr(&orig_ist); - estack_bottom = (u64)oist->ist[DEBUG_STACK]; - estack_top = estack_bottom - DEBUG_STKSZ + STACK_TOP_MARGIN; - if (regs->sp >= estack_top && regs->sp <= estack_bottom) + estack_top = (u64)oist->ist[DEBUG_STACK]; + estack_bottom = estack_top - DEBUG_STKSZ + STACK_MARGIN; + if (regs->sp >= estack_bottom && regs->sp <= estack_top) return; - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx, irq stack:%Lx-%Lx, exception stack: %Lx-%Lx, ip:%pF)\n", current->comm, curbase, regs->sp, - irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom, (void *)regs->ip); + irq_stack_bottom, irq_stack_top, + estack_bottom, estack_top, (void *)regs->ip); if (sysctl_panic_on_stackoverflow) panic("low stack detected by irq handler - check messages\n"); -- cgit v1.2.3 From 99d334511b337884cadbdfae28da912a4edb1001 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:42 +0200 Subject: x86/idt: Remove unused macro SISTG Commit d8ba61ba58c8 ("x86/entry/64: Don't use IST entry for #BP stack") removed the last user but left the macro around. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Cc: "H. 
Peter Anvin" Cc: Dou Liyang Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.050689789@linutronix.de --- arch/x86/kernel/idt.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 01adea278a71..2877606e97de 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -45,10 +45,6 @@ struct idt_data { #define ISTG(_vector, _addr, _ist) \ G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) -/* System interrupt gate with interrupt stack */ -#define SISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) - /* Task gate */ #define TSKG(_vector, _gdt) \ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) -- cgit v1.2.3 From 6f36bd8d2e8c221eaaf4ce5b0ebbb11c00b0ac98 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:43 +0200 Subject: x86/64: Remove stale CURRENT_MASK Nothing uses that and before people get the wrong ideas, get rid of it. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: "Kirill A. Shutemov" Cc: Andy Lutomirski Cc: Baoquan He Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Qian Cai Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.139284839@linutronix.de --- arch/x86/include/asm/page_64_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 8f657286d599..bcd8c0518604 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,7 +14,6 @@ #define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define CURRENT_MASK (~(THREAD_SIZE - 1)) #define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -- cgit v1.2.3 From 30842211506e376b76394a9cb4e6d0c9d258b8d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:44 +0200 Subject: x86/exceptions: Remove unused stack defines on 32bit Nothing requires those for 32bit builds. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Michal Hocko Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.227822695@linutronix.de --- arch/x86/include/asm/page_32_types.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 0d5c739eebd7..57b5dc15ca7e 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -22,11 +22,7 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define DOUBLEFAULT_STACK 1 -#define NMI_STACK 0 -#define DEBUG_STACK 0 -#define MCE_STACK 0 -#define N_EXCEPTION_STACKS 1 +#define N_EXCEPTION_STACKS 1 #ifdef CONFIG_X86_PAE /* -- cgit v1.2.3 From 8f34c5b5afce91d171bb0802631197484cb69b8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:45 +0200 Subject: x86/exceptions: Make IST index zero based The defines for the exception stack (IST) array in the TSS are using the SDM convention IST1 - IST7. That causes all sorts of code to subtract 1 for array indices related to IST. That's confusing at best and does not provide any value. 
Make the indices zero based and fixup the usage sites. The only code which needs to adjust the 0 based index is the interrupt descriptor setup which needs to add 1 now. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: Andy Lutomirski Cc: Baoquan He Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: Dou Liyang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: "Kirill A. Shutemov" Cc: Konrad Rzeszutek Wilk Cc: linux-doc@vger.kernel.org Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Qian Cai Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.331772825@linutronix.de --- Documentation/x86/kernel-stacks | 8 ++++---- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/page_64_types.h | 13 ++++++++----- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/dumpstack_64.c | 14 +++++++------- arch/x86/kernel/idt.c | 15 +++++++++------ arch/x86/kernel/irq_64.c | 2 +- arch/x86/mm/fault.c | 2 +- 8 files changed, 34 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/x86/kernel-stacks b/Documentation/x86/kernel-stacks index 9a0aa4d3a866..1b04596caea9 100644 --- a/Documentation/x86/kernel-stacks +++ b/Documentation/x86/kernel-stacks @@ -59,7 +59,7 @@ If that assumption is ever broken then the stacks will become corrupt. The currently assigned IST stacks are :- -* DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_DF. EXCEPTION_STKSZ (PAGE_SIZE). Used for interrupt 8 - Double Fault Exception (#DF). @@ -68,7 +68,7 @@ The currently assigned IST stacks are :- Using a separate stack allows the kernel to recover from it well enough in many cases to still output an oops. -* NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_NMI. EXCEPTION_STKSZ (PAGE_SIZE). Used for non-maskable interrupts (NMI). @@ -76,7 +76,7 @@ The currently assigned IST stacks are :- middle of switching stacks. Using IST for NMI events avoids making assumptions about the previous state of the kernel stack. -* DEBUG_STACK. DEBUG_STKSZ +* ESTACK_DB. DEBUG_STKSZ Used for hardware debug interrupts (interrupt 1) and for software debug interrupts (INT3). @@ -86,7 +86,7 @@ The currently assigned IST stacks are :- avoids making assumptions about the previous state of the kernel stack. -* MCE_STACK. EXCEPTION_STKSZ (PAGE_SIZE). +* ESTACK_MCE. EXCEPTION_STKSZ (PAGE_SIZE). Used for interrupt 18 - Machine Check Exception (#MC). diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1f0efdb7b629..fd0a50452cb3 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -841,7 +841,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. 
*/ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) /** * idtentry - Generate an IDT entry stub @@ -1129,7 +1129,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=ESTACK_DB idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index bcd8c0518604..6ab2c54c1bf9 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -24,11 +24,14 @@ #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define DOUBLEFAULT_STACK 1 -#define NMI_STACK 2 -#define DEBUG_STACK 3 -#define MCE_STACK 4 -#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ +/* + * The index for the tss.ist[] array. The hardware limit is 7 entries. + */ +#define ESTACK_DF 0 +#define ESTACK_NMI 1 +#define ESTACK_DB 2 +#define ESTACK_MCE 3 +#define N_EXCEPTION_STACKS 4 /* * Set __PAGE_OFFSET to the most negative possible address + diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..0e4cb718fc4a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -516,7 +516,7 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); */ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ + [ESTACK_DB] = DEBUG_STKSZ }; #endif @@ -1760,7 +1760,7 @@ void cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; - if (v == DEBUG_STACK-1) + if (v == ESTACK_DB) per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 90f0fa88cbb3..455b47ef9250 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -18,16 +18,16 @@ #include -static char *exception_stack_names[N_EXCEPTION_STACKS] = { - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ NMI_STACK-1 ] = "NMI", - [ DEBUG_STACK-1 ] = "#DB", - [ MCE_STACK-1 ] = "#MC", +static const char *exception_stack_names[N_EXCEPTION_STACKS] = { + [ ESTACK_DF ] = "#DF", + [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB ] = "#DB", + [ ESTACK_MCE ] = "#MC", }; -static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { +static const unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ + [ESTACK_DB] = DEBUG_STKSZ }; const char *stack_type_name(enum stack_type type) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 2877606e97de..2188f734ec61 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -41,9 +41,12 @@ struct idt_data { #define SYSG(_vector, _addr) \ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) -/* Interrupt gate with interrupt stack */ +/* + * Interrupt gate with interrupt stack. The _ist index is the index in + * the tss.ist[] array, but for the descriptor it needs to start at 1. 
+ */ #define ISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) + G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) /* Task gate */ #define TSKG(_vector, _gdt) \ @@ -180,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, DEBUG_STACK), - ISTG(X86_TRAP_NMI, nmi, NMI_STACK), - ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), + ISTG(X86_TRAP_DB, debug, ESTACK_DB), + ISTG(X86_TRAP_NMI, nmi, ESTACK_NMI), + ISTG(X86_TRAP_DF, double_fault, ESTACK_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), + ISTG(X86_TRAP_MC, &machine_check, ESTACK_MCE), #endif }; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index cf200466d5c8..182e8b245e06 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -61,7 +61,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) return; oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[DEBUG_STACK]; + estack_top = (u64)oist->ist[ESTACK_DB]; estack_bottom = estack_top - DEBUG_STKSZ + STACK_MARGIN; if (regs->sp >= estack_bottom && regs->sp <= estack_top) return; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 667f1da36208..0524e1d74f24 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -793,7 +793,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); + unsigned long stack = this_cpu_read(orig_ist.ist[ESTACK_DF]) - sizeof(void *); /* * We're likely to be running with very little stack space * left. It's plausible that we'd hit this condition but -- cgit v1.2.3 From 881a463cf21dbf83aab2cf6c9a359f34f88c2491 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:46 +0200 Subject: x86/cpu_entry_area: Cleanup setup functions No point in retrieving the entry area pointer over and over. Do it once and use unsigned int for 'cpu' everywhere. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.419653165@linutronix.de --- arch/x86/mm/cpu_entry_area.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 19c6abf9ea31..c2a54f75d335 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -52,10 +52,10 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } -static void __init percpu_setup_debug_store(int cpu) +static void __init percpu_setup_debug_store(unsigned int cpu) { #ifdef CONFIG_CPU_SUP_INTEL - int npages; + unsigned int npages; void *cea; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) @@ -79,8 +79,9 @@ static void __init percpu_setup_debug_store(int cpu) } /* Setup the fixmap mappings only once per-processor */ -static void __init setup_cpu_entry_area(int cpu) +static void __init setup_cpu_entry_area(unsigned int cpu) { + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); #ifdef CONFIG_X86_64 /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ pgprot_t gdt_prot = PAGE_KERNEL_RO; @@ -101,10 +102,9 @@ static void __init setup_cpu_entry_area(int cpu) pgprot_t tss_prot = PAGE_KERNEL; #endif - cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), - gdt_prot); + cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, + cea_map_percpu_pages(&cea->entry_stack_page, per_cpu_ptr(&entry_stack_storage, cpu), 1, PAGE_KERNEL); @@ -128,19 +128,18 @@ static void __init setup_cpu_entry_area(int cpu) BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, - &per_cpu(cpu_tss_rw, cpu), + cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); #ifdef CONFIG_X86_32 - per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); + per_cpu(cpu_entry_area, cpu) = cea; #endif #ifdef CONFIG_X86_64 BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); BUILD_BUG_ON(sizeof(exception_stacks) != sizeof(((struct cpu_entry_area *)0)->exception_stacks)); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, + cea_map_percpu_pages(&cea->exception_stacks, &per_cpu(exception_stacks, cpu), sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); #endif -- cgit v1.2.3 From 019b17b3ffe48100e52f609ca1c6ed6e5a40cba1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:47 +0200 Subject: x86/exceptions: Add structs for exception stacks At the moment everything assumes a full linear mapping of the various exception stacks. Adding guard pages to the cpu entry area mapping of the exception stacks will break that assumption. As a preparatory step convert both the real storage and the effective mapping in the cpu entry area from character arrays to structures. To ensure that both arrays have the same ordering and the same size of the individual stacks fill the members with a macro. The guard size is the only difference between the two resulting structures. For now both have guard size 0 until the preparation of all usage sites is done. 
Provide a couple of helper macros which are used in the following conversions. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.506807893@linutronix.de --- arch/x86/include/asm/cpu_entry_area.h | 52 +++++++++++++++++++++++++++++++---- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/mm/cpu_entry_area.c | 8 ++---- 3 files changed, 51 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 29c706415443..af8c312673de 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -7,6 +7,51 @@ #include #include +#ifdef CONFIG_X86_64 + +/* Macro to enforce the same ordering and stack sizes */ +#define ESTACKS_MEMBERS(guardsize) \ + char DF_stack_guard[guardsize]; \ + char DF_stack[EXCEPTION_STKSZ]; \ + char NMI_stack_guard[guardsize]; \ + char NMI_stack[EXCEPTION_STKSZ]; \ + char DB_stack_guard[guardsize]; \ + char DB_stack[DEBUG_STKSZ]; \ + char MCE_stack_guard[guardsize]; \ + char MCE_stack[EXCEPTION_STKSZ]; \ + char IST_top_guard[guardsize]; \ + +/* The exception stacks' physical storage. No guard pages required */ +struct exception_stacks { + ESTACKS_MEMBERS(0) +}; + +/* + * The effective cpu entry area mapping with guard pages. Guard size is + * zero until the code which makes assumptions about linear mappings is + * cleaned up. + */ +struct cea_exception_stacks { + ESTACKS_MEMBERS(0) +}; + +#define CEA_ESTACK_SIZE(st) \ + sizeof(((struct cea_exception_stacks *)0)->st## _stack) + +#define CEA_ESTACK_BOT(ceastp, st) \ + ((unsigned long)&(ceastp)->st## _stack) + +#define CEA_ESTACK_TOP(ceastp, st) \ + (CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st)) + +#define CEA_ESTACK_OFFS(st) \ + offsetof(struct cea_exception_stacks, st## _stack) + +#define CEA_ESTACK_PAGES \ + (sizeof(struct cea_exception_stacks) / PAGE_SIZE) + +#endif + /* * cpu_entry_area is a percpu region that contains things needed by the CPU * and early entry/exit code. Real types aren't used for all fields here @@ -32,12 +77,9 @@ struct cpu_entry_area { #ifdef CONFIG_X86_64 /* - * Exception stacks used for IST entries. - * - * In the future, this should have a separate slot for each stack - * with guard pages between them. + * Exception stacks used for IST entries with guard pages. 
*/ - char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; + struct cea_exception_stacks estacks; #endif #ifdef CONFIG_CPU_SUP_INTEL /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0e4cb718fc4a..24b801ea7522 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1754,7 +1754,7 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!oist->ist[0]) { - char *estacks = get_cpu_entry_area(cpu)->exception_stacks; + char *estacks = (char *)&get_cpu_entry_area(cpu)->estacks; for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index c2a54f75d335..6a09b84c13fe 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -13,8 +13,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); #ifdef CONFIG_X86_64 -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -138,9 +137,8 @@ static void __init setup_cpu_entry_area(unsigned int cpu) #ifdef CONFIG_X86_64 BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); BUILD_BUG_ON(sizeof(exception_stacks) != - sizeof(((struct cpu_entry_area *)0)->exception_stacks)); - cea_map_percpu_pages(&cea->exception_stacks, - &per_cpu(exception_stacks, cpu), + sizeof(((struct cpu_entry_area *)0)->estacks)); + cea_map_percpu_pages(&cea->estacks, &per_cpu(exception_stacks, cpu), sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); #endif percpu_setup_debug_store(cpu); -- cgit v1.2.3 From a4af767ae59cc579569bbfe49513a0037d5989ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:48 +0200 Subject: x86/cpu_entry_area: Prepare for IST guard pages To allow guard pages between the IST stacks each stack needs to be mapped individually. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.592691557@linutronix.de --- arch/x86/mm/cpu_entry_area.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 6a09b84c13fe..2b1407662a6d 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -77,6 +77,34 @@ static void __init percpu_setup_debug_store(unsigned int cpu) #endif } +#ifdef CONFIG_X86_64 + +#define cea_map_stack(name) do { \ + npages = sizeof(estacks->name## _stack) / PAGE_SIZE; \ + cea_map_percpu_pages(cea->estacks.name## _stack, \ + estacks->name## _stack, npages, PAGE_KERNEL); \ + } while (0) + +static void __init percpu_setup_exception_stacks(unsigned int cpu) +{ + struct exception_stacks *estacks = per_cpu_ptr(&exception_stacks, cpu); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + unsigned int npages; + + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + /* + * The exceptions stack mappings in the per cpu area are protected + * by guard pages so each stack must be mapped separately. 
+ */ + cea_map_stack(DF); + cea_map_stack(NMI); + cea_map_stack(DB); + cea_map_stack(MCE); +} +#else +static inline void percpu_setup_exception_stacks(unsigned int cpu) {} +#endif + /* Setup the fixmap mappings only once per-processor */ static void __init setup_cpu_entry_area(unsigned int cpu) { @@ -134,13 +162,8 @@ static void __init setup_cpu_entry_area(unsigned int cpu) per_cpu(cpu_entry_area, cpu) = cea; #endif -#ifdef CONFIG_X86_64 - BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); - BUILD_BUG_ON(sizeof(exception_stacks) != - sizeof(((struct cpu_entry_area *)0)->estacks)); - cea_map_percpu_pages(&cea->estacks, &per_cpu(exception_stacks, cpu), - sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); -#endif + percpu_setup_exception_stacks(cpu); + percpu_setup_debug_store(cpu); } -- cgit v1.2.3 From 7623f37e411156e6e09b95cf5c76e509c5fda640 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:49 +0200 Subject: x86/cpu_entry_area: Provide exception stack accessor Store a pointer to the per cpu entry area exception stack mappings to allow fast retrieval. Required for converting various places from using the shadow IST array to directly doing address calculations on the actual mapping address. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.680960459@linutronix.de --- arch/x86/include/asm/cpu_entry_area.h | 4 ++++ arch/x86/mm/cpu_entry_area.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index af8c312673de..9b406f067ecf 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -99,6 +99,7 @@ struct cpu_entry_area { #define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); +DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); extern void setup_cpu_entry_areas(void); extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); @@ -118,4 +119,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu) return &get_cpu_entry_area(cpu)->entry_stack_page.stack; } +#define __this_cpu_ist_top_va(name) \ + CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name) + #endif diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 2b1407662a6d..a00d0d059c8a 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -14,6 +14,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage) #ifdef CONFIG_X86_64 static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); +DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -92,6 +93,9 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) unsigned int npages; BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + + per_cpu(cea_exception_stacks, cpu) = &cea->estacks; + /* * The exceptions stack mappings in the per cpu area are protected * by guard pages so each stack must be mapped separately. 
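With the per-CPU pointer populated, IST stack addresses can be computed directly instead of going through the orig_ist[] shadow copy, e.g. (sketch):

    /* Top of the #DF and #DB IST stacks of the current CPU */
    unsigned long df_top = __this_cpu_ist_top_va(DF);
    unsigned long db_top = __this_cpu_ist_top_va(DB);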
-- cgit v1.2.3 From d876b67343a648f3613506c7dbfed088fa0c875b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:50 +0200 Subject: x86/traps: Use cpu_entry_area instead of orig_ist The orig_ist[] array is a shadow copy of the IST array in the TSS. The reason why it exists is that older kernels used two TSS variants with different pointers into the debug stack. orig_ist[] contains the real starting points. There is no point anymore to do so because the same information can be retrieved using the base address of the cpu entry area mapping and the offsets of the various exception stacks. No functional change. Preparation for removing orig_ist. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.784487230@linutronix.de --- arch/x86/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0524e1d74f24..06c089513d39 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -28,6 +28,7 @@ #include /* vma_pkey() */ #include /* efi_recover_from_page_fault()*/ #include /* store_idt(), ... */ +#include /* exception stack */ #define CREATE_TRACE_POINTS #include @@ -793,7 +794,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - unsigned long stack = this_cpu_read(orig_ist.ist[ESTACK_DF]) - sizeof(void *); + unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); /* * We're likely to be running with very little stack space * left. It's plausible that we'd hit this condition but -- cgit v1.2.3 From bf5882abab773afd1277415e2f826b21de28f30d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:51 +0200 Subject: x86/irq/64: Use cpu entry area instead of orig_ist The orig_ist[] array is a shadow copy of the IST array in the TSS. The reason why it exists is that older kernels used two TSS variants with different pointers into the debug stack. orig_ist[] contains the real starting points. There is no point anymore to do so because the same information can be retrieved using the base address of the cpu entry area mapping and the offsets of the various exception stacks. No functional change. Preparation for removing orig_ist. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.885741626@linutronix.de --- arch/x86/kernel/irq_64.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 182e8b245e06..7eb6f8d11bfd 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,6 +18,8 @@ #include #include #include + +#include #include #include @@ -43,10 +45,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW #define STACK_MARGIN 128 - struct orig_ist *oist; - u64 irq_stack_top, irq_stack_bottom; - u64 estack_top, estack_bottom; + u64 irq_stack_top, irq_stack_bottom, estack_top, estack_bottom; u64 curbase = (u64)task_stack_page(current); + struct cea_exception_stacks *estacks; if (user_mode(regs)) return; @@ -60,9 +61,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (regs->sp >= irq_stack_bottom && regs->sp <= irq_stack_top) return; - oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[ESTACK_DB]; - estack_bottom = estack_top - DEBUG_STKSZ + STACK_MARGIN; + estacks = __this_cpu_read(cea_exception_stacks); + estack_top = CEA_ESTACK_TOP(estacks, DB); + estack_bottom = CEA_ESTACK_BOT(estacks, DB) + STACK_MARGIN; if (regs->sp >= estack_bottom && regs->sp <= estack_top) return; -- cgit v1.2.3 From afcd21dad88b68d646e91ed36948117d58b4c197 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:52 +0200 Subject: x86/dumpstack/64: Use cpu_entry_area instead of orig_ist The orig_ist[] array is a shadow copy of the IST array in the TSS. The reason why it exists is that older kernels used two TSS variants with different pointers into the debug stack. orig_ist[] contains the real starting points. There is no point anymore to do so because the same information can be retrieved using the base address of the cpu entry area mapping and the offsets of the various exception stacks. No functional change. Preparation for removing orig_ist. Cc: Josh Poimboeuf Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.974900463@linutronix.de --- arch/x86/kernel/dumpstack_64.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 455b47ef9250..f6fbd0438f9e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,6 +16,7 @@ #include #include +#include #include static const char *exception_stack_names[N_EXCEPTION_STACKS] = { @@ -25,11 +26,6 @@ static const char *exception_stack_names[N_EXCEPTION_STACKS] = { [ ESTACK_MCE ] = "#MC", }; -static const unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [ESTACK_DB] = DEBUG_STKSZ -}; - const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); @@ -52,25 +48,44 @@ const char *stack_type_name(enum stack_type type) return NULL; } +struct estack_layout { + unsigned int begin; + unsigned int end; +}; + +#define ESTACK_ENTRY(x) { \ + .begin = offsetof(struct cea_exception_stacks, x## _stack), \ + .end = offsetof(struct cea_exception_stacks, x## _stack_guard) \ + } + +static const struct estack_layout layout[N_EXCEPTION_STACKS] = { + [ ESTACK_DF ] = ESTACK_ENTRY(DF), + [ ESTACK_NMI ] = ESTACK_ENTRY(NMI), + [ ESTACK_DB ] = ESTACK_ENTRY(DB), + [ ESTACK_MCE ] = ESTACK_ENTRY(MCE), +}; + static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin, *end; + unsigned long estacks, begin, end, stk = (unsigned long)stack; struct pt_regs *regs; - unsigned k; + unsigned int k; BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + estacks = (unsigned long)__this_cpu_read(cea_exception_stacks); + for (k = 0; k < N_EXCEPTION_STACKS; k++) { - end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; - begin = end - (exception_stack_sizes[k] / sizeof(long)); + begin = estacks + layout[k].begin; + end = estacks + layout[k].end; regs = (struct pt_regs *)end - 1; - if (stack < begin || stack >= end) + if (stk < begin || stk >= end) continue; info->type = STACK_TYPE_EXCEPTION + k; - info->begin = begin; - info->end = end; + info->begin = (unsigned long *)begin; + info->end = (unsigned long *)end; info->next_sp = (unsigned long *)regs->sp; return true; -- cgit v1.2.3 From f6ef73224a0f0400c3979c8bc68b383f9d2eb9d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:53 +0200 Subject: x86/cpu: Prepare TSS.IST setup for guard pages Convert the TSS.IST setup code to use the cpu entry area information directly instead of assuming a linear mapping of the IST stacks. The store to orig_ist[] is no longer required as there are no users anymore. This is the last preparatory step towards IST guard pages. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.061686012@linutronix.de --- arch/x86/kernel/cpu/common.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 24b801ea7522..4b01b71415f5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -507,19 +507,6 @@ void load_percpu_segment(int cpu) DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif -#ifdef CONFIG_X86_64 -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [ESTACK_DB] = DEBUG_STKSZ -}; -#endif - /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { @@ -1690,17 +1677,14 @@ static void setup_getcpu(int cpu) * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. 
We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 void cpu_init(void) { - struct orig_ist *oist; + int cpu = raw_smp_processor_id(); struct task_struct *me; struct tss_struct *t; - unsigned long v; - int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); @@ -1715,7 +1699,6 @@ void cpu_init(void) load_ucode_ap(); t = &per_cpu(cpu_tss_rw, cpu); - oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1753,16 +1736,12 @@ void cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!oist->ist[0]) { - char *estacks = (char *)&get_cpu_entry_area(cpu)->estacks; - - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += exception_stack_sizes[v]; - oist->ist[v] = t->x86_tss.ist[v] = - (unsigned long)estacks; - if (v == ESTACK_DB) - per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; - } + if (!t->x86_tss.ist[0]) { + t->x86_tss.ist[ESTACK_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[ESTACK_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[ESTACK_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[ESTACK_MCE] = __this_cpu_ist_top_va(MCE); + per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[ESTACK_DB]; } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; -- cgit v1.2.3 From 4d68c3d0ecd5fcba8876e8a58ac41ffb360de43e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:54 +0200 Subject: x86/cpu: Remove orig_ist array All users gone. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Pingfan Liu Cc: Pu Wen Cc: Sean Christopherson Cc: Vlastimil Babka Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.151435667@linutronix.de --- arch/x86/include/asm/processor.h | 9 --------- arch/x86/kernel/cpu/common.c | 6 ------ 2 files changed, 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2bb3a648fc12..8fcfcd1a8375 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -374,16 +374,7 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif -/* - * Save the original ist values for checking stack pointers during debugging - */ -struct orig_ist { - unsigned long ist[7]; -}; - #ifdef CONFIG_X86_64 -DECLARE_PER_CPU(struct orig_ist, orig_ist); - union irq_stack_union { char irq_stack[IRQ_STACK_SIZE]; /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b01b71415f5..8243f198fb7f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1549,12 +1549,6 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. 
- */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - static DEFINE_PER_CPU(unsigned long, debug_stack_addr); DEFINE_PER_CPU(int, debug_stack_usage); -- cgit v1.2.3 From 3207426925d2b4da390be8068df1d1c2b36e5918 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:55 +0200 Subject: x86/exceptions: Disconnect IST index and stack order The entry order of the TSS.IST array and the order of the stack storage/mapping are not required to be the same. With the upcoming split of the debug stack this is going to fall apart as the number of TSS.IST array entries stays the same while the actual stacks are increasing. Make them separate so that code like dumpstack can just utilize the mapping order. The IST index is solely required for the actual TSS.IST array initialization. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Baoquan He Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: Dou Liyang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Josh Poimboeuf Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Konrad Rzeszutek Wilk Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Qian Cai Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.241588113@linutronix.de --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/cpu_entry_area.h | 11 +++++++++++ arch/x86/include/asm/page_64_types.h | 9 ++++----- arch/x86/include/asm/stacktrace.h | 2 ++ arch/x86/kernel/cpu/common.c | 10 +++++----- arch/x86/kernel/idt.c | 8 ++++---- 6 files changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index fd0a50452cb3..5c0348504a4b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1129,7 +1129,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=ESTACK_DB +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 9b406f067ecf..310eeb62d418 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -35,6 +35,17 @@ struct cea_exception_stacks { ESTACKS_MEMBERS(0) }; +/* + * The exception stack ordering in [cea_]exception_stacks + */ +enum exception_stack_ordering { + ESTACK_DF, + ESTACK_NMI, + ESTACK_DB, + ESTACK_MCE, + N_EXCEPTION_STACKS +}; + #define CEA_ESTACK_SIZE(st) \ sizeof(((struct cea_exception_stacks *)0)->st## _stack) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6ab2c54c1bf9..056de887b220 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -27,11 +27,10 @@ /* * The index for the tss.ist[] array. The hardware limit is 7 entries. 
*/ -#define ESTACK_DF 0 -#define ESTACK_NMI 1 -#define ESTACK_DB 2 -#define ESTACK_MCE 3 -#define N_EXCEPTION_STACKS 4 +#define IST_INDEX_DF 0 +#define IST_INDEX_NMI 1 +#define IST_INDEX_DB 2 +#define IST_INDEX_MCE 3 /* * Set __PAGE_OFFSET to the most negative possible address + diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f335aad404a4..d6d758a187b6 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -9,6 +9,8 @@ #include #include + +#include #include enum stack_type { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8243f198fb7f..143aceaf9a9a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1731,11 +1731,11 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!t->x86_tss.ist[0]) { - t->x86_tss.ist[ESTACK_DF] = __this_cpu_ist_top_va(DF); - t->x86_tss.ist[ESTACK_NMI] = __this_cpu_ist_top_va(NMI); - t->x86_tss.ist[ESTACK_DB] = __this_cpu_ist_top_va(DB); - t->x86_tss.ist[ESTACK_MCE] = __this_cpu_ist_top_va(MCE); - per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[ESTACK_DB]; + t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[IST_INDEX_DB]; } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 2188f734ec61..6d8917875f44 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -183,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, ESTACK_DB), - ISTG(X86_TRAP_NMI, nmi, ESTACK_NMI), - ISTG(X86_TRAP_DF, double_fault, ESTACK_DF), + ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, ESTACK_MCE), + ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), #endif }; -- cgit v1.2.3 From 1bdb67e5aa2d5d43c48cb7d93393fcba276c9e71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:56 +0200 Subject: x86/exceptions: Enable IST guard pages All usage sites which expected that the exception stacks in the CPU entry area are mapped linearly are fixed up. Enable guard pages between the IST stacks. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Sean Christopherson Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.349862042@linutronix.de --- arch/x86/include/asm/cpu_entry_area.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 310eeb62d418..9c96406e6d2b 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -26,13 +26,9 @@ struct exception_stacks { ESTACKS_MEMBERS(0) }; -/* - * The effective cpu entry area mapping with guard pages. Guard size is - * zero until the code which makes assumptions about linear mappings is - * cleaned up. - */ +/* The effective cpu entry area mapping with guard pages. 
*/ struct cea_exception_stacks { - ESTACKS_MEMBERS(0) + ESTACKS_MEMBERS(PAGE_SIZE) }; /* -- cgit v1.2.3 From 2a594d4ccf3f10f80b77d71bd3dad10813ac0137 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:57 +0200 Subject: x86/exceptions: Split debug IST stack The debug IST stack is actually two separate debug stacks to handle #DB recursion. This is required because the CPU starts always at top of stack on exception entry, which means on #DB recursion the second #DB would overwrite the stack of the first. The low level entry code therefore adjusts the top of stack on entry so a secondary #DB starts from a different stack page. But the stack pages are adjacent without a guard page between them. Split the debug stack into 3 stacks which are separated by guard pages. The 3rd stack is never mapped into the cpu_entry_area and is only there to catch triple #DB nesting: --- top of DB_stack <- Initial stack --- end of DB_stack guard page --- top of DB1_stack <- Top of stack after entering first #DB --- end of DB1_stack guard page --- top of DB2_stack <- Top of stack after entering second #DB --- end of DB2_stack guard page If DB2 would not act as the final guard hole, a second #DB would point the top of #DB stack to the stack below #DB1 which would be valid and not catch the not so desired triple nesting. The backing store does not allocate any memory for DB2 and its guard page as it is not going to be mapped into the cpu_entry_area. - Adjust the low level entry code so it adjusts top of #DB with the offset between the stacks instead of exception stack size. - Make the dumpstack code aware of the new stacks. - Adjust the in_debug_stack() implementation and move it into the NMI code where it belongs. As this is NMI hotpath code, it just checks the full area between top of DB_stack and bottom of DB1_stack without checking for the guard page. That's correct because the NMI cannot hit a stackpointer pointing to the guard page between DB and DB1 stack. Even if it would, then the NMI operation still is unaffected, but the resume of the debug exception on the topmost DB stack will crash by touching the guard page. [ bp: Make exception_stack_names static const char * const ] Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: Andy Lutomirski Cc: Baoquan He Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: "Kirill A. 
Shutemov" Cc: Konrad Rzeszutek Wilk Cc: linux-doc@vger.kernel.org Cc: Masahiro Yamada Cc: Peter Zijlstra Cc: Qian Cai Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.439944544@linutronix.de --- Documentation/x86/kernel-stacks | 7 ++++++- arch/x86/entry/entry_64.S | 8 ++++---- arch/x86/include/asm/cpu_entry_area.h | 14 ++++++++++---- arch/x86/include/asm/debugreg.h | 2 -- arch/x86/include/asm/page_64_types.h | 3 --- arch/x86/kernel/asm-offsets_64.c | 2 ++ arch/x86/kernel/cpu/common.c | 11 ----------- arch/x86/kernel/dumpstack_64.c | 12 ++++++++---- arch/x86/kernel/nmi.c | 20 +++++++++++++++++++- arch/x86/mm/cpu_entry_area.c | 4 +++- 10 files changed, 52 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/x86/kernel-stacks b/Documentation/x86/kernel-stacks index 1b04596caea9..d1bfb0b95ee0 100644 --- a/Documentation/x86/kernel-stacks +++ b/Documentation/x86/kernel-stacks @@ -76,7 +76,7 @@ The currently assigned IST stacks are :- middle of switching stacks. Using IST for NMI events avoids making assumptions about the previous state of the kernel stack. -* ESTACK_DB. DEBUG_STKSZ +* ESTACK_DB. EXCEPTION_STKSZ (PAGE_SIZE). Used for hardware debug interrupts (interrupt 1) and for software debug interrupts (INT3). @@ -86,6 +86,11 @@ The currently assigned IST stacks are :- avoids making assumptions about the previous state of the kernel stack. + To handle nested #DB correctly there exist two instances of DB stacks. On + #DB entry the IST stackpointer for #DB is switched to the second instance + so a nested #DB starts from a clean stack. The nested #DB switches + the IST stackpointer to a guard hole to catch triple nesting. + * ESTACK_MCE. EXCEPTION_STKSZ (PAGE_SIZE). Used for interrupt 18 - Machine Check Exception (#MC). diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5c0348504a4b..ee649f1f279e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -879,7 +879,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will never switch stacks. This is for * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. 
*/ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -925,13 +925,13 @@ ENTRY(\sym) .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + subq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + addq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ @@ -1129,7 +1129,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 9c96406e6d2b..cff3f3f3bfe0 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -10,25 +10,29 @@ #ifdef CONFIG_X86_64 /* Macro to enforce the same ordering and stack sizes */ -#define ESTACKS_MEMBERS(guardsize) \ +#define ESTACKS_MEMBERS(guardsize, db2_holesize)\ char DF_stack_guard[guardsize]; \ char DF_stack[EXCEPTION_STKSZ]; \ char NMI_stack_guard[guardsize]; \ char NMI_stack[EXCEPTION_STKSZ]; \ + char DB2_stack_guard[guardsize]; \ + char DB2_stack[db2_holesize]; \ + char DB1_stack_guard[guardsize]; \ + char DB1_stack[EXCEPTION_STKSZ]; \ char DB_stack_guard[guardsize]; \ - char DB_stack[DEBUG_STKSZ]; \ + char DB_stack[EXCEPTION_STKSZ]; \ char MCE_stack_guard[guardsize]; \ char MCE_stack[EXCEPTION_STKSZ]; \ char IST_top_guard[guardsize]; \ /* The exception stacks' physical storage. No guard pages required */ struct exception_stacks { - ESTACKS_MEMBERS(0) + ESTACKS_MEMBERS(0, 0) }; /* The effective cpu entry area mapping with guard pages. 
*/ struct cea_exception_stacks { - ESTACKS_MEMBERS(PAGE_SIZE) + ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ) }; /* @@ -37,6 +41,8 @@ struct cea_exception_stacks { enum exception_stack_ordering { ESTACK_DF, ESTACK_NMI, + ESTACK_DB2, + ESTACK_DB1, ESTACK_DB, ESTACK_MCE, N_EXCEPTION_STACKS diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 9e5ca30738e5..1a8609a15856 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -104,11 +104,9 @@ static inline void debug_stack_usage_dec(void) { __this_cpu_dec(debug_stack_usage); } -int is_debug_stack(unsigned long addr); void debug_stack_set_zero(void); void debug_stack_reset(void); #else /* !X86_64 */ -static inline int is_debug_stack(unsigned long addr) { return 0; } static inline void debug_stack_set_zero(void) { } static inline void debug_stack_reset(void) { } static inline void debug_stack_usage_inc(void) { } diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 056de887b220..793c14c372cb 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -18,9 +18,6 @@ #define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) -#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) - #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index ddced33184b5..f5281567e28e 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -68,6 +68,8 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - + offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 143aceaf9a9a..88cab45707a9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1549,17 +1549,7 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -static DEFINE_PER_CPU(unsigned long, debug_stack_addr); DEFINE_PER_CPU(int, debug_stack_usage); - -int is_debug_stack(unsigned long addr) -{ - return __this_cpu_read(debug_stack_usage) || - (addr <= __this_cpu_read(debug_stack_addr) && - addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ)); -} -NOKPROBE_SYMBOL(is_debug_stack); - DEFINE_PER_CPU(u32, debug_idt_ctr); void debug_stack_set_zero(void) @@ -1735,7 +1725,6 @@ void cpu_init(void) t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); - per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[IST_INDEX_DB]; } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index f6fbd0438f9e..fca97bd3d8ae 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -19,16 +19,18 @@ #include #include -static const char *exception_stack_names[N_EXCEPTION_STACKS] = { +static const char * const exception_stack_names[] = { [ ESTACK_DF ] = "#DF", [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB2 ] = "#DB2", + [ ESTACK_DB1 ] = "#DB1", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); 
+	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);

 	if (type == STACK_TYPE_IRQ)
 		return "IRQ";
@@ -58,9 +60,11 @@ struct estack_layout {
 	.end	= offsetof(struct cea_exception_stacks, x## _stack_guard)	\
 	}

-static const struct estack_layout layout[N_EXCEPTION_STACKS] = {
+static const struct estack_layout layout[] = {
 	[ ESTACK_DF	] = ESTACK_ENTRY(DF),
 	[ ESTACK_NMI	] = ESTACK_ENTRY(NMI),
+	[ ESTACK_DB2	] = { .begin = 0, .end = 0},
+	[ ESTACK_DB1	] = ESTACK_ENTRY(DB1),
 	[ ESTACK_DB	] = ESTACK_ENTRY(DB),
 	[ ESTACK_MCE	] = ESTACK_ENTRY(MCE),
 };
@@ -71,7 +75,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
 	struct pt_regs *regs;
 	unsigned int k;

-	BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);

 	estacks = (unsigned long)__this_cpu_read(cea_exception_stacks);

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 18bc9b51ac9b..3755d0310026 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -21,13 +21,14 @@
 #include
 #include
 #include
+#include
 #include

 #if defined(CONFIG_EDAC)
 #include
 #endif

-#include
+#include
 #include
 #include
 #include
@@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2);
  * switch back to the original IDT.
  */
 static DEFINE_PER_CPU(int, update_debug_stack);
+
+static bool notrace is_debug_stack(unsigned long addr)
+{
+	struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks);
+	unsigned long top = CEA_ESTACK_TOP(cs, DB);
+	unsigned long bot = CEA_ESTACK_BOT(cs, DB1);
+
+	if (__this_cpu_read(debug_stack_usage))
+		return true;
+	/*
+	 * Note, this covers the guard page between DB and DB1 as well to
+	 * avoid two checks. But by all means @addr can never point into
+	 * the guard page.
+	 */
+	return addr >= bot && addr < top;
+}
+NOKPROBE_SYMBOL(is_debug_stack);
 #endif

 dotraplinkage notrace void
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index a00d0d059c8a..752ad11d6868 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -98,10 +98,12 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)

 	/*
 	 * The exceptions stack mappings in the per cpu area are protected
-	 * by guard pages so each stack must be mapped separately.
+	 * by guard pages so each stack must be mapped separately. DB2 is
+	 * not mapped; it just exists to catch triple nesting of #DB.
 	 */
 	cea_map_stack(DF);
 	cea_map_stack(NMI);
+	cea_map_stack(DB1);
 	cea_map_stack(DB);
 	cea_map_stack(MCE);
 }
-- cgit v1.2.3

From c450c8f532b63475b30e29bc600c25ab0a4ab282 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 14 Apr 2019 17:59:58 +0200
Subject: x86/dumpstack/64: Speedup in_exception_stack()

The current implementation of in_exception_stack() iterates over the
exception stacks array. Most of the time this is a useless exercise, but
even for the actual use cases (perf and ftrace) it takes at least 2
iterations to get to the NMI stack.

As the exception stacks and the guard pages are page aligned the loop can be
avoided completely.

Add an initial check whether the stack pointer is inside the full exception
stack area and leave early if not.

Create a lookup table which describes the stack area. The table index is the
page offset from the beginning of the exception stacks. So for any given
stack pointer the page offset is computed and a lookup in the description
table is performed. If it is inside a guard page, return. If not, use the
descriptor to fill in the info structure.
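To make the scheme concrete, here is a minimal user-space sketch of the same
page-indexed descriptor lookup. All names (demo_pages, in_stack_area,
AREA_PAGES) are invented for this illustration; it is not the kernel
implementation. The range designators ([1 ... 2]) are a GNU C extension, the
same one the kernel's EPAGERANGE macro below relies on.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define AREA_PAGES	8	/* hypothetical area: stacks plus guard pages */

struct page_desc {
	uint32_t offs;		/* offset of the stack within the area */
	uint16_t size;		/* size of the stack; 0 means guard page */
};

/* Pages 1-2 and 4-5 back two stacks; pages 0, 3, 6, 7 stay zeroed as guards. */
static const struct page_desc demo_pages[AREA_PAGES] = {
	[1 ... 2] = { .offs = 1 * PAGE_SIZE, .size = 2 * PAGE_SIZE },
	[4 ... 5] = { .offs = 4 * PAGE_SIZE, .size = 2 * PAGE_SIZE },
};

/* One range check and one table lookup -- no loop over the stacks. */
static bool in_stack_area(uintptr_t begin, uintptr_t addr)
{
	const struct page_desc *pd;

	if (addr < begin || addr >= begin + AREA_PAGES * PAGE_SIZE)
		return false;
	pd = &demo_pages[(addr - begin) >> PAGE_SHIFT];
	return pd->size != 0;	/* zeroed entry => guard page, bail out */
}

int main(void)
{
	uintptr_t base = 0x100000;	/* page-aligned demo base address */

	printf("%d\n", in_stack_area(base, base + 1 * PAGE_SIZE + 64));	/* 1: on a stack */
	printf("%d\n", in_stack_area(base, base + 3 * PAGE_SIZE));	/* 0: guard page */
	return 0;
}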
The table is filled at compile time and for the !KASAN case the interesting page descriptors exactly fit into a single cache line. Just the last guard page descriptor is in the next cacheline, but that should not be accessed in the regular case. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Josh Poimboeuf Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.543320386@linutronix.de --- arch/x86/kernel/dumpstack_64.c | 82 ++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index fca97bd3d8ae..f356d3ea0c70 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -50,52 +50,72 @@ const char *stack_type_name(enum stack_type type) return NULL; } -struct estack_layout { - unsigned int begin; - unsigned int end; +/** + * struct estack_pages - Page descriptor for exception stacks + * @offs: Offset from the start of the exception stack area + * @size: Size of the exception stack + * @type: Type to store in the stack_info struct + */ +struct estack_pages { + u32 offs; + u16 size; + u16 type; }; -#define ESTACK_ENTRY(x) { \ - .begin = offsetof(struct cea_exception_stacks, x## _stack), \ - .end = offsetof(struct cea_exception_stacks, x## _stack_guard) \ - } +#define EPAGERANGE(st) \ + [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ + PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ + .offs = CEA_ESTACK_OFFS(st), \ + .size = CEA_ESTACK_SIZE(st), \ + .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } -static const struct estack_layout layout[] = { - [ ESTACK_DF ] = ESTACK_ENTRY(DF), - [ ESTACK_NMI ] = ESTACK_ENTRY(NMI), - [ ESTACK_DB2 ] = { .begin = 0, .end = 0}, - [ ESTACK_DB1 ] = ESTACK_ENTRY(DB1), - [ ESTACK_DB ] = ESTACK_ENTRY(DB), - [ ESTACK_MCE ] = ESTACK_ENTRY(MCE), +/* + * Array of exception stack page descriptors. If the stack is larger than + * PAGE_SIZE, all pages covering a particular stack will have the same + * info. The guard pages including the not mapped DB2 stack are zeroed + * out. + */ +static const +struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { + EPAGERANGE(DF), + EPAGERANGE(NMI), + EPAGERANGE(DB1), + EPAGERANGE(DB), + EPAGERANGE(MCE), }; static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - unsigned long estacks, begin, end, stk = (unsigned long)stack; + unsigned long begin, end, stk = (unsigned long)stack; + const struct estack_pages *ep; struct pt_regs *regs; unsigned int k; BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); - estacks = (unsigned long)__this_cpu_read(cea_exception_stacks); - - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - begin = estacks + layout[k].begin; - end = estacks + layout[k].end; - regs = (struct pt_regs *)end - 1; + begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + end = begin + sizeof(struct cea_exception_stacks); + /* Bail if @stack is outside the exception stack area. */ + if (stk < begin || stk >= end) + return false; - if (stk < begin || stk >= end) - continue; + /* Calc page offset from start of exception stacks */ + k = (stk - begin) >> PAGE_SHIFT; + /* Lookup the page descriptor */ + ep = &estack_pages[k]; + /* Guard page? 
 */
+	if (!ep->size)
+		return false;

-		info->type	= STACK_TYPE_EXCEPTION + k;
-		info->begin	= (unsigned long *)begin;
-		info->end	= (unsigned long *)end;
-		info->next_sp	= (unsigned long *)regs->sp;
+	begin += (unsigned long)ep->offs;
+	end = begin + (unsigned long)ep->size;
+	regs = (struct pt_regs *)end - 1;

-		return true;
-	}
-
-	return false;
+	info->type	= ep->type;
+	info->begin	= (unsigned long *)begin;
+	info->end	= (unsigned long *)end;
+	info->next_sp	= (unsigned long *)regs->sp;
+	return true;
 }

 static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
-- cgit v1.2.3

From aa641c287b2f7676f6f0064a8351daf08eca6b0a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 14 Apr 2019 17:59:59 +0200
Subject: x86/irq/32: Define IRQ_STACK_SIZE

On 32-bit IRQ_STACK_SIZE is the same as THREAD_SIZE. To allow sharing struct
irq_stack with 32-bit, define IRQ_STACK_SIZE for 32-bit and use it for
struct irq_stack.

No functional change.

Signed-off-by: Thomas Gleixner
Signed-off-by: Borislav Petkov
Cc: Andrew Morton
Cc: Andy Lutomirski
Cc: Dave Hansen
Cc: "H. Peter Anvin"
Cc: Ingo Molnar
Cc: Jiri Kosina
Cc: Josh Poimboeuf
Cc: Juergen Gross
Cc: Michal Hocko
Cc: Nick Desaulniers
Cc: Sean Christopherson
Cc: Suravee Suthikulpanit
Cc: Vlastimil Babka
Cc: x86-ml
Link: https://lkml.kernel.org/r/20190414160145.632513987@linutronix.de
---
 arch/x86/include/asm/page_32_types.h | 2 ++
 arch/x86/include/asm/processor.h     | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 57b5dc15ca7e..565ad755c785 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -22,6 +22,8 @@
 #define THREAD_SIZE_ORDER	1
 #define THREAD_SIZE		(PAGE_SIZE << THREAD_SIZE_ORDER)

+#define IRQ_STACK_SIZE		THREAD_SIZE
+
 #define N_EXCEPTION_STACKS	1

 #ifdef CONFIG_X86_PAE
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 8fcfcd1a8375..8b4bf732e1b5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -422,8 +422,8 @@ DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
  * per-CPU IRQ handling stacks
  */
 struct irq_stack {
-	u32			stack[THREAD_SIZE/sizeof(u32)];
-} __aligned(THREAD_SIZE);
+	u32			stack[IRQ_STACK_SIZE / sizeof(u32)];
+} __aligned(IRQ_STACK_SIZE);

 DECLARE_PER_CPU(struct irq_stack *, hardirq_stack);
 DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
-- cgit v1.2.3

From 231c4846b106d526fa212b02b37447d3f2fcc99d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 14 Apr 2019 18:00:00 +0200
Subject: x86/irq/32: Make irq stack a character array

There is no reason to have a u32 array in struct irq_stack. The only purpose
of the array is to size the struct properly.

Preparatory change for sharing struct irq_stack with 64-bit.

Signed-off-by: Thomas Gleixner
Signed-off-by: Borislav Petkov
Cc: Alexey Dobriyan
Cc: Andrew Morton
Cc: Andy Lutomirski
Cc: "H.
Peter Anvin" Cc: Ingo Molnar Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Nick Desaulniers Cc: Pingfan Liu Cc: Pu Wen Cc: Sean Christopherson Cc: Vlastimil Babka Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.736241969@linutronix.de --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 8b4bf732e1b5..ff3f469ab2d4 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -422,7 +422,7 @@ DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); * per-CPU IRQ handling stacks */ struct irq_stack { - u32 stack[IRQ_STACK_SIZE / sizeof(u32)]; + char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); DECLARE_PER_CPU(struct irq_stack *, hardirq_stack); -- cgit v1.2.3 From a754fe2b76d1d6bce7069657bba975034f3ad961 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 18:00:01 +0200 Subject: x86/irq/32: Rename hard/softirq_stack to hard/softirq_stack_ptr The percpu storage holds a pointer to the stack not the stack itself. Rename it before sharing struct irq_stack with 64-bit. No functional changes. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Nick Desaulniers Cc: Nicolai Stange Cc: Sean Christopherson Cc: Vlastimil Babka Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.824805922@linutronix.de --- arch/x86/include/asm/processor.h | 4 ++-- arch/x86/kernel/dumpstack_32.c | 4 ++-- arch/x86/kernel/irq_32.c | 19 ++++++++++--------- 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ff3f469ab2d4..0b7d866a1ad5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -425,8 +425,8 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack); -DECLARE_PER_CPU(struct irq_stack *, softirq_stack); +DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* X86_64 */ extern unsigned int fpu_kernel_xstate_size; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index d305440ebe9c..64a59d726639 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -34,7 +34,7 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* @@ -59,7 +59,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 95600a99ae93..f37489c806fa 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -51,8 +51,8 @@ static inline int check_stack_overflow(void) { return 0; } static inline 
void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); static void call_on_stack(void *func, void *stack) { @@ -76,7 +76,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack); + irqstk = __this_cpu_read(hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. However, if we are @@ -113,21 +113,22 @@ void irq_ctx_init(int cpu) { struct irq_stack *irqstk; - if (per_cpu(hardirq_stack, cpu)) + if (per_cpu(hardirq_stack_ptr, cpu)) return; irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), THREADINFO_GFP, THREAD_SIZE_ORDER)); - per_cpu(hardirq_stack, cpu) = irqstk; + per_cpu(hardirq_stack_ptr, cpu) = irqstk; irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), THREADINFO_GFP, THREAD_SIZE_ORDER)); - per_cpu(softirq_stack, cpu) = irqstk; + per_cpu(softirq_stack_ptr, cpu) = irqstk; - printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); + pr_debug("CPU %u irqstacks, hard=%p soft=%p\n", + cpu, per_cpu(hardirq_stack_ptr, cpu), + per_cpu(softirq_stack_ptr, cpu)); } void do_softirq_own_stack(void) @@ -135,7 +136,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack); + irqstk = __this_cpu_read(softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); -- cgit v1.2.3 From 758a2e312228410f2f5092ade558109e93dc3ee8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 18:00:02 +0200 Subject: x86/irq/64: Rename irq_stack_ptr to hardirq_stack_ptr Preparatory patch to share code with 32bit. No functional changes. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Mike Rapoport Cc: Nick Desaulniers Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Pingfan Liu Cc: Sean Christopherson Cc: Stephen Rothwell Cc: Vlastimil Babka Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.912584074@linutronix.de --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/dumpstack_64.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- arch/x86/kernel/setup_percpu.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ee649f1f279e..726abbe6c6d8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -431,7 +431,7 @@ END(irq_entries_start) */ movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) - movq PER_CPU_VAR(irq_stack_ptr), %rsp + movq PER_CPU_VAR(hardirq_stack_ptr), %rsp #ifdef CONFIG_DEBUG_ENTRY /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0b7d866a1ad5..5e3dd4e2136d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -396,7 +396,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); } -DECLARE_PER_CPU(char *, irq_stack_ptr); +DECLARE_PER_CPU(char *, hardirq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 88cab45707a9..13ec72bb8f36 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1510,7 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, irq_stack_ptr) = +DEFINE_PER_CPU(char *, hardirq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index f356d3ea0c70..753b8cfe8b8a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -120,7 +120,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 7eb6f8d11bfd..f0c7356c8969 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -56,7 +56,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_top = (u64)__this_cpu_read(irq_stack_ptr); + irq_stack_top = (u64)__this_cpu_read(hardirq_stack_ptr); irq_stack_bottom = irq_stack_top - IRQ_STACK_SIZE + STACK_MARGIN; if (regs->sp >= irq_stack_bottom && regs->sp <= irq_stack_top) return; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4bf46575568a..657343ecc2da 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -245,7 +245,7 @@ void __init setup_per_cpu_areas(void) early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); #endif #ifdef CONFIG_X86_64 - per_cpu(irq_stack_ptr, cpu) 
=
+	per_cpu(hardirq_stack_ptr, cpu) =
		per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE;
 #endif
-- cgit v1.2.3

From 451f743a64e1cf979f5fe21a1b2a015feb559f72 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 14 Apr 2019 18:00:03 +0200
Subject: x86/irq/32: Invoke irq_ctx_init() from init_IRQ()

irq_ctx_init() is invoked from native_init_IRQ() or from xen_init_IRQ()
code. There is no reason to have this split. The interrupt stacks must be
allocated no matter what.

Invoke it from init_IRQ() before invoking the native or XEN init
implementation.

Signed-off-by: Thomas Gleixner
Signed-off-by: Borislav Petkov
Reviewed-by: Juergen Gross
Cc: Andy Lutomirski
Cc: Boris Ostrovsky
Cc: "H. Peter Anvin"
Cc: Ingo Molnar
Cc: Josh Abraham
Cc: Josh Poimboeuf
Cc: Juergen Gross
Cc: Mike Rapoport
Cc: Nicolai Stange
Cc: Sean Christopherson
Cc: Stefano Stabellini
Cc: Stephen Rothwell
Cc: x86-ml
Cc: xen-devel@lists.xenproject.org
Link: https://lkml.kernel.org/r/20190414160146.001162606@linutronix.de
---
 arch/x86/kernel/irqinit.c        | 4 ++--
 drivers/xen/events/events_base.c | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a0693b71cfc1..26b5cb5386b9 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -91,6 +91,8 @@ void __init init_IRQ(void)
 	for (i = 0; i < nr_legacy_irqs(); i++)
 		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);

+	irq_ctx_init(smp_processor_id());
+
 	x86_init.irqs.intr_init();
 }

@@ -104,6 +106,4 @@ void __init native_init_IRQ(void)
 	if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
 		setup_irq(2, &irq2);
-
-	irq_ctx_init(smp_processor_id());
 }
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 117e76b2f939..084e45882c73 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1687,7 +1687,6 @@ void __init xen_init_IRQ(void)
 #ifdef CONFIG_X86
 	if (xen_pv_domain()) {
-		irq_ctx_init(smp_processor_id());
 		if (xen_initial_domain())
 			pci_xen_initial_domain();
 	}
-- cgit v1.2.3

From 66c7ceb47f628c8bd4f84a6d01c2725ded6a342d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 14 Apr 2019 18:00:04 +0200
Subject: x86/irq/32: Handle irq stack allocation failure properly

irq_ctx_init() crashes hard on page allocation failures. While that's ok
during early boot, it's just wrong in the CPU hotplug bringup code.

Check for page allocation failure, return -ENOMEM, and handle it at the call
sites. On early boot the only way out is to BUG(), but on CPU hotplug there
is no reason to crash, so just abort the operation.

Rename the function to something more sensible while at it.

Signed-off-by: Thomas Gleixner
Signed-off-by: Borislav Petkov
Cc: Alison Schofield
Cc: Andrew Morton
Cc: Andy Lutomirski
Cc: Anshuman Khandual
Cc: Boris Ostrovsky
Cc: "H.
Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Nicolai Stange Cc: Pu Wen Cc: Sean Christopherson Cc: Shaokun Zhang Cc: Stefano Stabellini Cc: Suravee Suthikulpanit Cc: x86-ml Cc: xen-devel@lists.xenproject.org Cc: Yazen Ghannam Cc: Yi Wang Cc: Zhenzhong Duan Link: https://lkml.kernel.org/r/20190414160146.089060584@linutronix.de --- arch/x86/include/asm/irq.h | 4 ++-- arch/x86/include/asm/smp.h | 2 +- arch/x86/kernel/irq_32.c | 32 ++++++++++++++++---------------- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/smpboot.c | 15 ++++++++++++--- arch/x86/xen/smp_pv.c | 4 +++- 6 files changed, 35 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index fbb16e6b6c18..d751e8440a6b 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -17,9 +17,9 @@ static inline int irq_canonicalize(int irq) } #ifdef CONFIG_X86_32 -extern void irq_ctx_init(int cpu); +extern int irq_init_percpu_irqstack(unsigned int cpu); #else -# define irq_ctx_init(cpu) do { } while (0) +static inline int irq_init_percpu_irqstack(unsigned int cpu) { return 0; } #endif #define __ARCH_HAS_DO_SOFTIRQ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 2e95b6c1bca3..da545df207b2 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -131,7 +131,7 @@ void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); -void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); +int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); int common_cpu_die(unsigned int cpu); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index f37489c806fa..fc34816c6f04 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -107,28 +107,28 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) } /* - * allocate per-cpu stacks for hardirq and for softirq processing + * Allocate per-cpu stacks for hardirq and softirq processing */ -void irq_ctx_init(int cpu) +int irq_init_percpu_irqstack(unsigned int cpu) { - struct irq_stack *irqstk; + int node = cpu_to_node(cpu); + struct page *ph, *ps; if (per_cpu(hardirq_stack_ptr, cpu)) - return; - - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(hardirq_stack_ptr, cpu) = irqstk; + return 0; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(softirq_stack_ptr, cpu) = irqstk; + ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ph) + return -ENOMEM; + ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ps) { + __free_pages(ph, THREAD_SIZE_ORDER); + return -ENOMEM; + } - pr_debug("CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_stack_ptr, cpu), - per_cpu(softirq_stack_ptr, cpu)); + per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + return 0; } void do_softirq_own_stack(void) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 26b5cb5386b9..16919a9671fa 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -91,7 +91,7 @@ void __init init_IRQ(void) for (i = 0; i < nr_legacy_irqs(); i++) 
per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); - irq_ctx_init(smp_processor_id()); + BUG_ON(irq_init_percpu_irqstack(smp_processor_id())); x86_init.irqs.intr_init(); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ce1a67b70168..c92b21f9e9dc 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -935,20 +935,27 @@ out: return boot_error; } -void common_cpu_up(unsigned int cpu, struct task_struct *idle) +int common_cpu_up(unsigned int cpu, struct task_struct *idle) { + int ret; + /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); per_cpu(current_task, cpu) = idle; + /* Initialize the interrupt stack(s) */ + ret = irq_init_percpu_irqstack(cpu); + if (ret) + return ret; + #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - irq_ctx_init(cpu); per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif + return 0; } /* @@ -1106,7 +1113,9 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) /* the FPU context is blank, nobody can own it */ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; - common_cpu_up(cpu, tidle); + err = common_cpu_up(cpu, tidle); + if (err) + return err; err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); if (err) { diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 145506f9fdbe..590fcf863006 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -361,7 +361,9 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) { int rc; - common_cpu_up(cpu, idle); + rc = common_cpu_up(cpu, idle); + if (rc) + return rc; xen_setup_runstate_info(cpu); -- cgit v1.2.3 From 0ac26104208450d35c4e68754ce0c67b3a4d7802 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 18:00:05 +0200 Subject: x86/irq/64: Init hardirq_stack_ptr during CPU hotplug Preparatory change for disentangling the irq stack union as a prerequisite for irq stacks with guard pages. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Konrad Rzeszutek Wilk Cc: Nicolai Stange Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Cc: Yi Wang Link: https://lkml.kernel.org/r/20190414160146.177558566@linutronix.de --- arch/x86/include/asm/irq.h | 4 ---- arch/x86/kernel/cpu/common.c | 4 +--- arch/x86/kernel/irq_64.c | 15 +++++++++++++++ 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index d751e8440a6b..8f95686ec27e 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -16,11 +16,7 @@ static inline int irq_canonicalize(int irq) return ((irq == 2) ? 
9 : irq); } -#ifdef CONFIG_X86_32 extern int irq_init_percpu_irqstack(unsigned int cpu); -#else -static inline int irq_init_percpu_irqstack(unsigned int cpu) { return 0; } -#endif #define __ARCH_HAS_DO_SOFTIRQ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 13ec72bb8f36..1222080838da 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1510,9 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, hardirq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; - +DEFINE_PER_CPU(char *, hardirq_stack_ptr); DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f0c7356c8969..c0bea0d7d76a 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -87,3 +87,18 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) generic_handle_irq_desc(desc); return true; } + +static int map_irq_stack(unsigned int cpu) +{ + void *va = per_cpu_ptr(irq_stack_union.irq_stack, cpu); + + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} + +int irq_init_percpu_irqstack(unsigned int cpu) +{ + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; + return map_irq_stack(cpu); +} -- cgit v1.2.3 From e6401c13093173aad709a5c6de00cf8d692ee786 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2019 18:00:06 +0200 Subject: x86/irq/64: Split the IRQ stack into its own pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the IRQ stack is hardcoded as the first page of the percpu area, and the stack canary lives on the IRQ stack. The former gets in the way of adding an IRQ stack guard page, and the latter is a potential weakness in the stack canary mechanism. Split the IRQ stack into its own private percpu pages. [ tglx: Make 64 and 32 bit share struct irq_stack ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Ard Biesheuvel Cc: Boris Ostrovsky Cc: Brijesh Singh Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: Feng Tang Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jan Beulich Cc: Jiri Kosina Cc: Joerg Roedel Cc: Jordan Borgner Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Maran Wilson Cc: Masahiro Yamada Cc: Michal Hocko Cc: Mike Rapoport Cc: Nick Desaulniers Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Pu Wen Cc: "Rafael Ávila de Espíndola" Cc: Sean Christopherson Cc: Stefano Stabellini Cc: Vlastimil Babka Cc: x86-ml Cc: xen-devel@lists.xenproject.org Link: https://lkml.kernel.org/r/20190414160146.267376656@linutronix.de --- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/processor.h | 32 ++++++++++++++------------------ arch/x86/include/asm/stackprotector.h | 6 +++--- arch/x86/kernel/asm-offsets_64.c | 2 +- arch/x86/kernel/cpu/common.c | 8 ++++---- arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/irq_64.c | 5 ++++- arch/x86/kernel/setup_percpu.c | 5 ----- arch/x86/kernel/vmlinux.lds.S | 7 ++++--- arch/x86/tools/relocs.c | 2 +- arch/x86/xen/xen-head.S | 10 +++++----- 11 files changed, 39 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 726abbe6c6d8..cfe4d6ea258d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -298,7 +298,7 @@ ENTRY(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset #endif #ifdef CONFIG_RETPOLINE @@ -430,7 +430,7 @@ END(irq_entries_start) * it before we actually move ourselves to the IRQ stack. */ - movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8) movq PER_CPU_VAR(hardirq_stack_ptr), %rsp #ifdef CONFIG_DEBUG_ENTRY diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5e3dd4e2136d..7e99ef67bff0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -367,6 +367,13 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); #define __KERNEL_TSS_LIMIT \ (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) +/* Per CPU interrupt stacks */ +struct irq_stack { + char stack[IRQ_STACK_SIZE]; +} __aligned(IRQ_STACK_SIZE); + +DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); + #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else @@ -375,28 +382,24 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #endif #ifdef CONFIG_X86_64 -union irq_stack_union { - char irq_stack[IRQ_STACK_SIZE]; +struct fixed_percpu_data { /* * GCC hardcodes the stack canary as %gs:40. Since the * irq_stack is the object at %gs:0, we reserve the bottom * 48 bytes of the irq stack for the canary. 
*/ - struct { - char gs_base[40]; - unsigned long stack_canary; - }; + char gs_base[40]; + unsigned long stack_canary; }; -DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; -DECLARE_INIT_PER_CPU(irq_stack_union); +DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; +DECLARE_INIT_PER_CPU(fixed_percpu_data); static inline unsigned long cpu_kernelmode_gs_base(int cpu) { - return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); + return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(char *, hardirq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); @@ -418,14 +421,7 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* - * per-CPU IRQ handling stacks - */ -struct irq_stack { - char stack[IRQ_STACK_SIZE]; -} __aligned(IRQ_STACK_SIZE); - -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +/* Per CPU softirq stack pointer */ DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* X86_64 */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 8ec97a62c245..91e29b6a86a5 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -13,7 +13,7 @@ * On x86_64, %gs is shared by percpu area and stack canary. All * percpu symbols are zero based and %gs points to the base of percpu * area. The first occupant of the percpu area is always - * irq_stack_union which contains stack_canary at offset 40. Userland + * fixed_percpu_data which contains stack_canary at offset 40. Userland * %gs is always saved and restored on kernel entry and exit using * swapgs, so stack protector doesn't add any complexity there. * @@ -64,7 +64,7 @@ static __always_inline void boot_init_stack_canary(void) u64 tsc; #ifdef CONFIG_X86_64 - BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40); + BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); #endif /* * We both use the random pool and the current TSC as a source @@ -79,7 +79,7 @@ static __always_inline void boot_init_stack_canary(void) current->stack_canary = canary; #ifdef CONFIG_X86_64 - this_cpu_write(irq_stack_union.stack_canary, canary); + this_cpu_write(fixed_percpu_data.stack_canary, canary); #else this_cpu_write(stack_canary.canary, canary); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index f5281567e28e..d3d075226c0a 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -73,7 +73,7 @@ int main(void) BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); BLANK(); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1222080838da..801c6f040faa 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1498,9 +1498,9 @@ static __init int setup_clearcpuid(char *arg) __setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); +DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, + fixed_percpu_data) __aligned(PAGE_SIZE) __visible; +EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); /* * The following percpu variables are hot. 
Align current_task to @@ -1510,7 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, hardirq_stack_ptr); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d1dbe8e4eb82..bcd206c8ac90 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,7 +265,7 @@ ENDPROC(start_cpu0) GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) - .quad INIT_PER_CPU_VAR(irq_stack_union) + .quad INIT_PER_CPU_VAR(fixed_percpu_data) GLOBAL(initial_stack) /* * The SIZEOF_PTREGS gap is a convention which helps the in-kernel diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index c0bea0d7d76a..c0f89d136b80 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -23,6 +23,9 @@ #include #include +DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; +DECLARE_INIT_PER_CPU(irq_stack_backing_store); + int sysctl_panic_on_stackoverflow; /* @@ -90,7 +93,7 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) static int map_irq_stack(unsigned int cpu) { - void *va = per_cpu_ptr(irq_stack_union.irq_stack, cpu); + void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; return 0; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 657343ecc2da..86663874ef04 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -244,11 +244,6 @@ void __init setup_per_cpu_areas(void) per_cpu(x86_cpu_to_logical_apicid, cpu) = early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); #endif -#ifdef CONFIG_X86_64 - per_cpu(hardirq_stack_ptr, cpu) = - per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE; -#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bad8c51fee6e..a5af9a7c4be4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -403,7 +403,8 @@ SECTIONS */ #define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); +INIT_PER_CPU(fixed_percpu_data); +INIT_PER_CPU(irq_stack_backing_store); /* * Build-time check on the image size: @@ -412,8 +413,8 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. 
= ASSERT((fixed_percpu_data == 0), + "fixed_percpu_data is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index b629f6992d9f..efa483205e43 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -738,7 +738,7 @@ static void percpu_init(void) * __per_cpu_load * * The "gold" linker incorrectly associates: - * init_per_cpu__irq_stack_union + * init_per_cpu__fixed_percpu_data * init_per_cpu__gdt_page */ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 5077ead5e59c..c1d8b90aa4e2 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -40,13 +40,13 @@ ENTRY(startup_xen) #ifdef CONFIG_X86_64 /* Set up %gs. * - * The base of %gs always points to the bottom of the irqstack - * union. If the stack protector canary is enabled, it is - * located at %gs:40. Note that, on SMP, the boot cpu uses - * init data section till per cpu areas are set up. + * The base of %gs always points to fixed_percpu_data. If the + * stack protector canary is enabled, it is located at %gs:40. + * Note that, on SMP, the boot cpu uses init data section until + * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq $INIT_PER_CPU_VAR(irq_stack_union),%rax + movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax cdq wrmsr #endif -- cgit v1.2.3 From 18b7a6bef62de1d598fbff23b52114b7775ecf00 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2019 18:00:07 +0200 Subject: x86/irq/64: Remap the IRQ stack with guard pages The IRQ stack lives in percpu space, so an IRQ handler that overflows it will overwrite other data structures. Use vmap() to remap the IRQ stack so that it will have the usual guard pages that vmap()/vmalloc() allocations have. With this, the kernel will panic immediately on an IRQ stack overflow. [ tglx: Move the map code to a proper place and invoke it only when a CPU is about to be brought online. No point in installing the map at early boot for all possible CPUs. Fail the CPU bringup if the vmap() fails as done for all other preparatory stages in CPU hotplug. ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160146.363733568@linutronix.de --- arch/x86/kernel/irq_64.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index c0f89d136b80..f107eb2021f6 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -91,6 +91,35 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) return true; } +#ifdef CONFIG_VMAP_STACK +/* + * VMAP the backing store with guard pages + */ +static int map_irq_stack(unsigned int cpu) +{ + char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu); + struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE]; + void *va; + int i; + + for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) { + phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT)); + + pages[i] = pfn_to_page(pa >> PAGE_SHIFT); + } + + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!va) + return -ENOMEM; + + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} +#else +/* + * If VMAP stacks are disabled due to KASAN, just use the per cpu + * backing store without guard pages. + */ static int map_irq_stack(unsigned int cpu) { void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); @@ -98,6 +127,7 @@ static int map_irq_stack(unsigned int cpu) per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; return 0; } +#endif int irq_init_percpu_irqstack(unsigned int cpu) { -- cgit v1.2.3 From 117ed45485413b1977bfc638c32bf5b01d53c62b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 18:00:08 +0200 Subject: x86/irq/64: Remove stack overflow debug code All stack types on x86 64-bit have guard pages now. So there is no point in executing probabilistic overflow checks as the guard pages are a accurate and reliable overflow prevention. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160146.466354762@linutronix.de --- arch/x86/Kconfig | 2 +- arch/x86/kernel/irq_64.c | 56 ------------------------------------------------ 2 files changed, 1 insertion(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5ad92419be19..fd06614b09a7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,6 +14,7 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS + select HAVE_DEBUG_STACKOVERFLOW select MODULES_USE_ELF_REL select OLD_SIGACTION @@ -138,7 +139,6 @@ config X86 select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK - select HAVE_DEBUG_STACKOVERFLOW select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f107eb2021f6..6bf6517a05bb 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,64 +26,8 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; DECLARE_INIT_PER_CPU(irq_stack_backing_store); -int sysctl_panic_on_stackoverflow; - -/* - * Probabilistic stack overflow check: - * - * Regular device interrupts can enter on the following stacks: - * - * - User stack - * - * - Kernel task stack - * - * - Interrupt stack if a device driver reenables interrupts - * which should only happen in really old drivers. - * - * - Debug IST stack - * - * All other contexts are invalid. - */ -static inline void stack_overflow_check(struct pt_regs *regs) -{ -#ifdef CONFIG_DEBUG_STACKOVERFLOW -#define STACK_MARGIN 128 - u64 irq_stack_top, irq_stack_bottom, estack_top, estack_bottom; - u64 curbase = (u64)task_stack_page(current); - struct cea_exception_stacks *estacks; - - if (user_mode(regs)) - return; - - if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_MARGIN && - regs->sp <= curbase + THREAD_SIZE) - return; - - irq_stack_top = (u64)__this_cpu_read(hardirq_stack_ptr); - irq_stack_bottom = irq_stack_top - IRQ_STACK_SIZE + STACK_MARGIN; - if (regs->sp >= irq_stack_bottom && regs->sp <= irq_stack_top) - return; - - estacks = __this_cpu_read(cea_exception_stacks); - estack_top = CEA_ESTACK_TOP(estacks, DB); - estack_bottom = CEA_ESTACK_BOT(estacks, DB) + STACK_MARGIN; - if (regs->sp >= estack_bottom && regs->sp <= estack_top) - return; - - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx, irq stack:%Lx-%Lx, exception stack: %Lx-%Lx, ip:%pF)\n", - current->comm, curbase, regs->sp, - irq_stack_bottom, irq_stack_top, - estack_bottom, estack_top, (void *)regs->ip); - - if (sysctl_panic_on_stackoverflow) - panic("low stack detected by irq handler - check messages\n"); -#endif -} - bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - stack_overflow_check(regs); - if (IS_ERR_OR_NULL(desc)) return false; -- cgit v1.2.3 From cae5ec342645746d617dd420d206e1588d47768a Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 12 Apr 2019 17:50:57 -0400 Subject: x86/speculation/mds: Fix comment s/L1TF/MDS/ Signed-off-by: Boris Ostrovsky Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner Reviewed-by: Tyler Hicks Reviewed-by: Josh Poimboeuf --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 
22a14d4b68a2..0642505dda69 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -221,7 +221,7 @@ static void x86_amd_ssb_disable(void)
 #undef pr_fmt
 #define pr_fmt(fmt)	"MDS: " fmt

-/* Default mitigation for L1TF-affected CPUs */
+/* Default mitigation for MDS-affected CPUs */
 static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
 static bool mds_nosmt __ro_after_init = false;
-- cgit v1.2.3

From e2c3c94788b08891dcf3dbe608f9880523ecd71b Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Fri, 12 Apr 2019 17:50:58 -0400
Subject: x86/speculation/mds: Print SMT vulnerable on MSBDS with mitigations off

This code is only for CPUs which are affected by MSBDS, but are *not*
affected by the other two MDS issues. For such CPUs, enabling the
mds_idle_clear mitigation is enough to mitigate SMT.

However, if the user boots with 'mds=off' and still has SMT enabled, we
should not report that SMT is mitigated:

  $ cat /sys/devices/system/cpu/vulnerabilities/mds
  Vulnerable; SMT mitigated

But rather:

  Vulnerable; SMT vulnerable

Signed-off-by: Konrad Rzeszutek Wilk
Signed-off-by: Thomas Gleixner
Reviewed-by: Tyler Hicks
Reviewed-by: Josh Poimboeuf
Link: https://lkml.kernel.org/r/20190412215118.294906495@localhost.localdomain
---
 arch/x86/kernel/cpu/bugs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0642505dda69..6b8a55c7cebc 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1204,7 +1204,8 @@ static ssize_t mds_show_state(char *buf)
 	if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
 		return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
-			       sched_smt_active() ? "mitigated" : "disabled");
+			       (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
+			        sched_smt_active() ? "mitigated" : "disabled"));
 	}

 	return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
-- cgit v1.2.3

From d68be4c4d31295ff6ae34a8ddfaa4c1a8ff42812 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf
Date: Fri, 12 Apr 2019 15:39:29 -0500
Subject: x86/speculation: Support 'mitigations=' cmdline option

Configure x86 runtime CPU speculation bug mitigations in accordance with the
'mitigations=' cmdline option. This affects Meltdown, Spectre v2, Speculative
Store Bypass, and L1TF.

The default behavior is unchanged.

Signed-off-by: Josh Poimboeuf
Signed-off-by: Thomas Gleixner
Tested-by: Jiri Kosina (on x86)
Reviewed-by: Jiri Kosina
Cc: Borislav Petkov
Cc: "H.
Peter Anvin" Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Waiman Long Cc: Andrea Arcangeli Cc: Jon Masters Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux-s390@vger.kernel.org Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: Greg Kroah-Hartman Cc: Tyler Hicks Cc: Linus Torvalds Cc: Randy Dunlap Cc: Steven Price Cc: Phil Auld Link: https://lkml.kernel.org/r/6616d0ae169308516cfdf5216bedd169f8a8291b.1555085500.git.jpoimboe@redhat.com --- Documentation/admin-guide/kernel-parameters.txt | 16 +++++++++++----- arch/x86/kernel/cpu/bugs.c | 11 +++++++++-- arch/x86/mm/pti.c | 4 +++- 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 720ffa9c4e04..779ddeb2929c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2514,15 +2514,20 @@ http://repo.or.cz/w/linux-2.6/mini2440.git mitigations= - Control optional mitigations for CPU vulnerabilities. - This is a set of curated, arch-independent options, each - of which is an aggregation of existing arch-specific - options. + [X86] Control optional mitigations for CPU + vulnerabilities. This is a set of curated, + arch-independent options, each of which is an + aggregation of existing arch-specific options. off Disable all optional CPU mitigations. This improves system performance, but it may also expose users to several CPU vulnerabilities. + Equivalent to: nopti [X86] + nospectre_v2 [X86] + spectre_v2_user=off [X86] + spec_store_bypass_disable=off [X86] + l1tf=off [X86] auto (default) Mitigate all CPU vulnerabilities, but leave SMT @@ -2530,12 +2535,13 @@ users who don't want to be surprised by SMT getting disabled across kernel upgrades, or who have other ways of avoiding SMT-based attacks. - This is the default behavior. + Equivalent to: (default behavior) auto,nosmt Mitigate all CPU vulnerabilities, disabling SMT if needed. This is for users who always want to be fully mitigated, even if it means losing SMT. 
+ Equivalent to: l1tf=flush,nosmt [X86] mminit_loglevel= [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 01874d54f4fd..435c078c2948 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -440,7 +440,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || + cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); @@ -672,7 +673,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || + cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; } else { ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", @@ -996,6 +998,11 @@ static void __init l1tf_select_mitigation(void) if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; + if (cpu_mitigations_off()) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 4fee5c3003ed..5890f09bfc19 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -115,7 +116,8 @@ void __init pti_check_boottime_disable(void) } } - if (cmdline_find_option_bool(boot_command_line, "nopti")) { + if (cmdline_find_option_bool(boot_command_line, "nopti") || + cpu_mitigations_off()) { pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on command line."); return; -- cgit v1.2.3 From ea094d53580f40c2124cef3d072b73b2425e7bfd Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Wed, 17 Apr 2019 09:18:50 -0500 Subject: x86/PCI: Fix PCI IRQ routing table memory leak In pcibios_irq_init(), the PCI IRQ routing table 'pirq_table' is first found through pirq_find_routing_table(). If the table is not found and CONFIG_PCI_BIOS is defined, the table is then allocated in pcibios_get_irq_routing_table() using kmalloc(). Later, if the I/O APIC is used, this table is actually not used. In that case, the allocated table is not freed, which is a memory leak. Free the allocated table if it is not used. 
Signed-off-by: Wenwen Wang [bhelgaas: added Ingo's reviewed-by, since the only change since v1 was to use the irq_routing_table local variable name he suggested] Signed-off-by: Bjorn Helgaas Reviewed-by: Ingo Molnar Acked-by: Thomas Gleixner --- arch/x86/pci/irq.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 52e55108404e..d3a73f9335e1 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1119,6 +1119,8 @@ static const struct dmi_system_id pciirq_dmi_table[] __initconst = { void __init pcibios_irq_init(void) { + struct irq_routing_table *rtable = NULL; + DBG(KERN_DEBUG "PCI: IRQ init\n"); if (raw_pci_ops == NULL) @@ -1129,8 +1131,10 @@ void __init pcibios_irq_init(void) pirq_table = pirq_find_routing_table(); #ifdef CONFIG_PCI_BIOS - if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) { pirq_table = pcibios_get_irq_routing_table(); + rtable = pirq_table; + } #endif if (pirq_table) { pirq_peer_trick(); @@ -1145,8 +1149,10 @@ void __init pcibios_irq_init(void) * If we're using the I/O APIC, avoid using the PCI IRQ * routing table */ - if (io_apic_assign_pci_irqs) + if (io_apic_assign_pci_irqs) { + kfree(rtable); pirq_table = NULL; + } } x86_init.pci.fixup_irqs(); -- cgit v1.2.3 From 7390619ab9ea9fd0ba9f4c3e4749ee20262cba7d Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 17 Apr 2019 19:08:48 +0800 Subject: x86/resctrl: Move per RDT domain initialization to a separate function Carve out per rdt_domain initialization code from rdtgroup_init_alloc() into a separate function. No functional change, make the code more readable and save us at least two indentation levels. Signed-off-by: Xiaochen Shen Signed-off-by: Borislav Petkov Cc: Fenghua Yu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: pei.p.jia@intel.com Cc: Reinette Chatre Cc: Thomas Gleixner Cc: Tony Luck Cc: x86-ml Link: https://lkml.kernel.org/r/1555499329-1170-2-git-send-email-xiaochen.shen@intel.com --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 131 ++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 85212a32b54d..36ace51ee705 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2516,28 +2516,86 @@ static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) bitmap_clear(val, zero_bit, cbm_len - zero_bit); } +/* + * Initialize cache resources per RDT domain + * + * Set the RDT domain up to start off with all usable allocations. That is, + * all shareable and unused bits. All-zero CBM is invalid. + */ +static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, + u32 closid) +{ + struct rdt_resource *r_cdp = NULL; + struct rdt_domain *d_cdp = NULL; + u32 used_b = 0, unused_b = 0; + unsigned long tmp_cbm; + enum rdtgrp_mode mode; + u32 peer_ctl, *ctrl; + int i; + + rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); + d->have_new_ctrl = false; + d->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + ctrl = d->ctrl_val; + for (i = 0; i < closids_supported(); i++, ctrl++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + break; + /* + * If CDP is active include peer domain's + * usage to ensure there is no overlap + * with an exclusive group. 
+ */ + if (d_cdp) + peer_ctl = d_cdp->ctrl_val[i]; + else + peer_ctl = 0; + used_b |= *ctrl | peer_ctl; + if (mode == RDT_MODE_SHAREABLE) + d->new_ctrl |= *ctrl | peer_ctl; + } + } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + d->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cbm_ensure_valid(&d->new_ctrl, r); + /* + * Assign the u32 CBM to an unsigned long to ensure that + * bitmap_weight() does not access out-of-bound memory. + */ + tmp_cbm = d->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id); + return -ENOSPC; + } + d->have_new_ctrl = true; + + return 0; +} + /** * rdtgroup_init_alloc - Initialize the new RDT group's allocations * * A new RDT group is being created on an allocation capable (CAT) * supporting system. Set this group up to start off with all usable - * allocations. That is, all shareable and unused bits. + * allocations. * - * All-zero CBM is invalid. If there are no more shareable bits available - * on any domain then the entire allocation will fail. + * If there are no more shareable bits available on any domain then + * the entire allocation will fail. */ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { - struct rdt_resource *r_cdp = NULL; - struct rdt_domain *d_cdp = NULL; - u32 used_b = 0, unused_b = 0; - u32 closid = rdtgrp->closid; struct rdt_resource *r; - unsigned long tmp_cbm; - enum rdtgrp_mode mode; struct rdt_domain *d; - u32 peer_ctl, *ctrl; - int i, ret; + int ret; for_each_alloc_enabled_rdt_resource(r) { /* @@ -2547,54 +2605,9 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) if (r->rid == RDT_RESOURCE_MBA) continue; list_for_each_entry(d, &r->domains, list) { - rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); - d->have_new_ctrl = false; - d->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - ctrl = d->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - break; - /* - * If CDP is active include peer - * domain's usage to ensure there - * is no overlap with an exclusive - * group. - */ - if (d_cdp) - peer_ctl = d_cdp->ctrl_val[i]; - else - peer_ctl = 0; - used_b |= *ctrl | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - d->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cbm_ensure_valid(&d->new_ctrl, r); - /* - * Assign the u32 CBM to an unsigned long to ensure - * that bitmap_weight() does not access out-of-bound - * memory. 
- */ - tmp_cbm = d->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < - r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", - r->name, d->id); - return -ENOSPC; - } - d->have_new_ctrl = true; + ret = __init_one_rdt_domain(d, r, rdtgrp->closid); + if (ret < 0) + return ret; } } -- cgit v1.2.3 From 47820e73f5b3a1fdb8ebd1219191edc96e0c85c1 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 17 Apr 2019 19:08:49 +0800 Subject: x86/resctrl: Initialize a new resource group with default MBA values Currently, when a new resource group is created, the allocation values of the MBA resource are not initialized and remain meaningless data. For example: mkdir /sys/fs/resctrl/p1 cat /sys/fs/resctrl/p1/schemata MB:0=100;1=100 echo "MB:0=10;1=20" > /sys/fs/resctrl/p1/schemata cat /sys/fs/resctrl/p1/schemata MB:0= 10;1= 20 rmdir /sys/fs/resctrl/p1 mkdir /sys/fs/resctrl/p2 cat /sys/fs/resctrl/p2/schemata MB:0= 10;1= 20 Therefore, when the new group is created, it is reasonable to initialize MBA resource with default values. Initialize the MBA resource and cache resources in separate functions. [ bp: Add newlines between code blocks for better readability. ] Signed-off-by: Xiaochen Shen Signed-off-by: Borislav Petkov Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: pei.p.jia@intel.com Cc: Thomas Gleixner Cc: Tony Luck Cc: x86-ml Link: https://lkml.kernel.org/r/1555499329-1170-3-git-send-email-xiaochen.shen@intel.com --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 4 +-- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 52 ++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 2dbd990a2eb7..89320c0396b1 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -342,10 +342,10 @@ int update_domains(struct rdt_resource *r, int closid) if (cpumask_empty(cpu_mask) || mba_sc) goto done; cpu = get_cpu(); - /* Update CBM on this cpu if it's in cpu_mask. */ + /* Update resource control msr on this CPU if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) rdt_ctrl_update(&msr_param); - /* Update CBM on other cpus. */ + /* Update resource control msr on other CPUs. */ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 36ace51ee705..333c177a2471 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2581,8 +2581,8 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, return 0; } -/** - * rdtgroup_init_alloc - Initialize the new RDT group's allocations +/* + * Initialize cache resources with default values. * * A new RDT group is being created on an allocation capable (CAT) * supporting system. Set this group up to start off with all usable @@ -2591,38 +2591,52 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, * If there are no more shareable bits available on any domain then * the entire allocation will fail. */ +static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) +{ + struct rdt_domain *d; + int ret; + + list_for_each_entry(d, &r->domains, list) { + ret = __init_one_rdt_domain(d, r, closid); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Initialize MBA resource with default values. 
*/ +static void rdtgroup_init_mba(struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; + d->have_new_ctrl = true; + } +} + +/* Initialize the RDT group's allocations. */ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { struct rdt_resource *r; - struct rdt_domain *d; int ret; for_each_alloc_enabled_rdt_resource(r) { - /* - * Only initialize default allocations for CBM cache - * resources - */ - if (r->rid == RDT_RESOURCE_MBA) - continue; - list_for_each_entry(d, &r->domains, list) { - ret = __init_one_rdt_domain(d, r, rdtgrp->closid); + if (r->rid == RDT_RESOURCE_MBA) { + rdtgroup_init_mba(r); + } else { + ret = rdtgroup_init_cat(r, rdtgrp->closid); if (ret < 0) return ret; } - } for_each_alloc_enabled_rdt_resource(r) { - /* - * Only initialize default allocations for CBM cache - * resources - */ - if (r->rid == RDT_RESOURCE_MBA) - continue; ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Failed to initialize allocations\n"); return ret; } + } rdtgrp->mode = RDT_MODE_SHAREABLE; -- cgit v1.2.3 From 3c454f47e67bf5a65dc892cd50221a7de695870f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 3 Apr 2019 17:30:12 +0900 Subject: x86/build/vdso: Add FORCE to the build rule of %.so $(call if_changed,...) must have FORCE as a prerequisite. Signed-off-by: Masahiro Yamada Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1554280212-10578-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 5bfe2243a08f..42fe42e82baf 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -116,7 +116,7 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE targets += vdsox32.lds $(vobjx32s-y) $(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg +$(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE -- cgit v1.2.3 From ec3937107ab43f3e8b2bc9dad95710043c462ff7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 4 Apr 2019 10:03:13 +0800 Subject: x86/mm/KASLR: Fix the size of the direct mapping section kernel_randomize_memory() uses __PHYSICAL_MASK_SHIFT to calculate the maximum amount of system RAM supported. The size of the direct mapping section is taken as the smaller of the two values below: (actual system RAM size + padding size) vs (max system RAM size supported) This calculation is wrong since commit b83ce5ee9147 ("x86/mm/64: Make __PHYSICAL_MASK_SHIFT always 52"). In it, __PHYSICAL_MASK_SHIFT was changed to be 52, regardless of whether the kernel is using 4-level or 5-level page tables. Thus, it will always use 4 PB as the maximum amount of system RAM, even in 4-level paging mode where it should actually be 64 TB. Thus, the size of the direct mapping section will always be the sum of the actual system RAM size plus the padding size. Even when the amount of system RAM is 64 TB, the following layout will still be used. Obviously KASLR will be weakened significantly.
|____|_______actual RAM_______|_padding_|______the rest_______|
0            64TB                                       ~120TB

Instead, it should be like this:

|____|_______actual RAM_______|_________the rest______________|
0            64TB                                       ~120TB

The size of the padding region is controlled by CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING, which is 10 TB by default. The above issue only exists when CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING is set to a non-zero value, which is the case when CONFIG_MEMORY_HOTPLUG is enabled. Otherwise, using __PHYSICAL_MASK_SHIFT doesn't affect KASLR. Fix it by replacing __PHYSICAL_MASK_SHIFT with MAX_PHYSMEM_BITS. [ bp: Massage commit message. ] Fixes: b83ce5ee9147 ("x86/mm/64: Make __PHYSICAL_MASK_SHIFT always 52") Signed-off-by: Baoquan He Signed-off-by: Borislav Petkov Reviewed-by: Thomas Garnier Acked-by: Kirill A. Shutemov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Kees Cook Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: frank.ramsay@hpe.com Cc: herbert@gondor.apana.org.au Cc: kirill@shutemov.name Cc: mike.travis@hpe.com Cc: thgarnie@google.com Cc: x86-ml Cc: yamada.masahiro@socionext.com Link: https://lkml.kernel.org/r/20190417083536.GE7065@MiWiFi-R3L-srv --- arch/x86/mm/kaslr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 3f452ffed7e9..d669c5e797e0 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -94,7 +94,7 @@ void __init kernel_randomize_memory(void) if (!kaslr_memory_enabled()) return; - kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT); + kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT); kaslr_regions[1].size_tb = VMALLOC_SIZE_TB; /* -- cgit v1.2.3 From 5c14068f87d04adc73ba3f41c2a303d3c3d1fa12 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Apr 2019 16:39:02 -0500 Subject: x86/speculation/mds: Add 'mitigations=' support for MDS Add MDS to the new 'mitigations=' cmdline option. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner --- Documentation/admin-guide/kernel-parameters.txt | 2 ++ arch/x86/kernel/cpu/bugs.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9aa3543a8723..18cad2b0392a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2556,6 +2556,7 @@ spectre_v2_user=off [X86] spec_store_bypass_disable=off [X86,PPC] l1tf=off [X86] + mds=off [X86] auto (default) Mitigate all CPU vulnerabilities, but leave SMT @@ -2570,6 +2571,7 @@ if needed. This is for users who always want to be fully mitigated, even if it means losing SMT.
Equivalent to: l1tf=flush,nosmt [X86] + mds=full,nosmt [X86] mminit_loglevel= [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3c5c3c3ba734..667c273a66d7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -233,7 +233,7 @@ static const char * const mds_strings[] = { static void __init mds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MDS)) { + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { mds_mitigation = MDS_MITIGATION_OFF; return; } @@ -244,7 +244,8 @@ static void __init mds_select_mitigation(void) static_branch_enable(&mds_user_clear); - if (mds_nosmt && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && + (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); } -- cgit v1.2.3 From 3fe3331bb285700ab2253dbb07f8e478fcea2f1b Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Thu, 21 Mar 2019 21:15:22 +0000 Subject: perf/x86/amd: Add event map for AMD Family 17h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Family 17h differs from prior families by: - Does not support an L2 cache miss event - It has re-enumerated PMC counters for: - L2 cache references - front & back end stalled cycles So we add a new amd_f17h_perfmon_event_map[] so that the generic perf event names will resolve to the correct h/w events on family 17h and above processors. Reference sections 2.1.13.3.3 (stalls) and 2.1.13.3.6 (L2): https://www.amd.com/system/files/TechDocs/54945_PPR_Family_17h_Models_00h-0Fh.pdf Signed-off-by: Kim Phillips Cc: # v4.9+ Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Janakarajan Natarajan Cc: Jiri Olsa Cc: Linus Torvalds Cc: Martin Liška Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Pu Wen Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: e40ed1542dd7 ("perf/x86: Add perf support for AMD family-17h processors") [ Improved the formatting a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/events/amd/core.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 0ecfac84ba91..d45f3fbd232e 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -117,22 +117,39 @@ static __initconst const u64 amd_hw_cache_event_ids }; /* - * AMD Performance Monitor K7 and later. 
+ * AMD Performance Monitor K7 and later, up to and including Family 16h: */ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, - [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +/* + * AMD Performance Monitor Family 17h and later: + */ +static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187, }; static u64 amd_pmu_event_map(int hw_event) { + if (boot_cpu_data.x86 >= 0x17) + return amd_f17h_perfmon_event_map[hw_event]; + return amd_perfmon_event_map[hw_event]; } -- cgit v1.2.3 From c09d65d9eab69985c75f98ed64541229f6fa9aa6 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Tue, 16 Apr 2019 20:36:34 +0300 Subject: KVM: x86: Consider LAPIC TSC-Deadline timer expired if deadline too short If the guest sets MSR_IA32_TSCDEADLINE to a value that, in the host time-domain, is less than lapic_timer_advance_ns away, we can end up calling hrtimer_start() with an expiration time in the past. Because lapic_timer.timer is initialized with HRTIMER_MODE_ABS_PINNED, it is not allowed to run in softirq and therefore will never expire. To avoid such a scenario, verify that the deadline's expiration time, in the host time-domain, is further in the future than (now + lapic_timer_advance_ns). A future patch could also add a min_timer_deadline_ns module parameter, similar to min_timer_period_us, to close the remaining race where the time it takes to run this logic can still result in hrtimer_start() being called with an expiration time in the past.
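Condensed, the reworked start_sw_tscdeadline() logic looks like this (a sketch matching the diff below, not the verbatim code):

	ns = (tscdeadline - guest_tsc) * 1000000ULL;
	do_div(ns, this_tsc_khz);		/* guest TSC ticks -> nanoseconds */

	/* Arm the hrtimer only if it would actually expire in the future. */
	if (likely(tscdeadline > guest_tsc) &&
	    likely(ns > lapic_timer_advance_ns)) {
		expire = ktime_sub_ns(ktime_add_ns(now, ns), lapic_timer_advance_ns);
		hrtimer_start(&apic->lapic_timer.timer, expire, HRTIMER_MODE_ABS_PINNED);
	} else {
		apic_timer_expired(apic);	/* treat the timer as already expired */
	}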
Reviewed-by: Joao Martins Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9bf70cf84564..6cb990dbc15a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1542,9 +1542,12 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); + + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + + if (likely(tscdeadline > guest_tsc) && + likely(ns > lapic_timer_advance_ns)) { expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, lapic_timer_advance_ns); hrtimer_start(&apic->lapic_timer.timer, -- cgit v1.2.3 From da66761c2d93a46270d69001abb5692717495a68 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 20 Mar 2019 18:43:20 +0100 Subject: x86: kvm: hyper-v: deal with buggy TLB flush requests from WS2012 It was reported that with some special Multi Processor Group configuration, e.g.: bcdedit.exe /set groupsize 1 bcdedit.exe /set maxgroup on bcdedit.exe /set groupaware on for a 16-vCPU guest, WS2012 shows a BSOD on boot when the PV TLB flush mechanism is in use. Tracing kvm_hv_flush_tlb immediately reveals the issue: kvm_hv_flush_tlb: processor_mask 0x0 address_space 0x0 flags 0x2 The only flag set in this request is HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES, however, processor_mask is 0x0 and no HV_FLUSH_ALL_PROCESSORS is specified. We don't flush anything and apparently it's not what Windows expects. TLFS doesn't say anything about such requests and newer Windows versions seem to be unaffected. This all feels like a WS2012 bug, which is, however, easy to work around in KVM: let's flush everything when we see an empty flush request; over-flushing doesn't hurt. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 421899f6ad7b..cc24b3a32c44 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1371,7 +1371,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; - all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS; + + /* + * Work around possible WS2012 bug: it sends hypercalls + * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear, + * while also expecting us to flush something and crashing if + * we don't. Let's treat processor_mask == 0 same as + * HV_FLUSH_ALL_PROCESSORS. + */ + all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || + flush.processor_mask == 0; } else { if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex, sizeof(flush_ex)))) -- cgit v1.2.3 From 57bf67e73ce9bcce2258890f5abf2adf5f619f1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 17 Apr 2019 10:15:31 -0700 Subject: KVM: lapic: Disable timer advancement if adaptive tuning goes haywire To minimize the latency of timer interrupts as observed by the guest, KVM adjusts the values it programs into the host timers to account for the host's overhead of programming and handling the timer event. Now that the timer advancement is automatically tuned during runtime, it's effectively unbounded by default, e.g.
if KVM is running as L1, the advancement can measure in the hundreds of milliseconds. Disable timer advancement if adaptive tuning yields an advancement of more than 5000ns, as large advancements can break reasonable assumptions of the guest, e.g. that a timer configured to fire after 1ms won't arrive on the next instruction. Although KVM busy waits to mitigate the case of a timer event arriving too early, complications can arise when shifting the interrupt too far, e.g. kvm-unit-test's vmx.interrupt test will fail when its "host" exits on interrupts as KVM may inject the INTR before the guest executes STI+HLT. Arguably the unit test is "broken" in the sense that delaying a timer interrupt by 1ms doesn't technically guarantee the interrupt will arrive after STI+HLT, but it's a reasonable assumption that KVM should support. Furthermore, an unbounded advancement also effectively unbounds the time spent busy waiting, e.g. if the guest programs a timer with a very large delay. 5000ns is a somewhat arbitrary threshold. When running on bare metal, which is the intended use case, timer advancement is expected to be in the general vicinity of 1000ns. 5000ns is high enough that false positives are unlikely, while not being so high as to negatively affect the host's performance/stability. Note, a future patch will enable userspace to disable KVM's adaptive tuning, which will allow privileged userspace to specify an advancement value in excess of this arbitrary threshold in order to satisfy an abnormal use case. Cc: Liran Alon Cc: Wanpeng Li Cc: stable@vger.kernel.org Fixes: 3b8a5df6c4dc6 ("KVM: LAPIC: Tune lapic_timer_advance_ns automatically") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6cb990dbc15a..bbe1402309e3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1522,6 +1522,10 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) } if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) lapic_timer_advance_adjust_done = true; + if (unlikely(lapic_timer_advance_ns > 5000)) { + lapic_timer_advance_ns = 0; + lapic_timer_advance_adjust_done = true; + } } } -- cgit v1.2.3 From 39497d7660d9866a47a2dc9055672358da57ad3d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 17 Apr 2019 10:15:32 -0700 Subject: KVM: lapic: Track lapic timer advance per vCPU Automatically adjusting the globally-shared timer advancement could corrupt the timer, e.g. if multiple vCPUs are concurrently adjusting the advancement value. That could be partially fixed by using a local variable for the arithmetic, but it would still be susceptible to a race when setting timer_advance_adjust_done. And because virtual_tsc_khz and tsc_scaling_ratio are per-vCPU, the correct calibration for a given vCPU may not apply to all vCPUs. Furthermore, lapic_timer_advance_ns is marked __read_mostly, which is effectively violated when finding a stable advancement takes an extended amount of time. Opportunistically change the definition of lapic_timer_advance_ns to a u32 so that it matches the style of struct kvm_timer. Explicitly pass the param to kvm_create_lapic() so that it doesn't have to be exposed to lapic.c, thus reducing the probability of unintentionally using the global value instead of the per-vCPU value.
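The shape of the change, condensed from the diff below into a sketch:

	/* x86.c: the module param stays file-local ... */
	static u32 __read_mostly lapic_timer_advance_ns = 1000;
	...
	r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);

	/* lapic.c: ... and is snapshotted into per-vCPU state at creation, */
	apic->lapic_timer.timer_advance_ns = timer_advance_ns;

	/* so the tuning hotpath only ever touches its own vCPU's copy. */
	u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;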
Cc: Liran Alon Cc: Wanpeng Li Reviewed-by: Liran Alon Cc: stable@vger.kernel.org Fixes: 3b8a5df6c4dc6 ("KVM: LAPIC: Tune lapic_timer_advance_ns automatically") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 36 +++++++++++++++++++----------------- arch/x86/kvm/lapic.h | 4 +++- arch/x86/kvm/vmx/vmx.c | 4 +++- arch/x86/kvm/x86.c | 7 +++---- arch/x86/kvm/x86.h | 2 -- 5 files changed, 28 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bbe1402309e3..7decd58c9cea 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -70,7 +70,6 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul -static bool lapic_timer_advance_adjust_done = false; #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -1485,6 +1484,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) void wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 guest_tsc, tsc_deadline, ns; if (!lapic_in_kernel(vcpu)) @@ -1504,34 +1504,36 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ if (guest_tsc < tsc_deadline) __delay(min(tsc_deadline - guest_tsc, - nsec_to_cycles(vcpu, lapic_timer_advance_ns))); + nsec_to_cycles(vcpu, timer_advance_ns))); - if (!lapic_timer_advance_adjust_done) { + if (!apic->lapic_timer.timer_advance_adjust_done) { /* too early */ if (guest_tsc < tsc_deadline) { ns = (tsc_deadline - guest_tsc) * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - lapic_timer_advance_ns -= min((unsigned int)ns, - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns -= min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } else { /* too late */ ns = (guest_tsc - tsc_deadline) * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - lapic_timer_advance_ns += min((unsigned int)ns, - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns += min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - lapic_timer_advance_adjust_done = true; - if (unlikely(lapic_timer_advance_ns > 5000)) { - lapic_timer_advance_ns = 0; - lapic_timer_advance_adjust_done = true; + apic->lapic_timer.timer_advance_adjust_done = true; + if (unlikely(timer_advance_ns > 5000)) { + timer_advance_ns = 0; + apic->lapic_timer.timer_advance_adjust_done = true; } + apic->lapic_timer.timer_advance_ns = timer_advance_ns; } } static void start_sw_tscdeadline(struct kvm_lapic *apic) { - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + u64 guest_tsc, tscdeadline = ktimer->tscdeadline; u64 ns = 0; ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; @@ -1551,11 +1553,10 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) do_div(ns, this_tsc_khz); if (likely(tscdeadline > guest_tsc) && - likely(ns > lapic_timer_advance_ns)) { + likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS_PINNED); + expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); + hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED); } else 
apic_timer_expired(apic); @@ -2262,7 +2263,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -int kvm_create_lapic(struct kvm_vcpu *vcpu) +int kvm_create_lapic(struct kvm_vcpu *vcpu, u32 timer_advance_ns) { struct kvm_lapic *apic; @@ -2286,6 +2287,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; + apic->lapic_timer.timer_advance_ns = timer_advance_ns; /* * APIC is created enabled. This will prevent kvm_lapic_set_base from diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ff6ef9c3d760..3e97f8a68967 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -31,8 +31,10 @@ struct kvm_timer { u32 timer_mode_mask; u64 tscdeadline; u64 expired_tscdeadline; + u32 timer_advance_ns; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; + bool timer_advance_adjust_done; }; struct kvm_lapic { @@ -62,7 +64,7 @@ struct kvm_lapic { struct dest_map; -int kvm_create_lapic(struct kvm_vcpu *vcpu); +int kvm_create_lapic(struct kvm_vcpu *vcpu, u32 timer_advance_ns); void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b4e7d645275a..bcdd69d90a8c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7032,6 +7032,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; + struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; if (kvm_mwait_in_guest(vcpu->kvm)) return -EOPNOTSUPP; @@ -7040,7 +7041,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) tscl = rdtsc(); guest_tscl = kvm_read_l1_tsc(vcpu, tscl); delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; - lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns); + lapic_timer_advance_cycles = nsec_to_cycles(vcpu, + ktimer->timer_advance_ns); if (delta_tsc > lapic_timer_advance_cycles) delta_tsc -= lapic_timer_advance_cycles; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a0d1fc80ac5a..aa26a3cfc765 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -137,9 +137,8 @@ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* lapic timer advance (tscdeadline mode only) in nanoseconds */ -unsigned int __read_mostly lapic_timer_advance_ns = 1000; +static u32 __read_mostly lapic_timer_advance_ns = 1000; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); -EXPORT_SYMBOL_GPL(lapic_timer_advance_ns); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); @@ -7873,7 +7872,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); - if (lapic_timer_advance_ns) + if (vcpu->arch.apic->lapic_timer.timer_advance_ns) wait_lapic_expire(vcpu); guest_enter_irqoff(); @@ -9061,7 +9060,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); - r = kvm_create_lapic(vcpu); + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; } else diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index aedc5d0d4989..534d3f28bb01 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -294,8 +294,6 @@ extern u64 
kvm_supported_xcr0(void); extern unsigned int min_timer_period_us; -extern unsigned int lapic_timer_advance_ns; - extern bool enable_vmware_backdoor; extern struct static_key kvm_no_apic_vcpu; -- cgit v1.2.3 From c3941d9e0ccd48920e4811f133235b3597e5310b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 17 Apr 2019 10:15:33 -0700 Subject: KVM: lapic: Allow user to disable adaptive tuning of timer advancement The introduction of adaptive tuning of lapic timer advancement did not allow for the scenario where userspace would want to disable adaptive tuning but still employ timer advancement, e.g. for testing purposes or to handle a use case where adaptive tuning is unable to settle on a suitable time. This is especially pertinent now that KVM places a hard threshold on the maximum advancement time. Rework the timer semantics to accept signed values, with a value of '-1' being interpreted as "use adaptive tuning with KVM's internal default", and any other value being used as an explicit advancement time, e.g. a time of '0' effectively disables advancement. Note, this does not completely restore the original behavior of lapic_timer_advance_ns. Prior to tracking the advancement per vCPU, which is necessary to support autotuning, userspace could adjust lapic_timer_advance_ns for a *running* vCPU. With per-vCPU tracking, the module params are snapshotted at vCPU creation, i.e. applying a new advancement effectively requires restarting a VM. Dynamically updating a running vCPU is possible, e.g. a helper could be added to retrieve the desired delay, choosing between the global module param and the per-VCPU value depending on whether or not auto-tuning is (globally) enabled, but doing so introduces a great deal of complexity. The wrapper itself is not complex, but understanding and documenting the effects of dynamically toggling auto-tuning and/or adjusting the timer advancement is nigh impossible since the behavior would be dependent on KVM's implementation as well as compiler optimizations. In other words, providing stable behavior would require extremely careful consideration now and in the future. Given that the expected use of a manually-tuned timer advancement is to "tune once, run many", use the vastly simpler approach of recognizing changes to the module params only when creating a new vCPU.
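The resulting module parameter semantics, in sketch form (matching the diff below):

	/*
	 * -1: adaptive tuning, starting from the 1000ns default
	 *  0: timer advancement disabled entirely
	 * >0: fixed advancement; adaptive tuning is turned off
	 */
	static int __read_mostly lapic_timer_advance_ns = -1;

	/* kvm_create_lapic(), at vCPU creation: */
	if (timer_advance_ns == -1) {
		apic->lapic_timer.timer_advance_ns = 1000;
		apic->lapic_timer.timer_advance_adjust_done = false;
	} else {
		apic->lapic_timer.timer_advance_ns = timer_advance_ns;
		apic->lapic_timer.timer_advance_adjust_done = true;
	}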
Cc: Liran Alon Cc: Wanpeng Li Reviewed-by: Liran Alon Cc: stable@vger.kernel.org Fixes: 3b8a5df6c4dc6 ("KVM: LAPIC: Tune lapic_timer_advance_ns automatically") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 11 +++++++++-- arch/x86/kvm/lapic.h | 2 +- arch/x86/kvm/x86.c | 9 +++++++-- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7decd58c9cea..12f2f7dc121f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2263,7 +2263,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -int kvm_create_lapic(struct kvm_vcpu *vcpu, u32 timer_advance_ns) +int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) { struct kvm_lapic *apic; @@ -2287,7 +2287,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, u32 timer_advance_ns) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; - apic->lapic_timer.timer_advance_ns = timer_advance_ns; + if (timer_advance_ns == -1) { + apic->lapic_timer.timer_advance_ns = 1000; + apic->lapic_timer.timer_advance_adjust_done = false; + } else { + apic->lapic_timer.timer_advance_ns = timer_advance_ns; + apic->lapic_timer.timer_advance_adjust_done = true; + } + /* * APIC is created enabled. This will prevent kvm_lapic_set_base from diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 3e97f8a68967..d6d049ba3045 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -64,7 +64,7 @@ struct kvm_lapic { struct dest_map; -int kvm_create_lapic(struct kvm_vcpu *vcpu, u32 timer_advance_ns); +int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns); void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aa26a3cfc765..9482cb36b92a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -136,8 +136,13 @@ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); -/* lapic timer advance (tscdeadline mode only) in nanoseconds */ -static u32 __read_mostly lapic_timer_advance_ns = 1000; +/* + * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables + * adaptive tuning starting from the default advancement of 1000ns. '0' disables + * advancement entirely. Any other value is used as-is and disables adaptive + * tuning, i.e. allows privileged userspace to set an exact advancement time. + */ +static int __read_mostly lapic_timer_advance_ns = -1; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; -- cgit v1.2.3 From b6aa57c69cb26ea0160c51f7cf45f1af23542686 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 17 Apr 2019 10:15:34 -0700 Subject: KVM: lapic: Convert guest TSC to host time domain if necessary To minimize the latency of timer interrupts as observed by the guest, KVM adjusts the values it programs into the host timers to account for the host's overhead of programming and handling the timer event. In the event that the adjustments are too aggressive, i.e. the timer fires earlier than the guest expects, KVM busy waits immediately prior to entering the guest. Currently, KVM manually converts the delay from nanoseconds to clock cycles.
But, the conversion is done in the guest's time domain, while the delay occurs in the host's time domain. This is perfectly ok when the guest and host are using the same TSC ratio, but if the guest is using a different ratio then the delay may not be accurate and could wait too little or too long. When the guest is not using the host's ratio, convert the delay from guest clock cycles to host nanoseconds and use ndelay() instead of __delay() to provide more accurate timing. Because converting to nanoseconds is relatively expensive, e.g. requires division and more multiplication ops, continue using __delay() directly when guest and host TSCs are running at the same ratio. Cc: Liran Alon Cc: Wanpeng Li Cc: stable@vger.kernel.org Fixes: 3b8a5df6c4dc6 ("KVM: LAPIC: Tune lapic_timer_advance_ns automatically") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 12f2f7dc121f..e0fa6fc0b2d8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1481,6 +1481,26 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) return false; } +static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) +{ + u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns; + + /* + * If the guest TSC is running at a different ratio than the host, then + * convert the delay to nanoseconds to achieve an accurate delay. Note + * that __delay() uses delay_tsc whenever the hardware has TSC, thus + * always for VMX enabled hardware. + */ + if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { + __delay(min(guest_cycles, + nsec_to_cycles(vcpu, timer_advance_ns))); + } else { + u64 delay_ns = guest_cycles * 1000000ULL; + do_div(delay_ns, vcpu->arch.virtual_tsc_khz); + ndelay(min_t(u32, delay_ns, timer_advance_ns)); + } +} + void wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1501,10 +1521,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); - /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ if (guest_tsc < tsc_deadline) - __delay(min(tsc_deadline - guest_tsc, - nsec_to_cycles(vcpu, timer_advance_ns))); + __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); if (!apic->lapic_timer.timer_advance_adjust_done) { /* too early */ -- cgit v1.2.3 From 0806a13c5a4306b6a49f094cd83bd94860be26a5 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 18 Apr 2019 16:46:47 +0200 Subject: x86/platform/olpc: Don't split string literals when fixing up the DT It was pointed out in a review, and checkpatch.pl complains about this. Breaking it down into multiple ofw evaluations works just as well and reads better. 
Signed-off-by: Lubomir Rintel Acked-by: Thomas Gleixner Signed-off-by: Sebastian Reichel --- arch/x86/platform/olpc/olpc_dt.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index ac9e7bf49b66..87e1d81684c6 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -242,9 +242,9 @@ void __init olpc_dt_fixup(void) pr_info("PROM DT: Old firmware detected, applying fixes\n"); /* Add olpc,xo1-battery compatible marker to battery node */ - olpc_dt_interpret("\" /battery@0\" find-device" - " \" olpc,xo1-battery\" +compatible" - " device-end"); + olpc_dt_interpret("\" /battery@0\" find-device"); + olpc_dt_interpret(" \" olpc,xo1-battery\" +compatible"); + olpc_dt_interpret("device-end"); board_rev = olpc_dt_get_board_revision(); if (!board_rev) @@ -252,19 +252,24 @@ void __init olpc_dt_fixup(void) if (board_rev >= olpc_board_pre(0xd0)) { /* XO-1.5: add dcon device */ - olpc_dt_interpret("\" /pci/display@1\" find-device" - " new-device" - " \" dcon\" device-name \" olpc,xo1-dcon\" +compatible" - " finish-device device-end"); + olpc_dt_interpret("\" /pci/display@1\" find-device"); + olpc_dt_interpret(" new-device"); + olpc_dt_interpret(" \" dcon\" device-name"); + olpc_dt_interpret(" \" olpc,xo1-dcon\" +compatible"); + olpc_dt_interpret(" finish-device"); + olpc_dt_interpret("device-end"); } else { /* XO-1: add dcon device, mark RTC as olpc,xo1-rtc */ - olpc_dt_interpret("\" /pci/display@1,1\" find-device" - " new-device" - " \" dcon\" device-name \" olpc,xo1-dcon\" +compatible" - " finish-device device-end" - " \" /rtc\" find-device" - " \" olpc,xo1-rtc\" +compatible" - " device-end"); + olpc_dt_interpret("\" /pci/display@1,1\" find-device"); + olpc_dt_interpret(" new-device"); + olpc_dt_interpret(" \" dcon\" device-name"); + olpc_dt_interpret(" \" olpc,xo1-dcon\" +compatible"); + olpc_dt_interpret(" finish-device"); + olpc_dt_interpret("device-end"); + + olpc_dt_interpret("\" /rtc\" find-device"); + olpc_dt_interpret(" \" olpc,xo1-rtc\" +compatible"); + olpc_dt_interpret("device-end"); } } -- cgit v1.2.3 From 47e120d3337b83ac6de2ebc622b3c9b6c1a9f315 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 18 Apr 2019 16:46:48 +0200 Subject: x86/platform/olpc: Trivial code move in DT fixup This makes the following patch more concise. 
Signed-off-by: Lubomir Rintel Acked-by: Thomas Gleixner Signed-off-by: Sebastian Reichel --- arch/x86/platform/olpc/olpc_dt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 87e1d81684c6..820236b511b3 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -241,11 +241,6 @@ void __init olpc_dt_fixup(void) pr_info("PROM DT: Old firmware detected, applying fixes\n"); - /* Add olpc,xo1-battery compatible marker to battery node */ - olpc_dt_interpret("\" /battery@0\" find-device"); - olpc_dt_interpret(" \" olpc,xo1-battery\" +compatible"); - olpc_dt_interpret("device-end"); - board_rev = olpc_dt_get_board_revision(); if (!board_rev) return; @@ -271,6 +266,11 @@ void __init olpc_dt_fixup(void) olpc_dt_interpret(" \" olpc,xo1-rtc\" +compatible"); olpc_dt_interpret("device-end"); } + + /* Add olpc,xo1-battery compatible marker to battery node */ + olpc_dt_interpret("\" /battery@0\" find-device"); + olpc_dt_interpret(" \" olpc,xo1-battery\" +compatible"); + olpc_dt_interpret("device-end"); } void __init olpc_dt_build_devicetree(void) -- cgit v1.2.3 From a7a9bacb9a32f9cebe5feda2e33064440f52d61d Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 18 Apr 2019 16:46:49 +0200 Subject: x86/platform/olpc: Use a correct version when making up a battery node The XO-1 and XO-1.5 batteries apparently differ in their ability to report ambient temperature. We need to use a different compatible string for the XO-1.5 battery. Previously olpc_dt_fixup() used the presence of the battery node's compatible property to decide whether the DT is up to date. Now we need to look for a particular value in the compatible string to decide whether the fixups are needed. Signed-off-by: Lubomir Rintel Acked-by: Pavel Machek Acked-by: Thomas Gleixner Signed-off-by: Sebastian Reichel --- arch/x86/platform/olpc/olpc_dt.c | 64 +++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 820236b511b3..0296c5b55e6f 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -220,10 +220,26 @@ static u32 __init olpc_dt_get_board_revision(void) return be32_to_cpu(rev); } +int olpc_dt_compatible_match(phandle node, const char *compat) +{ + char buf[64], *p; + int plen, len; + + plen = olpc_dt_getproperty(node, "compatible", buf, sizeof(buf)); + if (plen <= 0) + return 0; + + len = strlen(compat); + for (p = buf; p < buf + plen; p += strlen(p) + 1) { + if (strcmp(p, compat) == 0) + return 1; + } + + return 0; +} + void __init olpc_dt_fixup(void) { - int r; - char buf[64]; phandle node; u32 board_rev; @@ -231,22 +247,31 @@ void __init olpc_dt_fixup(void) if (!node) return; - /* - * If the battery node has a compatible property, we are running a new - * enough firmware and don't have fixups to make.
- */ - r = olpc_dt_getproperty(node, "compatible", buf, sizeof(buf)); - if (r > 0) - return; - - pr_info("PROM DT: Old firmware detected, applying fixes\n"); - board_rev = olpc_dt_get_board_revision(); if (!board_rev) return; if (board_rev >= olpc_board_pre(0xd0)) { - /* XO-1.5: add dcon device */ + /* XO-1.5 */ + + if (olpc_dt_compatible_match(node, "olpc,xo1.5-battery")) + return; + + /* Add olpc,xo1.5-battery compatible marker to battery node */ + olpc_dt_interpret("\" /battery@0\" find-device"); + olpc_dt_interpret(" \" olpc,xo1.5-battery\" +compatible"); + olpc_dt_interpret("device-end"); + + if (olpc_dt_compatible_match(node, "olpc,xo1-battery")) { + /* + * If we have a olpc,xo1-battery compatible, then we're + * running a new enough firmware that already has + * the dcon node. + */ + return; + } + + /* Add dcon device */ olpc_dt_interpret("\" /pci/display@1\" find-device"); olpc_dt_interpret(" new-device"); olpc_dt_interpret(" \" dcon\" device-name"); @@ -254,7 +279,18 @@ void __init olpc_dt_fixup(void) olpc_dt_interpret(" finish-device"); olpc_dt_interpret("device-end"); } else { - /* XO-1: add dcon device, mark RTC as olpc,xo1-rtc */ + /* XO-1 */ + + if (olpc_dt_compatible_match(node, "olpc,xo1-battery")) { + /* + * If we have a olpc,xo1-battery compatible, then we're + * running a new enough firmware that already has + * the dcon and RTC nodes. + */ + return; + } + + /* Add dcon device, mark RTC as olpc,xo1-rtc */ olpc_dt_interpret("\" /pci/display@1,1\" find-device"); olpc_dt_interpret(" new-device"); olpc_dt_interpret(" \" dcon\" device-name"); -- cgit v1.2.3 From 169d0869962da362b5058e31f87911b2960418af Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 26 Feb 2019 01:20:01 -0500 Subject: x86/smpboot: Rename match_die() to match_pkg() Syntax only, no functional or semantic change. This routine matches packages, not die, so name it thus. Signed-off-by: Len Brown Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/7ca18c4ae7816a1f9eda37414725df676e63589d.1551160674.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ce1a67b70168..3f8bbfb26c18 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -455,7 +455,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) * multicore group inside a NUMA node. If this happens, we will * discard the MC level of the topology later. */ -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { if (c->phys_proc_id == o->phys_proc_id) return true; @@ -546,7 +546,7 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); - if ((i == cpu) || (has_mp && match_die(c, o))) { + if ((i == cpu) || (has_mp && match_pkg(c, o))) { link_mask(topology_core_cpumask, cpu, i); /* @@ -570,7 +570,7 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } - if (match_die(c, o) && !topology_same_node(c, o)) + if (match_pkg(c, o) && !topology_same_node(c, o)) x86_has_numa_in_package = true; } -- cgit v1.2.3 From 93ddedaa5c9cb828d39a19b5d6e7e1939393085a Mon Sep 17 00:00:00 2001 From: "Ahmed S. 
Darwish" Date: Wed, 6 Mar 2019 01:44:25 +0100 Subject: x86/defconfig: Remove archaic partition tables support On x86 systems, only MSDOS and GPT partition tables are typically encountered. Remove all the rest. Note, CONFIG_EFI_PARTITION is also removed since it defaults to `y'. Signed-off-by: Ahmed S. Darwish Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20190306004425.GA30537@darwi-home-pc Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 12 ------------ arch/x86/configs/x86_64_defconfig | 12 ------------ 2 files changed, 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 9f908112bbb9..2b2481acc661 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -25,18 +25,6 @@ CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_OSF_PARTITION=y -CONFIG_AMIGA_PARTITION=y -CONFIG_MAC_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_SGI_PARTITION=y -CONFIG_SUN_PARTITION=y -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y CONFIG_SMP=y CONFIG_X86_GENERIC=y CONFIG_HPET_TIMER=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 1d3badfda09e..e8829abf063a 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -24,18 +24,6 @@ CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_OSF_PARTITION=y -CONFIG_AMIGA_PARTITION=y -CONFIG_MAC_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_SGI_PARTITION=y -CONFIG_SUN_PARTITION=y -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y CONFIG_SMP=y CONFIG_CALGARY_IOMMU=y CONFIG_NR_CPUS=64 -- cgit v1.2.3 From bff9504bfc9c5c6610b42d47f689f350fd969eb8 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 Mar 2019 14:47:53 -0500 Subject: rseq: Clean up comments by reflecting removal of event counter The "event counter" was removed from rseq before it was merged upstream. However, a few comments in the source code still refer to it. Adapt the comments to match reality. Signed-off-by: Mathieu Desnoyers Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ben Maurer Cc: Boqun Feng Cc: Catalin Marinas Cc: Chris Lameter Cc: Dave Watson Cc: H. Peter Anvin Cc: Joel Fernandes Cc: Josh Triplett Cc: Linus Torvalds Cc: Michael Kerrisk Cc: Paul E. McKenney Cc: Paul Turner Cc: Peter Zijlstra Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20190305194755.2602-2-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- arch/arm/kernel/signal.c | 3 +-- arch/x86/kernel/signal.c | 5 +---- kernel/rseq.c | 3 +-- 3 files changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 76bb8de6bf6b..be5edfdde558 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -549,8 +549,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) int ret; /* - * Increment event counter and perform fixup for the pre-signal - * frame. + * Perform fixup for the pre-signal frame. 
*/ rseq_signal_deliver(ksig, regs); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 08dfd4c1a4f9..22c233b509da 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -688,10 +688,7 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; - /* - * Increment event counter and perform fixup for the pre-signal - * frame. - */ + /* Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ diff --git a/kernel/rseq.c b/kernel/rseq.c index 25e9a7b60eba..849afe749131 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs) * - signal delivery, * and return to user-space. * - * This is how we can ensure that the entire rseq critical section, - * consisting of both the C part and the assembly instruction sequence, + * This is how we can ensure that the entire rseq critical section * will issue the commit instruction only if executed atomically with * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. -- cgit v1.2.3 From 5dd50aaeb1853ee0953b60fa6d1143d95429ae7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Nov 2018 17:40:31 +0000 Subject: Make anon_inodes unconditional Make the anon_inodes facility unconditional so that it can be used by core VFS code and pidfd code. Signed-off-by: David Howells Signed-off-by: Al Viro [christian@brauner.io: adapt commit message to mention pidfds] Signed-off-by: Christian Brauner --- arch/arm/kvm/Kconfig | 1 - arch/arm64/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/kvm/Kconfig | 1 - drivers/base/Kconfig | 1 - drivers/char/tpm/Kconfig | 1 - drivers/dma-buf/Kconfig | 1 - drivers/gpio/Kconfig | 1 - drivers/iio/Kconfig | 1 - drivers/infiniband/Kconfig | 1 - drivers/vfio/Kconfig | 1 - fs/Makefile | 2 +- fs/notify/fanotify/Kconfig | 1 - fs/notify/inotify/Kconfig | 1 - init/Kconfig | 10 ---------- 18 files changed, 1 insertion(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 3f5320f46de2..f591026347a5 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on MMU && OF select PREEMPT_NOTIFIERS - select ANON_INODES select ARM_GIC select ARM_GIC_V3 select ARM_GIC_V3_ITS diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a3f85624313e..a67121d419a2 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM depends on OF select MMU_NOTIFIER select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 4528bc9c3cb1..eac25aef21e0 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM depends on MIPS_FP_SUPPORT select EXPORT_UASM select PREEMPT_NOTIFIERS - select ANON_INODES select KVM_GENERIC_DIRTYLOG_READ_PROTECT select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index bfdde04e4905..f53997a8ca62 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_EVENTFD 
select HAVE_KVM_VCPU_ASYNC_IOCTL select SRCU diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 767453faacfc..1816ee48eadd 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM prompt "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_EVENTFD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5ad92419be19..7a70fb58b2d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -44,7 +44,6 @@ config X86 # select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ANON_INODES select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 72fa955f4a15..fc042419e670 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,7 +27,6 @@ config KVM depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER - select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select IRQ_BYPASS_MANAGER diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 059700ea3521..03f067da12ee 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -174,7 +174,6 @@ source "drivers/base/regmap/Kconfig" config DMA_SHARED_BUFFER bool default n - select ANON_INODES select IRQ_WORK help This option enables the framework for buffer-sharing between diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index 536e55d3919f..f3e4bc490cf0 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -157,7 +157,6 @@ config TCG_CRB config TCG_VTPM_PROXY tristate "VTPM Proxy Interface" depends on TCG_TPM - select ANON_INODES ---help--- This driver proxies for an emulated TPM (vTPM) running in userspace. A device /dev/vtpmx is provided that creates a device pair diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig index 2e5a0faa2cb1..3fc9c2efc583 100644 --- a/drivers/dma-buf/Kconfig +++ b/drivers/dma-buf/Kconfig @@ -3,7 +3,6 @@ menu "DMABUF options" config SYNC_FILE bool "Explicit Synchronization Framework" default n - select ANON_INODES select DMA_SHARED_BUFFER ---help--- The Sync File Framework adds explicit syncronization via diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3f50526a771f..0f91600c27ae 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -12,7 +12,6 @@ config ARCH_HAVE_CUSTOM_GPIO_H menuconfig GPIOLIB bool "GPIO Support" - select ANON_INODES help This enables GPIO support through the generic GPIO library. You only need to enable this, if you also want to enable diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig index d08aeb41cd07..1dec0fecb6ef 100644 --- a/drivers/iio/Kconfig +++ b/drivers/iio/Kconfig @@ -4,7 +4,6 @@ menuconfig IIO tristate "Industrial I/O support" - select ANON_INODES help The industrial I/O subsystem provides a unified framework for drivers for many different types of embedded sensors using a diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a1fb840de45d..d318bab25860 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -25,7 +25,6 @@ config INFINIBAND_USER_MAD config INFINIBAND_USER_ACCESS tristate "InfiniBand userspace access (verbs and CM)" - select ANON_INODES depends on MMU ---help--- Userspace InfiniBand access support. 
This enables the diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 9de5ed38da83..3798d77d131c 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -22,7 +22,6 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM || ARM64) - select ANON_INODES help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. diff --git a/fs/Makefile b/fs/Makefile index 427fec226fae..35945f8139e6 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o -obj-$(CONFIG_ANON_INODES) += anon_inodes.o +obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 735bfb2e9190..521dc91d2cb5 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -1,7 +1,6 @@ config FANOTIFY bool "Filesystem wide access notification" select FSNOTIFY - select ANON_INODES select EXPORTFS default n ---help--- diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index b981fc0c8379..0161c74e76e2 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -1,6 +1,5 @@ config INOTIFY_USER bool "Inotify support for userspace" - select ANON_INODES select FSNOTIFY default y ---help--- diff --git a/init/Kconfig b/init/Kconfig index 4592bf7997c0..be8f97e37a76 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1171,9 +1171,6 @@ config LD_DEAD_CODE_DATA_ELIMINATION config SYSCTL bool -config ANON_INODES - bool - config HAVE_UID16 bool @@ -1378,14 +1375,12 @@ config HAVE_FUTEX_CMPXCHG config EPOLL bool "Enable eventpoll support" if EXPERT default y - select ANON_INODES help Disabling this option will cause the kernel to be built without support for epoll family of system calls. config SIGNALFD bool "Enable signalfd() system call" if EXPERT - select ANON_INODES default y help Enable the signalfd() system call that allows to receive signals @@ -1395,7 +1390,6 @@ config SIGNALFD config TIMERFD bool "Enable timerfd() system call" if EXPERT - select ANON_INODES default y help Enable the timerfd() system call that allows to receive timer @@ -1405,7 +1399,6 @@ config TIMERFD config EVENTFD bool "Enable eventfd() system call" if EXPERT - select ANON_INODES default y help Enable the eventfd() system call that allows to receive both @@ -1516,7 +1509,6 @@ config KALLSYMS_BASE_RELATIVE # syscall, maps, verifier config BPF_SYSCALL bool "Enable bpf() system call" - select ANON_INODES select BPF select IRQ_WORK default n @@ -1533,7 +1525,6 @@ config BPF_JIT_ALWAYS_ON config USERFAULTFD bool "Enable userfaultfd() system call" - select ANON_INODES depends on MMU help Enable the userfaultfd() system call that allows to intercept and @@ -1600,7 +1591,6 @@ config PERF_EVENTS bool "Kernel performance events and counters" default y if PROFILING depends on HAVE_PERF_EVENTS - select ANON_INODES select IRQ_WORK select SRCU help -- cgit v1.2.3 From 3ff9c075cc767b3060bdac12da72fc94dd7da1b8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 24 Feb 2019 01:49:52 +0900 Subject: x86/kprobes: Verify stack frame on kretprobe Verify the stack frame pointer on kretprobe trampoline handler, If the stack frame pointer does not match, it skips the wrong entry and tries to find correct one. 
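A standalone userspace sketch of that matching logic (simplified stand-in types, not the kernel's kretprobe_instance handling; hypothetical names throughout):

#include <stdio.h>

/* Each pending return-probe instance records the frame pointer it was
 * armed with; the trampoline skips instances whose recorded fp does not
 * match the frame currently being unwound. */
struct ri {
	void *fp;
	const char *ret_sym;
};

static const char *find_ret_addr(struct ri *list, int n, void *frame_pointer)
{
	for (int i = 0; i < n; i++) {
		if (list[i].fp != frame_pointer)
			continue;	/* stale entry, pushed in the wrong order */
		return list[i].ret_sym;
	}
	return NULL;
}

int main(void)
{
	long frame_a, frame_b;
	struct ri list[] = {
		{ &frame_b, "stale return" },
		{ &frame_a, "real caller" },
	};

	printf("return to: %s\n", find_ret_addr(list, 2, &frame_a));
	return 0;
}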
This can happen if user puts the kretprobe on the function which can be used in the path of ftrace user-function call. Such functions should not be probed, so this adds a warning message that reports which function should be blacklisted. Tested-by: Andrea Righi Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/155094059185.6137.15527904013362842072.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 26 ++++++++++++++++++++++++++ include/linux/kprobes.h | 1 + 2 files changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a034cb808e7e..18fbe9be2d68 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -569,6 +569,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) unsigned long *sara = stack_addr(regs); ri->ret_addr = (kprobe_opcode_t *) *sara; + ri->fp = sara; /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; @@ -759,15 +760,21 @@ static __used void *trampoline_handler(struct pt_regs *regs) unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; kprobe_opcode_t *correct_ret_addr = NULL; + void *frame_pointer; + bool skipped = false; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; + /* On x86-64, we use pt_regs->sp for return address holder. */ + frame_pointer = ®s->sp; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); regs->gs = 0; + /* On x86-32, we use pt_regs->flags for return address holder. */ + frame_pointer = ®s->flags; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; @@ -789,8 +796,25 @@ static __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + /* + * Return probes must be pushed on this hash list correct + * order (same as return order) so that it can be poped + * correctly. However, if we find it is pushed it incorrect + * order, this means we find a function which should not be + * probed, because the wrong order entry is pushed on the + * path of processing other kretprobe itself. + */ + if (ri->fp != frame_pointer) { + if (!skipped) + pr_warn("kretprobe is stacked incorrectly. 
Trying to fixup.\n"); skipped = true; continue; } orig_ret_address = (unsigned long)ri->ret_addr; + if (skipped) + pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", + ri->rp->kp.addr); if (orig_ret_address != trampoline_address) /* @@ -808,6 +832,8 @@ static __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + if (ri->fp != frame_pointer) + continue; orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 201f0f2683f2..9a897256e481 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -173,6 +173,7 @@ struct kretprobe_instance { struct kretprobe *rp; kprobe_opcode_t *ret_addr; struct task_struct *task; + void *fp; char data[0]; }; -- cgit v1.2.3 From b191fa96ea6dc00d331dcc28c1f7db5e075693a0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 24 Feb 2019 01:50:49 +0900 Subject: x86/kprobes: Avoid kretprobe recursion bug Avoid the kretprobe recursion loop bug by setting a dummy kprobe in the current_kprobe per-CPU variable. This bug was introduced with the asm-coded trampoline code: previously another kprobe hooked the function return placeholder (which only has a nop), and the trampoline handler was called from that kprobe. This revives that old, lost kprobe. With this fix, we don't see the deadlock anymore, and you can see that all inner-called kretprobes are skipped. event_1 235 0 event_2 19375 19612 The 1st column is the recorded count and the 2nd is the missed count. The above shows (event_1 rec) + (event_2 rec) ~= (event_2 missed) (the counts differ slightly because the counter is racy). Reported-by: Andrea Righi Tested-by: Andrea Righi Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: c9becf58d935 ("[PATCH] kretprobe: kretprobe-booster") Link: http://lkml.kernel.org/r/155094064889.6137.972160690963039.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 18fbe9be2d68..fed46ddb1eef 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -749,11 +749,16 @@ asm( NOKPROBE_SYMBOL(kretprobe_trampoline); STACK_FRAME_NON_STANDARD(kretprobe_trampoline); +static struct kprobe kretprobe_kprobe = { + .addr = (void *)kretprobe_trampoline, +}; + /* * Called from kretprobe_trampoline */ static __used void *trampoline_handler(struct pt_regs *regs) { + struct kprobe_ctlblk *kcb; struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; struct hlist_node *tmp; @@ -763,6 +768,17 @@ static __used void *trampoline_handler(struct pt_regs *regs) void *frame_pointer; bool skipped = false; + preempt_disable(); + + /* + * Set a dummy kprobe for avoiding kretprobe recursion. + * Since kretprobe never run in kprobe handler, kprobe must not + * be running at this point.
+ */ + kcb = get_kprobe_ctlblk(); + __this_cpu_write(current_kprobe, &kretprobe_kprobe); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ @@ -838,10 +854,9 @@ static __used void *trampoline_handler(struct pt_regs *regs) orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { __this_cpu_write(current_kprobe, &ri->rp->kp); - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, NULL); + __this_cpu_write(current_kprobe, &kretprobe_kprobe); } recycle_rp_inst(ri, &empty_rp); @@ -857,6 +872,9 @@ static __used void *trampoline_handler(struct pt_regs *regs) kretprobe_hash_unlock(current, &flags); + __this_cpu_write(current_kprobe, NULL); + preempt_enable(); + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); -- cgit v1.2.3 From 1de7edbb59c8f1b46071f66c5c97b8a59569eb51 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Mar 2019 17:47:43 -0700 Subject: x86/cpu/bugs: Use __initconst for 'const' init data Some of the recently added const tables use __initdata which causes section attribute conflicts. Use __initconst instead. Fixes: fa1202ef2243 ("x86/speculation: Add command line control") Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190330004743.29541-9-andi@firstfloor.org --- arch/x86/kernel/cpu/bugs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 2da82eff0eb4..b91b3bfa5cfb 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -275,7 +275,7 @@ static const struct { const char *option; enum spectre_v2_user_cmd cmd; bool secure; -} v2_user_options[] __initdata = { +} v2_user_options[] __initconst = { { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, @@ -419,7 +419,7 @@ static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; -} mitigation_options[] __initdata = { +} mitigation_options[] __initconst = { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, @@ -658,7 +658,7 @@ static const char * const ssb_strings[] = { static const struct { const char *option; enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initdata = { +} ssb_mitigation_options[] __initconst = { { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ -- cgit v1.2.3 From c03e27506a564ec7db1b179e7464835901f49751 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Mar 2019 17:47:35 -0700 Subject: x86/asm: Mark all top level asm statements as .text With gcc toplevel assembler statements that do not mark themselves as .text may end up in other sections. This causes LTO boot crashes because various assembler statements ended up in the middle of the initcall section. It's also a latent problem without LTO, although it's currently not known to cause any real problems. According to the gcc team it's expected behavior. Always mark all the top level assembler statements as text so that they switch to the right section. 
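The pattern the patch applies is easy to reproduce outside the kernel; a minimal sketch (with a hypothetical just_return() helper, not kernel code) that builds with GCC on x86-64:

/* An explicit .text directive makes the placement independent of
 * whatever section a previous top-level asm statement selected. */
extern void just_return(void);

asm(
	".text\n"
	".globl just_return\n"
	".type just_return, @function\n"
	"just_return:\n"
	"	ret\n"
	".size just_return, .-just_return\n"
);

int main(void)
{
	just_return();
	return 0;
}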
Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190330004743.29541-1-andi@firstfloor.org --- arch/x86/kernel/cpu/amd.c | 3 ++- arch/x86/kernel/kprobes/core.c | 1 + arch/x86/lib/error-inject.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 01004bfb1a1b..1bcb489e07e7 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -83,7 +83,8 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) */ extern __visible void vide(void); -__asm__(".globl vide\n" +__asm__(".text\n" + ".globl vide\n" ".type vide, @function\n" ".align 4\n" "vide: ret\n"); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a034cb808e7e..31ab91c9c4e9 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -715,6 +715,7 @@ NOKPROBE_SYMBOL(kprobe_int3_handler); * calls trampoline_handler() runs, which calls the kretprobe's handler. */ asm( + ".text\n" ".global kretprobe_trampoline\n" ".type kretprobe_trampoline, @function\n" "kretprobe_trampoline:\n" diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c index 3cdf06128d13..be5b5fb1598b 100644 --- a/arch/x86/lib/error-inject.c +++ b/arch/x86/lib/error-inject.c @@ -6,6 +6,7 @@ asmlinkage void just_return_func(void); asm( + ".text\n" ".type just_return_func, @function\n" ".globl just_return_func\n" "just_return_func:\n" -- cgit v1.2.3 From 26b31f46f036ad89de20cbbb732b76289411eddb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Mar 2019 17:47:36 -0700 Subject: x86/cpu/amd: Exclude 32bit only assembler from 64bit build The "vide" inline assembler is only needed on 32bit kernels for old 32bit only CPUs. Guard it with an #ifdef so it's not included in 64bit builds. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190330004743.29541-2-andi@firstfloor.org --- arch/x86/kernel/cpu/amd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1bcb489e07e7..fb6a64bd765f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -82,12 +82,14 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) * performance at the same time.. */ +#ifdef CONFIG_X86_32 extern __visible void vide(void); __asm__(".text\n" ".globl vide\n" ".type vide, @function\n" ".align 4\n" "vide: ret\n"); +#endif static void init_amd_k5(struct cpuinfo_x86 *c) { -- cgit v1.2.3 From 81423c37415fe45057d64196ae0ce8e17a9c7148 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Mar 2019 17:47:38 -0700 Subject: x86/timer: Don't inline __const_udelay() LTO will happily inline __const_udelay() everywhere it is used. Forcing it noinline saves ~44k text in a LTO build. 13999560 1740864 1499136 17239560 1070e08 vmlinux-with-udelay-inline 13954764 1736768 1499136 17190668 1064f0c vmlinux-wo-udelay-inline Even without LTO this function should never be inlined. 
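The attribute itself is plain GCC C; a hedged sketch with an illustrative helper (the math here is a stand-in, not the kernel's __const_udelay() implementation):

#include <stdio.h>

/* noinline keeps a single out-of-line copy of a widely used helper,
 * even when LTO would otherwise inline it at every call site. */
static __attribute__((noinline)) unsigned long
scale_delay(unsigned long xloops, unsigned long lpj)
{
	/* fixed-point scale: (xloops * lpj) >> 32 */
	return (unsigned long)(((unsigned long long)xloops * lpj) >> 32);
}

int main(void)
{
	printf("%lu loops\n", scale_delay(5 * 0x10c7UL, 4000000UL));
	return 0;
}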
Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190330004743.29541-4-andi@firstfloor.org --- arch/x86/lib/delay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index f5b7f1b3b6d7..b7375dc6898f 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -162,7 +162,7 @@ void __delay(unsigned long loops) } EXPORT_SYMBOL(__delay); -void __const_udelay(unsigned long xloops) +noinline void __const_udelay(unsigned long xloops) { unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy; int d0; -- cgit v1.2.3 From 02143c2931c3c0faf088c5859a10de6c2b4f2d96 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Mar 2019 17:47:40 -0700 Subject: x86/hyperv: Make hv_vcpu_is_preempted() visible This function is referenced from assembler, so it needs to be marked visible for LTO. Fixes: 3a025de64bf8 ("x86/hyperv: Enable PV qspinlock for Hyper-V") Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Yi Sun Cc: kys@microsoft.com Cc: haiyangz@microsoft.com Link: https://lkml.kernel.org/r/20190330004743.29541-6-andi@firstfloor.org --- arch/x86/hyperv/hv_spinlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index a861b0456b1a..07f21a06392f 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -56,7 +56,7 @@ static void hv_qlock_wait(u8 *byte, u8 val) /* * Hyper-V does not support this so far. */ -bool hv_vcpu_is_preempted(int vcpu) +__visible bool hv_vcpu_is_preempted(int vcpu) { return false; } -- cgit v1.2.3 From 14e581c381b942ce5463a7e61326d8ce1c843be7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Mar 2019 17:47:42 -0700 Subject: x86/kvm: Make steal_time visible This per-CPU variable is accessed from assembler code, so it needs to be visible for LTO. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20190330004743.29541-8-andi@firstfloor.org --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5c93a65ee1e5..3f0cc828cc36 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -67,7 +67,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); -static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); +DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; /* -- cgit v1.2.3 From 2ee27796f298b710992a677a7e4d35c8c588b17e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 30 Dec 2018 18:27:15 +0100 Subject: x86/cpu/intel: Lower the "ENERGY_PERF_BIAS: Set to normal" message's log priority The "ENERGY_PERF_BIAS: Set to 'normal', was 'performance'" message triggers on pretty much every Intel machine. The purpose of log messages with a warning level is to notify the user of something that is potentially a problem, or at least somewhat unexpected. This message clearly does not match those criteria, so lower its log priority from warning to info.
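The once-only behavior is the interesting part of pr_info_once(); a userspace analogue (a toy macro, not the kernel's implementation):

#include <stdio.h>

/* Emit a message at most once, no matter how often the path runs. */
#define log_info_once(...)				\
do {							\
	static int printed;				\
	if (!printed) {					\
		printed = 1;				\
		printf("info: " __VA_ARGS__);		\
	}						\
} while (0)

int main(void)
{
	for (int i = 0; i < 3; i++)
		log_info_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
	return 0;
}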
Signed-off-by: Hans de Goede Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181230172715.17469-1-hdegoede@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fc3c07fe7df5..3142fd7a9b32 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -611,8 +611,8 @@ static void init_intel_energy_perf(struct cpuinfo_x86 *c) if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) return; - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); + pr_info_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + pr_info_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); } -- cgit v1.2.3 From 28156d7678431312b463361b23a218a1b5645ba5 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 28 Dec 2018 07:24:13 +0000 Subject: x86/mce: Fix debugfs_simple_attr.cocci warnings Use DEFINE_DEBUGFS_ATTRIBUTE() rather than DEFINE_SIMPLE_ATTRIBUTE() for debugfs files. Semantic patch information: Rationale: DEFINE_SIMPLE_ATTRIBUTE + debugfs_create_file() imposes some significant overhead as compared to DEFINE_DEBUGFS_ATTRIBUTE + debugfs_create_file_unsafe(). Generated by: scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci Signed-off-by: YueHaibing Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: Yazen Ghannam Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/1545981853-70877-1-git-send-email-yuehaibing@huawei.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mce/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 58925e7ccb27..3e081428117c 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2429,8 +2429,8 @@ static int fake_panic_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, - fake_panic_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, + "%llu\n"); static int __init mcheck_debugfs_init(void) { @@ -2439,8 +2439,8 @@ static int __init mcheck_debugfs_init(void) dmce = mce_get_debugfs_dir(); if (!dmce) return -ENOMEM; - ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, - &fake_panic_fops); + ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce, + NULL, &fake_panic_fops); if (!ffake_panic) return -ENOMEM; -- cgit v1.2.3 From 987ddbe4870b53623d76ac64044c55a13e368113 Mon Sep 17 00:00:00 2001 From: David Wang Date: Thu, 27 Dec 2018 16:41:50 +0800 Subject: x86/power: Optimize C3 entry on Centaur CPUs For new Centaur CPUs, the ucode will take care of preserving cache coherence between CPU cores in C-states, regardless of how deep the C-states are. So, it is not necessary to flush the caches in software before entering C3. That useless operation causes a performance drop for cores that share caches with the idling core.
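The new gate is a pure predicate on family/model/stepping; a standalone restatement (field names here are illustrative, not the kernel's struct cpuinfo_x86):

#include <stdbool.h>
#include <stdio.h>

struct cpu_id {
	int family, model, stepping;
};

/* Centaur family > 6, or family 6 model 0x0f stepping >= 0x0e,
 * can skip the WBINVD before entering a C3-type state. */
static bool centaur_skips_c3_flush(const struct cpu_id *c)
{
	return c->family > 6 ||
	       (c->family == 6 && c->model == 0x0f && c->stepping >= 0x0e);
}

int main(void)
{
	struct cpu_id c = { .family = 6, .model = 0x0f, .stepping = 0x0e };

	printf("bm_check = %d\n", centaur_skips_c3_flush(&c));
	return 0;
}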
Signed-off-by: David Wang Reviewed-by: Thomas Gleixner Acked-by: Pavel Machek Cc: Linus Torvalds Cc: Peter Zijlstra Cc: brucechang@via-alliance.com Cc: cooperyan@zhaoxin.com Cc: len.brown@intel.com Cc: linux-pm@kernel.org Cc: qiyuanwang@zhaoxin.com Cc: rjw@rjwysocki.net Cc: timguo@zhaoxin.com Link: http://lkml.kernel.org/r/1545900110-2757-1-git-send-email-davidwang@zhaoxin.com [ Tidy up the comment. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/cstate.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 158ad1483c43..cb6e076a6d39 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -51,6 +51,18 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, if (c->x86_vendor == X86_VENDOR_INTEL && (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) flags->bm_control = 0; + /* + * For all recent Centaur CPUs, the ucode will make sure that each + * core can keep cache coherence with each other while entering C3 + * type state. So, set bm_check to 1 to indicate that the kernel + * doesn't need to execute a cache flush operation (WBINVD) when + * entering C3 type state. + */ + if (c->x86_vendor == X86_VENDOR_CENTAUR) { + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && + c->x86_stepping >= 0x0e)) + flags->bm_check = 1; + } } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); -- cgit v1.2.3 From f28b11a2abd9cf5535baa03e28fc63ad0e14f52a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Dec 2018 13:36:56 -0800 Subject: x86/fault: Reword initial BUG message for unhandled page faults Reword the NULL pointer dereference case to simply state that a NULL pointer was dereferenced, i.e. drop "unable to handle" as that implies that there are instances where the kernel actually does handle NULL pointer dereferences, which is not true barring funky exception fixup. For the non-NULL case, replace "kernel paging request" with "page fault" as the kernel can technically oops on faults that originated in user code. Dropping "kernel" also allows future patches to provide detailed information on where the fault occurred, e.g. user vs. kernel, without conflicting with the initial BUG message. In both cases, replace "at address=" with wording more appropriate to the oops, as "at" may be interpreted as stating that the address is the RIP of the instruction that faulted. Last, and probably least, further qualify the NULL-pointer path by checking that the fault actually originated in kernel code. It's technically possible for userspace to map address 0, and not printing a super specific message is the least of our worries if the kernel does manage to oops on an actual NULL pointer dereference from userspace.
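Condensed into a runnable sketch, the new message selection reads roughly as follows (user_mode is stubbed as a flag and %016llx stands in for the kernel's %px; not the kernel function itself). The patch's actual before/after strings follow:

#include <stdio.h>

#define PAGE_SIZE	4096ULL

/* NULL-pointer wording only for low addresses faulted in kernel mode. */
static void report_fault(unsigned long long address, int user_mode)
{
	if (address < PAGE_SIZE && !user_mode)
		printf("BUG: kernel NULL pointer dereference, address = %016llx\n",
		       address);
	else
		printf("BUG: unable to handle page fault for address = %016llx\n",
		       address);
}

int main(void)
{
	report_fault(0x8, 0);
	report_fault(0xffffbeef00000000ULL, 0);
	return 0;
}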
Before: BUG: unable to handle kernel NULL pointer dereference at ffffbeef00000000 BUG: unable to handle kernel paging request at ffffbeef00000000 After: BUG: kernel NULL pointer dereference, address = 0000000000000008 BUG: unable to handle page fault for address = ffffbeef00000000 Suggested-by: Linus Torvalds Signed-off-by: Sean Christopherson Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20181221213657.27628-2-sean.j.christopherson@intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 667f1da36208..df2c5cdef5c4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -644,9 +644,12 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad from_kuid(&init_user_ns, current_uid())); } - pr_alert("BUG: unable to handle kernel %s at %px\n", - address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", - (void *)address); + if (address < PAGE_SIZE && !user_mode(regs)) + pr_alert("BUG: kernel NULL pointer dereference, address = %px\n", + (void *)address); + else + pr_alert("BUG: unable to handle page fault for address = %px\n", + (void *)address); err_txt[0] = 0; -- cgit v1.2.3 From 18ea35c5ed9921867194a8efc2a0ac2d5a3c7d2a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Dec 2018 13:36:57 -0800 Subject: x86/fault: Decode and print #PF oops in human readable form Linus pointed out that deciphering the raw #PF error code and printing a more human readable message are two different things, and also that printing the negative cases is mostly just noise[1]. For example, the USER bit doesn't mean the fault originated in user code and stating that an oops wasn't due to a protection keys violation isn't interesting since an oops on a keys violation is a one-in-a-million scenario. Remove the per-bit decoding of the error code and instead print: - the raw error code - why the fault occurred - the effective privilege level of the access - the type of access - whether the fault originated in user code or kernel code This provides the user with the information needed to triage 99.9% of oopses without polluting the log with useless information or conflating the error_code with the CPL. 
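The bit decoding itself is small enough to restate as a standalone program (the #PF error code bit values are architectural; this is a re-implementation for illustration, not the kernel's function). The patch's sample output follows:

#include <stdio.h>

#define X86_PF_PROT	(1UL << 0)
#define X86_PF_WRITE	(1UL << 1)
#define X86_PF_USER	(1UL << 2)
#define X86_PF_RSVD	(1UL << 3)
#define X86_PF_INSTR	(1UL << 4)
#define X86_PF_PK	(1UL << 5)

static void decode_pf(unsigned long ec)
{
	printf("#PF: %s-privileged %s\n",
	       (ec & X86_PF_USER) ? "user" : "supervisor",
	       (ec & X86_PF_INSTR) ? "instruction fetch" :
	       (ec & X86_PF_WRITE) ? "write access" : "read access");
	printf("#PF: error_code(0x%04lx) - %s\n", ec,
	       !(ec & X86_PF_PROT) ? "not-present page" :
	       (ec & X86_PF_RSVD) ? "reserved bit violation" :
	       (ec & X86_PF_PK) ? "protection keys violation" :
	       "permissions violation");
}

int main(void)
{
	decode_pf(0x0010);	/* instruction fetch of a not-present page */
	decode_pf(0x000b);	/* write that tripped a reserved bit */
	return 0;
}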
Sample output: BUG: kernel NULL pointer dereference, address = 0000000000000008 #PF: supervisor-privileged instruction fetch from kernel code #PF: error_code(0x0010) - not-present page BUG: unable to handle page fault for address = ffffbeef00000000 #PF: supervisor-privileged instruction fetch from kernel code #PF: error_code(0x0010) - not-present page BUG: unable to handle page fault for address = ffffc90000230000 #PF: supervisor-privileged write access from kernel code #PF: error_code(0x000b) - reserved bit violation [1] https://lkml.kernel.org/r/CAHk-=whk_fsnxVMvF1T2fFCaP2WrvSybABrLQCWLJyCvHw6NKA@mail.gmail.com Suggested-by: Linus Torvalds Signed-off-by: Sean Christopherson Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20181221213657.27628-3-sean.j.christopherson@intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 42 +++++++++++------------------------------- 1 file changed, 11 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index df2c5cdef5c4..74c9204c5751 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -603,24 +603,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) name, index, addr, (desc.limit0 | (desc.limit1 << 16))); } -/* - * This helper function transforms the #PF error_code bits into - * "[PROT] [USER]" type of descriptive, almost human-readable error strings: - */ -static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt) -{ - if (error_code & mask) { - if (buf[0]) - strcat(buf, " "); - strcat(buf, txt); - } -} - static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - char err_txt[64]; - if (!oops_may_print()) return; @@ -651,27 +636,22 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad pr_alert("BUG: unable to handle page fault for address = %px\n", (void *)address); - err_txt[0] = 0; - - /* - * Note: length of these appended strings including the separation space and the - * zero delimiter must fit into err_txt[]. - */ - err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" ); - err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]"); - err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" ); - err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" ); - err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]"); - err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" ); - - pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]"); + pr_alert("#PF: %s-privileged %s from %s code\n", + (error_code & X86_PF_USER) ? "user" : "supervisor", + (error_code & X86_PF_INSTR) ? "instruction fetch" : + (error_code & X86_PF_WRITE) ? "write access" : + "read access", + user_mode(regs) ? "user" : "kernel"); + pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, + !(error_code & X86_PF_PROT) ? "not-present page" : + (error_code & X86_PF_RSVD) ? "reserved bit violation" : + (error_code & X86_PF_PK) ? "protection keys violation" : + "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; - pr_alert("This was a system access from user code\n"); - /* * This can happen for quite a few reasons. The more obvious * ones are faults accessing the GDT, or LDT. 
Perhaps -- cgit v1.2.3 From 8fea0f59e97df3b9b8d2a76af54f633f4586751b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Dec 2018 16:16:47 +0100 Subject: x86/topology: Make DEBUG_HOTPLUG_CPU0 pr_info() more descriptive DEBUG_HOTPLUG_CPU0 debug feature offlines a CPU as early as possible allowing userspace to boot up without that CPU (so that it is possible to check for unwanted dependencies towards the offlined CPU). After doing so it emits a "CPU %u is now offline" pr_info, which is not enough descriptive of why the CPU was offlined (e.g., one might be running with a config that triggered some problem, not being aware that CONFIG_DEBUG_ HOTPLUG_CPU0 is set). Add a bit more of informative text to the pr_info, so that it is immediately obvious why a CPU has been offlined in early boot stages. Background: Got to scratch my head a bit while debugging a WARNING splat related to the offlining of CPU0. Without being aware yet of this debug option it wasn't immediately obvious why CPU0 was being offlined by the kernel. Signed-off-by: Juri Lelli Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Link: http://lkml.kernel.org/r/20181219151647.15073-1-juri.lelli@redhat.com [ Merge line-broken line. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 738bf42b0218..be5bc2e47c71 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -71,7 +71,7 @@ int _debug_hotplug_cpu(int cpu, int action) case 0: ret = cpu_down(cpu); if (!ret) { - pr_info("CPU %u is now offline\n", cpu); + pr_info("DEBUG_HOTPLUG_CPU0: CPU %u is now offline\n", cpu); dev->offline = true; kobject_uevent(&dev->kobj, KOBJ_OFFLINE); } else -- cgit v1.2.3 From f36e7495dd3990d6848e6d6703c78f1f17a97538 Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Thu, 29 Nov 2018 16:56:15 +0100 Subject: x86/tools/relocs: Fix big section header tables In case when the number of entries in the section header table is larger then or equal to SHN_LORESERVE the size of the table is held in the sh_size member of the initial entry in section header table instead of e_shnum. Same with the string table index which is located in sh_link instead of e_shstrndx. This case is easily reproducible with KCFLAGS="-ffunction-sections", bzImage build fails with "String table index out of bounds" error. Signed-off-by: Artem Savkov Reviewed-by: Josh Poimboeuf Acked-by: Joe Lawrence Cc: Eric W . Biederman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181129155615.2594-1-asavkov@redhat.com [ Simplify the die() lines. 
] Signed-off-by: Ingo Molnar --- arch/x86/tools/relocs.c | 74 ++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index b629f6992d9f..f345586f5e50 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -11,7 +11,9 @@ #define Elf_Shdr ElfW(Shdr) #define Elf_Sym ElfW(Sym) -static Elf_Ehdr ehdr; +static Elf_Ehdr ehdr; +static unsigned long shnum; +static unsigned int shstrndx; struct relocs { uint32_t *offset; @@ -241,9 +243,9 @@ static const char *sec_name(unsigned shndx) { const char *sec_strtab; const char *name; - sec_strtab = secs[ehdr.e_shstrndx].strtab; + sec_strtab = secs[shstrndx].strtab; name = ""; - if (shndx < ehdr.e_shnum) { + if (shndx < shnum) { name = sec_strtab + secs[shndx].shdr.sh_name; } else if (shndx == SHN_ABS) { @@ -271,7 +273,7 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym) static Elf_Sym *sym_lookup(const char *symname) { int i; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; long nsyms; char *strtab; @@ -366,27 +368,41 @@ static void read_ehdr(FILE *fp) ehdr.e_shnum = elf_half_to_cpu(ehdr.e_shnum); ehdr.e_shstrndx = elf_half_to_cpu(ehdr.e_shstrndx); - if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { + shnum = ehdr.e_shnum; + shstrndx = ehdr.e_shstrndx; + + if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) die("Unsupported ELF header type\n"); - } - if (ehdr.e_machine != ELF_MACHINE) { + if (ehdr.e_machine != ELF_MACHINE) die("Not for %s\n", ELF_MACHINE_NAME); - } - if (ehdr.e_version != EV_CURRENT) { + if (ehdr.e_version != EV_CURRENT) die("Unknown ELF version\n"); - } - if (ehdr.e_ehsize != sizeof(Elf_Ehdr)) { + if (ehdr.e_ehsize != sizeof(Elf_Ehdr)) die("Bad Elf header size\n"); - } - if (ehdr.e_phentsize != sizeof(Elf_Phdr)) { + if (ehdr.e_phentsize != sizeof(Elf_Phdr)) die("Bad program header entry\n"); - } - if (ehdr.e_shentsize != sizeof(Elf_Shdr)) { + if (ehdr.e_shentsize != sizeof(Elf_Shdr)) die("Bad section header entry\n"); + + + if (shnum == SHN_UNDEF || shstrndx == SHN_XINDEX) { + Elf_Shdr shdr; + + if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) + die("Seek to %d failed: %s\n", ehdr.e_shoff, strerror(errno)); + + if (fread(&shdr, sizeof(shdr), 1, fp) != 1) + die("Cannot read initial ELF section header: %s\n", strerror(errno)); + + if (shnum == SHN_UNDEF) + shnum = elf_xword_to_cpu(shdr.sh_size); + + if (shstrndx == SHN_XINDEX) + shstrndx = elf_word_to_cpu(shdr.sh_link); } - if (ehdr.e_shstrndx >= ehdr.e_shnum) { + + if (shstrndx >= shnum) die("String table index out of bounds\n"); - } } static void read_shdrs(FILE *fp) @@ -394,20 +410,20 @@ static void read_shdrs(FILE *fp) int i; Elf_Shdr shdr; - secs = calloc(ehdr.e_shnum, sizeof(struct section)); + secs = calloc(shnum, sizeof(struct section)); if (!secs) { die("Unable to allocate %d section headers\n", - ehdr.e_shnum); + shnum); } if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { die("Seek to %d failed: %s\n", ehdr.e_shoff, strerror(errno)); } - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; if (fread(&shdr, sizeof(shdr), 1, fp) != 1) die("Cannot read ELF section headers %d/%d: %s\n", - i, ehdr.e_shnum, strerror(errno)); + i, shnum, strerror(errno)); sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name); sec->shdr.sh_type = elf_word_to_cpu(shdr.sh_type); sec->shdr.sh_flags = elf_xword_to_cpu(shdr.sh_flags); @@ -418,7 
+434,7 @@ static void read_shdrs(FILE *fp) sec->shdr.sh_info = elf_word_to_cpu(shdr.sh_info); sec->shdr.sh_addralign = elf_xword_to_cpu(shdr.sh_addralign); sec->shdr.sh_entsize = elf_xword_to_cpu(shdr.sh_entsize); - if (sec->shdr.sh_link < ehdr.e_shnum) + if (sec->shdr.sh_link < shnum) sec->link = &secs[sec->shdr.sh_link]; } @@ -427,7 +443,7 @@ static void read_strtabs(FILE *fp) { int i; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; if (sec->shdr.sh_type != SHT_STRTAB) { continue; } @@ -452,7 +468,7 @@ static void read_symtabs(FILE *fp) { int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; if (sec->shdr.sh_type != SHT_SYMTAB) { continue; } @@ -485,7 +501,7 @@ static void read_relocs(FILE *fp) { int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; if (sec->shdr.sh_type != SHT_REL_TYPE) { continue; } @@ -528,7 +544,7 @@ static void print_absolute_symbols(void) printf("Absolute symbols\n"); printf(" Num: Value Size Type Bind Visibility Name\n"); - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; char *sym_strtab; int j; @@ -566,7 +582,7 @@ static void print_absolute_relocs(void) else format = "%08"PRIx32" %08"PRIx32" %10s %08"PRIx32" %s\n"; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; struct section *sec_applies, *sec_symtab; char *sym_strtab; @@ -650,7 +666,7 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, { int i; /* Walk through the relocations */ - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { char *sym_strtab; Elf_Sym *sh_symtab; struct section *sec_applies, *sec_symtab; @@ -706,7 +722,7 @@ static Elf_Addr per_cpu_load_addr; static void percpu_init(void) { int i; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { ElfW(Sym) *sym; if (strcmp(sec_name(i), ".data..percpu")) continue; -- cgit v1.2.3 From b97369f07e3b826e8da0436e26b01c1f8e36042d Mon Sep 17 00:00:00 2001 From: Leonardo Brás Date: Mon, 22 Oct 2018 22:10:22 -0300 Subject: x86/vdso: Rename variable to fix -Wshadow warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The go32() and go64() functions have an argument and a local variable called ‘name’. Rename both to clarify the code and to fix a warning with -Wshadow.
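The warning class is easy to reproduce; a toy example (names illustrative, not vdso2c's actual code) where parameter and local were both originally called 'name', which gcc -Wshadow reports as a shadowed declaration:

#include <stdio.h>

/* After the rename, the parameter and the local no longer collide. */
static void emit(FILE *outfile, const char *image_name)
{
	const char *sym_name = "vvar_start";	/* both were 'name' before */

	fprintf(outfile, "%s: %s\n", image_name, sym_name);
}

int main(void)
{
	emit(stdout, "vdso_image_64");
	return 0;
}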
Signed-off-by: Leonardo Brás Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: David.Laight@aculab.com Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Michal Marek Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: helen@koikeco.de Cc: linux-kbuild@vger.kernel.org Cc: lkcamp@lists.libreplanetbr.org Link: http://lkml.kernel.org/r/20181023011022.GA6574@WindFlash Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vdso2c.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index fa847a620f40..a20b134de2a8 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -7,7 +7,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, void *stripped_addr, size_t stripped_len, - FILE *outfile, const char *name) + FILE *outfile, const char *image_name) { int found_load = 0; unsigned long load_size = -1; /* Work around bogus warning */ @@ -93,11 +93,12 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, int k; ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + GET_LE(&symtab_hdr->sh_entsize) * i; - const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + - GET_LE(&sym->st_name); + const char *sym_name = raw_addr + + GET_LE(&strtab_hdr->sh_offset) + + GET_LE(&sym->st_name); for (k = 0; k < NSYMS; k++) { - if (!strcmp(name, required_syms[k].name)) { + if (!strcmp(sym_name, required_syms[k].name)) { if (syms[k]) { fail("duplicate symbol %s\n", required_syms[k].name); @@ -134,7 +135,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, if (syms[sym_vvar_start] % 4096) fail("vvar_begin must be a multiple of 4096\n"); - if (!name) { + if (!image_name) { fwrite(stripped_addr, stripped_len, 1, outfile); return; } @@ -157,7 +158,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } fprintf(outfile, "\n};\n\n"); - fprintf(outfile, "const struct vdso_image %s = {\n", name); + fprintf(outfile, "const struct vdso_image %s = {\n", image_name); fprintf(outfile, "\t.data = raw_data,\n"); fprintf(outfile, "\t.size = %lu,\n", mapping_size); if (alt_sec) { -- cgit v1.2.3 From 5ce5d8a5a4ae64b281bea7ae028c960f12acae21 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Apr 2019 22:51:49 +0200 Subject: asm-generic: generalize asm/sockios.h ia64, parisc and sparc just use a copy of the generic version of asm/sockios.h, and x86 is a redirect to the same file, so we can just let the header file be generated. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- arch/ia64/include/uapi/asm/sockios.h | 21 --------------------- arch/parisc/include/uapi/asm/sockios.h | 14 -------------- arch/sparc/include/uapi/asm/sockios.h | 15 --------------- arch/x86/include/uapi/asm/sockios.h | 1 - 4 files changed, 51 deletions(-) delete mode 100644 arch/ia64/include/uapi/asm/sockios.h delete mode 100644 arch/parisc/include/uapi/asm/sockios.h delete mode 100644 arch/sparc/include/uapi/asm/sockios.h delete mode 100644 arch/x86/include/uapi/asm/sockios.h (limited to 'arch/x86') diff --git a/arch/ia64/include/uapi/asm/sockios.h b/arch/ia64/include/uapi/asm/sockios.h deleted file mode 100644 index f27a12f95d20..000000000000 --- a/arch/ia64/include/uapi/asm/sockios.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_IA64_SOCKIOS_H -#define _ASM_IA64_SOCKIOS_H - -/* - * Socket-level I/O control calls. - * - * Based on . 
- * - * Modified 1998, 1999 - * David Mosberger-Tang , Hewlett-Packard Co - */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* _ASM_IA64_SOCKIOS_H */ diff --git a/arch/parisc/include/uapi/asm/sockios.h b/arch/parisc/include/uapi/asm/sockios.h deleted file mode 100644 index 66a3ba64d53f..000000000000 --- a/arch/parisc/include/uapi/asm/sockios.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __ARCH_PARISC_SOCKIOS__ -#define __ARCH_PARISC_SOCKIOS__ - -/* Socket-level I/O control calls. */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif diff --git a/arch/sparc/include/uapi/asm/sockios.h b/arch/sparc/include/uapi/asm/sockios.h deleted file mode 100644 index 18a3ec14a847..000000000000 --- a/arch/sparc/include/uapi/asm/sockios.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_SPARC_SOCKIOS_H -#define _ASM_SPARC_SOCKIOS_H - -/* Socket-level I/O control calls. */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* !(_ASM_SPARC_SOCKIOS_H) */ - diff --git a/arch/x86/include/uapi/asm/sockios.h b/arch/x86/include/uapi/asm/sockios.h deleted file mode 100644 index def6d4746ee7..000000000000 --- a/arch/x86/include/uapi/asm/sockios.h +++ /dev/null @@ -1 +0,0 @@ -#include -- cgit v1.2.3 From ea2f8d60603efbd1cb4e193a593945a2fe24d264 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 21 Apr 2019 20:35:24 +0200 Subject: x86/fault: Make fault messages more succinct So we are going to be staring at those in the next years, let's make them more succinct. In particular: - change "address = " to "address: " - "-privileged" reads funny. It should be simply "kernel" or "user" - "from kernel code" reads funny too. "kernel mode" or "user mode" is more natural. 
An actual example says more than 1000 words, of course: [ 0.248370] BUG: kernel NULL pointer dereference, address: 00000000000005b8 [ 0.249120] #PF: supervisor write access in kernel mode [ 0.249717] #PF: error_code(0x0002) - not-present page Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@linux.intel.com Cc: luto@kernel.org Cc: riel@surriel.com Cc: sean.j.christopherson@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20190421183524.GC6048@zn.tnic Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 74c9204c5751..a0df19b0897d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -630,13 +630,13 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad } if (address < PAGE_SIZE && !user_mode(regs)) - pr_alert("BUG: kernel NULL pointer dereference, address = %px\n", + pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", (void *)address); else - pr_alert("BUG: unable to handle page fault for address = %px\n", + pr_alert("BUG: unable to handle page fault for address: %px\n", (void *)address); - pr_alert("#PF: %s-privileged %s from %s code\n", + pr_alert("#PF: %s %s in %s mode\n", (error_code & X86_PF_USER) ? "user" : "supervisor", (error_code & X86_PF_INSTR) ? "instruction fetch" : (error_code & X86_PF_WRITE) ? "write access" : -- cgit v1.2.3 From 9ca5c8e632ce8f144ec6d00da2dc5e16b41d593c Mon Sep 17 00:00:00 2001 From: Dave Young Date: Sun, 21 Apr 2019 11:50:59 +0800 Subject: x86/kdump: Have crashkernel=X reserve under 4G by default The kdump crashkernel low reservation is limited to under 896M even for X86_64. This obscure and miserable limitation exists for compatibility with old kexec-tools but the reason is not documented anywhere. Some more tests/investigations about the background: a) Previously, old kexec-tools could only load purgatory to memory under 2G. Eric removed that limitation in 2012 in kexec-tools: b4f9f8599679 ("kexec x86_64: Make purgatory relocatable anywhere in the 64bit address space.") b) Back in 2013 Yinghai removed all the limitations in new kexec-tools, bzImage64 can be loaded anywhere: 82c3dd2280d2 ("kexec, x86_64: Load bzImage64 above 4G") c) Test results with old kexec-tools with old and latest kernels: 1. Old kexec-tools can not build with modern toolchain anymore, I built it in a RHEL6 vm. 2. 2.0.0 kexec-tools does not work with the latest kernel even with memory under 896M and gives an error: "ELF core (kcore) parse failed" For that it needs below kexec-tools fix: ed15ba1b9977 ("build_mem_phdrs(): check if p_paddr is invalid") 3. Even with patched kexec-tools which fixes 2), it still needs some other fixes to work correctly for KASLR-enabled kernels. So the situation is: * Old kexec-tools is already broken with latest kernels. * We can not keep these limitations forever just for compatibility with very old kexec-tools. * If one must use old tools then he/she can choose crashkernel=X@Y. * People have reported bugs where crashkernel=384M failed because KASLR makes the 0-896M space sparse. * Crashkernel can reserve in low or high area, it is natural to understand low as memory under 4G. Hence drop the 896M limitation and change crashkernel low reservation to reserve under 4G by default. 
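Why the ceiling matters is easy to see with a toy top-down allocator (a standalone illustration with a made-up memory map, not memblock code): a search below 896M can fail on sparse low memory where a 4G ceiling still finds room:

#include <stdbool.h>
#include <stdio.h>

struct range {
	unsigned long long start, end;
};

/* Find the highest window of 'size' bytes that ends at or below 'limit'. */
static bool find_below(const struct range *free, int n,
		       unsigned long long size, unsigned long long limit,
		       unsigned long long *base)
{
	for (int i = n - 1; i >= 0; i--) {
		unsigned long long end = free[i].end < limit ? free[i].end : limit;

		if (end > free[i].start && end - free[i].start >= size) {
			*base = end - size;
			return true;
		}
	}
	return false;
}

int main(void)
{
	const struct range free[] = {
		{ 0x01000000ULL, 0x10000000ULL },	/* sparse low memory */
		{ 0x40000000ULL, 0xc0000000ULL },	/* 1G..3G */
	};
	unsigned long long base, size = 384ULL << 20;

	if (find_below(free, 2, size, 896ULL << 20, &base))
		printf("below 896M: base %#llx\n", base);
	else
		printf("below 896M: no fit\n");
	if (find_below(free, 2, size, 4ULL << 30, &base))
		printf("below 4G:   base %#llx\n", base);
	return 0;
}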
Signed-off-by: Dave Young Signed-off-by: Borislav Petkov Reviewed-by: Ingo Molnar Acked-by: Baoquan He Cc: Dave Hansen Cc: David Howells Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Juergen Gross Cc: Petr Tesarik Cc: piliu@redhat.com Cc: Ram Pai Cc: Sinan Kaya Cc: Thomas Gleixner Cc: vgoyal@redhat.com Cc: x86-ml Cc: Yinghai Lu Cc: Zhimin Gu Link: https://lkml.kernel.org/r/20190421035058.943630505@redhat.com --- arch/x86/kernel/setup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3d872a527cd9..daf7c5650c18 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include